In [217]:
#Import dependencies
import json
import pandas as pd
import numpy as np
import re
from sqlalchemy import create_engine
import psycopg2
from config import db_password
import time


In [218]:
#Import datasets
advanced_df = pd.read_csv("resources/advanced.csv")
team_summary_df = pd.read_csv("resources/team_summaries.csv")
player_totals_df = pd.read_csv("resources/player_totals.csv")
team_totals_df = pd.read_csv("resources/team_totals.csv")

In [219]:
#Obtain column headers which contain stat abbreviations
df_cols = [advanced_df.columns, team_summary_df.columns, 
       player_totals_df.columns, team_totals_df.columns]

In [220]:
#List of the four data frames
dfs = [advanced_df, team_summary_df, 
       player_totals_df, team_totals_df]

In [221]:
#Generate list of all column headers for the four dataframes
abbrev = []
for df_col in df_cols:
    for item in df_col:
        if item not in abbrev:
            abbrev.append(item)
abbrev

['seas_id',
 'season',
 'player_id',
 'player',
 'birth_year',
 'pos',
 'age',
 'experience',
 'lg',
 'tm',
 'g',
 'mp',
 'per',
 'ts_percent',
 'x3p_ar',
 'f_tr',
 'orb_percent',
 'drb_percent',
 'trb_percent',
 'ast_percent',
 'stl_percent',
 'blk_percent',
 'tov_percent',
 'usg_percent',
 'ows',
 'dws',
 'ws',
 'ws_48',
 'obpm',
 'dbpm',
 'bpm',
 'vorp',
 'team',
 'abbreviation',
 'playoffs',
 'w',
 'l',
 'pw',
 'pl',
 'mov',
 'sos',
 'srs',
 'o_rtg',
 'd_rtg',
 'n_rtg',
 'pace',
 'e_fg_percent',
 'ft_fga',
 'opp_e_fg_percent',
 'opp_tov_percent',
 'opp_drb_percent',
 'opp_ft_fga',
 'arena',
 'attend',
 'attend_g',
 'gs',
 'fg',
 'fga',
 'fg_percent',
 'x3p',
 'x3pa',
 'x3p_percent',
 'x2p',
 'x2pa',
 'x2p_percent',
 'ft',
 'fta',
 'ft_percent',
 'orb',
 'drb',
 'trb',
 'ast',
 'stl',
 'blk',
 'tov',
 'pf',
 'pts']

In [222]:
#List of all corresponding terms referred to by column header abbreviations
meanings = ['season_id',
 'season',
 'player_id',
 'player',
 'birth_year',
 'position',
 'age',
 'experience',
 'league',
 'team',
 'games',
 'minutes_played',
 'player_efficiency_rating',
 'true_shooting_percentage',
 'three_point_attempt_rate',
 'free_throw_attempt_rate',
 'offensive_rebound_percentage',
 'defensive_rebound_percentage',
 'total_rebound_percentage',
 'assist_percentage',
 'steal_percentage',
 'block_percentage',
 'turnover_percentage',
 'usage_percentage',
 'offensive_win_shares',
 'defensive_win_shares',
 'win_shares',
 'win_shares_per_48_min',
 'offensive_box_plus/minus',
 'defensive_box_plus/minus',
 'box_plus/minus',
 'value_over_replacement_plyr',
 'team',
 'abbreviation',
 'playoffs',
 'wins',
 'losses',
 'pythagorean_wins',
 'pythagorean_losses',
 'margin_of_victory',
 'strength_of_schedule',
 'simple_rating_system',
 'offensive_rating',
 'defensive_rating',
 'net_rating',
 'pace',
 'effective_fieldgoal_percentage',
 'free_throws/fieldgoal_attempts',
 'opponent_effective_fieldgoal_percentage',
 'opponent_turnover_percentage',
 'opponent_defensive_rebound_percentage',
 'opponent_freethrows/fieldgoal_attempts',
 'arena',
 'attend',
 'attend_g',
 'games_started',
 'fieldgoals',
 'fieldgoals_attemped',
 'fieldgoal_percentage',
 'three_points_made',
 'three_point_attempts',
 'three_point_percentage',
 'two_point_made',
 'two_point_attempts',
 'two_point_percentage',
 'freethrows_made',
 'freethrows_attempted',
 'freethrow_percentage',
 'offensive_rebounds',
 'defensive_rebounds',
 'total_rebounds',
 'assists',
 'steals',
 'blocks',
 'turnovers',
 'personal_fouls',
 'points']

In [223]:
#Generate a dictionary of abbreviations (keys) and full terms (values)
nba_gloss = {}
for i in range(len(abbrev)):
    nba_gloss[abbrev[i]] = meanings[i]

In [224]:
#Replace all column headers in dataframes with new headers containing full terms
for df in dfs:
    df.set_axis([nba_gloss[i] for i in list(df.columns)], inplace=True, axis=1)


In [225]:
#View the team_totals dataframe
team_totals_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1783 entries, 0 to 1782
Data columns (total 28 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   season                  1783 non-null   int64  
 1   league                  1783 non-null   object 
 2   team                    1783 non-null   object 
 3   abbreviation            1698 non-null   object 
 4   playoffs                1783 non-null   bool   
 5   games                   1782 non-null   float64
 6   minutes_played          1593 non-null   float64
 7   fieldgoals              1782 non-null   float64
 8   fieldgoals_attemped     1782 non-null   float64
 9   fieldgoal_percentage    1782 non-null   float64
 10  three_points_made       1340 non-null   float64
 11  three_point_attempts    1340 non-null   float64
 12  three_point_percentage  1340 non-null   float64
 13  two_point_made          1782 non-null   float64
 14  two_point_attempts      1782 non-null   

In [255]:
#Drop 'League Average' from team columns in team_totals_df
team_totals_filtered_df = team_totals_df.loc[(team_totals_df['team'] != 'League Average')]
team_totals_filtered_df = team_totals_filtered_df.loc[(team_totals_filtered_df['season'] < 2022) & 
                                   (team_totals_filtered_df['season'] > 1978)]
team_totals_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1186 entries, 31 to 1258
Data columns (total 28 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   season                  1186 non-null   int64  
 1   league                  1186 non-null   object 
 2   team                    1186 non-null   object 
 3   abbreviation            1186 non-null   object 
 4   playoffs                1186 non-null   bool   
 5   games                   1186 non-null   float64
 6   minutes_played          1186 non-null   float64
 7   fieldgoals              1186 non-null   float64
 8   fieldgoals_attemped     1186 non-null   float64
 9   fieldgoal_percentage    1186 non-null   float64
 10  three_points_made       1164 non-null   float64
 11  three_point_attempts    1164 non-null   float64
 12  three_point_percentage  1164 non-null   float64
 13  two_point_made          1186 non-null   float64
 14  two_point_attempts      1186 non-null  

In [261]:
team_totals_filtered_df.head()

Unnamed: 0,season,league,team,abbreviation,playoffs,games,minutes_played,fieldgoals,fieldgoals_attemped,fieldgoal_percentage,...,freethrow_percentage,offensive_rebounds,defensive_rebounds,total_rebounds,assists,steals,blocks,turnovers,personal_fouls,points
31,2021,NBA,Atlanta Hawks,ATL,True,72.0,17405.0,2937.0,6281.0,0.468,...,0.812,760.0,2525.0,3285.0,1737.0,503.0,342.0,953.0,1392.0,8186.0
32,2021,NBA,Boston Celtics,BOS,True,72.0,17380.0,2985.0,6401.0,0.466,...,0.775,765.0,2421.0,3186.0,1689.0,556.0,383.0,1012.0,1471.0,8109.0
33,2021,NBA,Brooklyn Nets,BRK,True,72.0,17405.0,3106.0,6289.0,0.494,...,0.804,640.0,2559.0,3199.0,1929.0,484.0,379.0,975.0,1371.0,8537.0
34,2021,NBA,Chicago Bulls,CHI,False,72.0,17380.0,3035.0,6380.0,0.476,...,0.791,693.0,2544.0,3237.0,1927.0,482.0,304.0,1089.0,1362.0,7969.0
35,2021,NBA,Charlotte Hornets,CHO,False,72.0,17355.0,2875.0,6324.0,0.455,...,0.761,762.0,2389.0,3151.0,1933.0,565.0,344.0,1069.0,1298.0,7881.0


In [275]:
#Use the filtered team_totals_df to create two dictionaries of team abbrev and team_names and vis a vis
updated_abbrev_team_name_dict = {}
full_team_names = list(team_totals_filtered_df['team'])
updated_abbrev = list(team_totals_filtered_df['abbreviation'])
for i in range(len(updated_abbrev)):
    if updated_abbrev[i] not in updated_abbrev_team_name_dict.keys():
        updated_abbrev_team_name_dict[updated_abbrev[i]]=full_team_names[i]
team_name_updated_abrrev_dict = {}
for i in range(len(full_team_names)):
    if full_team_names[i] not in team_name_updated_abrrev_dict.keys():
        team_name_updated_abrrev_dict[full_team_names[i]]=updated_abbrev[i]

In [278]:
#Update dictionaries to include 'TOT' values in dataframes:
team_name_updated_abrrev_dict['Total'] = 'TOT'
updated_abbrev_team_name_dict ['TOT'] = 'Total'

In [280]:
#Get info on player_totals_df
player_totals_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30296 entries, 0 to 30295
Data columns (total 35 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   season_id                       30296 non-null  int64  
 1   season                          30296 non-null  int64  
 2   player_id                       30296 non-null  int64  
 3   player                          30296 non-null  object 
 4   birth_year                      2867 non-null   float64
 5   position                        30296 non-null  object 
 6   age                             30272 non-null  float64
 7   experience                      30296 non-null  int64  
 8   league                          30296 non-null  object 
 9   team                            30296 non-null  object 
 10  games                           30267 non-null  float64
 11  games_started                   21655 non-null  float64
 12  minutes_played                  

In [281]:
#Filter player_totals_df to only include data from 1979-2021
player_totals_filtered_df = player_totals_df.loc[(player_totals_df['season'] < 2022) & 
                                   (player_totals_df['season'] > 1978)]
player_totals_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21999 entries, 649 to 22647
Data columns (total 35 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   season_id                       21999 non-null  int64  
 1   season                          21999 non-null  int64  
 2   player_id                       21999 non-null  int64  
 3   player                          21999 non-null  object 
 4   birth_year                      682 non-null    float64
 5   position                        21999 non-null  object 
 6   age                             21999 non-null  float64
 7   experience                      21999 non-null  int64  
 8   league                          21999 non-null  object 
 9   team                            21999 non-null  object 
 10  games                           21999 non-null  float64
 11  games_started                   20989 non-null  float64
 12  minutes_played                

In [282]:
#Drop ineffective columns from resulting player_totals_filtered_df
player_totals_filtered_df = player_totals_filtered_df.drop([
    'birth_year',
    'three_point_percentage',
    'fieldgoal_percentage',
    'two_point_percentage',
    'effective_fieldgoal_percentage',
    'freethrow_percentage'
], axis=1)
player_totals_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21999 entries, 649 to 22647
Data columns (total 29 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   season_id             21999 non-null  int64  
 1   season                21999 non-null  int64  
 2   player_id             21999 non-null  int64  
 3   player                21999 non-null  object 
 4   position              21999 non-null  object 
 5   age                   21999 non-null  float64
 6   experience            21999 non-null  int64  
 7   league                21999 non-null  object 
 8   team                  21999 non-null  object 
 9   games                 21999 non-null  float64
 10  games_started         20989 non-null  float64
 11  minutes_played        21999 non-null  float64
 12  fieldgoals            21999 non-null  float64
 13  fieldgoals_attemped   21999 non-null  float64
 14  three_points_made     21655 non-null  float64
 15  three_point_attem

In [283]:
#Remove remaining rows with nulls in any column from players_totals_filtered_df
player_totals_filtered_df = player_totals_filtered_df.dropna()
player_totals_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20981 entries, 649 to 22280
Data columns (total 29 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   season_id             20981 non-null  int64  
 1   season                20981 non-null  int64  
 2   player_id             20981 non-null  int64  
 3   player                20981 non-null  object 
 4   position              20981 non-null  object 
 5   age                   20981 non-null  float64
 6   experience            20981 non-null  int64  
 7   league                20981 non-null  object 
 8   team                  20981 non-null  object 
 9   games                 20981 non-null  float64
 10  games_started         20981 non-null  float64
 11  minutes_played        20981 non-null  float64
 12  fieldgoals            20981 non-null  float64
 13  fieldgoals_attemped   20981 non-null  float64
 14  three_points_made     20981 non-null  float64
 15  three_point_attem

In [284]:
#Re-name team column and add full team name to player_totals_filtered_Df
player_totals_filtered_df.rename(columns={"team": "abbreviation"}, inplace=True)
player_totals_filtered_df['team']  =  player_totals_filtered_df['abbreviation'].apply(
    lambda x: updated_abbrev_team_name_dict[x])
   
player_totals_filtered_df.head()

Unnamed: 0,season_id,season,player_id,player,position,age,experience,league,abbreviation,games,...,offensive_rebounds,defensive_rebounds,total_rebounds,assists,steals,blocks,turnovers,personal_fouls,points,team
649,28943,2021,4219,Aaron Gordon,PF,25.0,7,NBA,TOT,50.0,...,77.0,207.0,284.0,161.0,33.0,34.0,97.0,89.0,618.0,Total
650,28944,2021,4219,Aaron Gordon,PF,25.0,7,NBA,ORL,25.0,...,39.0,127.0,166.0,105.0,16.0,20.0,67.0,49.0,364.0,Orlando Magic
651,28945,2021,4219,Aaron Gordon,PF,25.0,7,NBA,DEN,25.0,...,38.0,80.0,118.0,56.0,17.0,14.0,30.0,40.0,254.0,Denver Nuggets
652,28946,2021,4582,Aaron Holiday,PG,24.0,3,NBA,IND,66.0,...,15.0,74.0,89.0,123.0,46.0,13.0,66.0,94.0,475.0,Indiana Pacers
653,28947,2021,4805,Aaron Nesmith,SF,21.0,1,NBA,BOS,46.0,...,28.0,99.0,127.0,23.0,15.0,9.0,23.0,87.0,218.0,Boston Celtics


In [285]:
#Re-order columns for players_totals_filtered_df
player_totals_filtered_df = player_totals_filtered_df[['season_id', 'season', 'player_id', 'player', 'position', 'age',
       'experience', 'league', 'team', 'abbreviation', 'games', 'games_started',
       'minutes_played', 'fieldgoals', 'fieldgoals_attemped',
       'three_points_made', 'three_point_attempts', 'two_point_made',
       'two_point_attempts', 'freethrows_made', 'freethrows_attempted',
       'offensive_rebounds', 'defensive_rebounds', 'total_rebounds', 'assists',
       'steals', 'blocks', 'turnovers', 'personal_fouls', 'points']]

In [286]:
#View result
player_totals_filtered_df.head()

Unnamed: 0,season_id,season,player_id,player,position,age,experience,league,team,abbreviation,...,freethrows_attempted,offensive_rebounds,defensive_rebounds,total_rebounds,assists,steals,blocks,turnovers,personal_fouls,points
649,28943,2021,4219,Aaron Gordon,PF,25.0,7,NBA,Total,TOT,...,149.0,77.0,207.0,284.0,161.0,33.0,34.0,97.0,89.0,618.0
650,28944,2021,4219,Aaron Gordon,PF,25.0,7,NBA,Orlando Magic,ORL,...,105.0,39.0,127.0,166.0,105.0,16.0,20.0,67.0,49.0,364.0
651,28945,2021,4219,Aaron Gordon,PF,25.0,7,NBA,Denver Nuggets,DEN,...,44.0,38.0,80.0,118.0,56.0,17.0,14.0,30.0,40.0,254.0
652,28946,2021,4582,Aaron Holiday,PG,24.0,3,NBA,Indiana Pacers,IND,...,83.0,15.0,74.0,89.0,123.0,46.0,13.0,66.0,94.0,475.0
653,28947,2021,4805,Aaron Nesmith,SF,21.0,1,NBA,Boston Celtics,BOS,...,28.0,28.0,99.0,127.0,23.0,15.0,9.0,23.0,87.0,218.0


In [287]:
#View team summary dataset
team_summary_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1783 entries, 0 to 1782
Data columns (total 31 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   season                                   1783 non-null   int64  
 1   league                                   1783 non-null   object 
 2   team                                     1783 non-null   object 
 3   abbreviation                             1682 non-null   object 
 4   playoffs                                 1783 non-null   bool   
 5   age                                      1719 non-null   float64
 6   wins                                     1697 non-null   float64
 7   losses                                   1697 non-null   float64
 8   pythagorean_wins                         1782 non-null   float64
 9   pythagorean_losses                       1782 non-null   float64
 10  margin_of_victory                        1782 no

In [288]:
#Drop unnecessary columns from team_summary_df
team_summary_filtered_df = team_summary_df[['season',
                                   'league',
                                   'team',
                                   'strength_of_schedule',
                                   'playoffs']]
team_summary_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1783 entries, 0 to 1782
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   season                1783 non-null   int64  
 1   league                1783 non-null   object 
 2   team                  1783 non-null   object 
 3   strength_of_schedule  1782 non-null   float64
 4   playoffs              1783 non-null   bool   
dtypes: bool(1), float64(1), int64(1), object(2)
memory usage: 57.6+ KB


In [289]:
#Filter team_summary_df to include years 1979-2021
team_summary_filtered_df = team_summary_filtered_df.loc[(team_summary_filtered_df['season'] < 2022) & 
                                   (team_summary_filtered_df['season'] > 1978)]

In [290]:
#View results
team_summary_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1229 entries, 31 to 1259
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   season                1229 non-null   int64  
 1   league                1229 non-null   object 
 2   team                  1229 non-null   object 
 3   strength_of_schedule  1229 non-null   float64
 4   playoffs              1229 non-null   bool   
dtypes: bool(1), float64(1), int64(1), object(2)
memory usage: 49.2+ KB


In [291]:
#Drop all rows that contain 'league Average' under team column
team_summary_filtered_df = team_summary_filtered_df.loc[(
    team_summary_filtered_df['team'] != 'League Average')]
team_summary_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1186 entries, 31 to 1258
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   season                1186 non-null   int64  
 1   league                1186 non-null   object 
 2   team                  1186 non-null   object 
 3   strength_of_schedule  1186 non-null   float64
 4   playoffs              1186 non-null   bool   
dtypes: bool(1), float64(1), int64(1), object(2)
memory usage: 47.5+ KB


In [292]:
#View team_summary_filtered_df
team_summary_filtered_df.head()

Unnamed: 0,season,league,team,strength_of_schedule,playoffs
31,2021,NBA,Chicago Bulls,-0.05,False
32,2021,NBA,Charlotte Hornets,-0.01,False
33,2021,NBA,Cleveland Cavaliers,0.25,False
34,2021,NBA,Detroit Pistons,0.09,False
35,2021,NBA,Golden State Warriors,0.04,False


In [293]:
#View the advanced_df dataset
advanced_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30296 entries, 0 to 30295
Data columns (total 32 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   season_id                     30296 non-null  int64  
 1   season                        30296 non-null  int64  
 2   player_id                     30296 non-null  int64  
 3   player                        30296 non-null  object 
 4   birth_year                    2867 non-null   float64
 5   position                      30296 non-null  object 
 6   age                           30272 non-null  float64
 7   experience                    30296 non-null  int64  
 8   league                        30296 non-null  object 
 9   team                          30296 non-null  object 
 10  games                         30267 non-null  float64
 11  minutes_played                29162 non-null  float64
 12  player_efficiency_rating      29114 non-null  float64
 13  t

In [294]:
#filter the advanced_df to only include years 1979-2021
advanced_filtered_df =  advanced_df.loc[(advanced_df['season'] < 2022) & 
                                   (advanced_df['season'] > 1978)]
advanced_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21999 entries, 649 to 22647
Data columns (total 32 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   season_id                     21999 non-null  int64  
 1   season                        21999 non-null  int64  
 2   player_id                     21999 non-null  int64  
 3   player                        21999 non-null  object 
 4   birth_year                    682 non-null    float64
 5   position                      21999 non-null  object 
 6   age                           21999 non-null  float64
 7   experience                    21999 non-null  int64  
 8   league                        21999 non-null  object 
 9   team                          21999 non-null  object 
 10  games                         21999 non-null  float64
 11  minutes_played                21999 non-null  float64
 12  player_efficiency_rating      21994 non-null  float64
 13 

In [295]:
#re-generate advanced_filtered_df containing columns of interest
advanced_filtered_df = advanced_filtered_df[['season', 'team', 'player', 'player_efficiency_rating',
                                             'true_shooting_percentage', 'three_point_attempt_rate',
                                            'free_throw_attempt_rate', 'offensive_rebound_percentage',
                                            'defensive_rebound_percentage', 'total_rebound_percentage',
                                            'assist_percentage', 'steal_percentage', 'block_percentage',
                                            'usage_percentage', 'offensive_win_shares', 'defensive_win_shares',
                                            'win_shares', 'win_shares_per_48_min', 'offensive_box_plus/minus',
                                            'defensive_box_plus/minus', 'box_plus/minus', 
                                             'value_over_replacement_plyr']]
#Drop rows containing nulls
advanced_filtered_df.dropna(inplace = True)
advanced_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21553 entries, 649 to 22303
Data columns (total 22 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   season                        21553 non-null  int64  
 1   team                          21553 non-null  object 
 2   player                        21553 non-null  object 
 3   player_efficiency_rating      21553 non-null  float64
 4   true_shooting_percentage      21553 non-null  float64
 5   three_point_attempt_rate      21553 non-null  float64
 6   free_throw_attempt_rate       21553 non-null  float64
 7   offensive_rebound_percentage  21553 non-null  float64
 8   defensive_rebound_percentage  21553 non-null  float64
 9   total_rebound_percentage      21553 non-null  float64
 10  assist_percentage             21553 non-null  float64
 11  steal_percentage              21553 non-null  float64
 12  block_percentage              21553 non-null  float64
 13 

In [297]:
#Update advanced_filtered_df to include full_team_name and updated_abbrev 
advanced_filtered_df.rename(columns={"team": "abbreviation"}, inplace=True)
advanced_filtered_df['team'] = [updated_abbrev_team_name_dict[i] for i in advanced_filtered_df[
    'abbreviation']]
advanced_filtered_df.head()

Unnamed: 0,season,abbreviation,player,player_efficiency_rating,true_shooting_percentage,three_point_attempt_rate,free_throw_attempt_rate,offensive_rebound_percentage,defensive_rebound_percentage,total_rebound_percentage,...,usage_percentage,offensive_win_shares,defensive_win_shares,win_shares,win_shares_per_48_min,offensive_box_plus/minus,defensive_box_plus/minus,box_plus/minus,value_over_replacement_plyr,team
649,2021,TOT,Aaron Gordon,14.5,0.547,0.353,0.299,6.0,16.4,11.1,...,20.7,0.7,1.2,1.9,0.066,0.2,-0.2,0.0,0.7,Total
650,2021,ORL,Aaron Gordon,14.9,0.537,0.382,0.358,5.3,18.5,11.7,...,23.8,0.0,0.6,0.6,0.037,0.6,0.0,0.6,0.5,Orlando Magic
651,2021,DEN,Aaron Gordon,14.1,0.564,0.311,0.214,6.7,13.9,10.3,...,17.2,0.7,0.6,1.3,0.099,-0.1,-0.5,-0.6,0.2,Denver Nuggets
652,2021,IND,Aaron Holiday,9.2,0.503,0.417,0.19,1.4,6.8,4.1,...,19.5,-0.6,0.8,0.2,0.009,-3.2,-1.0,-4.1,-0.6,Indiana Pacers
653,2021,BOS,Aaron Nesmith,9.4,0.573,0.607,0.157,4.6,16.6,10.6,...,13.7,0.5,0.6,1.1,0.076,-2.9,-0.4,-3.2,-0.2,Boston Celtics


In [298]:
#Re-order columns in advanced_df
advanced_filtered_df = advanced_filtered_df[['season','team', 'abbreviation', 'player', 'player_efficiency_rating',
       'true_shooting_percentage', 'three_point_attempt_rate',
       'free_throw_attempt_rate', 'offensive_rebound_percentage',
       'defensive_rebound_percentage', 'total_rebound_percentage',
       'assist_percentage', 'steal_percentage', 'block_percentage',
       'usage_percentage', 'offensive_win_shares', 'defensive_win_shares',
       'win_shares', 'win_shares_per_48_min', 'offensive_box_plus/minus',
       'defensive_box_plus/minus', 'box_plus/minus',
       'value_over_replacement_plyr' ]]

In [300]:
#View Result
advanced_filtered_df.head()

Unnamed: 0,season,team,abbreviation,player,player_efficiency_rating,true_shooting_percentage,three_point_attempt_rate,free_throw_attempt_rate,offensive_rebound_percentage,defensive_rebound_percentage,...,block_percentage,usage_percentage,offensive_win_shares,defensive_win_shares,win_shares,win_shares_per_48_min,offensive_box_plus/minus,defensive_box_plus/minus,box_plus/minus,value_over_replacement_plyr
649,2021,Total,TOT,Aaron Gordon,14.5,0.547,0.353,0.299,6.0,16.4,...,2.3,20.7,0.7,1.2,1.9,0.066,0.2,-0.2,0.0,0.7
650,2021,Orlando Magic,ORL,Aaron Gordon,14.9,0.537,0.382,0.358,5.3,18.5,...,2.5,23.8,0.0,0.6,0.6,0.037,0.6,0.0,0.6,0.5
651,2021,Denver Nuggets,DEN,Aaron Gordon,14.1,0.564,0.311,0.214,6.7,13.9,...,2.1,17.2,0.7,0.6,1.3,0.099,-0.1,-0.5,-0.6,0.2
652,2021,Indiana Pacers,IND,Aaron Holiday,9.2,0.503,0.417,0.19,1.4,6.8,...,0.9,19.5,-0.6,0.8,0.2,0.009,-3.2,-1.0,-4.1,-0.6
653,2021,Boston Celtics,BOS,Aaron Nesmith,9.4,0.573,0.607,0.157,4.6,16.6,...,1.3,13.7,0.5,0.6,1.1,0.076,-2.9,-0.4,-3.2,-0.2


In [301]:
#Add additional columns to distinguish players that may have played on two different teams in a season
team_list = list(advanced_filtered_df['abbreviation'])
player_list = list(advanced_filtered_df['player'])
left_due_to_trade_during_season = []
one_team_during_season=[]
joined_due_to_trade_during_season = []
n=0
while n < len(team_list):
    if team_list[n] == 'TOT':
        left_due_to_trade_during_season.append(0)
        one_team_during_season.append(0)
        joined_due_to_trade_during_season.append(0)
        n+=1
        if player_list[n] == player_list[n-1]:
            left_due_to_trade_during_season.append(1)
            joined_due_to_trade_during_season.append(0)
            one_team_during_season.append(0)
            n+=1
            if player_list[n] == player_list[n-1] and player_list[n] == player_list[n+1]:
                left_due_to_trade_during_season.append(1)
                joined_due_to_trade_during_season.append(1)
                one_team_during_season.append(0)
                n+=1 
            elif player_list[n] == player_list[n-1] and player_list[n] != player_list[n+1]:
                left_due_to_trade_during_season.append(0)
                joined_due_to_trade_during_season.append(1)
                one_team_during_season.append(0)
                n+=1
    else:
        left_due_to_trade_during_season.append(0)
        joined_due_to_trade_during_season.append(0)
        one_team_during_season.append(1)
        n+=1

advanced_filtered_df['one_team_during_season'] = one_team_during_season
advanced_filtered_df['left_due_to_trade_during_season'] = left_due_to_trade_during_season
advanced_filtered_df['joined_due_to_trade_during_season'] = joined_due_to_trade_during_season



   
 

In [302]:
#Re-order columns in advanced_filtered_df
advanced_filtered_df = advanced_filtered_df[['season', 'player','team', 'abbreviation','one_team_during_season',
       'left_due_to_trade_during_season', 'joined_due_to_trade_during_season', 'player_efficiency_rating',
       'true_shooting_percentage', 'three_point_attempt_rate',
       'free_throw_attempt_rate', 'offensive_rebound_percentage',
       'defensive_rebound_percentage', 'total_rebound_percentage',
       'assist_percentage', 'steal_percentage', 'block_percentage',
       'usage_percentage', 'offensive_win_shares', 'defensive_win_shares',
       'win_shares', 'win_shares_per_48_min', 'offensive_box_plus/minus',
       'defensive_box_plus/minus', 'box_plus/minus',
       'value_over_replacement_plyr']]

In [303]:
#View result
advanced_filtered_df.head()

Unnamed: 0,season,player,team,abbreviation,one_team_during_season,left_due_to_trade_during_season,joined_due_to_trade_during_season,player_efficiency_rating,true_shooting_percentage,three_point_attempt_rate,...,block_percentage,usage_percentage,offensive_win_shares,defensive_win_shares,win_shares,win_shares_per_48_min,offensive_box_plus/minus,defensive_box_plus/minus,box_plus/minus,value_over_replacement_plyr
649,2021,Aaron Gordon,Total,TOT,0,0,0,14.5,0.547,0.353,...,2.3,20.7,0.7,1.2,1.9,0.066,0.2,-0.2,0.0,0.7
650,2021,Aaron Gordon,Orlando Magic,ORL,0,1,0,14.9,0.537,0.382,...,2.5,23.8,0.0,0.6,0.6,0.037,0.6,0.0,0.6,0.5
651,2021,Aaron Gordon,Denver Nuggets,DEN,0,0,1,14.1,0.564,0.311,...,2.1,17.2,0.7,0.6,1.3,0.099,-0.1,-0.5,-0.6,0.2
652,2021,Aaron Holiday,Indiana Pacers,IND,1,0,0,9.2,0.503,0.417,...,0.9,19.5,-0.6,0.8,0.2,0.009,-3.2,-1.0,-4.1,-0.6
653,2021,Aaron Nesmith,Boston Celtics,BOS,1,0,0,9.4,0.573,0.607,...,1.3,13.7,0.5,0.6,1.1,0.076,-2.9,-0.4,-3.2,-0.2


In [304]:
#Delete row containing 'TOT' under abbreviation column in advanced_filtered_df and player_totals_filtered_df
advanced_filtered_df = advanced_filtered_df.loc[(advanced_filtered_df['abbreviation'] != 'TOT')]
player_totals_filtered_df = player_totals_filtered_df.loc[(player_totals_filtered_df['abbreviation'] != 'TOT')]

In [305]:
#Reset the indices of all the resulting dataframes 
advanced_filtered_df.reset_index(inplace=True)
team_summary_filtered_df.reset_index(inplace=True)
player_totals_filtered_df.reset_index(inplace=True)

In [306]:
#Drop index column
advanced_filtered_df = advanced_filtered_df.drop(['index'],axis=1)
team_summary_filtered_df = team_summary_filtered_df.drop(['index'],axis=1)
player_totals_filtered_df = player_totals_filtered_df.drop(['index'],axis=1)

In [307]:
#View dfs:
advanced_filtered_df.head()

Unnamed: 0,season,player,team,abbreviation,one_team_during_season,left_due_to_trade_during_season,joined_due_to_trade_during_season,player_efficiency_rating,true_shooting_percentage,three_point_attempt_rate,...,block_percentage,usage_percentage,offensive_win_shares,defensive_win_shares,win_shares,win_shares_per_48_min,offensive_box_plus/minus,defensive_box_plus/minus,box_plus/minus,value_over_replacement_plyr
0,2021,Aaron Gordon,Orlando Magic,ORL,0,1,0,14.9,0.537,0.382,...,2.5,23.8,0.0,0.6,0.6,0.037,0.6,0.0,0.6,0.5
1,2021,Aaron Gordon,Denver Nuggets,DEN,0,0,1,14.1,0.564,0.311,...,2.1,17.2,0.7,0.6,1.3,0.099,-0.1,-0.5,-0.6,0.2
2,2021,Aaron Holiday,Indiana Pacers,IND,1,0,0,9.2,0.503,0.417,...,0.9,19.5,-0.6,0.8,0.2,0.009,-3.2,-1.0,-4.1,-0.6
3,2021,Aaron Nesmith,Boston Celtics,BOS,1,0,0,9.4,0.573,0.607,...,1.3,13.7,0.5,0.6,1.1,0.076,-2.9,-0.4,-3.2,-0.2
4,2021,Abdel Nader,Phoenix Suns,PHO,1,0,0,13.4,0.605,0.371,...,2.3,19.0,0.3,0.4,0.7,0.095,-1.2,0.8,-0.4,0.2


In [308]:
team_summary_filtered_df.head()

Unnamed: 0,season,league,team,strength_of_schedule,playoffs
0,2021,NBA,Chicago Bulls,-0.05,False
1,2021,NBA,Charlotte Hornets,-0.01,False
2,2021,NBA,Cleveland Cavaliers,0.25,False
3,2021,NBA,Detroit Pistons,0.09,False
4,2021,NBA,Golden State Warriors,0.04,False


In [310]:
#Create a column of unique values for each player, season and team:
player_totals_filtered_df['season_player_team'] = (player_totals_filtered_df['player'] + '_' + 
player_totals_filtered_df['team'] +'_'+ player_totals_filtered_df['season'].map(str))

   
advanced_filtered_df['season_player_team'] = (advanced_filtered_df['player'] + '_' + 
advanced_filtered_df['team'] +'_'+ advanced_filtered_df['season'].map(str))

In [311]:
player_totals_filtered_df.head()

Unnamed: 0,season_id,season,player_id,player,position,age,experience,league,team,abbreviation,...,offensive_rebounds,defensive_rebounds,total_rebounds,assists,steals,blocks,turnovers,personal_fouls,points,season_player_team
0,28944,2021,4219,Aaron Gordon,PF,25.0,7,NBA,Orlando Magic,ORL,...,39.0,127.0,166.0,105.0,16.0,20.0,67.0,49.0,364.0,Aaron Gordon_Orlando Magic_2021
1,28945,2021,4219,Aaron Gordon,PF,25.0,7,NBA,Denver Nuggets,DEN,...,38.0,80.0,118.0,56.0,17.0,14.0,30.0,40.0,254.0,Aaron Gordon_Denver Nuggets_2021
2,28946,2021,4582,Aaron Holiday,PG,24.0,3,NBA,Indiana Pacers,IND,...,15.0,74.0,89.0,123.0,46.0,13.0,66.0,94.0,475.0,Aaron Holiday_Indiana Pacers_2021
3,28947,2021,4805,Aaron Nesmith,SF,21.0,1,NBA,Boston Celtics,BOS,...,28.0,99.0,127.0,23.0,15.0,9.0,23.0,87.0,218.0,Aaron Nesmith_Boston Celtics_2021
4,28948,2021,4463,Abdel Nader,SF,27.0,4,NBA,Phoenix Suns,PHO,...,7.0,55.0,62.0,19.0,10.0,9.0,19.0,34.0,160.0,Abdel Nader_Phoenix Suns_2021


In [312]:
advanced_filtered_df.head()

Unnamed: 0,season,player,team,abbreviation,one_team_during_season,left_due_to_trade_during_season,joined_due_to_trade_during_season,player_efficiency_rating,true_shooting_percentage,three_point_attempt_rate,...,usage_percentage,offensive_win_shares,defensive_win_shares,win_shares,win_shares_per_48_min,offensive_box_plus/minus,defensive_box_plus/minus,box_plus/minus,value_over_replacement_plyr,season_player_team
0,2021,Aaron Gordon,Orlando Magic,ORL,0,1,0,14.9,0.537,0.382,...,23.8,0.0,0.6,0.6,0.037,0.6,0.0,0.6,0.5,Aaron Gordon_Orlando Magic_2021
1,2021,Aaron Gordon,Denver Nuggets,DEN,0,0,1,14.1,0.564,0.311,...,17.2,0.7,0.6,1.3,0.099,-0.1,-0.5,-0.6,0.2,Aaron Gordon_Denver Nuggets_2021
2,2021,Aaron Holiday,Indiana Pacers,IND,1,0,0,9.2,0.503,0.417,...,19.5,-0.6,0.8,0.2,0.009,-3.2,-1.0,-4.1,-0.6,Aaron Holiday_Indiana Pacers_2021
3,2021,Aaron Nesmith,Boston Celtics,BOS,1,0,0,9.4,0.573,0.607,...,13.7,0.5,0.6,1.1,0.076,-2.9,-0.4,-3.2,-0.2,Aaron Nesmith_Boston Celtics_2021
4,2021,Abdel Nader,Phoenix Suns,PHO,1,0,0,13.4,0.605,0.371,...,19.0,0.3,0.4,0.7,0.095,-1.2,0.8,-0.4,0.2,Abdel Nader_Phoenix Suns_2021


In [313]:
#Re-order columns of both dfs:
advanced_filtered_df = advanced_filtered_df[['season_player_team','season', 'player', 'team', 'abbreviation', 'one_team_during_season',
       'left_due_to_trade_during_season', 'joined_due_to_trade_during_season',
       'player_efficiency_rating', 'true_shooting_percentage',
       'three_point_attempt_rate', 'free_throw_attempt_rate',
       'offensive_rebound_percentage', 'defensive_rebound_percentage',
       'total_rebound_percentage', 'assist_percentage', 'steal_percentage',
       'block_percentage', 'usage_percentage', 'offensive_win_shares',
       'defensive_win_shares', 'win_shares', 'win_shares_per_48_min',
       'offensive_box_plus/minus', 'defensive_box_plus/minus',
       'box_plus/minus', 'value_over_replacement_plyr']]
player_totals_filtered_df = player_totals_filtered_df[['season_player_team','season_id', 'season', 'player_id', 'player', 'position', 'age',
       'experience', 'league', 'team', 'abbreviation', 'games',
       'games_started', 'minutes_played', 'fieldgoals', 'fieldgoals_attemped',
       'three_points_made', 'three_point_attempts', 'two_point_made',
       'two_point_attempts', 'freethrows_made', 'freethrows_attempted',
       'offensive_rebounds', 'defensive_rebounds', 'total_rebounds', 'assists',
       'steals', 'blocks', 'turnovers', 'personal_fouls', 'points'
       ]]

In [314]:
#view results:
advanced_filtered_df.head()

Unnamed: 0,season_player_team,season,player,team,abbreviation,one_team_during_season,left_due_to_trade_during_season,joined_due_to_trade_during_season,player_efficiency_rating,true_shooting_percentage,...,block_percentage,usage_percentage,offensive_win_shares,defensive_win_shares,win_shares,win_shares_per_48_min,offensive_box_plus/minus,defensive_box_plus/minus,box_plus/minus,value_over_replacement_plyr
0,Aaron Gordon_Orlando Magic_2021,2021,Aaron Gordon,Orlando Magic,ORL,0,1,0,14.9,0.537,...,2.5,23.8,0.0,0.6,0.6,0.037,0.6,0.0,0.6,0.5
1,Aaron Gordon_Denver Nuggets_2021,2021,Aaron Gordon,Denver Nuggets,DEN,0,0,1,14.1,0.564,...,2.1,17.2,0.7,0.6,1.3,0.099,-0.1,-0.5,-0.6,0.2
2,Aaron Holiday_Indiana Pacers_2021,2021,Aaron Holiday,Indiana Pacers,IND,1,0,0,9.2,0.503,...,0.9,19.5,-0.6,0.8,0.2,0.009,-3.2,-1.0,-4.1,-0.6
3,Aaron Nesmith_Boston Celtics_2021,2021,Aaron Nesmith,Boston Celtics,BOS,1,0,0,9.4,0.573,...,1.3,13.7,0.5,0.6,1.1,0.076,-2.9,-0.4,-3.2,-0.2
4,Abdel Nader_Phoenix Suns_2021,2021,Abdel Nader,Phoenix Suns,PHO,1,0,0,13.4,0.605,...,2.3,19.0,0.3,0.4,0.7,0.095,-1.2,0.8,-0.4,0.2


In [315]:
player_totals_filtered_df.head()

Unnamed: 0,season_player_team,season_id,season,player_id,player,position,age,experience,league,team,...,freethrows_attempted,offensive_rebounds,defensive_rebounds,total_rebounds,assists,steals,blocks,turnovers,personal_fouls,points
0,Aaron Gordon_Orlando Magic_2021,28944,2021,4219,Aaron Gordon,PF,25.0,7,NBA,Orlando Magic,...,105.0,39.0,127.0,166.0,105.0,16.0,20.0,67.0,49.0,364.0
1,Aaron Gordon_Denver Nuggets_2021,28945,2021,4219,Aaron Gordon,PF,25.0,7,NBA,Denver Nuggets,...,44.0,38.0,80.0,118.0,56.0,17.0,14.0,30.0,40.0,254.0
2,Aaron Holiday_Indiana Pacers_2021,28946,2021,4582,Aaron Holiday,PG,24.0,3,NBA,Indiana Pacers,...,83.0,15.0,74.0,89.0,123.0,46.0,13.0,66.0,94.0,475.0
3,Aaron Nesmith_Boston Celtics_2021,28947,2021,4805,Aaron Nesmith,SF,21.0,1,NBA,Boston Celtics,...,28.0,28.0,99.0,127.0,23.0,15.0,9.0,23.0,87.0,218.0
4,Abdel Nader_Phoenix Suns_2021,28948,2021,4463,Abdel Nader,SF,27.0,4,NBA,Phoenix Suns,...,37.0,7.0,55.0,62.0,19.0,10.0,9.0,19.0,34.0,160.0


In [316]:
team_summary_filtered_df.head()

Unnamed: 0,season,league,team,strength_of_schedule,playoffs
0,2021,NBA,Chicago Bulls,-0.05,False
1,2021,NBA,Charlotte Hornets,-0.01,False
2,2021,NBA,Cleveland Cavaliers,0.25,False
3,2021,NBA,Detroit Pistons,0.09,False
4,2021,NBA,Golden State Warriors,0.04,False


In [318]:
#Write cleaned-up datasets to file
advanced_filtered_df.to_csv("resources/advanced_stats_filtered.csv", sep=',')
team_summary_filtered_df.to_csv("resources/team_summary_filtered.csv", sep=',')
player_totals_filtered_df.to_csv("resources/player_totals_filtered.csv", sep=',')

In [319]:
#View the dfs in preparation for merging
player_totals_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19122 entries, 0 to 19121
Data columns (total 31 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   season_player_team    19122 non-null  object 
 1   season_id             19122 non-null  int64  
 2   season                19122 non-null  int64  
 3   player_id             19122 non-null  int64  
 4   player                19122 non-null  object 
 5   position              19122 non-null  object 
 6   age                   19122 non-null  float64
 7   experience            19122 non-null  int64  
 8   league                19122 non-null  object 
 9   team                  19122 non-null  object 
 10  abbreviation          19122 non-null  object 
 11  games                 19122 non-null  float64
 12  games_started         19122 non-null  float64
 13  minutes_played        19122 non-null  float64
 14  fieldgoals            19122 non-null  float64
 15  fieldgoals_attemped

In [320]:
advanced_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19633 entries, 0 to 19632
Data columns (total 27 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   season_player_team                 19633 non-null  object 
 1   season                             19633 non-null  int64  
 2   player                             19633 non-null  object 
 3   team                               19633 non-null  object 
 4   abbreviation                       19633 non-null  object 
 5   one_team_during_season             19633 non-null  int64  
 6   left_due_to_trade_during_season    19633 non-null  int64  
 7   joined_due_to_trade_during_season  19633 non-null  int64  
 8   player_efficiency_rating           19633 non-null  float64
 9   true_shooting_percentage           19633 non-null  float64
 10  three_point_attempt_rate           19633 non-null  float64
 11  free_throw_attempt_rate            19633 non-null  flo

In [321]:
#merge the player_totals and advanced filtered dataframes
merged_df = advanced_filtered_df.merge(player_totals_filtered_df, 
                                  left_on='season_player_team', right_on='season_player_team', how='inner')

In [322]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19023 entries, 0 to 19022
Data columns (total 57 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   season_player_team                 19023 non-null  object 
 1   season_x                           19023 non-null  int64  
 2   player_x                           19023 non-null  object 
 3   team_x                             19023 non-null  object 
 4   abbreviation_x                     19023 non-null  object 
 5   one_team_during_season             19023 non-null  int64  
 6   left_due_to_trade_during_season    19023 non-null  int64  
 7   joined_due_to_trade_during_season  19023 non-null  int64  
 8   player_efficiency_rating           19023 non-null  float64
 9   true_shooting_percentage           19023 non-null  float64
 10  three_point_attempt_rate           19023 non-null  float64
 11  free_throw_attempt_rate            19023 non-null  flo

In [323]:
#View result 
merged_df.head()

Unnamed: 0,season_player_team,season_x,player_x,team_x,abbreviation_x,one_team_during_season,left_due_to_trade_during_season,joined_due_to_trade_during_season,player_efficiency_rating,true_shooting_percentage,...,freethrows_attempted,offensive_rebounds,defensive_rebounds,total_rebounds,assists,steals,blocks,turnovers,personal_fouls,points
0,Aaron Gordon_Orlando Magic_2021,2021,Aaron Gordon,Orlando Magic,ORL,0,1,0,14.9,0.537,...,105.0,39.0,127.0,166.0,105.0,16.0,20.0,67.0,49.0,364.0
1,Aaron Gordon_Denver Nuggets_2021,2021,Aaron Gordon,Denver Nuggets,DEN,0,0,1,14.1,0.564,...,44.0,38.0,80.0,118.0,56.0,17.0,14.0,30.0,40.0,254.0
2,Aaron Holiday_Indiana Pacers_2021,2021,Aaron Holiday,Indiana Pacers,IND,1,0,0,9.2,0.503,...,83.0,15.0,74.0,89.0,123.0,46.0,13.0,66.0,94.0,475.0
3,Aaron Nesmith_Boston Celtics_2021,2021,Aaron Nesmith,Boston Celtics,BOS,1,0,0,9.4,0.573,...,28.0,28.0,99.0,127.0,23.0,15.0,9.0,23.0,87.0,218.0
4,Abdel Nader_Phoenix Suns_2021,2021,Abdel Nader,Phoenix Suns,PHO,1,0,0,13.4,0.605,...,37.0,7.0,55.0,62.0,19.0,10.0,9.0,19.0,34.0,160.0


In [324]:
#several teams are inconsistent between dataframes due to name changes that occurred:
def change_team_name(team, year):
    if team == 'Washington Bullets' or team =='Washington Wizards':
        if year > 1974 and year < 1997:
            return 'Washington Bullets'
        elif year >= 1997:
            return 'Washington Wizards'
    elif team == 'Vancouver Grizzlies' or team == 'Memphis Grizzlies':
        if year > 1994 and year < 2001:
            return 'Vancouver Grizzlies'
        elif year >= 2001:
            return 'Memphis Grizzlies'
    elif team == 'Kansas City Kings' or team == 'Sacramento Kings':
        if year > 1975 and year <= 1984:
            return 'Kansas City Kings'
        elif year > 1984:
            return 'Sacramento Kings'
    elif team == 'San Diego Clippers' or team  == 'Los Angeles Clippers':
        if year >= 1978 and year < 1984:
            return 'San Diego Clippers'
        elif year >= 1984:
            return 'Los Angeles Clippers'
    elif team == 'New Orleans Jazz' or team == 'Utah Jazz':
        if year >=1974 and year < 1979:
            return 'New Orleans Jazz'
        elif year >=1979:
            return 'Utah Jazz'
    else:
        return team
#Add a column with the corrected teams referenced in the function above to team_summary_filtered_df:
team_summary_filtered_df['corrected_team'] = team_summary_filtered_df.apply(lambda x: change_team_name(
x.team, x.season), axis=1)
#Add a column with corrected teams referenced in the function above to merged_df
merged_df['corrected_team'] = merged_df.apply(lambda x: change_team_name(x.team_x, x.season_x), axis=1)

In [325]:
#Before merging team_summary into this df, appropriate columns of unique values need to be created again:
team_summary_filtered_df['season_team'] = (team_summary_filtered_df['corrected_team'] + '_' 
+ team_summary_filtered_df['season'].map(str))

team_summary_filtered_df.head()


Unnamed: 0,season,league,team,strength_of_schedule,playoffs,corrected_team,season_team
0,2021,NBA,Chicago Bulls,-0.05,False,Chicago Bulls,Chicago Bulls_2021
1,2021,NBA,Charlotte Hornets,-0.01,False,Charlotte Hornets,Charlotte Hornets_2021
2,2021,NBA,Cleveland Cavaliers,0.25,False,Cleveland Cavaliers,Cleveland Cavaliers_2021
3,2021,NBA,Detroit Pistons,0.09,False,Detroit Pistons,Detroit Pistons_2021
4,2021,NBA,Golden State Warriors,0.04,False,Golden State Warriors,Golden State Warriors_2021


In [326]:
merged_df['season_team'] = (merged_df['corrected_team'] + '_' 
+ merged_df['season_x'].map(str))
merged_df.head()

Unnamed: 0,season_player_team,season_x,player_x,team_x,abbreviation_x,one_team_during_season,left_due_to_trade_during_season,joined_due_to_trade_during_season,player_efficiency_rating,true_shooting_percentage,...,defensive_rebounds,total_rebounds,assists,steals,blocks,turnovers,personal_fouls,points,corrected_team,season_team
0,Aaron Gordon_Orlando Magic_2021,2021,Aaron Gordon,Orlando Magic,ORL,0,1,0,14.9,0.537,...,127.0,166.0,105.0,16.0,20.0,67.0,49.0,364.0,Orlando Magic,Orlando Magic_2021
1,Aaron Gordon_Denver Nuggets_2021,2021,Aaron Gordon,Denver Nuggets,DEN,0,0,1,14.1,0.564,...,80.0,118.0,56.0,17.0,14.0,30.0,40.0,254.0,Denver Nuggets,Denver Nuggets_2021
2,Aaron Holiday_Indiana Pacers_2021,2021,Aaron Holiday,Indiana Pacers,IND,1,0,0,9.2,0.503,...,74.0,89.0,123.0,46.0,13.0,66.0,94.0,475.0,Indiana Pacers,Indiana Pacers_2021
3,Aaron Nesmith_Boston Celtics_2021,2021,Aaron Nesmith,Boston Celtics,BOS,1,0,0,9.4,0.573,...,99.0,127.0,23.0,15.0,9.0,23.0,87.0,218.0,Boston Celtics,Boston Celtics_2021
4,Abdel Nader_Phoenix Suns_2021,2021,Abdel Nader,Phoenix Suns,PHO,1,0,0,13.4,0.605,...,55.0,62.0,19.0,10.0,9.0,19.0,34.0,160.0,Phoenix Suns,Phoenix Suns_2021


In [327]:
team_summary_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1186 entries, 0 to 1185
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   season                1186 non-null   int64  
 1   league                1186 non-null   object 
 2   team                  1186 non-null   object 
 3   strength_of_schedule  1186 non-null   float64
 4   playoffs              1186 non-null   bool   
 5   corrected_team        1186 non-null   object 
 6   season_team           1186 non-null   object 
dtypes: bool(1), float64(1), int64(1), object(4)
memory usage: 56.9+ KB


In [351]:
final_merged_df = merged_df.merge(team_summary_filtered_df, 
                                  left_on='season_team', right_on='season_team', how='left')
final_merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19023 entries, 0 to 19022
Data columns (total 65 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   season_player_team                 19023 non-null  object 
 1   season_x                           19023 non-null  int64  
 2   player_x                           19023 non-null  object 
 3   team_x                             19023 non-null  object 
 4   abbreviation_x                     19023 non-null  object 
 5   one_team_during_season             19023 non-null  int64  
 6   left_due_to_trade_during_season    19023 non-null  int64  
 7   joined_due_to_trade_during_season  19023 non-null  int64  
 8   player_efficiency_rating           19023 non-null  float64
 9   true_shooting_percentage           19023 non-null  float64
 10  three_point_attempt_rate           19023 non-null  float64
 11  free_throw_attempt_rate            19023 non-null  flo

In [354]:
#Drop all the extra columns including the unique ones used for merging:
final_merged_df = final_merged_df.drop([
    'season_y',
    'player_y',
    'team_y',
    'abbreviation_y',
    'league_y',
    'corrected_team_y',
    'season_player_team',
    'season_team', 
    'season_x'
    ], axis=1)

In [355]:
#Re-name columns:
final_merged_df.rename(columns={
                                     "player_x": "player",
                                      "team_x": "team",
                                       "abbreviation_x": "abbreviation",
                                       "league_x": "league",
                                       "corrected_team_x": "corrected_team"}, inplace=True)

final_merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19023 entries, 0 to 19022
Data columns (total 56 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   player                             19023 non-null  object 
 1   team                               19023 non-null  object 
 2   abbreviation                       19023 non-null  object 
 3   one_team_during_season             19023 non-null  int64  
 4   left_due_to_trade_during_season    19023 non-null  int64  
 5   joined_due_to_trade_during_season  19023 non-null  int64  
 6   player_efficiency_rating           19023 non-null  float64
 7   true_shooting_percentage           19023 non-null  float64
 8   three_point_attempt_rate           19023 non-null  float64
 9   free_throw_attempt_rate            19023 non-null  float64
 10  offensive_rebound_percentage       19023 non-null  float64
 11  defensive_rebound_percentage       19023 non-null  flo

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [356]:
#Drop 'team' column and change 'corrected_team' to 'team'
final_merged_df = final_merged_df.drop([
    'team'
    ], axis=1)
final_merged_df.rename(columns={"corrected_team": "team"}, inplace=True)

In [357]:
#View result
final_merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19023 entries, 0 to 19022
Data columns (total 54 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   player                             19023 non-null  object 
 1   abbreviation                       19023 non-null  object 
 2   one_team_during_season             19023 non-null  int64  
 3   left_due_to_trade_during_season    19023 non-null  int64  
 4   joined_due_to_trade_during_season  19023 non-null  int64  
 5   player_efficiency_rating           19023 non-null  float64
 6   true_shooting_percentage           19023 non-null  float64
 7   three_point_attempt_rate           19023 non-null  float64
 8   free_throw_attempt_rate            19023 non-null  float64
 9   offensive_rebound_percentage       19023 non-null  float64
 10  defensive_rebound_percentage       19023 non-null  float64
 11  total_rebound_percentage           19023 non-null  flo

In [358]:
#Re-arrange_columns:
final_merged_df = final_merged_df[['season',
 'player',
 'team',
 'abbreviation',
 'one_team_during_season',
 'left_due_to_trade_during_season',
 'joined_due_to_trade_during_season',
 'player_efficiency_rating',
 'true_shooting_percentage',
 'three_point_attempt_rate',
 'free_throw_attempt_rate',
 'offensive_rebound_percentage',
 'defensive_rebound_percentage',
 'total_rebound_percentage',
 'assist_percentage',
 'steal_percentage',
 'block_percentage',
 'usage_percentage',
 'offensive_win_shares',
 'defensive_win_shares',
 'win_shares',
 'win_shares_per_48_min',
 'offensive_box_plus/minus',
 'defensive_box_plus/minus',
 'box_plus/minus',
 'value_over_replacement_plyr',
 'season_id',
 'player_id',
 'position',
 'age',
 'experience',
 'league',
 'games',
 'games_started',
 'minutes_played',
 'fieldgoals',
 'fieldgoals_attemped',
 'three_points_made',
 'three_point_attempts',
 'two_point_made',
 'two_point_attempts',
 'freethrows_made',
 'freethrows_attempted',
 'offensive_rebounds',
 'defensive_rebounds',
 'total_rebounds',
 'assists',
 'steals',
 'blocks',
 'turnovers',
 'personal_fouls',
 'points',
 
 'strength_of_schedule',
 'playoffs']]

In [359]:
#View result:
final_merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19023 entries, 0 to 19022
Data columns (total 54 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   season                             19023 non-null  int64  
 1   player                             19023 non-null  object 
 2   team                               19023 non-null  object 
 3   abbreviation                       19023 non-null  object 
 4   one_team_during_season             19023 non-null  int64  
 5   left_due_to_trade_during_season    19023 non-null  int64  
 6   joined_due_to_trade_during_season  19023 non-null  int64  
 7   player_efficiency_rating           19023 non-null  float64
 8   true_shooting_percentage           19023 non-null  float64
 9   three_point_attempt_rate           19023 non-null  float64
 10  free_throw_attempt_rate            19023 non-null  float64
 11  offensive_rebound_percentage       19023 non-null  flo

In [361]:
#Write to file:
final_merged_df.to_csv("resources/final_nba_player_dataset.csv", sep=',')

In [367]:
#upload tables to database postgres
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/nba_analysis"
engine = create_engine(db_string)
advanced_filtered_df.to_sql(name='advanced_data', if_exists='replace', con=engine)
team_summary_filtered_df.to_sql(name='team_data', if_exists='replace', con=engine)
player_totals_filtered_df.to_sql(name='player_data', if_exists='replace', con=engine)
final_merged_df.to_sql(name='final_nba_player_dataset', if_exists='replace', con=engine)

In [None]:
#Upload tables to AWS RDS
from getpass import getpass
password = getpass('enter password')
mode = "append"
jdbc_url="jdbc:postgresql://vineaws.cooqh3qenrpu.us-west-1.rds.amazonaws.com:5432/my_vine_db"
config = {"user":"postgres", 
          "password": password, 
          "driver":"org.postgresql.Driver"}