In [303]:
#Import dependencies
import json
import pandas as pd
import numpy as np
import re
from sqlalchemy import create_engine
import psycopg2
from config import db_password
import time


In [324]:
#Load the four raw CSV exports from resources/ into dataframes
(advanced_df,
 team_summary_df,
 player_totals_df,
 team_totals_df) = map(pd.read_csv, (
    "resources/advanced.csv",
    "resources/team_summaries.csv",
    "resources/player_totals.csv",
    "resources/team_totals.csv",
))

In [325]:
#Collect the column Index of each dataframe; these headers are the stat abbreviations
df_cols = [
    frame.columns
    for frame in (advanced_df, team_summary_df, player_totals_df, team_totals_df)
]

In [326]:
#The four dataframes gathered in one list so later cells can loop over them
dfs = [
    advanced_df,
    team_summary_df,
    player_totals_df,
    team_totals_df,
]

In [327]:
#Gather every distinct column header across the four dataframes,
#keeping the order in which each header is first encountered
abbrev = list(dict.fromkeys(
    header
    for columns in df_cols
    for header in columns
))
abbrev

['seas_id',
 'season',
 'player_id',
 'player',
 'birth_year',
 'pos',
 'age',
 'experience',
 'lg',
 'tm',
 'g',
 'mp',
 'per',
 'ts_percent',
 'x3p_ar',
 'f_tr',
 'orb_percent',
 'drb_percent',
 'trb_percent',
 'ast_percent',
 'stl_percent',
 'blk_percent',
 'tov_percent',
 'usg_percent',
 'ows',
 'dws',
 'ws',
 'ws_48',
 'obpm',
 'dbpm',
 'bpm',
 'vorp',
 'team',
 'abbreviation',
 'playoffs',
 'w',
 'l',
 'pw',
 'pl',
 'mov',
 'sos',
 'srs',
 'o_rtg',
 'd_rtg',
 'n_rtg',
 'pace',
 'e_fg_percent',
 'ft_fga',
 'opp_e_fg_percent',
 'opp_tov_percent',
 'opp_drb_percent',
 'opp_ft_fga',
 'arena',
 'attend',
 'attend_g',
 'gs',
 'fg',
 'fga',
 'fg_percent',
 'x3p',
 'x3pa',
 'x3p_percent',
 'x2p',
 'x2pa',
 'x2p_percent',
 'ft',
 'fta',
 'ft_percent',
 'orb',
 'drb',
 'trb',
 'ast',
 'stl',
 'blk',
 'tov',
 'pf',
 'pts']

In [328]:
#Full terms for the column-header abbreviations, paired with `abbrev` by position
#(the glossary cell pairs abbrev[i] with meanings[i]), so order and length must match abbrev.
#The positions of both 'tm' and 'team' in abbrev are given the term 'team'.
#'fieldgoals_attemped' is misspelled, but later cells select the column by this exact
#string, so it must not be corrected here alone.
meanings = ['season_id',
 'season',
 'player_id',
 'player',
 'birth_year',
 'position',
 'age',
 'experience',
 'league',
 'team',
 'games',
 'minutes_played',
 'player_efficiency_rating',
 'true_shooting_percentage',
 'three_point_attempt_rate',
 'free_throw_attempt_rate',
 'offensive_rebound_percentage',
 'defensive_rebound_percentage',
 'total_rebound_percentage',
 'assist_percentage',
 'steal_percentage',
 'block_percentage',
 'turnover_percentage',
 'usage_percentage',
 'offensive_win_shares',
 'defensive_win_shares',
 'win_shares',
 'win_shares_per_48_min',
 'offensive_box_plus/minus',
 'defensive_box_plus/minus',
 'box_plus/minus',
 'value_over_replacement_plyr',
 'team',
 'abbreviation',
 'playoffs',
 'wins',
 'losses',
 'pythagorean_wins',
 'pythagorean_losses',
 'margin_of_victory',
 'strength_of_schedule',
 'simple_rating_system',
 'offensive_rating',
 'defensive_rating',
 'net_rating',
 'pace',
 'effective_fieldgoal_percentage',
 'free_throws/fieldgoal_attempts',
 'opponent_effective_fieldgoal_percentage',
 'opponent_turnover_percentage',
 'opponent_defensive_rebound_percentage',
 'opponent_freethrows/fieldgoal_attempts',
 'arena',
 'attend',
 'attend_g',
 'games_started',
 'fieldgoals',
 'fieldgoals_attemped',
 'fieldgoal_percentage',
 'three_points_made',
 'three_point_attempts',
 'three_point_percentage',
 'two_point_made',
 'two_point_attempts',
 'two_point_percentage',
 'freethrows_made',
 'freethrows_attempted',
 'freethrow_percentage',
 'offensive_rebounds',
 'defensive_rebounds',
 'total_rebounds',
 'assists',
 'steals',
 'blocks',
 'turnovers',
 'personal_fouls',
 'points']

In [329]:
#Build the glossary mapping each abbreviated header (key) to its full term (value).
#abbrev and meanings are paired purely by position, so a length mismatch would
#either drop the surplus terms unnoticed or fail with a bare IndexError;
#check up front and report both lengths instead.
if len(abbrev) != len(meanings):
    raise ValueError(
        f"abbrev has {len(abbrev)} entries but meanings has {len(meanings)}; "
        "the two lists must line up one-to-one"
    )
nba_gloss = dict(zip(abbrev, meanings))

In [330]:
#Replace the abbreviated column headers of every dataframe with the full terms.
#Assigning to .columns relabels each frame in place, so the notebook-level names
#(advanced_df, team_summary_df, ...) see the new headers. The previous
#set_axis(..., inplace=True) call relies on a keyword that was deprecated in
#pandas 1.5 and removed in pandas 2.0, where it raises TypeError.
#A header missing from nba_gloss still raises KeyError, as before.
for df in dfs:
    df.columns = [nba_gloss[header] for header in df.columns]


In [333]:
#Show the column names, non-null counts and dtypes of team_totals_df
team_totals_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1783 entries, 0 to 1782
Data columns (total 28 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   season                  1783 non-null   int64  
 1   league                  1783 non-null   object 
 2   team                    1783 non-null   object 
 3   abbreviation            1698 non-null   object 
 4   playoffs                1783 non-null   bool   
 5   games                   1782 non-null   float64
 6   minutes_played          1593 non-null   float64
 7   fieldgoals              1782 non-null   float64
 8   fieldgoals_attemped     1782 non-null   float64
 9   fieldgoal_percentage    1782 non-null   float64
 10  three_points_made       1340 non-null   float64
 11  three_point_attempts    1340 non-null   float64
 12  three_point_percentage  1340 non-null   float64
 13  two_point_made          1782 non-null   float64
 14  two_point_attempts      1782 non-null   

In [334]:
#Keep every team_totals_df row whose team is not the 'League Average' aggregate
not_league_average = team_totals_df['team'] != 'League Average'
team_totals_filtered_df = team_totals_df.loc[not_league_average]
team_totals_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1698 entries, 0 to 1781
Data columns (total 28 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   season                  1698 non-null   int64  
 1   league                  1698 non-null   object 
 2   team                    1698 non-null   object 
 3   abbreviation            1698 non-null   object 
 4   playoffs                1698 non-null   bool   
 5   games                   1697 non-null   float64
 6   minutes_played          1526 non-null   float64
 7   fieldgoals              1697 non-null   float64
 8   fieldgoals_attemped     1697 non-null   float64
 9   fieldgoal_percentage    1697 non-null   float64
 10  three_points_made       1288 non-null   float64
 11  three_point_attempts    1288 non-null   float64
 12  three_point_percentage  1288 non-null   float64
 13  two_point_made          1697 non-null   float64
 14  two_point_attempts      1697 non-null   

In [336]:
#Build two lookup dictionaries from the filtered team totals:
#  team_name_updated_abrrev_dict : full team name -> abbreviation
#  updated_abbrev_team_name_dict : abbreviation   -> full team name
#
#The pairs are taken row by row. De-duplicating each column separately with
#.unique() and pairing the results by position only lines up when every name has
#exactly one abbreviation (and vice versa) and both first appear in the same
#order; otherwise every later entry is matched with the wrong partner, or the
#index lookup runs past the end of the shorter list.
full_team_names = list(team_totals_filtered_df['team'].unique())
updated_abbrev = list(team_totals_filtered_df['abbreviation'].unique())

team_abbrev_pairs = (team_totals_filtered_df[['team', 'abbreviation']]
                     .dropna()
                     .drop_duplicates())

#When a team name occurs with several abbreviations, the first row in the
#dataframe's order wins (drop_duplicates keeps the first occurrence by default)
pairs_by_name = team_abbrev_pairs.drop_duplicates(subset='team')
team_name_updated_abrrev_dict = dict(zip(pairs_by_name['team'],
                                         pairs_by_name['abbreviation']))

#Likewise, an abbreviation shared by several names resolves to the first name seen
pairs_by_abbrev = team_abbrev_pairs.drop_duplicates(subset='abbreviation')
updated_abbrev_team_name_dict = dict(zip(pairs_by_abbrev['abbreviation'],
                                         pairs_by_abbrev['team']))

In [351]:
#Register the 'TOT' team code in both lookups so rows carrying it can be resolved.
#NOTE(review): presumably 'TOT' marks a combined line for a player with several
#teams in one season -- confirm against the data source.
team_name_updated_abrrev_dict.update({'Total': 'TOT'})
updated_abbrev_team_name_dict.update({'TOT': 'Total'})

In [345]:
#Show the column names, non-null counts and dtypes of player_totals_df
player_totals_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30296 entries, 0 to 30295
Data columns (total 35 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   season_id                       30296 non-null  int64  
 1   season                          30296 non-null  int64  
 2   player_id                       30296 non-null  int64  
 3   player                          30296 non-null  object 
 4   birth_year                      2867 non-null   float64
 5   position                        30296 non-null  object 
 6   age                             30272 non-null  float64
 7   experience                      30296 non-null  int64  
 8   league                          30296 non-null  object 
 9   team                            30296 non-null  object 
 10  games                           30267 non-null  float64
 11  games_started                   21655 non-null  float64
 12  minutes_played                  

In [346]:
#Restrict player_totals_df to seasons 1979 through 2021 (both ends included)
player_season_mask = player_totals_df['season'].between(1979, 2021)
player_totals_filtered_df = player_totals_df.loc[player_season_mask]
player_totals_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21999 entries, 649 to 22647
Data columns (total 35 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   season_id                       21999 non-null  int64  
 1   season                          21999 non-null  int64  
 2   player_id                       21999 non-null  int64  
 3   player                          21999 non-null  object 
 4   birth_year                      682 non-null    float64
 5   position                        21999 non-null  object 
 6   age                             21999 non-null  float64
 7   experience                      21999 non-null  int64  
 8   league                          21999 non-null  object 
 9   team                            21999 non-null  object 
 10  games                           21999 non-null  float64
 11  games_started                   20989 non-null  float64
 12  minutes_played                

In [347]:
#Remove birth_year and the percentage columns from player_totals_filtered_df
player_columns_to_drop = [
    'birth_year',
    'three_point_percentage',
    'fieldgoal_percentage',
    'two_point_percentage',
    'effective_fieldgoal_percentage',
    'freethrow_percentage',
]
player_totals_filtered_df = player_totals_filtered_df.drop(columns=player_columns_to_drop)
player_totals_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21999 entries, 649 to 22647
Data columns (total 29 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   season_id             21999 non-null  int64  
 1   season                21999 non-null  int64  
 2   player_id             21999 non-null  int64  
 3   player                21999 non-null  object 
 4   position              21999 non-null  object 
 5   age                   21999 non-null  float64
 6   experience            21999 non-null  int64  
 7   league                21999 non-null  object 
 8   team                  21999 non-null  object 
 9   games                 21999 non-null  float64
 10  games_started         20989 non-null  float64
 11  minutes_played        21999 non-null  float64
 12  fieldgoals            21999 non-null  float64
 13  fieldgoals_attemped   21999 non-null  float64
 14  three_points_made     21655 non-null  float64
 15  three_point_attem

In [348]:
#Discard every player_totals_filtered_df row that still has a null in any column
player_totals_filtered_df = player_totals_filtered_df.dropna(axis=0, how='any')
player_totals_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20981 entries, 649 to 22280
Data columns (total 29 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   season_id             20981 non-null  int64  
 1   season                20981 non-null  int64  
 2   player_id             20981 non-null  int64  
 3   player                20981 non-null  object 
 4   position              20981 non-null  object 
 5   age                   20981 non-null  float64
 6   experience            20981 non-null  int64  
 7   league                20981 non-null  object 
 8   team                  20981 non-null  object 
 9   games                 20981 non-null  float64
 10  games_started         20981 non-null  float64
 11  minutes_played        20981 non-null  float64
 12  fieldgoals            20981 non-null  float64
 13  fieldgoals_attemped   20981 non-null  float64
 14  three_points_made     20981 non-null  float64
 15  three_point_attem

In [358]:
#Rename the abbreviated 'team' column to 'abbreviation' and add the full team name as 'team'.
#The rename is skipped when 'abbreviation' already exists, so running this cell a second
#time no longer produces a duplicate 'abbreviation' column (which broke the lookup below).
#rename() without inplace returns a new frame instead of mutating the existing one.
if 'abbreviation' not in player_totals_filtered_df.columns:
    player_totals_filtered_df = player_totals_filtered_df.rename(columns={"team": "abbreviation"})

#A code missing from updated_abbrev_team_name_dict raises KeyError rather than
#silently producing an empty team name
player_totals_filtered_df['team'] = [
    updated_abbrev_team_name_dict[code]
    for code in player_totals_filtered_df['abbreviation']
]

player_totals_filtered_df.head()

Unnamed: 0,season_id,season,player_id,player,position,age,experience,league,abbreviation,games,...,offensive_rebounds,defensive_rebounds,total_rebounds,assists,steals,blocks,turnovers,personal_fouls,points,team
649,28943,2021,4219,Aaron Gordon,PF,25.0,7,NBA,TOT,50.0,...,77.0,207.0,284.0,161.0,33.0,34.0,97.0,89.0,618.0,Total
650,28944,2021,4219,Aaron Gordon,PF,25.0,7,NBA,ORL,25.0,...,39.0,127.0,166.0,105.0,16.0,20.0,67.0,49.0,364.0,Orlando Magic
651,28945,2021,4219,Aaron Gordon,PF,25.0,7,NBA,DEN,25.0,...,38.0,80.0,118.0,56.0,17.0,14.0,30.0,40.0,254.0,Denver Nuggets
652,28946,2021,4582,Aaron Holiday,PG,24.0,3,NBA,IND,66.0,...,15.0,74.0,89.0,123.0,46.0,13.0,66.0,94.0,475.0,Indiana Pacers
653,28947,2021,4805,Aaron Nesmith,SF,21.0,1,NBA,BOS,46.0,...,28.0,99.0,127.0,23.0,15.0,9.0,23.0,87.0,218.0,Boston Celtics


In [361]:
#Put 'team' next to 'abbreviation' by selecting the columns in an explicit order
player_totals_column_order = [
    'season_id', 'season', 'player_id', 'player', 'position', 'age',
    'experience', 'league', 'team', 'abbreviation',
    'games', 'games_started', 'minutes_played',
    'fieldgoals', 'fieldgoals_attemped',
    'three_points_made', 'three_point_attempts',
    'two_point_made', 'two_point_attempts',
    'freethrows_made', 'freethrows_attempted',
    'offensive_rebounds', 'defensive_rebounds', 'total_rebounds',
    'assists', 'steals', 'blocks', 'turnovers', 'personal_fouls', 'points',
]
player_totals_filtered_df = player_totals_filtered_df[player_totals_column_order]

In [362]:
#Preview the first rows of player_totals_filtered_df to check the new column order
player_totals_filtered_df.head()

Unnamed: 0,season_id,season,player_id,player,position,age,experience,league,team,abbreviation,...,freethrows_attempted,offensive_rebounds,defensive_rebounds,total_rebounds,assists,steals,blocks,turnovers,personal_fouls,points
649,28943,2021,4219,Aaron Gordon,PF,25.0,7,NBA,Total,TOT,...,149.0,77.0,207.0,284.0,161.0,33.0,34.0,97.0,89.0,618.0
650,28944,2021,4219,Aaron Gordon,PF,25.0,7,NBA,Orlando Magic,ORL,...,105.0,39.0,127.0,166.0,105.0,16.0,20.0,67.0,49.0,364.0
651,28945,2021,4219,Aaron Gordon,PF,25.0,7,NBA,Denver Nuggets,DEN,...,44.0,38.0,80.0,118.0,56.0,17.0,14.0,30.0,40.0,254.0
652,28946,2021,4582,Aaron Holiday,PG,24.0,3,NBA,Indiana Pacers,IND,...,83.0,15.0,74.0,89.0,123.0,46.0,13.0,66.0,94.0,475.0
653,28947,2021,4805,Aaron Nesmith,SF,21.0,1,NBA,Boston Celtics,BOS,...,28.0,28.0,99.0,127.0,23.0,15.0,9.0,23.0,87.0,218.0


In [363]:
#Show the column names, non-null counts and dtypes of team_summary_df
team_summary_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1783 entries, 0 to 1782
Data columns (total 31 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   season                                   1783 non-null   int64  
 1   league                                   1783 non-null   object 
 2   team                                     1783 non-null   object 
 3   abbreviation                             1682 non-null   object 
 4   playoffs                                 1783 non-null   bool   
 5   age                                      1719 non-null   float64
 6   wins                                     1697 non-null   float64
 7   losses                                   1697 non-null   float64
 8   pythagorean_wins                         1782 non-null   float64
 9   pythagorean_losses                       1782 non-null   float64
 10  margin_of_victory                        1782 no

In [364]:
#Keep only the team_summary_df columns used further on
#NOTE(review): the output recorded under this cell lists six columns, including
#'abbreviation', which this selection does not request -- the output looks stale
#relative to the code; re-run to confirm which column set is intended.
team_summary_columns = ['season', 'league', 'team', 'strength_of_schedule', 'playoffs']
team_summary_filtered_df = team_summary_df[team_summary_columns]
team_summary_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1783 entries, 0 to 1782
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   season                1783 non-null   int64  
 1   league                1783 non-null   object 
 2   team                  1783 non-null   object 
 3   abbreviation          1682 non-null   object 
 4   strength_of_schedule  1782 non-null   float64
 5   playoffs              1783 non-null   bool   
dtypes: bool(1), float64(1), int64(1), object(3)
memory usage: 71.5+ KB


In [365]:
#Restrict team_summary_filtered_df to seasons 1979 through 2021 (both ends included)
summary_season_mask = team_summary_filtered_df['season'].between(1979, 2021)
team_summary_filtered_df = team_summary_filtered_df.loc[summary_season_mask]

In [366]:
#Show the column names, non-null counts and dtypes of team_summary_filtered_df after the season filter
team_summary_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1229 entries, 31 to 1259
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   season                1229 non-null   int64  
 1   league                1229 non-null   object 
 2   team                  1229 non-null   object 
 3   abbreviation          1170 non-null   object 
 4   strength_of_schedule  1229 non-null   float64
 5   playoffs              1229 non-null   bool   
dtypes: bool(1), float64(1), int64(1), object(3)
memory usage: 58.8+ KB


In [368]:
#Remove the rows whose team is the 'League Average' aggregate
summary_is_real_team = team_summary_filtered_df['team'] != 'League Average'
team_summary_filtered_df = team_summary_filtered_df.loc[summary_is_real_team]
team_summary_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1186 entries, 31 to 1258
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   season                1186 non-null   int64  
 1   league                1186 non-null   object 
 2   team                  1186 non-null   object 
 3   abbreviation          1170 non-null   object 
 4   strength_of_schedule  1186 non-null   float64
 5   playoffs              1186 non-null   bool   
dtypes: bool(1), float64(1), int64(1), object(3)
memory usage: 56.8+ KB


In [369]:
#Show the column names, non-null counts and dtypes of advanced_df
advanced_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30296 entries, 0 to 30295
Data columns (total 32 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   season_id                     30296 non-null  int64  
 1   season                        30296 non-null  int64  
 2   player_id                     30296 non-null  int64  
 3   player                        30296 non-null  object 
 4   birth_year                    2867 non-null   float64
 5   position                      30296 non-null  object 
 6   age                           30272 non-null  float64
 7   experience                    30296 non-null  int64  
 8   league                        30296 non-null  object 
 9   team                          30296 non-null  object 
 10  games                         30267 non-null  float64
 11  minutes_played                29162 non-null  float64
 12  player_efficiency_rating      29114 non-null  float64
 13  t

In [370]:
#Restrict advanced_df to seasons 1979 through 2021 (both ends included)
advanced_season_mask = advanced_df['season'].between(1979, 2021)
advanced_filtered_df = advanced_df.loc[advanced_season_mask]
advanced_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21999 entries, 649 to 22647
Data columns (total 32 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   season_id                     21999 non-null  int64  
 1   season                        21999 non-null  int64  
 2   player_id                     21999 non-null  int64  
 3   player                        21999 non-null  object 
 4   birth_year                    682 non-null    float64
 5   position                      21999 non-null  object 
 6   age                           21999 non-null  float64
 7   experience                    21999 non-null  int64  
 8   league                        21999 non-null  object 
 9   team                          21999 non-null  object 
 10  games                         21999 non-null  float64
 11  minutes_played                21999 non-null  float64
 12  player_efficiency_rating      21994 non-null  float64
 13 

In [371]:
#Narrow advanced_filtered_df to the columns of interest and drop rows with any null.
#dropna() is chained onto the selection and its result re-assigned: calling
#dropna(inplace=True) on a frame that was itself produced by a selection mutates a
#derived object and triggers pandas' SettingWithCopyWarning.
#NOTE(review): 'turnover_percentage' is defined in the glossary but not selected
#here -- confirm the omission is deliberate.
advanced_columns_of_interest = [
    'season', 'team', 'player', 'player_efficiency_rating',
    'true_shooting_percentage', 'three_point_attempt_rate',
    'free_throw_attempt_rate', 'offensive_rebound_percentage',
    'defensive_rebound_percentage', 'total_rebound_percentage',
    'assist_percentage', 'steal_percentage', 'block_percentage',
    'usage_percentage', 'offensive_win_shares', 'defensive_win_shares',
    'win_shares', 'win_shares_per_48_min', 'offensive_box_plus/minus',
    'defensive_box_plus/minus', 'box_plus/minus',
    'value_over_replacement_plyr',
]
advanced_filtered_df = advanced_filtered_df[advanced_columns_of_interest].dropna()
advanced_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21553 entries, 649 to 22303
Data columns (total 22 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   season                        21553 non-null  int64  
 1   team                          21553 non-null  object 
 2   player                        21553 non-null  object 
 3   player_efficiency_rating      21553 non-null  float64
 4   true_shooting_percentage      21553 non-null  float64
 5   three_point_attempt_rate      21553 non-null  float64
 6   free_throw_attempt_rate       21553 non-null  float64
 7   offensive_rebound_percentage  21553 non-null  float64
 8   defensive_rebound_percentage  21553 non-null  float64
 9   total_rebound_percentage      21553 non-null  float64
 10  assist_percentage             21553 non-null  float64
 11  steal_percentage              21553 non-null  float64
 12  block_percentage              21553 non-null  float64
 13 

In [261]:
#Preview the first rows of advanced_filtered_df
advanced_filtered_df.head()

Unnamed: 0,season,team,player,player_efficiency_rating,true_shooting_percentage,three_point_attempt_rate,free_throw_attempt_rate,offensive_rebound_percentage,defensive_rebound_percentage,total_rebound_percentage,...,block_percentage,usage_percentage,offensive_win_shares,defensive_win_shares,win_shares,win_shares_per_48_min,offensive_box_plus/minus,defensive_box_plus/minus,box_plus/minus,value_over_replacement_plyr
649,2021,TOT,Aaron Gordon,14.5,0.547,0.353,0.299,6.0,16.4,11.1,...,2.3,20.7,0.7,1.2,1.9,0.066,0.2,-0.2,0.0,0.7
650,2021,ORL,Aaron Gordon,14.9,0.537,0.382,0.358,5.3,18.5,11.7,...,2.5,23.8,0.0,0.6,0.6,0.037,0.6,0.0,0.6,0.5
651,2021,DEN,Aaron Gordon,14.1,0.564,0.311,0.214,6.7,13.9,10.3,...,2.1,17.2,0.7,0.6,1.3,0.099,-0.1,-0.5,-0.6,0.2
652,2021,IND,Aaron Holiday,9.2,0.503,0.417,0.19,1.4,6.8,4.1,...,0.9,19.5,-0.6,0.8,0.2,0.009,-3.2,-1.0,-4.1,-0.6
653,2021,BOS,Aaron Nesmith,9.4,0.573,0.607,0.157,4.6,16.6,10.6,...,1.3,13.7,0.5,0.6,1.1,0.076,-2.9,-0.4,-3.2,-0.2


In [372]:
#Rename the abbreviated 'team' column to 'abbreviation' and add the full team name as 'team'.
#The rename is skipped when 'abbreviation' already exists, so running this cell a second
#time no longer produces a duplicate 'abbreviation' column (which broke the lookup below).
#rename() without inplace returns a new frame instead of mutating the existing one.
if 'abbreviation' not in advanced_filtered_df.columns:
    advanced_filtered_df = advanced_filtered_df.rename(columns={"team": "abbreviation"})

#A code missing from updated_abbrev_team_name_dict raises KeyError rather than
#silently producing an empty team name
advanced_filtered_df['team'] = [
    updated_abbrev_team_name_dict[code]
    for code in advanced_filtered_df['abbreviation']
]
advanced_filtered_df.head()

Unnamed: 0,season,abbreviation,player,player_efficiency_rating,true_shooting_percentage,three_point_attempt_rate,free_throw_attempt_rate,offensive_rebound_percentage,defensive_rebound_percentage,total_rebound_percentage,...,usage_percentage,offensive_win_shares,defensive_win_shares,win_shares,win_shares_per_48_min,offensive_box_plus/minus,defensive_box_plus/minus,box_plus/minus,value_over_replacement_plyr,team
649,2021,TOT,Aaron Gordon,14.5,0.547,0.353,0.299,6.0,16.4,11.1,...,20.7,0.7,1.2,1.9,0.066,0.2,-0.2,0.0,0.7,Total
650,2021,ORL,Aaron Gordon,14.9,0.537,0.382,0.358,5.3,18.5,11.7,...,23.8,0.0,0.6,0.6,0.037,0.6,0.0,0.6,0.5,Orlando Magic
651,2021,DEN,Aaron Gordon,14.1,0.564,0.311,0.214,6.7,13.9,10.3,...,17.2,0.7,0.6,1.3,0.099,-0.1,-0.5,-0.6,0.2,Denver Nuggets
652,2021,IND,Aaron Holiday,9.2,0.503,0.417,0.19,1.4,6.8,4.1,...,19.5,-0.6,0.8,0.2,0.009,-3.2,-1.0,-4.1,-0.6,Indiana Pacers
653,2021,BOS,Aaron Nesmith,9.4,0.573,0.607,0.157,4.6,16.6,10.6,...,13.7,0.5,0.6,1.1,0.076,-2.9,-0.4,-3.2,-0.2,Boston Celtics


In [374]:
#List the current column order of advanced_filtered_df
advanced_filtered_df.columns

Index(['season', 'abbreviation', 'player', 'player_efficiency_rating',
       'true_shooting_percentage', 'three_point_attempt_rate',
       'free_throw_attempt_rate', 'offensive_rebound_percentage',
       'defensive_rebound_percentage', 'total_rebound_percentage',
       'assist_percentage', 'steal_percentage', 'block_percentage',
       'usage_percentage', 'offensive_win_shares', 'defensive_win_shares',
       'win_shares', 'win_shares_per_48_min', 'offensive_box_plus/minus',
       'defensive_box_plus/minus', 'box_plus/minus',
       'value_over_replacement_plyr', 'team'],
      dtype='object')

In [375]:
#Move 'team' up beside 'abbreviation' by selecting the columns in an explicit order
advanced_column_order = [
    'season', 'team', 'abbreviation', 'player',
    'player_efficiency_rating',
    'true_shooting_percentage', 'three_point_attempt_rate', 'free_throw_attempt_rate',
    'offensive_rebound_percentage', 'defensive_rebound_percentage', 'total_rebound_percentage',
    'assist_percentage', 'steal_percentage', 'block_percentage', 'usage_percentage',
    'offensive_win_shares', 'defensive_win_shares', 'win_shares', 'win_shares_per_48_min',
    'offensive_box_plus/minus', 'defensive_box_plus/minus', 'box_plus/minus',
    'value_over_replacement_plyr',
]
advanced_filtered_df = advanced_filtered_df[advanced_column_order]

In [234]:
#Save the three cleaned dataframes as comma-separated files under resources/.
#The row index is written as the first column (to_csv default).
for frame, csv_path in (
    (advanced_filtered_df, "resources/advanced_stats_filtered.csv"),
    (team_summary_filtered_df, "resources/team_summary_filtered.csv"),
    (player_totals_filtered_df, "resources/player_totals_filtered.csv"),
):
    frame.to_csv(csv_path, sep=',')