In [107]:
import pandas as pd
import os

from typing import Dict, List

# Formatting Original Data

In [108]:
# Every path in this notebook is built from the current working directory.
src_dir = os.getcwd()
# "Data" sub-folder of the working directory.
data_dir = os.path.join(src_dir, "Data")
os.makedirs(data_dir, exist_ok=True)  # Make directory if it doesn't exist

## nba-players-stats/Seasons_Stats.csv

In [114]:
# Location of the raw season stats CSV, relative to the working directory.
stat_50_17_file = os.path.join(src_dir, "nba-players-stats/Seasons_Stats.csv")
# index_col=0: the first column of the file becomes the DataFrame index rather than a data column.
stat50_17 = pd.read_csv(stat_50_17_file, index_col=0)

In [115]:
def format_stats_50_17(stat50_17: pd.DataFrame, type_conv: Dict, fill_dict: Dict,
                       min_year: int = 1990) -> pd.DataFrame:
    """
    Formats 1950-2017 stats file. Note that this function modifies stat50_17 to save memory.
    In:
        stat50_17: DataFrame from 1950-2017 stats file; modified in-place to save memory
            (rows/columns are dropped and NAs filled on the object the caller passed in).
        type_conv: Dict mapping column names to the dtype they are converted to.
        fill_dict: Dict mapping column names to the value that replaces NAs in that column.
        min_year: Earliest "Year" to keep; rows with a smaller Year are dropped.
            Defaults to 1990.
    Return:
        New DataFrame (from astype) holding the rows from min_year onwards, with the
        type_conv dtypes applied, trailing '*' removed from "Player", and a fresh
        0..n-1 index.
    """
    # Drop rows prior to min_year
    stat50_17.drop(stat50_17.index[stat50_17["Year"] < min_year], inplace=True)
    # Drop columns that contain nothing but NAs
    stat50_17.dropna(axis=1, how='all', inplace=True)
    # For players with several rows in one year, keep only the first one.
    # NOTE(review): this is the TOTAL row only if the source file lists it before the
    # per-team rows -- drop_duplicates keeps the first occurrence; confirm against the data.
    stat50_17.drop_duplicates(subset=["Year", "Player"], inplace=True)
    # Fill NAs with the per-column values from fill_dict
    stat50_17.fillna(fill_dict, inplace=True)
    # Drop every row that still contains an NA
    stat50_17.dropna(inplace=True)  # Defaults: axis=0, how='any'
    # Convert certain cols to the requested dtypes; astype returns a new frame
    formatted = stat50_17.astype(type_conv)
    # Remove trailing asterisks from player names
    formatted["Player"] = formatted["Player"].str.rstrip('*')
    # Reset index; drop the old index (rather than adding it as a col)
    formatted.reset_index(drop=True, inplace=True)
    return formatted

In [116]:
# Columns that format_stats_50_17 converts to int.
cols_to_int = [
    'Year', 'Age', 'G', 'GS', 'MP',
    'FG', 'FGA', '3P', '3PA', '2P', '2PA', 'FT', 'FTA',
    'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
]
type_conv = {col: int for col in cols_to_int}
# Columns whose NAs are replaced by 0.0 before the remaining NA rows are dropped.
cols_to_fill = ['FG%', '3P%', '2P%', 'eFG%', 'FT%']
fill_dict = {col: 0.0 for col in cols_to_fill}
stat90_17 = format_stats_50_17(stat50_17, type_conv, fill_dict)
# "Season End" is a clearer name for the Year column.
stat90_17 = stat90_17.rename(columns={"Year": "Season End"})
# Preview the four corners of the table: first/last 5 rows crossed with first/last 5 columns.
_edge_positions = [0, 1, 2, 3, 4, -5, -4, -3, -2, -1]
print(stat90_17.iloc[_edge_positions, _edge_positions])

       Season End             Player Pos  Age   Tm  STL  BLK  TOV   PF   PTS
0            1990         Mark Acres   C   27  ORL   36   25   70  248   362
1            1990      Michael Adams  PG   27  DEN  121    3  141  133  1221
2            1990       Mark Aguirre  SF   30  DET   34   19  121  201  1099
3            1990        Danny Ainge  PG   30  SAC  113   18  185  238  1342
4            1990        Mark Alarie  PF   26  WSB   60   39  101  219   860
12292        2017        Cody Zeller  PF   24  CHO   62   58   65  189   639
12293        2017       Tyler Zeller   C   27  BOS    7   21   20   61   178
12294        2017  Stephen Zimmerman   C   20  ORL    2    5    3   17    23
12295        2017        Paul Zipser  SF   22  CHI   15   16   40   78   240
12296        2017        Ivica Zubac   C   19  LAL   14   33   30   66   284


In [117]:
# File name and full path (inside data_dir) for the formatted stats CSV.
out_stats_name = "nba_stats_1990_2017.csv"
out_stats_path = os.path.join(data_dir, out_stats_name)

In [118]:
# Write the formatted stats only when no file exists at that path yet;
# an existing file is left untouched (index=False keeps the row index out of the CSV).
stats_csv_present = os.path.isfile(out_stats_path)
if not stats_csv_present:
    stat90_17.to_csv(out_stats_path, index=False)

## Salaries Per Season (1990-2017)

In [119]:
# Location of the raw salaries CSV, relative to the working directory.
sal91_18_file = os.path.join(src_dir, "nba-player-salary-19902017/Player - Salaries per Year (1990 - 2017).csv")
# thousands=",": commas inside numeric fields are parsed as thousands separators.
sal91_18 = pd.read_csv(sal91_18_file, thousands=",")

In [120]:
def format_sal(sal91_18: pd.DataFrame) -> pd.DataFrame:
    """
    Formats 1991-2018 salaries file. Note that this function modifies sal91_18 to save memory.
    In:
        sal91_18: DataFrame from 1991-2018 salaries file; modified in-place to save memory.
    Return:
        The same DataFrame object, without the "Register Value", "Team" and
        "Full Team Name" columns, with " Salary in $ " renamed to "Salary",
        "Player Name" renamed to "Player", and "Salary" converted to float.
    Raises:
        KeyError: if the frame has neither a " Salary in $ " nor a "Salary" column.
    """
    # Remove "Register Value, Team, Full Team Name" columns; errors="ignore" skips
    # any that are already gone, so the function can be re-run on its own output.
    sal91_18.drop(columns=["Register Value", "Team", "Full Team Name"],
                  inplace=True, errors="ignore")
    # Rename Salary in $ to Salary, Player Name to Player
    sal91_18.rename(columns={" Salary in $ ": "Salary", "Player Name": "Player"}, inplace=True)
    # Convert Salary to float
    salary = sal91_18["Salary"]
    if not pd.api.types.is_numeric_dtype(salary):
        # Text such as " $1,750,000 ": strip "$" and "," and surrounding blanks.
        # regex=True must be explicit -- pandas 2.0 changed the default of
        # Series.str.replace to literal matching, which would leave the text unchanged.
        salary = salary.str.replace(r"[\$,]", "", regex=True).str.strip()
    sal91_18["Salary"] = salary.astype(float)
    return sal91_18

In [121]:
# Clean the salaries table, then show its first and last five rows.
sal91_18 = format_sal(sal91_18)
print(sal91_18.head(), sal91_18.tail(), sep="\n")

       Player     Salary  Season Start  Season End
0  A.C. Green  1750000.0          1990        1991
1  A.C. Green  1750000.0          1991        1992
2  A.C. Green  1750000.0          1992        1993
3  A.C. Green  1885000.0          1993        1994
4  A.C. Green  6472600.0          1994        1995
                   Player      Salary  Season Start  Season End
11832  Zydrunas Ilgauskas   8740000.0          2005        2006
11833  Zydrunas Ilgauskas   9442697.0          2006        2007
11834  Zydrunas Ilgauskas  10142156.0          2007        2008
11835  Zydrunas Ilgauskas  10841615.0          2008        2009
11836  Zydrunas Ilgauskas  11541074.0          2009        2010


In [122]:
# File name and full path (inside data_dir) for the formatted salaries CSV.
out_sal_name = "nba_salaries_1991_2018.csv"
out_sal_path = os.path.join(data_dir, out_sal_name)

In [123]:
# Write the formatted salaries only when no file exists at that path yet;
# an existing file is left untouched (index=False keeps the row index out of the CSV).
salaries_csv_present = os.path.isfile(out_sal_path)
if not salaries_csv_present:
    sal91_18.to_csv(out_sal_path, index=False)

## Matching Salary Data to Player Data by Year

In [124]:
# Reload both formatted tables from the CSVs in data_dir.
# NOTE(review): the cells that write these files skip the write when a file already exists,
# so what is read here may come from an earlier run -- delete the CSVs to force a refresh.
sal91_18_form_file = os.path.join(data_dir, "nba_salaries_1991_2018.csv")
sal91_18_form = pd.read_csv(sal91_18_form_file)
stat90_17_form_file = os.path.join(data_dir, "nba_stats_1990_2017.csv")
stat90_17_form = pd.read_csv(stat90_17_form_file)

In [138]:
# Inner-join each stats row to the salary row of the same player whose
# "Season Start" equals the stats row's "Season End"; rows without a match are dropped.
merged = stat90_17_form.merge(
    sal91_18_form[["Player", "Season Start", "Salary"]],
    how='inner',
    left_on=['Player', 'Season End'],
    right_on=['Player', 'Season Start'],
)
# After the join "Season Start" always equals "Season End", so it is removed.
merged = merged.drop(columns="Season Start", errors="ignore")

In [139]:
# Preview the four corners of the merged table: first/last 5 rows crossed with first/last 5 columns.
_corner_positions = [*range(5), *range(-5, 0)]
print(merged.iloc[_corner_positions, _corner_positions])

      Season End             Player Pos  Age   Tm  BLK  TOV   PF   PTS  \
0           1990         Mark Acres   C   27  ORL   25   70  248   362   
1           1990      Michael Adams  PG   27  DEN    3  141  133  1221   
2           1990       Mark Aguirre  SF   30  DET   19  121  201  1099   
3           1990        Danny Ainge  PG   30  SAC   18  185  238  1342   
4           1990        Mark Alarie  PF   26  WSB   39  101  219   860   
9413        2017        Cody Zeller  PF   24  CHO   58   65  189   639   
9414        2017       Tyler Zeller   C   27  BOS   21   20   61   178   
9415        2017  Stephen Zimmerman   C   20  ORL    5    3   17    23   
9416        2017        Paul Zipser  SF   22  CHI   16   40   78   240   
9417        2017        Ivica Zubac   C   19  LAL   33   30   66   284   

          Salary  
0       437000.0  
1       825000.0  
2      1115000.0  
3       725000.0  
4       500000.0  
9413  12584270.0  
9414   1709538.0  
9415   1312611.0  
9416   1312611

In [133]:
# File name and full path (inside data_dir) for the merged stats + salary CSV.
out_merged_name = "nba_stats_sal_merged_1990_2017.csv"
out_merged_path = os.path.join(data_dir, out_merged_name)

In [135]:
# Write the merged table only when no file exists at that path yet;
# an existing file is left untouched (index=False keeps the row index out of the CSV).
merged_csv_present = os.path.isfile(out_merged_path)
if not merged_csv_present:
    merged.to_csv(out_merged_path, index=False)