In [9]:
import pandas as pd
import os

from typing import Dict

# Formatting Original Data

In [10]:
src_dir = os.getcwd()

## nba-players-stats/Seasons_Stats.csv

In [11]:
stats_1950_2017 = os.path.join(src_dir, "nba-players-stats/Seasons_Stats.csv")
df90_17 = pd.read_csv(stats_1950_2017, index_col=0)

In [12]:
def format_stats_1950_2017(df90_17: pd.DataFrame, type_conv: Dict) -> pd.DataFrame:
    """
    Formats 1950-2017 stats file. Note that this function modifies df90_17 to save memory.
    In:
        df90_17: DataFrame from 1950-2017 stats file; modified in-place to save memory.
        type_conv: Dict containing column names to convert to int.
    Return:
        Formatted DataFrame.
    """
    # Drop rows prior to 1990
    df90_17.drop(df90_17[df90_17["Year"] < 1990].index, inplace=True)
    # Drop NA columns
    df90_17.dropna(axis=1, how='all', inplace=True)
    # Drop NA rows
    df90_17.dropna(inplace=True)  # Defaults: axis=0, how='any'
    # For players who had multiple teams for a year, keep the TOTAL row
    df90_17.drop_duplicates(subset=["Year", "Player"], inplace=True)
    # Convert certain cols to int; deep copy here
    df90_17 = df90_17.astype(type_conv)
    # Remove asterisks from HOF players
    df90_17["Player"] = df90_17["Player"].str.rstrip('*')
    # Reset index
    df90_17.reset_index(drop=True, inplace=True) # Drop the old index (rather than adding it as a col)
    return df90_17

In [15]:
cols_to_int = ['Year', 'Age', 'G', 'GS', 'MP', 'FG', 'FGA', '3P', '3PA', '2P', '2PA', 'FT', 'FTA',
              'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS'] 
type_conv = dict.fromkeys(cols_to_int, int)
df90_17 = format_stats_1950_2017(df90_17, type_conv)
# Show first and last 5 rows and columns
# iloc[row_idx_list, col_idx_list]
print(df90_17.iloc[list(range(5)) + list(range(-5, 0)), 
                   list(range(5)) + list(range(-5, 0))])

       Year          Player Pos  Age   Tm  STL  BLK  TOV   PF   PTS
0      1990      Mark Acres   C   27  ORL   36   25   70  248   362
1      1990   Michael Adams  PG   27  DEN  121    3  141  133  1221
2      1990    Mark Aguirre  SF   30  DET   34   19  121  201  1099
3      1990     Danny Ainge  PG   30  SAC  113   18  185  238  1342
4      1990     Mark Alarie  PF   26  WSB   60   39  101  219   860
10208  2017  Thaddeus Young  PF   28  IND  114   30   96  135   814
10209  2017     Cody Zeller  PF   24  CHO   62   58   65  189   639
10210  2017    Tyler Zeller   C   27  BOS    7   21   20   61   178
10211  2017     Paul Zipser  SF   22  CHI   15   16   40   78   240
10212  2017     Ivica Zubac   C   19  LAL   14   33   30   66   284


## nba17-18

In [6]:
stats_2018_a = os.path.join(src_dir, "nba17-18/nba.csv")
stats_2018_b = os.path.join(src_dir, "nba17-18/nba_extra.csv")
# Merge on the intersection of column names and union the two DataFrames
df2018 = pd.merge(pd.read_csv(stats_2018_a, index_col=0), 
                  pd.read_csv(stats_2018_b, index_col=0), how="outer")

In [7]:
def format_stats_2018(df2018: pd.DataFrame) -> pd.DataFrame:
    """
    Formats 2018 stats file. Note that this function modifies df2018 to save memory.
    This function could make df90_17 an out parameter, but we return a value for consistency.
    In:
        df90_17: DataFrame from 1950-2017 stats file; modified in-place to save memory.
    Return:
        Formatted DataFrame.
    """
    # Drop NA columns
    df2018.dropna(axis=1, how='all', inplace=True)
    # Drop NA rows
    df2018.dropna(inplace=True)  # Defaults: axis=0, how='any'
    # Add Year column
    if not "Year" in df2018.columns: df2018.insert(0, "Year", 2018)
    # For players who had multiple teams for a year, keep the TOTAL row
    df2018.drop_duplicates(subset=["Year", "Player"], inplace=True)
    # Reorder columns; reindex doesn't do in-place and requires returning df2018    
    gs = df2018.pop("GS")
    df2018.insert(df2018.columns.get_loc('G')+1, "GS", gs)
    # Remove name encoding appended to name (escaped backslash)
    df2018["Player"] = df2018["Player"].str.replace(r'\\(.*)', '')
    # Reset index
    df2018.reset_index(drop=True, inplace=True) # Drop the old index (rather than adding it as a col)
    return df2018

In [16]:
df2018 = format_stats_2018(df2018)
# Show first and last 5 rows and columns
# iloc[row_idx_list, col_idx_list]
print(df2018.iloc[list(range(5)) + list(range(-5, 0)), 
                   list(range(5)) + list(range(-5, 0))])

     Year          Player Pos  Age   Tm  STL  BLK  TOV   PF   PTS
0    2018    Alex Abrines  SG   24  OKC   38    8   25  124   353
1    2018      Quincy Acy  PF   27  BRK   33   29   60  149   411
2    2018    Steven Adams   C   24  OKC   92   78  128  215  1056
3    2018     Bam Adebayo   C   20  MIA   32   41   66  138   477
4    2018   Arron Afflalo  SG   32  ORL    4    9   21   56   179
461  2018  Thaddeus Young  PF   29  IND  135   36  105  175   955
462  2018     Cody Zeller   C   25  CHO   14   21   33   81   233
463  2018    Tyler Zeller   C   28  TOT   15   35   47  126   441
464  2018     Paul Zipser  SF   23  CHI   20   15   43   86   218
465  2018     Ivica Zubac   C   20  LAL    8   15   26   47   161


## Combine Stats Data and Export CSV

In [18]:
df90_18 = pd.concat([df90_17, df2018]) # No need to use any join args b/c cols are the same
print(df90_18.iloc[list(range(5)) + list(range(-5, 0)), 
                   list(range(5)) + list(range(-5, 0))])

     Year          Player Pos  Age   Tm  STL  BLK  TOV   PF   PTS
0    1990      Mark Acres   C   27  ORL   36   25   70  248   362
1    1990   Michael Adams  PG   27  DEN  121    3  141  133  1221
2    1990    Mark Aguirre  SF   30  DET   34   19  121  201  1099
3    1990     Danny Ainge  PG   30  SAC  113   18  185  238  1342
4    1990     Mark Alarie  PF   26  WSB   60   39  101  219   860
461  2018  Thaddeus Young  PF   29  IND  135   36  105  175   955
462  2018     Cody Zeller   C   25  CHO   14   21   33   81   233
463  2018    Tyler Zeller   C   28  TOT   15   35   47  126   441
464  2018     Paul Zipser  SF   23  CHI   20   15   43   86   218
465  2018     Ivica Zubac   C   20  LAL    8   15   26   47   161


In [26]:
outfile_name = "nba_stats_1990_2018.csv"
data_dir = os.path.join(src_dir, "Data")
outfile_path = os.path.join(data_dir, outfile_name)

In [27]:
os.makedirs(data_dir, exist_ok=True)  # Make directory if it doesn't exist
if not os.path.isfile(outfile_path): df90_18.to_csv(outfile_path, index=False)