In [225]:
import pandas as pd
import os

from typing import Dict

# Formatting Original Data

In [226]:
# Base directory for the data files joined below: the kernel's current working directory.
src_dir = os.getcwd()

## nba-players-stats/Seasons_Stats.csv

In [232]:
# Path to the season-stats CSV, resolved under src_dir.
stats_1950_2017 = os.path.join(src_dir, "nba-players-stats/Seasons_Stats.csv")
# index_col=0: use the file's first column as the DataFrame index instead of a data column.
df90_17 = pd.read_csv(stats_1950_2017, index_col=0)

In [233]:
def format_stats_1950_2017(df90_17: pd.DataFrame, type_conv: Dict) -> pd.DataFrame:
    """
    Cleans the 1950-2017 stats frame down to complete rows from 1990 onward.

    The row/column drops are applied to df90_17 itself (the caller's frame is
    mutated); the dtype conversion then produces a new frame, which is what
    gets the cleaned player names, the fresh index, and is returned.
    In:
        df90_17: DataFrame from 1950-2017 stats file; rows and columns are dropped in-place.
        type_conv: Dict mapping column names to the dtype they should be cast to.
    Return:
        Formatted DataFrame with a 0..n-1 index.
    """
    # Labels of every season before 1990
    before_1990 = df90_17["Year"] < 1990
    stale_labels = df90_17.loc[before_1990].index
    df90_17.drop(index=stale_labels, inplace=True)

    # Columns that hold nothing but NA go first ...
    df90_17.dropna(axis="columns", how="all", inplace=True)
    # ... then any row that still has at least one NA
    df90_17.dropna(axis="index", how="any", inplace=True)

    # astype returns a new frame, so the caller's frame is untouched from here on
    formatted = df90_17.astype(type_conv)

    # Hall-of-fame players carry a trailing asterisk in the source data
    formatted["Player"] = formatted["Player"].str.rstrip("*")

    # Renumber rows from zero, discarding the old labels instead of keeping them as a column
    formatted.reset_index(drop=True, inplace=True)
    return formatted

In [234]:
# Counting stats that should be stored as integers
cols_to_int = [
    'Year', 'Age', 'G', 'GS', 'MP',
    'FG', 'FGA', '3P', '3PA', '2P', '2PA', 'FT', 'FTA',
    'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
]
type_conv = {col: int for col in cols_to_int}
df90_17 = format_stats_1950_2017(df90_17, type_conv)
# Corner preview: positions 0..4 followed by -1..-5, used for both rows and columns
edge_positions = [*range(5), *range(-1, -6, -1)]
print(df90_17.iloc[edge_positions, edge_positions])

       Year          Player Pos  Age   Tm   PTS   PF  TOV  BLK  STL
0      1990      Mark Acres   C   27  ORL   362  248   70   25   36
1      1990   Michael Adams  PG   27  DEN  1221  133  141    3  121
2      1990    Mark Aguirre  SF   30  DET  1099  201  121   19   34
3      1990     Danny Ainge  PG   30  SAC  1342  238  185   18  113
4      1990     Mark Alarie  PF   26  WSB   860  219  101   39   60
12145  2017     Ivica Zubac   C   19  LAL   284   66   30   33   14
12144  2017     Paul Zipser  SF   22  CHI   240   78   40   16   15
12143  2017    Tyler Zeller   C   27  BOS   178   61   20   21    7
12142  2017     Cody Zeller  PF   24  CHO   639  189   65   58   62
12141  2017  Thaddeus Young  PF   28  IND   814  135   96   30  114


## nba17-18

In [235]:
# The 2017-18 season is split across two CSVs under src_dir
stats_2018_a = os.path.join(src_dir, "nba17-18/nba.csv")
stats_2018_b = os.path.join(src_dir, "nba17-18/nba_extra.csv")
# Outer join with no explicit keys: pandas joins on the columns both frames share
# and keeps rows from either side
df2018 = pd.read_csv(stats_2018_a, index_col=0).merge(
    pd.read_csv(stats_2018_b, index_col=0), how="outer"
)

In [236]:
def format_stats_2018(df2018: pd.DataFrame) -> pd.DataFrame:
    """
    Formats 2018 stats file. Note that this function modifies df2018 to save memory.
    This function could make df2018 an out parameter, but we return a value for consistency.
    In:
        df2018: DataFrame from the 2018 stats files; modified in-place to save memory.
    Return:
        Formatted DataFrame (the same object that was passed in).
    Raises:
        KeyError: if, after the NA drops, df2018 has no "GS", "G" or "Player" column.
    """
    # Drop NA columns
    df2018.dropna(axis=1, how='all', inplace=True)
    # Drop NA rows
    df2018.dropna(inplace=True)  # Defaults: axis=0, how='any'
    # Add Year column; the guard keeps a second call on the same frame from failing on insert
    if "Year" not in df2018.columns:
        df2018.insert(0, "Year", 2018)
    # Reorder columns so GS sits right after G; reindex doesn't do in-place,
    # so pop + insert is used to keep mutating df2018 itself
    gs = df2018.pop("GS")
    df2018.insert(df2018.columns.get_loc('G') + 1, "GS", gs)
    # Remove name encoding appended to name (escaped backslash).
    # regex=True must be explicit: since pandas 2.0 str.replace defaults to a literal
    # match, which would leave the backslash suffix on every name.
    df2018["Player"] = df2018["Player"].str.replace(r'\\(.*)', '', regex=True)
    # Reset index
    df2018.reset_index(drop=True, inplace=True)  # Drop the old index (rather than adding it as a col)
    return df2018

In [238]:
df2018 = format_stats_2018(df2018)
# Corner preview: positions 0..4 followed by -1..-5, used for both rows and columns
edge_positions = [*range(5), *range(-1, -6, -1)]
print(df2018.iloc[edge_positions, edge_positions])

     Year         Player Pos  Age   Tm   PTS   PF  TOV  BLK  STL
0    2018   Alex Abrines  SG   24  OKC   353  124   25    8   38
1    2018     Quincy Acy  PF   27  BRK   411  149   60   29   33
2    2018   Steven Adams   C   24  OKC  1056  215  128   78   92
3    2018    Bam Adebayo   C   20  MIA   477  138   66   41   32
4    2018  Arron Afflalo  SG   32  ORL   179   56   21    9    4
560  2018    Ivica Zubac   C   20  LAL   161   47   26   15    8
559  2018    Paul Zipser  SF   23  CHI   218   86   43   15   20
558  2018   Tyler Zeller   C   28  MIL   141   48   12   14    7
557  2018   Tyler Zeller   C   28  BRK   300   78   35   21    8
556  2018   Tyler Zeller   C   28  TOT   441  126   47   35   15
