In [2]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
# Display options for this notebook: never truncate columns when rendering a
# DataFrame, and cap the number of rendered rows at 30.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 30)

In [3]:
def combine_team_games(df, keep_method='home'):
    '''Combine a TEAM_ID-GAME_ID unique table into rows by game. Slow.

        The input is self-joined on SEASON_ID, GAME_ID and GAME_DATE, so every
        team row is paired with the other team row(s) of the same game.
        Columns of the left-hand copy ("TEAM_A") receive the suffix ``_Home``
        and columns of the right-hand copy receive ``_Away``.  The suffixes
        are fixed labels: they only correspond to the real venue when
        ``keep_method='home'``.

        Parameters
        ----------
        df : Input DataFrame. Must contain SEASON_ID, GAME_ID, GAME_DATE and
            TEAM_ID, plus MATCHUP (for 'home'/'away') or WL (for
            'winner'/'loser').
        keep_method : {'home', 'away', 'winner', 'loser', ``None``}, default 'home'
            - 'home' : Keep rows where TEAM_A is the home team.
            - 'away' : Keep rows where TEAM_A is the away team.
            - 'winner' : Keep rows where TEAM_A is the winning team.
            - 'loser' : Keep rows where TEAM_A is the losing team.
            - ``None`` : Keep all rows. Will result in an output DataFrame the same
                length as the input DataFrame.
            String values are matched case-insensitively.

        Returns
        -------
        result : DataFrame

        Raises
        ------
        ValueError
            If ``keep_method`` is not one of the values listed above.
    '''
    # Join every row to all others with the same game ID.
    joined = pd.merge(df, df, suffixes=('_Home', '_Away'),
                      on=['SEASON_ID', 'GAME_ID', 'GAME_DATE'])
    # Filter out any row that is joined to itself.
    result = joined[joined.TEAM_ID_Home != joined.TEAM_ID_Away]

    if keep_method is None:
        # Return all the rows.
        return result

    # Normalise case for strings; anything else falls through to ValueError.
    method = keep_method.lower() if isinstance(keep_method, str) else keep_method

    # TEAM_A is always the left-hand copy, i.e. the ``_Home``-suffixed columns.
    if method == 'home':
        # A home MATCHUP reads "AAA vs. BBB"; match the text literally so the
        # '.' is not treated as a regex wildcard.
        keep = result.MATCHUP_Home.str.contains(' vs. ', regex=False)
    elif method == 'away':
        # An away MATCHUP reads "AAA @ BBB".
        keep = result.MATCHUP_Home.str.contains(' @ ', regex=False)
    elif method == 'winner':
        keep = result.WL_Home == 'W'
    elif method == 'loser':
        keep = result.WL_Home == 'L'
    else:
        raise ValueError(f'Invalid keep_method: {keep_method}')
    return result[keep]


In [4]:
# Load the three '21' input tables from data/pickles/:
#   bs21     <- boxscoreadv21.p
#   ff21     <- fourfactors21.p
#   season21 <- season21.p
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by this project.
with open('data/pickles/boxscoreadv21.p', 'rb') as readfile: 
    bs21 = pickle.load(readfile)
with open('data/pickles/fourfactors21.p', 'rb') as readfile: 
    ff21 = pickle.load(readfile)
with open('data/pickles/season21.p', 'rb') as readfile: 
    season21 = pickle.load(readfile)

In [5]:
# Load the three '20' input tables from data/pickles/:
#   bs20     <- boxscoreadv20.p
#   ff20     <- fourfactors20.p
#   season20 <- season20.p
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by this project.
with open('data/pickles/boxscoreadv20.p', 'rb') as readfile: 
    bs20 = pickle.load(readfile)
with open('data/pickles/fourfactors20.p', 'rb') as readfile: 
    ff20 = pickle.load(readfile)
with open('data/pickles/season20.p', 'rb') as readfile: 
    season20 = pickle.load(readfile)

In [6]:
# Load the three '19' input tables from data/pickles/:
#   bs19     <- boxscoreadv19.p
#   ff19     <- fourfactors19.p
#   season19 <- season19.p
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by this project.
with open('data/pickles/boxscoreadv19.p', 'rb') as readfile: 
    bs19 = pickle.load(readfile)
with open('data/pickles/fourfactors19.p', 'rb') as readfile: 
    ff19 = pickle.load(readfile)
with open('data/pickles/season19.p', 'rb') as readfile: 
    season19 = pickle.load(readfile)

In [7]:
# Load the three '18' input tables from data/pickles/:
#   bs18     <- boxscoreadv18.p
#   ff18     <- fourfactors18.p
#   season18 <- season18.p
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by this project.
with open('data/pickles/boxscoreadv18.p', 'rb') as readfile: 
    bs18 = pickle.load(readfile)
with open('data/pickles/fourfactors18.p', 'rb') as readfile: 
    ff18 = pickle.load(readfile)
with open('data/pickles/season18.p', 'rb') as readfile: 
    season18 = pickle.load(readfile)

In [8]:
# Load the three '17' input tables from data/pickles/:
#   bs17     <- boxscoreadv17.p
#   ff17     <- fourfactors17.p
#   season17 <- season17.p
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by this project.
with open('data/pickles/boxscoreadv17.p', 'rb') as readfile: 
    bs17 = pickle.load(readfile)
with open('data/pickles/fourfactors17.p', 'rb') as readfile: 
    ff17 = pickle.load(readfile)
with open('data/pickles/season17.p', 'rb') as readfile: 
    season17 = pickle.load(readfile)

In [11]:
# Patch ff21: fetch the four-factors box score for game 0022001069 again and
# write it over the first two rows of ff21.
# NOTE(review): this import sits mid-notebook rather than in the import cell,
# and the endpoint call presumably performs a live request to the NBA stats
# API, so this cell is not reproducible offline - confirm.
from nba_api.stats.endpoints import boxscorefourfactorsv2
fix = boxscorefourfactorsv2.BoxScoreFourFactorsV2(game_id = '0022001069')
# Second frame returned by the endpoint; presumably the team-level table with
# one row per team - TODO confirm.
fixdf = fix.get_data_frames()[1]


# Renumber ff21 as 0..n-1 so that the label slice below addresses its first rows.
ff21.reset_index(drop=True, inplace=True)

# .loc label slices are inclusive, so 0:1 targets the rows labelled 0 and 1.
# NOTE(review): assumes the rows for game 0022001069 are exactly rows 0 and 1
# of ff21 and that fixdf carries matching index labels and columns - verify.
ff21.loc[0:1]=fixdf

In [14]:
def combine_dfs(season_df, bs_df, ff_df):
    '''Merge three per-team tables and pair them up into one row per game.

        Parameters
        ----------
        season_df, bs_df, ff_df : DataFrames that each hold one row per
            (GAME_ID, TEAM_ID).  They are inner-joined on those two keys in
            the order given.

        Returns
        -------
        DataFrame
            Output of ``combine_team_games(..., keep_method='home')``, indexed
            by GAME_DATE parsed as datetimes and sorted by that index.
    '''
    team_keys = ['GAME_ID', 'TEAM_ID']

    # season_df + bs_df: discard the right-hand (_y) copies of the overlapping
    # columns together with TEAM_CITY.
    season_with_bs = season_df.merge(bs_df, on=team_keys).drop(
        columns=['TEAM_NAME_y', 'TEAM_ABBREVIATION_y', 'MIN_y', 'TEAM_CITY'])

    # ... + ff_df: discard the identifier columns brought in by ff_df and its
    # (_y) copies of EFG_PCT and TM_TOV_PCT.
    team_rows = season_with_bs.merge(ff_df, on=team_keys).drop(
        columns=['TEAM_NAME', 'TEAM_ABBREVIATION', 'TEAM_CITY', 'MIN',
                 'EFG_PCT_y', 'TM_TOV_PCT_y'])

    # Pair each team row with its opponent, keeping the home team's row.
    games = combine_team_games(team_rows, keep_method='home')

    # Index by the parsed game date and return in date order.
    game_dates = pd.to_datetime(games['GAME_DATE'])
    return games.set_index(game_dates, drop=True).sort_index()
    

In [96]:
def combine_dfs_seperate(season_df, bs_df, ff_df):
    '''Merge three per-team tables, keeping one row per team per game.

        Parameters
        ----------
        season_df, bs_df, ff_df : DataFrames that each hold one row per
            (GAME_ID, TEAM_ID).  They are inner-joined on those two keys in
            the order given.

        Returns
        -------
        DataFrame
            The merged table sorted by GAME_DATE, then GAME_ID (both
            ascending), with a fresh 0..n-1 index.
    '''
    team_keys = ['GAME_ID', 'TEAM_ID']

    # season_df + bs_df: discard the right-hand (_y) copies of the overlapping
    # columns together with TEAM_CITY.
    season_with_bs = season_df.merge(bs_df, on=team_keys).drop(
        columns=['TEAM_NAME_y', 'TEAM_ABBREVIATION_y', 'MIN_y', 'TEAM_CITY'])

    # ... + ff_df: discard the identifier columns brought in by ff_df and its
    # (_y) copies of EFG_PCT and TM_TOV_PCT.
    team_rows = season_with_bs.merge(ff_df, on=team_keys).drop(
        columns=['TEAM_NAME', 'TEAM_ABBREVIATION', 'TEAM_CITY', 'MIN',
                 'EFG_PCT_y', 'TM_TOV_PCT_y'])

    return (
        team_rows
        .sort_values(by=['GAME_DATE', 'GAME_ID'], ascending=[True, True])
        .reset_index(drop=True)
    )

In [97]:
# Build the merged one-row-per-team-per-game table for each of the five
# loaded data sets ('21' down to '17').
splitdf21 = combine_dfs_seperate(season21, bs21, ff21)
splitdf20 = combine_dfs_seperate(season20, bs20, ff20)
splitdf19 = combine_dfs_seperate(season19, bs19, ff19)
splitdf18 = combine_dfs_seperate(season18, bs18, ff18)
splitdf17 = combine_dfs_seperate(season17, bs17, ff17)

In [100]:
# Persist each per-team merged table as data/pickles/splitdfNN.p.
# Existing files with the same name are overwritten ('wb' mode).
with open('data/pickles/splitdf21.p', 'wb') as writefile: 
    pickle.dump(splitdf21, writefile)
    
with open('data/pickles/splitdf20.p', 'wb') as writefile: 
    pickle.dump(splitdf20, writefile)
    
with open('data/pickles/splitdf19.p', 'wb') as writefile: 
    pickle.dump(splitdf19, writefile)
    
with open('data/pickles/splitdf18.p', 'wb') as writefile: 
    pickle.dump(splitdf18, writefile)
    
with open('data/pickles/splitdf17.p', 'wb') as writefile: 
    pickle.dump(splitdf17, writefile)

In [99]:
# Preview the first 14 rows of the '21' per-team table to sanity-check the merge.
splitdf21.head(14)

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION_x,TEAM_NAME_x,GAME_ID,GAME_DATE,MATCHUP,WL,MIN_x,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,E_OFF_RATING,OFF_RATING,E_DEF_RATING,DEF_RATING,E_NET_RATING,NET_RATING,AST_PCT,AST_TOV,AST_RATIO,OREB_PCT_x,DREB_PCT,REB_PCT,E_TM_TOV_PCT,TM_TOV_PCT_x,EFG_PCT_x,TS_PCT,USG_PCT,E_USG_PCT,E_PACE,PACE,PACE_PER40,POSS,PIE,FTA_RATE,OREB_PCT_y,OPP_EFG_PCT,OPP_FTA_RATE,OPP_TOV_PCT,OPP_OREB_PCT
0,22020,1610612744,GSW,Golden State Warriors,22000001,2020-12-22,GSW @ BKN,L,240,99,37,99,0.374,10,33,0.303,15,23,0.652,13,34,47,26,6,6,18,24,-26.0,86.8,88.4,110.5,111.6,-23.8,-23.2,0.703,1.44,17.0,0.281,0.685,0.466,15.773,16.1,0.424,0.454,1.0,0.201,113.6,112.0,93.33,112,0.35,0.232,0.203,0.538,0.348,0.177,0.315
1,22020,1610612751,BKN,Brooklyn Nets,22000001,2020-12-22,BKN vs. GSW,W,242,125,42,92,0.457,15,35,0.429,26,32,0.813,13,44,57,24,11,7,20,22,26.0,110.5,111.6,86.8,88.4,23.8,23.2,0.571,1.2,16.0,0.315,0.719,0.534,17.687,17.9,0.538,0.589,1.0,0.201,113.6,112.0,93.33,112,0.65,0.348,0.241,0.424,0.232,0.158,0.281
2,22020,1610612747,LAL,Los Angeles Lakers,22000002,2020-12-22,LAL vs. LAC,L,240,109,38,81,0.469,9,29,0.31,24,31,0.774,8,37,45,22,4,2,19,20,-7.0,103.2,104.8,109.1,111.5,-5.9,-6.7,0.579,1.16,16.2,0.25,0.731,0.51,17.986,18.3,0.525,0.576,1.0,0.199,106.0,104.0,86.67,104,0.509,0.383,0.182,0.548,0.204,0.15,0.269
3,22020,1610612746,LAC,LA Clippers,22000002,2020-12-22,LAC @ LAL,W,241,116,44,93,0.473,14,40,0.35,14,19,0.737,11,29,40,22,10,3,16,29,7.0,109.1,111.5,103.2,104.8,5.9,6.7,0.5,1.38,15.8,0.269,0.75,0.49,15.043,15.4,0.548,0.572,1.0,0.198,106.0,104.0,86.67,104,0.491,0.204,0.212,0.525,0.383,0.18,0.25
4,22020,1610612749,MIL,Milwaukee Bucks,22000003,2020-12-23,MIL @ BOS,L,239,121,46,90,0.511,14,35,0.4,15,18,0.833,11,41,52,19,4,6,16,20,-1.0,117.6,119.8,118.6,122.0,-1.1,-2.2,0.413,1.19,14.3,0.333,0.764,0.57,15.546,15.8,0.589,0.618,1.0,0.198,102.88,100.5,83.75,101,0.506,0.2,0.244,0.564,0.109,0.068,0.236
5,22020,1610612738,BOS,Boston Celtics,22000003,2020-12-23,BOS vs. MIL,W,240,122,48,101,0.475,18,40,0.45,8,11,0.727,10,27,37,23,8,6,6,17,1.0,118.6,122.0,117.6,119.8,1.1,2.2,0.479,3.29,16.9,0.236,0.667,0.43,6.807,7.0,0.564,0.576,1.0,0.197,102.88,100.5,83.75,100,0.494,0.109,0.182,0.589,0.2,0.155,0.333
6,22020,1610612742,DAL,Dallas Mavericks,22000004,2020-12-23,DAL @ PHX,L,241,102,36,85,0.424,9,37,0.243,21,26,0.808,6,33,39,17,6,0,11,23,-4.0,99.6,99.0,102.0,102.9,-2.5,-3.9,0.472,1.42,13.6,0.157,0.795,0.453,11.714,11.7,0.476,0.529,1.0,0.199,103.16,103.0,85.83,103,0.449,0.306,0.118,0.551,0.342,0.183,0.205
7,22020,1610612756,PHX,Phoenix Suns,22000004,2020-12-23,PHX vs. DAL,W,241,106,38,79,0.481,11,30,0.367,19,27,0.704,6,42,48,21,7,7,18,26,4.0,102.0,102.9,99.6,99.0,2.5,3.9,0.553,1.11,16.0,0.205,0.843,0.547,18.29,18.4,0.551,0.583,1.0,0.198,103.16,103.0,85.83,103,0.551,0.342,0.136,0.476,0.306,0.117,0.157
8,22020,1610612739,CLE,Cleveland Cavaliers,22000010,2020-12-23,CLE vs. CHA,W,241,121,46,87,0.529,14,30,0.467,15,20,0.75,10,40,50,34,12,3,18,22,7.0,113.3,118.6,110.0,111.8,3.3,6.9,0.739,1.62,22.5,0.341,0.8,0.585,19.663,20.6,0.609,0.632,1.0,0.192,105.2,102.0,85.0,102,0.557,0.23,0.227,0.589,0.167,0.145,0.2
9,22020,1610612766,CHA,Charlotte Hornets,22000010,2020-12-23,CHA @ CLE,L,240,114,45,90,0.5,16,44,0.364,8,15,0.533,8,24,32,29,10,4,15,17,-7.0,110.0,111.8,113.3,118.6,-3.3,-6.9,0.644,1.93,20.6,0.2,0.659,0.415,14.479,14.7,0.589,0.59,1.0,0.197,105.2,102.0,85.0,102,0.443,0.167,0.16,0.609,0.23,0.197,0.341


In [16]:
# Build the merged one-row-per-game table (home team's row kept, indexed by
# game date) for each of the five loaded data sets ('21' down to '17').
df21 = combine_dfs(season21, bs21, ff21)
df20 = combine_dfs(season20, bs20, ff20)
df19 = combine_dfs(season19, bs19, ff19)
df18 = combine_dfs(season18, bs18, ff18)
df17 = combine_dfs(season17, bs17, ff17)

In [19]:
# Persist each per-game merged table as data/pickles/dfNN.p.
# Existing files with the same name are overwritten ('wb' mode).
with open('data/pickles/df21.p', 'wb') as writefile: 
    pickle.dump(df21, writefile)
    
with open('data/pickles/df20.p', 'wb') as writefile: 
    pickle.dump(df20, writefile)
    
with open('data/pickles/df19.p', 'wb') as writefile: 
    pickle.dump(df19, writefile)
    
with open('data/pickles/df18.p', 'wb') as writefile: 
    pickle.dump(df18, writefile)
    
with open('data/pickles/df17.p', 'wb') as writefile: 
    pickle.dump(df17, writefile)


In [102]:
# Display ff21 to inspect it; the rendering is truncated by the
# display.max_rows option set at the top of the notebook.
ff21

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,TEAM_CITY,MIN,EFG_PCT,FTA_RATE,TM_TOV_PCT,OREB_PCT,OPP_EFG_PCT,OPP_FTA_RATE,OPP_TOV_PCT,OPP_OREB_PCT
0,0022001069,1610612748,Heat,MIA,Miami,240:00,0.637,0.238,0.121,0.154,0.529,0.256,0.153,0.245
1,0022001069,1610612765,Pistons,DET,Detroit,240:00,0.529,0.256,0.153,0.143,0.637,0.238,0.121,0.231
2,0022001066,1610612745,Rockets,HOU,Houston,240:00,0.435,0.217,0.151,0.172,0.583,0.289,0.126,0.298
3,0022001066,1610612737,Hawks,ATL,Atlanta,240:00,0.583,0.289,0.126,0.234,0.435,0.217,0.151,0.250
4,0022001079,1610612761,Raptors,TOR,Toronto,240:00,0.464,0.299,0.113,0.271,0.584,0.189,0.123,0.271
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2155,0022000017,1610612763,Grizzlies,MEM,Memphis,240:00,0.568,0.189,0.101,0.102,0.558,0.305,0.099,0.224
2156,0022000001,1610612744,Warriors,GSW,Golden State,240:00,0.424,0.232,0.158,0.203,0.538,0.348,0.177,0.315
2157,0022000001,1610612751,Nets,BKN,Brooklyn,240:00,0.538,0.348,0.177,0.241,0.424,0.232,0.158,0.281
2158,0022000002,1610612747,Lakers,LAL,Los Angeles,240:00,0.525,0.383,0.180,0.182,0.548,0.204,0.150,0.269
