In [2]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
# Notebook display config: show every column of wide frames, but cap the
# number of rows rendered so large tables do not flood the output.
pd.options.display.max_columns = None
pd.options.display.max_rows = 30

In [3]:
def combine_team_games(df, keep_method='home'):
    '''Combine a TEAM_ID-GAME_ID unique table into rows by game. Slow.

        The table is self-joined on SEASON_ID, GAME_ID and GAME_DATE, so every
        team row is paired with the other team row(s) of the same game. Columns
        of the left-hand team get the ``_Home`` suffix and columns of the
        right-hand team get the ``_Away`` suffix. The suffixes are only
        literally true for ``keep_method='home'``; for every other method they
        simply label the left and right side of the pairing.

        Parameters
        ----------
        df : Input DataFrame.
            Must contain SEASON_ID, GAME_ID, GAME_DATE and TEAM_ID. MATCHUP is
            required for 'home'/'away' and WL for 'winner'/'loser'.
        keep_method : {'home', 'away', 'winner', 'loser', ``None``}, default 'home'
            - 'home' : Keep rows where the ``_Home``-suffixed team is the home team.
            - 'away' : Keep rows where the ``_Home``-suffixed team is the away team.
            - 'winner' : Keep rows where the ``_Home``-suffixed team is the winning team.
            - 'loser' : Keep rows where the ``_Home``-suffixed team is the losing team.
            - ``None`` : Keep all rows. Will result in an output DataFrame the same
                length as the input DataFrame.

        Returns
        -------
        result : DataFrame

        Raises
        ------
        ValueError
            If ``keep_method`` is a string that is not one of the options above.
    '''
    # Join every row to all others with the same game ID.
    joined = pd.merge(df, df, suffixes=('_Home', '_Away'),
                      on=['SEASON_ID', 'GAME_ID', 'GAME_DATE'])
    # Filter out any row that is joined to itself.
    result = joined[joined['TEAM_ID_Home'] != joined['TEAM_ID_Away']]

    if keep_method is None:
        # Return all the rows (both orientations of every game).
        return result

    method = keep_method.lower()
    # The filters below must use the '_Home' suffix produced by the merge above.
    if method == 'home':
        # 'XXX vs. YYY' marks a home game; match it literally, not as a regex.
        keep = result['MATCHUP_Home'].str.contains(' vs. ', regex=False)
    elif method == 'away':
        # 'XXX @ YYY' marks an away game.
        keep = result['MATCHUP_Home'].str.contains(' @ ', regex=False)
    elif method == 'winner':
        keep = result['WL_Home'] == 'W'
    elif method == 'loser':
        keep = result['WL_Home'] == 'L'
    else:
        raise ValueError(f'Invalid keep_method: {keep_method}')
    return result[keep]


In [4]:
# Load the three '21' pickles (box score, four factors, season table).
# NOTE: pickle.load executes arbitrary code; only use on trusted files.
with open('data/pickles/boxscoreadv21.p', 'rb') as bs_file, \
        open('data/pickles/fourfactors21.p', 'rb') as ff_file, \
        open('data/pickles/season21.p', 'rb') as season_file:
    bs21 = pickle.load(bs_file)
    ff21 = pickle.load(ff_file)
    season21 = pickle.load(season_file)

In [5]:
# Load the three '20' pickles (box score, four factors, season table).
# NOTE: pickle.load executes arbitrary code; only use on trusted files.
with open('data/pickles/boxscoreadv20.p', 'rb') as bs_file, \
        open('data/pickles/fourfactors20.p', 'rb') as ff_file, \
        open('data/pickles/season20.p', 'rb') as season_file:
    bs20 = pickle.load(bs_file)
    ff20 = pickle.load(ff_file)
    season20 = pickle.load(season_file)

In [6]:
# Load the three '19' pickles (box score, four factors, season table).
# NOTE: pickle.load executes arbitrary code; only use on trusted files.
with open('data/pickles/boxscoreadv19.p', 'rb') as bs_file, \
        open('data/pickles/fourfactors19.p', 'rb') as ff_file, \
        open('data/pickles/season19.p', 'rb') as season_file:
    bs19 = pickle.load(bs_file)
    ff19 = pickle.load(ff_file)
    season19 = pickle.load(season_file)

In [7]:
# Load the three '18' pickles (box score, four factors, season table).
# NOTE: pickle.load executes arbitrary code; only use on trusted files.
with open('data/pickles/boxscoreadv18.p', 'rb') as bs_file, \
        open('data/pickles/fourfactors18.p', 'rb') as ff_file, \
        open('data/pickles/season18.p', 'rb') as season_file:
    bs18 = pickle.load(bs_file)
    ff18 = pickle.load(ff_file)
    season18 = pickle.load(season_file)

In [8]:
# Load the three '17' pickles (box score, four factors, season table).
# NOTE: pickle.load executes arbitrary code; only use on trusted files.
with open('data/pickles/boxscoreadv17.p', 'rb') as bs_file, \
        open('data/pickles/fourfactors17.p', 'rb') as ff_file, \
        open('data/pickles/season17.p', 'rb') as season_file:
    bs17 = pickle.load(bs_file)
    ff17 = pickle.load(ff_file)
    season17 = pickle.load(season_file)

In [11]:
# NOTE(review): mid-notebook import; consider moving it to the import cell at the top.
from nba_api.stats.endpoints import boxscorefourfactorsv2
# Re-request the four-factors box score for one specific game
# (presumably a live request to the NBA stats API — needs network access).
fix = boxscorefourfactorsv2.BoxScoreFourFactorsV2(game_id = '0022001069')
# Use the second frame returned by the endpoint — presumably the team-level
# table; TODO confirm against the nba_api docs.
fixdf = fix.get_data_frames()[1]


# Renumber ff21's rows 0..n-1 in place so the label-based slice below is well defined.
ff21.reset_index(drop=True, inplace=True)

# Overwrite the rows labelled 0 and 1 (``.loc`` slices are end-inclusive) with
# the re-fetched data.
# NOTE(review): assumes those two rows are the ones for game '0022001069' and
# that fixdf's index/columns line up with ff21's — verify.
ff21.loc[0:1]=fixdf

In [14]:
def combine_dfs(season_df, bs_df, ff_df):
    """Merge the three per-team tables and pair them into one row per game.

    Parameters
    ----------
    season_df, bs_df, ff_df : DataFrame
        Tables that are joined on GAME_ID and TEAM_ID, in that order.

    Returns
    -------
    DataFrame
        Output of ``combine_team_games(..., keep_method='home')`` indexed by
        the parsed GAME_DATE and sorted by that index.
    """
    join_keys = ['GAME_ID', 'TEAM_ID']

    # Season table + box score; discard the box-score copies of shared columns.
    season_bs = season_df.merge(bs_df, on=join_keys).drop(
        columns=['TEAM_NAME_y', 'TEAM_ABBREVIATION_y', 'MIN_y', 'TEAM_CITY'])

    # Add the four-factors table; discard its copies of shared columns.
    per_team = season_bs.merge(ff_df, on=join_keys).drop(
        columns=['TEAM_NAME', 'TEAM_ABBREVIATION', 'TEAM_CITY', 'MIN',
                 'EFG_PCT_y', 'TM_TOV_PCT_y'])

    # Pair the two team rows of each game, keeping the home-team orientation.
    per_game = combine_team_games(per_team, keep_method='home')

    # Index by the parsed game date (the GAME_DATE column itself is kept).
    game_dates = pd.to_datetime(per_game['GAME_DATE'])
    return per_game.set_index(game_dates).sort_index()
    

In [46]:
def combine_dfs_seperate(season_df, bs_df, ff_df):
    """Merge the three per-team tables, keeping one row per team per game.

    Parameters
    ----------
    season_df, bs_df, ff_df : DataFrame
        Tables that are joined on GAME_ID and TEAM_ID, in that order.

    Returns
    -------
    DataFrame
        The merged table indexed by the parsed GAME_DATE and sorted by that
        index. The GAME_DATE column itself is kept.
    """
    join_keys = ['GAME_ID', 'TEAM_ID']

    # Season table + box score; discard the box-score copies of shared columns.
    season_bs = season_df.merge(bs_df, on=join_keys).drop(
        columns=['TEAM_NAME_y', 'TEAM_ABBREVIATION_y', 'MIN_y', 'TEAM_CITY'])

    # Add the four-factors table; discard its copies of shared columns.
    per_team = season_bs.merge(ff_df, on=join_keys).drop(
        columns=['TEAM_NAME', 'TEAM_ABBREVIATION', 'TEAM_CITY', 'MIN',
                 'EFG_PCT_y', 'TM_TOV_PCT_y'])

    game_dates = pd.to_datetime(per_team['GAME_DATE'])
    return per_team.set_index(game_dates).sort_index()

In [60]:
# Build the per-team (unpaired) merged table for each of the five datasets.
splitdf21, splitdf20, splitdf19, splitdf18, splitdf17 = (
    combine_dfs_seperate(season, bs, ff)
    for season, bs, ff in (
        (season21, bs21, ff21),
        (season20, bs20, ff20),
        (season19, bs19, ff19),
        (season18, bs18, ff18),
        (season17, bs17, ff17),
    )
)

In [63]:
# Persist each per-team merged table to its own pickle file.
for _path, _frame in (
    ('data/pickles/splitdf21.p', splitdf21),
    ('data/pickles/splitdf20.p', splitdf20),
    ('data/pickles/splitdf19.p', splitdf19),
    ('data/pickles/splitdf18.p', splitdf18),
    ('data/pickles/splitdf17.p', splitdf17),
):
    with open(_path, 'wb') as writefile:
        pickle.dump(_frame, writefile)

In [16]:
# Build the one-row-per-game merged table for each of the five datasets.
df21, df20, df19, df18, df17 = (
    combine_dfs(season, bs, ff)
    for season, bs, ff in (
        (season21, bs21, ff21),
        (season20, bs20, ff20),
        (season19, bs19, ff19),
        (season18, bs18, ff18),
        (season17, bs17, ff17),
    )
)

In [19]:
# Persist each one-row-per-game table to its own pickle file.
for _path, _frame in (
    ('data/pickles/df21.p', df21),
    ('data/pickles/df20.p', df20),
    ('data/pickles/df19.p', df19),
    ('data/pickles/df18.p', df18),
    ('data/pickles/df17.p', df17),
):
    with open(_path, 'wb') as writefile:
        pickle.dump(_frame, writefile)


PLUS_MINUS_Home      1.000000
NET_RATING_Home      0.993704
E_NET_RATING_Home    0.976128
PIE_Home             0.938534
E_OFF_RATING_Home    0.614020
E_DEF_RATING_Away    0.614020
DEF_RATING_Away      0.606194
OFF_RATING_Home      0.606194
TS_PCT_Home          0.561642
EFG_PCT_x_Home       0.546302
OPP_EFG_PCT_Away     0.546302
PTS_Home             0.543001
FG_PCT_Home          0.512827
FGM_Home             0.460586
REB_PCT_Home         0.442423
DREB_Home            0.417182
AST_RATIO_Home       0.406193
AST_Home             0.402756
FG3_PCT_Home         0.385150
FG3M_Home            0.348831
Name: PLUS_MINUS_Home, dtype: float64