In [2]:
import glob
import numpy as np
import pandas as pd
import os

from functools import reduce

# Create Featurized Datasets

In [3]:
def listdir_nohidden(path):
    return glob.glob(os.path.join(path, '*'))

In [3]:
def apply_moving_averages(df, window_sizes=[1,2,3,4,5,6,7,8], alphas = [.01, .05, .1, .2, .3, .5, .75, .9, .95]):
    
    #Create a copy of original df to apply moving average transformations to
    #And shift all rows down by 1 (don't want to predict current game's outcome with current game's stats)
    df_copy = df.copy().loc[:, ~df.columns.isin(['id','date','season','home','opp', 'w/l'])].shift()
    
    #Create df to append all transformed columns to one df
    moving_average_df = df[['id', 'pt_diff', 'date', 'season', 'home']].copy()
    
    df_list = [moving_average_df]
    
    for w in window_sizes:
        #Apply (equally weighted) rolling average to df
        temp_df = df_copy.rolling(window=w, min_periods=1).mean()
        
        #Add suffix to all columns designating which operation was applied
        temp_df = temp_df.add_suffix("_roll_{}".format(w))
        
        df_list.append(temp_df)
        
    for a in alphas:
        temp_df = df_copy.ewm(alpha=a).mean()
        temp_df = temp_df.add_suffix("_exp_{}".format(a))
        
        df_list.append(temp_df)
        
    #Combine all moving average dfs into one
    df_merged = pd.concat(df_list,axis=1)
    
    df_merged.insert(1, 'games_played', df_merged.index + 1)
    
    return df_merged
    
    
    

In [4]:
def days_since_last_game(df):
    
    #Get the number of days since last game each game in season
    dates = pd.to_datetime(df['date'])
    
    return (dates - dates.shift()).dt.days.fillna(0).astype(int)

In [5]:
def consec_away_games(df):
    
    #Get # of consecutive away games for each game in season
    
    num_consec = 0
    consec_list = []
    for home in df['home']:
        if home == 0:
            num_consec += 1
            consec_list.append(num_consec)
        else:
            num_consec = 0
            consec_list.append(num_consec)
            
    return consec_list

In [15]:
#apply feature extraction methods to each individual gamelog and save resulting df

working_dir = '/Users/gregyannett/Documents/nba_game_models/data/'

for year_dir in listdir_nohidden(working_dir+'gamelogs'):
    
    year = year_dir[-4:]
    
    os.mkdir(working_dir+'featurized_gamelogs/{year}'.format(year=year))
    
    for gamelog_path in listdir_nohidden(year_dir):
        
        gamelog = pd.read_csv(gamelog_path)
        
        #Extract features from data
        feature_df = apply_moving_averages(gamelog)
        feature_df['days_since_last'] = days_since_last_game(feature_df)
        feature_df['back2back'] = np.where(feature_df['days_since_last']==1, 1, 0)
        feature_df['consec_away'] = consec_away_games(feature_df)
        
        #save data to featurized_gamelogs directory
        fpath = working_dir + '/featurized_gamelogs/{year}/{team}_{year}.csv'.format(team=gamelog_path[-12:-9], year=year)
        feature_df.to_csv(fpath, index=False)
        
    print(year)
        

2013
2014
2015
2012
1994
1995
2008
2001
2006
2007
2000
2009
2017
2010
2019
2020
2018
2011
2016
1997
1999
1998
1996
2005
2002
2003
2004


In [None]:
working_dir = '/Users/gregyannett/Documents/nba_game_models/data/'

In [67]:
#Gets comprehensive dataframe of all games from all seasons (moving average stats, schedule features, etc for both home and away teams)

working_dir = '/Users/gregyannett/Documents/nba_game_models/data/'

home_dfs = []
away_dfs = []

for year_dir in listdir_nohidden(working_dir + 'featurized_gamelogs'):
    
    year = year_dir[-4:]
    
    for gamelog_path in listdir_nohidden(year_dir):
        
        temp = pd.read_csv(gamelog_path)
        
        is_home = temp['home'].values.astype(bool)
        
        temp = temp.drop(['home'], axis=1)
        
        home = temp[is_home]
        
        home_dfs.append(home)
        
        away = temp[~is_home]
        away = away.drop(['date'],axis=1)
        
        away.columns = ['{}{}'.format('' if c == 'id' else 'away_', c) for c in away.columns]
        
        away_dfs.append(away)  
        
    print(year)
    
all_home_games = pd.concat(home_dfs)
all_away_games = pd.concat(away_dfs)

all_games = all_home_games.merge(all_away_games, on='id').reset_index(drop=True)

all_games.to_csv(working_dir + 'datasets/all_games.csv', index=False)

2013
2014
2015
2012
1994
1995
2008
2001
2006
2007
2000
2009
2017
2010
2019
2021
2020
2018
2011
2016
1997
1999
1998
1996
2005
2002
2003
2004
