# Data Wrangling Part 2

The purpose of this script is to use the 2018-19 season games and the LUTs we generated to train an ML model to predict the winner of NBA regular season games.

In [1]:
import pandas as pd
import numpy as np
import joblib

In [2]:
teams = pd.read_csv('Data/teams.csv')
df = pd.read_csv('Data/games_modelling.csv')
df.head()

Unnamed: 0,date,season,home,away,is_home_win,winner
0,2003-10-05,2003,UTA,DAL,1,UTA
1,2003-10-06,2003,MEM,MIL,1,MEM
2,2003-10-07,2003,SAC,LAC,1,SAC
3,2003-10-07,2003,POR,HOU,1,POR
4,2003-10-07,2003,MIA,PHI,1,MIA


In [3]:
luts = joblib.load('Data/luts.pkl')

In [4]:
df.shape

(24677, 6)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24677 entries, 0 to 24676
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   date         24677 non-null  object
 1   season       24677 non-null  int64 
 2   home         24677 non-null  object
 3   away         24677 non-null  object
 4   is_home_win  24677 non-null  int64 
 5   winner       24677 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.1+ MB


It seems that whenever we load this dataframe, the date column reverts back to an object.  Not sure why, but we can just manually fix this.  We'll also generate our team name - id dictionary as well.

In [6]:
# re-format the date column
df['date'] = df['date'].astype('datetime64')

# generate a team name-ID dictionary from the teams dataset
teams = teams[['TEAM_ID', 'ABBREVIATION']]
teams = teams.set_index('TEAM_ID')
id_to_name = teams.to_dict()['ABBREVIATION']

# generate this same dictionary in reverse (i.e. ID-name instead of name-ID)
name_to_id = dict((v,k) for k,v in id_to_name.items())  

Now we're in position to get each team's stats going into each game!  Let's start by loading the start and end dates for the regular season games.

In [7]:
start_dates = {'2003' : '2003-10-28',
              '2004' : '2004-11-02',
              '2005' : '2005-11-01',
              '2006' : '2006-10-31',
              '2007' : '2007-10-30',
              '2008' : '2008-10-28',
              '2009' : '2009-10-27',
              '2010' : '2010-10-26',
              '2011' : '2010-12-25',
              '2012' : '2012-10-30',
              '2013' : '2013-10-29',
              '2014' : '2014-10-28',
              '2015' : '2015-10-27',
              '2016' : '2016-10-25',
              '2017' : '2017-10-17',
              '2018' : '2018-10-16'}

end_dates = {'2003' : '2004-04-14',
            '2004' : '2005-04-20',
            '2005' : '2006-04-19',
            '2006' : '2007-04-18',
            '2007' : '2008-04-16',
            '2008' : '2009-04-16',
            '2009' : '2010-04-14',
            '2010' : '2011-04-13',
            '2011' : '2012-04-26',
            '2012' : '2013-04-17',
            '2013' : '2014-04-16',
            '2014' : '2015-04-15',
            '2015' : '2016-04-13',
            '2016' : '2017-04-12',
            '2017' : '2018-04-11',
            '2018' : '2019-04-10'}

Now, we can loop through the seasons in the dataframe, and the stats we're tracking for each season.  For each iteration, the approach will be as follows:
1. Loop through each season.
2. Create new dataframe for the regular season games in this season (these will be saved in a dictionary of dataframes).
3. Loop through the stats we want to track.
4. Merge the season dataframe with the lut for the stat we're looking at (merge on date), and save this as a temporary df.  This means that for each game, we have all the teams updated stats at the date of the game.
5. Loop through all the games in this season.  For each game, update the home and away teams' stats based on their values in the LUT (which has been merged at this point, so is in the same dataframe).
6. Once we've looped through all the games, we should have two pandas Series, which correspond to the home and away teams' stats.  All we need to do now is append these Series to the original dataframe for this season.

In [8]:
# initialize the seasons to loop through
seasons = np.arange(2003, 2019, 1)

# set up the stats to loop through
stats = stats = ['avg_pts_for', 'avg_pts_against', 'avg_reb_for', 'avg_reb_against', 
         'avg_ast_for', 'avg_ast_against', 'win_pct']

# initialize our empty dictionary of dataframes
games_by_season = {}

# loop through the seasons
for season in seasons:
    
    # create the dataframe of games for this regular season
    games_by_season[season] = df.loc[df['season'] == season, :]
    games_by_season[season] = games_by_season[season].loc[(games_by_season[season]['date'] >= start_dates[str(season)]) & \
                 (games_by_season[season]['date'] <= end_dates[str(season)]), :]
    games_by_season[season] = games_by_season[season].reset_index().drop('index', axis=1)
        
    # loop through the stats
    for stat in stats:

        print(f'Generating {stat} for {season} season')
        
        # this is the key for the LUT for this stat
        id_ = str(season) + '_' + stat
        
        # merge the games df with the LUT for this stat, save in temporary df
        tmp = pd.merge(games_by_season[season], luts[id_], how='left', on='date')
        tmp = tmp.reset_index().drop('index', axis=1)

        # initialize our averages for this stat (just to zeros to start)
        tmp[f'home_{stat}'] = 0.0
        tmp[f'away_{stat}'] = 0.0
                
        # loop through the home teams, and update their stats at the given date
        for i, team in enumerate(tmp['home']):
            tmp.loc[i, f'home_{stat}'] = tmp.loc[i, team]

        # loop through the away teams, and update their stats at the given date
        for i, team in enumerate(tmp['away']):
            tmp.loc[i, f'away_{stat}'] = tmp.loc[i, team]
        
        # take our newly formed statistic columns and add them to the original games df
        games_by_season[season][f'home_{stat}'] = tmp[f'home_{stat}']
        games_by_season[season][f'away_{stat}'] = tmp[f'away_{stat}']

Generating avg_pts_for for 2003 season
Generating avg_pts_against for 2003 season
Generating avg_reb_for for 2003 season
Generating avg_reb_against for 2003 season
Generating avg_ast_for for 2003 season
Generating avg_ast_against for 2003 season
Generating win_pct for 2003 season
Generating avg_pts_for for 2004 season
Generating avg_pts_against for 2004 season
Generating avg_reb_for for 2004 season
Generating avg_reb_against for 2004 season
Generating avg_ast_for for 2004 season
Generating avg_ast_against for 2004 season
Generating win_pct for 2004 season
Generating avg_pts_for for 2005 season
Generating avg_pts_against for 2005 season
Generating avg_reb_for for 2005 season
Generating avg_reb_against for 2005 season
Generating avg_ast_for for 2005 season
Generating avg_ast_against for 2005 season
Generating win_pct for 2005 season
Generating avg_pts_for for 2006 season
Generating avg_pts_against for 2006 season
Generating avg_reb_for for 2006 season
Generating avg_reb_against for 2006 

Now, we have a dictionary of dataframes, each of which corresponds to the games in a season.  In order to perform modelling, we should merge these back together.  Something to note, is our models will be trained to predict game outcomes based on each teams' cumulative stats going into the game.  It doesn't really make sense to use these stats early on in the season though.  For example, if we're going to use a team's average points per game as a feature, our average should be based on a decent number of games.  We will therefore set a cutoff date for each season, for which we will only take games which happened after.  This cutoff date will be one month after the start of the season.

In [9]:
import warnings
from datetime import datetime

warnings.filterwarnings('ignore')

# loop through the different dataframes
for season in games_by_season:
    
    # setup a column which represents the cutoff date for this season
    games_by_season[season]['cutoff'] = start_dates[str(season)]
    games_by_season[season]['cutoff'] = games_by_season[season]['cutoff'].astype('datetime64')
    games_by_season[season]['cutoff'] = games_by_season[season]['cutoff'] + pd.DateOffset(months=1)
    
    # filter for the games which occurred after this cutoff date
    games_by_season[season] = games_by_season[season].loc[games_by_season[season]['date'] >= \
                                                         games_by_season[season]['cutoff'], :]
    games_by_season[season] = games_by_season[season].reset_index().drop('index', axis=1)

Now we are ready to re-merge our dataframes for modelling.

In [10]:
# we need to initialize the first df
new_df = games_by_season[2003]

for season in games_by_season:
    
    # this df has already been loaded in the initialization step (see above)
    if season == 2003:
        continue
        
    # merge the df for this season with the fully merged version
    new_df = pd.concat([new_df, games_by_season[season]])
    new_df = new_df.reset_index().drop('index', axis=1)
    
# drop the cutoff column -- we no longer have need of it
new_df.drop('cutoff', axis=1, inplace=True)

Let's take a look at what we have!

In [11]:
new_df.head()

Unnamed: 0,date,season,home,away,is_home_win,winner,home_avg_pts_for,away_avg_pts_for,home_avg_pts_against,away_avg_pts_against,home_avg_reb_for,away_avg_reb_for,home_avg_reb_against,away_avg_reb_against,home_avg_ast_for,away_avg_ast_for,home_avg_ast_against,away_avg_ast_against,home_win_pct,away_win_pct
0,2003-11-28,2003,LAL,SAS,1,LAL,103.4,91.2,94.9,84.5,43.1,46.8,42.7,45.0,24.7,21.5,24.7,20.6,0.8,0.6
1,2003-11-28,2003,PHX,GSW,0,GSW,93.8,91.2,93.2,89.9,42.2,44.2,42.6,45.2,20.1,21.2,20.9,22.3,0.5,0.462
2,2003-11-28,2003,IND,PHI,1,IND,88.3,90.2,80.7,90.2,42.2,39.1,41.6,40.6,20.6,20.1,20.5,21.8,0.867,0.5
3,2003-11-28,2003,ATL,MIA,1,ATL,90.6,86.2,96.2,93.6,43.6,42.4,45.2,40.9,19.4,16.8,20.8,19.9,0.312,0.267
4,2003-11-28,2003,DET,CLE,1,DET,93.7,89.3,89.2,93.7,43.6,47.3,43.0,45.5,19.4,22.3,20.4,22.3,0.625,0.267


In [12]:
new_df.shape

(16121, 20)

It's looking pretty good!  For modelling though, we don't really care about some of these columns, so let's fix that.

In [13]:
cols = new_df.columns.tolist()
cols = cols[6:] + [cols[4]]

new_df = new_df[cols]
new_df.head()

Unnamed: 0,home_avg_pts_for,away_avg_pts_for,home_avg_pts_against,away_avg_pts_against,home_avg_reb_for,away_avg_reb_for,home_avg_reb_against,away_avg_reb_against,home_avg_ast_for,away_avg_ast_for,home_avg_ast_against,away_avg_ast_against,home_win_pct,away_win_pct,is_home_win
0,103.4,91.2,94.9,84.5,43.1,46.8,42.7,45.0,24.7,21.5,24.7,20.6,0.8,0.6,1
1,93.8,91.2,93.2,89.9,42.2,44.2,42.6,45.2,20.1,21.2,20.9,22.3,0.5,0.462,0
2,88.3,90.2,80.7,90.2,42.2,39.1,41.6,40.6,20.6,20.1,20.5,21.8,0.867,0.5,1
3,90.6,86.2,96.2,93.6,43.6,42.4,45.2,40.9,19.4,16.8,20.8,19.9,0.312,0.267,1
4,93.7,89.3,89.2,93.7,43.6,47.3,43.0,45.5,19.4,22.3,20.4,22.3,0.625,0.267,1


Great, now we have what looks to be a pretty good dataset for modelling.  Let's save it, and load it back in in another script.

In [14]:
new_df.to_csv('Data/games_modelling2.csv', index=False)