In [1]:
import pandas as pd
import numpy as np
import joblib

In [2]:
# Read the 2018-19 game log and the team reference table from the Data
# directory, then preview the first few games.
df = pd.read_csv('Data/games_2018-19.csv')
teams = pd.read_csv('Data/teams.csv')
df.head()

Unnamed: 0,GAME_DATE_EST,GAME_ID,GAME_STATUS_TEXT,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,...,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS,date,year,month,day
0,2018-10-16,21800002,Final,1610612744,1610612760,2018,1610612744,108.0,0.442,0.944,...,0.363,0.649,0.27,21.0,45.0,1,2018-10-16,2018,10,16
1,2018-10-16,21800001,Final,1610612738,1610612755,2018,1610612738,105.0,0.433,0.714,...,0.391,0.609,0.192,18.0,47.0,1,2018-10-16,2018,10,16
2,2018-10-17,21800012,Final,1610612746,1610612743,2018,1610612746,98.0,0.398,0.833,...,0.379,0.786,0.333,20.0,56.0,0,2018-10-17,2018,10,17
3,2018-10-17,21800003,Final,1610612766,1610612749,2018,1610612766,112.0,0.446,0.636,...,0.494,0.75,0.412,26.0,57.0,0,2018-10-17,2018,10,17
4,2018-10-17,21800004,Final,1610612765,1610612751,2018,1610612765,103.0,0.424,0.864,...,0.488,0.682,0.185,28.0,39.0,1,2018-10-17,2018,10,17


In [3]:
# Load the pre-computed lookup tables. Further down, `luts` is indexed by
# 'avg_pts_for' and 'avg_pts_against', and each entry is merged onto the games
# on the 'date' column.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load Data/luts.pkl if it comes from a trusted source.
luts = joblib.load('Data/luts.pkl')

In [4]:
# Quick size check of the loaded game log: (rows, columns).
df.shape

(1230, 25)

In [5]:
# Parse the date column into pandas datetimes. pd.to_datetime is used instead
# of .astype('datetime64') because the unit-less 'datetime64' cast is rejected
# by pandas 2.x.
df['date'] = pd.to_datetime(df['date'])

# Build a team ID -> abbreviation dictionary from the teams dataset.
# NOTE: this rebinds `teams` from a DataFrame to a dict, so this cell cannot be
# re-run on its own without first re-running the cell that reads
# Data/teams.csv.
teams = teams.set_index('TEAM_ID')['ABBREVIATION'].to_dict()

# The same mapping in reverse (abbreviation -> team ID).
teams_rev = {abbreviation: team_id for team_id, abbreviation in teams.items()}

In [6]:
# Keep only the matchup and outcome columns, give them short names, and
# translate the numeric team IDs into abbreviations through the `teams` dict.
# The steps are chained so that each one returns a new frame: renaming in place
# and assigning columns on a frame that was sliced out of another frame
# triggers pandas' SettingWithCopyWarning.
df = (
    df[['date', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID', 'HOME_TEAM_WINS']]
    .rename(columns={'HOME_TEAM_ID': 'home_id',
                     'VISITOR_TEAM_ID': 'away_id',
                     'HOME_TEAM_WINS': 'is_home_win'})
    .assign(home=lambda games: games['home_id'].map(teams),
            away=lambda games: games['away_id'].map(teams))
    .loc[:, ['date', 'home', 'away', 'is_home_win']]
)
df.head()

Unnamed: 0,date,home,away,is_home_win
0,2018-10-16,GSW,OKC,1
1,2018-10-16,BOS,PHI,1
2,2018-10-17,LAC,DEN,0
3,2018-10-17,CHA,MIL,0
4,2018-10-17,DET,BKN,1


Let's get the average points per game of each team for each game:

In [7]:
# Attach the 'avg_pts_for' lookup table to every game by date. After the merge
# the frame carries one extra column per team abbreviation, holding that
# team's value for the row's date.
# validate='many_to_one' makes the merge fail loudly if the lookup table has
# repeated dates, which would otherwise silently duplicate games.
df = pd.merge(df, luts['avg_pts_for'], how='left', on='date', validate='many_to_one')

# For each game, pick the value from the column named after that game's home
# (resp. away) team. The lookup is done positionally with NumPy rather than
# with a per-row df.loc loop, so it is fast and does not depend on the index
# labels being 0..n-1.
team_cols = pd.Index(pd.concat([df['home'], df['away']]).unique())
team_values = df[team_cols].to_numpy(dtype=float)
row_pos = np.arange(len(df))

df['home_avg_pts_for'] = team_values[row_pos, team_cols.get_indexer(df['home'])]
df['away_avg_pts_for'] = team_values[row_pos, team_cols.get_indexer(df['away'])]

# Drop the per-team helper columns again, keeping only the features.
df = df[['date', 'home', 'away', 'home_avg_pts_for', 'away_avg_pts_for', 'is_home_win']]
df.tail()

Unnamed: 0,date,home,away,home_avg_pts_for,away_avg_pts_for,is_home_win
1225,2019-04-10,PHI,CHI,115.2,104.9,1
1226,2019-04-10,NYK,DET,104.6,107.0,0
1227,2019-04-10,CHA,ORL,110.7,107.3,0
1228,2019-04-10,BKN,MIA,112.2,105.7,1
1229,2019-04-10,ATL,IND,113.3,108.0,0


Now let's do the same thing with the average points allowed (points against) per game:

In [8]:
# Attach the 'avg_pts_against' lookup table to every game by date. After the
# merge the frame carries one extra column per team abbreviation, holding that
# team's value for the row's date.
# validate='many_to_one' makes the merge fail loudly if the lookup table has
# repeated dates, which would otherwise silently duplicate games.
df = pd.merge(df, luts['avg_pts_against'], how='left', on='date', validate='many_to_one')

# For each game, pick the value from the column named after that game's home
# (resp. away) team. The lookup is done positionally with NumPy rather than
# with a per-row df.loc loop, so it is fast and does not depend on the index
# labels being 0..n-1.
team_cols = pd.Index(pd.concat([df['home'], df['away']]).unique())
team_values = df[team_cols].to_numpy(dtype=float)
row_pos = np.arange(len(df))

df['home_avg_pts_against'] = team_values[row_pos, team_cols.get_indexer(df['home'])]
df['away_avg_pts_against'] = team_values[row_pos, team_cols.get_indexer(df['away'])]

# Drop the per-team helper columns again, keeping only the features.
df = df[['date', 'home', 'away', 'home_avg_pts_for', 'away_avg_pts_for', 'home_avg_pts_against',
         'away_avg_pts_against', 'is_home_win']]

df.tail()

Unnamed: 0,date,home,away,home_avg_pts_for,away_avg_pts_for,home_avg_pts_against,away_avg_pts_against,is_home_win
1225,2019-04-10,PHI,CHI,115.2,104.9,112.5,113.4,1
1226,2019-04-10,NYK,DET,104.6,107.0,113.8,107.3,0
1227,2019-04-10,CHA,ORL,110.7,107.3,111.8,106.6,0
1228,2019-04-10,BKN,MIA,112.2,105.7,112.3,105.9,1
1229,2019-04-10,ATL,IND,113.3,108.0,119.4,104.7,0
