In [1]:
import pandas as pd
import numpy as np
import pickle

### Import data

In [2]:
# Load the per-team game logs; the first CSV column is used as the index.
nba = pd.read_csv('../data/gamelogs_advanced.csv', index_col=0)
# Parse game dates into datetimes. `infer_datetime_format` is deprecated since
# pandas 2.0 (format inference is the default), so it is no longer passed.
nba['date_game'] = pd.to_datetime(nba['date_game'])

### Encode win/loss as numeric (1 = win, 0 = otherwise)

In [3]:
# Binary outcome flag: 1 where the result is 'W', 0 for anything else.
nba['w_l'] = (nba['game_result'] == 'W').astype(int)

### Sort by season and team and create expanding mean columns

In [4]:
# Order every team's season chronologically. The expanding means computed from
# this frame average "all earlier rows" of each (season, team) group, so the row
# order inside a group must follow the game date rather than whatever order the
# CSV happened to be in.
teams = nba.sort_values(['season', 'team', 'date_game'])

In [5]:
# Identifier / context columns that must not be averaged.
unwanted_columns = [
    'team', 'season', 'game_season', 'date_game',
    'game_location', 'opp_id', 'game_result',
]

# Every remaining column, kept in the frame's own column order, gets a running mean.
columns_to_mean = [name for name in nba.columns if name not in unwanted_columns]

In [6]:
def mean_expansion(df, column_list, group_cols=('season', 'team')):
    """Add a leak-free running average column for each entry of ``column_list``.

    For every ``column`` a new ``column + '_mean'`` column is written into
    ``df``. Within each ``group_cols`` group, the value on a row is the
    expanding mean of all *earlier* rows of that group: ``shift(1)`` excludes
    the row's own value, so the first row of every group is NaN.

    Rows are averaged in their existing order, so ``df`` should already be
    sorted chronologically within each group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the grouping columns and the columns to average.
        It is modified in place.
    column_list : iterable of str
        Names of the columns to average. An empty iterable leaves ``df``
        untouched.
    group_cols : str or sequence of str, default ('season', 'team')
        Column(s) that define one independent running-average series.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``*_mean`` columns added.
    """
    keys = [group_cols] if isinstance(group_cols, str) else list(group_cols)
    # The grouping does not depend on the column being averaged: build it once.
    grouped = df.groupby(keys)
    for column in column_list:
        df[column + '_mean'] = grouped[column].transform(
            lambda values: values.expanding().mean().shift(1)
        )
    return df

# Add a '<column>_mean' running average for every column in columns_to_mean.
# mean_expansion writes into `teams` and returns it, so `means` is that same object.
means = mean_expansion(teams, columns_to_mean)

In [7]:
filename = '../data/cleaned_stats.sav'
# The context manager guarantees the handle is flushed and closed, even if
# pickling raises part-way through.
with open(filename, 'wb') as stats_file:
    pickle.dump(means, stats_file)

### Make new dataframe consisting of only columns to be merged with schedule

In [8]:
cols = list(means.columns.values)
# Select by name instead of by position so the selection does not silently
# shift if the CSV gains, loses or reorders columns.
#   - 'team' and 'date_game' are the merge keys used against the schedule,
#   - 'game_season' and 'pts' are carried along as-is ('pts' feeds score_margin),
#   - every '*_mean' column is a running average added by mean_expansion.
# NOTE(review): assumes no raw input column already ends in '_mean' — confirm.
key_cols = ['team', 'game_season', 'date_game', 'pts']
new_cols = key_cols + [col for col in cols if col.endswith('_mean')]
new_means = means[new_cols].copy()

## Create League Schedule

In [9]:
# One row per team-game, ordered by date, with a fresh 0..n-1 index.
schedule_columns = ['date_game', 'team', 'season', 'game_location', 'opp_id']
schedule = (
    nba[schedule_columns]
    .sort_values(by='date_game')
    .reset_index(drop=True)
)

In [10]:
# '@' marks a row logged by the visiting team, so the opponent is the host.
played_away = schedule['game_location'] == '@'
schedule['home_team'] = np.where(played_away, schedule['opp_id'], schedule['team'])
schedule['away_team'] = np.where(played_away, schedule['team'], schedule['opp_id'])

In [11]:
# The per-team perspective columns are redundant once home/away are derived.
schedule = schedule.drop(columns=['team', 'game_location', 'opp_id'])

In [12]:
# Preview the schedule. Rows come in identical pairs here — presumably because
# each game is logged once by each of the two teams.
schedule.head()

Unnamed: 0,date_game,season,home_team,away_team
0,2007-10-30,2008,LAL,HOU
1,2007-10-30,2008,LAL,HOU
2,2007-10-30,2008,GSW,UTA
3,2007-10-30,2008,GSW,UTA
4,2007-10-30,2008,SAS,POR


In [13]:
# Keep the first occurrence of each (date, season, home, away) row so every
# game appears exactly once; the original index labels are preserved.
schedule_new = schedule.loc[~schedule.duplicated(keep='first')].copy()

In [14]:
# Check the de-duplicated schedule: the repeated rows are gone and the
# surviving rows keep their original index labels.
schedule_new.head()

Unnamed: 0,date_game,season,home_team,away_team
0,2007-10-30,2008,LAL,HOU
2,2007-10-30,2008,GSW,UTA
4,2007-10-30,2008,SAS,POR
6,2007-10-31,2008,ORL,MIL
7,2007-10-31,2008,MEM,SAS


In [15]:
filename = '../data/schedules.sav'
# The context manager guarantees the handle is flushed and closed, even if
# pickling raises part-way through.
with open(filename, 'wb') as schedule_file:
    pickle.dump(schedule_new, schedule_file)

### Merge dataframes for model construction

In [16]:
# Attach the home team's stats for each game (inner join on team + game date).
data = schedule_new.merge(
    new_means,
    how='inner',
    left_on=['home_team', 'date_game'],
    right_on=['team', 'date_game'],
)

In [17]:
# Prefix every column that came from new_means with 'home_'.
# Note: this also turns 'team' into a second 'home_team' column and
# 'date_game' into 'home_date_game'.
home_column_name_dict = {name: 'home_' + name for name in new_cols}
data = data.rename(columns=home_column_name_dict)

In [18]:
# Preview the home-side merge. The running means are NaN on a team's first
# game of a season because there are no earlier games to average.
data.head()

Unnamed: 0,home_date_game,season,home_team,away_team,home_team.1,home_game_season,home_pts,home_pts_mean,home_opp_pts_mean,home_fg_mean,...,home_blk_pct_mean,home_efg_pct_mean,home_tov_pct_mean,home_orb_pct_mean,home_ft_rate_mean,home_opp_efg_pct_mean,home_opp_tov_pct_mean,home_drb_pct_mean,home_opp_ft_rate_mean,home_w_l_mean
0,2007-10-30,2008,LAL,HOU,LAL,1,93,,,,...,,,,,,,,,,
1,2007-10-30,2008,GSW,UTA,GSW,1,96,,,,...,,,,,,,,,,
2,2007-10-30,2008,SAS,POR,SAS,1,106,,,,...,,,,,,,,,,
3,2007-10-31,2008,ORL,MIL,ORL,1,102,,,,...,,,,,,,,,,
4,2007-10-31,2008,MEM,SAS,MEM,1,101,,,,...,,,,,,,,,,


In [19]:
# Attach the away team's stats for each game (inner join on team + game date;
# the date key on the left carries the 'home_' prefix at this point).
data = data.merge(
    new_means,
    how='inner',
    left_on=['away_team', 'home_date_game'],
    right_on=['team', 'date_game'],
)

In [20]:
# Prefix the freshly merged new_means columns with 'away_'; columns already
# renamed to 'home_*' are not in the mapping and stay untouched.
away_column_name_dict = {name: 'away_' + name for name in new_cols}
data = data.rename(columns=away_column_name_dict)

In [21]:
# The game date picked up a 'home_' prefix during the home-side rename;
# restore its plain name now that both merges are done.
data=data.rename(columns = {'home_date_game':'date_game'})

In [25]:
# Preview the fully merged frame.
# NOTE(review): the saved output of this cell already shows 'score_margin',
# which is only created in a later cell — the cells were executed out of order,
# so a fresh Restart & Run All will not reproduce this output here.
data.head(15)

Unnamed: 0,date_game,season,home_team,away_team,home_game_season,home_pts,home_pts_mean,home_opp_pts_mean,home_fg_mean,home_fga_mean,...,away_efg_pct_mean,away_tov_pct_mean,away_orb_pct_mean,away_ft_rate_mean,away_opp_efg_pct_mean,away_opp_tov_pct_mean,away_drb_pct_mean,away_opp_ft_rate_mean,away_w_l_mean,score_margin
0,2007-10-30,2008,LAL,HOU,1,93,,,,,...,,,,,,,,,,-2
1,2007-10-30,2008,GSW,UTA,1,96,,,,,...,,,,,,,,,,-21
2,2007-10-30,2008,SAS,POR,1,106,,,,,...,,,,,,,,,,9
3,2007-10-31,2008,ORL,MIL,1,102,,,,,...,,,,,,,,,,19
4,2007-10-31,2008,MEM,SAS,1,101,,,,,...,0.506,7.5,27.3,0.207,0.538,15.8,77.8,0.167,1.0,-3
5,2007-10-31,2008,DEN,SEA,1,120,,,,,...,,,,,,,,,,17
6,2007-10-31,2008,TOR,PHI,1,106,,,,,...,,,,,,,,,,9
7,2007-10-31,2008,NOH,SAC,1,104,,,,,...,,,,,,,,,,14
8,2007-10-31,2008,NJN,CHI,1,112,,,,,...,,,,,,,,,,9
9,2007-10-31,2008,IND,WAS,1,119,,,,,...,,,,,,,,,,9


In [23]:
# The two prefix renames produce repeated column labels; keep only the first
# occurrence of each label.
repeated_label = data.columns.duplicated()
data = data.loc[:, ~repeated_label]

In [24]:
# Home points minus away points (positive when the home team outscored the visitor).
data['score_margin'] = data['home_pts'].sub(data['away_pts'])

In [28]:
# Inspect the last rows of the merged frame, where the late-season running
# means are populated for both the home and the away side.
data.tail(16)

Unnamed: 0,date_game,season,home_team,away_team,home_game_season,home_pts,home_pts_mean,home_opp_pts_mean,home_fg_mean,home_fga_mean,...,away_efg_pct_mean,away_tov_pct_mean,away_orb_pct_mean,away_ft_rate_mean,away_opp_efg_pct_mean,away_opp_tov_pct_mean,away_drb_pct_mean,away_opp_ft_rate_mean,away_w_l_mean,score_margin
12043,2017-04-11,2017,SAC,PHO,81,129,102.575,106.6125,37.775,82.125,...,0.496086,13.040741,25.966667,0.235222,0.526222,12.795062,76.423457,0.265481,0.296296,25
12044,2017-04-11,2017,MIN,OKC,81,98,105.5125,106.5625,39.3625,84.3125,...,0.501437,12.8825,27.74875,0.224125,0.513938,12.05125,79.16875,0.220113,0.575,-2
12045,2017-04-12,2017,ORL,DET,82,113,100.925926,107.62963,38.234568,86.987654,...,0.492889,10.546914,24.009877,0.158704,0.518099,11.587654,81.311111,0.199259,0.45679,4
12046,2017-04-12,2017,HOU,MIN,82,123,115.246914,109.469136,40.246914,87.098765,...,0.510679,12.322222,27.165432,0.233037,0.535778,12.91358,76.232099,0.21437,0.382716,5
12047,2017-04-12,2017,GSW,LAL,82,109,115.975309,104.382716,43.098765,87.037037,...,0.502099,12.960494,24.816049,0.197938,0.543136,12.564198,76.161728,0.216333,0.320988,15
12048,2017-04-12,2017,IND,ATL,82,104,105.111111,105.555556,39.283951,84.555556,...,0.505469,13.753086,23.551852,0.217173,0.507531,13.338272,76.306173,0.179815,0.530864,18
12049,2017-04-12,2017,OKC,DEN,82,105,106.617284,105.777778,39.432099,87.259259,...,0.532099,12.812346,27.160494,0.216519,0.533963,10.491358,78.876543,0.196605,0.481481,-6
12050,2017-04-12,2017,MEM,DAL,82,93,100.567901,99.987654,36.432099,83.580247,...,0.505605,11.018519,17.882716,0.180593,0.531704,14.079012,78.011111,0.22042,0.395062,-7
12051,2017-04-12,2017,CLE,TOR,82,83,110.679012,107.271605,40.08642,85.024691,...,0.518025,11.077778,24.702469,0.237309,0.509136,12.996296,76.403704,0.225074,0.617284,-15
12052,2017-04-12,2017,UTA,SAS,82,101,100.703704,96.765432,36.962963,79.481481,...,0.52579,12.104938,23.491358,0.214062,0.491506,12.922222,77.785185,0.19621,0.753086,4


In [83]:
filename = '../data/processed_data.sav'
# The context manager guarantees the handle is flushed and closed, even if
# pickling raises part-way through.
with open(filename, 'wb') as processed_file:
    pickle.dump(data, processed_file)