In [296]:
import pandas as pd
import numpy as np
import pickle
from geopy.geocoders import Nominatim
from geopy.distance import vincenty

In [298]:
# Load the cleaned stats object that the rest of the notebook builds on.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted step of this project.
filename = '../data/cleaned_stats.sav'
with open(filename, 'rb') as stats_file:  # context manager closes the handle
    data = pickle.load(stats_file)

### Create new features that average performance over previous 5 games for features correlated with score_margin

In [299]:
def rolling_window_mean(df, column_list, width):
    """Add trailing rolling-mean columns computed per (season, team) group.

    For every name in ``column_list`` a new column ``<name>_last<width>`` is
    written to ``df``. Within each ``['season', 'team']`` group the value is
    the mean over a window of ``width`` rows, shifted down by one row so that
    a row only sees rows that come before it in the group. The first
    ``width`` rows of each group are therefore NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``season``, ``team`` and every column in ``column_list``.
        Rows are used in their existing order within each group.
    column_list : iterable of str
        Columns to average.
    width : int
        Rolling window size; also used in the new column's suffix
        (``width=5`` yields ``<name>_last5``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place with the new columns.
    """
    for column in column_list:
        # Derive the suffix from ``width`` so the column name always matches
        # the window that was actually used.
        new_column = '{}_last{}'.format(column, width)
        df[new_column] = df.groupby(['season', 'team'])[column].transform(
            lambda games: games.rolling(window=width).mean().shift(1)
        )
    return df

In [300]:
# Columns that get a rolling-window average feature.
columns_to_window = [
    'off_rtg_mean',
    'w_l_mean',
]

In [301]:
# rolling_window_mean writes the new columns onto `data` itself and returns it,
# so `feature_data` and `data` refer to the same frame after this call.
feature_data = rolling_window_mean(
    df=data,
    column_list=columns_to_window,
    width=5,
)

In [302]:
# Preview the first five rows, including the newly added rolling columns.
feature_data.head(n=5)

Unnamed: 0,team,season,game_season,date_game,game_location,opp_id,game_result,pts,opp_pts,fg,...,tov_pct_mean,orb_pct_mean,ft_rate_mean,opp_efg_pct_mean,opp_tov_pct_mean,drb_pct_mean,opp_ft_rate_mean,w_l_mean,off_rtg_mean_last5,w_l_mean_last5
0,ATL,2008,1,2007-11-02,,DAL,W,101,94,36,...,,,,,,,,,,
1,ATL,2008,2,2007-11-04,@,DET,L,91,92,33,...,13.9,31.1,0.316,0.463,9.9,84.4,0.25,1.0,,
2,ATL,2008,3,2007-11-06,@,NJN,L,82,87,26,...,16.2,32.2,0.304,0.472,11.65,76.7,0.2115,0.5,,
3,ATL,2008,4,2007-11-07,,PHO,W,105,96,41,...,17.066667,27.966667,0.315333,0.468667,13.233333,74.2,0.205,0.333333,,
4,ATL,2008,5,2007-11-09,@,BOS,L,83,106,30,...,16.25,30.025,0.28925,0.46925,13.175,75.55,0.19725,0.5,,


### Create feature to show each team's days of rest coming into the game

In [303]:
# Gap, in days, between a row's date_game and the previous row's date_game
# within the same (season, team) group. The first row of each group has no
# predecessor and is NaN.
feature_data['days_rest'] = (
    feature_data
    .groupby(['season', 'team'])['date_game']
    .diff()
    / np.timedelta64(1, 'D')
)

### Create feature to show miles traveled by the away team (distance between the two teams' home cities)

In [304]:
def _swap_team(teams, old, new):
    """Return a copy of ``teams`` with ``old`` replaced by ``new`` at the same position."""
    return [new if team == old else team for team in teams]


# Team abbreviations for the 2008 season; later seasons differ from the
# previous list by exactly one abbreviation each, so they are derived from it
# rather than repeated in full.
teams_08 = ['ATL', 'NJN', 'BOS', 'CHA', 'CHI', 'CLE', 'DAL', 'DEN', 'DET', 'GSW', 'HOU', 'IND',
            'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN', 'NOH', 'NYK', 'SEA', 'ORL', 'PHI', 'PHO',
            'POR', 'SAC', 'SAS', 'TOR', 'UTA', 'WAS']

teams_09_12 = _swap_team(teams_08, 'SEA', 'OKC')

teams_13 = _swap_team(teams_09_12, 'NJN', 'BRK')

teams_14 = _swap_team(teams_13, 'NOH', 'NOP')

teams_15_17 = _swap_team(teams_14, 'CHA', 'CHO')

# Every abbreviation used in any season. Sorted so the order is the same on
# every kernel run (plain set iteration order for strings is not stable
# across interpreter sessions).
all_teams = sorted(set(teams_08 + teams_09_12 + teams_13 + teams_14 + teams_15_17))

In [305]:
# Team abbreviation -> "City, State/Province" string handed to the geocoder.
# Abbreviations that share a city (LAL/LAC, NOH/NOP, CHA/CHO) map to the same
# string on purpose.
cities = {'NJN': 'Newark, New Jersey', 'NOP': 'New Orleans, Louisiana', 'NYK': 'New York City, New York',
          'CLE': 'Cleveland, Ohio', 'MIA': 'Miami, Florida', 'DEN': 'Denver, Colorado', 'HOU': 'Houston, Texas',
          'LAL': 'Los Angeles, California', 'SAS': 'San Antonio, Texas', 'DAL': 'Dallas, Texas',
          'BOS': 'Boston, Massachusetts', 'NOH': 'New Orleans, Louisiana', 'ATL': 'Atlanta, Georgia',
          'MIL': 'Milwaukee, Wisconsin', 'POR': 'Portland, Oregon', 'CHO': 'Charlotte, North Carolina',
          'MIN': 'Minneapolis, Minnesota',
          # The Warriors' home arena was in Oakland throughout the 2008-2017
          # seasons this notebook covers; they moved to San Francisco later.
          'GSW': 'Oakland, California',
          'LAC': 'Los Angeles, California',
          'UTA': 'Salt Lake City, Utah', 'OKC': 'Oklahoma City, Oklahoma', 'BRK': 'Brooklyn, New York',
          'CHI': 'Chicago, Illinois', 'SAC': 'Sacramento, California',
          # Comma added so the query has the same "City, State" shape as the rest.
          'MEM': 'Memphis, Tennessee',
          'PHI': 'Philadelphia, Pennsylvania', 'WAS': 'Washington, D.C.', 'ORL': 'Orlando, Florida',
          'IND': 'Indianapolis, Indiana', 'CHA': 'Charlotte, North Carolina', 'PHO': 'Phoenix, Arizona',
          'DET': 'Detroit, Michigan', 'SEA': 'Seattle, Washington', 'TOR': 'Toronto, Ontario'}

In [306]:
# Nominatim expects callers to identify themselves; newer geopy releases
# refuse to build the geocoder without an explicit user agent.
geolocator = Nominatim(user_agent='nba_feature_engineering')

# Replace each city name in `cities` with its (latitude, longitude) tuple.
# Several abbreviations share a city, so look each distinct name up only once.
coords_by_city = {}
for abbrv, city in list(cities.items()):
    if not isinstance(city, str):
        # Already converted by an earlier run of this cell; leave it alone so
        # the cell can be re-executed safely.
        continue
    if city not in coords_by_city:
        location = geolocator.geocode(city)
        if location is None:
            # geocode() returns None when nothing matches; fail with the
            # offending query instead of an AttributeError on `.latitude`.
            raise ValueError('Could not geocode {!r} for team {}'.format(city, abbrv))
        coords_by_city[city] = (location.latitude, location.longitude)
    cities[abbrv] = coords_by_city[city]

In [307]:
# Attach each row's team coordinates by looking the `team` value up in `cities`
# (which by this point holds (latitude, longitude) tuples). Teams missing from
# the mapping get NaN.
feature_data['lat_long']=feature_data['team'].map(cities)

### Merge with Schedules

In [308]:
# Keep only a positional subset of the columns: the first column, the third
# and fourth, the eighth, and everything from position 58 onward.
# NOTE(review): the selection is by position, so it depends on the exact column
# order of the loaded frame and on this cell being run only once.
column_names = feature_data.columns.tolist()
cols = [column_names[0], *column_names[2:4], column_names[7], *column_names[58:]]
feature_data = feature_data[cols].copy()

In [309]:
# Load the pickled schedules object that the features are merged onto below.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted step of this project.
filename = '../data/schedules.sav'
with open(filename, 'rb') as schedules_file:  # context manager closes the handle
    schedules = pickle.load(schedules_file)

In [310]:
# Every feature column gets a 'home_' prefix once it is attached as the
# home side's data.
home_column_name_dict = {name: 'home_' + name for name in cols}

# Match each scheduled game to the home team's feature row for that date
# (default inner join), then apply the prefix.
data = (
    schedules
    .merge(
        feature_data,
        left_on=['home_team', 'date_game'],
        right_on=['team', 'date_game'],
    )
    .rename(columns=home_column_name_dict)
)

In [311]:
# Every feature column gets an 'away_' prefix once it is attached as the
# away side's data.
away_column_name_dict = {name: 'away_' + name for name in cols}

# Match each game to the away team's feature row for the same date, apply the
# prefix, and finally restore the plain 'date_game' name for the game date.
data = (
    data
    .merge(
        feature_data,
        left_on=['away_team', 'home_date_game'],
        right_on=['team', 'date_game'],
    )
    .rename(columns=away_column_name_dict)
    .rename(columns={'home_date_game': 'date_game'})
)

In [312]:
# Where a column label occurs more than once, keep only its first occurrence.
data = data.loc[:, ~data.columns.duplicated(keep='first')]

In [313]:
# Home points minus away points (positive when the home side scored more).
data['score_margin'] = data['home_pts'].sub(data['away_pts'])

In [314]:
def calc_distance(df):
    """Return the distance in miles between a row's home and away coordinates.

    ``df`` is a single row (as passed by ``DataFrame.apply(..., axis=1)``) with
    ``home_lat_long`` and ``away_lat_long`` entries; both are handed to
    ``vincenty`` and the ``.miles`` value of the result is returned.
    """
    home_coords = df['home_lat_long']
    away_coords = df['away_lat_long']
    return vincenty(home_coords, away_coords).miles


# Evaluate the distance row by row.
data['distance_traveled'] = data.apply(calc_distance, axis='columns')

In [315]:
# Persist the merged feature table for the next stage.
filename = '../data/new_features.sav'
with open(filename, 'wb') as features_file:
    # The context manager flushes and closes the file even if dump() raises,
    # so a partially written handle is never left open.
    pickle.dump(data, features_file)