This process creates a data frame of cumulative, season-to-date statistics for each team. That is, the statistics attached to a game are the team's averages over all of its earlier games in that season. For example, for the Atlanta Braves in the 2016 season, each game's row represents how the team had performed up until that game: when predicting the outcome of the 40th game, we use the average of all statistics over the first 39 games.

In [74]:
import pandas as pd
import numpy as np
# Let pandas print up to 1000 rows and 1000 columns before truncating
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 1000)
# used for sorting
from operator import itemgetter

In [75]:
# Load the play-by-play export and convert the Date column to datetimes
pbp_csv_path = 'data/pbp_data_final.csv'
df = pd.read_csv(pbp_csv_path)
df['Date'] = pd.to_datetime(df['Date'])

  interactivity=interactivity, compiler=compiler, result=result)


In [76]:
# Distinct team codes (taken from the home side) and distinct season years
team_list = df['home_team'].unique()
year_list = df['Date'].dt.year.unique()

These are columns that do not have any impact on the game in a way that can be useful for prediction.

In [77]:
# Columns this notebook treats as not useful for prediction
drop_cols_before_modeling = [
    'Unnamed: 0',
    'day_of_week',
    'away_league',
    'away_team_game_number',
    'home_league',
    'home_team_game_number',
    'day_or_night',
    'park_id',
    'attendance',
    'time_of_game',
    'away_line_scores',
    'home_line_scores',
    'year',
    'id',
]
df.drop(labels=drop_cols_before_modeling, axis='columns', inplace=True)

# Also drop every column in the label slice 'hb_ump_id' .. 'acquisition_info'
# (both endpoints included, as with any .loc label slice)
ump_through_acquisition = df.loc[:, 'hb_ump_id':'acquisition_info'].columns
df.drop(labels=ump_through_acquisition, axis='columns', inplace=True)

Next, I must remove every statistic that cannot be meaningfully averaged into a cumulative, season-to-date figure.

In [78]:
# Fields excluded from the running averages
drop_for_cummean = [
    'number_of_outs',
    'target',
    'away_won_last_game',
    'home_won_last_game',
]
df.drop(labels=drop_for_cummean, axis='columns', inplace=True)

In [79]:
# Add side-prefixed copies of Date and game_in_series. Because their names
# contain 'home'/'away', they travel with each side's stats when the frame is
# later filtered by regex, which lets games be bucketed by year/team and
# re-attached to the right game afterwards.
helper_columns = [
    ('home_date', 'Date'),
    ('away_date', 'Date'),
    ('home_game_in_series', 'game_in_series'),
    ('away_game_in_series', 'game_in_series'),
]
for helper_name, source_name in helper_columns:
    df[helper_name] = df[source_name]

In [80]:
# stat_dict[year][team]  -> raw per-game stat arrays for that team-season
# final_dict[year][team] -> the aggregated (running-average) rows built later
# Both start with an empty list for every year/team combination.
stat_dict = {year: {team: [] for team in team_list} for year in year_list}
final_dict = {year: {team: [] for team in team_list} for year in year_list}

In [81]:
# Put every game into its year/team bucket.
# A team's away games count toward its season stats just like its home games,
# so both sides are collected. Afterwards stat_dict[year][team] holds one
# array per game that team played that year.
season_of_game = df['Date'].dt.year
for season in year_list:
    season_games = df[season_of_game == season]
    for club in team_list:
        bucket = stat_dict[season][club]
        # games hosted by this club: keep only the home-side columns
        hosted = season_games[season_games['home_team'] == club].filter(regex='home')
        bucket.extend(np.array(row) for row in hosted.values.tolist())
        # games played on the road: keep only the away-side columns
        visited = season_games[season_games['away_team'] == club].filter(regex='away')
        bucket.extend(np.array(row) for row in visited.values.tolist())
            

In [82]:
# Put each team-season's games in date order. The date is the second-to-last
# element of every game array. The sort is stable, so games sharing a date
# keep the order in which they were appended.
for year in year_list:
    for team in team_list:
        stat_dict[year][team].sort(key=itemgetter(-2))

In [83]:
# Move the two trailing fields (date, game_in_series) to the front of every
# game array so each row reads [date, game_in_series, team, stats...]
for year in year_list:
    for team in team_list:
        stat_dict[year][team] = [
            np.concatenate((game[-2:], game[:-2]))
            for game in stat_dict[year][team]
        ]

In [84]:
# Build season-to-date averages for every team.
# Each output row is [date, game_in_series, team] followed by the mean of
# every stat over the team's EARLIER games that season. The current game is
# not included, matching the notebook's goal of predicting game N from
# games 1..N-1.
for year in year_list:
    for team in team_list:
        # running per-stat totals; sized from the data on the first game
        # instead of assuming a fixed number of stat columns
        aggregate_stats = None
        for curr_game_number, game in enumerate(stat_dict[year][team]):
            header_info = game[0:3]
            contents = game[3:]
            if aggregate_stats is None:
                # First game of the season: there are no prior games to
                # average, so emit NaN explicitly rather than computing 0/0
                # (same values, but without the divide RuntimeWarning).
                aggregate_stats = np.zeros(len(contents))
                aggregate_stats_current_game = np.full(len(contents), np.nan)
            else:
                # curr_game_number equals the count of games already summed
                aggregate_stats_current_game = aggregate_stats / curr_game_number
            final_dict[year][team].append(
                np.concatenate((header_info, aggregate_stats_current_game))
            )
            # fold this game into the totals only after its row is emitted
            aggregate_stats = contents + aggregate_stats

  # Remove the CWD from sys.path while we load stuff.


In [85]:
# Empty placeholder; agg_df is rebuilt from agg_array a few cells below
agg_df = pd.DataFrame()

In [86]:
# Flatten final_dict into one list of aggregated game rows, walking years
# and then teams in their list order
agg_array = [
    game
    for year in year_list
    for team in team_list
    for game in final_dict[year][team]
]

In [87]:
# Turn the list of aggregated rows into a DataFrame, one row per team-game.
# No column names are supplied here; they are assigned before each merge below.
agg_df = pd.DataFrame(agg_array)

In [88]:
# Stat column names for each side. The [1:-2] slice removes the first matching
# column (the team name, per the original note) and the last two, which are
# the <side>_date and <side>_game_in_series helper columns added earlier.
away_data_column_names = df.filter(regex='away').columns[1:-2].tolist()
home_data_column_names = df.filter(regex='home').columns[1:-2].tolist()

In [89]:
# Skeleton of every game in the data set: just the identifying fields
# (date, position in series, away team, home team) that the aggregated
# stats get merged onto
basic_info_columns = ['Date', 'game_in_series', 'away_team', 'home_team']
game_basic_info_df = df.loc[:, basic_info_columns]

From this point, I need to identify a way to attach the aggregated statistics to each game that will not increase the number of games and will not create duplicate games.

In [90]:
# --- home side ---
# Label agg_df with the home_* names so its rows line up with the home team
# of each game, then left-join so every game in the skeleton is kept.
date_and_name_home = ['Date', 'game_in_series', 'home_team']
home_col_names = date_and_name_home + home_data_column_names
agg_df.columns = home_col_names
game_basic_info_df = pd.merge(
    left=game_basic_info_df,
    right=agg_df,
    how='left',
    on=['Date', 'home_team', 'game_in_series'],
)

# --- away side ---
# Same aggregated rows, relabelled with the away_* names and joined on the
# away team. Note that agg_df keeps the away labels after this cell.
date_and_name_away = ['Date', 'game_in_series', 'away_team']
away_col_names = date_and_name_away + away_data_column_names
agg_df.columns = away_col_names
game_basic_info_df = pd.merge(
    left=game_basic_info_df,
    right=agg_df,
    how='left',
    on=['Date', 'away_team', 'game_in_series'],
)


In [91]:
# The averaged outcome columns are renamed to *_win_loss. This frees the
# *_outcome names for the per-game outcomes merged in further down.
averaged_outcome_names = {
    'home_outcome': 'home_win_loss',
    'away_outcome': 'away_win_loss',
}
game_basic_info_df = game_basic_info_df.rename(columns=averaged_outcome_names)

In [92]:
# Show the merged frame as the cell output. A side's stats are NaN for that
# team's first game of a season, since there are no earlier games to average.
game_basic_info_df

Unnamed: 0,Date,game_in_series,away_team,home_team,home_team_score,home_at_bats,home_hits,home_doubles,home_triples,home_hrs,home_rbi,home_sh,home_sf,home_hbp,home_walk,home_int_walk,home_so,home_sb,home_cs,home_gidp,home_catch_interference,home_left_on_base,home_pitchers_used,home_pitch_earned_runs,home_team_earned_runs,home_pitch_wild_pitches,home_pitch_balks,home_def_putouts,home_def_assists,home_def_errors,home_def_passed_balls,home_def_double_plays,home_def_triple_plays,home_win_loss,home_OBP,home_AVG,home_singles,home_SLG,home_BABIP,home_ISO,home_PASO,home_total_bases,home_runs_created,home_wOBA,away_team_score,away_at_bats,away_hits,away_doubles,away_triples,away_hrs,away_rbi,away_sh,away_sf,away_hbp,away_walk,away_int_walk,away_so,away_sb,away_cs,away_gidp,away_catch_interference,away_left_on_base,away_pitchers_used,away_pitch_earned_runs,away_team_earned_runs,away_pitch_wild_pitches,away_pitch_balks,away_def_putouts,away_def_assists,away_def_errors,away_def_passed_balls,away_def_double_plays,away_def_triple_plays,away_win_loss,away_OBP,away_AVG,away_singles,away_SLG,away_BABIP,away_ISO,away_PASO,away_total_bases,away_runs_created,away_wOBA
0,2000-03-30,0,NYN,CHN,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2000-04-03,0,COL,ATL,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2000-04-03,0,MIL,CIN,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2000-04-03,0,SFN,MIA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2000-04-03,0,LAN,WAS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48563,2019-09-29,0,DET,CHA,4.393750,34.381250,8.99375,1.612500,0.125000,1.131250,4.200000,0.225000,0.200000,0.393750,2.356250,0.081250,9.606250,0.393750,0.175000,0.706250,0.006250,6.675000,4.318750,4.793750,4.787500,0.443750,0.031250,26.318750,9.437500,0.731250,0.081250,1.050000,0.00625,0.443750,0.308682,0.256513,6.125000,0.406390,0.324657,0.149877,4.136388,14.250000,4.699443,0.315905,3.618750,34.487500,8.287500,1.818750,0.256250,0.918750,3.456250,0.056250,0.262500,0.300000,2.425000,0.087500,9.918750,0.350000,0.125000,0.662500,0.000000,6.656250,4.587500,5.206250,5.193750,0.406250,0.043750,26.718750,9.031250,0.681250,0.100000,0.787500,0.000000,0.293750,0.286314,0.234660,5.293750,0.379340,0.300106,0.144680,3.861421,13.375000,4.207533,0.293725
48564,2019-09-29,0,MIN,KCA,4.260870,33.931677,8.36646,1.726708,0.242236,1.000000,4.037267,0.149068,0.254658,0.366460,2.813665,0.105590,8.683230,0.726708,0.242236,0.701863,0.018634,6.515528,4.211180,5.093168,5.093168,0.366460,0.031056,26.385093,9.329193,0.447205,0.062112,0.937888,0.00000,0.360248,0.302681,0.242305,5.397516,0.392268,0.295905,0.149962,4.453711,13.577640,4.407290,0.307124,5.807453,35.403727,9.571429,1.968944,0.142857,1.888199,5.602484,0.062112,0.254658,0.496894,3.260870,0.130435,8.236025,0.173913,0.130435,0.627329,0.018634,6.913043,4.223602,4.192547,4.192547,0.440994,0.031056,27.111801,8.677019,0.683230,0.093168,0.801242,0.012422,0.627329,0.332489,0.264235,5.571429,0.483901,0.292951,0.219666,5.045172,17.490683,6.157451,0.356767
48565,2019-09-29,0,OAK,SEA,4.662500,33.987500,8.06875,1.568750,0.175000,1.475000,4.487500,0.081250,0.225000,0.350000,3.606250,0.043750,9.775000,0.706250,0.287500,0.518750,0.012500,6.662500,4.300000,4.950000,4.943750,0.437500,0.025000,26.650000,9.193750,0.818750,0.037500,0.900000,0.00000,0.412500,0.304135,0.230968,4.850000,0.413480,0.279212,0.182512,3.802788,14.412500,4.813185,0.315740,5.231250,34.350000,8.575000,1.800000,0.143750,1.587500,4.950000,0.043750,0.225000,0.543750,3.568750,0.106250,8.262500,0.300000,0.131250,0.862500,0.000000,6.693750,4.356250,3.962500,3.962500,0.443750,0.025000,27.150000,9.087500,0.500000,0.118750,0.768750,0.000000,0.606250,0.320794,0.244032,5.043750,0.439480,0.276033,0.195448,4.839062,15.425000,5.235853,0.335078
48566,2019-09-29,0,NYA,TEX,4.993789,34.217391,8.47205,1.832298,0.149068,1.385093,4.726708,0.105590,0.267081,0.409938,3.291925,0.111801,9.763975,0.795031,0.236025,0.596273,0.012422,6.577640,4.099379,5.043478,5.012422,0.422360,0.018634,26.627329,8.465839,0.652174,0.068323,0.888199,0.00000,0.478261,0.310396,0.241520,5.105590,0.422123,0.295096,0.180603,3.919005,14.757764,4.942092,0.322698,5.850932,34.490683,9.254658,1.801242,0.105590,1.894410,5.608696,0.062112,0.204969,0.304348,3.527950,0.111801,8.844720,0.341615,0.136646,0.701863,0.006211,6.434783,4.354037,4.267081,4.267081,0.341615,0.031056,26.739130,8.316770,0.621118,0.080745,0.826087,0.000000,0.639752,0.333478,0.263322,5.453416,0.481912,0.302061,0.218590,4.379459,16.950311,5.995393,0.356419


In [99]:
# Per-game outcomes for both sides, plus the key columns needed to join them
# back onto the aggregated frame
outcome_key_columns = [
    'Date',
    'game_in_series',
    'home_team',
    'away_team',
    'home_outcome',
    'away_outcome',
]
adding_outcomes = df[outcome_key_columns]

In [101]:
# Attach each game's own outcome columns, matching on date, series position
# and both teams; the left join keeps every existing row
game_basic_info_df = pd.merge(
    left=game_basic_info_df,
    right=adding_outcomes,
    how='left',
    on=['Date', 'game_in_series', 'home_team', 'away_team'],
)

In [102]:
# Write the final frame to disk. to_csv writes the row index by default, so
# the file gets an unnamed leading index column.
game_basic_info_df.to_csv('data/aggregate_data.csv')