In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
pd.set_option("display.max_columns", 101)
pd.set_option("display.max_rows", 73)

In [3]:
team_df = pd.read_pickle('data/team_stats_scraped.pickle')
game_df = pd.read_pickle('data/games_scraped.pickle')

In [4]:
game_df.shape

(29399, 26)

In [5]:
mask = (game_df.team == 'chi') & (game_df.year == 2019)
game_df[mask]

Unnamed: 0,team,year,opp,week_num,game_day_of_week,game_date,game_outcome,team_record,game_location,pts_off,pts_def,first_down_off,yards_off,pass_yds_off,rush_yds_off,to_off,first_down_def,yards_def,pass_yds_def,rush_yds_def,to_def,game_time,overtime,exp_pts_off,exp_pts_def,exp_pts_st
4819,chi,2019,gnb,1,Thu,September 5,L,0-1,,3.0,10.0,16.0,254.0,208.0,46.0,1.0,13.0,213.0,166.0,47.0,,8.0,,-20.0,9.0,-2.0
4820,chi,2019,den,2,Sun,September 15,W,1-1,@,16.0,14.0,18.0,273.0,120.0,153.0,,27.0,372.0,282.0,90.0,1.0,4.0,,4.0,-4.0,5.0
4821,chi,2019,was,3,Mon,September 23,W,2-1,@,31.0,15.0,21.0,298.0,208.0,90.0,1.0,25.0,356.0,287.0,69.0,5.0,8.0,,9.0,11.0,-3.0
4822,chi,2019,min,4,Sun,September 29,W,3-1,,16.0,6.0,17.0,269.0,197.0,72.0,,15.0,222.0,182.0,40.0,2.0,4.0,,-2.0,10.0,1.0
4823,chi,2019,rai,5,Sun,October 6,L,3-2,@,21.0,24.0,15.0,236.0,194.0,42.0,2.0,25.0,398.0,229.0,169.0,2.0,1.0,,-3.0,-2.0,-0.0
4824,chi,2019,,6,,,,,,,,,,,,,,,,,,,,,,
4825,chi,2019,nor,7,Sun,October 20,L,3-3,,25.0,36.0,16.0,252.0,235.0,17.0,2.0,24.0,424.0,273.0,151.0,,4.0,,-12.0,-13.0,10.0
4826,chi,2019,sdg,8,Sun,October 27,L,3-4,,16.0,17.0,26.0,388.0,226.0,162.0,2.0,11.0,231.0,195.0,36.0,1.0,1.0,,0.0,5.0,-6.0
4827,chi,2019,phi,9,Sun,November 3,L,3-5,@,14.0,22.0,10.0,164.0,102.0,62.0,1.0,26.0,373.0,227.0,146.0,,1.0,,-4.0,-10.0,2.0
4828,chi,2019,det,10,Sun,November 10,W,4-5,,20.0,13.0,13.0,226.0,145.0,81.0,,21.0,357.0,259.0,98.0,1.0,1.0,,-0.0,6.0,-2.0


In [6]:
# clean the game_location column & apply change. def home_game(row):
def home_game(row):    
    if row == '@':
        return 0
    return 1

game_df['home'] = game_df.game_location.apply(home_game)
game_df.drop("game_location", axis=1, errors='ignore', inplace=True)

In [7]:
# add team_year identifier to each row (to easily groupby team/year)

team_years = game_df['team'] + '-' +  game_df['year'].astype(str) 
game_df.insert(loc=2, column='team_year', value=team_years)
game_df.sample()

Unnamed: 0,team,year,team_year,opp,week_num,game_day_of_week,game_date,game_outcome,team_record,pts_off,pts_def,first_down_off,yards_off,pass_yds_off,rush_yds_off,to_off,first_down_def,yards_def,pass_yds_def,rush_yds_def,to_def,game_time,overtime,exp_pts_off,exp_pts_def,exp_pts_st,home
10953,htx,2010,htx-2010,nyj,11,Sun,November 21,L,4-6,27.0,30.0,17.0,343.0,246.0,97.0,1.0,22.0,401.0,298.0,103.0,2.0,1.0,,2.0,-3.0,-1.0,0


Datetime formatting strings:

- ``%B`` month fullname
- ``%-d`` day of month without leading zero\*
    - \*note: the ``-`` only works on Linux/OS-X...for windows replace with '#'
- ``%Y`` full year

In [8]:
# convert game date to datetime & add new column to DF


f = r"%B %d-%Y"
full_game_date = game_df.game_date + '-' + game_df.year.astype(str)
full_game_date = pd.to_datetime(full_game_date, format=f)
game_df.insert(loc=2, column='date', value=full_game_date)
game_df.sample(3)

Unnamed: 0,team,year,date,team_year,opp,week_num,game_day_of_week,game_date,game_outcome,team_record,pts_off,pts_def,first_down_off,yards_off,pass_yds_off,rush_yds_off,to_off,first_down_def,yards_def,pass_yds_def,rush_yds_def,to_def,game_time,overtime,exp_pts_off,exp_pts_def,exp_pts_st,home
9549,det,2008,2008-12-14,det-2008,clt,15,Sun,December 14,L,0-14,21.0,31.0,18.0,323.0,233.0,90.0,2.0,28.0,421.0,315.0,106.0,2.0,1.0,,1.0,-25.0,13.0,0
14782,min,1972,1972-12-10,min-1972,gnb,13,Sun,December 10,L,7-6,7.0,23.0,9.0,144.0,90.0,54.0,4.0,18.0,270.0,56.0,214.0,2.0,2.0,,,,,1
9277,det,1993,1993-10-31,det-1993,min,9,Sun,October 31,W,6-2,30.0,27.0,13.0,338.0,260.0,78.0,2.0,25.0,378.0,262.0,116.0,3.0,8.0,,,,,0


In [9]:
# now drop the game_date col
game_df.drop("game_date", axis=1, inplace=True, errors='ignore')
game_df.sample(2) # to confirm

Unnamed: 0,team,year,date,team_year,opp,week_num,game_day_of_week,game_outcome,team_record,pts_off,pts_def,first_down_off,yards_off,pass_yds_off,rush_yds_off,to_off,first_down_def,yards_def,pass_yds_def,rush_yds_def,to_def,game_time,overtime,exp_pts_off,exp_pts_def,exp_pts_st,home
22359,pit,2002,2002-09-15,pit-2002,rai,2,Sun,L,0-2,17.0,30.0,14.0,273.0,201.0,72.0,5.0,27.0,464.0,369.0,95.0,2.0,8.0,,-17.0,1.0,2.0,1
18998,nyj,1986,1986-11-09,nyj-1986,atl,10,Sun,W,9-1,28.0,14.0,22.0,426.0,322.0,104.0,,23.0,393.0,305.0,88.0,2.0,1.0,,,,,0


In [10]:
# add decade column
decades = game_df['year'] // 10
game_df.insert(loc=5, column='decade', value=decades)
game_df[['date', 'decade']].sample(10)

Unnamed: 0,date,decade
24222,1991-10-20,199
4330,NaT,199
5613,2013-12-29,201
18434,2011-11-06,201
20832,1974-09-15,197
26079,1992-11-01,199
9757,2020-12-06,202
16785,1974-09-29,197
9760,2020-12-26,202
10486,2004-09-26,200


In [16]:
# add turnovers^2 col

game_df[['to2_off', 'to2_def']] = game_df[['to_off', 'to_def']].to_numpy() ** 2
game_df[['to_off', 'to2_off', 'to_def', 'to2_def']]

Unnamed: 0,to_off,to2_off,to_def,to2_def
0,5.0,25.0,6.0,36.0
1,5.0,25.0,6.0,36.0
2,6.0,36.0,3.0,9.0
3,6.0,36.0,,
4,3.0,9.0,2.0,4.0
...,...,...,...,...
29394,2.0,4.0,1.0,1.0
29395,4.0,16.0,2.0,4.0
29396,2.0,4.0,3.0,9.0
29397,,,,


In [17]:
# add unique game identifier that will be the same for both teams in the same game


def apply_game_id(row):
    teams = []
    teams.append(str(row['team']))
    teams.append(str(row['opp']))
    teams.sort()
    
    game_id = teams[0] + '-' +  teams[1] + '-' + str(row['date'])[:-8]
    return game_id

In [18]:
game_df['game_id'] = game_df.apply(apply_game_id, axis=1)
game_df

Unnamed: 0,team,year,date,team_year,opp,decade,week_num,game_day_of_week,game_outcome,team_record,pts_off,pts_def,first_down_off,yards_off,pass_yds_off,rush_yds_off,to_off,first_down_def,yards_def,pass_yds_def,rush_yds_def,to_def,game_time,overtime,exp_pts_off,exp_pts_def,exp_pts_st,home,to2_off,to2_def,game_id
0,crd,1960,1960-09-23,crd-1960,ram,196,1,Fri,W,1-0,43.0,21.0,20.0,430.0,281.0,149.0,5.0,16.0,238.0,188.0,50.0,6.0,,,,,,0,25.0,36.0,crd-ram-1960-09-23
1,crd,1960,1960-10-02,crd-1960,nyg,196,2,Sun,L,1-1,14.0,35.0,11.0,222.0,115.0,107.0,5.0,18.0,405.0,268.0,137.0,6.0,,,,,,1,25.0,36.0,crd-nyg-1960-10-02
2,crd,1960,1960-10-09,crd-1960,phi,196,3,Sun,L,1-2,27.0,31.0,20.0,364.0,145.0,219.0,6.0,14.0,293.0,199.0,94.0,3.0,,,,,,0,36.0,9.0,crd-phi-1960-10-09
3,crd,1960,1960-10-16,crd-1960,pit,196,4,Sun,L,1-3,14.0,27.0,20.0,295.0,154.0,141.0,6.0,12.0,237.0,146.0,91.0,,,,,,,0,36.0,,crd-pit-1960-10-16
4,crd,1960,1960-10-23,crd-1960,dal,196,5,Sun,W,2-3,12.0,10.0,17.0,306.0,80.0,226.0,3.0,9.0,175.0,124.0,51.0,2.0,,,,,,1,9.0,4.0,crd-dal-1960-10-23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29394,was,2020,2020-12-20,was-2020,sea,202,15,Sun,L,6-8,15.0,20.0,26.0,353.0,269.0,84.0,2.0,16.0,302.0,121.0,181.0,1.0,1.0,,1.0,-3.0,-4.0,1,4.0,1.0,sea-was-2020-12-20
29395,was,2020,2020-12-27,was-2020,car,202,16,Sun,L,6-9,13.0,20.0,20.0,386.0,278.0,108.0,4.0,19.0,280.0,167.0,113.0,2.0,4.0,,-5.0,5.0,-7.0,1,16.0,4.0,car-was-2020-12-27
29396,was,2020,2020-01-03,was-2020,phi,202,17,Sun,W,7-9,20.0,14.0,16.0,248.0,145.0,103.0,2.0,16.0,216.0,98.0,118.0,3.0,8.0,,-8.0,11.0,3.0,0,4.0,9.0,phi-was-2020-01-03
29397,was,2020,NaT,was-2020,,202,,,,,,,,,,,,,,,,,,,,,,1,,,nan-was-


In [19]:
cat_game_outcome = game_df.loc[:, ['game_outcome']]
cat_game_outcome.value_counts()

game_outcome
L               13773
W               13773
T                 254
dtype: int64

In [20]:
# convert game_outcome col to binary to avoid multicolinearity

game_df[['result_tie', 'result_win']] = pd.get_dummies(game_df.game_outcome, drop_first=True)
game_df.sample(10)

Unnamed: 0,team,year,date,team_year,opp,decade,week_num,game_day_of_week,game_outcome,team_record,pts_off,pts_def,first_down_off,yards_off,pass_yds_off,rush_yds_off,to_off,first_down_def,yards_def,pass_yds_def,rush_yds_def,to_def,game_time,overtime,exp_pts_off,exp_pts_def,exp_pts_st,home,to2_off,to2_def,game_id,result_tie,result_win
21904,pit,1976,1976-10-17,pit-1976,cin,197,6,Sun,W,2-4,23.0,6.0,16.0,253.0,52.0,201.0,2.0,11.0,171.0,96.0,75.0,3.0,1.0,,,,,1,4.0,9.0,cin-pit-1976-10-17,0,1
28747,was,1984,1984-10-07,was-1984,clt,198,6,Sun,W,4-2,35.0,7.0,25.0,446.0,268.0,178.0,2.0,12.0,186.0,115.0,71.0,2.0,1.0,,,,,0,4.0,4.0,clt-was-1984-10-07,0,1
19444,nyj,2011,2011-11-06,nyj-2011,buf,201,9,Sun,W,5-3,27.0,11.0,24.0,348.0,222.0,126.0,2.0,14.0,287.0,191.0,96.0,3.0,1.0,,5.0,13.0,0.0,0,4.0,9.0,buf-nyj-2011-11-06,0,1
28563,was,1972,1972-12-17,was-1972,buf,197,14,Sun,L,11-3,17.0,24.0,15.0,171.0,38.0,133.0,3.0,17.0,264.0,80.0,184.0,2.0,1.0,,,,,1,9.0,4.0,buf-was-1972-12-17,0,0
8816,det,1963,1963-09-22,det-1963,gnb,196,2,Sun,L,1-1,10.0,31.0,7.0,147.0,80.0,67.0,4.0,15.0,316.0,112.0,204.0,4.0,,,,,,0,16.0,16.0,det-gnb-1963-09-22,0,0
4242,chi,1986,1986-11-03,chi-1986,ram,198,9,Mon,L,7-2,17.0,20.0,14.0,269.0,123.0,146.0,3.0,14.0,278.0,137.0,141.0,3.0,9.0,,,,,1,9.0,9.0,chi-ram-1986-11-03,0,0
14959,min,1983,1983-10-30,min-1983,crd,198,9,Sun,L,6-3,31.0,41.0,23.0,348.0,289.0,59.0,3.0,24.0,420.0,238.0,182.0,2.0,1.0,,,,,0,9.0,4.0,crd-min-1983-10-30,0,0
15268,min,2000,2000-11-06,min-2000,gnb,200,10,Mon,L,7-2,20.0,26.0,22.0,407.0,250.0,157.0,5.0,19.0,298.0,217.0,81.0,,9.0,OT,0.0,-2.0,-3.0,0,25.0,,gnb-min-2000-11-06,0,0
28433,was,1963,1963-12-01,was-1963,clt,196,12,Sun,L,3-9,20.0,36.0,18.0,385.0,305.0,80.0,3.0,24.0,468.0,343.0,125.0,,,,,,,1,9.0,,clt-was-1963-12-01,0,0
1864,atl,2018,2018-12-16,atl-2018,crd,201,15,Sun,W,5-9,40.0,14.0,23.0,435.0,220.0,215.0,,18.0,253.0,193.0,60.0,3.0,1.0,,15.0,16.0,-7.0,1,,9.0,atl-crd-2018-12-16,0,1


In [21]:
mask = game_df.game_outcome == "T"
game_df[mask].head()

Unnamed: 0,team,year,date,team_year,opp,decade,week_num,game_day_of_week,game_outcome,team_record,pts_off,pts_def,first_down_off,yards_off,pass_yds_off,rush_yds_off,to_off,first_down_def,yards_def,pass_yds_def,rush_yds_def,to_def,game_time,overtime,exp_pts_off,exp_pts_def,exp_pts_st,home,to2_off,to2_def,game_id,result_tie,result_win
9,crd,1960,1960-11-27,crd-1960,cle,196,10,Sun,T,5-4-1,17.0,17.0,21.0,322.0,127.0,195.0,6.0,17.0,274.0,88.0,186.0,2.0,,,,,,1,36.0,4.0,cle-crd-1960-11-27,1,0
30,crd,1962,1962-10-14,crd-1962,was,196,5,Sun,T,1-3-1,17.0,17.0,16.0,309.0,251.0,58.0,2.0,15.0,321.0,259.0,62.0,1.0,,,,,,1,4.0,1.0,crd-was-1962-10-14,1,0
55,crd,1964,1964-09-20,crd-1964,cle,196,2,Sun,T,1-0-1,33.0,33.0,17.0,344.0,230.0,114.0,3.0,18.0,281.0,166.0,115.0,3.0,,,,,,0,9.0,9.0,cle-crd-1964-09-20,1,0
63,crd,1964,1964-11-15,crd-1964,nyg,196,10,Sun,T,5-3-2,10.0,10.0,12.0,143.0,105.0,38.0,3.0,12.0,212.0,131.0,81.0,3.0,,,,,,1,9.0,9.0,crd-nyg-1964-11-15,1,0
87,crd,1966,1966-10-16,crd-1966,dal,196,6,Sun,T,5-0-1,10.0,10.0,13.0,175.0,130.0,45.0,,17.0,326.0,191.0,135.0,4.0,,,,,,1,,16.0,crd-dal-1966-10-16,1,0


In [22]:


game_df = (game_df.assign(wins=game_df.team_record.str.split('-').str.get(0),
                          losses=game_df.team_record.str.split('-').str.get(1),
                          ties=game_df.team_record.str.split('-').str.get(2)))
game_df.sample()

Unnamed: 0,team,year,date,team_year,opp,decade,week_num,game_day_of_week,game_outcome,team_record,pts_off,pts_def,first_down_off,yards_off,pass_yds_off,rush_yds_off,to_off,first_down_def,yards_def,pass_yds_def,rush_yds_def,to_def,game_time,overtime,exp_pts_off,exp_pts_def,exp_pts_st,home,to2_off,to2_def,game_id,result_tie,result_win,wins,losses,ties
21629,phi,2019,2019-09-15,phi-2019,atl,201,2.0,Sun,L,1-1,20.0,24.0,18.0,286.0,237.0,49.0,3.0,19.0,367.0,310.0,57.0,3.0,8.0,,-5.0,-3.0,4.0,0,9.0,9.0,atl-phi-2019-09-15,0,0,1.0,1.0,
23700,sdg,2019,2019-10-20,sdg-2019,oti,201,7.0,Sun,L,2-5,20.0,23.0,24.0,365.0,326.0,39.0,1.0,22.0,403.0,306.0,97.0,1.0,4.0,,8.0,-10.0,-0.0,0,1.0,1.0,oti-sdg-2019-10-20,0,0,2.0,5.0,
25080,sea,1995,1995-10-01,sea-1995,den,199,5.0,Sun,W,2-2,27.0,10.0,26.0,422.0,192.0,230.0,,18.0,318.0,243.0,75.0,1.0,4.0,,20.0,-0.0,-3.0,1,,1.0,den-sea-1995-10-01,0,1,2.0,2.0,
17627,nyg,1962,1962-12-09,nyg-1962,cle,196,13.0,Sun,W,11-2,17.0,13.0,19.0,314.0,99.0,215.0,1.0,16.0,236.0,147.0,89.0,,,,,,,1,1.0,,cle-nyg-1962-12-09,0,1,11.0,2.0,
17288,nor,2004,2004-01-02,nor-2004,car,200,17.0,Sun,W,8-8,21.0,18.0,15.0,360.0,196.0,164.0,,21.0,320.0,274.0,46.0,3.0,1.0,,0.0,9.0,-8.0,0,,9.0,car-nor-2004-01-02,0,1,8.0,8.0,
11136,htx,2020,NaT,htx-2020,,202,8.0,,,,,,,,,,,,,,,,,,,,,1,,,htx-nan-,0,0,,,
12693,kan,1963,1963-10-20,kan-1963,sdg,196,7.0,Sun,L,2-3-1,17.0,38.0,17.0,325.0,247.0,78.0,3.0,18.0,402.0,255.0,147.0,,,,,,,1,9.0,,kan-sdg-1963-10-20,0,0,2.0,3.0,1.0
26104,ram,1993,1993-12-26,ram-1993,cle,199,17.0,Sun,L,4-11,14.0,42.0,20.0,382.0,287.0,95.0,3.0,23.0,315.0,213.0,102.0,,4.0,,,,,1,9.0,,cle-ram-1993-12-26,0,0,4.0,11.0,
18408,nyg,2009,2009-01-03,nyg-2009,min,200,17.0,Sun,L,8-8,7.0,44.0,11.0,181.0,146.0,35.0,2.0,28.0,487.0,358.0,129.0,,1.0,,-13.0,-25.0,6.0,0,4.0,,min-nyg-2009-01-03,0,0,8.0,8.0,
11508,clt,1984,1984-09-16,clt-1984,crd,198,3.0,Sun,L,1-2,33.0,34.0,22.0,399.0,210.0,189.0,3.0,19.0,397.0,265.0,132.0,3.0,1.0,,,,,1,9.0,9.0,clt-crd-1984-09-16,0,0,1.0,2.0,


Okay, almost there. Our ties are coming through, but we get NaNs for teams that haven't tied, so we'll need to replace those with zeros.

In [23]:
# fill ties NaNs with zero
game_df['ties'] = game_df.ties.fillna(0)

In [24]:
# add 'margin' col
margins = game_df.pts_off - game_df.pts_def
game_df.insert(loc=11, column='margin', value=margins)
game_df[mask].head()

Unnamed: 0,team,year,date,team_year,opp,decade,week_num,game_day_of_week,game_outcome,team_record,pts_off,margin,pts_def,first_down_off,yards_off,pass_yds_off,rush_yds_off,to_off,first_down_def,yards_def,pass_yds_def,rush_yds_def,to_def,game_time,overtime,exp_pts_off,exp_pts_def,exp_pts_st,home,to2_off,to2_def,game_id,result_tie,result_win,wins,losses,ties
9,crd,1960,1960-11-27,crd-1960,cle,196,10,Sun,T,5-4-1,17.0,0.0,17.0,21.0,322.0,127.0,195.0,6.0,17.0,274.0,88.0,186.0,2.0,,,,,,1,36.0,4.0,cle-crd-1960-11-27,1,0,5,4,1
30,crd,1962,1962-10-14,crd-1962,was,196,5,Sun,T,1-3-1,17.0,0.0,17.0,16.0,309.0,251.0,58.0,2.0,15.0,321.0,259.0,62.0,1.0,,,,,,1,4.0,1.0,crd-was-1962-10-14,1,0,1,3,1
55,crd,1964,1964-09-20,crd-1964,cle,196,2,Sun,T,1-0-1,33.0,0.0,33.0,17.0,344.0,230.0,114.0,3.0,18.0,281.0,166.0,115.0,3.0,,,,,,0,9.0,9.0,cle-crd-1964-09-20,1,0,1,0,1
63,crd,1964,1964-11-15,crd-1964,nyg,196,10,Sun,T,5-3-2,10.0,0.0,10.0,12.0,143.0,105.0,38.0,3.0,12.0,212.0,131.0,81.0,3.0,,,,,,1,9.0,9.0,crd-nyg-1964-11-15,1,0,5,3,2
87,crd,1966,1966-10-16,crd-1966,dal,196,6,Sun,T,5-0-1,10.0,0.0,10.0,13.0,175.0,130.0,45.0,,17.0,326.0,191.0,135.0,4.0,,,,,,1,,16.0,crd-dal-1966-10-16,1,0,5,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28508,was,1969,1969-10-05,was-1969,sfo,196,3,Sun,T,1-1-1,17.0,0.0,17.0,20.0,304.0,250.0,54.0,2.0,19.0,271.0,144.0,127.0,1.0,,,,,,0,4.0,1.0,sfo-was-1969-10-05,1,0,1,1,1
28513,was,1969,1969-11-09,was-1969,phi,196,8,Sun,T,4-2-2,28.0,0.0,28.0,18.0,339.0,222.0,117.0,1.0,25.0,372.0,199.0,173.0,3.0,,,,,,1,1.0,9.0,phi-was-1969-11-09,1,0,4,2,2
28541,was,1971,1971-11-07,was-1971,phi,197,8,Sun,T,6-1-1,7.0,0.0,7.0,14.0,197.0,136.0,61.0,7.0,10.0,172.0,95.0,77.0,1.0,1.0,,,,,1,49.0,1.0,phi-was-1971-11-07,1,0,6,1,1
28989,was,1997,1997-11-23,was-1997,nyg,199,13,Sun,T,6-5-1,7.0,0.0,7.0,23.0,374.0,291.0,83.0,4.0,18.0,262.0,105.0,157.0,2.0,8.0,OT,-24.0,23.0,-2.0,1,16.0,4.0,nyg-was-1997-11-23,1,0,6,5,1


In [26]:
# drop rows for bye weeks
game_df.dropna(axis=0, how='any', subset=['game_outcome'], inplace=True)


# drop exp_pts cols
game_df.drop(['exp_pts_off', 'exp_pts_def', 'exp_pts_st'], axis=1, inplace=True)

In [None]:
# add 'prev_week' cols

cols_to_shift = ['wins', 'losses', 'ties', 'pts_off', 'pts_def',
       'margin', 'first_down_off', 'yards_off', 'pass_yds_off', 'rush_yds_off',
       'to_off', 'first_down_def', 'yards_def', 'pass_yds_def', 'rush_yds_def',
       'to_def', 'result_tie', 'result_win']

for col in cols_to_shift:
    new_col = 'prev_' + col
    game_df[new_col] = game_df.groupby('team_year')[col].apply(lambda grp: grp.shift(1))

In [None]:
mask = game_df.team_year == 'chi-2019'
game_df[mask]

In [None]:
# create rolling three feature for these stats:
# using margin instead of wins here... if we want to get num wins in last three weeks,
#  will need to do differently using sum instead of mean

roll_cols = ['pts_off', 'pts_def', 'margin', 'first_down_off', 'yards_off',
             'pass_yds_off', 'rush_yds_off', 'to_off', 'yards_def', 'pass_yds_def',
             'rush_yds_def', 'to_def', 'exp_pts_off', 'exp_pts_def', 'exp_pts_st']

In [None]:
roll3_cols = ['roll3_' + col_name for col_name in roll_cols]

game_df[roll3_cols] = (game_df.groupby('team_year')[roll_cols]
                      .transform(lambda x: round(x.shift(1).rolling(3).mean(), 3)))


In [None]:
# get rolling wins (sum)

rolling_wins = (game_df.groupby('team_year')['result_win']
                      .transform(lambda x: round(x.shift(1).rolling(3).sum(), 3)))

game_df.insert(loc=38, column='roll3_wins', value=rolling_wins)

rolling_ties = (game_df.groupby('team_year')['result_win']
                      .transform(lambda x: round(x.shift(1).rolling(3).sum(), 3)))

game_df.insert(loc=38, column='roll3_ties', value=rolling_ties)

Now let's add more columns with **Expontentially Weighted Moving Averages (EWMA)** of our stats. This is similar to taking a rolling mean of a team's previous 6 performances, but EWMA weights the most recent game in the window much higher than the week before that, and so on.

For this, we will use a 3-16 week span (greedy). My hypothesis is that the most recent performances are the best predictors, and this should help further quantify recent performance.

In [None]:
# add ewma cols

ewma_cols = ['ewma_' + col_name for col_name in roll_cols]

game_df[ewma_cols] = (game_df.groupby('team_year')[roll_cols]
                      .transform(lambda x: round(x.shift(1).ewm(span=16, min_periods=3).mean(), 3)))


In [None]:
mask = game_df.team_year == 'nor-2014'
game_df[mask][['week_num', 'team', 'pass_yds_off', 'prev_pass_yds_off', 'ewma_pass_yds_off', 'roll3_pass_yds_off']].head(18)

In [None]:
np.mean([197, 120, 208])

Looks good!!