In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
pd.set_option("display.max_columns", 101)
pd.set_option("display.max_rows", 73)

In [3]:
team_df = pd.read_pickle('data/team_stats_scraped.pickle')
game_df = pd.read_pickle('data/games_scraped.pickle')

In [4]:
game_df.shape

(29399, 26)

In [5]:
mask = (game_df.team == 'chi') & (game_df.year == 2019)
game_df[mask]

Unnamed: 0,team,year,opp,week_num,game_day_of_week,game_date,game_outcome,team_record,game_location,pts_off,pts_def,first_down_off,yards_off,pass_yds_off,rush_yds_off,to_off,first_down_def,yards_def,pass_yds_def,rush_yds_def,to_def,game_time,overtime,exp_pts_off,exp_pts_def,exp_pts_st
4819,chi,2019,gnb,1,Thu,September 5,L,0-1,,3.0,10.0,16.0,254.0,208.0,46.0,1.0,13.0,213.0,166.0,47.0,,8.0,,-20.0,9.0,-2.0
4820,chi,2019,den,2,Sun,September 15,W,1-1,@,16.0,14.0,18.0,273.0,120.0,153.0,,27.0,372.0,282.0,90.0,1.0,4.0,,4.0,-4.0,5.0
4821,chi,2019,was,3,Mon,September 23,W,2-1,@,31.0,15.0,21.0,298.0,208.0,90.0,1.0,25.0,356.0,287.0,69.0,5.0,8.0,,9.0,11.0,-3.0
4822,chi,2019,min,4,Sun,September 29,W,3-1,,16.0,6.0,17.0,269.0,197.0,72.0,,15.0,222.0,182.0,40.0,2.0,4.0,,-2.0,10.0,1.0
4823,chi,2019,rai,5,Sun,October 6,L,3-2,@,21.0,24.0,15.0,236.0,194.0,42.0,2.0,25.0,398.0,229.0,169.0,2.0,1.0,,-3.0,-2.0,-0.0
4824,chi,2019,,6,,,,,,,,,,,,,,,,,,,,,,
4825,chi,2019,nor,7,Sun,October 20,L,3-3,,25.0,36.0,16.0,252.0,235.0,17.0,2.0,24.0,424.0,273.0,151.0,,4.0,,-12.0,-13.0,10.0
4826,chi,2019,sdg,8,Sun,October 27,L,3-4,,16.0,17.0,26.0,388.0,226.0,162.0,2.0,11.0,231.0,195.0,36.0,1.0,1.0,,0.0,5.0,-6.0
4827,chi,2019,phi,9,Sun,November 3,L,3-5,@,14.0,22.0,10.0,164.0,102.0,62.0,1.0,26.0,373.0,227.0,146.0,,1.0,,-4.0,-10.0,2.0
4828,chi,2019,det,10,Sun,November 10,W,4-5,,20.0,13.0,13.0,226.0,145.0,81.0,,21.0,357.0,259.0,98.0,1.0,1.0,,-0.0,6.0,-2.0


In [6]:
# clean the game_location column & apply change. def home_game(row):
def home_game(row):    
    if row == '@':
        return 0
    return 1

game_df['home'] = game_df.game_location.apply(home_game)
game_df.drop("game_location", axis=1, errors='ignore', inplace=True)

In [7]:
# add team_year identifier to each row (to easily groupby team/year)

team_years = game_df['team'] + '-' +  game_df['year'].astype(str) 
game_df.insert(loc=2, column='team_year', value=team_years)
game_df.sample()

Unnamed: 0,team,year,team_year,opp,week_num,game_day_of_week,game_date,game_outcome,team_record,pts_off,pts_def,first_down_off,yards_off,pass_yds_off,rush_yds_off,to_off,first_down_def,yards_def,pass_yds_def,rush_yds_def,to_def,game_time,overtime,exp_pts_off,exp_pts_def,exp_pts_st,home
4345,chi,1992,chi-1992,nyg,3,Mon,September 21,L,1-2,14.0,27.0,16.0,274.0,175.0,99.0,2.0,26.0,386.0,212.0,174.0,1.0,9.0,,,,,1


Datetime formatting strings:

- ``%B`` month fullname
- ``%-d`` day of month without leading zero\*
    - \*note: the ``-`` only works on Linux/OS-X...for windows replace with '#'
- ``%Y`` full year

In [8]:
# convert game date to datetime & add new column to DF


f = r"%B %d-%Y"
full_game_date = game_df.game_date + '-' + game_df.year.astype(str)
full_game_date = pd.to_datetime(full_game_date, format=f)
game_df.insert(loc=2, column='date', value=full_game_date)
game_df.sample(3)

Unnamed: 0,team,year,date,team_year,opp,week_num,game_day_of_week,game_date,game_outcome,team_record,pts_off,pts_def,first_down_off,yards_off,pass_yds_off,rush_yds_off,to_off,first_down_def,yards_def,pass_yds_def,rush_yds_def,to_def,game_time,overtime,exp_pts_off,exp_pts_def,exp_pts_st,home
26379,ram,2009,2009-10-04,ram-2009,sfo,4,Sun,October 4,L,0-4,0.0,35.0,9.0,177.0,82.0,95.0,3.0,13.0,228.0,132.0,96.0,,4.0,,-29.0,2.0,-5.0,0
26913,tam,1996,1996-09-15,tam-1996,den,3,Sun,September 15,L,0-3,23.0,27.0,15.0,307.0,186.0,121.0,2.0,20.0,374.0,180.0,194.0,3.0,8.0,,-7.0,1.0,0.0,0
18166,nyg,1996,1996-12-08,nyg-1996,mia,15,Sun,December 8,W,6-8,17.0,7.0,21.0,280.0,149.0,131.0,1.0,15.0,294.0,191.0,103.0,2.0,1.0,,0.0,8.0,2.0,0


In [9]:
# now drop the game_date col
game_df.drop("game_date", axis=1, inplace=True, errors='ignore')
game_df.sample(2) # to confirm

Unnamed: 0,team,year,date,team_year,opp,week_num,game_day_of_week,game_outcome,team_record,pts_off,pts_def,first_down_off,yards_off,pass_yds_off,rush_yds_off,to_off,first_down_def,yards_def,pass_yds_def,rush_yds_def,to_def,game_time,overtime,exp_pts_off,exp_pts_def,exp_pts_st,home
18585,nyg,2019,2019-12-29,nyg-2019,phi,17,Sun,L,4-12,17.0,34.0,19.0,397.0,275.0,122.0,2.0,25.0,400.0,279.0,121.0,,4.0,,-6.0,-6.0,0.0,1
26221,ram,2000,2000-11-05,ram-2000,car,10,Sun,L,7-2,24.0,27.0,20.0,426.0,395.0,31.0,2.0,18.0,268.0,178.0,90.0,2.0,8.0,,11.0,-0.0,-8.0,1


In [10]:
# add decade column
decades = game_df['year'] // 10
game_df.insert(loc=5, column='decade', value=decades)
game_df[['date', 'decade']].sample(10)

Unnamed: 0,date,decade
24158,1988-09-25,198
29103,2004-10-10,200
10938,2009-12-06,200
16849,1978-11-12,197
21086,1989-12-31,198
27834,1990-12-23,199
22420,2005-10-16,200
22810,1965-12-26,196
22153,1991-09-15,199
25187,2001-11-04,200


In [11]:
# add turnovers^2 col

game_df[['to2_off', 'to2_def']] = game_df[['to_off', 'to_def']].to_numpy() ** 2
game_df[['to_off', 'to2_off', 'to_def', 'to2_def']]

Unnamed: 0,to_off,to2_off,to_def,to2_def
0,5.0,25.0,6.0,36.0
1,5.0,25.0,6.0,36.0
2,6.0,36.0,3.0,9.0
3,6.0,36.0,,
4,3.0,9.0,2.0,4.0
...,...,...,...,...
29394,2.0,4.0,1.0,1.0
29395,4.0,16.0,2.0,4.0
29396,2.0,4.0,3.0,9.0
29397,,,,


In [12]:
# add unique game identifier that will be the same for both teams in the same game


def apply_game_id(row):
    teams = []
    teams.append(str(row['team']))
    teams.append(str(row['opp']))
    teams.sort()
    
    game_id = teams[0] + '-' +  teams[1] + '-' + str(row['date'])[:-9]
    return game_id

In [13]:
game_df['game_id'] = game_df.apply(apply_game_id, axis=1)
game_df

Unnamed: 0,team,year,date,team_year,opp,decade,week_num,game_day_of_week,game_outcome,team_record,pts_off,pts_def,first_down_off,yards_off,pass_yds_off,rush_yds_off,to_off,first_down_def,yards_def,pass_yds_def,rush_yds_def,to_def,game_time,overtime,exp_pts_off,exp_pts_def,exp_pts_st,home,to2_off,to2_def,game_id
0,crd,1960,1960-09-23,crd-1960,ram,196,1,Fri,W,1-0,43.0,21.0,20.0,430.0,281.0,149.0,5.0,16.0,238.0,188.0,50.0,6.0,,,,,,0,25.0,36.0,crd-ram-1960-09-23
1,crd,1960,1960-10-02,crd-1960,nyg,196,2,Sun,L,1-1,14.0,35.0,11.0,222.0,115.0,107.0,5.0,18.0,405.0,268.0,137.0,6.0,,,,,,1,25.0,36.0,crd-nyg-1960-10-02
2,crd,1960,1960-10-09,crd-1960,phi,196,3,Sun,L,1-2,27.0,31.0,20.0,364.0,145.0,219.0,6.0,14.0,293.0,199.0,94.0,3.0,,,,,,0,36.0,9.0,crd-phi-1960-10-09
3,crd,1960,1960-10-16,crd-1960,pit,196,4,Sun,L,1-3,14.0,27.0,20.0,295.0,154.0,141.0,6.0,12.0,237.0,146.0,91.0,,,,,,,0,36.0,,crd-pit-1960-10-16
4,crd,1960,1960-10-23,crd-1960,dal,196,5,Sun,W,2-3,12.0,10.0,17.0,306.0,80.0,226.0,3.0,9.0,175.0,124.0,51.0,2.0,,,,,,1,9.0,4.0,crd-dal-1960-10-23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29394,was,2020,2020-12-20,was-2020,sea,202,15,Sun,L,6-8,15.0,20.0,26.0,353.0,269.0,84.0,2.0,16.0,302.0,121.0,181.0,1.0,1.0,,1.0,-3.0,-4.0,1,4.0,1.0,sea-was-2020-12-20
29395,was,2020,2020-12-27,was-2020,car,202,16,Sun,L,6-9,13.0,20.0,20.0,386.0,278.0,108.0,4.0,19.0,280.0,167.0,113.0,2.0,4.0,,-5.0,5.0,-7.0,1,16.0,4.0,car-was-2020-12-27
29396,was,2020,2020-01-03,was-2020,phi,202,17,Sun,W,7-9,20.0,14.0,16.0,248.0,145.0,103.0,2.0,16.0,216.0,98.0,118.0,3.0,8.0,,-8.0,11.0,3.0,0,4.0,9.0,phi-was-2020-01-03
29397,was,2020,NaT,was-2020,,202,,,,,,,,,,,,,,,,,,,,,,1,,,nan-was-


In [14]:
cat_game_outcome = game_df.loc[:, ['game_outcome']]
cat_game_outcome.value_counts()

game_outcome
L               13773
W               13773
T                 254
dtype: int64

In [15]:
# convert game_outcome col to binary to avoid multicolinearity

game_df[['result_tie', 'result_win']] = pd.get_dummies(game_df.game_outcome, drop_first=True)
game_df.sample(10)

Unnamed: 0,team,year,date,team_year,opp,decade,week_num,game_day_of_week,game_outcome,team_record,pts_off,pts_def,first_down_off,yards_off,pass_yds_off,rush_yds_off,to_off,first_down_def,yards_def,pass_yds_def,rush_yds_def,to_def,game_time,overtime,exp_pts_off,exp_pts_def,exp_pts_st,home,to2_off,to2_def,game_id,result_tie,result_win
25132,sea,1998,1998-10-11,sea-1998,den,199,6,Sun,L,3-3,16.0,21.0,16.0,319.0,242.0,77.0,1.0,17.0,373.0,171.0,202.0,2.0,4.0,,-6.0,0.0,-2.0,1,1.0,4.0,den-sea-1998-10-11,0,0
28514,was,1969,1969-11-16,was-1969,dal,196,9,Sun,L,4-3-2,28.0,41.0,19.0,377.0,320.0,57.0,4.0,23.0,365.0,147.0,218.0,1.0,,,,,,1,16.0,1.0,dal-was-1969-11-16,0,0
16369,nwe,2005,2005-09-25,nwe-2005,pit,200,3,Sun,W,2-1,23.0,20.0,24.0,426.0,346.0,80.0,3.0,14.0,269.0,190.0,79.0,1.0,4.0,,-1.0,1.0,2.0,0,9.0,1.0,nwe-pit-2005-09-25,0,1
10554,gnb,2007,NaT,gnb-2007,,200,,,,,,,,,,,,,,,,,,,,,,1,,,gnb-nan-,0,0
10277,gnb,1993,1993-11-08,gnb-1993,kan,199,10,Mon,L,4-4,16.0,23.0,20.0,287.0,188.0,99.0,6.0,15.0,253.0,153.0,100.0,,9.0,,,,,0,36.0,,gnb-kan-1993-11-08,0,0
13842,mia,1977,1977-11-13,mia-1977,nwe,197,9,Sun,W,7-2,17.0,5.0,14.0,173.0,102.0,71.0,1.0,17.0,274.0,140.0,134.0,1.0,1.0,,,,,1,1.0,1.0,mia-nwe-1977-11-13,0,1
3541,car,2003,2003-01-10,car-2003,ram,200,Division,Sat,W,13-5,29.0,23.0,24.0,485.0,269.0,216.0,1.0,23.0,380.0,316.0,64.0,3.0,4.0,OT,10.0,-0.0,-2.0,0,1.0,9.0,car-ram-2003-01-10,0,1
5193,cin,1989,1989-12-17,cin-1989,oti,198,15,Sun,W,8-7,61.0,7.0,35.0,584.0,392.0,192.0,2.0,14.0,194.0,155.0,39.0,5.0,1.0,,,,,1,4.0,25.0,cin-oti-1989-12-17,0,1
29356,was,2018,2018-11-18,was-2018,htx,201,11,Sun,L,6-4,21.0,23.0,23.0,278.0,154.0,124.0,2.0,19.0,320.0,181.0,139.0,3.0,1.0,,-6.0,3.0,1.0,1,4.0,9.0,htx-was-2018-11-18,0,0
22398,pit,2004,2004-10-03,pit-2004,cin,200,4,Sun,W,3-1,28.0,17.0,23.0,333.0,168.0,165.0,2.0,22.0,293.0,156.0,137.0,3.0,1.0,,4.0,7.0,2.0,1,4.0,9.0,cin-pit-2004-10-03,0,1


In [16]:
mask = game_df.game_outcome == "T"
game_df[mask].head()

Unnamed: 0,team,year,date,team_year,opp,decade,week_num,game_day_of_week,game_outcome,team_record,pts_off,pts_def,first_down_off,yards_off,pass_yds_off,rush_yds_off,to_off,first_down_def,yards_def,pass_yds_def,rush_yds_def,to_def,game_time,overtime,exp_pts_off,exp_pts_def,exp_pts_st,home,to2_off,to2_def,game_id,result_tie,result_win
9,crd,1960,1960-11-27,crd-1960,cle,196,10,Sun,T,5-4-1,17.0,17.0,21.0,322.0,127.0,195.0,6.0,17.0,274.0,88.0,186.0,2.0,,,,,,1,36.0,4.0,cle-crd-1960-11-27,1,0
30,crd,1962,1962-10-14,crd-1962,was,196,5,Sun,T,1-3-1,17.0,17.0,16.0,309.0,251.0,58.0,2.0,15.0,321.0,259.0,62.0,1.0,,,,,,1,4.0,1.0,crd-was-1962-10-14,1,0
55,crd,1964,1964-09-20,crd-1964,cle,196,2,Sun,T,1-0-1,33.0,33.0,17.0,344.0,230.0,114.0,3.0,18.0,281.0,166.0,115.0,3.0,,,,,,0,9.0,9.0,cle-crd-1964-09-20,1,0
63,crd,1964,1964-11-15,crd-1964,nyg,196,10,Sun,T,5-3-2,10.0,10.0,12.0,143.0,105.0,38.0,3.0,12.0,212.0,131.0,81.0,3.0,,,,,,1,9.0,9.0,crd-nyg-1964-11-15,1,0
87,crd,1966,1966-10-16,crd-1966,dal,196,6,Sun,T,5-0-1,10.0,10.0,13.0,175.0,130.0,45.0,,17.0,326.0,191.0,135.0,4.0,,,,,,1,,16.0,crd-dal-1966-10-16,1,0


In [17]:


game_df = (game_df.assign(wins=game_df.team_record.str.split('-').str.get(0),
                          losses=game_df.team_record.str.split('-').str.get(1),
                          ties=game_df.team_record.str.split('-').str.get(2)))
game_df.sample()

Unnamed: 0,team,year,date,team_year,opp,decade,week_num,game_day_of_week,game_outcome,team_record,pts_off,pts_def,first_down_off,yards_off,pass_yds_off,rush_yds_off,to_off,first_down_def,yards_def,pass_yds_def,rush_yds_def,to_def,game_time,overtime,exp_pts_off,exp_pts_def,exp_pts_st,home,to2_off,to2_def,game_id,result_tie,result_win,wins,losses,ties
21810,pit,1970,1970-11-15,pit-1970,kan,197,9,Sun,L,4-5,14.0,31.0,15.0,202.0,99.0,103.0,6.0,21.0,424.0,257.0,167.0,3.0,1.0,,,,,1,36.0,9.0,kan-pit-1970-11-15,0,0,4,5,


Okay, almost there. Our ties are coming through, but we get NaNs for teams that haven't tied, so we'll need to replace those with zeros.

In [18]:
# fill ties NaNs with zero
game_df['ties'] = game_df.ties.fillna(0)

In [19]:
# add 'margin' col
margins = game_df.pts_off - game_df.pts_def
game_df.insert(loc=11, column='margin', value=margins)
game_df[mask].head()

Unnamed: 0,team,year,date,team_year,opp,decade,week_num,game_day_of_week,game_outcome,team_record,pts_off,margin,pts_def,first_down_off,yards_off,pass_yds_off,rush_yds_off,to_off,first_down_def,yards_def,pass_yds_def,rush_yds_def,to_def,game_time,overtime,exp_pts_off,exp_pts_def,exp_pts_st,home,to2_off,to2_def,game_id,result_tie,result_win,wins,losses,ties
9,crd,1960,1960-11-27,crd-1960,cle,196,10,Sun,T,5-4-1,17.0,0.0,17.0,21.0,322.0,127.0,195.0,6.0,17.0,274.0,88.0,186.0,2.0,,,,,,1,36.0,4.0,cle-crd-1960-11-27,1,0,5,4,1
30,crd,1962,1962-10-14,crd-1962,was,196,5,Sun,T,1-3-1,17.0,0.0,17.0,16.0,309.0,251.0,58.0,2.0,15.0,321.0,259.0,62.0,1.0,,,,,,1,4.0,1.0,crd-was-1962-10-14,1,0,1,3,1
55,crd,1964,1964-09-20,crd-1964,cle,196,2,Sun,T,1-0-1,33.0,0.0,33.0,17.0,344.0,230.0,114.0,3.0,18.0,281.0,166.0,115.0,3.0,,,,,,0,9.0,9.0,cle-crd-1964-09-20,1,0,1,0,1
63,crd,1964,1964-11-15,crd-1964,nyg,196,10,Sun,T,5-3-2,10.0,0.0,10.0,12.0,143.0,105.0,38.0,3.0,12.0,212.0,131.0,81.0,3.0,,,,,,1,9.0,9.0,crd-nyg-1964-11-15,1,0,5,3,2
87,crd,1966,1966-10-16,crd-1966,dal,196,6,Sun,T,5-0-1,10.0,0.0,10.0,13.0,175.0,130.0,45.0,,17.0,326.0,191.0,135.0,4.0,,,,,,1,,16.0,crd-dal-1966-10-16,1,0,5,0,1


In [20]:
# drop rows for bye weeks
game_df.dropna(axis=0, how='any', subset=['game_outcome'], inplace=True)


# drop exp_pts cols
game_df.drop(['exp_pts_off', 'exp_pts_def', 'exp_pts_st'], axis=1, inplace=True)

In [21]:
# add 'prev_week' cols

cols_to_shift = ['wins', 'losses', 'ties', 'pts_off', 'pts_def',
       'margin', 'first_down_off', 'yards_off', 'pass_yds_off', 'rush_yds_off',
       'to_off', 'first_down_def', 'yards_def', 'pass_yds_def', 'rush_yds_def',
       'to_def', 'result_tie', 'result_win']

for col in cols_to_shift:
    new_col = 'prev_' + col
    game_df[new_col] = game_df.groupby('team_year')[col].apply(lambda grp: grp.shift(1))

In [22]:
mask = game_df.team_year == 'chi-2019'
game_df[mask]

Unnamed: 0,team,year,date,team_year,opp,decade,week_num,game_day_of_week,game_outcome,team_record,pts_off,margin,pts_def,first_down_off,yards_off,pass_yds_off,rush_yds_off,to_off,first_down_def,yards_def,pass_yds_def,rush_yds_def,to_def,game_time,overtime,home,to2_off,to2_def,game_id,result_tie,result_win,wins,losses,ties,prev_wins,prev_losses,prev_ties,prev_pts_off,prev_pts_def,prev_margin,prev_first_down_off,prev_yards_off,prev_pass_yds_off,prev_rush_yds_off,prev_to_off,prev_first_down_def,prev_yards_def,prev_pass_yds_def,prev_rush_yds_def,prev_to_def,prev_result_tie,prev_result_win
4819,chi,2019,2019-09-05,chi-2019,gnb,201,1,Thu,L,0-1,3.0,-7.0,10.0,16.0,254.0,208.0,46.0,1.0,13.0,213.0,166.0,47.0,,8.0,,1,1.0,,chi-gnb-2019-09-05,0,0,0,1,0,,,,,,,,,,,,,,,,,,
4820,chi,2019,2019-09-15,chi-2019,den,201,2,Sun,W,1-1,16.0,2.0,14.0,18.0,273.0,120.0,153.0,,27.0,372.0,282.0,90.0,1.0,4.0,,0,,1.0,chi-den-2019-09-15,0,1,1,1,0,0.0,1.0,0.0,3.0,10.0,-7.0,16.0,254.0,208.0,46.0,1.0,13.0,213.0,166.0,47.0,,0.0,0.0
4821,chi,2019,2019-09-23,chi-2019,was,201,3,Mon,W,2-1,31.0,16.0,15.0,21.0,298.0,208.0,90.0,1.0,25.0,356.0,287.0,69.0,5.0,8.0,,0,1.0,25.0,chi-was-2019-09-23,0,1,2,1,0,1.0,1.0,0.0,16.0,14.0,2.0,18.0,273.0,120.0,153.0,,27.0,372.0,282.0,90.0,1.0,0.0,1.0
4822,chi,2019,2019-09-29,chi-2019,min,201,4,Sun,W,3-1,16.0,10.0,6.0,17.0,269.0,197.0,72.0,,15.0,222.0,182.0,40.0,2.0,4.0,,1,,4.0,chi-min-2019-09-29,0,1,3,1,0,2.0,1.0,0.0,31.0,15.0,16.0,21.0,298.0,208.0,90.0,1.0,25.0,356.0,287.0,69.0,5.0,0.0,1.0
4823,chi,2019,2019-10-06,chi-2019,rai,201,5,Sun,L,3-2,21.0,-3.0,24.0,15.0,236.0,194.0,42.0,2.0,25.0,398.0,229.0,169.0,2.0,1.0,,0,4.0,4.0,chi-rai-2019-10-06,0,0,3,2,0,3.0,1.0,0.0,16.0,6.0,10.0,17.0,269.0,197.0,72.0,,15.0,222.0,182.0,40.0,2.0,0.0,1.0
4825,chi,2019,2019-10-20,chi-2019,nor,201,7,Sun,L,3-3,25.0,-11.0,36.0,16.0,252.0,235.0,17.0,2.0,24.0,424.0,273.0,151.0,,4.0,,1,4.0,,chi-nor-2019-10-20,0,0,3,3,0,3.0,2.0,0.0,21.0,24.0,-3.0,15.0,236.0,194.0,42.0,2.0,25.0,398.0,229.0,169.0,2.0,0.0,0.0
4826,chi,2019,2019-10-27,chi-2019,sdg,201,8,Sun,L,3-4,16.0,-1.0,17.0,26.0,388.0,226.0,162.0,2.0,11.0,231.0,195.0,36.0,1.0,1.0,,1,4.0,1.0,chi-sdg-2019-10-27,0,0,3,4,0,3.0,3.0,0.0,25.0,36.0,-11.0,16.0,252.0,235.0,17.0,2.0,24.0,424.0,273.0,151.0,,0.0,0.0
4827,chi,2019,2019-11-03,chi-2019,phi,201,9,Sun,L,3-5,14.0,-8.0,22.0,10.0,164.0,102.0,62.0,1.0,26.0,373.0,227.0,146.0,,1.0,,0,1.0,,chi-phi-2019-11-03,0,0,3,5,0,3.0,4.0,0.0,16.0,17.0,-1.0,26.0,388.0,226.0,162.0,2.0,11.0,231.0,195.0,36.0,1.0,0.0,0.0
4828,chi,2019,2019-11-10,chi-2019,det,201,10,Sun,W,4-5,20.0,7.0,13.0,13.0,226.0,145.0,81.0,,21.0,357.0,259.0,98.0,1.0,1.0,,1,,1.0,chi-det-2019-11-10,0,1,4,5,0,3.0,5.0,0.0,14.0,22.0,-8.0,10.0,164.0,102.0,62.0,1.0,26.0,373.0,227.0,146.0,,0.0,0.0
4829,chi,2019,2019-11-17,chi-2019,ram,201,11,Sun,L,4-6,7.0,-10.0,17.0,17.0,267.0,193.0,74.0,1.0,13.0,283.0,173.0,110.0,2.0,8.0,,0,1.0,4.0,chi-ram-2019-11-17,0,0,4,6,0,4.0,5.0,0.0,20.0,13.0,7.0,13.0,226.0,145.0,81.0,,21.0,357.0,259.0,98.0,1.0,0.0,1.0


In [23]:
game_df.columns

Index(['team', 'year', 'date', 'team_year', 'opp', 'decade', 'week_num',
       'game_day_of_week', 'game_outcome', 'team_record', 'pts_off', 'margin',
       'pts_def', 'first_down_off', 'yards_off', 'pass_yds_off',
       'rush_yds_off', 'to_off', 'first_down_def', 'yards_def', 'pass_yds_def',
       'rush_yds_def', 'to_def', 'game_time', 'overtime', 'home', 'to2_off',
       'to2_def', 'game_id', 'result_tie', 'result_win', 'wins', 'losses',
       'ties', 'prev_wins', 'prev_losses', 'prev_ties', 'prev_pts_off',
       'prev_pts_def', 'prev_margin', 'prev_first_down_off', 'prev_yards_off',
       'prev_pass_yds_off', 'prev_rush_yds_off', 'prev_to_off',
       'prev_first_down_def', 'prev_yards_def', 'prev_pass_yds_def',
       'prev_rush_yds_def', 'prev_to_def', 'prev_result_tie',
       'prev_result_win'],
      dtype='object')

In [24]:
# create rolling three feature for these stats:
# using margin instead of wins here... if we want to get num wins in last three weeks,
#  will need to do differently using sum instead of mean

# don't use prev_ columns, we will just shift them down when we use .transform()
roll_cols = ['result_win', 'result_tie', 'pts_off', 'pts_def', 'margin', 'first_down_off', 'yards_off', 
             'pass_yds_off', 'rush_yds_off', 'to_off', 'to2_off', 'yards_def', 'pass_yds_def',
             'rush_yds_def', 'to_def', 'to2_def']

In [25]:
roll3_cols = ['roll3_' + col_name for col_name in roll_cols]

game_df[roll3_cols] = (game_df.groupby('team_year')[roll_cols]
                      .transform(lambda x: round(x.shift(1).rolling(3).mean(), 3)))


In [26]:
# get rolling wins (sum) (commented out to try using mean)

rolling_wins = (game_df.groupby('team_year')['result_win']
                      .transform(lambda x: round(x.shift(1).rolling(3).sum(), 3)))

game_df.insert(loc=53, column='roll3_num_wins', value=rolling_wins)

rolling_ties = (game_df.groupby('team_year')['result_win']
                      .transform(lambda x: round(x.shift(1).rolling(3).sum(), 3)))

game_df.insert(loc=53, column='roll3_num_ties', value=rolling_ties)

Now let's add more columns with **Expontentially Weighted Moving Averages (EWMA)** of our stats. This is similar to taking a rolling mean of a team's previous 6 performances, but EWMA weights the most recent game in the window much higher than the week before that, and so on.

For this, we will use a 3-16 week span (greedy). My hypothesis is that the most recent performances are the best predictors, and this should help further quantify recent performance.

In [27]:
# add ewma cols

ewma_cols = ['ewma_' + col_name for col_name in roll_cols]

game_df[ewma_cols] = (game_df.groupby('team_year')[roll_cols]
                      .transform(lambda x: round(x.shift(1).ewm(span=16, min_periods=3).mean(), 3)))


In [28]:
mask = game_df.team_year == 'nor-2014'
game_df[mask][['week_num', 'team', 'pass_yds_off', 'prev_pass_yds_off', 'ewma_pass_yds_off', 'roll3_pass_yds_off']].head(18)

Unnamed: 0,week_num,team,pass_yds_off,prev_pass_yds_off,ewma_pass_yds_off,roll3_pass_yds_off
17457,1,nor,333.0,,,
17458,2,nor,223.0,333.0,,
17459,3,nor,288.0,223.0,,
17460,4,nor,334.0,288.0,279.612,281.333
17461,5,nor,371.0,334.0,295.858,281.667
17463,7,nor,335.0,371.0,314.862,331.0
17464,8,nor,302.0,335.0,319.348,346.667
17465,9,nor,270.0,302.0,315.851,336.0
17466,10,nor,287.0,270.0,307.324,302.333
17467,11,nor,255.0,287.0,303.786,286.333


In [29]:
np.mean([197, 120, 208])

175.0

Now, let's delete the first three rows of each team-year. We have to do this because we will be using at least some rolling columns for our model.

In [30]:
game_df.dropna(axis=0, how="any", subset=["roll3_pts_off"], inplace=True)

Looks good!!

**Now, we have to solve a pretty complex problem.**

Currently, each row has all the stats needed for the team in the 'team' column. However, we don't have the same information for the opponent in the same row. Let's make that happen.

To further clarify, for each row, we need to pull the opposing team's rolling stats, adding them back into the same row. This way, our model will only need one row to make a prediction.

Let's take a look at an example.

In [31]:
mask = (game_df.game_id == 'chi-rai-2019-10-06')
game_df[mask]


Unnamed: 0,team,year,date,team_year,opp,decade,week_num,game_day_of_week,game_outcome,team_record,pts_off,margin,pts_def,first_down_off,yards_off,pass_yds_off,rush_yds_off,to_off,first_down_def,yards_def,pass_yds_def,rush_yds_def,to_def,game_time,overtime,home,to2_off,to2_def,game_id,result_tie,result_win,wins,losses,ties,prev_wins,prev_losses,prev_ties,prev_pts_off,roll3_num_ties,roll3_num_wins,prev_pts_def,prev_margin,prev_first_down_off,prev_yards_off,prev_pass_yds_off,prev_rush_yds_off,prev_to_off,prev_first_down_def,prev_yards_def,prev_pass_yds_def,prev_rush_yds_def,prev_to_def,prev_result_tie,prev_result_win,roll3_result_win,roll3_result_tie,roll3_pts_off,roll3_pts_def,roll3_margin,roll3_first_down_off,roll3_yards_off,roll3_pass_yds_off,roll3_rush_yds_off,roll3_to_off,roll3_to2_off,roll3_yards_def,roll3_pass_yds_def,roll3_rush_yds_def,roll3_to_def,roll3_to2_def,ewma_result_win,ewma_result_tie,ewma_pts_off,ewma_pts_def,ewma_margin,ewma_first_down_off,ewma_yards_off,ewma_pass_yds_off,ewma_rush_yds_off,ewma_to_off,ewma_to2_off,ewma_yards_def,ewma_pass_yds_def,ewma_rush_yds_def,ewma_to_def,ewma_to2_def
4823,chi,2019,2019-10-06,chi-2019,rai,201,5,Sun,L,3-2,21.0,-3.0,24.0,15.0,236.0,194.0,42.0,2.0,25.0,398.0,229.0,169.0,2.0,1.0,,0,4.0,4.0,chi-rai-2019-10-06,0,0,3,2,0,3,1,0,16.0,3.0,3.0,6.0,10.0,17.0,269.0,197.0,72.0,,15.0,222.0,182.0,40.0,2.0,0.0,1.0,1.0,0.0,21.0,11.667,9.333,18.667,280.0,175.0,105.0,,,316.667,250.333,66.333,2.667,10.0,0.795,0.0,17.286,11.053,6.233,18.082,274.496,184.25,90.246,,,290.353,229.646,60.707,2.702,10.086
20606,rai,2019,2019-10-06,rai-2019,chi,201,5,Sun,W,3-2,24.0,3.0,21.0,25.0,398.0,229.0,169.0,2.0,15.0,236.0,194.0,42.0,2.0,1.0,,1,4.0,4.0,chi-rai-2019-10-06,0,1,3,2,0,2,2,0,31.0,1.0,1.0,24.0,7.0,21.0,377.0,189.0,188.0,1.0,22.0,346.0,265.0,81.0,2.0,0.0,1.0,0.333,0.0,18.333,28.667,-10.333,19.0,328.667,193.667,135.0,1.333,2.0,399.333,291.667,107.667,,,0.504,0.0,20.2,25.924,-5.725,19.07,336.851,207.394,129.456,1.293,1.878,384.007,277.499,106.508,,


So, in the above table, we need to get all of the roll3_x and ewma_x columns from the adjacent tables, respectively. We'll prefix these with opp_.

In [32]:
game_df.columns

Index(['team', 'year', 'date', 'team_year', 'opp', 'decade', 'week_num',
       'game_day_of_week', 'game_outcome', 'team_record', 'pts_off', 'margin',
       'pts_def', 'first_down_off', 'yards_off', 'pass_yds_off',
       'rush_yds_off', 'to_off', 'first_down_def', 'yards_def', 'pass_yds_def',
       'rush_yds_def', 'to_def', 'game_time', 'overtime', 'home', 'to2_off',
       'to2_def', 'game_id', 'result_tie', 'result_win', 'wins', 'losses',
       'ties', 'prev_wins', 'prev_losses', 'prev_ties', 'prev_pts_off',
       'roll3_num_ties', 'roll3_num_wins', 'prev_pts_def', 'prev_margin',
       'prev_first_down_off', 'prev_yards_off', 'prev_pass_yds_off',
       'prev_rush_yds_off', 'prev_to_off', 'prev_first_down_def',
       'prev_yards_def', 'prev_pass_yds_def', 'prev_rush_yds_def',
       'prev_to_def', 'prev_result_tie', 'prev_result_win', 'roll3_result_win',
       'roll3_result_tie', 'roll3_pts_off', 'roll3_pts_def', 'roll3_margin',
       'roll3_first_down_off', 'roll3_yards_of

In [33]:
opp_pull_cols = ['game_id', 'team', 'opp', 'prev_wins', 'prev_losses',
       'prev_ties', 'roll3_result_win', 'roll3_num_wins', 'roll3_result_tie', 
       'roll3_num_ties', 'prev_pts_off', 'prev_pts_def',
       'prev_margin', 'prev_first_down_off', 'prev_yards_off',
       'prev_pass_yds_off', 'prev_rush_yds_off', 'prev_to_off',
       'prev_first_down_def', 'prev_yards_def', 'prev_pass_yds_def',
       'prev_rush_yds_def', 'prev_to_def', 'prev_result_tie',
       'prev_result_win', 'roll3_pts_off', 'roll3_pts_def', 'roll3_margin',
       'roll3_first_down_off', 'roll3_yards_off', 'roll3_pass_yds_off',
       'roll3_rush_yds_off', 'roll3_to_off', 'roll3_yards_def',
       'roll3_pass_yds_def', 'roll3_rush_yds_def', 'roll3_to_def', 'ewma_result_win',
       'ewma_result_tie', 'ewma_num_wins', 'ewma_num_ties', 
       'ewma_pts_off', 'ewma_pts_def', 'ewma_margin', 'ewma_first_down_off',
       'ewma_yards_off', 'ewma_pass_yds_off', 'ewma_rush_yds_off',
       'ewma_to_off', 'ewma_yards_def', 'ewma_pass_yds_def',
       'ewma_rush_yds_def', 'ewma_to_def']

In [34]:
# convert all cols to float

game_df[opp_pull_cols] = game_df[opp_pull_cols].fillna(0)
for col in opp_pull_cols[3:]:
    game_df[col] = game_df[col].astype(float)

**Let's solve this using `pd.merge`, using a self-join.**

In [35]:
game_df = (game_df
             .merge(right=game_df[opp_pull_cols],
                    left_on=['game_id', 'team'],
                    right_on=['game_id', 'opp'],
                    suffixes=[None, '_opp']))

Let's make sure it worked by checking a couple of random game_ids.

In [36]:
mask = (game_df.game_id == 'dal-nyg-2019-11-04')

game_df[mask][['game_id', 'team', 'opp', 'prev_rush_yds_off',
                 'prev_rush_yds_off_opp', 'roll3_margin', 'roll3_margin_opp']]

Unnamed: 0,game_id,team,opp,prev_rush_yds_off,prev_rush_yds_off_opp,roll3_margin,roll3_margin_opp
5903,dal-nyg-2019-11-04,dal,nyg,189.0,80.0,5.0,-10.667
14213,dal-nyg-2019-11-04,nyg,dal,80.0,189.0,-10.667,5.0


In [37]:
mask = (game_df.game_id == 'chi-rai-2019-10-06')
game_df[mask][['game_id', 'team', 'opp', 'prev_rush_yds_off',
                 'prev_rush_yds_off_opp', 'roll3_margin', 'roll3_margin_opp']]

Unnamed: 0,game_id,team,opp,prev_rush_yds_off,prev_rush_yds_off_opp,roll3_margin,roll3_margin_opp
3686,chi-rai-2019-10-06,chi,rai,72.0,188.0,9.333,-10.333
15771,chi-rai-2019-10-06,rai,chi,188.0,72.0,-10.333,9.333


In [38]:
mask = (game_df.game_id == 'chi-gnb-1960-12-04')
game_df[mask][['game_id', 'team', 'opp', 'prev_rush_yds_off',
                 'prev_rush_yds_off_opp', 'roll3_margin', 'roll3_margin_opp']]

Unnamed: 0,game_id,team,opp,prev_rush_yds_off,prev_rush_yds_off_opp,roll3_margin,roll3_margin_opp
2946,chi-gnb-1960-12-04,chi,gnb,221.0,118.0,9.0,6.333
7479,chi-gnb-1960-12-04,gnb,chi,118.0,221.0,6.333,9.0


## Now let's take another look to make sure everything looks good.

If it does, we're ready to start modeling!

In [39]:
mask = game_df['team_year'] == 'chi-2019'
game_df[mask]

Unnamed: 0,team,year,date,team_year,opp,decade,week_num,game_day_of_week,game_outcome,team_record,pts_off,margin,pts_def,first_down_off,yards_off,pass_yds_off,rush_yds_off,to_off,first_down_def,yards_def,pass_yds_def,rush_yds_def,to_def,game_time,overtime,home,to2_off,to2_def,game_id,result_tie,result_win,wins,losses,ties,prev_wins,prev_losses,prev_ties,prev_pts_off,roll3_num_ties,roll3_num_wins,prev_pts_def,prev_margin,prev_first_down_off,prev_yards_off,prev_pass_yds_off,prev_rush_yds_off,prev_to_off,prev_first_down_def,prev_yards_def,prev_pass_yds_def,...,ewma_pass_yds_def,ewma_rush_yds_def,ewma_to_def,ewma_to2_def,team_opp,opp_opp,prev_wins_opp,prev_losses_opp,prev_ties_opp,roll3_result_tie_opp,roll3_result_win_opp,prev_pts_off_opp,prev_pts_def_opp,prev_margin_opp,prev_first_down_off_opp,prev_yards_off_opp,prev_pass_yds_off_opp,prev_rush_yds_off_opp,prev_to_off_opp,prev_first_down_def_opp,prev_yards_def_opp,prev_pass_yds_def_opp,prev_rush_yds_def_opp,prev_to_def_opp,prev_result_tie_opp,prev_result_win_opp,roll3_pts_off_opp,roll3_pts_def_opp,roll3_margin_opp,roll3_first_down_off_opp,roll3_yards_off_opp,roll3_pass_yds_off_opp,roll3_rush_yds_off_opp,roll3_to_off_opp,roll3_yards_def_opp,roll3_pass_yds_def_opp,roll3_rush_yds_def_opp,roll3_to_def_opp,ewma_pts_off_opp,ewma_pts_def_opp,ewma_margin_opp,ewma_first_down_off_opp,ewma_yards_off_opp,ewma_pass_yds_off_opp,ewma_rush_yds_off_opp,ewma_to_off_opp,ewma_yards_def_opp,ewma_pass_yds_def_opp,ewma_rush_yds_def_opp,ewma_to_def_opp
3685,chi,2019,2019-09-29,chi-2019,min,201,4,Sun,W,3-1,16.0,10.0,6.0,17.0,269.0,197.0,72.0,,15.0,222.0,182.0,40.0,2.0,4.0,,1,,4.0,chi-min-2019-09-29,0,1,3,1,0,2.0,1.0,0.0,31.0,2.0,2.0,15.0,16.0,21.0,298.0,208.0,90.0,1.0,25.0,356.0,287.0,...,249.939,69.527,0.0,,min,chi,2.0,1.0,0.0,0.0,0.667,34.0,14.0,20.0,23.0,385.0,174.0,211.0,0.0,17.0,302.0,214.0,88.0,1.0,0.0,1.0,26.0,15.667,10.333,18.667,358.333,164.667,193.667,0.0,327.333,225.667,101.667,2.0,26.276,15.736,10.54,18.884,362.997,167.719,195.278,0.0,325.524,223.343,102.181,1.917
3686,chi,2019,2019-10-06,chi-2019,rai,201,5,Sun,L,3-2,21.0,-3.0,24.0,15.0,236.0,194.0,42.0,2.0,25.0,398.0,229.0,169.0,2.0,1.0,,0,4.0,4.0,chi-rai-2019-10-06,0,0,3,2,0,3.0,1.0,0.0,16.0,3.0,3.0,6.0,10.0,17.0,269.0,197.0,72.0,0.0,15.0,222.0,182.0,...,229.646,60.707,2.702,10.086,rai,chi,2.0,2.0,0.0,0.0,0.333,31.0,24.0,7.0,21.0,377.0,189.0,188.0,1.0,22.0,346.0,265.0,81.0,2.0,0.0,1.0,18.333,28.667,-10.333,19.0,328.667,193.667,135.0,1.333,399.333,291.667,107.667,0.0,20.2,25.924,-5.725,19.07,336.851,207.394,129.456,1.293,384.007,277.499,106.508,0.0
3687,chi,2019,2019-10-20,chi-2019,nor,201,7,Sun,L,3-3,25.0,-11.0,36.0,16.0,252.0,235.0,17.0,2.0,24.0,424.0,273.0,151.0,,4.0,,1,4.0,,chi-nor-2019-10-20,0,0,3,3,0,3.0,2.0,0.0,21.0,2.0,2.0,24.0,-3.0,15.0,236.0,194.0,42.0,2.0,25.0,398.0,229.0,...,229.482,88.095,2.492,8.268,nor,chi,5.0,1.0,0.0,0.0,1.0,13.0,6.0,7.0,18.0,326.0,222.0,104.0,0.0,11.0,226.0,151.0,75.0,1.0,0.0,1.0,18.667,13.333,5.333,20.333,349.667,238.667,111.0,0.0,245.0,173.667,71.333,0.0,20.911,18.903,2.008,19.346,342.859,238.584,104.275,1.0,323.756,227.263,96.493,1.432
3688,chi,2019,2019-10-27,chi-2019,sdg,201,8,Sun,L,3-4,16.0,-1.0,17.0,26.0,388.0,226.0,162.0,2.0,11.0,231.0,195.0,36.0,1.0,1.0,,1,4.0,1.0,chi-sdg-2019-10-27,0,0,3,4,0,3.0,3.0,0.0,25.0,1.0,1.0,36.0,-11.0,16.0,252.0,235.0,17.0,2.0,24.0,424.0,273.0,...,239.177,102.109,2.492,8.268,sdg,chi,2.0,5.0,0.0,0.0,0.0,20.0,23.0,-3.0,24.0,365.0,326.0,39.0,1.0,22.0,403.0,306.0,97.0,1.0,0.0,0.0,16.667,22.333,-5.667,22.0,319.667,284.333,35.333,2.333,336.333,199.0,137.333,1.333,19.626,20.413,-0.787,22.565,359.904,294.337,65.567,2.013,332.227,216.46,115.766,1.293
3689,chi,2019,2019-11-03,chi-2019,phi,201,9,Sun,L,3-5,14.0,-8.0,22.0,10.0,164.0,102.0,62.0,1.0,26.0,373.0,227.0,146.0,,1.0,,0,1.0,,chi-phi-2019-11-03,0,0,3,5,0,3.0,4.0,0.0,16.0,0.0,0.0,17.0,-1.0,26.0,388.0,226.0,162.0,2.0,11.0,231.0,195.0,...,230.272,88.783,2.079,6.253,phi,chi,4.0,4.0,0.0,0.0,0.333,31.0,13.0,18.0,21.0,371.0,153.0,218.0,1.0,16.0,253.0,155.0,98.0,1.0,0.0,1.0,20.333,29.333,-9.0,18.667,351.333,204.333,147.0,2.667,367.333,231.0,136.333,1.333,24.715,24.684,0.031,19.631,341.044,208.965,132.078,2.305,341.013,241.146,99.868,1.837
3690,chi,2019,2019-11-10,chi-2019,det,201,10,Sun,W,4-5,20.0,7.0,13.0,13.0,226.0,145.0,81.0,,21.0,357.0,259.0,98.0,1.0,1.0,,1,,1.0,chi-det-2019-11-10,0,1,4,5,0,3.0,5.0,0.0,14.0,0.0,0.0,22.0,-8.0,10.0,164.0,102.0,62.0,1.0,26.0,373.0,227.0,...,229.663,99.423,2.079,6.253,det,chi,3.0,4.0,1.0,0.0,0.333,24.0,31.0,-7.0,26.0,473.0,383.0,90.0,2.0,25.0,450.0,279.0,171.0,0.0,0.0,0.0,28.333,33.0,-4.667,23.0,427.0,350.333,76.667,1.667,441.333,302.333,139.0,0.0,26.004,28.239,-2.236,21.359,396.016,303.962,92.054,1.813,428.136,290.26,137.876,1.998
3691,chi,2019,2019-11-17,chi-2019,ram,201,11,Sun,L,4-6,7.0,-10.0,17.0,17.0,267.0,193.0,74.0,1.0,13.0,283.0,173.0,110.0,2.0,8.0,,0,1.0,4.0,chi-ram-2019-11-17,0,0,4,6,0,4.0,5.0,0.0,20.0,1.0,1.0,13.0,7.0,13.0,226.0,145.0,81.0,0.0,21.0,357.0,259.0,...,234.77,99.176,1.795,4.873,ram,chi,5.0,4.0,0.0,0.0,0.667,12.0,17.0,-5.0,16.0,306.0,218.0,88.0,4.0,15.0,273.0,231.0,42.0,2.0,0.0,0.0,24.333,12.333,12.0,19.333,385.667,293.667,92.0,0.0,299.333,238.0,61.333,0.0,23.93,19.976,3.954,20.4,372.553,279.344,93.209,2.507,329.327,243.18,86.147,1.818
3692,chi,2019,2019-11-24,chi-2019,nyg,201,12,Sun,W,5-6,19.0,5.0,14.0,20.0,335.0,270.0,65.0,2.0,14.0,243.0,134.0,109.0,1.0,1.0,,1,4.0,1.0,chi-nyg-2019-11-24,0,1,5,6,0,4.0,6.0,0.0,7.0,1.0,1.0,17.0,-10.0,17.0,267.0,193.0,74.0,1.0,13.0,283.0,173.0,...,224.592,100.959,1.842,4.673,nyg,chi,2.0,8.0,0.0,0.0,0.0,27.0,34.0,-7.0,15.0,281.0,258.0,23.0,2.0,18.0,294.0,218.0,76.0,0.0,0.0,0.0,23.667,34.0,-10.333,19.667,307.333,239.667,67.667,2.0,366.0,263.667,102.333,0.0,20.876,29.904,-9.028,19.06,306.915,221.808,85.107,2.397,370.294,249.501,120.793,1.998
3693,chi,2019,2019-11-28,chi-2019,det,201,13,Thu,W,6-6,24.0,4.0,20.0,22.0,419.0,331.0,88.0,1.0,19.0,364.0,259.0,105.0,1.0,12.0,,0,1.0,1.0,chi-det-2019-11-28,0,1,6,6,0,5.0,6.0,0.0,19.0,2.0,2.0,14.0,5.0,20.0,335.0,270.0,65.0,2.0,14.0,243.0,134.0,...,210.336,102.225,1.669,3.915,det,chi,3.0,7.0,1.0,0.0,0.0,16.0,19.0,-3.0,21.0,364.0,189.0,175.0,4.0,13.0,230.0,144.0,86.0,2.0,0.0,0.0,18.667,24.667,-6.0,20.333,344.333,213.0,131.333,0.0,321.667,241.0,80.667,0.0,22.975,26.715,-3.74,20.931,374.532,264.678,109.855,2.148,383.42,269.406,114.014,1.796
3694,chi,2019,2019-12-05,chi-2019,dal,201,14,Thu,W,7-6,31.0,7.0,24.0,24.0,382.0,231.0,151.0,2.0,22.0,408.0,326.0,82.0,,8.0,,1,4.0,,chi-dal-2019-12-05,0,1,7,6,0,6.0,6.0,0.0,24.0,2.0,2.0,20.0,4.0,22.0,419.0,331.0,88.0,1.0,19.0,364.0,259.0,...,217.701,102.645,1.542,3.363,dal,chi,6.0,6.0,0.0,0.0,0.333,15.0,26.0,-11.0,32.0,426.0,323.0,103.0,2.0,22.0,356.0,232.0,124.0,0.0,0.0,0.0,19.667,22.0,-2.333,24.667,418.667,323.0,95.667,1.333,316.667,201.333,115.333,0.0,24.321,20.602,3.719,24.645,426.295,307.479,118.816,1.539,319.987,209.679,110.308,2.2


Looks great. Now it's time to start modeling our data! Head over to `model_exploration.ipynb` to continue following along.