In [34]:
from collections import defaultdict
import nfl_data_py as nfl
import pandas as pd
import plotly.express as px
from sklearn.linear_model import LinearRegression
pd.set_option('display.max_columns', None)

Inspired by Christopher Davis and the Deep Dive Pod [Youtube link](https://www.youtube.com/watch?v=QvNcnS-2EOY&ab_channel=ChristopherDavis%2CUCIrvine)

In [35]:
year = 2020

schedule = nfl.import_schedules([year])
schedule.head()

Unnamed: 0,game_id,season,game_type,week,gameday,weekday,gametime,away_team,away_score,home_team,home_score,location,result,total,overtime,old_game_id,gsis,nfl_detail_id,pfr,pff,espn,away_rest,home_rest,away_moneyline,home_moneyline,spread_line,away_spread_odds,home_spread_odds,total_line,under_odds,over_odds,div_game,roof,surface,temp,wind,away_qb_id,home_qb_id,away_qb_name,home_qb_name,away_coach,home_coach,referee,stadium_id,stadium
5583,2020_01_HOU_KC,2020,REG,1,2020-09-10,Thursday,20:20,HOU,20.0,KC,34.0,Home,14.0,54.0,0.0,2020091000,58168.0,,202009100kan,18532.0,401220225.0,7,7,349.0,-423.0,9.5,-105.0,-105.0,53.5,-109.0,-102.0,0,outdoors,grass,56.0,7.0,00-0033537,00-0033873,Deshaun Watson,Patrick Mahomes,Bill O'Brien,Andy Reid,Clete Blakeman,KAN00,Arrowhead Stadium
5584,2020_01_SEA_ATL,2020,REG,1,2020-09-13,Sunday,13:00,SEA,38.0,ATL,25.0,Home,-13.0,63.0,0.0,2020091300,58169.0,,202009130atl,18533.0,401220313.0,7,7,102.0,-112.0,1.0,-105.0,-105.0,49.5,-108.0,-103.0,0,closed,fieldturf,,,00-0029263,00-0026143,Russell Wilson,Matt Ryan,Pete Carroll,Dan Quinn,Shawn Hochuli,ATL97,Mercedes-Benz Stadium
5585,2020_01_CLE_BAL,2020,REG,1,2020-09-13,Sunday,13:00,CLE,6.0,BAL,38.0,Home,32.0,44.0,0.0,2020091301,58170.0,,202009130rav,18534.0,401220147.0,7,7,260.0,-302.0,7.0,-107.0,-103.0,47.0,100.0,-112.0,1,outdoors,grass,76.0,5.0,00-0034855,00-0034796,Baker Mayfield,Lamar Jackson,Kevin Stefanski,John Harbaugh,Ronald Torbert,BAL00,M&T Bank Stadium
5586,2020_01_NYJ_BUF,2020,REG,1,2020-09-13,Sunday,13:00,NYJ,17.0,BUF,27.0,Home,10.0,44.0,0.0,2020091302,58171.0,,202009130buf,18535.0,401220116.0,7,7,242.0,-279.0,6.5,-110.0,100.0,39.5,-114.0,102.0,1,outdoors,astroturf,67.0,15.0,00-0034869,00-0034857,Sam Darnold,Josh Allen,Adam Gase,Sean McDermott,Shawn Smith,BUF00,New Era Field
5587,2020_01_LV_CAR,2020,REG,1,2020-09-13,Sunday,13:00,LV,34.0,CAR,30.0,Home,-4.0,64.0,0.0,2020091303,58172.0,,202009130car,18536.0,401220370.0,7,7,-124.0,134.0,-3.0,-112.0,-113.0,48.0,-101.0,-110.0,0,outdoors,grass,81.0,5.0,00-0031280,00-0031237,Derek Carr,Teddy Bridgewater,Jon Gruden,Matt Rhule,Brad Allen,CAR00,Bank of America Stadium


In [36]:
cols_of_interest = [
    'game_id',
    'week',
    'away_team', 
    'home_team',
    'total',
    'spread_line'
]
# weeks_of_interest = list(range(9, 18))
# df = schedule[schedule['week'].isin(weeks_of_interest)][cols_of_interest].copy()
df = schedule[cols_of_interest].copy()
df.head()

Unnamed: 0,game_id,week,away_team,home_team,total,spread_line
5583,2020_01_HOU_KC,1,HOU,KC,54.0,9.5
5584,2020_01_SEA_ATL,1,SEA,ATL,63.0,1.0
5585,2020_01_CLE_BAL,1,CLE,BAL,44.0,7.0
5586,2020_01_NYJ_BUF,1,NYJ,BUF,44.0,6.5
5587,2020_01_LV_CAR,1,LV,CAR,64.0,-3.0


In [37]:
exp_home_score = lambda row: (row['total'] + row['spread_line'])/2
exp_away_score = lambda row: (row['total'] - row['spread_line'])/2

mean_score = df['total'].mean()/2
median_score = df['total'].median()/2

print(f'mean: {mean_score}\nmedian: {median_score}\ndif: {mean_score-median_score}')

df['exp_away'] = df.apply(exp_away_score, axis=1)
df['exp_home'] = df.apply(exp_home_score, axis=1)
df['scaled_away'] = df['exp_away'] - mean_score
df['scaled_home'] = df['exp_home'] - mean_score
df.head()


mean: 24.7453531598513
median: 24.5
dif: 0.24535315985130168


Unnamed: 0,game_id,week,away_team,home_team,total,spread_line,exp_away,exp_home,scaled_away,scaled_home
5583,2020_01_HOU_KC,1,HOU,KC,54.0,9.5,22.25,31.75,-2.495353,7.004647
5584,2020_01_SEA_ATL,1,SEA,ATL,63.0,1.0,31.0,32.0,6.254647,7.254647
5585,2020_01_CLE_BAL,1,CLE,BAL,44.0,7.0,18.5,25.5,-6.245353,0.754647
5586,2020_01_NYJ_BUF,1,NYJ,BUF,44.0,6.5,18.75,25.25,-5.995353,0.504647
5587,2020_01_LV_CAR,1,LV,CAR,64.0,-3.0,33.5,30.5,8.754647,5.754647


In [38]:
# create a system of equations where each row represents the coefficients for an offense going against a defense
system = []
for _, _, _, away_team, home_team, _, _, _, _, scaled_away, scaled_home in df.itertuples():
    # Perspective of home team
    # 1 * home_offense - 1 * away_defense + 0.5 * home_field_advantage = home_score
    system.append({
        f'{home_team}_off':  1,
        f'{away_team}_def': -1,
        'hfa': 0.5,
        'score': scaled_home,
    })
    # Perspective of away team (home field hurts you)
    # 1 * away_offense - 1 * home_defense - 0.5 * home_field_advantage = away_score
    system.append({
        f'{away_team}_off':  1,
        f'{home_team}_def': -1,
        'hfa': -0.5,
        'score': scaled_away,
    })

In [39]:
# create a sorted list of teams
teams = list(sorted(df['home_team'].unique()))

# turn the system of equations into a dataframe
system_df = pd.DataFrame(system).fillna(0)

# sort the columns of the system of equations
sorted_system_cols = list(sorted(system_df.columns))
sorted_system_cols.remove('hfa')
sorted_system_cols.remove('score')
sorted_system_cols += ['hfa', 'score']

system_df = system_df[sorted_system_cols]
system_df.head()

Unnamed: 0,ARI_def,ARI_off,ATL_def,ATL_off,BAL_def,BAL_off,BUF_def,BUF_off,CAR_def,CAR_off,CHI_def,CHI_off,CIN_def,CIN_off,CLE_def,CLE_off,DAL_def,DAL_off,DEN_def,DEN_off,DET_def,DET_off,GB_def,GB_off,HOU_def,HOU_off,IND_def,IND_off,JAX_def,JAX_off,KC_def,KC_off,LAC_def,LAC_off,LA_def,LA_off,LV_def,LV_off,MIA_def,MIA_off,MIN_def,MIN_off,NE_def,NE_off,NO_def,NO_off,NYG_def,NYG_off,NYJ_def,NYJ_off,PHI_def,PHI_off,PIT_def,PIT_off,SEA_def,SEA_off,SF_def,SF_off,TB_def,TB_off,TEN_def,TEN_off,WAS_def,WAS_off,hfa,score
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,7.004647
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.5,-2.495353
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,7.254647
3,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.5,6.254647
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.754647


In [40]:
# use linear regression to solve for the coefficients
reg = LinearRegression(fit_intercept=False)
# can use ridge regression to penalize large values if it makes them

reg.fit(system_df.drop('score', axis=1), system_df['score'])

hfa = round(reg.coef_[-1], 2)
print('home field advantage:', hfa)

home field advantage: 1.11


In [41]:
# dict to hold team ratings
rating_dict = defaultdict(dict)
# iterate over offensive and defensive coefficients (ignore hfa and score columns)
for team_str, rating in zip(system_df.columns[:-2], reg.coef_[:-1]):
    team, unit = team_str.split('_')
    rating_dict[team][unit] = rating

In [42]:
power_ratings = pd.DataFrame(rating_dict).T
power_ratings['ovr'] = power_ratings['off'] + power_ratings['def']
power_ratings = power_ratings[['ovr', 'off', 'def']].copy()
power_ratings.sort_values('ovr', ascending=False)

Unnamed: 0,ovr,off,def
KC,6.859334,4.005409,2.853925
BAL,5.742363,0.940356,4.802007
NO,5.577369,2.660782,2.916587
GB,4.989946,4.731433,0.258513
TB,4.909879,4.099325,0.810554
SEA,4.098728,4.218229,-0.119501
PIT,3.344655,0.86038,2.484275
IND,3.205826,1.390751,1.815075
LA,3.039606,-1.762677,4.802284
BUF,2.908961,3.841936,-0.932975


In [43]:
fig = px.scatter(power_ratings.reset_index(level=0), x='off', y='def', color='ovr', text='index', title=f'Market Derived Power Ratings {year}')
fig.update_traces(textposition='top center')
fig.show()

In [44]:
fig.write_image(f"{year}_power_ratings.png")