In [1]:
import pandas as pd

In [2]:
from matplotlib import pyplot as plt

In [3]:
import itertools  

In [4]:
sample_data = pd.read_csv('sample_data_wk6.csv')
sample_data = sample_data.drop(columns=['rowid', 'matchupid'])
sample_data.to_csv('sample_data_wk6.csv',index=False)

KeyError: "['rowid' 'matchupid'] not found in axis"

In [None]:
real_data = pd.read_csv('real_data_raw_wk6.csv')
real_data['margin'] = real_data['homescore']-real_data['awayscore']
real_data['total'] = real_data['homescore']+real_data['awayscore']

real_data.loc[real_data['homescore'] >= real_data['awayscore'], 'winnerhomeoraway'] = 'home' 
real_data.loc[real_data['homescore'] < real_data['awayscore'], 'winnerhomeoraway'] = 'away' 

real_data.loc[real_data['winnerhomeoraway'] == 'home', 'winnerteam'] = real_data.loc[real_data['winnerhomeoraway'] == 'home'].hometeam 
real_data.loc[real_data['winnerhomeoraway'] == 'away', 'winnerteam'] = real_data.loc[real_data['winnerhomeoraway'] == 'away'].awayteam 

real_data = real_data.drop(columns=['rowid', 'matchupid'])

real_data.to_csv('real_data_wk6.csv',index=False)

In [None]:
sample_data[0:1000:15]

In [None]:
real_data

In [None]:
real_dict={}
pred_dict={}
diff_dict={}
match_dict={}

num_games = len(real_data)

for i,team in enumerate(zip(itertools.count(step=2),real_data.hometeam,real_data.awayteam)):
    home_team = team [1]
    away_team = team [2]
    
    match = home_team+'-'+away_team
    match_dict[i] = (home_team,match)
    match_dict[i+num_games] = (away_team,match)
    
    r_home_marg = int(real_data.margin[real_data.hometeam == home_team])
    r_away_marg = -r_home_marg
    
    p_home_all_marg = list((sample_data[sample_data.hometeam == home_team]).margin)
    p_home_marg = sum(p_home_all_marg)/len(p_home_all_marg)
    p_away_marg = -p_home_marg
    
    d_home_marg = round(r_home_marg - p_home_marg,2)
    d_away_marg = round(r_away_marg - p_away_marg,2)

    real_dict[i] = (home_team,r_home_marg)
    real_dict[i+num_games] = (away_team,r_away_marg)
    
    pred_dict[i] = (home_team,p_home_marg)
    pred_dict[i+num_games] = (away_team,p_away_marg)
    
    diff_dict[i] = (home_team,d_home_marg)
    diff_dict[i+num_games] = (away_team,d_away_marg)

In [None]:
match_dict

In [None]:
real_dict

In [None]:
pred_dict

In [None]:
diff_dict

In [None]:
df_real = pd.DataFrame.from_dict(real_dict, orient='index',columns = ['team','real_margin'])
df_real = df_real.sort_values(by=['team'],ignore_index=True,ascending = False)

df_pred = pd.DataFrame.from_dict(pred_dict, orient='index',columns = ['team','pred_margin'])
df_pred = df_pred.sort_values(by=['team'],ignore_index=True,ascending = False)

df_diff = pd.DataFrame.from_dict(diff_dict, orient='index',columns = ['team','diff_margin'])
df_diff = df_diff.sort_values(by=['team'],ignore_index=True,ascending = False)

df_match = pd.DataFrame.from_dict(match_dict, orient='index',columns = ['team','match'])
df_match = df_match.sort_values(by=['team'],ignore_index=True,ascending = False)

df_marg = df_real
df_marg['pred_margin'] = df_pred['pred_margin']
df_marg['diff_margin'] = df_diff['diff_margin']
df_marg['match'] = df_match['match']

df_marg_srt = df_marg.sort_values(by=['diff_margin'],ignore_index=True,ascending = False)
df_marg_srt_hlf = df_marg_srt[0:int((len(df_marg_srt)/2))]
df_marg_srt_hlf

In [None]:
#This first plot shows the teams ranked from most underestimated to most overestimated after comparing the 
#average simulation point margins to the eventual real game point margins.  

from matplotlib.pyplot import figure
plt.figure(figsize = (20,8))
plt.plot(df_marg_srt.team,df_marg_srt.diff_margin)
plt.plot(df_marg_srt.team,df_marg_srt.real_margin,'x')
plt.plot(df_marg_srt.team,df_marg_srt.pred_margin,'x')
plt.plot(df_marg_srt.team, [0]*len(df_marg_srt),color='black')
plt.legend(['diff', 'real', 'predicted'])
plt.xticks(rotation=90)
plt.xlabel('Team')
plt.ylabel('Real Pnt Margin - Predicted Pnt Margin')
plt.title('Difference in Actual vs Predicted Point Spread (NFL wk6)')
plt.show()

In [None]:
#Preceding team-level view is a bit crowded (displays a flipped mirror image, 
#considering for each team there is an opponent with equal and opposite data).  
#So the bottom plot uses the data from only the underestimated team perspective.  This "match-level" view 
#does a better job emphasizing the overall margin prediction accuracy trend, but you lose the insight 
#into how individual teams performed in simulations and real-life.

import matplotlib.pyplot as plt
import numpy as np

labels = df_marg_srt_hlf.match
realb = df_marg_srt_hlf['real_margin']
predb = df_marg_srt_hlf['pred_margin']
diffb = df_marg_srt_hlf['diff_margin']

x = np.arange(len(labels))  # the label locations
width = 0.35  # the width of the bars

fig = plt.figure(figsize = (12,18))
ax = plt.subplot(111)
                 
#fig, ax = plt.subplots()
rects1 = ax.bar(x - width/2, realb, width,color='g', label='real margin')
rects2 = ax.bar(x + width/2, predb, width,color='b', label='pred margin')
lined = plt.plot(x, diffb,color='r', label='diff margin')
line0 = plt.plot(x, [0]*len(diffb),color='black')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('margin')
ax.set_title('real vs predicted margins (home-away) NFL wk6')
plt.xticks(rotation=90)
plt.xlabel('Match (home-away)')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()


def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height."""
    for rect in rects:
        height = rect.get_height()
          
        ax.annotate('{}'.format(height),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(-30, 0),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom',bbox=dict(boxstyle="round,pad=0.3", fc="w", ec="k", lw=1))

autolabel(rects1)
autolabel(rects2)

#fig.tight_layout()

plt.show()

In [None]:
display(round(df_marg_srt_hlf['diff_margin'].mean(),2))
display(round(df_marg_srt_hlf['diff_margin'].median(),2))