# Are FiveThirtyEight's ELO Game Predictions Uniformly Accurate?

##### Taking FiveThirtyEight's ELO baseball game records from 1871-present and analyzing their accuracy based on team, regular season vs playoffs and "eras" of baseball.


In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

Oddly, the format of the `date` feature changes at the century mark of 1900 (come on guys, that's the _last_ year of the century, not the first). I will divide the data by date format, convert to datetimes, then concatenate the dataframes. 

In [50]:
df = pd.read_csv('mlb_elo.csv')
df_1900s = df.loc[:200673]
df_1800s = df.loc[200674:]
df_1800s.head(5)

# df['date'] = pd.to_datetime(df['date'], format='%m/%d/%Y')
# df.head(15)

Unnamed: 0,date,season,neutral,playoff,team1,team2,elo1_pre,elo2_pre,elo_prob1,elo_prob2,...,pitcher1_rgs,pitcher2_rgs,pitcher1_adj,pitcher2_adj,rating_prob1,rating_prob2,rating1_post,rating2_post,score1,score2
200674,1899-10-15,1899,0,,CIN,CL3,1525.598,1334.931,0.774819,0.225181,...,,,,,0.774724,0.225276,1527.44,1333.141,19.0,3.0
200675,1899-10-15,1899,0,,CHC,LS2,1511.134,1528.226,0.50994,0.49006,...,,,,,0.509568,0.490432,1508.52,1530.713,5.0,9.0
200676,1899-10-15,1899,0,,CIN,CL3,1523.768,1336.761,0.771121,0.228879,...,,,,,0.771022,0.228978,1525.577,1335.004,16.0,1.0
200677,1899-10-15,1899,0,,CHC,STL,1508.055,1505.439,0.538229,0.461771,...,,,,,0.540744,0.459256,1510.941,1502.502,7.0,0.0
200678,1899-10-14,1899,0,,LAD,BL2,1557.368,1555.611,0.537,0.463,...,,,,,0.537223,0.462777,1559.938,1553.052,8.0,3.0


Now that I have two dataframes with different date formats, I'll apply the `.to_datetime()` method.

In [51]:
df_1800s['date'] = pd.to_datetime(df_1800s['date'], format='%Y-%m-%d')
df_1900s['date'] = pd.to_datetime(df_1900s['date'], format='%m/%d/%Y')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Concatenate dataframes, then get rid of games that have yet to happen. Finally, `df.reset_index()` to get index back to 0.

In [64]:
frames = [df_1900s, df_1800s]
df = pd.concat(frames)
df = df.loc[df['date'] < dt.datetime(2019, 1, 1)]
df.reset_index(inplace=True)
df.head(5)

Unnamed: 0,index,date,season,neutral,playoff,team1,team2,elo1_pre,elo2_pre,elo_prob1,...,pitcher1_rgs,pitcher2_rgs,pitcher1_adj,pitcher2_adj,rating_prob1,rating_prob2,rating1_post,rating2_post,score1,score2
0,2430,2018-10-28,2018,0,w,LAD,BOS,1572.026537,1603.192577,0.486253,...,55.720196,55.904108,8.777848,11.191392,0.483877,0.516123,1572.395835,1610.086323,1.0,5.0
1,2431,2018-10-27,2018,0,w,LAD,BOS,1575.479964,1599.73915,0.499503,...,54.474209,53.638152,3.666228,0.386608,0.508342,0.491658,1576.245147,1606.237011,6.0,9.0
2,2432,2018-10-26,2018,0,w,LAD,BOS,1573.220427,1601.998687,0.490832,...,57.86924,51.146043,20.863039,-11.123666,0.555907,0.444093,1579.775197,1602.706961,3.0,2.0
3,2433,2018-10-24,2018,0,w,BOS,LAD,1600.026162,1575.192952,0.59262,...,55.360095,51.91669,9.286617,-7.249446,0.619808,0.380192,1604.605197,1577.876961,4.0,2.0
4,2434,2018-10-23,2018,0,w,BOS,LAD,1597.035414,1578.1837,0.581491,...,64.089475,56.62859,50.165559,14.184438,0.648954,0.351046,1602.81127,1579.670888,8.0,4.0


In [68]:
def determine_winner(row):
    if row['score1'] > row['score2']:
        return row['team1']
    else:
        return row['team2']

df.loc[0:5].apply(lambda x: print(x[1]))

2431
2018-10-27 00:00:00
2018
0
w
LAD
BOS
1575.479964
1599.73915
0.49950267
0.50049733
1572.026537
1603.192577
1579.7751970000002
1602.7069609999999
Rich Hill
Eduardo Rodriguez
54.47420893
53.63815195
3.6662283739999997
0.38660769
0.508341984
0.49165801600000003
1576.245147
1606.237011
6.0
9.0


index           None
date            None
season          None
neutral         None
playoff         None
team1           None
team2           None
elo1_pre        None
elo2_pre        None
elo_prob1       None
elo_prob2       None
elo1_post       None
elo2_post       None
rating1_pre     None
rating2_pre     None
pitcher1        None
pitcher2        None
pitcher1_rgs    None
pitcher2_rgs    None
pitcher1_adj    None
pitcher2_adj    None
rating_prob1    None
rating_prob2    None
rating1_post    None
rating2_post    None
score1          None
score2          None
dtype: object

In [62]:
for each in df.iterrows():
    stats = each[1]
    winners.append(think(stats['team1'], stats['team2'], stats['score1'], stats['score2']))
winners[0:5]

['BOS', 'BOS', 'LAD', 'BOS', 'BOS']

In [63]:
df['winner'] = winners
df.head(10)

Unnamed: 0,level_0,index,date,season,neutral,playoff,team1,team2,elo1_pre,elo2_pre,...,pitcher2_rgs,pitcher1_adj,pitcher2_adj,rating_prob1,rating_prob2,rating1_post,rating2_post,score1,score2,winner
0,0,2430,2018-10-28,2018,0,w,LAD,BOS,1572.026537,1603.192577,...,55.904108,8.777848,11.191392,0.483877,0.516123,1572.395835,1610.086323,1.0,5.0,BOS
1,1,2431,2018-10-27,2018,0,w,LAD,BOS,1575.479964,1599.73915,...,53.638152,3.666228,0.386608,0.508342,0.491658,1576.245147,1606.237011,6.0,9.0,BOS
2,2,2432,2018-10-26,2018,0,w,LAD,BOS,1573.220427,1601.998687,...,51.146043,20.863039,-11.123666,0.555907,0.444093,1579.775197,1602.706961,3.0,2.0,LAD
3,3,2433,2018-10-24,2018,0,w,BOS,LAD,1600.026162,1575.192952,...,51.91669,9.286617,-7.249446,0.619808,0.380192,1604.605197,1577.876961,4.0,2.0,BOS
4,4,2434,2018-10-23,2018,0,w,BOS,LAD,1597.035414,1578.1837,...,56.62859,50.165559,14.184438,0.648954,0.351046,1602.81127,1579.670888,8.0,4.0,BOS
5,5,2435,2018-10-20,2018,0,l,MIL,LAD,1561.262901,1574.030002,...,57.921682,14.107999,20.416846,0.5095,0.4905,1561.204647,1582.013536,1.0,5.0,LAD
6,6,2436,2018-10-19,2018,0,l,MIL,LAD,1556.847363,1578.44554,...,53.007724,-3.468566,-3.741747,0.505211,0.494789,1565.239306,1577.978877,7.0,2.0,MIL
7,7,2437,2018-10-18,2018,0,l,HOU,BOS,1592.540969,1593.329654,...,54.123687,28.830198,4.53392,0.603029,0.396971,1598.225096,1600.468622,1.0,4.0,BOS
8,8,2438,2018-10-17,2018,0,l,HOU,BOS,1595.655169,1590.215453,...,52.064617,-6.96972,-6.067848,0.568016,0.431984,1602.435705,1596.258014,6.0,8.0,BOS
9,9,2439,2018-10-17,2018,0,l,LAD,MIL,1575.651446,1559.641457,...,51.293974,10.386649,-2.277124,0.600428,0.399572,1582.387663,1560.83052,5.0,2.0,LAD
