# Get the data

## Statcast data

In [2]:
import pandas as pd

In [3]:
import pybaseball
from concurrent.futures import ThreadPoolExecutor, as_completed

date_range = pd.date_range('2018-03-29', pd.datetime.today()).strftime('%Y-%m-%d')
results = []
errors = []
with ThreadPoolExecutor() as executor:
    futures = {executor.submit(pybaseball.statcast, date): date for date in date_range}
    for future in as_completed(futures):
        try:
            results.append(future.result())
        except Exception as err:
            errors.append(err)
            print(f'could not get data for data {futures[future]}')

could not get data for data 2018-04-30
could not get data for data 2018-07-02
could not get data for data 2018-07-16
could not get data for data 2018-07-17
could not get data for data 2018-07-18
could not get data for data 2018-08-07
could not get data for data 2018-09-20


In [4]:
df = pd.concat(results)
df.shape

(665052, 90)

In [5]:
df.to_csv(f'statcast-{pd.datetime.today().strftime("%Y-%m-%d")}.csv', index=False)

In [6]:
d = df.groupby([
    'game_date', 'home_team', 'away_team', 
], as_index=False)[['post_away_score', 'post_home_score']].max()

In [7]:
d[(d.home_team == 'PHI') & (d.away_team == 'SF')]

Unnamed: 0,game_date,home_team,away_team,post_away_score,post_home_score
490,2018-05-07,PHI,SF,0.0,11.0
503,2018-05-08,PHI,SF,2.0,4.0
518,2018-05-09,PHI,SF,3.0,11.0
530,2018-05-10,PHI,SF,3.0,6.0


In [8]:
d.shape

(2213, 5)

In [9]:
team_ids = {team: i for i, team in enumerate(d.home_team.unique())}
n_teams = len(team_ids)
team_ids

{'ARI': 0,
 'ATL': 1,
 'BAL': 2,
 'BOS': 21,
 'CHC': 29,
 'CIN': 13,
 'CLE': 27,
 'COL': 28,
 'CWS': 22,
 'DET': 14,
 'HOU': 15,
 'KC': 3,
 'LAA': 16,
 'LAD': 4,
 'MIA': 5,
 'MIL': 17,
 'MIN': 23,
 'NYM': 6,
 'NYY': 19,
 'OAK': 7,
 'PHI': 24,
 'PIT': 18,
 'SD': 8,
 'SEA': 9,
 'SF': 20,
 'STL': 25,
 'TB': 10,
 'TEX': 11,
 'TOR': 12,
 'WSH': 26}

In [10]:
import numpy as np
d['home_team_id'] = d.home_team.map(team_ids)
d['away_team_id'] = d.away_team.map(team_ids)
d['home_team_win'] = np.where(d.post_home_score > d.post_away_score, 1, 0)
d['away_team_win'] = np.where(d.post_home_score < d.post_away_score, 1, 0)

## Standings

In [11]:
import requests as rq
import pandas as pd
response = rq.get('https://projects.fivethirtyeight.com/2018-mlb-predictions/')
standings = pd.read_html(response.content)[0]

In [12]:
standings

Unnamed: 0_level_0,Unnamed: 0_level_0,Avg. Simulated SeasonAvg. Simulation,Postseason Chances,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0
Unnamed: 0_level_1,Team▲▼,Division▲▼,Team rating▲▼,1-Week Change▲▼,RecordProj.rec.▲▼,Run Diff.▲▼,Make PlayoffsMakePlay-offs▲▼,WinDivisionWinDiv.▲▼,Win World SeriesWinW.S.▲▼
0,Astros95-57Astros,AL West,1595,-6.0,102-60,258,>99%,99%,20%
1,Red Sox103-49Red Sox,AL East,1586,-8.0,109-53,223,✓,>99%,24%
2,Dodgers85-68Dodgers,NL West,1569,11.0,91-71,165,96%,93%,17%
3,Yankees93-58Yankees,AL East,1565,4.0,99-63,173,>99%,<1%,7%
4,Athletics91-61Athletics,AL West,1561,-3.0,97-65,122,99%,1%,4%
5,Indians85-66Indians,AL Central,1556,1.0,91-71,166,✓,✓,9%
6,Rays85-66Rays,AL East,1550,8.0,91-71,84,1%,—,<1%
7,Cubs89-63Cubs,NL Central,1548,3.0,95-67,119,>99%,89%,10%
8,Cardinals84-69Cardinals,NL Central,1541,-2.0,89-73,79,75%,<1%,3%
9,Brewers87-66Brewers,NL Central,1532,1.0,92-70,64,99%,10%,3%


In [13]:
team_name_map = {
    'Diamondbacks': 'ARI',
    'Braves': 'ATL',
    'Orioles': 'BAL',
    'Red Sox': 'BOS',
    'Angels': 'LAA',
    'Cubs': 'CHC',
    'White Sox': 'CWS',
    'Reds': 'CIN',
    'Indians': 'CLE',
    'Rockies': 'COL',
    'Tigers': 'DET',
    'Astros': 'HOU',
    'Royals': 'KC',
    'Dodgers': 'LAD',
    'Marlins': 'MIA',
    'Brewers': 'MIL',
    'Twins': 'MIN',
    'Yankees': 'NYY',
    'Athletics': 'OAK',
    'Phillies': 'PHI',
    'Pirates': 'PIT',
    'Padres': 'SD',
    'Mariners': 'SEA',
    'Giants': 'SF',
    'Cardinals': 'STL',
    'Rays': 'TB',
    'Rangers': 'TEX',
    'Blue Jays': 'TOR',
    'Nationals': 'WSH',
    'Mets': 'NYM',
}

In [14]:
def sanitize_name(n):
    out = ''
    for a in n:
        if a.isdigit():
            break
        out += a
    return out
rankings = standings.iloc[:,0].apply(sanitize_name).map(team_name_map)

In [15]:
import numpy as np
rankings = pd.DataFrame({
    'team': rankings,
    'team_id': rankings.map(team_ids),
    'score': np.arange(len(rankings))[::-1]
})

In [16]:
rankings

Unnamed: 0,team,team_id,score
0,HOU,15,29
1,BOS,21,28
2,LAD,4,27
3,NYY,19,26
4,OAK,7,25
5,CLE,27,24
6,TB,10,23
7,CHC,29,22
8,STL,25,21
9,MIL,17,20


In [17]:
assert not rankings.team.isnull().any()
assert not rankings.team_id.isnull().any()

In [18]:
# center at 10 - negative values restrict how we can model the problem
rankings['score'] = 10 + (rankings['score'] - np.mean(rankings['score'])) / np.std(rankings['score'])

In [19]:
rankings

Unnamed: 0,team,team_id,score
0,HOU,15,11.675247
1,BOS,21,11.559712
2,LAD,4,11.444178
3,NYY,19,11.328644
4,OAK,7,11.21311
5,CLE,27,11.097575
6,TB,10,10.982041
7,CHC,29,10.866507
8,STL,25,10.750973
9,MIL,17,10.635438


In [20]:
rankings.to_csv(f'rankings-{pd.datetime.today().strftime("%Y-%m-%d")}.csv', index=False)

## Aggregate

In [23]:
dd = d.groupby(['home_team', 'home_team_id', 'away_team', 'away_team_id'], as_index=False)[['home_team_win', 'away_team_win']].sum()
dd['total_games'] = dd.home_team_win + dd.away_team_win
dd.head()

Unnamed: 0,home_team,home_team_id,away_team,away_team_id,home_team_win,away_team_win,total_games
0,ARI,0,ATL,1,1,3,4
1,ARI,0,CHC,29,1,2,3
2,ARI,0,CIN,13,2,1,3
3,ARI,0,COL,28,3,3,6
4,ARI,0,HOU,15,1,1,2


In [24]:
dd.to_csv(f'wins-{pd.datetime.today().strftime("%Y-%m-%d")}.csv', index=False)