# Get the data

## Statcast data

In [20]:
import pandas as pd
import requests
import io

# pybaseball was breaking for some reason so I pulled this method out of the source code
def small_request(start_dt, end_dt, timeout=None):
    """Download pitch-level Statcast rows from Baseball Savant as a DataFrame.

    Parameters
    ----------
    start_dt, end_dt : str
        Values substituted into the ``game_date_gt`` / ``game_date_lt`` query
        parameters. The notebook passes ``'YYYY-MM-DD'`` strings and uses the
        same day for both to fetch a single day.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. The default ``None`` keeps the original
        behaviour of waiting indefinitely; pass a number of seconds to bound
        the wait when many requests run concurrently.

    Returns
    -------
    pandas.DataFrame
        The CSV payload of the response parsed with ``pd.read_csv``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    url = "https://baseballsavant.mlb.com/statcast_search/csv?all=true&hfPT=&hfAB=&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R%7CPO%7CS%7C=&hfSea=&hfSit=&player_type=pitcher&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt={}&game_date_lt={}&team=&position=&hfRO=&home_road=&hfFlag=&metric_1=&hfInn=&min_pitches=0&min_results=0&group_by=name&sort_col=pitches&player_event_sort=h_launch_speed&sort_order=desc&min_abs=0&type=details&".format(start_dt, end_dt)
    response = requests.get(url, timeout=timeout)
    # Without this check an HTML error page would be handed to read_csv and
    # come back as a meaningless frame instead of a visible failure.
    response.raise_for_status()
    return pd.read_csv(io.StringIO(response.content.decode('utf-8')))

# Smoke-test the downloader on one day: build the list of day strings from the
# start date through today and request only the first entry.
date_range = pd.date_range(start='2024-03-20', end=pd.Timestamp.today()).strftime('%Y-%m-%d')
first_day = date_range[0]
small_request(start_dt=first_day, end_dt=first_day)

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp,bat_speed,swing_length
0,CH,2015-03-20,83.3,1.73,5.76,"Elías, Roenis",471107,606273,walk,blocked_ball,...,1,1,1,,,,0.028,,,
1,FF,2015-03-20,90.6,1.11,6.17,"Elías, Roenis",471107,606273,,ball,...,1,1,1,,,,0.000,,,
2,FF,2015-03-20,93.8,-2.44,5.69,"Kennedy, Ian",474249,453178,field_out,hit_into_play,...,1,1,1,,,,0.041,,,
3,CH,2015-03-20,83.5,1.78,5.90,"Elías, Roenis",471107,606273,,foul_tip,...,1,1,1,,,,0.037,,,
4,SL,2015-03-20,85.1,-2.38,5.96,"Kennedy, Ian",474249,453178,,foul,...,1,1,1,,,,0.000,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2696,,2015-03-20,,,,"Richard, Clayton",573131,453385,field_out,hit_into_play,...,1,3,1,,,,0.017,,,
2697,,2015-03-20,,,,"Nelson, Jimmy",572041,519076,field_out,hit_into_play,...,0,0,0,,,,0.022,,,
2698,,2015-03-20,,,,"Paulino, Felipe",517370,462956,single,hit_into_play,...,0,1,0,,,,-0.037,,,
2699,,2015-03-20,,,,"Gibson, Kyle",543281,502043,,called_strike,...,0,0,0,,,,0.000,,,


In [24]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm

# One request per calendar day from the start date through today, restricted
# to March-October so the remaining months are never requested.
date_range = pd.date_range('2024-03-01', pd.Timestamp.today())
in_season = (date_range.month >= 3) & (date_range.month <= 10)
date_range = date_range[in_season].strftime('%Y-%m-%d')

results = []  # one score table per successfully processed day
errors = []   # exceptions raised while fetching or summarising a day
with ThreadPoolExecutor() as executor:
    # Map each future back to its day so failures can be reported by date.
    futures = {executor.submit(small_request, date, date): date for date in date_range}
    for future in tqdm(as_completed(futures), total=len(futures)):
        game_day = futures[future]
        try:
            pitches = future.result()
            # Keep rows whose game_type is 'R' and collapse them to one row per
            # game. NOTE(review): presumably the post_*_score columns are
            # running totals, so their max is the final score - confirm.
            scores = (
                pitches[pitches.game_type.eq('R')]
                .groupby(['home_team', 'away_team', 'game_pk', 'game_date'])
                [['post_home_score', 'post_away_score']].max()
            )
            results.append(scores)
        except Exception as err:
            # Best effort: remember the failure and carry on with other days.
            errors.append(err)
            print(f'could not get data for date {game_day}: {err!r}')

  0%|          | 0/2270 [00:00<?, ?it/s]

In [36]:
# Stack the per-day score tables row-wise into a single frame (the four group
# keys are still the index at this point) and show its dimensions.
df = pd.concat(results, axis=0)
df.shape

(21079, 2)

In [37]:
# Move the index levels back into ordinary columns and display the result.
df = df.reset_index(drop=False)
df

Unnamed: 0,home_team,away_team,game_pk,game_date,post_home_score,post_away_score
0,AZ,SF,413660,2015-04-06,4,5
1,CIN,PIT,413658,2015-04-06,5,2
2,DET,MIN,413656,2015-04-06,4,0
3,HOU,CLE,413654,2015-04-06,2,0
4,KC,CWS,413653,2015-04-06,10,1
...,...,...,...,...,...,...
21074,CHC,PIT,746881,2024-05-16,4,5
21075,HOU,OAK,746391,2024-05-16,8,1
21076,LAD,CIN,746150,2024-05-16,2,7
21077,MIN,NYY,745907,2024-05-16,0,5


In [39]:
# Parse game_date into datetimes, then count rows per calendar year as a
# quick sanity check on how many games each year contributed.
df['game_date'] = pd.to_datetime(df['game_date'])
season = df['game_date'].dt.year
df.groupby(season).size()

game_date
2015    2429
2016    2428
2017    2430
2018    2431
2019    2429
2020     898
2021    2429
2022    2430
2023    2430
2024     745
dtype: int64

In [40]:
# Write the scores table to disk; pandas infers gzip compression from the
# ".gz" suffix, and the row index is written as the first column.
df.to_csv('scores.csv.gz')

# Power Rankings

In [8]:
import requests as rq

# Fetch the FanGraphs power-rankings post and parse every HTML table on it.
response = rq.get('https://blogs.fangraphs.com/fangraphs-power-rankings-may-13-19/')
# Stop here on a 4xx/5xx answer rather than trying to parse an error page.
response.raise_for_status()
# Hand pandas a file-like object: passing the raw HTML payload directly to
# read_html is deprecated in recent pandas releases.
fangraphs = pd.read_html(io.BytesIO(response.content))

# The rankings table is picked out as the first table with exactly 30 rows.
power_rankings = next((table for table in fangraphs if len(table) == 30), None)
if power_rankings is None:
    raise ValueError('no 30-row table found on the FanGraphs power rankings page')
power_rankings.head()

Unnamed: 0,Rank,Team,Record,Elo,Opponent Elo,Playoff Odds,Power Score,Δ
0,1,Yankees,33-15,1607,1504,97.6%,1613,3
1,2,Phillies,34-14,1598,1471,97.7%,1607,1
2,3,Dodgers,32-17,1599,1493,99.4%,1606,-1
3,4,Braves,26-16,1589,1504,98.7%,1591,-3
4,5,Orioles,29-15,1576,1498,90.8%,1581,0


In [9]:
# List the team names exactly as the rankings table spells them.
power_rankings['Team'].tolist()

['Yankees',
 'Phillies',
 'Dodgers',
 'Braves',
 'Orioles',
 'Guardians',
 'Royals',
 'Brewers',
 'Mariners',
 'Astros',
 'Rays',
 'Cubs',
 'Padres',
 'Twins',
 'Giants',
 'Diamondbacks',
 'Rangers',
 'Mets',
 'Red Sox',
 'Blue Jays',
 'Tigers',
 'Cardinals',
 'Pirates',
 'Angels',
 'Reds',
 'Nationals',
 'Marlins',
 'Athletics',
 'White Sox',
 'Rockies']

In [10]:
# Re-index the rankings by team abbreviation. The codes below match the
# home_team / away_team values visible in the scores table (e.g. AZ, KC, CWS);
# NOTE(review): confirm the remaining codes against the Statcast data.
team_codes = power_rankings.Team.map({
    'Braves': 'ATL',
    'Dodgers': 'LAD',
    'Phillies': 'PHI',
    'Yankees': 'NYY',
    'Orioles': 'BAL',
    'Twins': 'MIN',
    'Brewers': 'MIL',
    'Cubs': 'CHC',
    'Mariners': 'SEA',
    'Royals': 'KC',
    'Guardians': 'CLE',
    'Padres': 'SD',
    'Red Sox': 'BOS',
    'Mets': 'NYM',
    'Rangers': 'TEX',
    'Diamondbacks': 'AZ',
    'Rays': 'TB',
    'Blue Jays': 'TOR',
    'Astros': 'HOU',
    'Tigers': 'DET',
    'Giants': 'SF',
    'Athletics': 'OAK',
    'Nationals': 'WSH',
    'Cardinals': 'STL',
    'Reds': 'CIN',
    'Pirates': 'PIT',
    'Angels': 'LAA',
    'Marlins': 'MIA',
    'White Sox': 'CWS',
    'Rockies': 'COL'
}).rename('team')

# Series.map yields NaN for names missing from the dict; fail loudly instead
# of silently producing NaN index labels.
unmapped = power_rankings.Team[team_codes.isna()].tolist()
if unmapped:
    raise ValueError(f'no abbreviation defined for team(s): {unmapped}')

# Rebind instead of using inplace=True; the Team column itself is kept.
power_rankings = power_rankings.set_index(team_codes)

In [11]:
# Save the rankings table; its index is written as the first CSV column.
power_rankings.to_csv(path_or_buf='power-rankings.csv')