# Get the data

## Statcast data

In [1]:
import pandas as pd
import requests
import io

# pybaseball was breaking for some reason so I pulled this method out of the source code
def small_request(start_dt, end_dt, timeout=120):
    """Download pitch-level Statcast search results from Baseball Savant.

    Parameters
    ----------
    start_dt, end_dt : str
        Values substituted into the ``game_date_gt`` and ``game_date_lt``
        query parameters. The notebook passes ``'YYYY-MM-DD'`` strings and
        requests a single day by passing the same date for both.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. A finite default keeps a stalled
        connection from blocking a worker thread forever; pass ``None`` to
        wait indefinitely.

    Returns
    -------
    pandas.DataFrame
        The CSV payload parsed by ``pandas.read_csv``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within ``timeout``.
    """
    url = "https://baseballsavant.mlb.com/statcast_search/csv?all=true&hfPT=&hfAB=&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R%7CPO%7CS%7C=&hfSea=&hfSit=&player_type=pitcher&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt={}&game_date_lt={}&team=&position=&hfRO=&home_road=&hfFlag=&metric_1=&hfInn=&min_pitches=0&min_results=0&group_by=name&sort_col=pitches&player_event_sort=h_launch_speed&sort_order=desc&min_abs=0&type=details&".format(start_dt, end_dt)
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of handing an error page to read_csv.
    response.raise_for_status()
    return pd.read_csv(io.StringIO(response.content.decode('utf-8')))

# One 'YYYY-MM-DD' label per day from 2024-03-29 through today.
season_start = '2024-03-29'
date_range = pd.date_range(start=season_start, end=pd.Timestamp.today()).strftime('%Y-%m-%d')
# date_range = pd.date_range('2023-03-20', '2023-10-03')
# Try the downloader on the first day only; the resulting frame is the cell output.
first_day = date_range[0]
small_request(start_dt=first_day, end_dt=first_day)

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp,bat_speed,swing_length
0,FF,2024-03-29,94.7,-1.39,5.82,"Kirby, George",678882,669923,single,hit_into_play,...,1,0,1,Standard,Standard,218,-0.025,0.164,,
1,FF,2024-03-29,94.1,-1.57,5.77,"Kirby, George",678882,669923,,swinging_strike,...,1,0,1,Standard,Standard,221,0.000,-0.021,,
2,SI,2024-03-29,93.5,-1.58,5.72,"Kirby, George",678882,669923,,called_strike,...,1,0,1,Infield shade,Standard,218,0.000,-0.014,,
3,SI,2024-03-29,93.2,-1.56,5.73,"Kirby, George",671213,669923,field_out,hit_into_play,...,1,0,1,Infield shade,Standard,219,0.031,-0.144,,
4,KC,2024-03-29,80.4,-1.48,5.69,"Kirby, George",671213,669923,,ball,...,1,0,1,Infield shade,Standard,37,0.000,0.024,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925,SI,2024-03-29,90.4,1.11,5.64,"Pérez, Martín",650333,527048,,ball,...,0,0,0,Standard,Standard,128,0.000,0.033,,
2926,SI,2024-03-29,92.7,-1.52,5.63,"Parsons, Wes",642350,641149,,swinging_strike,...,5,5,1,Standard,Standard,219,0.000,-0.033,,
2927,FF,2024-03-29,92.5,2.05,4.89,"Harrison, Kyle",593428,690986,,ball,...,0,0,3,Standard,Standard,122,0.000,0.033,,
2928,FF,2024-03-29,93.4,-1.43,6.95,"Pivetta, Nick",641487,601713,,called_strike,...,0,0,0,Infield shade,Standard,189,0.000,-0.033,,


In [2]:
# Count rows per home/away matchup for 2024-03-29. The frame is fetched
# explicitly rather than read from IPython's `_` (last-output variable), so the
# cell gives the same answer regardless of which cell ran immediately before it.
opening_day = small_request(start_dt='2024-03-29', end_dt='2024-03-29')
opening_day.groupby(['home_team', 'away_team']).size()

home_team  away_team
AZ         COL          269
HOU        NYY          335
LAD        STL          265
MIA        PIT          338
NYM        MIL          270
OAK        CLE          299
PHI        ATL          319
SD         SF           279
SEA        BOS          266
TB         TOR          290
dtype: int64

In [3]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm

# One request per calendar day, March through October, from 2015 up to today.
date_range = pd.date_range('2015-03-01', pd.Timestamp.today())
# date_range = pd.date_range('2024-03-01', pd.Timestamp.today())
in_season = (date_range.month >= 3) & (date_range.month <= 10)
date_range = date_range[in_season].strftime('%Y-%m-%d')
results = []  # one per-game score table for each day that downloaded cleanly
errors = []   # exceptions raised while fetching or aggregating a day
with ThreadPoolExecutor() as executor:
    # Map each future back to the day it requested so failures can be reported.
    futures = {executor.submit(small_request, date, date): date for date in date_range}
    for future in tqdm(as_completed(futures), total=len(futures)):
        day = futures[future]
        try:
            pitches = future.result()
            # Keep rows with game_type 'R' and reduce them to one row per game:
            # the per-game maximum of the running post-pitch score columns.
            scores = (
                pitches[pitches.game_type.eq('R')]
                .groupby(['home_team', 'away_team', 'game_pk', 'game_date'])
                [['post_home_score', 'post_away_score']].max()
            )
        except Exception as err:
            # Best effort: record the failure and keep going with the other days.
            errors.append(err)
            print(f'could not get data for date {day}: {err!r}')
        else:
            results.append(scores)

  0%|          | 0/96 [00:00<?, ?it/s]

In [4]:
# Stack the per-day score tables into a single frame, then show (rows, columns).
df = pd.concat(objs=results)
df.shape

(899, 2)

In [5]:
# Rebuild from `results` before flattening the groupby index into columns.
# `df = df.reset_index()` is not idempotent: running it a second time on the
# already-flat frame would add a spurious `index` column.
df = pd.concat(results).reset_index()
df

Unnamed: 0,home_team,away_team,game_pk,game_date,post_home_score,post_away_score
0,AZ,COL,747223,2024-03-29,7,3
1,HOU,NYY,746412,2024-03-29,1,7
2,LAD,STL,746167,2024-03-29,6,3
3,MIA,PIT,746091,2024-03-29,2,7
4,NYM,MIL,745848,2024-03-29,1,3
...,...,...,...,...,...,...
894,NYM,AZ,745813,2024-06-02,4,5
895,PHI,STL,745572,2024-06-02,4,5
896,SEA,LAA,745252,2024-06-02,5,1
897,SF,NYY,745330,2024-06-02,5,7


In [6]:
# Parse the date column so the `.dt` accessor works, then count rows per calendar year.
df['game_date'] = pd.to_datetime(df['game_date'])
df.groupby(df['game_date'].dt.year).size()

game_date
2024    899
dtype: int64

In [7]:
# Completed seasons never change, so they are split off from the current year:
# refreshing this year's games then does not require re-downloading history.
past_seasons = df.loc[df['game_date'] < '2024-01-01']
# Off by default; flip to True to rewrite the historical file.
WRITE_HISTORICAL = False
if WRITE_HISTORICAL:
    past_seasons.to_csv('scores-historical.csv.gz', index=False)

In [8]:
# `ge` rather than `gt`, so this is the exact complement of the
# `lt('2024-01-01')` filter used for past seasons; with `gt`, rows dated exactly
# 2024-01-01 would fall into neither file.
this_season = df[df.game_date.ge('2024-01-01')]
this_season.to_csv('scores-2024.csv.gz', index=False)

# Power Rankings

In [11]:
import requests as rq

# Download the FanGraphs power-rankings article and parse every HTML table in it.
response = rq.get(
    'https://blogs.fangraphs.com/fangraphs-power-rankings-may-13-19/',
    timeout=60,
)
# Stop here on an HTTP error rather than trying to parse an error page.
response.raise_for_status()
# read_html expects a URL, path or file-like object; passing raw HTML directly
# is deprecated, so the downloaded bytes are wrapped in BytesIO.
fangraphs = pd.read_html(io.BytesIO(response.content))
# The rankings table is picked out by its length: one row per team, 30 in all.
power_rankings = next((table for table in fangraphs if len(table) == 30), None)
if power_rankings is None:
    raise ValueError('no 30-row table found on the FanGraphs power rankings page')
power_rankings.head()

Unnamed: 0,Rank,Team,Record,Elo,Opponent Elo,Playoff Odds,Power Score,Δ
0,1,Yankees,33-15,1607,1504,97.6%,1613,3
1,2,Phillies,34-14,1598,1471,97.7%,1607,1
2,3,Dodgers,32-17,1599,1493,99.4%,1606,-1
3,4,Braves,26-16,1589,1504,98.7%,1591,-3
4,5,Orioles,29-15,1576,1498,90.8%,1581,0


In [12]:
# List the team names exactly as FanGraphs spells them.
power_rankings['Team'].tolist()

['Yankees',
 'Phillies',
 'Dodgers',
 'Braves',
 'Orioles',
 'Guardians',
 'Royals',
 'Brewers',
 'Mariners',
 'Astros',
 'Rays',
 'Cubs',
 'Padres',
 'Twins',
 'Giants',
 'Diamondbacks',
 'Rangers',
 'Mets',
 'Red Sox',
 'Blue Jays',
 'Tigers',
 'Cardinals',
 'Pirates',
 'Angels',
 'Reds',
 'Nationals',
 'Marlins',
 'Athletics',
 'White Sox',
 'Rockies']

In [13]:
# Translate FanGraphs team nicknames into short team codes (the same style of
# code seen in the Statcast output, e.g. AZ, NYY) and use them as the index.
team_codes = power_rankings.Team.map({
    'Braves': 'ATL',
    'Dodgers': 'LAD',
    'Phillies': 'PHI',
    'Yankees': 'NYY',
    'Orioles': 'BAL',
    'Twins': 'MIN',
    'Brewers': 'MIL',
    'Cubs': 'CHC',
    'Mariners': 'SEA',
    'Royals': 'KC',
    'Guardians': 'CLE',
    'Padres': 'SD',
    'Red Sox': 'BOS',
    'Mets': 'NYM',
    'Rangers': 'TEX',
    'Diamondbacks': 'AZ',
    'Rays': 'TB',
    'Blue Jays': 'TOR',
    'Astros': 'HOU',
    'Tigers': 'DET',
    'Giants': 'SF',
    'Athletics': 'OAK',
    'Nationals': 'WSH',
    'Cardinals': 'STL',
    'Reds': 'CIN',
    'Pirates': 'PIT',
    'Angels': 'LAA',
    'Marlins': 'MIA',
    'White Sox': 'CWS',
    'Rockies': 'COL'
}).rename('team')
# `map` yields NaN for names missing from the dict; refuse to build an index
# with holes in it rather than silently writing blank team codes.
unmapped = power_rankings.Team[team_codes.isna()]
if not unmapped.empty:
    raise ValueError(f'no team abbreviation for: {unmapped.tolist()}')
# Rebind instead of `inplace=True`; the `Team` column is kept alongside the index.
power_rankings = power_rankings.set_index(team_codes)

In [14]:
# Persist the rankings; the index is written out as the first CSV column.
power_rankings.to_csv('power-rankings.csv', index=True)

# Season Projections

In [45]:
# Parse every table on the FanGraphs standings page and keep the one at
# position 6, which is the table displayed below.
standings_tables = pd.read_html('https://www.fangraphs.com/depthcharts.aspx?position=Standings')
projections = standings_tables[6]
projections

Unnamed: 0_level_0,Unnamed: 0_level_0,2024 Year to Date,2024 Year to Date,2024 Year to Date,2024 Year to Date,2024 Year to Date,2024 Year to Date,2024 Year to Date,2024 Projected Rest of Season,2024 Projected Rest of Season,2024 Projected Rest of Season,2024 Projected Rest of Season,2024 Projected Rest of Season,2024 Projected Rest of Season,2024 Projected Rest of Season,2024 Projected Full Season,2024 Projected Full Season,2024 Projected Full Season,2024 Projected Full Season,2024 Projected Full Season,2024 Projected Full Season
Unnamed: 0_level_1,Team,G,W,L,W%,RDif,RS/G,RA/G,G,W,...,W%,RDif,RS/G,RA/G,W,L,W%,RDif,RS/G,RA/G
0,Yankees,55,37,18,0.673,92,4.84,3.16,107,59,...,0.553,54,4.73,4.22,96,66,0.593,146,4.77,3.86
1,Phillies,55,38,17,0.691,89,5.33,3.71,107,57,...,0.535,35,4.53,4.2,95,67,0.588,124,4.8,4.03
2,Dodgers,55,33,22,0.6,69,4.91,3.65,107,61,...,0.571,73,4.85,4.16,94,68,0.581,142,4.87,3.99
3,Orioles,52,34,18,0.654,69,5.06,3.73,110,59,...,0.535,37,4.53,4.2,93,69,0.573,106,4.7,4.05
4,Braves,51,30,21,0.588,43,4.63,3.78,111,61,...,0.552,54,4.67,4.18,91,71,0.563,97,4.66,4.06
5,Guardians,54,36,18,0.667,71,4.98,3.67,108,53,...,0.487,-13,4.34,4.46,89,73,0.547,58,4.55,4.2
6,Twins,53,29,24,0.547,5,4.45,4.36,109,58,...,0.529,29,4.36,4.09,87,75,0.535,34,4.39,4.18
7,Royals,55,34,21,0.618,78,4.87,3.45,107,53,...,0.491,-9,4.44,4.53,87,75,0.534,69,4.59,4.17
8,Brewers,53,31,22,0.585,49,4.96,4.04,109,55,...,0.5,0,4.41,4.41,86,76,0.528,49,4.59,4.29
9,Mariners,55,29,26,0.527,-6,3.73,3.84,107,56,...,0.521,20,4.21,4.03,85,77,0.523,14,4.05,3.96


In [47]:
# Save the projections table without the positional row index.
projections.to_csv(path_or_buf='fangraphs-projections.csv', index=False)