Import Packages, Initial File

In [None]:
# Import relevant packages
import pandas as pd 
import numpy as np
import plotly.express as px
import sklearn

In [None]:
# Colab-only API: prompt for a file upload. The returned mapping is indexed
# by file name ('baseball.csv') in the next cell.
from google.colab import files
uploaded = files.upload()

Saving baseball.csv to baseball.csv


In [None]:
import io

# Wrap the uploaded bytes in an in-memory buffer so pandas can parse them as CSV.
baseball_buffer = io.BytesIO(uploaded['baseball.csv'])
df = pd.read_csv(baseball_buffer)

Take a look at the data: team records, win percentages, playoff wins and World Series wins from 2010-2021

In [None]:
# Preview the first rows to check that the CSV parsed as expected.
df.head()

Unnamed: 0,Rank,Team,Record,Win Percentage,Made Playoffs,Playoff Series Wins,Titles
0,1.0,Los Angeles Dodgers,1179-825,58.80%,10.0,12.0,1.0
1,2.0,New York Yankees,1145-859,57.10%,10.0,8.0,0.0
2,3.0,St. Louis Cardinals,1112-890,55.50%,9.0,9.0,1.0
3,4.0,Tampa Bay Rays,1086-919,54.20%,7.0,5.0,0.0
4,5.0,Atlanta Braves,1067-935,53.30%,8.0,5.0,1.0


The table has 33 rows but MLB has only 30 teams, so we drop the three extra rows

In [None]:
# (rows, columns) of the loaded table.
df.shape

(33, 7)

In [None]:
# Row labels of the three extra rows. read_csv gave the frame its default
# 0..32 integer index, so these labels are the same rows that positions
# 7, 15 and 32 select.
extra_row_labels = [7, 15, 32]

# Drop by label with errors='ignore' so the cell can be re-run safely: a
# positional lookup such as df.index[[7, 15, 32]] raises IndexError once the
# frame has already been trimmed to 30 rows.
df = df.drop(index=extra_row_labels, errors='ignore')
df

In [None]:
# Column dtypes; Record and Win Percentage are loaded as strings (object).
df.dtypes

Rank                   float64
Team                    object
Record                  object
Win Percentage          object
Made Playoffs          float64
Playoff Series Wins    float64
Titles                 float64
dtype: object

Record is stored as a string such as `1179-825`, so we split it into integer Wins and Losses columns and recompute the winning percentage as a number

In [None]:
# Record is a "wins-losses" string (e.g. "1179-825"); split it once on the
# dash into two new columns. `n` must be passed by keyword: pandas 2.0 made
# every argument of Series.str.split after `pat` keyword-only, so the old
# positional form raises TypeError there.
df[['Wins', 'Losses']] = df['Record'].str.split('-', n=1, expand=True)
df = df.astype({"Wins": int, "Losses": int})

# Numeric win rate computed from the integer counts (the CSV's own
# 'Win Percentage' column is a string like "58.80%").
df['Winning Percentage'] = df['Wins'] / (df['Wins'] + df['Losses'])
df.head()

Unnamed: 0,Rank,Team,Record,Win Percentage,Made Playoffs,Playoff Series Wins,Titles,Wins,Losses,Winning Percentage
0,1.0,Los Angeles Dodgers,1179-825,58.80%,10.0,12.0,1.0,1179,825,0.588323
1,2.0,New York Yankees,1145-859,57.10%,10.0,8.0,0.0,1145,859,0.571357
2,3.0,St. Louis Cardinals,1112-890,55.50%,9.0,9.0,1.0,1112,890,0.555445
3,4.0,Tampa Bay Rays,1086-919,54.20%,7.0,5.0,0.0,1086,919,0.541646
4,5.0,Atlanta Braves,1067-935,53.30%,8.0,5.0,1.0,1067,935,0.532967


Upload Location Data, 2022 Standings Data, 2022 Payroll Data

In [None]:
# Prompt for stadiums.csv, then parse the uploaded bytes into a DataFrame.
uploaded = files.upload()
stadiums_buffer = io.BytesIO(uploaded['stadiums.csv'])
df_stadiums = pd.read_csv(stadiums_buffer)

Saving stadiums.csv to stadiums.csv


In [None]:
# Prompt for 2022standings.csv, then parse the uploaded bytes into a DataFrame.
uploaded = files.upload()
standings_buffer = io.BytesIO(uploaded['2022standings.csv'])
df_record = pd.read_csv(standings_buffer)

Saving 2022standings.csv to 2022standings.csv


In [None]:
# Prompt for payroll.csv, then parse the uploaded bytes into a DataFrame.
uploaded = files.upload()
payroll_buffer = io.BytesIO(uploaded['payroll.csv'])
df_payroll = pd.read_csv(payroll_buffer)

Saving payroll.csv to payroll.csv


Merge each of these datasets together into one cohesive dataframe

In [None]:
# Attach stadium information to each team. The inner join keeps only rows
# whose name appears in both tables ('Team' on the left, 'team' on the right).
df_merge = pd.merge(df, df_stadiums, left_on='Team', right_on='team', how='inner')

# Drop the raw string columns replaced by Wins/Losses/Winning Percentage, the
# duplicate join key, and columns that no later cell references.
# NOTE(review): 'Unnamed: 0' presumably comes from stadiums.csv; confirm.
df_merge = df_merge.drop(
    columns=['Record', 'Win Percentage', 'Unnamed: 0', 'team', 'address']
)

In [None]:
# No key is given for the payroll join, so pandas joins on the column names
# the two frames have in common.
df_merge = df_merge.merge(df_payroll, how='inner')

# The 2022 standings identify teams in their 'Tm' column.
df_merge = df_merge.merge(df_record, how='inner', left_on='Team', right_on='Tm')

In [None]:
# Inspect the 'Total' column, which the plots below use as the payroll figure.
df_merge['Total']

Produce a scatter map of teams by stadium location, showing their winning percentage, payroll, and recent playoff success

In [None]:
# Plotly Express `labels` maps a DataFrame column name to the text shown in
# hovers and legends, so the keys must be the column names themselves.
display_labels = {
    'Made Playoffs': 'Times Making Playoffs',
    'Playoff Series Wins': 'Playoff Series Won',
    'Titles': 'World Series Won',
    'lat': 'Stadium Latitude',
    'lng': 'Stadium Longitude',
    'Total': 'Payroll',
}

# One marker per team at its stadium: colour encodes winning percentage and
# marker size encodes payroll ('Total').
fig = px.scatter_mapbox(
    df_merge,
    lat='lat',
    lon='lng',
    hover_name='Team',
    zoom=3,
    color='Winning Percentage',
    opacity=0.5,
    size='Total',
    hover_data=['Made Playoffs', 'Playoff Series Wins', 'Titles', 'lat', 'lng'],
    labels=display_labels,
)
fig.update_layout(mapbox_style='carto-positron')
fig.show()

In [None]:
# Display the merged team table.
df_merge

Produce a scatter plot exploring the relationship between payroll, wins, and recent playoff success

In [None]:
# Payroll (x) against 2022 wins (y) with an OLS trend line. Markers are
# coloured by 'Rk' and sized by playoff series wins.
payroll_wins_labels = {"W": "Wins in 2022", 'Total': 'Payroll in 2022'}
px.scatter(
    df_merge,
    x='Total',
    y='W',
    color='Rk',
    size='Playoff Series Wins',
    hover_name='Team',
    trendline='ols',
    labels=payroll_wins_labels,
    title='Relationship between Payroll and Number of Wins in 2022 Season',
)

Now that we have looked at macro data, let's read in some game-level data

In [None]:
# Prompt for savant_data.csv and load it. Note that `df` is rebound here from
# the team-level table to the game-level data.
uploaded = files.upload()
savant_buffer = io.BytesIO(uploaded['savant_data.csv'])
df = pd.read_csv(savant_buffer)
df.head()

See how many games we have

In [None]:
# Remove two columns that no later cell references. Reassigning with
# errors='ignore' (rather than an in-place drop) keeps the cell re-runnable:
# the in-place form raises KeyError on a second run because the columns are
# already gone.
df = df.drop(columns=['Unnamed: 38', 'home_win?'], errors='ignore')

# Number of distinct games in the data set.
df['game_pk'].nunique()

120

See the range in number of at bats per game

In [None]:
# For each game, keep the row that holds its highest at_bat_number.
last_at_bat_index = df.groupby('game_pk')['at_bat_number'].idxmax()
df_grouped = df.loc[last_at_bat_index]

# Smallest (printed) and largest (cell output) per-game at-bat count.
at_bats_per_game = df_grouped['at_bat_number']
print(at_bats_per_game.min())
at_bats_per_game.max()

48


95

Sort plays by how much they changed the home team's win expectancy, largest swing first

In [None]:
# Magnitude of the win-expectancy swing, regardless of which team benefited.
df['abs_delta_home_win_exp'] = df['delta_home_win_exp'].abs()

# Largest swings first.
df.sort_values(by='abs_delta_home_win_exp', ascending=False)

Unnamed: 0,pitch_type,game_date,batter,pitcher,events,description,stand,p_throws,home_team,away_team,...,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,delta_home_win_exp,delta_run_exp,abs_delta_home_win_exp
1175,FF,9/29/22,541645,642547,home_run,hit_into_play,R,R,MIL,MIA,...,0,0,2,4,2,4,2,-0.663,3.531,0.663
7443,FS,9/27/22,623912,663986,double,hit_into_play,R,R,CLE,TB,...,4,4,4,6,4,6,4,-0.470,1.901,0.470
581,SL,9/30/22,663728,642758,home_run,hit_into_play,L,R,SEA,OAK,...,1,1,1,1,2,2,1,0.467,0.982,0.467
8264,SI,9/24/22,493329,596082,single,hit_into_play,R,R,BAL,HOU,...,9,9,9,11,9,11,9,-0.414,1.560,0.414
957,CH,10/1/22,650559,642207,single,hit_into_play,R,R,MIL,MIA,...,2,2,3,4,3,4,3,-0.413,1.065,0.413
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6361,EP,9/30/22,595909,665019,field_out,hit_into_play,L,R,DET,MIN,...,7,7,0,7,0,7,0,0.000,-0.300,0.000
5119,SL,9/25/22,676391,622098,field_out,hit_into_play,R,R,OAK,NYM,...,12,1,12,12,1,1,12,0.000,-0.353,0.000
6881,CH,9/27/22,641820,592761,field_out,hit_into_play,R,L,HOU,ARI,...,1,8,1,1,8,8,1,0.000,-0.298,0.000
7054,SL,9/24/22,595777,663385,field_out,hit_into_play,L,R,COL,SD,...,9,9,2,9,2,9,2,0.000,-0.178,0.000


Visualize which types of at bat outcomes are most common

In [None]:
# Count of rows per event type, with bars ordered from most to least frequent.
fig = px.histogram(df, x='events')
fig.update_layout(xaxis_categoryorder='total descending')
fig.show()

Visualize which types of at bat outcomes have the most impact on win expectancy

In [None]:
# Absolute win-expectancy swing aggregated per event type, largest total first.
# NOTE(review): no histfunc is given, so plotly's default aggregation of y
# applies (presumably a sum, which favours frequent events) — confirm that a
# total rather than a per-play average is intended.
fig = px.histogram(df, x='events', y='abs_delta_home_win_exp')
fig.update_xaxes(categoryorder='total descending')
fig.show()

Read in data from one particular game (Marlins @ Brewers)

In [None]:
# Prompt for marlins.csv and load the single-game data into `df`.
uploaded = files.upload()
marlins_buffer = io.BytesIO(uploaded['marlins.csv'])
df = pd.read_csv(marlins_buffer)

# The road team's win expectancy is the complement of the home team's.
home_win_exp = df['home_win_exp']
df['road_win_exp'] = 1 - home_win_exp
df.head()

Saving marlins.csv to marlins (1).csv


Unnamed: 0,pitch_type,game_date,batter,pitcher,events,description,stand,p_throws,home_team,away_team,...,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,delta_home_win_exp,delta_run_exp,home_win_exp,road_win_exp
0,SL,9/29/22,542932,641778,strikeout,swinging_strike,R,L,MIL,MIA,...,0,0,0,0,0,0,0.022,-0.158,0.522,0.478
1,FF,9/29/22,641779,641778,strikeout,called_strike,R,L,MIL,MIA,...,0,0,0,0,0,0,0.015,-0.134,0.537,0.463
2,FC,9/29/22,605119,641778,strikeout,swinging_strike,R,L,MIL,MIA,...,0,0,0,0,0,0,0.01,-0.083,0.547,0.453
3,SL,9/29/22,457705,666129,strikeout,swinging_strike,R,L,MIL,MIA,...,0,0,0,0,0,0,-0.021,-0.141,0.526,0.474
4,SL,9/29/22,642715,666129,single,hit_into_play,R,L,MIL,MIA,...,0,0,0,0,0,0,0.024,0.211,0.55,0.45


Display win probability chart as a moving graph

In [None]:
import plotly.graph_objects as go

# Empty figure with fixed axes so the view does not rescale while the line
# grows during the animation.
# NOTE(review): the x range of 72 is hard-coded, presumably to the number of
# at bats in this game — confirm before reusing for another game.
fig = go.Figure(
    layout=go.Layout(
        updatemenus=[dict(type="buttons", direction="right", x=0.9, y=1.16), ],
        xaxis=dict(range=[0,72],
                   autorange=False, tickwidth=2,
                   title_text="At Bat"),
        yaxis=dict(range=[0,1],
                   autorange=False,
                   title_text="Marlins Win Probability"),
        title="MIA @ MIL September 29, 2022",
    ))

# Number of rows shown before the animation starts.
init = 1

# Initial trace. It plots road_win_exp, the same series the frames animate and
# the y axis is titled after, so the first point is consistent with the rest
# of the animation.
fig.add_trace(
    go.Scatter(x=df.at_bat_number[:init],
               y=df.road_win_exp[:init],
               name="Win Probability",
               visible=True,
               line=dict(color="#367C7D")))

# Animation: frame k shows the first k rows, so the line grows one row at a time.
fig.update(frames=[
    go.Frame(
        data=[
            go.Scatter(x=df.at_bat_number[:k], y=df.road_win_exp[:k])]
    )
    for k in range(init, len(df)+1)])

# Buttons: add a Play button that steps through the frames at 100 ms each.
fig.update_layout(
    updatemenus=[
        dict(
            buttons=list([
                dict(label="Play",
                        method="animate",
                    args=[None, {"frame": {"duration": 100}}])
            ]))])
fig.show()