<h1>MLB ANALYSIS</h1>

<h4>The dataset contains information for the entire season, including spring training games and the WBC games that took place at the same time.</h4>

<h6>For consideration<br>
<br>
<b>Spring Training</b><br>
Start date: 2023-02-24<br>
End date: 2023-03-28<br>
<br>
<b>Regular Season</b><br>
Start date: 2023-03-30<br>
End date: 2023-10-02<br>
<br>
<b>Postseason</b><br>
Wildcard Series: 2023-10-03 to 2023-10-05<br>
Divisional Series: 2023-10-07 to 2023-10-11<br>
League Championship Series: 2023-10-14 to 2023-10-18<br>
World Series: 2023-10-20 to 2023-10-27</h6>

In [74]:
import pandas as pd
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from os import getcwd

# Lift pandas' display limits so wide/long frames are rendered without truncation.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)

<h3>READ DATAFRAME</h3>

In [75]:
# `path` (the current working directory) is reused by later cells when the
# HTML figures are written, so it keeps its name.
path = getcwd()
output_dir = f'{path}/../output/'

# Both CSVs are read with `game_date` parsed into datetimes.
df = pd.read_csv(output_dir + 'single_date_test_df.csv', parse_dates=['game_date'])
season_up_to_date_DF = pd.read_csv(output_dir + '2023_MLB_Season_df.csv', parse_dates=['game_date'])

season_up_to_date_DF.sample(5)

Unnamed: 0,game_date,team,opponent,record,1st_inning,2nd_inning,3rd_inning,4th_inning,5th_inning,6th_inning,7th_inning,8th_inning,9th_inning,10th_inning,11th_inning,12th_inning,13th_inning,14th_inning,15th_inning,16th_inning,extra_inning,total_innings,runs,hits,errors,result
3145,2023-06-16,Brewers,Pirates,35 - 34,3,0,0,1,0,1,0,0,X,X,X,X,X,X,X,X,N,9,5,5,0,W
685,2023-03-16,Brewers,Angels,7 - 10,0,0,0,0,1,0,1,0,0,X,X,X,X,X,X,X,N,9,2,4,3,L
1809,2023-04-27,Phillies,Mariners,13 - 13,0,1,0,0,0,0,0,0,X,X,X,X,X,X,X,X,N,9,1,4,0,W
4649,2023-08-15,Blue Jays,Phillies,67 - 54,0,0,0,0,0,1,0,1,X,X,X,X,X,X,X,X,N,9,2,4,0,W
5928,2023-10-01,Yankees,Royals,82 - 80,0,0,0,0,0,2,0,0,0,X,X,X,X,X,X,X,N,9,2,7,1,L


<h3>CLEAN DATA</h3>
<h5>1. Exclude non-MLB teams (WBC, spring training, All-Star)</h5>

In [76]:
# (rows, columns) of the season frame as loaded, before any cleaning step below.
season_up_to_date_DF.shape

(6018, 26)

In [77]:
# Team names present in the scrape that are not MLB franchises (WBC sides,
# spring-training opponents and All-Star squads, per the section heading).
not_mlb_teams = ['AL All-Stars','Australia','Canada','China','Chinese Taipei','Colombia','Cuba','Czech Republic',
                 'Dominican Rep.','Great Britain','Huskies','Israel','Italy','Japan','Korea','Mexico',
                 'Mountaineers', 'NL All-Stars', 'Netherlands','Nicaragua','Panama','Puerto Rico','Space Cowboys',
                 'United States','Venezuela']

# Filter rather than list.remove(): remove() raises ValueError as soon as one
# of the listed teams is missing from the data, which made this cell fail on
# any scrape that does not contain every exhibition opponent.
excluded_teams = set(not_mlb_teams)
mlb_teams = sorted(
    team_name
    for team_name in season_up_to_date_DF['team'].unique().tolist()
    if team_name not in excluded_teams
)

# Sanity check: should display 30, the number of MLB franchises.
len(mlb_teams)

30

In [78]:
# Discard every row whose `team` is one of the names in `not_mlb_teams`.
is_non_mlb_row = season_up_to_date_DF['team'].isin(not_mlb_teams)
season_up_to_date_DF = season_up_to_date_DF[~is_non_mlb_row]
season_up_to_date_DF.shape

(5898, 26)

<h5>2. Exclude non-regular-season games</h5>

In [79]:
# Regular-season window with exclusive bounds: strictly after 2023-03-29 and
# strictly before 2023-10-03.
after_window_start = season_up_to_date_DF["game_date"].gt('2023-03-29')
before_window_end = season_up_to_date_DF["game_date"].lt('2023-10-03')
season_up_to_date_DF = season_up_to_date_DF.loc[after_window_start & before_window_end]
season_up_to_date_DF.shape

(4874, 26)

<h5>3. Drop duplicates</h5>
<h6>Scraping the MLB page produces duplicate rows for games that were postponed.

Drop them using a subset of columns that excludes 'game_date', so that those rows compare as duplicates.</h6>

In [80]:
# Distribution of rows per team before de-duplication (how many teams have 162, 163, ... rows).
season_up_to_date_DF.groupby("team")["game_date"].count().value_counts()

game_date
162    19
163     9
165     1
164     1
Name: count, dtype: int64

In [81]:
# Rows that agree on team, opponent and record are treated as the same game;
# the first occurrence is the one kept.
duplicate_key_cols = ['team', 'opponent', 'record']
season_up_to_date_DF = season_up_to_date_DF.drop_duplicates(subset=duplicate_key_cols, keep='first')
season_up_to_date_DF.shape

(4860, 26)

In [82]:
# Same check after de-duplication: distribution of rows per team.
season_up_to_date_DF.groupby("team")["game_date"].count().value_counts()

game_date
162    30
Name: count, dtype: int64

<h5>4. replace 'X' with 0 and change dtype of affected cols for numeric operations on inning cols</h5>

In [83]:
# Inning columns that get the 'X' placeholder replaced: 6th through 16th.
# These are the same columns the positional slice iloc[:, 9:20] used to
# address; naming them once removes the need to keep a magic slice and two
# hand-written copies of the list in sync.
placeholder_inning_cols = [
    "6th_inning",
    "7th_inning",
    "8th_inning",
    "9th_inning",
    "10th_inning",
    "11th_inning",
    "12th_inning",
    "13th_inning",
    "14th_inning",
    "15th_inning",
    "16th_inning",
]

# Replace the placeholder with '0', then cast every affected column to a
# numeric dtype so the inning columns can be summed.
season_up_to_date_DF[placeholder_inning_cols] = (
    season_up_to_date_DF[placeholder_inning_cols]
    .replace('X', '0')
    .apply(pd.to_numeric)
)
season_up_to_date_DF.dtypes

game_date        datetime64[ns]
team                     object
opponent                 object
record                   object
1st_inning                int64
2nd_inning                int64
3rd_inning                int64
4th_inning                int64
5th_inning                int64
6th_inning                int64
7th_inning                int64
8th_inning                int64
9th_inning                int64
10th_inning               int64
11th_inning               int64
12th_inning               int64
13th_inning               int64
14th_inning               int64
15th_inning               int64
16th_inning               int64
extra_inning             object
total_innings             int64
runs                      int64
hits                      int64
errors                    int64
result                   object
dtype: object

<h3>CLEAN DATAFRAME</h3>

In [84]:
# Spot-check ten randomly drawn rows of the cleaned frame (no seed, so the rows differ per run).
season_up_to_date_DF.sample(10)

Unnamed: 0,game_date,team,opponent,record,1st_inning,2nd_inning,3rd_inning,4th_inning,5th_inning,6th_inning,7th_inning,8th_inning,9th_inning,10th_inning,11th_inning,12th_inning,13th_inning,14th_inning,15th_inning,16th_inning,extra_inning,total_innings,runs,hits,errors,result
1210,2023-04-04,Tigers,Astros,2 - 3,1,0,0,0,1,1,0,2,1,0,0,0,0,0,0,0,N,9,6,12,0,W
4513,2023-08-09,Angels,Giants,58 - 58,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,N,9,4,4,2,W
2790,2023-06-03,Cardinals,Pirates,25 - 34,0,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,N,9,3,6,2,L
5295,2023-09-08,Rangers,Athletics,76 - 64,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,N,9,3,9,0,L
2797,2023-06-03,Astros,Angels,35 - 23,1,0,1,5,0,0,2,0,0,0,0,0,0,0,0,0,N,9,9,11,0,W
2320,2023-05-16,Braves,Rangers,26 - 16,0,1,0,0,0,0,0,3,0,0,0,0,0,0,0,0,N,9,4,8,1,L
3673,2023-07-06,Rays,Phillies,57 - 33,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,Y,11,1,4,1,L
2491,2023-05-22,Mariners,Athletics,23 - 24,2,3,0,0,1,2,0,3,0,0,0,0,0,0,0,0,N,9,11,13,0,W
5498,2023-09-15,Dodgers,Mariners,89 - 57,0,0,0,0,2,2,0,1,1,0,0,0,0,0,0,0,N,9,6,8,0,W
1350,2023-04-10,Red Sox,Rays,5 - 5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,N,9,0,3,0,L


<h3>ANALYSIS</h3>

<h5>Games per day and month</h5>

In [86]:
# Every game is stored twice (one row per team), so the per-date row count is
# halved to obtain the number of games played on that date.
daily_game_counts = (season_up_to_date_DF[['game_date']].value_counts() // 2).reset_index()

# Ordered categoricals keep months and weekdays in calendar order on the chart.
# The vectorised .dt accessors replace the row-by-row apply(lambda ...) calls.
daily_game_counts['Month'] = pd.Categorical(
    daily_game_counts['game_date'].dt.month_name(),
    categories=['March', 'April', 'May', 'June', 'July', 'August', 'September', 'October'],
)
daily_game_counts['Day'] = pd.Categorical(
    daily_game_counts['game_date'].dt.day_name(),
    categories=['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'],
)

# observed=False is spelled out: it is the behaviour this cell was written
# against (every Month/Day combination is kept, empty ones sum to 0) and
# stating it avoids the FutureWarning about the changing default.
game_calendar = (
    daily_game_counts
    .groupby(['Month', 'Day'], observed=False)['count']
    .sum()
    .reset_index()
    .sort_values(['Month', 'Day'])
)

# Stacked bars: one bar per month, one segment per weekday.
fig_1 = px.bar(
    game_calendar,
    x='Month',
    y='count',
    color='Day',
    title='Calendar Distribution',
    labels={'count': 'Games'},
    color_discrete_sequence=px.colors.sequential.Oryel,
).update_layout(legend_traceorder="reversed")
pio.write_html(fig_1, path+'/../img_plots/fig_1.html')
fig_1.show()

<h5>EXTRA INNING GAMES</h5>

In [96]:
# Games per (extra_inning flag, date). Each game has two rows (one per team),
# hence the integer division by 2.
extra_inning_daily_games = (
    (season_up_to_date_DF[['game_date', 'extra_inning']].groupby('extra_inning').value_counts() // 2)
    .reset_index()
    .sort_values('game_date')
)

# Ordered categorical so the bars follow the calendar; .dt.month_name() is the
# vectorised form of the former apply(lambda x: x.month_name()).
extra_inning_daily_games['Month'] = pd.Categorical(
    extra_inning_daily_games['game_date'].dt.month_name(),
    categories=['March', 'April', 'May', 'June', 'July', 'August', 'September', 'October'],
)

# observed=False is explicit: months without any game of a given flag are kept
# with a count of 0 (the behaviour this cell relied on), and the FutureWarning
# about the changing default is avoided.
extra_inning_monthly_games = (
    extra_inning_daily_games
    .groupby(['extra_inning', 'Month'], observed=False)['count']
    .sum()
    .reset_index()
    .sort_values(['Month'])
)
extra_inning_yes = extra_inning_monthly_games[extra_inning_monthly_games['extra_inning'] == 'Y']
extra_inning_no = extra_inning_monthly_games[extra_inning_monthly_games['extra_inning'] == 'N']

# Season totals per flag, used by the donut chart.
extra_inning_total_games = extra_inning_monthly_games.drop('Month', axis=1).groupby('extra_inning').sum().reset_index()

# One colour per flag, shared by the donut and the bars.
color_no = 'rgb(243, 173, 106)'
color_yes = 'rgb(246, 99, 86)'

fig_2 = make_subplots(
    rows=1,
    cols=2,
    specs=[[{'type': 'pie'}, {'type': 'bar'}]],
    subplot_titles=['Season', 'Monthly'],
)

# Left panel: share of extra-inning games over the whole season.
# The groupby above sorts the flags, so the rows come out as N then Y and the
# colour list is given in that order.
fig_2.add_trace(
    go.Pie(
        labels=extra_inning_total_games['extra_inning'].map({'N': 'No', 'Y': 'Yes'}),
        values=extra_inning_total_games['count'],
        marker_colors=[color_no, color_yes],
        hole=.5,
        showlegend=False,
    ),
    row=1,
    col=1,
)

# Right panel: monthly counts, one bar group per flag.
fig_2.add_trace(
    go.Bar(
        name='No',
        x=extra_inning_no['Month'],
        y=extra_inning_no['count'],
        offsetgroup=0,
        marker_color=color_no,
    ),
    row=1,
    col=2,
)
fig_2.add_trace(
    go.Bar(
        name='Yes',
        x=extra_inning_yes['Month'],
        y=extra_inning_yes['count'],
        offsetgroup=1,
        marker_color=color_yes,
    ),
    row=1,
    col=2,
)

fig_2.update_layout(title_text='Extra Inning Games')
pio.write_html(fig_2, path+'/../img_plots/fig_2.html')
fig_2.show()

In [117]:
# Rows of extra-inning games, restricted to the 10th-16th inning scores plus
# the extra_inning flag and total_innings. The label slice selects the same
# columns the positional slice iloc[:, 13:22] addressed.
extra_inning_duration = season_up_to_date_DF.loc[
    season_up_to_date_DF['extra_inning'] == 'Y',
    '10th_inning':'total_innings',
]

# Each game contributes two rows (one per team). Halve the row counts so the
# value shown as "Games" in the hover text is the number of games and not the
# number of team-rows, matching the // 2 used for the other game counts.
extra_inning_counts = (extra_inning_duration['total_innings'].value_counts() // 2).reset_index()

# Donut chart: how many extra-inning games ended after 10, 11, 12, ... innings.
fig_3 = go.Figure(go.Pie(labels=extra_inning_counts['total_innings'],
                         values=extra_inning_counts['count'],
                         hole=.5,
                         marker_colors=px.colors.sequential.Oryel[2:],
                         hovertemplate='Innings: %{label}<br>Games: %{value}</br>',
                         name=''
                         )).update_layout(title='Extra Inning Games Duration',
                                          legend_title_text='Innings')
fig_3.show()
pio.write_html(fig_3,path+'/../img_plots/fig_3.html')
extra_inning_duration

Unnamed: 0,10th_inning,11th_inning,12th_inning,13th_inning,14th_inning,15th_inning,16th_inning,extra_inning,total_innings
1156,1,0,0,0,0,0,0,Y,10
1157,0,0,0,0,0,0,0,Y,10
1180,0,2,0,0,0,0,0,Y,11
1181,0,1,0,0,0,0,0,Y,11
1182,2,0,0,0,0,0,0,Y,10
1183,1,0,0,0,0,0,0,Y,10
1236,2,0,0,0,0,0,0,Y,10
1237,0,0,0,0,0,0,0,Y,10
1264,1,0,0,0,0,0,0,Y,10
1265,2,0,0,0,0,0,0,Y,10


In [46]:
# Season total of runs scored in each of the first nine innings, per team,
# ordered by first-inning runs (highest first).
regulation_inning_cols = ["1st_inning", "2nd_inning", "3rd_inning", "4th_inning", "5th_inning",
                          "6th_inning", "7th_inning", "8th_inning", "9th_inning"]
(
    season_up_to_date_DF
    .groupby("team")[regulation_inning_cols]
    .sum()
    .sort_values("1st_inning", ascending=False)
)

Unnamed: 0_level_0,1st_inning,2nd_inning,3rd_inning,4th_inning,5th_inning,6th_inning,7th_inning,8th_inning,9th_inning
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Braves,146,103,104,114,117,101,82,105,54
Dodgers,116,86,98,97,95,123,96,119,59
Rays,112,101,83,95,101,104,83,103,59
Astros,110,77,90,93,82,91,111,89,75
Padres,110,78,81,73,97,86,74,80,61
Nationals,100,85,68,77,63,78,75,70,67
Phillies,94,74,82,101,99,95,96,68,65
Brewers,92,74,93,58,93,91,97,76,37
Royals,92,71,67,87,65,88,73,67,56
D-backs,90,84,78,80,70,85,93,87,63


In [16]:
# Rows per date divided by two, because every game is stored once per team.
games_per_day = season_up_to_date_DF.groupby('game_date')['game_date'].count().div(2)

<h5>Top teams with the most runs in the Regular Season</h5>

In [17]:
# Five teams with the highest total of runs.
runs_by_team = season_up_to_date_DF.groupby('team')['runs'].sum()
runs_by_team.sort_values(ascending=False).head(5)

team
Braves     947
Dodgers    906
Rangers    881
Rays       860
Astros     827
Name: runs, dtype: int64

In [18]:
# Five teams with the highest total of hits.
hits_by_team = season_up_to_date_DF.groupby('team')['hits'].sum()
hits_by_team.sort_values(ascending=False).head(5)

team
Braves     1543
Rangers    1470
Astros     1441
Red Sox    1437
Rays       1432
Name: hits, dtype: int64

In [19]:
# Five teams with the highest total of errors.
errors_by_team = season_up_to_date_DF.groupby('team')['errors'].sum()
errors_by_team.sort_values(ascending=False).head(5)

team
Giants       110
Athletics     98
Red Sox       97
Marlins       95
Tigers        93
Name: errors, dtype: int64

In [20]:
# All Royals rows, limited to the columns needed to follow their record.
season_up_to_date_DF.loc[
    season_up_to_date_DF['team'] == 'Royals',
    ['team', 'opponent', 'record', 'runs', 'result'],
]

Unnamed: 0,team,opponent,record,runs,result
1077,Royals,Twins,0 - 1,0,L
1121,Royals,Twins,0 - 2,0,L
1143,Royals,Twins,0 - 3,4,L
1175,Royals,Blue Jays,1 - 3,9,W
1203,Royals,Blue Jays,1 - 4,1,L
1241,Royals,Blue Jays,1 - 5,0,L
1245,Royals,Blue Jays,1 - 6,3,L
1268,Royals,Giants,2 - 6,3,W
1284,Royals,Giants,3 - 6,6,W
1334,Royals,Giants,3 - 7,1,L


In [21]:
# Rows where both `team` and `opponent` are one of the two clubs, i.e. the
# Royals-Rangers games seen from either side.
matchup_teams = ['Royals', 'Rangers']
both_sides_in_matchup = season_up_to_date_DF[['team', 'opponent']].isin(matchup_teams).all(axis=1)
season_up_to_date_DF[both_sides_in_matchup]

Unnamed: 0,game_date,team,opponent,record,1st_inning,2nd_inning,3rd_inning,4th_inning,5th_inning,6th_inning,7th_inning,8th_inning,9th_inning,10th_inning,11th_inning,12th_inning,13th_inning,14th_inning,15th_inning,16th_inning,extra_inning,runs,hits,errors,result
1360,2023-04-10,Royals,Rangers,3 - 8,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,N,2,4,1,L
1361,2023-04-10,Rangers,Royals,6 - 4,1,0,2,1,0,7,0,0,0,0,0,0,0,0,0,0,N,11,11,1,W
1390,2023-04-11,Royals,Rangers,3 - 9,0,0,1,1,0,0,0,1,1,1,0,0,0,0,0,0,Y,5,11,0,L
1391,2023-04-11,Rangers,Royals,7 - 4,0,1,2,0,0,0,1,0,0,4,0,0,0,0,0,0,Y,8,10,0,W
1426,2023-04-12,Royals,Rangers,4 - 9,0,3,1,2,0,0,4,0,0,0,0,0,0,0,0,0,N,10,14,0,W
1427,2023-04-12,Rangers,Royals,7 - 5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,N,1,4,0,L
1538,2023-04-17,Rangers,Royals,10 - 6,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,N,4,5,1,W
1539,2023-04-17,Royals,Rangers,4 - 13,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,N,0,1,1,L
1572,2023-04-18,Rangers,Royals,11 - 6,0,0,1,2,0,5,0,4,0,0,0,0,0,0,0,0,N,12,12,0,W
1573,2023-04-18,Royals,Rangers,4 - 14,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,N,2,8,0,L


In [22]:
# Number of Royals rows with a recorded result.
season_up_to_date_DF.loc[season_up_to_date_DF['team'] == 'Royals', 'result'].count()

162

In [23]:
# Frequency of each value recorded in the 16th-inning column.
season_up_to_date_DF.loc[:, '16th_inning'].value_counts()

16th_inning
0    4860
Name: count, dtype: int64

In [24]:
# Preview of `df`, the frame read from single_date_test_df.csv. display.max_rows
# is set to None in the first cell, so all of the (up to) 200 rows are rendered.
df.head(200)

Unnamed: 0,game_date,team,opponent,record,1st_inning,2nd_inning,3rd_inning,4th_inning,5th_inning,6th_inning,7th_inning,8th_inning,9th_inning,10th_inning,11th_inning,12th_inning,13th_inning,14th_inning,15th_inning,16th_inning,extra_inning,runs,hits,errors,result
0,2023-04-02,Tigers,Rays,0 - 3,0,0,0,0,0,0,0,0,1,X,X,X,X,X,X,X,N,1,2,0,L
1,2023-04-02,Rays,Tigers,3 - 0,0,0,0,1,0,3,0,1,X,X,X,X,X,X,X,X,N,5,8,0,W
2,2023-04-02,Orioles,Red Sox,1 - 2,0,0,0,0,3,0,2,0,0,X,X,X,X,X,X,X,N,5,10,2,L
3,2023-04-02,Red Sox,Orioles,2 - 1,1,1,1,0,3,0,2,1,X,X,X,X,X,X,X,X,N,9,14,0,W
4,2023-04-02,Braves,Nationals,2 - 1,0,0,0,1,0,0,0,0,0,X,X,X,X,X,X,X,N,1,4,0,L
5,2023-04-02,Nationals,Braves,1 - 2,4,0,0,0,0,0,0,0,X,X,X,X,X,X,X,X,N,4,6,0,W
6,2023-04-02,Giants,Yankees,1 - 2,0,0,0,0,0,0,0,0,0,X,X,X,X,X,X,X,N,0,3,0,L
7,2023-04-02,Yankees,Giants,2 - 1,0,0,3,1,0,0,2,0,X,X,X,X,X,X,X,X,N,6,7,1,W
8,2023-04-02,Pirates,Reds,1 - 2,0,0,0,1,0,0,0,0,0,X,X,X,X,X,X,X,N,1,6,0,L
9,2023-04-02,Reds,Pirates,2 - 1,0,1,1,0,1,0,0,0,X,X,X,X,X,X,X,X,N,3,9,0,W
