# Problem Statements

1. Problem Statement: 
MLB scouting departments have become heavily reliant on data analysis since the advent of Moneyball. Teams are now looking to get that extra edge from Machine Learning, so the goal of this project is to use historical data from Baseball-Reference to predict whether an offensive player is going to be an All-Star this upcoming year. 


2. Problem Statement: 
The 2017 Houston Astros achieved infamy for their Major League Baseball cheating scandal, in which they relayed the upcoming pitch to their hitters. The goal of this project is to determine whether a Machine Learning model can accurately predict the next pitch based on the game situation and the current pitcher’s arsenal.


In [2]:
import pandas as pd

from baseball_scraper import playerid_lookup
from baseball_scraper import statcast_pitcher
from baseball_scraper import batting_stats_range
from baseball_scraper import statcast
from baseball_scraper import statcast_batter

In [3]:
# Let wide frames render up to 200 columns instead of being elided.
pd.options.display.max_columns = 200

In [4]:
# Pull batting lines for 2008-05-01 through 2020-10-27; the preview below
# shows one aggregated row per player, each carrying an ``mlb_ID``.
data = batting_stats_range('2008-05-01', '2020-10-27')
data.head()

Unnamed: 0,Name,Age,#days,Lev,Tm,G,PA,AB,R,H,2B,3B,HR,RBI,BB,IBB,SO,HBP,SH,SF,GDP,SB,CS,BA,OBP,SLG,OPS,mlb_ID
1,David Aardsma,33,2060,"MLB-NL,MLB-AL","Atlanta,Boston",2,2,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0.0,0.0,0.0,0.0,430911
2,Fernando Abad,30,1765,"MLB-NL,MLB-AL","Houston,Minnesota",7,9,9,0,1,0,0,0,0,0,0,5,0,0,0,1,0,0,0.111,0.111,0.111,0.222,472551
3,Reggie Abercrombie,27,4530,MLB-NL,Houston,32,60,55,10,17,5,0,2,5,1,0,23,2,1,1,0,5,2,0.309,0.339,0.509,0.848,430631
4,Bobby Abreu,40,2339,"MLB-AL,MLB-NL","Los Angeles,New York",751,2901,2493,368,676,155,10,64,363,381,20,520,5,1,21,57,103,30,0.271,0.366,0.418,0.785,110029
5,José Abreu,33,148,MLB-AL,Chicago,961,4168,3787,526,1114,233,14,198,671,263,51,835,83,0,35,120,10,2,0.294,0.35,0.52,0.87,547989


In [5]:
# Number of player rows returned by the batting pull.
data.shape[0]

3249

In [6]:
# Preview the players with more than 100 games in the pulled window.
data.loc[data['G'] > 100]

Unnamed: 0,Name,Age,#days,Lev,Tm,G,PA,AB,R,H,2B,3B,HR,RBI,BB,IBB,SO,HBP,SH,SF,GDP,SB,CS,BA,OBP,SLG,OPS,mlb_ID
4,Bobby Abreu,40,2339,"MLB-AL,MLB-NL","Los Angeles,New York",751,2901,2493,368,676,155,10,64,363,381,20,520,5,1,21,57,103,30,0.271,0.366,0.418,0.785,110029
5,José Abreu,33,148,MLB-AL,Chicago,961,4168,3787,526,1114,233,14,198,671,263,51,835,83,0,35,120,10,2,0.294,0.350,0.520,0.870,547989
6,Tony Abreu,29,2401,"MLB-NL,MLB-AL","Arizona,San Francisco,Kansas City,Los Angeles",163,438,414,42,102,25,5,4,43,15,1,96,2,1,6,15,2,3,0.246,0.272,0.360,0.632,473234
8,Dustin Ackley,28,1730,MLB-AL,"Seattle,New York",624,2348,2126,261,512,94,18,46,216,194,10,420,5,11,12,25,31,11,0.241,0.304,0.367,0.671,554429
10,Ronald Acuna Jr.,22,149,MLB-NL,Atlanta,313,1404,1219,251,342,59,6,81,194,159,8,371,19,0,4,15,61,13,0.281,0.371,0.538,0.909,660670
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3374,Ryan Zimmerman,34,512,MLB-NL,Washington,1320,5537,4958,736,1384,295,14,223,795,499,33,1027,25,0,54,160,28,6,0.279,0.345,0.479,0.824,475582
3375,Jordan Zimmermann,33,614,"MLB-NL,MLB-AL","Washington,Detroit",175,390,334,18,56,6,0,1,15,10,0,104,0,44,2,2,0,0,0.168,0.191,0.195,0.385,519455
3376,Barry Zito,35,2707,MLB-NL,San Francisco,150,310,247,10,26,0,0,0,8,16,0,66,0,47,0,4,0,0,0.105,0.160,0.105,0.265,217096
3377,Ben Zobrist,38,512,"MLB-AL,MLB-NL","Tampa Bay,Chicago,Kansas City,Oakland",1565,6533,5600,866,1510,341,42,164,741,819,34,947,30,22,62,121,112,34,0.270,0.362,0.433,0.796,450314


In [7]:
# Keep players with more than 100 games over the pulled window.
# NOTE(review): a games-only threshold also keeps rows with very few plate
# appearances (e.g. 150 G / 310 PA in the preview), which may not be regular
# hitters — consider filtering on PA as well.
hitters = data.loc[data['G'] > 100]

# Plain Python list of the retained players' ``mlb_ID`` values.
hitter_ids = hitters['mlb_ID'].tolist()

In [8]:
# Display the collected ``mlb_ID`` values.
hitter_ids

[110029,
 547989,
 473234,
 554429,
 660670,
 542436,
 642715,
 572669,
 571431,
 451192,
 501303,
 542583,
 605113,
 593643,
 645277,
 570489,
 595751,
 450315,
 656185,
 501659,
 546991,
 624413,
 475174,
 571437,
 514888,
 476883,
 506560,
 407792,
 435042,
 605119,
 502624,
 110236,
 461811,
 641313,
 435180,
 462101,
 609280,
 150449,
 493114,
 606115,
 542455,
 571448,
 450317,
 435078,
 650333,
 453562,
 276520,
 605125,
 641319,
 425548,
 110383,
 110385,
 592122,
 488671,
 449107,
 430947,
 430632,
 664056,
 595879,
 456701,
 425557,
 434633,
 132720,
 434605,
 542908,
 150148,
 408036,
 425549,
 605131,
 488681,
 446381,
 571466,
 620439,
 430583,
 435558,
 641343,
 430832,
 488689,
 424726,
 607461,
 493596,
 542921,
 605137,
 150071,
 641355,
 474832,
 136860,
 134181,
 643217,
 204020,
 465668,
 407556,
 450641,
 542932,
 435358,
 400140,
 542194,
 605141,
 488703,
 624415,
 451532,
 595885,
 444448,
 453568,
 232694,
 408213,
 433217,
 453923,
 111072,
 452035,
 430599,
 

In [9]:
# find David Ortiz's player id (mlbam_key)
# playerid_lookup('ortiz','david')

# get all available data
# data = statcast_batter('2008-04-01', '2017-07-15', player_id = 120074)

# get data for August 16th, 2014
# data = statcast_batter('2014-08-16', player_id = 120074)

In [10]:
def get_hitter_info(player_id, start_dt='2008-05-01', end_dt='2020-10-27'):
    """Fetch Statcast batter data for a single hitter.

    Args:
        player_id: Id of the batter to query (e.g. one of the ``mlb_ID``
            values collected in ``hitter_ids``).
        start_dt: First date to pull, formatted ``'YYYY-MM-DD'``.
        end_dt: Last date to pull, formatted ``'YYYY-MM-DD'``.

    Returns:
        Whatever ``statcast_batter`` returns for that player and date range
        (presumably a DataFrame — confirm against the library docs).
    """
    # ``player_id`` has to be forwarded; without it the request is not tied
    # to the hitter the caller asked for.
    return statcast_batter(start_dt, end_dt, player_id=player_id)

In [11]:
# sc_data = statcast(start_dt='2008-05-01', end_dt= '2020-10-29')

In [12]:
# Load the Statcast table from the local CSV (presumably the saved result of
# the commented-out ``statcast(...)`` pull — confirm).
# NOTE(review): the truncated output under this cell looks like a pandas
# warning emitted while parsing (possibly a mixed-dtype DtypeWarning) —
# confirm, and consider passing explicit ``dtype=`` if so.
sc_data = pd.read_csv('./statcast_data.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [13]:
# Preview the first five rows of the Statcast table.
sc_data.head(n=5)

Unnamed: 0,index,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,spin_dir,spin_rate_deprecated,break_angle_deprecated,break_length_deprecated,zone,des,game_type,stand,p_throws,home_team,away_team,type,hit_location,bb_type,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,hc_x,hc_y,tfs_deprecated,tfs_zulu_deprecated,fielder_2,umpire,sv_id,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,pitcher.1,fielder_2.1,fielder_3,fielder_4,fielder_5,fielder_6,fielder_7,fielder_8,fielder_9,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment
0,0,FF,2020-10-27,96.7,1.58,5.99,Julio Urias,642715.0,628711.0,strikeout,called_strike,,,,,4.0,Willy Adames called out on strikes.,W,R,L,LAD,TB,S,2.0,,0.0,2.0,2020.0,0.18,1.63,-0.53,2.29,,,,2.0,9.0,Top,,,,,605131.0,,,-5.950264,-140.490456,-7.897391,3.772,32.321911,-8.981441,3.5,1.69,,,,95.4,2615.0,5.7,635886.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,54.82,,,0.0,1.0,0.0,0.0,,65.0,3.0,4-Seam Fastball,3.0,1.0,1.0,3.0,1.0,3.0,1.0,3.0,Standard,Standard
1,1,FF,2020-10-27,94.1,2.91,5.45,Julio Urias,642715.0,628711.0,,called_strike,,,,,1.0,,W,R,L,LAD,TB,S,,,0.0,1.0,2020.0,0.75,1.3,-0.55,3.03,,,,2.0,9.0,Top,,,,,605131.0,,,-10.560246,-136.599519,-3.429867,11.723598,29.18381,-15.237217,3.5,1.69,,,,93.4,2470.0,5.9,635886.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,54.59,,,,,,,,65.0,2.0,4-Seam Fastball,3.0,1.0,1.0,3.0,1.0,3.0,1.0,3.0,Standard,Standard
2,2,FF,2020-10-27,94.9,1.77,6.02,Julio Urias,642715.0,628711.0,,swinging_strike,,,,,2.0,,W,R,L,LAD,TB,S,,,0.0,0.0,2020.0,0.23,1.47,-0.04,3.32,,,,2.0,9.0,Top,,,,,605131.0,,,-5.199252,-138.098234,-4.63797,4.158758,30.838499,-12.535677,3.5,1.69,,,,94.0,2397.0,5.7,635886.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,54.76,,,,,,,,65.0,1.0,4-Seam Fastball,3.0,1.0,1.0,3.0,1.0,3.0,1.0,3.0,Standard,Standard
3,3,FF,2020-10-27,94.4,1.66,5.93,Julio Urias,670712.0,628711.0,strikeout,called_strike,,,,,4.0,Mike Brosseau called out on strikes.,W,R,L,LAD,TB,S,2.0,,3.0,2.0,2020.0,0.27,1.5,-0.37,2.15,,,,1.0,9.0,Top,,,,,605131.0,,,-5.843595,-137.294295,-7.414897,4.754147,30.016237,-11.778755,3.34,1.53,,,,93.7,2508.0,5.9,635886.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,54.6,,,0.0,1.0,0.0,0.0,,64.0,6.0,4-Seam Fastball,3.0,1.0,1.0,3.0,1.0,3.0,1.0,3.0,Standard,Standard
4,4,CU,2020-10-27,81.4,1.46,6.06,Julio Urias,670712.0,628711.0,,ball,,,,,13.0,,W,R,L,LAD,TB,B,,,2.0,2.0,2020.0,-1.67,-0.15,-0.14,0.96,,,,1.0,9.0,Top,,,,,605131.0,,,-0.383207,-118.44781,-4.454166,-15.553576,24.451936,-32.892744,3.34,1.53,,,,80.2,3031.0,5.7,635886.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,54.84,,,,,,,,64.0,5.0,Curveball,3.0,1.0,1.0,3.0,1.0,3.0,1.0,3.0,Standard,Standard


In [14]:
# (rows, columns) of the full Statcast table.
sc_data.shape

(8842068, 90)

In [15]:
# Initial subset of Statcast columns of interest, kept in their original order.
# NOTE(review): nothing in the visible cells consumes this list yet, and it
# omits 'game_date', which the per-year split relies on — confirm before use.
init_cols = [
    'pitch_type', 'release_speed', 'batter', 'pitcher',
    'events', 'description', 'zone', 'p_throws',
    'hit_location', 'balls', 'strikes', 'game_year',
    'on_3b', 'on_2b', 'on_1b', 'outs_when_up',
    'inning', 'inning_topbot', 'fielder_2', 'ax',
    'ay', 'az', 'hit_distance_sc', 'launch_speed',
    'launch_angle', 'effective_speed', 'release_spin_rate', 'release_extension',
    'game_pk', 'release_pos_y',
    'estimated_ba_using_speedangle', 'estimated_woba_using_speedangle',
    'woba_value', 'woba_denom', 'babip_value', 'iso_value',
    'launch_speed_angle', 'at_bat_number', 'pitch_number', 'pitch_name',
    'bat_score', 'fld_score', 'if_fielding_alignment', 'of_fielding_alignment',
]

In [16]:
# ``batter`` is read back as float (e.g. 642715.0 in the preview); cast it to
# int so the ids have the same integer form as ``mlb_ID``.
# NOTE(review): astype(int) raises if any ``batter`` value is missing.
sc_data['batter'] = sc_data['batter'].astype(int)

In [17]:
# Rows whose batter is one of the retained hitters. The explicit copy makes
# this an independent frame, so later column assignments do not act on a slice
# of ``sc_data`` (avoids pandas' SettingWithCopyWarning).
vet_hitter_statcast = sc_data[sc_data['batter'].isin(hitter_ids)].copy()

In [18]:
# NOTE: a negative ``n`` makes head() return every row except the last 5, so
# this previews nearly the whole frame rather than just the first 5 rows.
vet_hitter_statcast.head(-5)

Unnamed: 0,index,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,spin_dir,spin_rate_deprecated,break_angle_deprecated,break_length_deprecated,zone,des,game_type,stand,p_throws,home_team,away_team,type,hit_location,bb_type,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,hc_x,hc_y,tfs_deprecated,tfs_zulu_deprecated,fielder_2,umpire,sv_id,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,pitcher.1,fielder_2.1,fielder_3,fielder_4,fielder_5,fielder_6,fielder_7,fielder_8,fielder_9,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment
0,0,FF,2020-10-27,96.7,1.58,5.99,Julio Urias,642715,628711.0,strikeout,called_strike,,,,,4.0,Willy Adames called out on strikes.,W,R,L,LAD,TB,S,2.0,,0.0,2.0,2020.0,0.18,1.63,-0.53,2.29,,,,2.0,9.0,Top,,,,,605131.0,,,-5.950264,-140.490456,-7.897391,3.772000,32.321911,-8.981441,3.50,1.69,,,,95.4,2615.0,5.7,635886.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,54.82,,,0.0,1.0,0.0,0.0,,65.0,3.0,4-Seam Fastball,3.0,1.0,1.0,3.0,1.0,3.0,1.0,3.0,Standard,Standard
1,1,FF,2020-10-27,94.1,2.91,5.45,Julio Urias,642715,628711.0,,called_strike,,,,,1.0,,W,R,L,LAD,TB,S,,,0.0,1.0,2020.0,0.75,1.30,-0.55,3.03,,,,2.0,9.0,Top,,,,,605131.0,,,-10.560246,-136.599519,-3.429867,11.723598,29.183810,-15.237217,3.50,1.69,,,,93.4,2470.0,5.9,635886.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,54.59,,,,,,,,65.0,2.0,4-Seam Fastball,3.0,1.0,1.0,3.0,1.0,3.0,1.0,3.0,Standard,Standard
2,2,FF,2020-10-27,94.9,1.77,6.02,Julio Urias,642715,628711.0,,swinging_strike,,,,,2.0,,W,R,L,LAD,TB,S,,,0.0,0.0,2020.0,0.23,1.47,-0.04,3.32,,,,2.0,9.0,Top,,,,,605131.0,,,-5.199252,-138.098234,-4.637970,4.158758,30.838499,-12.535677,3.50,1.69,,,,94.0,2397.0,5.7,635886.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,54.76,,,,,,,,65.0,1.0,4-Seam Fastball,3.0,1.0,1.0,3.0,1.0,3.0,1.0,3.0,Standard,Standard
9,9,FF,2020-10-27,95.2,1.72,6.09,Julio Urias,622534,628711.0,field_out,hit_into_play,,,,,6.0,Manuel Margot flies out to right fielder Mooki...,W,R,L,LAD,TB,X,9.0,fly_ball,1.0,2.0,2020.0,0.40,1.48,0.40,2.56,,,,0.0,9.0,Top,191.34,99.03,,,605131.0,,,-4.306764,-138.539023,-6.867868,6.166594,30.824839,-11.731952,3.25,1.53,284.0,83.4,44.0,94.0,2450.0,5.6,635886.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,54.92,0.029,0.04,0.0,1.0,0.0,0.0,3.0,63.0,4.0,4-Seam Fastball,3.0,1.0,1.0,3.0,1.0,3.0,1.0,3.0,Standard,Standard
10,10,CH,2020-10-27,87.2,1.90,5.90,Julio Urias,622534,628711.0,,foul,,,,,8.0,,W,R,L,LAD,TB,S,,,1.0,1.0,2020.0,1.29,0.30,0.16,1.78,,,,0.0,9.0,Top,,,,,605131.0,,,-6.787951,-126.842197,-4.319413,15.263710,25.705872,-28.270065,3.25,1.53,,,,86.6,2000.0,5.9,635886.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,54.61,,,,,,,,63.0,3.0,Changeup,3.0,1.0,1.0,3.0,1.0,3.0,1.0,3.0,Standard,Standard
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8842058,22467,FT,2008-05-01,87.3,-2.88,5.48,Ian Kennedy,122111,453178.0,,called_strike,,,,,5.0,,R,R,R,NYY,DET,S,,,0.0,0.0,2008.0,-1.46,1.76,-0.18,2.38,,,,2.0,1.0,Top,,,,,,,080501_191025,9.862000,-126.665000,-4.855000,-15.532000,26.385000,-14.409000,3.23,1.45,,,,,,,234180.0,453178.0,,,,,,,,,50.00,,,,,,,,3.0,1.0,2-Seam Fastball,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
8842059,22470,FT,2008-05-01,88.6,-2.87,5.60,Ian Kennedy,135784,453178.0,field_out,hit_into_play,,,,,49.0,Placido Polanco flies out to center fielder Me...,R,R,R,NYY,DET,X,8.0,fly_ball,3.0,1.0,2008.0,-1.47,1.58,0.16,3.58,,,,1.0,1.0,Top,107.43,51.20,,,,,080501_190943,10.990000,-128.500000,-2.130000,-16.534000,20.764000,-15.593000,2.90,1.30,,,,,,,234180.0,453178.0,,,,,,,,,50.00,,,0.0,,0.0,0.0,,2.0,5.0,2-Seam Fastball,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
8842060,22485,FT,2008-05-01,87.7,-2.91,5.74,Ian Kennedy,135784,453178.0,,ball,,,,,,,R,R,R,NYY,DET,B,,,2.0,1.0,2008.0,-1.16,1.29,-0.40,4.30,,,,1.0,1.0,Top,,,,,,,080501_190924,8.826000,-127.437000,0.170000,-12.467000,21.406000,-19.336000,2.90,1.30,,,,,,,234180.0,453178.0,,,,,,,,,50.00,,,,,,,,2.0,4.0,2-Seam Fastball,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
8842061,22494,FT,2008-05-01,88.0,-2.72,5.50,Ian Kennedy,135784,453178.0,,ball,,,,,58.0,,R,R,R,NYY,DET,B,,,1.0,1.0,2008.0,-1.25,1.46,1.26,2.10,,,,1.0,1.0,Top,,,,,,,080501_190908,12.728000,-127.384000,-5.188000,-13.450000,21.902000,-17.406000,2.90,1.30,,,,,,,234180.0,453178.0,,,,,,,,,50.00,,,,,,,,2.0,3.0,2-Seam Fastball,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,


In [19]:
# Check how ``game_date`` was parsed (object dtype means plain strings).
vet_hitter_statcast.game_date.dtype

dtype('O')

In [26]:
# Parse ``game_date`` into real datetimes by replacing the whole column.
# Writing through ``.loc[:, 'game_date'] = ...`` sets values in place on
# pandas >= 2.0 and leaves the column as object dtype, which breaks the
# ``.dt`` accessor used for the per-year split.
vet_hitter_statcast['game_date'] = pd.to_datetime(
    vet_hitter_statcast['game_date'], format='%Y-%m-%d'
)

In [48]:
# Split the filtered Statcast rows into one DataFrame per calendar year of
# ``game_date``, stored under keys of the form 'df_<year>' (e.g. 'df_2008').
d = {}

for year, sc_year in vet_hitter_statcast.groupby(vet_hitter_statcast['game_date'].dt.year):
    d[f'df_{int(year)}'] = sc_year


In [67]:
# Bind each yearly frame in ``d`` to its own ``df_<year>`` name (2008-2020).
(df_2008, df_2009, df_2010, df_2011, df_2012, df_2013, df_2014,
 df_2015, df_2016, df_2017, df_2018, df_2019, df_2020) = (
    d[f'df_{season}'] for season in range(2008, 2021)
)

# for i, key in enumerate(df_names):
#     df_names[i] = d[key]
#     print(df_2008)
    

In [84]:
# Number of 2008 rows with no ``launch_speed`` value.
df_2008['launch_speed'].isna().sum()

541456

In [86]:
# Number of 2009 rows with no ``launch_speed`` value.
df_2009['launch_speed'].isna().sum()

697771

In [87]:
# Number of 2010 rows with no ``launch_speed`` value.
df_2010['launch_speed'].isna().sum()

696228

In [88]:
# Number of 2011 rows with no ``launch_speed`` value.
df_2011['launch_speed'].isna().sum()

695413

In [89]:
# Number of 2012 rows with no ``launch_speed`` value.
df_2012['launch_speed'].isna().sum()

695321

In [98]:
# True when every 2013 row is missing ``launch_speed``.
df_2013['launch_speed'].isna().all()

True

In [97]:
# True when every 2014 row is missing ``launch_speed``.
df_2014['launch_speed'].isna().all()

True

In [96]:
# True when every 2015 row is missing ``launch_speed``.
df_2015['launch_speed'].isna().all()

False

In [99]:
# True when every 2016 row is missing ``launch_speed``. The null count must be
# compared against the 2016 row count, not another year's.
df_2016['launch_speed'].isnull().sum() == len(df_2016)

False

In [100]:
# Number of 2017 rows with no ``launch_speed`` value.
df_2017['launch_speed'].isna().sum()

584799

In [94]:
# Number of 2020 rows with no ``launch_speed`` value.
df_2020['launch_speed'].isna().sum()

168068

In [43]:
# statcast_by_hitter = sc_data.groupby('batter')

In [44]:
# NOTE(review): no live cell in the visible part of this file defines
# ``statcast_by_hitter`` (the only assignment is commented out), so this
# presumably raises NameError on a fresh run — restore or remove.
statcast_by_hitter

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fc57ce8f880>

In [40]:
# NOTE(review): ``statcast_by_h_init_cols`` is not assigned anywhere in the
# visible part of this file; the output below presumably comes from a deleted
# cell — restore its definition or remove this cell.
statcast_by_h_init_cols

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fca40c02880>

In [10]:
#sc_data.to_csv('statcast_data.csv', index=False)