# Major League Baseball Home Run Exit Velocities

In [1]:
import pandas as pd
import numpy as np
from scipy import stats 

# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

from bs4 import BeautifulSoup
import requests
import json

### Import the Data

Importing each home run hit from the following seasons: (16626 records)
- 2015
- 2016
- 2017

In [2]:
hr_df = pd.read_csv('../data/home_runs_15_16_17.csv')
baseballs_df = pd.read_excel('../data/baseballs.xlsx')

In [3]:
hr_df.head()

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment
0,CH,6/10/17,84.5,-3.402,6.4696,Aaron Judge,592450,501957,home_run,hit_into_play_score,...,0,0,0,0,0,0,0,0,Infield shift,Standard
1,FF,4/28/17,97.1,-2.8091,5.9279,Aaron Judge,592450,592332,home_run,hit_into_play_score,...,2,9,2,9,9,2,2,9,Standard,Standard
2,CU,6/23/15,86.7,-1.5647,5.3406,Giancarlo Stanton,519317,593372,home_run,hit_into_play_score,...,0,0,0,0,0,0,0,0,Standard,Standard
3,SL,9/28/17,89.5,2.0682,6.1177,Giancarlo Stanton,519317,571521,home_run,hit_into_play_score,...,5,1,5,1,1,5,5,1,Infield shift,Standard
4,SL,6/11/17,84.7,-1.9795,5.686,Aaron Judge,592450,548337,home_run,hit_into_play_score,...,7,3,7,3,3,7,7,3,Infield shift,Standard


In [4]:
col = [ 
    'umpire',
    'spin_dir',
    'spin_rate_deprecated',
    'break_angle_deprecated',
    'break_length_deprecated',
    'events',
    'description', 
    'des',
    'game_type', 
    'stand',
    'type', 
    'hit_location', 
    'balls', 
    'strikes', 
    'plate_z', 
    'on_3b', 
    'on_2b', 
    'on_1b', 
    'outs_when_up',
    'inning',
    'inning_topbot', 
    'hc_x', 
    'hc_y',
    'tfs_deprecated', 
    'tfs_zulu_deprecated', 
    'fielder_2', 
    'sv_id', 
    'pitcher.1', 
    'fielder_2.1', 
    'fielder_3', 
    'fielder_4', 
    'fielder_5', 
    'fielder_6', 
    'fielder_7', 
    'fielder_8', 
    'fielder_9', 
    'estimated_ba_using_speedangle', 
    'estimated_woba_using_speedangle', 
    'woba_value', 
    'woba_denom', 
    'babip_value', 
    'at_bat_number', 
    'pitch_number', 
    'home_score', 
    'away_score', 
    'bat_score', 
    'fld_score', 
    'post_away_score', 
    'post_home_score', 
    'post_bat_score', 
    'post_fld_score', 
    'if_fielding_alignment', 
    'of_fielding_alignment'
]

In [5]:
hr_df.drop(col, axis=1, inplace=True)

In [6]:
col=['pitch_type']
hr_df.dropna(subset=col, inplace=True)

In [7]:
len(hr_df.player_name.unique())

759

In [8]:
hr_df.player_name.value_counts().index[hr_df.player_name.value_counts() == 1]

Index(['Jaime Garcia', 'Jon Singleton', 'Clayton Richard', 'Robbie Ray',
       'Ramiro Pena', 'Michael Martinez', 'Gavin Cecchini', 'Daniel Norris',
       'Charlie Culberson', 'Brennan Boesch',
       ...
       'Hector Gomez', 'Chris Rusin', 'Alex Verdugo', 'Collin Cowgill',
       'Shane Victorino', 'Mat Latos', 'Jacob Hannemann', 'Chris Marrero',
       'Pete Kozma', 'Alex Mejia'],
      dtype='object', length=110)

In [9]:
hr_df['game_date'] = pd.to_datetime(hr_df['game_date'])

In [10]:
hr_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16602 entries, 0 to 16625
Data columns (total 36 columns):
pitch_type            16602 non-null object
game_date             16602 non-null datetime64[ns]
release_speed         16600 non-null float64
release_pos_x         16491 non-null float64
release_pos_z         16491 non-null float64
player_name           16602 non-null object
batter                16602 non-null int64
pitcher               16602 non-null int64
zone                  16600 non-null float64
p_throws              16602 non-null object
home_team             16602 non-null object
away_team             16602 non-null object
bb_type               16602 non-null object
game_year             16602 non-null int64
pfx_x                 16600 non-null float64
pfx_z                 16600 non-null float64
plate_x               16600 non-null float64
vx0                   16600 non-null float64
vy0                   16600 non-null float64
vz0                   16600 non-null floa

In [11]:
hr_df.head()

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,zone,p_throws,...,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,release_pos_y,iso_value,launch_speed_angle,pitch_name
0,CH,2017-06-10,84.5,-3.402,6.4696,Aaron Judge,592450,501957,4.0,R,...,121.1,24.626,84.12,1682.0,6.342,491020,54.1561,3,6,Changeup
1,FF,2017-04-28,97.1,-2.8091,5.9279,Aaron Judge,592450,592332,5.0,R,...,119.4,16.541,96.019,2335.0,6.043,490431,54.4549,3,6,4-Seam Fastball
2,CU,2015-06-23,86.7,-1.5647,5.3406,Giancarlo Stanton,519317,593372,7.0,R,...,119.2,21.97,84.343,,5.035,414723,55.4629,3,6,Curveball
3,SL,2017-09-28,89.5,2.0682,6.1177,Giancarlo Stanton,519317,571521,8.0,L,...,118.7,22.736,88.69,2173.0,5.433,492474,55.0652,3,6,Slider
4,SL,2017-06-11,84.7,-1.9795,5.686,Aaron Judge,592450,548337,5.0,R,...,118.6,28.419,84.757,2183.0,6.153,491036,54.3442,3,6,Slider


In [12]:
hr_df['launch_speed'].describe()

count    16602.000000
mean       103.231159
std          4.415599
min         36.600000
25%        100.500000
50%        103.300000
75%        106.200000
max        121.100000
Name: launch_speed, dtype: float64

- CH : Changeup
- CU : Curveball
- EP : Eephus
- FC : Cut Fastball (Cutter)
- FF : Four-seam Fastball
- FO : Pitch Out
- FS : Sinking Fastball / Split-Fingered (Splitter)
- FT : Two-seam Fastball
- KC : Knuckle-curve
- KN : Knuckleball
- SC : Screwball
- SI : Sinker
- SL : Slider

In [13]:
hr_df[(hr_df.pitch_type == 'FF') & (hr_df.pitcher == 517414)]

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,zone,p_throws,...,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,release_pos_y,iso_value,launch_speed_angle,pitch_name
1066,FF,2015-05-09,94.3,-0.7633,6.2443,Justin Upton,457708,517414,8.0,R,...,109.7,19.077,94.305,2109.0,6.064,414106,54.4355,3,6,4-Seam Fastball
2828,FF,2016-07-01,93.4,-0.7002,6.4708,Jarrett Parker,592620,517414,13.0,R,...,107.3,32.845,94.11,2114.0,6.337,448070,54.1639,3,6,4-Seam Fastball
3253,FF,2015-04-15,94.7,-1.9226,6.6718,Justin Upton,457708,517414,2.0,R,...,106.9,22.2869,,,,413769,54.2295,3,6,4-Seam Fastball
3960,FF,2016-05-31,92.5,-2.7624,6.2747,Luis Valbuena,472528,517414,4.0,R,...,106.3,27.292,93.546,2360.0,6.383,447643,54.1182,3,6,4-Seam Fastball
5812,FF,2016-08-02,92.4,-1.2452,6.2537,Jayson Werth,150029,517414,4.0,R,...,104.9,19.406,92.23,2275.0,6.004,448460,54.4976,3,6,4-Seam Fastball
9267,FF,2017-04-04,,,,Brandon Crawford,543063,517414,,R,...,102.8,30.199,,,,490118,,3,6,4-Seam Fastball
12014,FF,2017-06-09,94.3,-1.4991,6.1734,Hernan Perez,541650,517414,4.0,R,...,100.9,35.227,94.207,2270.0,5.966,491013,54.5331,3,5,4-Seam Fastball


In [14]:
pitch_avg = hr_df.groupby(['pitcher', 'pitch_type'])[['release_speed', 
                                                      'release_pos_x', 
                                                      'release_pos_y',
                                                      'release_pos_z',
                                                      'release_spin_rate',
                                                      'release_extension',
                                                      'pfx_x',
                                                      'pfx_z',
                                                      'plate_x',
                                                      'vx0',
                                                      'vy0',
                                                      'vz0',
                                                      'ax',
                                                      'ay',
                                                      'az',
                                                      'effective_speed']].mean()

In [15]:
pitch_avg.reset_index(inplace=True)

In [16]:
pitch_avg[(pitch_avg['pitcher']==112526)]['release_speed']

0    81.540000
1    91.388235
2    87.585106
3    81.700000
Name: release_speed, dtype: float64

In [17]:
pitch_avg.head()

Unnamed: 0,pitcher,pitch_type,release_speed,release_pos_x,release_pos_y,release_pos_z,release_spin_rate,release_extension,pfx_x,pfx_z,plate_x,vx0,vy0,vz0,ax,ay,az,effective_speed
0,112526,CH,81.54,-1.39152,55.25098,5.75408,1637.6,5.2492,-1.142933,0.8357,-0.22384,4.45196,-118.45622,-2.98594,-10.6673,22.15458,-25.1512,79.8946
1,112526,FF,91.388235,-0.988865,55.251547,5.992818,2276.5625,5.2575,-0.717337,1.424175,-0.022853,3.560453,-132.811671,-4.633582,-8.148265,29.473624,-16.233365,89.426625
2,112526,FT,87.585106,-1.425957,55.216726,5.713583,2109.446809,5.283426,-1.339216,0.963503,-0.173119,5.304594,-127.276523,-3.549502,-14.556315,26.805947,-22.843936,85.687319
3,112526,SL,81.7,-1.5452,55.437425,5.83525,2309.0,5.06125,0.077197,0.502796,0.172225,3.291075,-118.803012,-1.377837,1.219612,21.471113,-28.5423,79.894375
4,115629,CH,84.3,-1.8307,54.3197,6.3655,1669.0,6.148,-1.051283,1.3653,-0.379,4.877,-122.597,-4.7,-10.074,24.338,-19.731,83.946


In [18]:
def pitcher_avg(pitch, col):
    return float(pitch_avg[(pitch_avg['pitcher']==pitch.pitcher) & 
                           (pitch_avg['pitch_type']==pitch.pitch_type)][col])

In [19]:
np.isnan(hr_df.loc[9267, 'release_speed'])

True

In [20]:
float(pitcher_avg(hr_df.loc[9267, :], 'release_speed'))

93.59999999999998

In [21]:
hr_df['release_speed'] = hr_df.apply(lambda x: pitcher_avg(x, 'release_speed') 
                                     if np.isnan(x['release_speed']) 
                                     else x['release_speed'],1)

In [22]:
hr_df[(hr_df.pitch_type == 'FF') & (hr_df.pitcher == 517414)]

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,zone,p_throws,...,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,release_pos_y,iso_value,launch_speed_angle,pitch_name
1066,FF,2015-05-09,94.3,-0.7633,6.2443,Justin Upton,457708,517414,8.0,R,...,109.7,19.077,94.305,2109.0,6.064,414106,54.4355,3,6,4-Seam Fastball
2828,FF,2016-07-01,93.4,-0.7002,6.4708,Jarrett Parker,592620,517414,13.0,R,...,107.3,32.845,94.11,2114.0,6.337,448070,54.1639,3,6,4-Seam Fastball
3253,FF,2015-04-15,94.7,-1.9226,6.6718,Justin Upton,457708,517414,2.0,R,...,106.9,22.2869,,,,413769,54.2295,3,6,4-Seam Fastball
3960,FF,2016-05-31,92.5,-2.7624,6.2747,Luis Valbuena,472528,517414,4.0,R,...,106.3,27.292,93.546,2360.0,6.383,447643,54.1182,3,6,4-Seam Fastball
5812,FF,2016-08-02,92.4,-1.2452,6.2537,Jayson Werth,150029,517414,4.0,R,...,104.9,19.406,92.23,2275.0,6.004,448460,54.4976,3,6,4-Seam Fastball
9267,FF,2017-04-04,93.6,,,Brandon Crawford,543063,517414,,R,...,102.8,30.199,,,,490118,,3,6,4-Seam Fastball
12014,FF,2017-06-09,94.3,-1.4991,6.1734,Hernan Perez,541650,517414,4.0,R,...,100.9,35.227,94.207,2270.0,5.966,491013,54.5331,3,5,4-Seam Fastball


In [23]:
hr_df['release_pos_x'] = hr_df.apply(lambda x: pitcher_avg(x, 'release_pos_x') 
                                     if np.isnan(x['release_pos_x']) 
                                     else x['release_pos_x'],1)

In [24]:
hr_df[hr_df['release_pos_x'].isnull()]

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,zone,p_throws,...,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,release_pos_y,iso_value,launch_speed_angle,pitch_name
7232,CH,2016-09-27,83.2,,,Hunter Pence,452254,608566,7.0,R,...,104.0,24.0,,,,449217,,3,6,Changeup
8849,FF,2015-04-06,91.5,,,Alex Rios,425567,475138,13.0,R,...,103.0,30.0,,,,413653,,3,6,4-Seam Fastball
8947,CH,2016-06-11,86.9,,,Edwin Encarnacion,429665,488846,7.0,L,...,103.0,30.0,,,,447787,,3,6,Changeup
8954,CH,2015-04-30,81.5,,,Darin Ruf,573131,608641,14.0,L,...,103.0,30.0,,,,413978,,3,6,Changeup
8956,CU,2015-05-13,80.4,,,Buster Posey,457763,465679,2.0,R,...,103.0,30.0,,,,414160,,3,6,Curveball
8976,CU,2015-06-17,70.3,,,Todd Frazier,453943,465657,12.0,R,...,103.0,30.0,,,,414632,,3,6,Curveball


In [25]:
pitch_avg[pitch_avg['pitcher'] == 608566]

Unnamed: 0,pitcher,pitch_type,release_speed,release_pos_x,release_pos_y,release_pos_z,release_spin_rate,release_extension,pfx_x,pfx_z,plate_x,vx0,vy0,vz0,ax,ay,az,effective_speed
2834,608566,CH,83.2,,,,,,-1.286475,0.4365,-0.483,7.225,-120.863,-4.047,-12.354,22.388,-29.653,
2835,608566,CU,81.7,-2.6466,55.7998,6.14795,2784.5,4.7,0.518,-0.56035,0.3949,5.725,-118.46855,-1.367,3.3039,22.65395,-37.2675,79.138
2836,608566,FF,94.66,-2.742973,55.28748,6.117993,2153.066667,5.212333,-0.335233,1.369593,-0.1292,7.34782,-137.26556,-6.130647,-5.505493,26.682973,-14.390487,92.984
2837,608566,FT,92.025,-3.023667,54.9588,6.016267,2142.0,5.543667,-1.348821,1.247108,-0.178925,9.639075,-133.402525,-5.359075,-17.3365,26.526925,-17.52105,90.827667
2838,608566,SL,87.64,-2.92792,55.50422,6.22986,1999.0,4.995,0.16838,0.93808,-0.1464,6.20562,-127.14482,-4.1856,0.69356,23.25264,-21.75152,85.7536


In [26]:
hr_df['release_pos_z'] = hr_df.apply(lambda x: pitcher_avg(x, 'release_pos_z') 
                                     if np.isnan(x['release_pos_z']) 
                                     else x['release_pos_z'],1)

In [27]:
hr_df['release_pos_y'] = hr_df.apply(lambda x: pitcher_avg(x, 'release_pos_y') 
                                     if np.isnan(x['release_pos_y']) 
                                     else x['release_pos_y'],1)

In [28]:
hr_df['release_spin_rate'] = hr_df.apply(lambda x: pitcher_avg(x, 'release_spin_rate') 
                                     if np.isnan(x['release_spin_rate']) 
                                     else x['release_spin_rate'],1)

In [29]:
hr_df['release_extension'] = hr_df.apply(lambda x: pitcher_avg(x, 'release_extension') 
                                     if np.isnan(x['release_extension']) 
                                     else x['release_extension'],1)

In [30]:
hr_df['pfx_x'] = hr_df.apply(lambda x: pitcher_avg(x, 'pfx_x') 
                                     if np.isnan(x['pfx_x']) 
                                     else x['pfx_x'],1)

In [31]:
hr_df['pfx_z'] = hr_df.apply(lambda x: pitcher_avg(x, 'pfx_z') 
                                     if np.isnan(x['pfx_z']) 
                                     else x['pfx_z'],1)

In [32]:
hr_df['plate_x'] = hr_df.apply(lambda x: pitcher_avg(x, 'plate_x') 
                                     if np.isnan(x['plate_x']) 
                                     else x['plate_x'],1)

In [33]:
hr_df['vy0'] = hr_df.apply(lambda x: pitcher_avg(x, 'vx0') 
                                     if np.isnan(x['vy0']) 
                                     else x['vy0'],1)

In [34]:
hr_df['vz0'] = hr_df.apply(lambda x: pitcher_avg(x, 'vz0') 
                                     if np.isnan(x['vz0']) 
                                     else x['vz0'],1)

In [35]:
hr_df['ax'] = hr_df.apply(lambda x: pitcher_avg(x, 'ax') 
                                     if np.isnan(x['ax']) 
                                     else x['ax'],1)

In [36]:
hr_df['ay'] = hr_df.apply(lambda x: pitcher_avg(x, 'ay') 
                                     if np.isnan(x['ay']) 
                                     else x['ay'],1)

In [37]:
hr_df['az'] = hr_df.apply(lambda x: pitcher_avg(x, 'az') 
                                     if np.isnan(x['az']) 
                                     else x['az'],1)

In [38]:
hr_df['effective_speed'] = hr_df.apply(lambda x: pitcher_avg(x, 'effective_speed') 
                                     if np.isnan(x['effective_speed']) 
                                     else x['effective_speed'],1)

In [39]:
pitch_avg.head()

Unnamed: 0,pitcher,pitch_type,release_speed,release_pos_x,release_pos_y,release_pos_z,release_spin_rate,release_extension,pfx_x,pfx_z,plate_x,vx0,vy0,vz0,ax,ay,az,effective_speed
0,112526,CH,81.54,-1.39152,55.25098,5.75408,1637.6,5.2492,-1.142933,0.8357,-0.22384,4.45196,-118.45622,-2.98594,-10.6673,22.15458,-25.1512,79.8946
1,112526,FF,91.388235,-0.988865,55.251547,5.992818,2276.5625,5.2575,-0.717337,1.424175,-0.022853,3.560453,-132.811671,-4.633582,-8.148265,29.473624,-16.233365,89.426625
2,112526,FT,87.585106,-1.425957,55.216726,5.713583,2109.446809,5.283426,-1.339216,0.963503,-0.173119,5.304594,-127.276523,-3.549502,-14.556315,26.805947,-22.843936,85.687319
3,112526,SL,81.7,-1.5452,55.437425,5.83525,2309.0,5.06125,0.077197,0.502796,0.172225,3.291075,-118.803012,-1.377837,1.219612,21.471113,-28.5423,79.894375
4,115629,CH,84.3,-1.8307,54.3197,6.3655,1669.0,6.148,-1.051283,1.3653,-0.379,4.877,-122.597,-4.7,-10.074,24.338,-19.731,83.946


In [40]:
hr_df[hr_df['zone'].isnull()][['pitch_type']]

Unnamed: 0,pitch_type
9267,FF
9357,CH


In [41]:
hr_df[(hr_df['pitcher'] == 517414) & (hr_df['pitch_type'] == 'FF')]

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,zone,p_throws,...,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,release_pos_y,iso_value,launch_speed_angle,pitch_name
1066,FF,2015-05-09,94.3,-0.7633,6.2443,Justin Upton,457708,517414,8.0,R,...,109.7,19.077,94.305,2109.0,6.064,414106,54.4355,3,6,4-Seam Fastball
2828,FF,2016-07-01,93.4,-0.7002,6.4708,Jarrett Parker,592620,517414,13.0,R,...,107.3,32.845,94.11,2114.0,6.337,448070,54.1639,3,6,4-Seam Fastball
3253,FF,2015-04-15,94.7,-1.9226,6.6718,Justin Upton,457708,517414,2.0,R,...,106.9,22.2869,93.6796,2225.6,6.1508,413769,54.2295,3,6,4-Seam Fastball
3960,FF,2016-05-31,92.5,-2.7624,6.2747,Luis Valbuena,472528,517414,4.0,R,...,106.3,27.292,93.546,2360.0,6.383,447643,54.1182,3,6,4-Seam Fastball
5812,FF,2016-08-02,92.4,-1.2452,6.2537,Jayson Werth,150029,517414,4.0,R,...,104.9,19.406,92.23,2275.0,6.004,448460,54.4976,3,6,4-Seam Fastball
9267,FF,2017-04-04,93.6,-1.482133,6.348117,Brandon Crawford,543063,517414,,R,...,102.8,30.199,93.6796,2225.6,6.1508,490118,54.329633,3,6,4-Seam Fastball
12014,FF,2017-06-09,94.3,-1.4991,6.1734,Hernan Perez,541650,517414,4.0,R,...,100.9,35.227,94.207,2270.0,5.966,491013,54.5331,3,5,4-Seam Fastball


In [42]:
hr_df.reset_index(inplace=True)

In [43]:
hr_df[(hr_df['pitcher'] == 517414) & (hr_df['pitch_type'] == 'FF')]

Unnamed: 0,index,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,zone,...,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,release_pos_y,iso_value,launch_speed_angle,pitch_name
1066,1066,FF,2015-05-09,94.3,-0.7633,6.2443,Justin Upton,457708,517414,8.0,...,109.7,19.077,94.305,2109.0,6.064,414106,54.4355,3,6,4-Seam Fastball
2827,2828,FF,2016-07-01,93.4,-0.7002,6.4708,Jarrett Parker,592620,517414,13.0,...,107.3,32.845,94.11,2114.0,6.337,448070,54.1639,3,6,4-Seam Fastball
3252,3253,FF,2015-04-15,94.7,-1.9226,6.6718,Justin Upton,457708,517414,2.0,...,106.9,22.2869,93.6796,2225.6,6.1508,413769,54.2295,3,6,4-Seam Fastball
3959,3960,FF,2016-05-31,92.5,-2.7624,6.2747,Luis Valbuena,472528,517414,4.0,...,106.3,27.292,93.546,2360.0,6.383,447643,54.1182,3,6,4-Seam Fastball
5809,5812,FF,2016-08-02,92.4,-1.2452,6.2537,Jayson Werth,150029,517414,4.0,...,104.9,19.406,92.23,2275.0,6.004,448460,54.4976,3,6,4-Seam Fastball
9248,9267,FF,2017-04-04,93.6,-1.482133,6.348117,Brandon Crawford,543063,517414,,...,102.8,30.199,93.6796,2225.6,6.1508,490118,54.329633,3,6,4-Seam Fastball
11993,12014,FF,2017-06-09,94.3,-1.4991,6.1734,Hernan Perez,541650,517414,4.0,...,100.9,35.227,94.207,2270.0,5.966,491013,54.5331,3,5,4-Seam Fastball


In [44]:
hr_df['zone'].iloc[9248] = 4

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [45]:
hr_df['zone'].iloc[9248]

4.0

In [46]:
hr_df[(hr_df['pitcher'] == 573186) & (hr_df['pitch_type'] == 'CH')]

Unnamed: 0,index,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,zone,...,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,release_pos_y,iso_value,launch_speed_angle,pitch_name
7295,7306,CH,2016-06-19,85.5,-1.0559,5.5616,Matt Wieters,446308,573186,4.0,...,104.0,24.0,82.673,1474.5,5.6465,447910,54.9406,3,6,Changeup
9338,9357,CH,2017-06-22,83.725,-1.016825,5.5545,Robinson Chirinos,455139,573186,,...,102.8,30.199,82.673,1474.5,5.6465,491197,54.950825,3,6,Changeup
10677,10697,CH,2016-05-28,83.9,-0.8911,5.3561,Travis Shaw,543768,573186,4.0,...,101.9,35.761,83.308,1384.0,5.807,447595,54.693,3,6,Changeup
14968,14992,CH,2016-04-19,83.3,-0.9497,5.7024,Matt Wieters,446308,573186,5.0,...,97.6,27.999,82.038,1565.0,5.486,447072,55.0126,3,6,Changeup
15655,15679,CH,2015-09-30,82.2,-1.1706,5.5979,Steve Pearce,456665,573186,1.0,...,96.1,34.612,82.673,1474.5,5.6465,416021,55.1571,3,3,Changeup


In [47]:
hr_df['zone'].iloc[9338] = 4

In [48]:
hr_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16602 entries, 0 to 16601
Data columns (total 37 columns):
index                 16602 non-null int64
pitch_type            16602 non-null object
game_date             16602 non-null datetime64[ns]
release_speed         16602 non-null float64
release_pos_x         16596 non-null float64
release_pos_z         16596 non-null float64
player_name           16602 non-null object
batter                16602 non-null int64
pitcher               16602 non-null int64
zone                  16602 non-null float64
p_throws              16602 non-null object
home_team             16602 non-null object
away_team             16602 non-null object
bb_type               16602 non-null object
game_year             16602 non-null int64
pfx_x                 16602 non-null float64
pfx_z                 16602 non-null float64
plate_x               16602 non-null float64
vx0                   16600 non-null float64
vy0                   16602 non-null float6

In [49]:
batter_distance = hr_df.groupby(['pitch_type', 'batter', 'bb_type']).mean()[['hit_distance_sc']]

In [50]:
batter_distance.reset_index(inplace=True)

In [51]:
batter_distance.head()

Unnamed: 0,pitch_type,batter,bb_type,hit_distance_sc
0,CH,116338,fly_ball,391.666667
1,CH,120074,fly_ball,379.0
2,CH,120074,line_drive,413.666667
3,CH,121347,fly_ball,385.5
4,CH,121347,line_drive,412.0


In [52]:
def pitcher_avg(pitch, col):
    return float(pitch_avg[(pitch_avg['pitcher']==pitch.pitcher) & 
                           (pitch_avg['pitch_type']==pitch.pitch_type)][col])

In [53]:
def batter_avg_distance(pitch, col):
    return float(batter_distance[(batter_distance['batter']==pitch.batter) &
                                 (batter_distance['pitch_type']==pitch.pitch_type) &
                                 (batter_distance['bb_type']==pitch.bb_type)][col])

In [54]:
hr_df['hit_distance_sc'] = hr_df.apply(lambda x: batter_avg_distance(x, 'hit_distance_sc') 
                                       if np.isnan(x['hit_distance_sc']) 
                                       else x['hit_distance_sc'],1)

In [55]:
hr_df.dropna(inplace=True)

In [78]:
hr_df.to_csv('../data/clean_hr.csv')

### Baseballs

In [56]:
baseballs_df.head()

Unnamed: 0,BALL CODE,Year,SN,Weight (oz),Circumference (in),AVG Seam Height,STD of Seam Height,AVG CCOR,AVG DS,# OF GOOD SHOTS BEFORE DAMAGE
0,MSCC0051,2014-05-15,731,5.135,9.13,0.03587,0.01049,0.491,12133,
1,MSCC0032,2014-07-15,228,5.149,9.09,0.04403,0.01781,0.489,12468,
2,MSCC0030,2015-04-15,196,5.143,9.06,0.03726,0.0064,0.489,12518,
3,MSCC0045,2015-04-15,351,5.109,9.09,0.04574,0.01216,0.496,13442,
4,MSCC0048,2015-04-15,499,5.192,9.19,0.04862,0.00828,0.474,12394,5.0


In [57]:
baseballs_df.columns = [i.lower().replace(' ', '_') for i in baseballs_df.columns]

In [58]:
baseballs_df.head()

Unnamed: 0,ball_code,year,sn,weight_(oz),circumference_(in),avg_seam_height,std_of_seam_height,avg_ccor,avg_ds,#_of_good_shots_before_damage
0,MSCC0051,2014-05-15,731,5.135,9.13,0.03587,0.01049,0.491,12133,
1,MSCC0032,2014-07-15,228,5.149,9.09,0.04403,0.01781,0.489,12468,
2,MSCC0030,2015-04-15,196,5.143,9.06,0.03726,0.0064,0.489,12518,
3,MSCC0045,2015-04-15,351,5.109,9.09,0.04574,0.01216,0.496,13442,
4,MSCC0048,2015-04-15,499,5.192,9.19,0.04862,0.00828,0.474,12394,5.0


In [59]:
col = [
    'ball_code',
    '#_of_good_shots_before_damage',
    'sn'
]

baseballs_df.drop(col, axis=1, inplace=True)

In [60]:
baseballs_df.head()

Unnamed: 0,year,weight_(oz),circumference_(in),avg_seam_height,std_of_seam_height,avg_ccor,avg_ds
0,2014-05-15,5.135,9.13,0.03587,0.01049,0.491,12133
1,2014-07-15,5.149,9.09,0.04403,0.01781,0.489,12468
2,2015-04-15,5.143,9.06,0.03726,0.0064,0.489,12518
3,2015-04-15,5.109,9.09,0.04574,0.01216,0.496,13442
4,2015-04-15,5.192,9.19,0.04862,0.00828,0.474,12394


In [61]:
baseballs_df['year'] = pd.to_datetime(baseballs_df['year'])

In [62]:
baseballs_df.set_index('year', inplace=True)

In [63]:
baseballs_df.head()

Unnamed: 0_level_0,weight_(oz),circumference_(in),avg_seam_height,std_of_seam_height,avg_ccor,avg_ds
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-05-15,5.135,9.13,0.03587,0.01049,0.491,12133
2014-07-15,5.149,9.09,0.04403,0.01781,0.489,12468
2015-04-15,5.143,9.06,0.03726,0.0064,0.489,12518
2015-04-15,5.109,9.09,0.04574,0.01216,0.496,13442
2015-04-15,5.192,9.19,0.04862,0.00828,0.474,12394


In [64]:
bb_15 = baseballs_df['2015'][['weight_(oz)',
                      'circumference_(in)',
                      'avg_seam_height',
                      'std_of_seam_height',
                      'avg_ccor',
                      'avg_ds']].mean().to_frame().T
bb_15['year'] = 2015
bb_15

Unnamed: 0,weight_(oz),circumference_(in),avg_seam_height,std_of_seam_height,avg_ccor,avg_ds,year
0,5.120125,9.107917,0.040502,0.011194,0.491333,12740.875,2015


In [65]:
bb_16 = baseballs_df['2016'][['weight_(oz)',
                      'circumference_(in)',
                      'avg_seam_height',
                      'std_of_seam_height',
                      'avg_ccor',
                      'avg_ds']].mean().to_frame().T
bb_16['year'] = 2016
bb_16

Unnamed: 0,weight_(oz),circumference_(in),avg_seam_height,std_of_seam_height,avg_ccor,avg_ds,year
0,5.1225,9.079,0.038432,0.010468,0.4941,12926.4,2016


In [66]:
mlb_baseballs = bb_15.append(bb_16).reset_index()
mlb_baseballs.drop('index', axis=1, inplace=True)
mlb_baseballs.drop('std_of_seam_height', axis=1, inplace=True)
mlb_baseballs

Unnamed: 0,weight_(oz),circumference_(in),avg_seam_height,avg_ccor,avg_ds,year
0,5.120125,9.107917,0.040502,0.491333,12740.875,2015
1,5.1225,9.079,0.038432,0.4941,12926.4,2016


In [67]:
bb_17 = pd.DataFrame(
    [[.494,.03843,9.08,5.122,12926,2017]], columns=['avg_ccor', 
                                                   'avg_seam_height',
                                                   'circumference_(in)',
                                                   'weight_(oz)',
                                                   'avg_ds',
                                                   'year']
)
bb_17

Unnamed: 0,avg_ccor,avg_seam_height,circumference_(in),weight_(oz),avg_ds,year
0,0.494,0.03843,9.08,5.122,12926,2017


In [68]:
mlb_baseballs = mlb_baseballs.append(bb_17, sort=False).reset_index().drop('index', axis=1)

In [69]:
mlb_baseballs

Unnamed: 0,weight_(oz),circumference_(in),avg_seam_height,avg_ccor,avg_ds,year
0,5.120125,9.107917,0.040502,0.491333,12740.875,2015
1,5.1225,9.079,0.038432,0.4941,12926.4,2016
2,5.122,9.08,0.03843,0.494,12926.0,2017


### Player Personal Stats

In [75]:
url = "https://baseballsavant.mlb.com/savant-player/j-d-martinez-502110?stats=career-r-hitting-mlb"
res =  requests.get(url)

In [76]:
res.status_code

200

In [77]:
soup = BeautifulSoup(res.content, 'lxml')

In [136]:
jd_martinez_player = soup.find('section', {'id':'player'})

In [137]:

jd_martinez_player

<section id="player">
<div class="container white-bg padding">
<style>
          .player-background::after {
            content: '';
              background: url('https://www.mlbstatic.com/team-logos/111.svg') center center/10% no-repeat;
            // background: url('http://mlb.mlb.com/images/players/action_shots/502110.jpg') center center/100% no-repeat;
            // background-position: 20% 0;
            opacity: 0.2;
            top: 0;
            left: 0;
            bottom: 0;
            right: 0;
            position: absolute;
            z-index: 0;   
          }    
        
          @media (max-width: 768px) {
              .player-background::after {
              content: '';
              background: url('https://www.mlbstatic.com/team-logos/111.svg') center center/15% no-repeat;
              opacity: 0.2;
              top: 0;
              left: 0;
              bottom: 0;
              right: 0;
              position: absolute;
              z-index: 0;   

In [2]:
hr_df = pd.read_csv('../data/clean_hr.csv')

batter_id = list(hr_df['batter'].unique())

In [3]:
len(batter_id)

751

In [4]:
def get_batters_info(batter_id):
    attrs = {
        'batter_id': [str(x) for x in batter_id],
        'height':[],
        'weight':[],
        'age':[],
        'position':[],
        'bats':[],
        'throws':[]
    }

    attr_names = [
        'height',
        'weight',
        'age',
        'position',
        'bats',
        'throws'
    ]

    for idx, b in enumerate(batter_id):
        url = "https://baseballsavant.mlb.com/savant-player/{}?stats=career-r-hitting-mlb".format(b)
        res =  requests.get(url)
        soup = BeautifulSoup(res.content, 'lxml')
        for col_name, attribute in zip(attr_names, soup.find_all('div', {'class':'box-text'})):
            attrs[col_name].append(str(attribute.text))
        if idx % 50 == 0:
            with open('../data/baseball_player.json', 'w') as f:
                json.dump(attrs, f)
    
    with open('../data/baseball_player.json', 'w') as f:
        json.dump(attrs, f)
    
    return attrs

In [None]:
batters_info = get_batters_info(batter_id)

In [2]:
import pandas as pd
import json

In [3]:
pd.read_json('../data/baseball_player.json')

ValueError: arrays must all be same length

In [31]:
type(a['batter_id'][0])

str

In [54]:
df_a = pd.DataFrame(a)

In [61]:
def height(height_str):
    feet, inches = height_str.split("' ")
    feet = int(feet)
    inches = int(inches[:-1])
    return 12 * feet + inches

In [62]:
df_a['height'].map(height)

0    79
1    78
2    73
3    74
4    75
5    77
6    75
7    77
8    76
9    75
Name: height, dtype: int64

"'"

In [41]:
a['batter_id']

['592450',
 '519317',
 '471865',
 '443558',
 '121347',
 '608336',
 '547180',
 '573627',
 '541645',
 '543807']

In [None]:
pd.DataFrame(batter_final_dict)

In [13]:
8 * len(batter_id) /60

100.13333333333334