# Hypothesis testing for humans: Data Pull

In [1]:
import pandas as pd
import pybaseball

In [2]:
date_range = pd.date_range('2018-03-29', pd.datetime.today()).strftime('%Y-%m-%d')

In [3]:
date_range = date_range[:-1]

In [4]:
from concurrent.futures import ThreadPoolExecutor, as_completed

results = []
errors = []
with ThreadPoolExecutor() as executor:
    futures = {executor.submit(pybaseball.statcast, date): date for date in date_range}
    for future in as_completed(futures):
        try:
            results.append(future.result())
        except Exception as err:
            errors.append(err)
            print(f'could not get data for data {futures[future]}')

could not get data for data 2018-05-12
could not get data for data 2018-05-11
could not get data for data 2018-07-16
could not get data for data 2018-07-17
could not get data for data 2018-07-18
could not get data for data 2018-08-15


In [5]:
df = pd.concat(results)

In [6]:
df.shape

(650842, 90)

# feature engineering

In [7]:
# take only pitches in bottom of an extra inning where batter did not swing
data = df.loc[
    (df.inning > 9)
    & (df.inning_topbot == 'Bot')
    & ((df.description == 'called_strike') | (df.type == 'B'))]

In [8]:
data.shape

(3115, 90)

In [9]:
with pd.option_context('display.max_columns', None):
    display(data.head())

Unnamed: 0,index,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,spin_dir,spin_rate_deprecated,break_angle_deprecated,break_length_deprecated,zone,des,game_type,stand,p_throws,home_team,away_team,type,hit_location,bb_type,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,hc_x,hc_y,tfs_deprecated,tfs_zulu_deprecated,pos2_person_id,umpire,sv_id,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,pos1_person_id,pos2_person_id.1,pos3_person_id,pos4_person_id,pos5_person_id,pos6_person_id,pos7_person_id,pos8_person_id,pos9_person_id,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment
1919,2,FF,2018-04-06,92.6,-2.8473,6.186,Brad Brach,592122.0,542960.0,,called_strike,,,,,13.0,,R,R,R,NYY,BAL,S,,,0.0,0.0,2018.0,-0.5382,1.7198,-0.9484,2.3415,,,,2.0,14.0,Bot,,,,,543376.0,,180407_042840,5.8862,-134.4473,-7.4873,-7.6452,30.1096,-10.546,3.4636,1.6351,,,,91.006,2265.0,5.521,529509.0,542960.0,543376.0,641820.0,570731.0,542921.0,592518.0,458675.0,430945.0,623993.0,54.9783,,,,,,,,115.0,1.0,4-Seam Fastball,3.0,7.0,3.0,7.0,7.0,3.0,3.0,7.0,Standard,Standard
1921,4,FF,2018-04-06,92.4,-2.7254,6.1289,Brad Brach,591720.0,542960.0,,called_strike,,,,,9.0,,R,R,R,NYY,BAL,S,,,2.0,1.0,2018.0,-0.5107,1.7237,0.7218,1.6812,,,,1.0,14.0,Bot,,,,,543376.0,,180407_042753,9.7326,-133.748,-8.9756,-8.2223,30.6771,-10.1566,3.0535,1.4243,,,,90.912,2382.0,5.824,529509.0,542960.0,543376.0,641820.0,570731.0,542921.0,592518.0,458675.0,430945.0,623993.0,54.6751,,,,,,,,114.0,4.0,4-Seam Fastball,3.0,7.0,3.0,7.0,7.0,3.0,3.0,7.0,Standard,Standard
1922,5,SL,2018-04-06,84.4,-3.1374,6.2782,Brad Brach,591720.0,542960.0,,ball,,,,,14.0,,R,R,R,NYY,BAL,B,,,1.0,1.0,2018.0,0.4269,0.2367,2.2032,0.8762,,,,1.0,14.0,Bot,,,,,543376.0,,180407_042734,11.3355,-122.1434,-6.4141,1.7712,25.2011,-28.8525,3.1366,1.4451,,,,82.335,2592.0,5.32,529509.0,542960.0,543376.0,641820.0,570731.0,542921.0,592518.0,458675.0,430945.0,623993.0,55.1792,,,,,,,,114.0,3.0,Slider,3.0,7.0,3.0,7.0,7.0,3.0,3.0,7.0,Standard,Strategic
1923,6,FF,2018-04-06,92.8,-2.6519,6.1802,Brad Brach,591720.0,542960.0,,called_strike,,,,,7.0,,R,R,R,NYY,BAL,S,,,1.0,0.0,2018.0,-0.5057,1.568,-0.3851,1.751,,,,1.0,14.0,Bot,,,,,543376.0,,180407_042714,6.7467,-134.5335,-8.6691,-7.4349,29.2719,-12.0451,3.0951,1.4676,,,,91.186,2212.0,5.528,529509.0,542960.0,543376.0,641820.0,570731.0,542921.0,592518.0,458675.0,430945.0,623993.0,54.9714,,,,,,,,114.0,2.0,4-Seam Fastball,3.0,7.0,3.0,7.0,7.0,3.0,3.0,7.0,Standard,Strategic
1924,7,FF,2018-04-06,92.0,-2.6388,6.2926,Brad Brach,591720.0,542960.0,,ball,,,,,11.0,,R,R,R,NYY,BAL,B,,,0.0,0.0,2018.0,-0.8124,1.5153,-1.0701,2.3643,,,,1.0,14.0,Bot,,,,,543376.0,,180407_042700,5.6425,-133.6211,-7.2478,-10.7667,28.6822,-13.037,2.9593,1.4049,,,,91.065,2254.0,5.819,529509.0,542960.0,543376.0,641820.0,570731.0,542921.0,592518.0,458675.0,430945.0,623993.0,54.6804,,,,,,,,114.0,1.0,4-Seam Fastball,3.0,7.0,3.0,7.0,7.0,3.0,3.0,7.0,Standard,Standard


- identify which team had the advantage
- identify incorrect calls
- identify if these calls helped the advantaged

In [10]:
import numpy as np
data['away_team_advantage'] = np.where(data.away_score > data.home_score, 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [11]:
data['is_strike'] = (data.plate_x.abs() < 8.5) & (data.plate_z < data.sz_top) & (data.plate_z > data.sz_bot)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [15]:
data['strike_call_incorrect'] = np.where((data.type == 'S') & ~data.is_strike, 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [17]:
df = data.groupby([
    'game_pk', 'inning', 'away_team_advantage'
], as_index=False).strike_call_incorrect.sum()
df.head()

Unnamed: 0,game_pk,inning,away_team_advantage,strike_call_incorrect
0,529412.0,10.0,0,0
1,529412.0,11.0,0,1
2,529413.0,10.0,0,0
3,529413.0,11.0,0,0
4,529413.0,12.0,1,0


In [18]:
df[[
    'game_pk', 'inning', 'away_team_advantage', 'strike_call_incorrect',
]].to_csv('hypothesis_testing_for_humans.csv', index=False)