In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
pip install pybaseball

In [73]:
from pybaseball import playerid_lookup
from pybaseball import statcast_pitcher
from pybaseball import statcast
from pybaseball import cache

# Switch on pybaseball's cache before any query is issued.
# NOTE(review): presumably this lets repeated statcast() calls reuse earlier
# downloads instead of hitting the network again — confirm against the pybaseball docs.
cache.enable()

In [74]:
# Columns of the pitch-arsenal CSV that are discarded after sorting.
# ' first_name' is spelled with a leading space on purpose: that is the label
# this notebook has always dropped (presumably how the CSV header reads — verify).
_ARSENAL_UNUSED_COLUMNS = ['pitch_type', 'run_value', 'team_name_alt',
                           'est_ba', 'est_slg', 'est_woba',
                           'hard_hit_percent', 'pitches',
                           'pitch_usage', 'pa', 'put_away',
                           ' first_name', 'last_name']

# Stat columns that receive a 'p_' / 'b_' prefix so pitcher and batter stats
# can sit side by side after the later merges.
_ARSENAL_STAT_COLUMNS = ['run_value_per_100', 'ba', 'slg', 'woba',
                         'whiff_percent', 'k_percent']


def _prepare_arsenal_frame(csv_path, stat_prefix, id_column):
    '''Read one pitch-arsenal CSV, sort it, drop unused columns and rename the rest.'''
    frame = pd.read_csv(csv_path)

    # sort by 'last_name' (ascending) and 'run_value_per_100' (descending),
    # then drop the columns that are not needed any more
    frame = frame.sort_values(by=['last_name', 'run_value_per_100'],
                              ascending=[True, False])
    frame = frame.drop(columns=_ARSENAL_UNUSED_COLUMNS)

    # rename so the frame can be merged with the pybaseball dataframe later
    new_names = {stat: stat_prefix + '_' + stat for stat in _ARSENAL_STAT_COLUMNS}
    new_names['player_id'] = id_column
    new_names['pitch_name'] = 'pitch_type'
    return frame.rename(columns=new_names)


def get_statcast_data(pitcher_csv, batter_csv):

    '''
    function for obtaining the pitch-arsenal dataframes from Statcast CSV exports

    parameters:
    pitcher_csv -- path (or anything pd.read_csv accepts) of the pitcher arsenal CSV
    batter_csv  -- path (or anything pd.read_csv accepts) of the batter arsenal CSV

    returns:
    (pitcher_dataframe, batter_dataframe)
        pitcher frame: stat columns prefixed with 'p_', id column 'pitcher_id'
        batter frame:  stat columns prefixed with 'b_', id column 'batter_id'
        in both frames 'pitch_name' is renamed to 'pitch_type'

    raises:
    KeyError if a CSV lacks one of the columns that is sorted on or dropped.

    important_note:
    these Statcast dataframes will be merged into the pybaseball dataframe at the end.
    '''

    # both files go through exactly the same pipeline; only the prefix and the
    # name of the id column differ
    df_pitchers = _prepare_arsenal_frame(pitcher_csv, 'p', 'pitcher_id')
    df_batters = _prepare_arsenal_frame(batter_csv, 'b', 'batter_id')

    return df_pitchers, df_batters

In [75]:
def get_pybaseball_data(start_date, end_date, pitcher_data, batter_data):

    '''
    function for obtaining pybaseball dataframes and finalizing into a final dataframe

    parameters:
    start_date, end_date -- passed to pybaseball's statcast() as start_dt / end_dt
    pitcher_data         -- pitcher dataframe returned by get_statcast_data
                            (merged on 'pitcher_id' and 'pitch_type')
    batter_data          -- batter dataframe returned by get_statcast_data
                            (merged on 'batter_id' and 'pitch_type')

    returns:
    one dataframe with one row per pitch that survived the filters and both
    inner merges, sorted into play-by-play order.

    raises:
    KeyError if the frame returned by statcast() lacks one of the columns that
    is used below (balls, strikes, on_*, des, or any column in the final selection).

    important_note:
    pitcher_dataframe & batter_dataframe are the dataframes obtained using get_statcast_data function!
    '''

    # API call using pybaseball library
    data = statcast(start_dt=start_date, end_dt=end_date)

    # drop unnecessary columns early to keep the working frame small.
    # errors='ignore': a column that is absent from the returned frame must not
    # abort the run -- everything still needed is selected explicitly further down.
    data = data.drop(columns=[
        'release_speed', 'release_pos_x', 'release_pos_z', 'spin_dir',
        'spin_rate_deprecated', 'break_angle_deprecated', 'break_length_deprecated',
        'zone', 'game_type', 'type', 'hit_location', 'bb_type', 'pfx_x', 'pfx_z',
        'plate_x', 'plate_z', 'hc_x', 'hc_y', 'tfs_deprecated', 'tfs_zulu_deprecated',
        'fielder_2', 'umpire', 'sv_id', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top',
        'sz_bot', 'hit_distance_sc', 'launch_speed', 'launch_angle', 'effective_speed',
        'release_spin_rate', 'release_extension', 'pitcher.1', 'fielder_2.1', 'fielder_3',
        'fielder_4', 'fielder_5', 'fielder_6', 'fielder_7', 'fielder_8', 'fielder_9',
        'release_pos_y', 'estimated_ba_using_speedangle', 'estimated_woba_using_speedangle',
        'woba_value', 'woba_denom', 'babip_value', 'iso_value', 'launch_speed_angle',
        'bat_score', 'fld_score', 'post_bat_score', 'post_fld_score',
        'if_fielding_alignment', 'of_fielding_alignment', 'spin_axis', 'delta_home_win_exp',
        'delta_run_exp', 'pitch_type'], errors='ignore')

    # rename to follow the same convention as Statcast dataframes
    data = data.rename(columns={'player_name': 'pitcher_name'})

    # combine ball_count and save as one column called 'count'
    # follow the basic convention --> 'ball'-'strike'
    # ie) '1-2' : 1 ball, 2 strikes
    ball_strike_pairs = zip(data['balls'].tolist(), data['strikes'].tolist())
    data['count'] = [str(ball) + '-' + str(strike) for ball, strike in ball_strike_pairs]
    data = data.drop(['balls', 'strikes'], axis=1)

    # reorganize runners on bases columns
    # 1 --> runner is on the base
    # 0 --> runner is not on the base (missing value or 0 in the raw column)
    # ie) if runners are on the first and third base only,
    #       'on_3b' : 1
    #       'on_2b' : 0
    #       'on_1b' : 1
    for base in ['on_3b', 'on_2b', 'on_1b']:
        data[base] = (data[base].fillna(0) != 0).astype('int64')

    # exception handling: removing the rows with infeasible ball_count (ie. 4-1).
    # A boolean mask is used instead of dropping by index label so the result
    # does not depend on the index labels being unique.
    data = data[data['count'] != '4-1']

    # extract batter names from 'des' column & save under a column called 'batter_name'
    # key_note:
    #   the first two words of the description are taken as the batter name and
    #   written as 'second_word, first_word'
    batter_names = data['des'].apply(lambda des: ', '.join(str(des).split()[:2][::-1]))
    data = data.assign(batter_name=batter_names).drop('des', axis=1)  # 'des' is not needed anymore

    # reorder the columns, then rename a few of them in the same convention
    # that is used in the Statcast dataframes
    data = data[[
        'game_pk',
        'game_date',
        'game_year',
        'home_team',
        'away_team',
        'pitcher_name',
        'p_throws',
        'pitcher',
        'pitch_number',
        'batter_name',
        'stand',
        'batter',
        'at_bat_number',
        'inning',
        'inning_topbot',
        'outs_when_up',
        'count',
        'on_3b',
        'on_2b',
        'on_1b',
        'pitch_name',
        'events',
        'description',
        'home_score',
        'away_score',
        'post_home_score',
        'post_away_score'
    ]].rename(columns={'pitcher': 'pitcher_id',
                       'batter': 'batter_id',
                       'game_pk': 'game_id',
                       'pitch_name': 'pitch_type',
                       'stand': 'b_bats'})

    # extra EDA process, in which similar pitch_types are clustered into one pitch_type.
    # Only the pitch_type column is touched, so a matching string in any other
    # column can never be rewritten by accident.
    data = data.assign(pitch_type=data['pitch_type'].replace({
        'Knuckle Curve': 'Curveball',
        'Split-Finger': 'Splitter',
        '2-Seam Fastball': 'Sinker',
        '4-Seam Fastball': '4-Seamer'}))

    # drop some uncommon pitch_types (only ~1% of the entire dataset)
    data = data[~data['pitch_type'].isin(['Knuckleball', 'Eephus', 'Fastball'])]

    # 'events' is empty for most pitches, so it is filled BEFORE the dropna;
    # the value is assigned back explicitly because an in-place fillna on a
    # selected column is not guaranteed to reach the parent frame.
    data = data.assign(events=data['events'].fillna('None'))

    # drop additional missing rows
    data = data.dropna(axis='rows')

    # merge pybaseball dataframe with Statcast dataframe (inner joins: pitches whose
    # pitcher/pitch_type or batter/pitch_type pair has no arsenal row are discarded)
    data = data.merge(pitcher_data, on=['pitcher_id', 'pitch_type'])
    data = data.merge(batter_data, on=['batter_id', 'pitch_type'])
    data = data.drop(columns=['events', 'description'])

    # to make data into a correct chronological, play-by-play format, we sort the columns accordingly
    # ('inning_topbot' descending puts 'Top' before 'Bot')
    data = data.sort_values(
        by=['game_date', 'game_id', 'inning', 'inning_topbot', 'at_bat_number', 'pitch_number'],
        ascending=[True, True, True, False, True, True])

    return data

In [76]:
###### 2019 DATA ######
# Arsenal CSVs first, then the pitch-by-pitch query for the same season.
pitcher_data, batter_data = get_statcast_data(
    pitcher_csv='2019/Pitch-Arsenal-Pitchers-2019.csv',
    batter_csv='2019/Pitch-Arsenal-Batters-2019.csv',
)
data_2019 = get_pybaseball_data(
    start_date='2019-03-20',
    end_date='2019-09-29',
    pitcher_data=pitcher_data,
    batter_data=batter_data,
)
data_2019.head()

This is a large query, it may take a moment to complete


100%|████████████████████████████████████████████████████████████████████████████████| 194/194 [00:09<00:00, 20.71it/s]


Unnamed: 0,game_id,game_date,game_year,home_team,away_team,pitcher_name,p_throws,pitcher_id,pitch_number,batter_name,...,p_slg,p_woba,p_whiff_percent,p_k_percent,b_run_value_per_100,b_ba,b_slg,b_woba,b_whiff_percent,b_k_percent
375236,565220,2019-03-28,2019,CIN,PIT,"Castillo, Luis",R,622491,1,"Frazier, Adam",...,0.504,0.388,27.5,17.9,-0.7,0.247,0.441,0.342,12.8,10.6
277684,565220,2019-03-28,2019,CIN,PIT,"Castillo, Luis",R,622491,1,"Cabrera, Melky",...,0.504,0.388,27.5,17.9,-0.3,0.33,0.468,0.364,11.9,15.2
504456,565220,2019-03-28,2019,CIN,PIT,"Castillo, Luis",R,622491,2,"Cabrera, Melky",...,0.199,0.199,47.8,47.0,-0.6,0.276,0.345,0.272,23.7,10.3
504455,565220,2019-03-28,2019,CIN,PIT,"Castillo, Luis",R,622491,4,"Cabrera, Melky",...,0.199,0.199,47.8,47.0,-0.6,0.276,0.345,0.272,23.7,10.3
506465,565220,2019-03-28,2019,CIN,PIT,"Castillo, Luis",R,622491,2,"Dickerson, Corey",...,0.199,0.199,47.8,47.0,0.2,0.317,0.537,0.379,23.5,7.0


In [77]:
###### 2020 DATA ######
# Arsenal CSVs first, then the pitch-by-pitch query for the same season.
pitcher_data, batter_data = get_statcast_data(
    pitcher_csv='2020/Pitch-Arsenal-Pitchers-2020.csv',
    batter_csv='2020/Pitch-Arsenal-Batters-2020.csv',
)
data_2020 = get_pybaseball_data(
    start_date='2020-07-23',
    end_date='2020-09-27',
    pitcher_data=pitcher_data,
    batter_data=batter_data,
)
data_2020.head()

This is a large query, it may take a moment to complete


100%|██████████████████████████████████████████████████████████████████████████████████| 67/67 [00:05<00:00, 12.86it/s]


Unnamed: 0,game_id,game_date,game_year,home_team,away_team,pitcher_name,p_throws,pitcher_id,pitch_number,batter_name,...,p_slg,p_woba,p_whiff_percent,p_k_percent,b_run_value_per_100,b_ba,b_slg,b_woba,b_whiff_percent,b_k_percent
133026,630851,2020-07-23,2020,WSH,NYY,"Scherzer, Max",R,453286,1,"Hicks, Aaron",...,0.509,0.363,28.8,28.2,1.7,0.204,0.51,0.403,23.3,11.9
133025,630851,2020-07-23,2020,WSH,NYY,"Scherzer, Max",R,453286,2,"Hicks, Aaron",...,0.509,0.363,28.8,28.2,1.7,0.204,0.51,0.403,23.3,11.9
133024,630851,2020-07-23,2020,WSH,NYY,"Scherzer, Max",R,453286,3,"Hicks, Aaron",...,0.509,0.363,28.8,28.2,1.7,0.204,0.51,0.403,23.3,11.9
189734,630851,2020-07-23,2020,WSH,NYY,"Scherzer, Max",R,453286,4,"Hicks, Aaron",...,0.75,0.558,27.6,16.7,-1.2,0.136,0.182,0.223,35.0,29.2
133023,630851,2020-07-23,2020,WSH,NYY,"Scherzer, Max",R,453286,5,"Hicks, Aaron",...,0.509,0.363,28.8,28.2,1.7,0.204,0.51,0.403,23.3,11.9


In [78]:
###### 2021 DATA ######
# Arsenal CSVs first, then the pitch-by-pitch query for the same season.
pitcher_data, batter_data = get_statcast_data(
    pitcher_csv='2021/Pitch-Arsenal-Pitchers-2021.csv',
    batter_csv='2021/Pitch-Arsenal-Batters-2021.csv',
)
data_2021 = get_pybaseball_data(
    start_date='2021-04-01',
    end_date='2021-06-15',
    pitcher_data=pitcher_data,
    batter_data=batter_data,
)
data_2021.head()

This is a large query, it may take a moment to complete


100%|██████████████████████████████████████████████████████████████████████████████████| 76/76 [00:04<00:00, 16.63it/s]


Unnamed: 0,game_id,game_date,game_year,home_team,away_team,pitcher_name,p_throws,pitcher_id,pitch_number,batter_name,...,p_slg,p_woba,p_whiff_percent,p_k_percent,b_run_value_per_100,b_ba,b_slg,b_woba,b_whiff_percent,b_k_percent
33868,634615,2021-04-01,2021,COL,LAD,"Márquez, Germán",R,608566,1,"Betts, Mookie",...,0.338,0.324,14.9,5.9,-0.3,0.204,0.452,0.352,14.2,14.0
33867,634615,2021-04-01,2021,COL,LAD,"Márquez, Germán",R,608566,2,"Betts, Mookie",...,0.338,0.324,14.9,5.9,-0.3,0.204,0.452,0.352,14.2,14.0
33866,634615,2021-04-01,2021,COL,LAD,"Márquez, Germán",R,608566,3,"Betts, Mookie",...,0.338,0.324,14.9,5.9,-0.3,0.204,0.452,0.352,14.2,14.0
33865,634615,2021-04-01,2021,COL,LAD,"Márquez, Germán",R,608566,4,"Betts, Mookie",...,0.338,0.324,14.9,5.9,-0.3,0.204,0.452,0.352,14.2,14.0
33864,634615,2021-04-01,2021,COL,LAD,"Márquez, Germán",R,608566,5,"Betts, Mookie",...,0.338,0.324,14.9,5.9,-0.3,0.204,0.452,0.352,14.2,14.0


In [161]:
###### CONCATENATE 2019, 2020, 2021 DATA ######
frames = [data_2019, data_2020, data_2021]
all_seasons = pd.concat(frames)

# Keep only the games that still have at least 200 rows; row order is unchanged.
rows_per_game = all_seasons['game_id'].map(all_seasons['game_id'].value_counts())
final_df = all_seasons[(rows_per_game >= 200).to_numpy()]
final_df.head()


Unnamed: 0,game_id,game_date,game_year,home_team,away_team,pitcher_name,p_throws,pitcher_id,pitch_number,batter_name,...,p_slg,p_woba,p_whiff_percent,p_k_percent,b_run_value_per_100,b_ba,b_slg,b_woba,b_whiff_percent,b_k_percent
375236,565220,2019-03-28,2019,CIN,PIT,"Castillo, Luis",R,622491,1,"Frazier, Adam",...,0.504,0.388,27.5,17.9,-0.7,0.247,0.441,0.342,12.8,10.6
277684,565220,2019-03-28,2019,CIN,PIT,"Castillo, Luis",R,622491,1,"Cabrera, Melky",...,0.504,0.388,27.5,17.9,-0.3,0.33,0.468,0.364,11.9,15.2
504456,565220,2019-03-28,2019,CIN,PIT,"Castillo, Luis",R,622491,2,"Cabrera, Melky",...,0.199,0.199,47.8,47.0,-0.6,0.276,0.345,0.272,23.7,10.3
504455,565220,2019-03-28,2019,CIN,PIT,"Castillo, Luis",R,622491,4,"Cabrera, Melky",...,0.199,0.199,47.8,47.0,-0.6,0.276,0.345,0.272,23.7,10.3
506465,565220,2019-03-28,2019,CIN,PIT,"Castillo, Luis",R,622491,2,"Dickerson, Corey",...,0.199,0.199,47.8,47.0,0.2,0.317,0.537,0.379,23.5,7.0


In [166]:
###### FIND THE PROPER ORDER FOR EACH GAME_ID'S COUNTS (FOR FUTURE SPLITTING PURPOSES) ######

# Number of rows per game (index: game id, values: row count).
game_row_counts = final_df['game_id'].value_counts()

# Lookup table with the row count in 'count' and the game id in 'game_id'
# (and the game id as index). It is built column by column so the result does
# not depend on how value_counts() names its output, which differs between
# pandas versions.
dfd = pd.DataFrame({'count': game_row_counts.to_numpy(),
                    'game_id': game_row_counts.index.to_numpy()},
                   index=game_row_counts.index.to_numpy())

# game_id -> row count
dfd_dict = dict(zip(dfd['game_id'].tolist(), dfd['count'].tolist()))

# Same mapping, but keyed in the order in which the games first appear in
# final_df -- this is the order the later per-game split relies on.
actual_id_count = {g_id: dfd_dict[g_id]
                   for g_id in np.array(final_df['game_id'].unique()).tolist()
                   if g_id in dfd_dict}

[216,
 251,
 226,
 246,
 276,
 242,
 277,
 259,
 225,
 216,
 228,
 422,
 241,
 216,
 241,
 239,
 205,
 263,
 289,
 279,
 224,
 315,
 228,
 220,
 282,
 225,
 294,
 260,
 235,
 262,
 230,
 226,
 251,
 280,
 231,
 287,
 274,
 215,
 282,
 303,
 243,
 218,
 250,
 202,
 329,
 225,
 265,
 221,
 223,
 228,
 354,
 228,
 261,
 238,
 235,
 213,
 269,
 247,
 221,
 236,
 278,
 208,
 278,
 216,
 252,
 252,
 269,
 241,
 247,
 207,
 219,
 242,
 284,
 238,
 266,
 255,
 270,
 220,
 267,
 234,
 280,
 280,
 233,
 318,
 272,
 255,
 235,
 268,
 277,
 218,
 284,
 242,
 276,
 221,
 280,
 246,
 245,
 229,
 254,
 294,
 298,
 280,
 224,
 318,
 261,
 248,
 224,
 251,
 269,
 242,
 230,
 334,
 225,
 266,
 312,
 272,
 281,
 296,
 268,
 246,
 270,
 325,
 217,
 210,
 252,
 212,
 206,
 305,
 252,
 241,
 251,
 275,
 233,
 243,
 268,
 233,
 207,
 303,
 285,
 261,
 239,
 245,
 254,
 219,
 252,
 255,
 304,
 238,
 277,
 283,
 329,
 234,
 280,
 251,
 245,
 253,
 268,
 294,
 258,
 241,
 201,
 206,
 281,
 276,
 225,
 240,
 265

In [136]:
###### PLAYER ID MAPPING ######

# Each id is paired with a name taken from the SAME row. Zipping independently
# de-duplicated name and id lists would shift the pairing as soon as two ids
# share a name (or one id appears with two spellings), and would stop at the
# shortest of the lists.
# For an id that occurs with several names, the first name seen is kept.
pitcher_pairs = final_df[['pitcher_id', 'pitcher_name']].drop_duplicates(subset='pitcher_id')
batter_pairs = final_df[['batter_id', 'batter_name']].drop_duplicates(subset='batter_id')

player_dict = dict(zip(pitcher_pairs['pitcher_id'].tolist(),
                       pitcher_pairs['pitcher_name'].tolist()))
# An id that shows up both as pitcher and as batter ends up with its batter_name.
player_dict.update(zip(batter_pairs['batter_id'].tolist(),
                       batter_pairs['batter_name'].tolist()))

player_dict

{622491: 'Castillo, Luis',
 624428: 'Frazier, Adam',
 592791: 'Taillon, Jameson',
 466320: 'Cabrera, Melky',
 453172: 'Hughes, Jared',
 572816: 'Dickerson, Corey',
 435043: 'Duke, Zach',
 605137: 'Bell, Josh',
 593144: 'Rodríguez, Richard',
 608385: 'Winker, Jesse',
 628452: 'Iglesias, Raisel',
 458015: 'Votto, Joey',
 595897: 'Burdi, Nick',
 624577: 'Puig, Yasiel',
 607237: 'Garrett, Amir',
 553993: 'Suarez, Eugenio',
 456696: 'Hernandez, David',
 465041: 'Cervelli, Francisco',
 641745: 'Keller, Brad',
 628356: 'Ho, Jung',
 607074: 'Rodón, Carlos',
 543776: 'Shuck, JB',
 518858: 'Jones, Nate',
 606299: 'Peraza, Jose',
 621114: 'Burr, Ryan',
 571466: 'Barnhart, Tucker',
 592229: 'Covey, Dylan',
 578428: 'Iglesias, Jose',
 453178: 'Kennedy, Ian',
 570481: 'Gonzalez, Erik',
 503449: 'Peralta, Wily',
 594988: 'Schebler, Scott',
 518617: 'Diekman, Jake',
 641838: 'McCarthy, Kevin',
 518618: 'Dietrich, Derek',
 502202: 'Boxberger, Brad',
 592567: 'Moran, Colin',
 547943: 'Greinke, Zack',
 6

In [254]:
###### TRAIN, TEST SPLIT ######

# Per-game row counts (first-appearance order) and their running totals.
# Both lists are retained so any cell that still reads them keeps working;
# the cumulative sum is computed in one pass instead of re-summing every prefix.
lst_value_cnt = list(actual_id_count.values())
lst_value_cnt_sum = np.cumsum(lst_value_cnt).tolist()

# Every row of final_df as a plain Python list.
lst = final_df.values.tolist()

# Group the rows by game id, one inner list per game, games ordered by first
# appearance and rows kept in their original order. Grouping on the id itself
# (instead of slicing by the running totals) stays correct even if the rows of
# one game are not stored contiguously.
game_id_pos = final_df.columns.get_loc('game_id')
rows_by_game = {}
for row in lst:
    rows_by_game.setdefault(row[game_id_pos], []).append(row)

final_lst = list(rows_by_game.values())

In [255]:

import random

# Shuffle whole games (not individual rows) with a fixed seed.
# A copy is shuffled so final_lst keeps its original order: re-running this
# cell then yields the same shuffled_lst every time instead of re-shuffling
# an already shuffled list.
random.seed(101)
shuffled_lst = list(final_lst)
random.shuffle(shuffled_lst)

In [294]:
###### CREATE 'MINI-DATAFRAMES' BY ITERATING THROUGH THE LIST, AND THEN MERGE INTO FINAL DATAFRAME ######

# One DataFrame per game, in shuffled game order, all sharing final_df's column labels.
column_labels = final_df.columns.tolist()
lst_of_randomized_dfs = [pd.DataFrame(game_rows, columns=column_labels)
                         for game_rows in shuffled_lst]

In [305]:
# Stack the per-game frames, in shuffled game order, into one DataFrame.
# No ignore_index is passed, so each game's own index labels are carried over
# and labels repeat across games in the result.
shuffled_df = pd.concat(lst_of_randomized_dfs)
shuffled_df

Unnamed: 0,game_id,game_date,game_year,home_team,away_team,pitcher_name,p_throws,pitcher_id,pitch_number,batter_name,...,p_slg,p_woba,p_whiff_percent,p_k_percent,b_run_value_per_100,b_ba,b_slg,b_woba,b_whiff_percent,b_k_percent
0,566902,2019-06-28,2019,TOR,KC,"Reid-Foley, Sean",R,656887,1,"Merrifield, Whit",...,0.491,0.418,26.2,20.3,1.4,0.339,0.554,0.416,15.1,13.9
1,566902,2019-06-28,2019,TOR,KC,"Reid-Foley, Sean",R,656887,2,"Merrifield, Whit",...,0.375,0.332,25.2,20.0,-1.7,0.225,0.310,0.236,29.0,27.9
2,566902,2019-06-28,2019,TOR,KC,"Reid-Foley, Sean",R,656887,1,"Lopez, Nicky",...,0.491,0.418,26.2,20.3,-0.9,0.268,0.352,0.316,11.5,10.9
3,566902,2019-06-28,2019,TOR,KC,"Reid-Foley, Sean",R,656887,2,"Lopez, Nicky",...,0.491,0.418,26.2,20.3,-0.9,0.268,0.352,0.316,11.5,10.9
4,566902,2019-06-28,2019,TOR,KC,"Reid-Foley, Sean",R,656887,1,"Gordon, Alex",...,0.491,0.418,26.2,20.3,-0.4,0.280,0.391,0.347,15.1,11.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238,631137,2020-07-29,2020,PIT,MIL,"Hader, Josh",L,623352,2,"Frazier, Adam",...,0.462,0.353,33.3,30.6,2.0,0.286,0.524,0.420,14.9,12.8
239,631137,2020-07-29,2020,PIT,MIL,"Hader, Josh",L,623352,3,"Frazier, Adam",...,0.038,0.103,51.0,55.2,-4.2,0.095,0.095,0.086,23.9,23.8
240,631137,2020-07-29,2020,PIT,MIL,"Hader, Josh",L,623352,1,"Evans, Phillip",...,0.038,0.103,51.0,55.2,8.9,0.400,0.800,0.505,41.2,20.0
241,631137,2020-07-29,2020,PIT,MIL,"Hader, Josh",L,623352,3,"Evans, Phillip",...,0.462,0.353,33.3,30.6,2.2,0.357,0.357,0.406,27.6,11.1


In [306]:
###### DROP SOME COLUMNS FOR THE LAST TIME (SOME COLUMNS EXISTED PREVIOUSLY PURELY FOR SORTING PURPOSES))######
# Columns that were only carried along for ordering / readability.
sorting_only_columns = ['pitch_number', 'at_bat_number', 'pitcher_name', 'batter_name', 'game_year']
shuffled_df = shuffled_df.drop(columns=sorting_only_columns)
shuffled_df.head()

Unnamed: 0,game_id,game_date,home_team,away_team,p_throws,pitcher_id,b_bats,batter_id,inning,inning_topbot,...,p_slg,p_woba,p_whiff_percent,p_k_percent,b_run_value_per_100,b_ba,b_slg,b_woba,b_whiff_percent,b_k_percent
0,566902,2019-06-28,TOR,KC,R,656887,R,593160,1,Top,...,0.491,0.418,26.2,20.3,1.4,0.339,0.554,0.416,15.1,13.9
1,566902,2019-06-28,TOR,KC,R,656887,R,593160,1,Top,...,0.375,0.332,25.2,20.0,-1.7,0.225,0.31,0.236,29.0,27.9
2,566902,2019-06-28,TOR,KC,R,656887,L,670032,1,Top,...,0.491,0.418,26.2,20.3,-0.9,0.268,0.352,0.316,11.5,10.9
3,566902,2019-06-28,TOR,KC,R,656887,L,670032,1,Top,...,0.491,0.418,26.2,20.3,-0.9,0.268,0.352,0.316,11.5,10.9
4,566902,2019-06-28,TOR,KC,R,656887,L,460086,1,Top,...,0.491,0.418,26.2,20.3,-0.4,0.28,0.391,0.347,15.1,11.5


In [307]:
# List the column labels currently present in shuffled_df.
shuffled_df.columns

Index(['game_id', 'game_date', 'home_team', 'away_team', 'p_throws',
       'pitcher_id', 'b_bats', 'batter_id', 'inning', 'inning_topbot',
       'outs_when_up', 'count', 'on_3b', 'on_2b', 'on_1b', 'pitch_type',
       'home_score', 'away_score', 'post_home_score', 'post_away_score',
       'p_run_value_per_100', 'p_ba', 'p_slg', 'p_woba', 'p_whiff_percent',
       'p_k_percent', 'b_run_value_per_100', 'b_ba', 'b_slg', 'b_woba',
       'b_whiff_percent', 'b_k_percent'],
      dtype='object')

In [308]:
###### SHUFFLED_DF  (BASIC IDEA)--> [[STATIONARY COLUMNS], [SEQUENTIAL COLUMNS], [LABEL]] ######

# player ids and their season-level arsenal stats
stationary_columns = ['pitcher_id', 'batter_id',
                      'p_run_value_per_100', 'p_ba', 'p_slg', 'p_woba',
                      'p_whiff_percent', 'p_k_percent',
                      'b_run_value_per_100', 'b_ba', 'b_slg', 'b_woba',
                      'b_whiff_percent', 'b_k_percent']

# game situation at the time of the pitch
sequential_columns = ['game_id', 'home_team', 'away_team', 'inning', 'inning_topbot',
                      'outs_when_up', 'count', 'on_3b', 'on_2b', 'on_1b',
                      'home_score', 'away_score', 'post_home_score', 'post_away_score']

# prediction target
label_column = ['pitch_type']

shuffled_df = shuffled_df[stationary_columns + sequential_columns + label_column]

shuffled_df.head()

Unnamed: 0,pitcher_id,batter_id,p_run_value_per_100,p_ba,p_slg,p_woba,p_whiff_percent,p_k_percent,b_run_value_per_100,b_ba,...,outs_when_up,count,on_3b,on_2b,on_1b,home_score,away_score,post_home_score,post_away_score,pitch_type
0,656887,593160,0.5,0.283,0.491,0.418,26.2,20.3,1.4,0.339,...,0,0-0,0,0,0,0,0,0,0,4-Seamer
1,656887,593160,-0.4,0.229,0.375,0.332,25.2,20.0,-1.7,0.225,...,0,0-1,0,0,0,0,0,0,0,Slider
2,656887,670032,0.5,0.283,0.491,0.418,26.2,20.3,-0.9,0.268,...,1,0-0,0,0,0,0,0,0,0,4-Seamer
3,656887,670032,0.5,0.283,0.491,0.418,26.2,20.3,-0.9,0.268,...,1,0-1,0,0,0,0,0,0,0,4-Seamer
4,656887,460086,0.5,0.283,0.491,0.418,26.2,20.3,-0.4,0.28,...,2,0-0,0,0,0,0,0,0,0,4-Seamer


In [309]:
###### ONE-HOT ENCODING ######

# Every non-numeric column is expanded into indicator columns.
# The dtype is pinned so the indicators are 0/1 integers regardless of the
# installed pandas version (the default dtype of get_dummies is not the same
# in every release).
shuffled_df = pd.get_dummies(shuffled_df, dtype=np.uint8)
shuffled_df.head()

Unnamed: 0,pitcher_id,batter_id,p_run_value_per_100,p_ba,p_slg,p_woba,p_whiff_percent,p_k_percent,b_run_value_per_100,b_ba,...,count_3-0,count_3-1,count_3-2,pitch_type_4-Seamer,pitch_type_Changeup,pitch_type_Curveball,pitch_type_Cutter,pitch_type_Sinker,pitch_type_Slider,pitch_type_Splitter
0,656887,593160,0.5,0.283,0.491,0.418,26.2,20.3,1.4,0.339,...,0,0,0,1,0,0,0,0,0,0
1,656887,593160,-0.4,0.229,0.375,0.332,25.2,20.0,-1.7,0.225,...,0,0,0,0,0,0,0,0,1,0
2,656887,670032,0.5,0.283,0.491,0.418,26.2,20.3,-0.9,0.268,...,0,0,0,1,0,0,0,0,0,0
3,656887,670032,0.5,0.283,0.491,0.418,26.2,20.3,-0.9,0.268,...,0,0,0,1,0,0,0,0,0,0
4,656887,460086,0.5,0.283,0.491,0.418,26.2,20.3,-0.4,0.28,...,0,0,0,1,0,0,0,0,0,0


In [312]:
###### SPLIT INTO X,Y SETS ######

# Labels are the one-hot columns of pitch_type; everything else is a feature.
# Selecting by prefix (instead of slicing between two hard-coded column labels)
# keeps the split valid when a particular count or pitch type is absent from
# the sample.
label_columns = [col for col in shuffled_df.columns if str(col).startswith('pitch_type_')]

shuffled_df_X = shuffled_df.drop(columns=label_columns)
shuffled_df_y = shuffled_df[label_columns]

Unnamed: 0,pitcher_id,batter_id,p_run_value_per_100,p_ba,p_slg,p_woba,p_whiff_percent,p_k_percent,b_run_value_per_100,b_ba,...,count_0-2,count_1-0,count_1-1,count_1-2,count_2-0,count_2-1,count_2-2,count_3-0,count_3-1,count_3-2
0,656887,593160,0.5,0.283,0.491,0.418,26.2,20.3,1.4,0.339,...,0,0,0,0,0,0,0,0,0,0
1,656887,593160,-0.4,0.229,0.375,0.332,25.2,20.0,-1.7,0.225,...,0,0,0,0,0,0,0,0,0,0
2,656887,670032,0.5,0.283,0.491,0.418,26.2,20.3,-0.9,0.268,...,0,0,0,0,0,0,0,0,0,0
3,656887,670032,0.5,0.283,0.491,0.418,26.2,20.3,-0.9,0.268,...,0,0,0,0,0,0,0,0,0,0
4,656887,460086,0.5,0.283,0.491,0.418,26.2,20.3,-0.4,0.28,...,0,0,0,0,0,0,0,0,0,0


In [319]:
###### NORMALIZATION OF FEATURES ######
from sklearn.preprocessing import MinMaxScaler

# Row positions at which the shuffled frame is cut into train / test / validation.
# NOTE(review): presumably chosen so that the cuts fall on game boundaries and
# give roughly 70% / 15% / 15% -- confirm whenever the input data changes.
TRAIN_END = 692285
TEST_END = 840794

###### TRAINING DATA ~ 70% ######
X_train = shuffled_df_X.iloc[:TRAIN_END]
y_train = shuffled_df_y.iloc[:TRAIN_END]

###### TEST DATA ~ 15% ######
X_test = shuffled_df_X.iloc[TRAIN_END:TEST_END]
y_test = shuffled_df_y.iloc[TRAIN_END:TEST_END]

###### VALIDATION DATA ~ 15% ######
# y_val must cover the same rows as X_val (it previously reused the test range)
X_val = shuffled_df_X.iloc[TEST_END:]
y_val = shuffled_df_y.iloc[TEST_END:]

# features and labels of every split must describe the same rows
assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)
assert len(X_val) == len(y_val)

# The scaler is fitted on the training features only and then applied to the
# other two splits, so no information from them leaks into the scaling.
scaler = MinMaxScaler()

X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)