<a href="https://colab.research.google.com/github/david-j-cox/Man-vs-Machine/blob/master/add_re24_col.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Environment Setup

In [None]:
# Set working directory
from google.colab import drive
drive.mount('/content/gdrive')
%cd './gdrive/My Drive/GME & MLB'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/My Drive/GME & MLB


In [None]:
# Connect TPU
%tensorflow_version 2.x
import tensorflow as tf
print("Tensorflow version " + tf.__version__)

try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
  print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError:
  raise BaseException('ERROR: Not connected to a TPU runtime')

tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

Tensorflow version 2.7.0
Running on TPU  ['10.88.207.34:8470']
INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.


INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.


INFO:tensorflow:Initializing the TPU system: grpc://10.88.207.34:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.88.207.34:8470


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


In [None]:
# Packages and modules we'll use
# System
import glob
from google.colab import files
import zipfile, io, os
import warnings
import time
# files.upload() # Load in RE24 game state function; uncomment this line if first time using

# Data manipulation
import pandas as pd
import numpy as np
from RE24 import RE24_calc

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import HTML, display, clear_output

print("Packages loaded")

Packages loaded


### Functions

In [None]:
# Function to add re24 vals to dataframes
def calc_re_24(df, year):
  """
  Add a column with the RE24 value corresponding to each pitch context in the dataset.
  -----
  Params:
    df: dataframe
      - Must contain the columns 'outs_when_up', 'on_1b', 'on_2b' and 'on_3b'.
        Rows are read by position, so the dataframe may carry any index.
    year: int
      - Season label; only used in the progress message.
  -----
  Returns:
    The same dataframe object that was passed in, with a new 're_24' column
    holding the value RE24_calc returned for each row.
  """
  total = len(df)

  # Pull each column out once as an array. Reading by position avoids the
  # label lookup `df[col][i]`, which fails (or picks the wrong row) whenever
  # the index is not 0..n-1, and avoids a Series lookup on every iteration.
  outs_col = df['outs_when_up'].to_numpy()
  first_col = df['on_1b'].to_numpy()
  second_col = df['on_2b'].to_numpy()
  third_col = df['on_3b'].to_numpy()

  re24_vals = []   # One RE24 value per row, in row order
  contexts = zip(outs_col, first_col, second_col, third_col)
  for i, (outs, first, second, third) in enumerate(contexts):
    # Pass pitch context keys to RE24_calc function
    re24_vals.append(RE24_calc(outs=outs, runner_first=first,
                               runner_second=second, runner_third=third))

    # Providing notebook user with updating progress in loop.
    if i % 100 == 0:
      clear_output()
      print(f'Season: {year}\n{i} of {total} completed ({int((i/total)*100)}%)')

  # Add data to original passed dataframe
  df['re_24'] = re24_vals
  return df

# Function to get change in RE24 pre-post pitch
def _pitch_transition(index, last, pitcher, game_pk, inning):
  """
  Classify the row at position `index` relative to its neighbours.
  Returns:
    'reset'  - first row, or the pitcher or game differs from the previous row.
    'inning' - last row, the next row belongs to another game, or the inning
               differs from the previous row.
    'same'   - same pitcher, game and inning as the previous row.
  """
  if index == 0:
    return 'reset'
  if index == last:
    return 'inning'
  if pitcher[index-1] != pitcher[index] or game_pk[index-1] != game_pk[index]:
    return 'reset'
  if game_pk[index+1] != game_pk[index] or inning[index-1] != inning[index]:
    return 'inning'
  return 'same'


def _ordinal_sign(value):
  """Return 1 when value > 0, 0 when value == 0, and -1 otherwise."""
  if value > 0:
    return 1
  if value == 0:
    return 0
  return -1


def re24_change(df, year):
  """
  Add two columns with the change in RE24 value resulting from each pitch in the dataset.
    1.) 're_24_change_raw': change in RE24 on a continuous scale.
    2.) 're_24_change_ord': change in RE24 on an ordinal scale (-1=worse; 0=no change; 1=better)
  -----
  Params:
    df: dataframe
      - Must contain 'pitcher', 'game_date', 'game_pk', 'inning',
        'outs_when_up', 'pitch_number' (used to sort) and 're_24'.
    year: int
      - Season label; only used in the progress messages.
  -----
  Returns:
    A sorted, re-indexed copy of the dataframe with the two change columns added.
    The dataframe passed in is not modified.
  """
  df = df.sort_values(by=['pitcher', 'game_date', 'game_pk', 'inning', 'outs_when_up', 'pitch_number'])
  df = df.reset_index(drop=True)

  total = len(df)
  last = total - 1

  # Read each column once as an array instead of doing a Series lookup per row.
  pitcher = df['pitcher'].to_numpy()
  game_pk = df['game_pk'].to_numpy()
  inning = df['inning'].to_numpy()
  re_24 = df['re_24'].to_numpy()

  # NOTE(review): 0.461 equals the re_24 value this notebook shows for 0 outs
  # with the bases empty. Subtracting it from the inning *number* looks
  # unintended; confirm whether an re_24 value was meant. Behaviour is kept as-is.
  inning_offset = 0.461

  # CHANGE IN RE24 - CONTINUOUS
  transitions = []   # Row classification, reused by the ordinal pass below
  raw_changes = []   # Continuous change values
  for index in range(total):
    kind = _pitch_transition(index, last, pitcher, game_pk, inning)
    transitions.append(kind)

    if kind == 'reset':
      raw_changes.append(0)
    elif kind == 'inning':
      raw_changes.append(inning[index] - inning_offset)
    else:
      raw_changes.append(re_24[index-1] - re_24[index])

    # Providing notebook user with updating progress after every 100th loop.
    if index % 100 == 0:
      clear_output()
      print(f'Season: {year}\nRE24 Continuous Change: {index} of {total} completed ({int((index/total)*100)}%)')

  df['re_24_change_raw'] = raw_changes

  # CHANGE IN RE24 - ORDINAL
  ord_changes = []   # Ordinal change values
  for index, kind in enumerate(transitions):
    if kind == 'reset':
      ord_changes.append(0)
    elif kind == 'inning':
      # The original `.apend` typo raised AttributeError on an exact zero here.
      ord_changes.append(_ordinal_sign(inning[index] - inning_offset))
    else:
      previous, current = re_24[index-1], re_24[index]
      if previous > current:
        ord_changes.append(1)
      elif previous < current:
        ord_changes.append(-1)
      else:
        ord_changes.append(0)

    # Providing notebook user with updating progress after every 100th loop.
    if index % 100 == 0:
      clear_output()
      print(f'Season: {year}\nRE24 Ordinal Change: {index} of {total} completed ({int((index/total)*100)}%)')

  df['re_24_change_ord'] = ord_changes

  return df

# Load data & handle NAs

In [None]:
# Read the raw pitch-level CSV (the 08_19 file), discard the index column that
# was saved with it, and work on a copy so `data_raw` stays untouched.
raw_csv_path = './Data/01_raw/all_pitches_08_19.csv'
data_raw = pd.read_csv(raw_csv_path).drop(columns=['Unnamed: 0'])
df_gs_spec = data_raw.copy()
df_gs_spec.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,spin_dir,spin_rate_deprecated,break_angle_deprecated,break_length_deprecated,zone,des,game_type,stand,p_throws,home_team,away_team,type,hit_location,bb_type,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,hc_x,hc_y,tfs_deprecated,...,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,pitcher.1,fielder_2.1,fielder_3,fielder_4,fielder_5,fielder_6,fielder_7,fielder_8,fielder_9,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment
0,FF,2019-09-29,94.9,-2.3738,5.1379,Ian Kennedy,570731,453178,field_out,hit_into_play,,,,,3.0,Jonathan Schoop pops out to second baseman Eri...,R,R,R,KC,MIN,X,4.0,popup,1,1,2019,-0.9097,1.1972,0.3906,3.19,,,,2,9,Top,134.2,170.51,,...,-16.6055,3.54,1.68,,80.0,69.0,95.087,2430.0,6.245,565782,453178,664926.0,656811.0,625510.0,596144.0,600858.0,643436.0,621433.0,641531.0,54.2545,0.002,0.0,0.0,1.0,0.0,0.0,3.0,67,3,4-Seam Fastball,4,4,4,4,4,4,4,4,Standard,Standard
1,FC,2019-09-29,91.6,-2.3719,5.2549,Ian Kennedy,570731,453178,,ball,,,,,14.0,,R,R,R,KC,MIN,B,,,0,1,2019,-0.1234,0.6398,1.1979,1.7012,,,,2,9,Top,,,,...,-23.8739,3.5473,1.6873,,,,92.04,2206.0,6.192,565782,453178,664926.0,656811.0,625510.0,596144.0,600858.0,643436.0,621433.0,641531.0,54.3079,,,,,,,,67,2,Cutter,4,4,4,4,4,4,4,4,Standard,Standard
2,FF,2019-09-29,95.0,-2.1334,5.19,Ian Kennedy,570731,453178,,called_strike,,,,,8.0,,R,R,R,KC,MIN,S,,,0,0,2019,-0.8063,1.094,-0.0142,1.8812,,,,2,9,Top,,,,...,-16.9976,3.5473,1.6873,,,,96.259,2400.0,6.533,565782,453178,664926.0,656811.0,625510.0,596144.0,600858.0,643436.0,621433.0,641531.0,53.9663,,,,,,,,67,1,4-Seam Fastball,4,4,4,4,4,4,4,4,Standard,Standard
3,FF,2019-09-29,95.5,-2.3222,5.1256,Ian Kennedy,595909,453178,strikeout,swinging_strike,,,,,11.0,Jake Cave strikes out swinging.,R,L,R,KC,MIN,S,2.0,,1,2,2019,-0.9011,1.058,-0.8813,2.9723,,,,1,9,Top,,,,...,-18.0513,3.54,1.68,,,,96.088,2564.0,6.223,565782,453178,664926.0,656811.0,625510.0,596144.0,600858.0,643436.0,621433.0,641531.0,54.2767,,,0.0,1.0,0.0,0.0,,66,6,4-Seam Fastball,4,4,4,4,4,4,4,4,Standard,Standard
4,FF,2019-09-29,95.6,-2.1953,5.1823,Ian Kennedy,595909,453178,,foul,,,,,3.0,,R,L,R,KC,MIN,S,,,1,2,2019,-0.7817,1.2909,0.6739,3.3859,,,,1,9,Top,,,,...,-15.0374,3.35,1.63,161.0,61.0,52.2,96.121,2363.0,6.49,565782,453178,664926.0,656811.0,625510.0,596144.0,600858.0,643436.0,621433.0,641531.0,54.0092,,,,,,,,66,5,4-Seam Fastball,4,4,4,4,4,4,4,4,Standard,Standard


In [None]:
# Trim down the feature space to only what we need for this study.
# The explicit .copy() makes `df` an independent frame, so assigning to its
# columns in later cells does not trigger SettingWithCopyWarning.
df = df_gs_spec[['game_year', 'game_pk', 'pitch_type', 'game_date', 'type', 'balls', 'strikes',
                 'inning_topbot', 'pitch_number', 'pitch_name',
                 'home_score',  'away_score', 'bat_score', 'fld_score',
                 'events', 'description',
                 'inning', 'pitcher', 'player_name', 'batter', 'on_1b', 'on_2b', 'on_3b', 'outs_when_up']].copy()
# Take a look
df[::10000]

Unnamed: 0,game_year,game_pk,pitch_type,game_date,type,balls,strikes,inning_topbot,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,events,description,inning,pitcher,player_name,batter,on_1b,on_2b,on_3b,outs_when_up
0,2019,565782,FF,2019-09-29,X,1,1,Top,3,4-Seam Fastball,4,4,4,4,field_out,hit_into_play,9,453178,Ian Kennedy,570731,,,,2
10000,2019,565750,SL,2019-07-16,S,1,1,Top,3,Slider,11,0,0,11,,swinging_strike,9,642098,Glenn Sparkman,547989,,,,0
20000,2019,565717,CH,2019-04-29,S,3,1,Top,5,Changeup,4,8,8,4,,called_strike,9,503449,Wily Peralta,650490,,596847.0,,1
30000,2018,531234,SI,2018-08-17,B,1,0,Top,2,Sinker,2,5,5,2,,ball,4,664192,Joey Lucchesi,502671,572041.0,,,2
40000,2018,530234,FT,2018-05-30,B,1,0,Top,2,2-Seam Fastball,1,2,2,1,,ball,7,453385,Clayton Richard,543776,,,,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8420000,2014,380568,FF,2014-04-02,S,3,0,Bot,4,4-Seam Fastball,0,0,0,0,,called_strike,3,450306,Jason Vargas,488671,,,,0
8430000,2015,415091,FF,2015-07-24,S,0,0,Top,1,4-Seam Fastball,0,0,0,0,,called_strike,1,488768,Andrew Cashner,400085,,,,0
8440000,2015,414064,FF,2015-05-06,X,1,1,Bot,3,4-Seam Fastball,0,5,0,5,sac_bunt,hit_into_play,3,453178,Ian Kennedy,518790,622110.0,,,1
8450000,2013,348572,FC,8/17/2013,S,1,1,Top,3,Cutter,1,4,4,1,,swinging_strike,9,445926,Jesse Chavez,456422,453211.0,,,2


In [None]:
# Fill NAs in the runner cols with 0.0 so the RE24 calc script will work.
# `assign` builds a new frame instead of writing into a slice, so there is no
# SettingWithCopyWarning to silence. That removes the need for a blanket
# warnings.filterwarnings('ignore'), which would also hide every later warning
# in this kernel. The cell is safe to re-run.
runner_cols = ['on_1b', 'on_2b', 'on_3b']
df = df.assign(**{col: df[col].astype(float).fillna(0.0) for col in runner_cols})
df[::10000]

Unnamed: 0,game_year,game_pk,pitch_type,game_date,type,balls,strikes,inning_topbot,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,events,description,inning,pitcher,player_name,batter,on_1b,on_2b,on_3b,outs_when_up
0,2019,565782,FF,2019-09-29,X,1,1,Top,3,4-Seam Fastball,4,4,4,4,field_out,hit_into_play,9,453178,Ian Kennedy,570731,0.0,0.0,0.0,2
10000,2019,565750,SL,2019-07-16,S,1,1,Top,3,Slider,11,0,0,11,,swinging_strike,9,642098,Glenn Sparkman,547989,0.0,0.0,0.0,0
20000,2019,565717,CH,2019-04-29,S,3,1,Top,5,Changeup,4,8,8,4,,called_strike,9,503449,Wily Peralta,650490,0.0,596847.0,0.0,1
30000,2018,531234,SI,2018-08-17,B,1,0,Top,2,Sinker,2,5,5,2,,ball,4,664192,Joey Lucchesi,502671,572041.0,0.0,0.0,2
40000,2018,530234,FT,2018-05-30,B,1,0,Top,2,2-Seam Fastball,1,2,2,1,,ball,7,453385,Clayton Richard,543776,0.0,0.0,0.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8420000,2014,380568,FF,2014-04-02,S,3,0,Bot,4,4-Seam Fastball,0,0,0,0,,called_strike,3,450306,Jason Vargas,488671,0.0,0.0,0.0,0
8430000,2015,415091,FF,2015-07-24,S,0,0,Top,1,4-Seam Fastball,0,0,0,0,,called_strike,1,488768,Andrew Cashner,400085,0.0,0.0,0.0,0
8440000,2015,414064,FF,2015-05-06,X,1,1,Bot,3,4-Seam Fastball,0,5,0,5,sac_bunt,hit_into_play,3,453178,Ian Kennedy,518790,622110.0,0.0,0.0,1
8450000,2013,348572,FC,8/17/2013,S,1,1,Top,3,Cutter,1,4,4,1,,swinging_strike,9,445926,Jesse Chavez,456422,453211.0,0.0,0.0,2


In [None]:
# Save the trimmed dataframe to its own CSV so the very large raw dataset does not have to be loaded and trimmed again.
df.to_csv('df_analysis.csv')

# Split into year dfs

In [None]:
# Segment out the years into individual dfs for batch processing
for year in df['game_year'].unique():
  print(f'Starting to work on {year}')
  # reset_index returns a new frame rather than modifying in place; keep the
  # result so each season file is written with a fresh 0..n-1 index.
  temp_df = df[df['game_year']==year].reset_index(drop=True)
  temp_df.to_csv(f'./Data/02_intermediate/df_{year}.csv')
  clear_output()
  print(f"Finished segementing and saving {year}")

Finished segementing and saving 2015


# Add RE24 to each year df

In [None]:
# Compute the RE24 value for every season file and write it back in place
for year in range(2008, 2020):
  season_path = f'./Data/02_intermediate/df_{year}.csv'
  temp_df = pd.read_csv(season_path)

  # Discard the index columns that earlier to_csv calls wrote into the file
  unnamed_cols = [name for name in temp_df.columns if 'Unnamed' in name]
  temp_df = temp_df.drop(columns=unnamed_cols)

  # Missing base-runner entries become 0 before the values go to calc_re_24
  temp_df = temp_df.fillna({'on_1b': 0, 'on_2b': 0, 'on_3b': 0})

  temp_df = calc_re_24(temp_df, year)
  temp_df.to_csv(season_path)

# Preview the last season processed
temp_df.head(10)

Season: 2019
732400 of 732473 completed (99%)


Unnamed: 0,game_year,game_pk,pitch_type,game_date,type,balls,strikes,inning_topbot,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,events,description,inning,pitcher,player_name,batter,on_1b,on_2b,on_3b,outs_when_up,re_24
0,2019,565782,FF,2019-09-29,X,1,1,Top,3,4-Seam Fastball,4,4,4,4,field_out,hit_into_play,9,453178,Ian Kennedy,570731,0.0,0.0,0.0,2,0.095
1,2019,565782,FC,2019-09-29,B,0,1,Top,2,Cutter,4,4,4,4,,ball,9,453178,Ian Kennedy,570731,0.0,0.0,0.0,2,0.095
2,2019,565782,FF,2019-09-29,S,0,0,Top,1,4-Seam Fastball,4,4,4,4,,called_strike,9,453178,Ian Kennedy,570731,0.0,0.0,0.0,2,0.095
3,2019,565782,FF,2019-09-29,S,1,2,Top,6,4-Seam Fastball,4,4,4,4,strikeout,swinging_strike,9,453178,Ian Kennedy,595909,0.0,0.0,0.0,1,0.243
4,2019,565782,FF,2019-09-29,S,1,2,Top,5,4-Seam Fastball,4,4,4,4,,foul,9,453178,Ian Kennedy,595909,0.0,0.0,0.0,1,0.243
5,2019,565782,KC,2019-09-29,S,1,2,Top,4,Knuckle Curve,4,4,4,4,,foul,9,453178,Ian Kennedy,595909,0.0,0.0,0.0,1,0.243
6,2019,565782,FF,2019-09-29,S,1,1,Top,3,4-Seam Fastball,4,4,4,4,,swinging_strike,9,453178,Ian Kennedy,595909,0.0,0.0,0.0,1,0.243
7,2019,565782,FF,2019-09-29,S,1,0,Top,2,4-Seam Fastball,4,4,4,4,,foul,9,453178,Ian Kennedy,595909,0.0,0.0,0.0,1,0.243
8,2019,565782,FF,2019-09-29,B,0,0,Top,1,4-Seam Fastball,4,4,4,4,,ball,9,453178,Ian Kennedy,595909,0.0,0.0,0.0,1,0.243
9,2019,565782,FF,2019-09-29,S,2,2,Top,5,4-Seam Fastball,4,4,4,4,strikeout,called_strike,9,453178,Ian Kennedy,543068,0.0,0.0,0.0,0,0.461


# Add changes in RE24

In [None]:
# Compute the RE24 change columns for every season file and write them back in place
for year in range(2008, 2020):
  season_path = f'./Data/02_intermediate/df_{year}.csv'
  temp_df = pd.read_csv(season_path)

  # Discard the index columns that earlier to_csv calls wrote into the file
  unnamed_cols = [name for name in temp_df.columns if 'Unnamed' in name]
  temp_df = temp_df.drop(columns=unnamed_cols).reset_index(drop=True)

  # Missing base-runner entries become 0
  temp_df = temp_df.fillna({'on_1b': 0, 'on_2b': 0, 'on_3b': 0})

  temp_df = re24_change(temp_df, year)
  temp_df.to_csv(season_path)

# Preview the last season processed
temp_df.head(20)

Season: 2019
RE24 Ordinal Change: 732400 of 732473 completed (99%)


Unnamed: 0,game_year,game_pk,pitch_type,game_date,type,balls,strikes,inning_topbot,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,events,description,inning,pitcher,player_name,batter,on_1b,on_2b,on_3b,outs_when_up,re_24,re_24_change_raw,re_24_change_ord
0,2019,567463,FC,2019-04-13,B,0,0,Top,1,Cutter,0,0,0,0,,ball,1,282332,CC Sabathia,544725,0.0,0.0,0.0,0,0.461,0.0,0
1,2019,567463,FC,2019-04-13,S,1,0,Top,2,Cutter,0,0,0,0,,called_strike,1,282332,CC Sabathia,544725,0.0,0.0,0.0,0,0.461,0.0,0
2,2019,567463,FC,2019-04-13,X,1,1,Top,3,Cutter,0,0,0,0,field_out,hit_into_play,1,282332,CC Sabathia,544725,0.0,0.0,0.0,0,0.461,0.0,0
3,2019,567463,FC,2019-04-13,S,0,0,Top,1,Cutter,0,0,0,0,,foul,1,282332,CC Sabathia,641313,0.0,0.0,0.0,1,0.243,0.218,1
4,2019,567463,SL,2019-04-13,S,0,1,Top,2,Slider,0,0,0,0,,swinging_strike,1,282332,CC Sabathia,641313,0.0,0.0,0.0,1,0.243,0.0,0
5,2019,567463,FC,2019-04-13,B,0,2,Top,3,Cutter,0,0,0,0,,ball,1,282332,CC Sabathia,641313,0.0,0.0,0.0,1,0.243,0.0,0
6,2019,567463,SL,2019-04-13,S,1,2,Top,4,Slider,0,0,0,0,strikeout,swinging_strike,1,282332,CC Sabathia,641313,0.0,0.0,0.0,1,0.243,0.0,0
7,2019,567463,FC,2019-04-13,B,0,0,Top,1,Cutter,0,0,0,0,,ball,1,282332,CC Sabathia,547989,0.0,0.0,0.0,2,0.095,0.148,1
8,2019,567463,FC,2019-04-13,B,1,0,Top,2,Cutter,0,0,0,0,,ball,1,282332,CC Sabathia,547989,0.0,0.0,0.0,2,0.095,0.0,0
9,2019,567463,SL,2019-04-13,S,2,0,Top,3,Slider,0,0,0,0,,called_strike,1,282332,CC Sabathia,547989,0.0,0.0,0.0,2,0.095,0.0,0
