# The Book

Code that computes tables found in Tom Tango's book "The Book".

### import python modules and set up environment

In [1]:
import json
from pybaseball import statcast
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

### download and save data

### define the 24 base/out states

In [7]:
game_state_dict = {
    (0, False, False, False): '1',
    (1, False, False, False): '2',
    (2, False, False, False): '3',
    (0, True, False, False): '4',
    (1, True, False, False): '5',
    (2, True, False, False): '6',
    (0, False, True, False): '7',
    (1, False, True, False): '8',
    (2, False, True, False): '9',
    (0, True, True, False): '10',
    (1, True, True, False): '11',
    (2, True, True, False): '12',
    (0, False, False, True): '13',
    (1, False, False, True): '14',
    (2, False, False, True): '15',
    (0, True, False, True): '16',
    (1, True, False, True): '17',
    (2, True, False, True): '18',
    (0, False, True, True): '19',
    (1, False, True, True): '20',
    (2, False, True, True): '21',
    (0, True, True, True): '22',
    (1, True, True, True): '23',
    (2, True, True, True): '24'
}

### function to map game/states into integers

In [8]:
def get_game_state(row):
    return game_state_dict[row['outs_when_up'], row['on_1b'], row['on_2b'], row['on_3b']]

## Run Expectancy

__Function that takes in a range of years and computes the number of runs scored, on average,  for all 24 base/out states.__

In [9]:
def compute_run_expectancies(start_year=2010, end_year=2010):
    
    # build the range of years
    year_range = range(start_year, end_year+1, 1)
    
    # build the complete dataset by concatenating year-by-year
    data_list = []
    for year in year_range:

        print(f"Getting the data for {year}...")
        data = pd.read_csv(f"../data/pitch_data_{year}.csv")

        cols_to_keep = ['game_pk', 'inning', 'inning_topbot', 'at_bat_number', 'outs_when_up', 
                        'on_1b', 'on_2b', 'on_3b', 'bat_score', 'events']

        for col in ['game_pk', 'inning', 'at_bat_number', 'outs_when_up', 'bat_score']:
            data[col] = data[col].astype(int)

        for col in ['on_1b', 'on_2b', 'on_3b']:
            data[col] = data[col] == data[col]

        data = data[data['events'] == data['events']]

        data = data[cols_to_keep]

        data = data[data['inning'] <= 8]

        data_list.append(data)

    data = pd.concat(data_list)
    
    # get the game state
    data['game_state'] = data.apply(get_game_state, axis=1)

    # keep only the columns we need
    data = data[['game_pk', 'inning', 'inning_topbot', 'at_bat_number', 'game_state', 'bat_score', 'events']]

    # sort the data so it's in chronological order
    data.sort_values(by=['game_pk', 'inning_topbot', 'inning', 'at_bat_number'], inplace=True)
          
    # intialize the game/state counts and run totals
    game_state_runs = {
        '1': {'count': 0, 'runs': 0},      # no runners on, 0 outs
        '2': {'count': 0, 'runs': 0},
        '3': {'count': 0, 'runs': 0},
        '4': {'count': 0, 'runs': 0},
        '5': {'count': 0, 'runs': 0},
        '6': {'count': 0, 'runs': 0},
        '7': {'count': 0, 'runs': 0},
        '8': {'count': 0, 'runs': 0},
        '9': {'count': 0, 'runs': 0},
        '10': {'count': 0, 'runs': 0},
        '11': {'count': 0, 'runs': 0},
        '12': {'count': 0, 'runs': 0},
        '13': {'count': 0, 'runs': 0},
        '14': {'count': 0, 'runs': 0},
        '15': {'count': 0, 'runs': 0},
        '16': {'count': 0, 'runs': 0},
        '17': {'count': 0, 'runs': 0},
        '18': {'count': 0, 'runs': 0},
        '19': {'count': 0, 'runs': 0},
        '20': {'count': 0, 'runs': 0},
        '21': {'count': 0, 'runs': 0},
        '22': {'count': 0, 'runs': 0},
        '23': {'count': 0, 'runs': 0},
        '24': {'count': 0, 'runs': 0},
    }

    game_pk_list = data['game_pk'].unique().tolist()
    
    print(f"Number of games in the dataset: {len(game_pk_list)}")

    for j, game_pk in enumerate(game_pk_list):

        if j % 100 == 0:
            print(f"Game {j} of {len(game_pk_list)}")

        for inning_topbot in ['Top', 'Bot']:
            for inning in [1, 2, 3, 4, 5, 6, 7, 8]:

                data_temp = data[(data['game_pk'] == game_pk) & (data['inning'] == inning) & (data['inning_topbot'] == inning_topbot)]

                for i in range(len(data_temp)):
                    game_state = data_temp.iloc[i]['game_state']
                    final_score = data_temp.iloc[-1]['bat_score']
                    state_score = data_temp.iloc[i]['bat_score']
                    game_state_runs[game_state]['count'] += 1
                    game_state_runs[game_state]['runs'] += (final_score - state_score)
                
    run_expectancies = {i: game_state_runs[i]['runs'] / game_state_runs[i]['count'] for i in list(game_state_runs.keys())}

    return run_expectancies

In [10]:
# compute the run expectancy matrix for 2019
run_expectancies = compute_run_expectancies(start_year=2013, end_year=2018)

# write to file as a json object
# with open('run_expectancies_2013_2018.json', 'w') as fp:
#     json.dump(run_expectancies_2019, fp, indent=4)

Getting the data for 2013...
Getting the data for 2014...
Getting the data for 2015...
Getting the data for 2016...
Getting the data for 2017...
Getting the data for 2018...
Number of games in the dataset: 14579
Game 0 of 14579
Game 100 of 14579
Game 200 of 14579
Game 300 of 14579
Game 400 of 14579
Game 500 of 14579
Game 600 of 14579
Game 700 of 14579
Game 800 of 14579
Game 900 of 14579
Game 1000 of 14579
Game 1100 of 14579
Game 1200 of 14579
Game 1300 of 14579
Game 1400 of 14579
Game 1500 of 14579
Game 1600 of 14579
Game 1700 of 14579
Game 1800 of 14579
Game 1900 of 14579
Game 2000 of 14579
Game 2100 of 14579
Game 2200 of 14579
Game 2300 of 14579
Game 2400 of 14579
Game 2500 of 14579
Game 2600 of 14579
Game 2700 of 14579
Game 2800 of 14579
Game 2900 of 14579
Game 3000 of 14579
Game 3100 of 14579
Game 3200 of 14579
Game 3300 of 14579
Game 3400 of 14579
Game 3500 of 14579
Game 3600 of 14579
Game 3700 of 14579
Game 3800 of 14579
Game 3900 of 14579
Game 4000 of 14579
Game 4100 of 14579
Ga

In [11]:
run_expectancies

{'1': 0.4881348004824853,
 '2': 0.25735344027875195,
 '3': 0.0984478653733389,
 '4': 0.8586830815146428,
 '5': 0.5090228976570204,
 '6': 0.21038255558795096,
 '7': 1.1219821377124748,
 '8': 0.6566511190656976,
 '9': 0.3045029736618522,
 '10': 1.4348082595870206,
 '11': 0.8836735559988117,
 '12': 0.4127787659050486,
 '13': 1.3731734732109404,
 '14': 0.9538492954870329,
 '15': 0.3470828933474129,
 '16': 1.7628417564208783,
 '17': 1.1466756083540908,
 '18': 0.4446133796698523,
 '19': 1.963518681965284,
 '20': 1.3653797307594615,
 '21': 0.530365224713188,
 '22': 2.261624649859944,
 '23': 1.5325869180907483,
 '24': 0.7130789397028838}

## Run Values

In [12]:
def compute_run_values(start_year=2010, end_year=2010):
    
    # build the range of years
    year_range = range(start_year, end_year+1, 1)
    
    # build the complete dataset by concatenating year-by-year
    data_list = []
    for year in year_range:

        print(f"Getting the data for {year}...")
        data = pd.read_csv(f"../data/pitch_data_{year}.csv")
        
        cols_to_keep = ['game_pk', 'inning', 'inning_topbot', 'at_bat_number', 'outs_when_up', 
                        'on_1b', 'on_2b', 'on_3b', 'bat_score', 'events']
        
        data = data[cols_to_keep]

        data = data[data['inning'] <= 8]

        for col in ['game_pk', 'inning', 'at_bat_number', 'outs_when_up', 'bat_score']:
            data[col] = data[col].astype(int)

        for col in ['on_1b', 'on_2b', 'on_3b']:
            data[col] = data[col] == data[col]

        data = data[data['events'] == data['events']]

        data_list.append(data)

    data = pd.concat(data_list)
    
    # get the game state
    data['game_state'] = data.apply(get_game_state, axis=1)

    # keep only the columns we need
    data = data[['game_pk', 'inning', 'inning_topbot', 'at_bat_number', 'game_state', 'bat_score', 'events']]
    
    # sort the data so it's in chronological order
    data.sort_values(by=['game_pk', 'inning_topbot', 'inning', 'at_bat_number'], inplace=True)
    
    # group all of the different types of caught_stealing into one category
    def map_caught_stealing(x):
        caught_stealing_list = ['caught_stealing_2b', 'caught_stealing_3b', 'caught_stealing_home',
                                'pickoff_caught_stealing_2b', 'pickoff_caught_stealing_3b', 'pickoff_caught_stealing_home']
        if x in caught_stealing_list:
            return 'caught_stealing'
        else:
            return x
    data['events'] = data['events'].apply(map_caught_stealing)

    # intialize the event counts and run totals
    event_runs = {
        'home_run': {'count': 0, 'runs': 0, 'game_state_re': [0]},
        'triple': {'count': 0, 'runs': 0, 'game_state_re': [0]},
        'double': {'count': 0, 'runs': 0, 'game_state_re': [0]},
        'single': {'count': 0, 'runs': 0, 'game_state_re': [0]},
        'field_error': {'count': 0, 'runs': 0, 'game_state_re': [0]},
        'hit_by_pitch': {'count': 0, 'runs': 0, 'game_state_re': [0]},
        'walk': {'count': 0, 'runs': 0, 'game_state_re': [0]}
    }

    game_pk_list = data['game_pk'].unique().tolist()
    
    print(f"Number of games in the dataset: {len(game_pk_list)}")
    
    for j, game_pk in enumerate(game_pk_list):

        if j % 100 == 0:
            print(f"Game {j} of {len(game_pk_list)}")

        for inning_topbot in ['Top', 'Bot']:
            for inning in [1, 2, 3, 4, 5, 6, 7, 8]:

                data_temp = data[(data['game_pk'] == game_pk) & (data['inning'] == inning) & (data['inning_topbot'] == inning_topbot)]

                for i in range(len(data_temp)):
                    event = data_temp.iloc[i]['events']
                    game_state = data_temp.iloc[i]['game_state']
                    final_score = data_temp.iloc[-1]['bat_score']
                    state_score = data_temp.iloc[i]['bat_score']
                    try:
                        event_runs[event]['count'] += 1
                        event_runs[event]['runs'] += (final_score - state_score)
                        event_runs[event]['game_state_re'].append(run_expectancies[game_state])
                    except KeyError:
                        continue
        
    run_values = {i: {'N': event_runs[i]['count'],
                      'Runs to End of Inning': event_runs[i]['runs'],
                      'Average': event_runs[i]['runs'] / event_runs[i]['count'], 
                      'Starting RE': sum(event_runs[i]['game_state_re']) / (len(event_runs[i]['game_state_re'])+1),
                      'Run Value': (event_runs[i]['runs'] / (event_runs[i]['count']+1)) - (sum(event_runs[i]['game_state_re']) / (len(event_runs[i]['game_state_re'])+1))} for i in list(event_runs.keys())}        
    return run_values

In [13]:
run_values = compute_run_values(start_year=2013, end_year=2018)

Getting the data for 2013...
Getting the data for 2014...
Getting the data for 2015...
Getting the data for 2016...
Getting the data for 2017...
Getting the data for 2018...
Number of games in the dataset: 14579
Game 0 of 14579
Game 100 of 14579
Game 200 of 14579
Game 300 of 14579
Game 400 of 14579
Game 500 of 14579
Game 600 of 14579
Game 700 of 14579
Game 800 of 14579
Game 900 of 14579
Game 1000 of 14579
Game 1100 of 14579
Game 1200 of 14579
Game 1300 of 14579
Game 1400 of 14579
Game 1500 of 14579
Game 1600 of 14579
Game 1700 of 14579
Game 1800 of 14579
Game 1900 of 14579
Game 2000 of 14579
Game 2100 of 14579
Game 2200 of 14579
Game 2300 of 14579
Game 2400 of 14579
Game 2500 of 14579
Game 2600 of 14579
Game 2700 of 14579
Game 2800 of 14579
Game 2900 of 14579
Game 3000 of 14579
Game 3100 of 14579
Game 3200 of 14579
Game 3300 of 14579
Game 3400 of 14579
Game 3500 of 14579
Game 3600 of 14579
Game 3700 of 14579
Game 3800 of 14579
Game 3900 of 14579
Game 4000 of 14579
Game 4100 of 14579
Ga

In [14]:
run_values

{'home_run': {'N': 28097,
  'Runs to End of Inning': 52596,
  'Average': 1.871943623874435,
  'Starting RE': 0.47161691843812564,
  'Run Value': 1.4002600834837193},
 'triple': {'N': 4633,
  'Runs to End of Inning': 7161,
  'Average': 1.5456507662421757,
  'Starting RE': 0.4738732977974829,
  'Run Value': 1.0714439227463237},
 'double': {'N': 45032,
  'Runs to End of Inning': 55477,
  'Average': 1.2319461716112987,
  'Starting RE': 0.47682316913733014,
  'Run Value': 0.7550956459538252},
 'single': {'N': 148766,
  'Runs to End of Inning': 137864,
  'Average': 0.9267171262250783,
  'Starting RE': 0.48367867422639726,
  'Run Value': 0.44303222267950254},
 'field_error': {'N': 8533,
  'Runs to End of Inning': 8590,
  'Average': 1.0066799484354858,
  'Starting RE': 0.5163040346109578,
  'Run Value': 0.4902579527337809},
 'hit_by_pitch': {'N': 9049,
  'Runs to End of Inning': 7424,
  'Average': 0.820422146093491,
  'Starting RE': 0.4952138635700028,
  'Run Value': 0.32511762814270434},
 'wa