In [1]:
# Import appropriate libraries

import pandas, json, requests, urllib, io, json
import tensorflow as tf
from tensorflow import keras        
import numpy as np        
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from tensorflow.keras import models, layers, Input, optimizers, callbacks
from tensorflow.keras import backend as K
import tensorflow_probability as tfp
import random


In [None]:
# Flags whether an at-bat's event counts as the batter reaching base (1) or not (0)

def get_ob(row):
    """Return 1 when the row's "event" is an on-base event, otherwise 0.

    ``row`` can be any object indexable by the key "event" (for example a
    DataFrame row or a plain dict).
    """
    reached_base = row["event"] in (
        'Single', 'Walk', 'Double', 'Home Run', 'Hit By Pitch', 'Triple',
    )
    return 1 if reached_base else 0


In [2]:
# Gets at-bat data csv

def get_at_bat_data(csv_path="../all_atbat.csv"):
    """Load the at-bat CSV and label every remaining at-bat with ``on_base``.

    Parameters
    ----------
    csv_path : str or file-like, optional
        Source of the at-bat data, passed straight to ``pandas.read_csv``.
        Defaults to the path this notebook has always read. The data must
        contain an "event" column.

    Returns
    -------
    pandas.DataFrame
        The at-bats with the excluded events removed and an added "on_base"
        column holding the value ``get_ob`` returns for each row.
    """
    # Events the author leaves out of the OBP calculation altogether.
    at_bat_remove_events = ['Sac Bunt', 'Intent Walk', 'Runner Out', 'Batter Interference', 'Catcher Interference']

    at_bats = pandas.read_csv(csv_path)

    # .copy() detaches the filtered rows from the frame that was read, so the
    # column assignment below writes to a real frame instead of a slice
    # (assigning to the bare .loc result triggers SettingWithCopyWarning).
    at_bats = at_bats.loc[~at_bats['event'].isin(at_bat_remove_events)].copy()

    at_bats["on_base"] = at_bats.apply(get_ob, axis=1)

    return at_bats

# Load the at-bat table once; it is passed to get_total_pitch_data further down.
at_bats = get_at_bat_data()

# Show the first rows as the cell output for a quick visual check.
at_bats.head()

Unnamed: 0.1,Unnamed: 0,ab_id,batter_id,event,pitcher_id,on_base
0,0,2015000001,572761,Groundout,452657,0
1,1,2015000002,518792,Double,452657,1
2,2,2015000003,407812,Single,452657,1
3,3,2015000004,425509,Strikeout,452657,0
4,4,2015000005,571431,Strikeout,452657,0


In [8]:
# Gets the pitch data csv
def get_total_pitch_data(at_bats, pitch_csv="../all_pitch.csv"):
    """Load the pitch CSV, normalise its zone columns and attach ``on_base``.

    Parameters
    ----------
    at_bats : pandas.DataFrame
        At-bat table providing the "ab_id" and "on_base" columns.
    pitch_csv : str or file-like, optional
        Source of the pitch data, passed to ``pandas.read_csv``. Defaults to
        the path this notebook has always read. The data must contain the
        columns "ab_id", "zone" and "zones"; a "res" column, if present, is
        renamed to "result".

    Returns
    -------
    pandas.DataFrame
        One row per pitch whose "ab_id" also appears in ``at_bats`` (inner
        merge). "zone" holds the raw "zones" label, "zones" holds that label
        with any trailing a/A/b/B characters stripped and converted to int.
    """
    try:
        # Current pandas spelling. "warn" drops malformed rows but still
        # reports them, which is what error_bad_lines=False used to do.
        pitchData = pandas.read_csv(pitch_csv, on_bad_lines="warn")
    except TypeError:
        # pandas < 1.3 does not accept on_bad_lines; use the legacy keyword
        # (removed in pandas 2.0) there.
        pitchData = pandas.read_csv(pitch_csv, error_bad_lines=False)

    pitchData = pitchData.rename({"res": "result"}, axis=1)

    # Replace the original "zone" column with the raw "zones" label before the
    # suffix is stripped; delete-then-assign places the column at the end.
    del pitchData["zone"]
    pitchData["zone"] = pitchData["zones"]

    # astype(str) first so cells that were parsed as numbers do not break the
    # suffix stripping.
    pitchData["zones"] = pitchData["zones"].astype(str).str.rstrip("aAbB").astype(int)

    pitchData = pitchData.merge(at_bats[["ab_id", "on_base"]], on="ab_id")

    return pitchData


# Pitch-level table with the at-bat outcome attached; the cells below filter this frame.
total_pitch_data = get_total_pitch_data(at_bats)


In [None]:
# Prints OBP across counts for a given pitcher and batter
def print_obp_across_counts(pitcher_id,batter_id, pitch_df):
    """Print the value from ``get_obp_at_count`` for every ball-strike count.

    ``pitch_df`` is narrowed to the rows of the given pitcher/batter pair and
    the counts are walked with balls (0-3) as the outer index and strikes
    (0-2) as the inner one. Each count prints a "<balls><strikes> COUNT"
    header followed by the computed value.
    """
    is_pitcher = pitch_df["pitcher_id"] == pitcher_id
    is_batter = pitch_df["batter_id"] == batter_id
    matchup_df = pitch_df[is_pitcher & is_batter]

    all_counts = [(balls, strikes) for balls in range(4) for strikes in range(3)]
    for balls, strikes in all_counts:
        print("%s%s COUNT" %(balls,strikes))
        # get_obp_at_count takes strikes before balls.
        value = get_obp_at_count(matchup_df, strikes, balls)
        print("Value: %s" %(value))

In [9]:

# Gets OBP at a count
def get_obp_at_count(filtered_df, s_count, b_count):
    """Return the mean of "on_base" over the rows at one ball-strike count.

    Parameters
    ----------
    filtered_df : pandas.DataFrame
        Frame with "s_count", "b_count" and "on_base" columns.
    s_count : int
        Strike count to select.
    b_count : int
        Ball count to select.

    Returns
    -------
    float
        Mean of the selected "on_base" values, or NaN when no row matches.

    Also prints how many rows were selected.
    """
    at_count = (filtered_df["s_count"] == s_count) & (filtered_df["b_count"] == b_count)
    outcomes = filtered_df.loc[at_count, "on_base"].to_numpy()
    print("Num of pitches seen at this count: %s" %(len(outcomes)))
    if len(outcomes) == 0:
        # np.mean of an empty array also gives NaN but emits a RuntimeWarning;
        # return the same value explicitly so empty counts stay quiet.
        return np.nan
    return np.mean(outcomes)
    
# NOTE(review): the author labels this matchup as Gerrit Cole (pitcher) vs. Chris Davis (batter) — TODO confirm that IDs 453286 / 546318 really belong to those players.
print_obp_across_counts(453286, 546318, total_pitch_data) # (pitcher_id, batter_id); author's label: Gerrit Cole, Chris Davis

00 COUNT
Num of pitches seen at this count: 47
Value: 0.425531914893617
01 COUNT
Num of pitches seen at this count: 24
Value: 0.2916666666666667
02 COUNT
Num of pitches seen at this count: 13
Value: 0.38461538461538464
10 COUNT
Num of pitches seen at this count: 20
Value: 0.6
11 COUNT
Num of pitches seen at this count: 13
Value: 0.23076923076923078
12 COUNT
Num of pitches seen at this count: 22
Value: 0.22727272727272727
20 COUNT
Num of pitches seen at this count: 9
Value: 0.6666666666666666
21 COUNT
Num of pitches seen at this count: 8
Value: 0.75
22 COUNT
Num of pitches seen at this count: 21
Value: 0.5238095238095238
30 COUNT
Num of pitches seen at this count: 3
Value: 0.6666666666666666
31 COUNT
Num of pitches seen at this count: 5
Value: 0.8
32 COUNT
Num of pitches seen at this count: 11
Value: 0.9090909090909091


In [13]:
# Selected pitcher and batter IDs for each group (0, 1, 2), twenty IDs per group.
# NOTE(review): presumably the groups are thirds of some player ranking — confirm how they were chosen.
selected_thirds = {
    'pitchers': {
        0: [
            527048, 451596, 501957, 503449, 543022,
            460059, 606131, 430912, 453385, 446321,
            571800, 572096, 628333, 430580, 572750,
            554234, 605541, 605156, 643230, 425386,
        ],
        1: [
            489119, 430935, 502043, 592662, 543699,
            488768, 461829, 282332, 518633, 608379,
            502327, 519455, 434538, 467100, 573186,
            458681, 425794, 433587, 592717, 605200,
        ],
        2: [
            453286, 452657, 518516, 519242, 527054,
            434378, 519144, 500779, 502042, 425844,
            594798, 453562, 545333, 502188, 571666,
            543294, 477132, 572971, 457918, 544931,
        ],
    },
    'batters': {
        0: [
            572287, 429667, 488721, 595978, 543376,
            425784, 506560, 542208, 425772, 408299,
            572204, 435064, 543216, 641525, 592444,
            431171, 571912, 596143, 542194, 571974,
        ],
        1: [
            453943, 448801, 405395, 446334, 520471,
            516770, 607680, 435622, 543063, 596059,
            430945, 457803, 545341, 608365, 595281,
            500871, 578428, 461314, 571740, 474568,
        ],
        2: [
            502671, 467793, 458015, 605141, 545361,
            593428, 592178, 547180, 547989, 518626,
            453568, 519203, 474832, 572821, 592518,
            518934, 543333, 451594, 429665, 458731,
        ],
    },
}

In [14]:
#Gathers and aggregates OBP at counts for all matchups between selected players in each group

# Count labels "<balls><strikes>", in the order 00, 01, 02, 10, ..., 32.
count_keys = [str(b_count) + str(s_count) for b_count in range(4) for s_count in range(3)]

# Index the on_base outcomes ONCE by (pitcher, batter, balls, strikes).
# The previous version re-scanned the whole pitch table for every
# group x count x pitcher x batter combination (43,200 full-table filters).
all_selected_pitchers = {pid for ids in selected_thirds["pitchers"].values() for pid in ids}
all_selected_batters = {bid for ids in selected_thirds["batters"].values() for bid in ids}
selected_pitches = total_pitch_data[
    total_pitch_data["pitcher_id"].isin(all_selected_pitchers)
    & total_pitch_data["batter_id"].isin(all_selected_batters)
]
# groupby keeps the original row order inside every group, so each array holds
# exactly what the per-matchup boolean filter used to return.
outcomes_by_matchup_count = {
    key: series.to_numpy()
    for key, series in selected_pitches.groupby(
        ["pitcher_id", "batter_id", "b_count", "s_count"], sort=False
    )["on_base"]
}

# outcomes_total[group][count] -> float array of every on_base value observed
# results[group][count]        -> mean of that array (NaN when it is empty)
# where group is "<pitcher group><batter group>".
outcomes_total = {}
results = {}
for i, pitcher_ids in selected_thirds["pitchers"].items():
    for j, batter_ids in selected_thirds["batters"].items():
        group_key = str(i) + str(j)
        outcomes_total[group_key] = {}
        results[group_key] = {}
        for count in count_keys:
            b_count = int(count[0])
            s_count = int(count[1])
            # The empty float seed keeps the result float64 and makes the
            # concatenation valid even when no matchup has data at this count.
            chunks = [np.array([])]
            for pitcher in pitcher_ids:
                for batter in batter_ids:
                    chunk = outcomes_by_matchup_count.get((pitcher, batter, b_count, s_count))
                    if chunk is not None:
                        chunks.append(chunk)
            # Concatenate once per (group, count) instead of once per matchup.
            outcomes = np.concatenate(chunks)
            outcomes_total[group_key][count] = outcomes
            # Same value np.mean gives for an empty array, without the RuntimeWarning.
            results[group_key][count] = np.mean(outcomes) if outcomes.size else np.nan
                

In [24]:
# Converts arrays to lists for JSON serialization

# Mirror outcomes_total with every array turned into a plain list, since
# json cannot serialise numpy arrays.
outcomes_total_for_json = {}
for matchup, outcomes_by_count in outcomes_total.items():
    outcomes_total_for_json[matchup] = {
        count: outcomes.tolist() for count, outcomes in outcomes_by_count.items()
    }
        

In [25]:
# Saves the empirical on-base outcomes per group and count (the raw values, not the averaged OBP)

with open("empirical_state_outcomes.json","w") as outfile:
    outfile.write(json.dumps(outcomes_total_for_json))