## Code for pulling timeframe data for each stat

In [31]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
import random
from datetime import datetime, timedelta

import tqdm

n = 10000

In [34]:
data = pd.read_csv('pitch_by_pitch_data.csv')


# # https://github.com/jldbc/pybaseball/tree/master/docs
# from pybaseball import statcast
# from pybaseball import batting_stats_bref
# from pybaseball import statcast_running_splits
# from pybaseball import statcast_batter
# from pybaseball import playerid_lookup
# from pybaseball import batting_stats_range

# id = int(playerid_lookup('judge', 'aaron')['key_mlbam'])
# speeds = statcast_running_splits(2022, 50)
# print(speeds)
# hitter_stats = statcast_batter('2022-06-01', '2022-06-30', id)
# # hitter_stats.head(2)


def compute_PA_stats(data, n, timeframes):
    
    data_start_dt = datetime.strptime(data['game_date'].iloc[-1], '%Y-%m-%d').date()
    PAs = []
    # Transform data to only contain 1 row for each PA
    # for index, row in data.iterrows():
    #     if not pd.isnull(row['events'])  :
    #         # 'intentionally' in row['des']
    #     # if row['events'] in ['strikeout', 'field_error', 'field_out', 'single', 'double', 'triple', 'home_run', 'double_play', 'grounded_into_double_play', 'force_out', 'sac_fly', 'fielders_choice', 'walk', 'hit_by_pitch', 'triple_play']:
    #         PAs.append(row)
    PA_df = data.dropna(subset = ['events'])
#     PA_stats = ['walk', 'strikeout', 'hit_by_pitch', 'ground_ball', 'fly_ball', 'line_drive', 'popup', 'xwoba']
    PA_stats = ['walk', 'hit_by_pitch', 'xwoba']
    stats_dict = dict(zip(PA_stats, ([] for _ in PA_stats)))
    time_frame_stats = dict(zip(PA_stats, ([] for _ in PA_stats)))
    used_idx = []
    # Select random samples
    for i in tqdm.tqdm(range(n)):
        idx = random.randint(0, len(PA_df))
        counter = 0
        while (idx in used_idx or datetime.strptime(PA_df['game_date'].iloc[idx], '%Y-%m-%d').date() < data_start_dt + timedelta(days=timeframes[-1])):
            counter +=1 
            idx =  (idx+1) % len(PA_df)
            if counter == len(PA_df):
                return concatenate_PA_data(time_frame_stats, stats_dict, PA_stats) 
            

        used_idx.append(idx)
        id = int(PA_df['batter'].iloc[idx])
 
        # id = int(playerid_lookup(last_nm, first_nm)['key_mlbam'])
        date = datetime.strptime(PA_df['game_date'].iloc[idx], '%Y-%m-%d').date()
        player_data = PA_df[PA_df['batter'].astype(int) == id]
        timeframe_data = []
        for index, row in player_data.iterrows():
            if datetime.strptime(row['game_date'], '%Y-%m-%d').date() < date:
                timeframe_data.append(row)
        player_timeframe_data = pd.DataFrame(timeframe_data)

       
        stats_dict = find_PA_outcome(stats_dict, PA_df, idx)
        
        for stat in stats_dict:
            time_frame_stats[stat].append(find_PA_time_stats(player_timeframe_data, stat, timeframes, date))
    return concatenate_PA_data(time_frame_stats, stats_dict, PA_stats)
def concatenate_PA_data(time_frame_stats, stats_dict, PA_stats):
    PA_data = dict(zip(PA_stats, ([] for _ in PA_stats)))
    for stat in PA_stats:
        
        PA_stat_data = []
        for i in range (n):
            data_row = time_frame_stats[stat][i]
            data_row.append(stats_dict[stat][i])
            
            PA_stat_data.append(data_row)
        PA_data[stat] = PA_stat_data
    return(PA_data)
def find_PA_outcome(stats_dict, PA_df, idx):
    for stat in stats_dict:
        # Find outcome of PA for stat
        if stat in ['walk', 'strikeout', 'hit_by_pitch']:
            if PA_df['events'].iloc[idx] == stat:
                stats_dict[stat].append(1)
            else:
                stats_dict[stat].append(0)
#         elif stat in ['ground_ball', 'fly_ball', 'line_drive', 'popup'] :
#             if not pd.isnull(PA_df['bb_type'].iloc[idx]):
#                 if PA_df['bb_type'].iloc[idx] == stat:
#                     stats_dict[stat].append(1)
#                 else:
#                     stats_dict[stat].append(0)
#             else:
#                 stats_dict[stat].append(0)
        else:
            if not pd.isnull(PA_df['estimated_woba_using_speedangle'].iloc[idx]):
                stats_dict[stat].append(PA_df['estimated_woba_using_speedangle'].iloc[idx])
            else:
                stats_dict[stat].append(PA_df['woba_value'].iloc[idx])
    return stats_dict
def find_PA_time_stats(player_timeframe_data, stat, time_frames, date):
    time_frame_stat_ctr = dict(zip(time_frames, (0 for _ in time_frames)))
    time_frame_PA_ctr = dict(zip(time_frames, (0 for _ in time_frames)))
    for timeframe in time_frames:
        start_dt =  date - timedelta(days=timeframe)
        for index, row in player_timeframe_data.iterrows():
            if datetime.strptime(row['game_date'], '%Y-%m-%d').date() >= start_dt:
                time_frame_PA_ctr[timeframe] += 1
                if stat in ['walk', 'strikeout', 'hit_by_pitch']:
                    if row['events'] == stat:
                        time_frame_stat_ctr[timeframe] += 1
#                 elif stat in ['ground_ball', 'fly_ball', 'line_drive', 'popup'] :
#                     if not pd.isnull(row['bb_type']):
#                         if row['bb_type'] == stat:
#                             time_frame_stat_ctr[timeframe] += 1
                else:
                    if not pd.isnull(row['estimated_woba_using_speedangle']):
                        time_frame_stat_ctr[timeframe] += row['estimated_woba_using_speedangle']
                    else:
                        time_frame_stat_ctr[timeframe] += row['woba_value']
    time_frame_stats = []
    for time in time_frames:
        if time_frame_PA_ctr[time] == 0:
            time_frame_stats.append(0)
        else:
            time_frame_stats.append(time_frame_stat_ctr[time]/time_frame_PA_ctr[time])
    return time_frame_stats
def compute_bb_stats(data, n, timeframes):
    bb_df = data.dropna(subset = ['type', 'launch_speed', 'launch_angle'])
    bb_df = bb_df[bb_df['type'] == 'X']
    bb_stats = ['launch_speed', 'launch_angle']
    stats_dict = dict(zip(bb_stats, ([] for _ in bb_stats)))
    data_start_dt = datetime.strptime(bb_df['game_date'].iloc[-1], '%Y-%m-%d').date()
    time_frame_stats = dict(zip(bb_stats, ([] for _ in bb_stats)))

    used_idx = []
    for i in tqdm.tqdm(range(n)):
    
        idx = random.randint(0, len(bb_df))
        counter = 0
        while (idx in used_idx or datetime.strptime(bb_df['game_date'].iloc[idx], '%Y-%m-%d').date() < data_start_dt + timedelta(days=timeframes[-1])):
            counter +=1 
            idx =  (idx+1) % len(bb_df)
            if counter == len(bb_df):
                return concatenate_PA_data(time_frame_stats, stats_dict, bb_stats) 

        used_idx.append(idx)
        id = int(bb_df['batter'].iloc[idx])
        date = datetime.strptime(bb_df['game_date'].iloc[idx], '%Y-%m-%d').date()
        player_data = bb_df[bb_df['batter'].astype(int) == id]
        timeframe_data = []
        for index, row in player_data.iterrows():
            if datetime.strptime(row['game_date'], '%Y-%m-%d').date() < date:
                timeframe_data.append(row)
        player_timeframe_data = pd.DataFrame(timeframe_data)
        
        stats_dict = find_bb_outcome(stats_dict, bb_df, idx)
        for stat in bb_stats:
            time_frame_stats[stat].append(find_bb_time_stats(player_timeframe_data, stat, timeframes, date))
    return concatenate_bb_data(bb_stats, time_frame_stats, stats_dict)
def concatenate_bb_data(bb_stats, time_frame_stats, stats_dict):
    bb_data = dict(zip(bb_stats, ([] for _ in bb_stats)))
    for stat in bb_stats:
        
        bb_stat_data = []
        for i in range (n):
            data_row = time_frame_stats[stat][i]
            data_row.append(stats_dict[stat][i])
            
            bb_stat_data.append(data_row)
        bb_data[stat] = bb_stat_data
    return(bb_data)
def find_bb_outcome(stats_dict, bb_df, idx):
    stats_dict['launch_speed'].append(bb_df['launch_speed'].iloc[idx])
    stats_dict['launch_angle'].append(bb_df['launch_angle'].iloc[idx])
    return stats_dict
def find_bb_time_stats(player_timeframe_data, stat, time_frames, date):
    time_frame_stat_ctr = dict(zip(time_frames, (0 for _ in time_frames)))
    time_frame_PA_ctr = dict(zip(time_frames, (0 for _ in time_frames)))
    for timeframe in time_frames:
        start_dt =  date - timedelta(days=timeframe)
        for index, row in player_timeframe_data.iterrows():
            if datetime.strptime(row['game_date'], '%Y-%m-%d').date() >= start_dt:
                time_frame_PA_ctr[timeframe] += 1
                time_frame_stat_ctr[timeframe] += row[stat]
    time_frame_stats = []
    for time in time_frames:
        if time_frame_PA_ctr[time] == 0:
            time_frame_stats.append(0)
        else:
            time_frame_stats.append(time_frame_stat_ctr[time]/time_frame_PA_ctr[time])
    return time_frame_stats

timeframes = [1, 15, 30, 60]
print('Generating PA data...')
PA_data = compute_PA_stats(data, n, timeframes)
# print('Generating BB data...')
# bb_data = compute_bb_stats(data, n, timeframes)

Generating PA data...


100%|██████████████████████████████████████████████████████████████████████████| 10000/10000 [6:29:24<00:00,  2.34s/it]


In [37]:
PA_data['walk']

[[0, 0.06896551724137931, 0.05217391304347826, 0.06944444444444445, 0],
 [0, 0.0, 0.08771929824561403, 0.10377358490566038, 0],
 [0, 0.1, 0.07142857142857142, 0.09411764705882353, 0],
 [0.0, 0.017241379310344827, 0.058333333333333334, 0.06666666666666667, 0],
 [0.0, 0.017857142857142856, 0.019230769230769232, 0.028985507246376812, 0],
 [0.0, 0.11764705882352941, 0.08333333333333333, 0.0625, 0],
 [0, 0.15, 0.13333333333333333, 0.13178294573643412, 0],
 [0.2, 0.16, 0.12264150943396226, 0.10970464135021098, 0],
 [0.0, 0.04081632653061224, 0.10377358490566038, 0.12380952380952381, 0],
 [0.0, 0.10204081632653061, 0.08333333333333333, 0.10059171597633136, 0],
 [0, 0.06779661016949153, 0.05511811023622047, 0.04878048780487805, 0],
 [0.3333333333333333,
  0.14545454545454545,
  0.13157894736842105,
  0.10628019323671498,
  0],
 [0.0, 0.016129032258064516, 0.04201680672268908, 0.06363636363636363, 1],
 [0, 0.043478260869565216, 0.037037037037037035, 0.05084745762711865, 0],
 [0.25, 0.1269841269

In [38]:
names = ['walk', 'hit_by_pitch', 'xwoba']
for name in names:
    print(name)
    curr_data = PA_data[name]
    curr_df = pd.DataFrame(curr_data, columns=['One', 'Fifteen', 'Thirty', 'Sixty', 'Result'])
    
    curr_df.to_csv(name + '.csv')

walk
hit_by_pitch
xwoba


In [3]:
# names = ['launch_speed', 'launch_angle']
# for name in names:
#     print(name)
#     curr_data = bb_data[name]
#     curr_df = pd.DataFrame(curr_data, columns=['One', 'Fifteen', 'Thirty', 'Sixty', 'Result'])
    
#     curr_df.to_csv(name + '.csv')

launch_speed
launch_angle


## Load timeframe data, run regressions

In [9]:
walk = pd.read_csv('walk.csv', index_col=0)
strikeout = pd.read_csv('strikeout.csv', index_col=0)
hit_by_pitch = pd.read_csv('hit_by_pitch.csv', index_col=0)
ground_ball = pd.read_csv('ground_ball.csv', index_col=0)
fly_ball = pd.read_csv('fly_ball.csv', index_col=0)
line_drive = pd.read_csv('line_drive.csv', index_col=0)
popup = pd.read_csv('popup.csv', index_col=0)
xwoba = pd.read_csv('xwoba.csv', index_col=0)

In [10]:
launch_speed = pd.read_csv('launch_speed.csv', index_col=0)
launch_angle = pd.read_csv('launch_angle.csv', index_col=0)

In [11]:
X = walk.drop(['Result'], axis=1)
y = walk['Result']

In [28]:
clf = LogisticRegression().fit(X, y)
print(clf.coef_)
print(clf.score(X, y))
print(clf.predict_proba(np.array([[0.000000, 0.277800, 0.236824, 0.277804]])))




[[-0.1035807  -0.09838077 -0.13664003 -0.13527315]]
0.9
[[0.78825762 0.21174238]]


In [32]:
# predict probabilities for each class using the trained model
predictions = clf.predict_proba(X)

# calculate the sum of the squared differences of the predicted probabilities and the actual binary values of y
squared_differences = [(pred[0] - y[i])**2 + (pred[1] - y[i])**2 for i, pred in enumerate(predictions)]
sum_squared_differences = sum(squared_differences)

# print the result
print(f"Mean Squared Error: {sum_squared_differences / len(walk)}")

Mean Squared Error: 0.0006549404861320794
