In [1]:
import os, time, itertools, sys
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
import matplotlib.pyplot as plt
import seaborn as sns
import multiprocessing as mp
from copy import deepcopy
from pybaseball import *
sys.path.append('/home/dcooper/rockies/RockiesAnalysis/')
from utils.plotting.spray_chart import *
from utils.analysis.get_expected_outcomes import get_expected_outcomes



## Get batted ball data

In [2]:
exp_outcomes, all_BIP_data, ev_labels, la_labels, sa_labels = get_expected_outcomes()

## pCBB

In [3]:
def get_player_dfs(last_name=None, first_name=None, player_id=None, year=2025, home_team='COL', not_home_team=None):
    
    # Get Player ID
    if player_id is None:
        if last_name == 'Bell':
            player_id = playerid_lookup(last_name, first_name, ignore_accents=True)['key_mlbam'][1]
        else:
            player_id = playerid_lookup(last_name, first_name, ignore_accents=True)['key_mlbam'][0]

    # Add filters
    player_df = all_BIP_data.loc[all_BIP_data['batter'] == player_id]
    if year is not None:
        player_df = player_df.loc[player_df['game_year'] == year]
    if home_team is not None:
        player_df = player_df.loc[player_df['home_team'] == home_team]
    if not_home_team is not None:
        player_df = player_df.loc[player_df['home_team'] != not_home_team]
    assert player_df.shape[0] > 0

    print(f'Found {player_df.shape[0]} batted balls for {first_name} {last_name}')

    # Iterate through contact bins and add to player_df
    player_contact = pd.DataFrame(columns=['EV', 'LA', 'SA', 'count', 'ΔHR', 'Δ3B', 'Δ2B', 'Δ1B', 'ΔOut', 'ΔBases'])
    for i, ev_label in enumerate(ev_labels):
        for j, la_label in enumerate(la_labels):
            for k, sa_label in enumerate(sa_labels):
                count = player_df.loc[player_df['launch_speed_bin'] == ev_label].loc[player_df['launch_angle_bin'] == la_label].loc[player_df['spray_angle_bin'] == sa_label].shape[0]
                prob = count / player_df.shape[0]
                d_outcome = exp_outcomes.loc[exp_outcomes['EV'] == ev_label].loc[exp_outcomes['LA'] == la_label].loc[exp_outcomes['SA'] == sa_label]
                if d_outcome.shape[0] > 0:
                    d_outcome = d_outcome[['ΔHR', 'Δ3B', 'Δ2B', 'Δ1B', 'ΔOut', 'ΔBases']].to_numpy()[0]                
                else:
                    d_outcome = np.zeros(6)  
                player_contact.loc[player_contact.shape[0]] = [ev_label, la_label, sa_label, count] + list(d_outcome * prob)

    # Sort
    player_contact = player_contact.sort_values('ΔBases', ascending=False)
    player_contact = player_contact.reset_index(drop=True)

    return player_contact, player_df

def get_statcast_batter(last_name, first_name):
    count = 10
    while count > 0:
        try:
            if last_name == 'Bell':
                player_id = playerid_lookup(last_name, first_name, ignore_accents=True)['key_mlbam'][1]
            else:
                player_id = playerid_lookup(last_name, first_name, ignore_accents=True)['key_mlbam'][0]            
            player_data = statcast_batter(start_dt='2025-03-27', end_dt='2025-09-28', player_id=player_id)
            count = 0
        except:
            if count == 1:
                raise Exception(r['last'], r['first'])
            else:
                count -= 1

    return player_data

In [13]:
first_base_free_agents = np.array([
    ['Alonso', 'Pete'],
    ['Naylor', 'Josh'],
    ["O'Hearn", 'Ryan'],
    ['Santana', 'Carlos'],
    ['Lowe', 'Nathaniel'],
    ['Arraez', 'Luis'],
    ['Goldschmidt', 'Paul'],
    ['Hoskins', 'Rhys'],
    ['Turner', 'Justin'],
    ['Solano', 'Donovan'],
    ['Smith', 'Dominic'],
    ['Bell', 'Josh'],
    ['France', 'Ty'],
    ['Wade', 'LaMonte'],
    ['Tellez', 'Rowdy'],
    ['Elko', 'Tim'],
    ['Flores', 'Wilmer'],
    ['Toglia', 'Michael'],
])

In [27]:
# Methods
def get_CBwOBA(d1B, d2B, d3B, dHR, AB, BB, IBB, HBP, SF):
    return (0.882*d1B + 1.252*d2B + 1.584*d3B + 2.037*dHR) / (AB + BB - IBB + HBP + SF)

def get_wOBA(single, double, triple, home_run, AB=None, BB=None, IBB=None, HBP=None, SF=None, PA=None):
    if PA is not None:
        return 0.691*(BB-IBB) + 0.722*HBP + 0.882*single + 1.252*double + 1.584*triple + 2.037*home_run) / PA
    else:
        return (0.691*(BB-IBB) + 0.722*HBP + 0.882*single + 1.252*double + 1.584*triple + 2.037*home_run) / (AB + BB - IBB + HBP + SF)

def get_batting_runs(wRAA, PA, lPA=91666, lgR=10817, PF=1.13, lwRC=10864):
    return wRAA + ((lgR/lPA) - (PF*lgR/lPA)) * PA + ((lgR/lPA) - (lwRC/lPA)) * PA


In [24]:
first_basemen_CBB = pd.DataFrame(columns=['first', 'last', 'ΔHR', 'Δ3B', 'Δ2B', 'Δ1B', 'ΔOut', 'CBB', 'BIP'])
first_basemen_pCBB = pd.DataFrame(columns=['first', 'last', 'pBIP', 'pCBB', 'p1B', 'p2B', 'p3B', 'pHR'])
for last, first in first_base_free_agents:

    # Load data
    try:
        batter_CBB, batter_bb = get_player_dfs(last, first, home_team=None)
        batter_data = get_statcast_batter(last, first)
    except:
        print(f'Failed for {last}, {first}')


    # Check for >= 100 batted balls
    if batter_CBB['count'].sum() >= 100:

        # Get CBB 
        CBB = batter_CBB['ΔBases'].sum(axis=0)

        # Get events
        player_events = batter_data['events'].dropna().to_numpy()
        PA = len(player_events[player_events != 'truncated_pa'])
        AB = len(player_events[~np.isin(player_events, np.array(['sac_fly', 'walk', 'intent_walk' 'hit_by_pitch', 'sac_bunt', 'truncated_pa', 'fielders_choice', 'field_error', 'force_out', 'catcher_interf']))])
        BB = len(player_events[np.isin(player_events, np.array(['walk', 'intent_walk']))])
        IBB = len(player_events[player_events == 'intent_walk'])
        SO = len(player_events[player_events == 'strikeout'])
        HBP = len(player_events[player_events == 'hit_by_pitch'])
        SF = len(player_events[player_events == 'sac_fly'])
        singles = len(player_events[player_events == 'single'])
        doubles = len(player_events[player_events == 'double'])
        triples = len(player_events[player_events == 'triple'])
        home_runs = len(player_events[player_events == 'home_run'])        

        # Get BIP
        BIP = (PA - (SO + BB + HBP)) / PA
        
        # Get pBIP
        c_pBIP = (BIP + 0.0409) * 251
        nc_pBIP = (BIP) * 251
            
        # Get pCBB
        pCBB = CBB * c_pBIP

        # Get pOutcomes
        p1B = c_pBIP * ((singles / (BIP * PA)) + batter_CBB['Δ1B'].sum(axis=0)) + nc_pBIP * (singles / (BIP * PA))
        p2B = c_pBIP * ((doubles / (BIP * PA)) + batter_CBB['Δ2B'].sum(axis=0)) + nc_pBIP * (doubles / (BIP * PA))
        p3B = c_pBIP * ((triples / (BIP * PA)) + batter_CBB['Δ3B'].sum(axis=0)) + nc_pBIP * (triples / (BIP * PA))
        pHR = c_pBIP * ((home_runs / (BIP * PA)) + batter_CBB['ΔHR'].sum(axis=0)) + nc_pBIP * (home_runs / (BIP * PA))

        # Get projected value
        pwOBA = get_wOBA(p1B, p2B, p3B, pHR, 

        
        first_basemen_CBB.loc[first_basemen_CBB.shape[0]] = [first, last] + batter_CBB[['ΔHR', 'Δ3B', 'Δ2B', 'Δ1B', 'ΔOut']].sum(axis=0).to_list() + [CBB, BIP]
        first_basemen_pCBB.loc[first_basemen_pCBB.shape[0]] = [first, last, pBIP, pCBB, pwOBA, p1B, p2B, p3B, pHR]
    else:
        print('>>skipping', last, first)

    break

Found 469 batted balls for Pete Alonso
Gathering Player Data


In [25]:
first_basemen_CBB.sort_values('CBB', ascending=False)

Unnamed: 0,first,last,ΔHR,Δ3B,Δ2B,Δ1B,ΔOut,CBB,BIP
0,Pete,Alonso,0.002708,0.005586,0.00887,0.004478,-0.021343,0.048827,0.664316


In [26]:
first_basemen_pCBB.sort_values('pCBB', ascending=False)

Unnamed: 0,first,last,pBIP,pCBB,p1B,p2B,p3B,pHR
0,Pete,Alonso,177.0092,8.64288,66.47777,31.493314,1.718672,28.213069


In [None]:
# styled_df = first_basemen_CBB.sort_values('pCBB', ascending=False)[['first', 'last', 'CBB', 'CBB%', 'pCBB']].head(20).round(2)
# styled_df = styled_df.style.format({
#         'CBB': '{:.2f}',
#         'CBB%': '{:.2f}',
#         'pCBB': '{:.2f}'
#     }).background_gradient(
#     cmap='bwr',
#     vmin=0,
#     vmax=styled_df['pCBB'].max(),
#     subset=['pCBB']
# ).background_gradient(
#     cmap='bwr',
#     vmin=0,
#     vmax=styled_df['CBB%'].max(),
#     subset=['CBB%']
# ).background_gradient(
#     cmap='bwr',
#     vmin=0,
#     vmax=styled_df['CBB'].max(),
#     subset=['CBB']
# )
# styled_df

## pwRAA

In [None]:
"""
array(['field_out', 'sac_fly', 'double', 'single', 'home_run',
       'force_out', 'double_play', 'grounded_into_double_play',
       'field_error', 'fielders_choice_out', 'fielders_choice',
       'triple_play', 'triple', 'sac_fly_double_play', 'sac_bunt',
       'sac_bunt_double_play'], dtype=object)"""

In [None]:
# Methods
def get_CBwOBA(d1B, d2B, d3B, dHR, AB, BB, IBB, HBP, SF):
    return (0.882*d1B + 1.252*d2B + 1.584*d3B + 2.037*dHR) / (AB + BB - IBB + HBP + SF)

def get_wOBA(single, double, triple, home_run, AB, BB, IBB, HBP, SF):
    return (0.691*(BB-IBB) + 0.722*HBP + 0.882*single + 1.252*double + 1.584*triple + 2.037*home_run) / (AB + BB - IBB + HBP + SF)

def get_batting_runs(wRAA, PA, lPA=91666, lgR=10817, PF=1.13, lwRC=10864):
    return wRAA + ((lgR/lPA) - (PF*lgR/lPA)) * PA + ((lgR/lPA) - (lwRC/lPA)) * PA


In [None]:
first_basemen_CBB['wOBA'] = np.zeros(first_basemen_CBB.shape[0])
first_basemen_CBB['CBwOBA'] = np.zeros(first_basemen_CBB.shape[0])
first_basemen_CBB['pwOBA'] = np.zeros(first_basemen_CBB.shape[0])
first_basemen_CBB['pwRAA'] = np.zeros(first_basemen_CBB.shape[0])
first_basemen_CBB['pBattingRuns'] = np.zeros(first_basemen_CBB.shape[0])

for i, r in first_basemen_CBB.iterrows():


    # Get PA, AB, BB, IBB, HBP, SF
    player_data = get_statcast_batter(r['last'], r['first'])
    player_events = player_data['events'].dropna().to_numpy()
    PA = len(player_events[player_events != 'truncated_pa'])
    AB = len(player_events[~np.isin(player_events, np.array(['sac_fly', 'walk', 'intent_walk' 'hit_by_pitch', 'sac_bunt', 'truncated_pa', 'fielders_choice', 'field_error', 'force_out', 'catcher_interf']))])
    BB = len(player_events[np.isin(player_events, np.array(['walk', 'intent_walk']))])
    IBB = len(player_events[player_events == 'intent_walk'])
    HBP = len(player_events[player_events == 'hit_by_pitch'])
    SF = len(player_events[player_events == 'sac_fly'])
    singles = len(player_events[player_events == 'single'])
    doubles = len(player_events[player_events == 'double'])
    triples = len(player_events[player_events == 'triple'])
    home_runs = len(player_events[player_events == 'home_run'])
    
    try:
        wOBA = get_wOBA(singles, doubles, triples, home_runs, AB, BB, IBB, HBP, SF)
        CBwOBA = get_CBwOBA(r['Δ1B'], r['Δ2B'], r['Δ3B'], r['ΔHR'], AB, BB, IBB, HBP, SF)
        pwOBA = wOBA + CBwOBA
        pwRAA = 502 * ((pwOBA - 0.314) / 1.232)
        pBattingRuns = get_batting_runs(pwRAA, 502)
    except:
        CBwOBA = 0
    first_basemen_CBB.loc[i, 'CBwOBA'] = CBwOBA
    first_basemen_CBB.loc[i, 'wOBA'] = wOBA
    first_basemen_CBB.loc[i, 'pwOBA'] = pwOBA
    first_basemen_CBB.loc[i, 'pwRAA'] = pwRAA
    first_basemen_CBB.loc[i, 'pBattingRuns'] = pwRAA

In [None]:
first_basemen_CBB.sort_values('pBattingRuns', ascending=False).round(3)