#### Exploring Which Dataset to Use

In [None]:
import pandas as pd
import os
import numpy as np
from difflib import get_close_matches

os.chdir('C:/Users/dalto/OneDrive/Pictures/Documents/Projects/Coding Projects/woba modeling/data/')

#### Load Data

In [67]:
m1_woba = pd.read_csv('sim_results/m1_wobacon_23_25.csv')
m2_woba = pd.read_csv('sim_results/m2_wobacon_23_25.csv')
woba = pd.read_csv('pitch/savant.csv')

#### Correlations

In [69]:
merged = woba.merge(m1_woba[['name', 'year', 'count', 'monte']], on=['name', 'year'], how='right', suffixes=('', '_m1'))
merged = merged.merge(m2_woba[['name', 'year', 'count', 'monte']], on=['name', 'year'], how='right', suffixes=('', '_m2'))
merged = merged.rename(columns={'count': 'count_m1', 'monte': 'm1_monte', 'count_m2': 'count_m2', 'monte_m2': 'm2_monte'})

In [70]:
missing_names = merged[merged.isnull().any(axis=1)]['name'].unique()
existing_names = merged[~merged.isnull().any(axis=1)]['name'].unique()

name_map = {}
for name in missing_names:
    matches = get_close_matches(name, existing_names, n=1, cutoff=0.75)
    if matches:
        name_map[name] = matches[0]

del name_map['aramis garcia']
del name_map['zack collins']


In [71]:
for missing_name, matched_name in name_map.items():
    idx_missing = merged[(merged['name'] == missing_name) & (merged.isnull().any(axis=1))].index
    for idx in idx_missing:
        year = merged.loc[idx, 'year']
        match_row = merged[(merged['name'] == matched_name) & (merged['year'] == year) & (~merged.isnull().any(axis=1))]
        if not match_row.empty:
            for col in merged.columns:
                if pd.isnull(merged.at[idx, col]):
                    merged.at[idx, col] = match_row.iloc[0][col]

In [72]:
merged = merged[['name', 'year', 'm1_monte', 'm2_monte', 'wobacon', 'xwobacon', 'pa']]

In [73]:
merged = merged[merged['pa'] > 50]

#### Correlation

stability

In [74]:
stats = ['m1_monte', 'm2_monte', 'wobacon', 'xwobacon']
corr_next = merged.copy()

for stat in stats:
    corr_next[f'{stat}_next'] = corr_next.groupby('name')[stat].shift(-1)

corr_next = corr_next.dropna(subset=[f'{stat}_next' for stat in stats])

corrs = {}
for stat in stats:
    corrs[stat] = corr_next[[stat, f'{stat}_next']].corr().iloc[0, 1]

print(corrs)

{'m1_monte': 0.7366470579840835, 'm2_monte': 0.8213546740863039, 'wobacon': 0.3637345842745023, 'xwobacon': 0.6119147654085051}


dataset to next year wobacon

In [76]:
prev_stat_next_woba_corr = {}
for stat in stats:
    prev_stat_next_woba_corr[stat] = corr_next[[stat, 'wobacon_next']].corr().iloc[0, 1]

print(prev_stat_next_woba_corr)

{'m1_monte': 0.48053778700640043, 'm2_monte': 0.4812626747032464, 'wobacon': 0.3637345842745023, 'xwobacon': 0.48420167040204526}


2023 stat to 2024 wobacon

In [77]:
stat_2024_to_2025_woba_corr = {}
df_2024 = corr_next[corr_next['year'] == 2023]
for stat in stats:
    stat_2024_to_2025_woba_corr[stat] = df_2024[[stat, 'wobacon_next']].corr().iloc[0, 1]

print(stat_2024_to_2025_woba_corr)

{'m1_monte': 0.450303028626735, 'm2_monte': 0.4959605353871911, 'wobacon': 0.3358929656530977, 'xwobacon': 0.46863996503932354}


2024 stat to 2025 wobacon

In [78]:
stat_2024_to_2025_woba_corr = {}
df_2024 = corr_next[corr_next['year'] == 2024]
for stat in stats:
    stat_2024_to_2025_woba_corr[stat] = df_2024[[stat, 'wobacon_next']].corr().iloc[0, 1]

print(stat_2024_to_2025_woba_corr)

{'m1_monte': 0.51724618052589, 'm2_monte': 0.46735983393446834, 'wobacon': 0.4012127582723429, 'xwobacon': 0.5061996175636763}
