### Correlation and Value of Each Stat

In [17]:
import pandas as pd 
import numpy as np
import os
import matplotlib.pyplot as plt
os.chdir('C:/Users/dalto/OneDrive/Pictures/Documents/Projects/Swing Decisons')

In [18]:
swing = pd.read_csv('./data/final_datasets/swing_values.csv')
fg = pd.read_csv('./data/final_datasets/data_15_24.csv')

In [19]:
dfs = swing.merge(fg, how='left', left_on=['batter', 'game_date'], right_on=['Name', 'Season'])

In [20]:
# Select columns of interest for correlation
cols_of_interest = [
'avg_swing_value',
]

# Add all other numeric columns except identifiers and duplicates
exclude_cols = [
    'Unnamed: 0_x', 'batter', 'game_date', 'Unnamed: 0_y', 'Season', 'Name', 'Team'
]
numeric_cols = [col for col in dfs.select_dtypes(include=[np.number]).columns if col not in exclude_cols]

# Ensure all columns of interest are included
corr_cols = list(set(cols_of_interest + numeric_cols))

# Compute correlation matrix
corr_matrix = dfs[corr_cols].corr()

# Show correlations of the 5 key stats with all other stats
corr_matrix_stat = corr_matrix[['avg_value_total', 'avg_swing_value', 'swing_value_added']]

In [21]:
# Unstack, filter, and sort by absolute value, excluding self and cols_of_interest-to-cols_of_interest
corr_unstacked = corr_matrix_stat.stack().reset_index()
corr_unstacked.columns = ['stat', 'other_stat', 'correlation']

# Exclude relationships where both are in cols_of_interest (self relationships)
mask = ~corr_unstacked['other_stat'].isin(cols_of_interest)
filtered_corrs = corr_unstacked[mask]

# Sort by absolute correlation value and show top 20
top20 = filtered_corrs.reindex(filtered_corrs['correlation'].abs().sort_values(ascending=False).index).head(20)
print(top20)

                 stat         other_stat  correlation
0     avg_value_total    avg_value_total     1.000000
17  swing_value_added  swing_value_added     1.000000
18    avg_swing_value    avg_value_total     0.849819
20    avg_swing_value  swing_value_added    -0.796730
84           O-Swing%    avg_value_total    -0.741924
72         bb_percent    avg_value_total     0.724742
87                BB%    avg_value_total     0.723811
15  swing_value_added    avg_value_total    -0.659730
2     avg_value_total  swing_value_added    -0.659730
33      swing_percent    avg_value_total    -0.613705
12             SwStr%    avg_value_total    -0.458797
78              xwoba    avg_value_total     0.402293
86           O-Swing%  swing_value_added     0.379782
38                 K%  swing_value_added     0.345675
14             SwStr%  swing_value_added     0.343805
11          k_percent  swing_value_added     0.339716
53               CSW%  swing_value_added     0.337685
30               woba    avg

In [22]:
metrics_to_test = ['avg_value_total', 'avg_swing_value','woba', 'BB%', 'O-Swing%', 'xwoba']

for metric in metrics_to_test:
    player_year = dfs[['batter', 'game_date', metric]].sort_values(['batter', 'game_date'])

    # Shift metric by 1 year for each player
    player_year[f'{metric}_next'] = player_year.groupby('batter')[metric].shift(-1)
    player_year['year_next'] = player_year.groupby('batter')['game_date'].shift(-1)

    # Only keep rows where the next year is consecutive
    player_year = player_year[player_year['game_date'] + 1 == player_year['year_next']]

    # Drop rows with missing values
    player_year = player_year.dropna(subset=[metric, f'{metric}_next'])

    # Compute correlation across all player-year pairs
    corr = player_year[metric].corr(player_year[f'{metric}_next'])
    print(f"Correlation between current and next year {metric}: {corr:.3f}")

Correlation between current and next year avg_value_total: 0.688
Correlation between current and next year avg_swing_value: 0.665
Correlation between current and next year woba: 0.406
Correlation between current and next year BB%: 0.688
Correlation between current and next year O-Swing%: 0.823
Correlation between current and next year xwoba: 0.601


In [23]:
# Sort the data by batter and game_date
dfs_sorted = dfs.sort_values(['batter', 'game_date'])

In [24]:
# List of stats to process (exclude 'woba' and 'BB%')
stats_to_process = [col for col in dfs_sorted.columns if col not in ['woba', 'BB%', 'batter', 'game_date', 'Name', 'Season', 'Team', ''] and dfs_sorted[col].dtype != 'O']

results = []

for stat in stats_to_process:
    temp = dfs_sorted[['batter', 'game_date', stat, 'woba', 'BB%']].copy()
    temp[f'{stat}_next'] = temp.groupby('batter')[stat].shift(-1)
    temp['year_next'] = temp.groupby('batter')['game_date'].shift(-1)
    # Only keep rows where the next year is consecutive
    temp = temp[temp['game_date'] + 1 == temp['year_next']]
    temp = temp.dropna(subset=[stat, f'{stat}_next', 'woba', 'BB%'])
    if not temp.empty:
        corr_woba = temp[f'{stat}_next'].corr(temp['woba'])
        corr_bb = temp[f'{stat}_next'].corr(temp['BB%'])
        results.append({'stat': stat, 'corr_next_woba': corr_woba, 'corr_next_BB%': corr_bb})

correlations_df = pd.DataFrame(results)
print(correlations_df)

                  stat  corr_next_woba  corr_next_BB%
0         Unnamed: 0_x        0.013743       0.007494
1      avg_swing_value        0.183604       0.504462
2    swing_value_added       -0.032352      -0.234363
3      avg_value_total        0.262777       0.557841
4         Unnamed: 0_y        0.011647       0.005952
5                   PA        0.381039       0.124491
6                  Age        0.036546       0.083775
7                   K%       -0.089815       0.126428
8                BABIP        0.097669      -0.059857
9                 wRC+        0.413755       0.290970
10                 BsR       -0.049588      -0.102095
11                 Off        0.416095       0.284181
12                 Def       -0.244962      -0.190978
13                 WAR        0.334190       0.177968
14             Barrel%        0.347484       0.297378
15               maxEV        0.365043       0.135039
16            HardHit%        0.345639       0.202850
17            O-Swing%      