### Correlation and Value of Each Stat

In [269]:
import pandas as pd 
import numpy as np
import os
import matplotlib.pyplot as plt
os.chdir('C:/Users/dalto/OneDrive/Pictures/Documents/Projects/Swing Decisons')

In [270]:
swing = pd.read_csv('./data/final_datasets/swing_values.csv')
fg = pd.read_csv('./data/final_datasets/data_15_24.csv')

In [271]:
dfs = swing.merge(fg, how='left', left_on=['batter', 'game_date'], right_on=['Name', 'Season'])

In [272]:
# Select columns of interest for correlation
cols_of_interest = [
'avg_swing_value',
]

# Add all other numeric columns except identifiers and duplicates
exclude_cols = [
    'Unnamed: 0_x', 'batter', 'game_date', 'Unnamed: 0_y', 'Season', 'Name', 'Team'
]
numeric_cols = [col for col in dfs.select_dtypes(include=[np.number]).columns if col not in exclude_cols]

# Ensure all columns of interest are included
corr_cols = list(set(cols_of_interest + numeric_cols))

# Compute correlation matrix
corr_matrix = dfs[corr_cols].corr()

# Show correlations of the 5 key stats with all other stats
corr_matrix_stat = corr_matrix[['avg_value_total', 'avg_swing_value', 'avg_take_value', 'avg_sum_value']]

In [273]:
# Unstack, filter, and sort by absolute value, excluding self and cols_of_interest-to-cols_of_interest
corr_unstacked = corr_matrix_stat.stack().reset_index()
corr_unstacked.columns = ['stat', 'other_stat', 'correlation']

# Exclude relationships where both are in cols_of_interest (self relationships)
mask = ~corr_unstacked['other_stat'].isin(cols_of_interest)
filtered_corrs = corr_unstacked[mask]

# Sort by absolute correlation value and show top 20
top20 = filtered_corrs.reindex(filtered_corrs['correlation'].abs().sort_values(ascending=False).index).head(20)
print(top20)

                stat       other_stat  correlation
128  avg_value_total  avg_value_total     1.000000
79     avg_sum_value    avg_sum_value     1.000000
38    avg_take_value   avg_take_value     1.000000
39    avg_take_value    avg_sum_value    -0.996024
78     avg_sum_value   avg_take_value    -0.996024
135  avg_swing_value    avg_sum_value     0.994810
134  avg_swing_value   avg_take_value    -0.981790
126         O-Swing%   avg_take_value    -0.917056
127         O-Swing%    avg_sum_value     0.916840
7      swing_percent    avg_sum_value     0.900847
6      swing_percent   avg_take_value    -0.896680
34        bb_percent   avg_take_value     0.721463
66               BB%   avg_take_value     0.717950
35        bb_percent    avg_sum_value    -0.708409
67               BB%    avg_sum_value    -0.704224
36    avg_take_value  avg_value_total     0.645097
130  avg_value_total   avg_take_value     0.645097
87          Z-Swing%    avg_sum_value     0.590203
131  avg_value_total    avg_sum

In [274]:
metrics_to_test = ['avg_value_total', 'avg_swing_value', 'avg_take_value', 'woba', 'BB%', 'O-Swing%', 'xwoba']

for metric in metrics_to_test:
    player_year = dfs[['batter', 'game_date', metric]].sort_values(['batter', 'game_date'])

    # Shift metric by 1 year for each player
    player_year[f'{metric}_next'] = player_year.groupby('batter')[metric].shift(-1)
    player_year['year_next'] = player_year.groupby('batter')['game_date'].shift(-1)

    # Only keep rows where the next year is consecutive
    player_year = player_year[player_year['game_date'] + 1 == player_year['year_next']]

    # Drop rows with missing values
    player_year = player_year.dropna(subset=[metric, f'{metric}_next'])

    # Compute correlation across all player-year pairs
    corr = player_year[metric].corr(player_year[f'{metric}_next'])
    print(f"Correlation between current and next year {metric}: {corr:.3f}")

Correlation between current and next year avg_value_total: 0.447
Correlation between current and next year avg_swing_value: 0.808
Correlation between current and next year avg_take_value: 0.800
Correlation between current and next year woba: 0.364
Correlation between current and next year BB%: 0.607
Correlation between current and next year O-Swing%: 0.787
Correlation between current and next year xwoba: 0.565


In [275]:
# Sort the data by batter and game_date
dfs_sorted = dfs.sort_values(['batter', 'game_date'])

In [276]:
# List of stats to process (exclude 'woba' and 'BB%')
stats_to_process = [col for col in dfs_sorted.columns if col not in ['woba', 'BB%', 'batter', 'game_date', 'Name', 'Season', 'Team', ''] and dfs_sorted[col].dtype != 'O']

results = []

for stat in stats_to_process:
    temp = dfs_sorted[['batter', 'game_date', stat, 'woba', 'BB%']].copy()
    temp[f'{stat}_next'] = temp.groupby('batter')[stat].shift(-1)
    temp['year_next'] = temp.groupby('batter')['game_date'].shift(-1)
    # Only keep rows where the next year is consecutive
    temp = temp[temp['game_date'] + 1 == temp['year_next']]
    temp = temp.dropna(subset=[stat, f'{stat}_next', 'woba', 'BB%'])
    if not temp.empty:
        corr_woba = temp[f'{stat}_next'].corr(temp['woba'])
        corr_bb = temp[f'{stat}_next'].corr(temp['BB%'])
        results.append({'stat': stat, 'corr_next_woba': corr_woba, 'corr_next_BB%': corr_bb})

correlations_df = pd.DataFrame(results)
print(correlations_df)

                  stat  corr_next_woba  corr_next_BB%
0         Unnamed: 0_x        0.013750       0.009129
1      avg_value_total        0.178943       0.416633
2      avg_swing_value       -0.109101      -0.601978
3       avg_take_value        0.132883       0.611823
4        avg_sum_value       -0.122317      -0.609703
5         Unnamed: 0_y        0.012791       0.008808
6                   PA        0.431866       0.161298
7                  Age        0.077955       0.096905
8                   K%       -0.135808       0.098757
9                BABIP        0.107253      -0.025011
10                wRC+        0.368513       0.245582
11                 BsR       -0.037543      -0.089698
12                 Off        0.366628       0.270422
13                 Def       -0.197785      -0.176145
14                 WAR        0.349532       0.189336
15             Barrel%        0.301552       0.260909
16               maxEV        0.361568       0.148023
17            HardHit%      