### Correlation and Value of Each Stat

In [34]:
import pandas as pd 
import numpy as np
import os
import matplotlib.pyplot as plt
os.chdir('C:/Users/dalto/OneDrive/Pictures/Documents/Projects/Swing Decisons')

In [35]:
swing = pd.read_csv('./data/final_datasets/swing_values.csv')
fg = pd.read_csv('./data/final_datasets/data_15_24.csv')

In [36]:
dfs = swing.merge(fg, how='left', left_on=['batter', 'game_date'], right_on=['Name', 'Season'])

In [37]:
# Select columns of interest for correlation
cols_of_interest = [
'avg_swing_value',
]

# Add all other numeric columns except identifiers and duplicates
exclude_cols = [
    'Unnamed: 0_x', 'batter', 'game_date', 'Unnamed: 0_y', 'Season', 'Name', 'Team'
]
numeric_cols = [col for col in dfs.select_dtypes(include=[np.number]).columns if col not in exclude_cols]

# Ensure all columns of interest are included
corr_cols = list(set(cols_of_interest + numeric_cols))

# Compute correlation matrix
corr_matrix = dfs[corr_cols].corr()

# Show correlations of the 5 key stats with all other stats
corr_matrix_stat = corr_matrix[['avg_value_total', 'avg_swing_value', 'avg_take_value', 'avg_sum_value', 'O-Swing%']]

In [38]:
# Unstack, filter, and sort by absolute value, excluding self and cols_of_interest-to-cols_of_interest
corr_unstacked = corr_matrix_stat.stack().reset_index()
corr_unstacked.columns = ['stat', 'other_stat', 'correlation']

# Exclude relationships where both are in cols_of_interest (self relationships)
mask = ~corr_unstacked['other_stat'].isin(cols_of_interest)
filtered_corrs = corr_unstacked[mask]

# Sort by absolute correlation value and show top 20
top20 = filtered_corrs.reindex(filtered_corrs['correlation'].abs().sort_values(ascending=False).index).head(20)
print(top20)

                stat       other_stat  correlation
88     avg_sum_value    avg_sum_value     1.000000
17    avg_take_value   avg_take_value     1.000000
134         O-Swing%         O-Swing%     1.000000
140  avg_value_total  avg_value_total     1.000000
87     avg_sum_value   avg_take_value    -0.996870
18    avg_take_value    avg_sum_value    -0.996870
53   avg_swing_value    avg_sum_value     0.992736
133         O-Swing%    avg_sum_value     0.983780
89     avg_sum_value         O-Swing%     0.983780
54   avg_swing_value         O-Swing%     0.982707
52   avg_swing_value   avg_take_value    -0.980119
19    avg_take_value         O-Swing%    -0.976710
132         O-Swing%   avg_take_value    -0.976710
15    avg_take_value  avg_value_total     0.939008
142  avg_value_total   avg_take_value     0.939008
143  avg_value_total    avg_sum_value    -0.908883
85     avg_sum_value  avg_value_total    -0.908883
9      swing_percent         O-Swing%     0.900668
8      swing_percent    avg_sum

In [39]:
metrics_to_test = ['avg_value_total', 'avg_swing_value', 'avg_take_value', 'avg_sum_value', 'woba', 'BB%', 'O-Swing%', 'xwoba']

for metric in metrics_to_test:
    player_year = dfs[['batter', 'game_date', metric]].sort_values(['batter', 'game_date'])

    # Shift metric by 1 year for each player
    player_year[f'{metric}_next'] = player_year.groupby('batter')[metric].shift(-1)
    player_year['year_next'] = player_year.groupby('batter')['game_date'].shift(-1)

    # Only keep rows where the next year is consecutive
    player_year = player_year[player_year['game_date'] + 1 == player_year['year_next']]

    # Drop rows with missing values
    player_year = player_year.dropna(subset=[metric, f'{metric}_next'])

    # Compute correlation across all player-year pairs
    corr = player_year[metric].corr(player_year[f'{metric}_next'])
    print(f"Correlation between current and next year {metric}: {corr:.3f}")

Correlation between current and next year avg_value_total: 0.749
Correlation between current and next year avg_swing_value: 0.810
Correlation between current and next year avg_take_value: 0.808
Correlation between current and next year avg_sum_value: 0.812
Correlation between current and next year woba: 0.468
Correlation between current and next year BB%: 0.710
Correlation between current and next year O-Swing%: 0.832
Correlation between current and next year xwoba: 0.645


In [40]:
# Sort the data by batter and game_date
dfs_sorted = dfs.sort_values(['batter', 'game_date'])

In [41]:
# List of stats to process (exclude 'woba' and 'BB%')
stats_to_process = [col for col in dfs_sorted.columns if col not in ['woba', 'BB%', 'batter', 'game_date', 'Name', 'Season', 'Team', ''] and dfs_sorted[col].dtype != 'O']

results = []

for stat in stats_to_process:
    temp = dfs_sorted[['batter', 'game_date', stat, 'woba', 'BB%']].copy()
    temp[f'{stat}_next'] = temp.groupby('batter')[stat].shift(-1)
    temp['year_next'] = temp.groupby('batter')['game_date'].shift(-1)
    # Only keep rows where the next year is consecutive
    temp = temp[temp['game_date'] + 1 == temp['year_next']]
    temp = temp.dropna(subset=[stat, f'{stat}_next', 'woba', 'BB%'])
    if not temp.empty:
        corr_woba = temp[f'{stat}_next'].corr(temp['woba'])
        corr_bb = temp[f'{stat}_next'].corr(temp['BB%'])
        results.append({'stat': stat, 'corr_next_woba': corr_woba, 'corr_next_BB%': corr_bb})

correlations_df = pd.DataFrame(results)
print(correlations_df)

                  stat  corr_next_woba  corr_next_BB%
0         Unnamed: 0_x        0.098359       0.018517
1      avg_value_total        0.169792       0.630323
2      avg_swing_value       -0.083693      -0.582790
3       avg_take_value        0.119329       0.623791
4        avg_sum_value       -0.105643      -0.610543
5         Unnamed: 0_y        0.096846       0.016541
6                   PA        0.217989       0.064154
7                  Age        0.044055       0.142815
8                   K%       -0.016982       0.119996
9                BABIP        0.131183      -0.115252
10                wRC+        0.491551       0.339140
11                 BsR       -0.115156      -0.142918
12                 Off        0.464119       0.319922
13                 Def       -0.245285      -0.196178
14                 WAR        0.314621       0.192130
15             Barrel%        0.407536       0.338779
16               maxEV        0.353335       0.200663
17            HardHit%      