In [28]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
import numpy as np
import os

# Make the project folder the working directory so that relative paths such as
# './data/full_data.csv' resolve from it.
# NOTE(review): this is an absolute, machine-specific path; the cell will fail
# on any machine where this folder does not exist.
os.chdir('C:/Users/dalto/OneDrive/Pictures/Documents/Projects/MLB Season Model/')

In [35]:
# Load the full dataset once. `og` stays untouched as the raw reference;
# all cleaning below is done on the independent copy `df`.
og = pd.read_csv('./data/full_data.csv')
df = og.copy(deep=True)

#### Prepare Data for NearestNeighbors

##### Remove rows with missing values (NAs) in key stats

In [36]:
# Variables need to be on the same scale, and a few unneeded columns are removed.
# The frame is rebuilt from the untouched `og` on every run so this cell can be
# re-executed safely (dropping already-dropped columns would raise a KeyError).

# Rows missing any of these stats are removed; this reduces the sample size
# but keeps most of the key variables populated.
required_stats = ['xslg', 'CSW%', 'sprint_speed']

# Columns that are not used in the comparison.
unused_columns = ['avg_swing_speed', 'fast_swing_rate', 'n_outs_above_average', 'Unnamed: 0', 'Team', 'WAR']

df = og.dropna(subset=required_stats).drop(columns=unused_columns)

I chose to drop WAR because I think it is more interesting to compare underlying seasons based on their metrics and then see whether the WAR totals are similar.

##### make all vars the same scale

In [45]:
def mean_year(group, weights=None):
    """Return the weighted average of ``group``.

    Parameters
    ----------
    group : pandas.Series
        Values to average. When ``weights`` is omitted, its index labels must
        be present in the notebook-level ``df``.
    weights : array-like, optional
        One weight per value in ``group``. Defaults to the ``PA`` value of the
        ``df`` row that shares each value's index label.

    Returns
    -------
    float
        The weighted average computed by ``numpy.average``.
    """
    if weights is None:
        # Single label-based lookup instead of one df.loc call per row.
        weights = df.loc[group.index, 'PA']
    return np.average(group, weights=np.asarray(weights))

In [46]:
# Weighted std to account for low-PA outlier performances
def calculate_weighted_std(group, weights=None):
    """Return the weighted standard deviation of ``group``.

    Parameters
    ----------
    group : pandas.Series
        Values to measure. When ``weights`` is omitted, its index labels must
        be present in the notebook-level ``df``.
    weights : array-like, optional
        One weight per value in ``group``. Defaults to the ``PA`` value of the
        ``df`` row that shares each value's index label.

    Returns
    -------
    float
        Square root of the weighted mean squared deviation from the weighted
        mean.
    """
    if weights is None:
        # Single label-based lookup instead of one df.loc call per row.
        weights = df.loc[group.index, 'PA']
    # Resolve the weights once and reuse them for both averages.
    weights = np.asarray(weights)
    avg = np.average(group, weights=weights)
    variance = np.average((group - avg) ** 2, weights=weights)
    return np.sqrt(variance)

mean for each year

In [48]:
# Per-season weighted mean of every numeric stat.
# Season and MLBAMID are numeric identifiers, not stats, so they are skipped.
stat_columns = [
    stat_col
    for stat_col in df.select_dtypes(include=[np.number]).columns
    if stat_col not in ('Season', 'MLBAMID')
]
by_season = df.groupby('Season')

# One row per season; each stat column is aligned onto it by season label.
mean_col_year = pd.DataFrame(index=df['Season'].unique())
for stat_col in stat_columns:
    mean_col_year[stat_col] = by_season[stat_col].apply(mean_year)

# Turn the season labels back into a regular 'Season' column.
mean_col_year = mean_col_year.reset_index().rename(columns={'index': 'Season'})

mean_col_year.head(50)

Unnamed: 0,Season,PA,Age,BB%,K%,BABIP,wRC+,BsR,Def,Barrel%,...,xslg,xwoba,sweet_spot_percent,barrel_batted_rate,solidcontact_percent,hard_hit_percent,avg_best_speed,avg_hyper_speed,whiff_percent,sprint_speed
0,2018,455.561032,28.009197,0.08654,0.217566,0.297643,99.860515,0.160143,-0.789713,0.065183,...,0.405872,0.317233,33.873222,7.016499,6.100605,36.267947,98.890977,93.613389,24.390246,27.143692
1,2019,445.549872,27.831088,0.086937,0.224328,0.298797,99.611534,0.119241,-0.689517,0.071249,...,0.430831,0.323181,33.937883,7.700035,6.073518,37.421817,99.262962,93.820204,25.17741,27.078588
2,2020,171.521178,27.956771,0.092272,0.234368,0.291457,100.121986,0.003499,-0.436579,0.078546,...,0.415583,0.32335,33.433592,7.875048,6.115385,37.956375,99.193593,93.883739,26.473254,26.884205
3,2021,433.925949,28.257728,0.08832,0.226425,0.292156,99.72133,0.101698,-0.66377,0.083096,...,0.414361,0.320779,33.771767,8.315777,6.256721,39.377162,99.804351,94.172763,25.527535,27.211925
4,2022,434.023133,28.144471,0.081681,0.224079,0.290418,100.371113,0.009599,-1.175267,0.077361,...,0.389179,0.309098,33.56904,7.743813,6.082443,38.595104,99.476725,93.97856,25.243507,27.234773
5,2023,446.656876,27.976513,0.085976,0.227085,0.296788,100.336974,0.050217,-1.326648,0.083172,...,0.413723,0.320478,33.962547,8.340763,6.388615,39.663911,99.868201,94.205784,25.516272,27.314667
6,2024,448.180898,27.945686,0.081883,0.225822,0.291259,100.173031,0.024099,-1.129341,0.080194,...,0.39723,0.312485,33.910599,8.043879,6.327415,39.183783,99.701585,94.116498,25.118228,27.328307


std for each stat for each year

In [44]:
# Per-season weighted standard deviation of every numeric stat.
# Season and MLBAMID are numeric identifiers, not stats, so they are skipped.
stat_columns = [
    stat_col
    for stat_col in df.select_dtypes(include=[np.number]).columns
    if stat_col not in ('Season', 'MLBAMID')
]
by_season = df.groupby('Season')

# One row per season; each stat column is aligned onto it by season label.
std_col_year = pd.DataFrame(index=df['Season'].unique())
for stat_col in stat_columns:
    std_col_year[stat_col] = by_season[stat_col].apply(calculate_weighted_std)

# Turn the season labels back into a regular 'Season' column.
std_col_year = std_col_year.reset_index().rename(columns={'index': 'Season'})

std_col_year.head(45)

Unnamed: 0,Season,PA,Age,BB%,K%,BABIP,wRC+,BsR,Def,Barrel%,...,xslg,xwoba,sweet_spot_percent,barrel_batted_rate,solidcontact_percent,hard_hit_percent,avg_best_speed,avg_hyper_speed,whiff_percent,sprint_speed
0,2018,178.486548,3.693209,0.032331,0.063749,0.039626,28.530616,2.496393,8.572044,0.037585,...,0.071428,0.03947,4.371621,3.866162,2.108343,8.037491,2.67925,1.442091,6.197057,1.458342
1,2019,180.441832,3.625089,0.032154,0.063824,0.041562,29.380006,2.357207,8.607961,0.039637,...,0.078329,0.040855,4.408501,4.120286,2.132308,7.824984,2.583196,1.405933,6.169526,1.442971
2,2020,63.447516,3.565613,0.039154,0.070692,0.057481,36.84841,1.019048,3.579695,0.045829,...,0.087015,0.047139,5.978621,4.606877,2.880078,9.167338,2.747903,1.552068,6.962644,1.40754
3,2021,179.574464,3.396171,0.031489,0.063589,0.042329,28.748632,2.179553,7.690169,0.044852,...,0.078669,0.041448,4.244636,4.483307,2.097866,8.141869,2.64792,1.509311,6.29106,1.331969
4,2022,178.925349,3.546322,0.029662,0.065065,0.043236,30.439727,2.314636,8.215906,0.041972,...,0.069434,0.038578,4.27184,4.2062,2.044547,7.98518,2.528494,1.433195,6.332752,1.411683
5,2023,184.22997,3.67684,0.031558,0.062988,0.04109,28.607948,2.742804,8.222064,0.042755,...,0.073334,0.039062,4.265017,4.291954,2.069326,8.036329,2.486824,1.441447,6.272828,1.371183
6,2024,180.758032,3.622363,0.028765,0.062252,0.038688,29.252801,2.687984,8.427642,0.041715,...,0.072931,0.039131,4.127133,4.187283,2.057802,8.133116,2.598681,1.510124,6.312365,1.339349


In [None]:
# Scale every numeric stat within its season so that the season's weighted mean
# maps to 100 and one weighted standard deviation equals 10 points.
numeric_cols = df.select_dtypes(include=[np.number]).columns

# Season-indexed lookup tables built from the per-season summaries.
season_means = mean_col_year.set_index('Season')
season_stds = std_col_year.set_index('Season')

# Float frame with the same rows and numeric columns as df, so the scores are
# numeric rather than generic Python objects.
z_scores = pd.DataFrame(index=df.index, columns=numeric_cols, dtype=float)

for col in numeric_cols:
    if col not in ['Season', 'MLBAMID']:  # identifiers, not stats
        # Broadcast each row's season mean/std in one vectorized lookup
        # instead of filtering the summary tables once per row.
        mean = df['Season'].map(season_means[col])
        std = df['Season'].map(season_stds[col])
        # scale so that 10 is 1 std away
        z_scores[col] = 100 + ((df[col] - mean) / std * 10)

# Add Season and MLBAMID columns back (unscaled)
z_scores['Season'] = df['Season']
z_scores['MLBAMID'] = df['MLBAMID']

### Make sure to change all dtypes to ints and then add back player names