### Packages / Data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
import numpy as np
import os

# NOTE(review): hard-coded absolute path. The relative './data/...' read below depends on
# this working directory, so the notebook only runs as-is where this folder exists.
os.chdir('C:/Users/dalto/OneDrive/Pictures/Documents/Projects/MLB Season Model/')

In [51]:
# Read the full data set once; `og` keeps the untouched original while `df` is the
# working copy that the cells below filter and reshape.
og = pd.read_csv('./data/full_data.csv')
df = og.copy() # import data

### Prepare Data for NearestNeighbors

##### remove all nas

In [None]:
# Variables need to be on the same scale later, so first keep only rows that have both
# xslg and CSW% (this reduces the sample size but keeps most key variables) and drop the
# columns that are not needed.
required_stats = ['xslg', 'CSW%']
unneeded_columns = [
    'avg_swing_speed', 'fast_swing_rate', 'n_outs_above_average',
    'Unnamed: 0', 'Team', 'sprint_speed',
]
df = df.dropna(subset=required_stats).drop(columns=unneeded_columns)

##### make all vars the same scale

In [65]:
def mean_year(group):
    """Return the PA-weighted average of one group of values.

    Parameters
    ----------
    group : pandas.Series
        Values to average. Its index labels must be row labels of the
        module-level ``df``; each value is weighted by that row's ``'PA'``.

    Returns
    -------
    float
        ``np.average(group, weights=<PA of the same rows>)``.
    """
    # One label-based lookup for the whole group instead of a Python-level
    # ``df.loc`` call per row.
    weights = df.loc[group.index, 'PA'].to_numpy()
    return np.average(group, weights=weights)

In [54]:
# Weighted std to account for low-PA outlier performances
def calculate_weighted_std(group):
    """Return the PA-weighted standard deviation of one group of values.

    Parameters
    ----------
    group : pandas.Series
        Values to measure. Its index labels must be row labels of the
        module-level ``df``; each value is weighted by that row's ``'PA'``.

    Returns
    -------
    float
        Square root of the PA-weighted mean squared deviation from the
        PA-weighted mean.
    """
    # Look the weights up once (vectorized) and reuse them for both averages.
    weights = df.loc[group.index, 'PA'].to_numpy()
    avg = np.average(group, weights=weights)
    variance = np.average((group - avg) ** 2, weights=weights)
    return np.sqrt(variance)

PA-weighted mean for each stat for each year

In [55]:
# PA-weighted mean of every numeric stat, one row per season.
id_columns = ('Season', 'MLBAMID')  # numeric, but averaging them is meaningless
stat_columns = [
    name for name in df.select_dtypes(include=[np.number]).columns
    if name not in id_columns
]

# Compute each stat's per-season mean and line the results up on the seasons
# in the order they first appear in df.
season_groups = df.groupby('Season')
mean_col_year = pd.DataFrame(
    {name: season_groups[name].apply(mean_year) for name in stat_columns},
    index=df['Season'].unique(),
)

# Move the season labels out of the index into a regular 'Season' column
mean_col_year = mean_col_year.reset_index().rename(columns={'index': 'Season'})

mean_col_year.head(50)

Unnamed: 0,Season,PA,Age,BB%,K%,BABIP,wRC+,BsR,Def,Barrel%,...,CSW%,xslg,xwoba,sweet_spot_percent,barrel_batted_rate,solidcontact_percent,hard_hit_percent,avg_best_speed,avg_hyper_speed,whiff_percent
0,2018,455.101763,28.009193,0.086546,0.217756,0.297532,99.785582,0.160019,-0.78888,0.065212,...,0.272326,0.405783,0.317175,33.876421,7.018887,6.101957,36.26347,98.888685,93.613011,24.400207
1,2019,445.032281,27.831565,0.086912,0.224542,0.298676,99.486783,0.11908,-0.688753,0.071195,...,0.272293,0.430595,0.323034,33.9329,7.692813,6.07366,37.408502,99.25734,93.818437,25.188746
2,2020,170.418286,27.9532,0.092144,0.235097,0.290996,99.564191,0.003119,-0.431407,0.078266,...,0.28129,0.41447,0.322695,33.39957,7.846925,6.101995,37.88119,99.167011,93.873928,26.506133
3,2021,433.127326,28.254705,0.088272,0.226812,0.29204,99.556031,0.101489,-0.662248,0.083013,...,0.27355,0.413997,0.320537,33.775282,8.307475,6.264621,39.356021,99.794302,94.168802,25.553001
4,2022,433.33284,28.143983,0.081643,0.224407,0.290344,100.229135,0.009448,-1.173692,0.077332,...,0.274713,0.388921,0.308935,33.57061,7.740924,6.08312,38.586205,99.470437,93.977007,25.26246
5,2023,446.299263,27.976995,0.085972,0.227197,0.296695,100.249243,0.050124,-1.325506,0.083115,...,0.274949,0.413562,0.320382,33.956465,8.335067,6.39202,39.657629,99.865657,94.204865,25.521095
6,2024,447.74636,27.945595,0.081879,0.225981,0.291165,100.099361,0.024097,-1.128041,0.080147,...,0.273866,0.397061,0.312389,33.90153,8.039214,6.32277,39.179577,99.698101,94.115569,25.128763


PA-weighted std for each stat for each year

In [56]:
# PA-weighted standard deviation of every numeric stat, one row per season.
skip_columns = ('Season', 'MLBAMID')  # numeric, but a spread is meaningless for them
numeric_columns = df.select_dtypes(include=[np.number]).columns

by_season = df.groupby('Season')
per_stat_std = {
    stat: by_season[stat].apply(calculate_weighted_std)
    for stat in numeric_columns
    if stat not in skip_columns
}

# Rows follow the order in which seasons first appear in df; the season label
# then becomes an ordinary 'Season' column.
std_col_year = pd.DataFrame(per_stat_std, index=df['Season'].unique())
std_col_year = std_col_year.reset_index().rename(columns={'index': 'Season'})

std_col_year.head(45)

Unnamed: 0,Season,PA,Age,BB%,K%,BABIP,wRC+,BsR,Def,Barrel%,...,CSW%,xslg,xwoba,sweet_spot_percent,barrel_batted_rate,solidcontact_percent,hard_hit_percent,avg_best_speed,avg_hyper_speed,whiff_percent
0,2018,178.957595,3.692978,0.03235,0.064153,0.040135,28.690452,2.495093,8.567616,0.037737,...,0.029461,0.07166,0.039587,4.419187,3.880451,2.12161,8.052921,2.682079,1.443637,6.212356
1,2019,180.946152,3.624993,0.032221,0.064323,0.042012,29.697654,2.355788,8.602814,0.039697,...,0.029999,0.078646,0.041135,4.449921,4.12534,2.150723,7.854153,2.595332,1.408938,6.190703
2,2020,64.524874,3.564725,0.039431,0.071719,0.058701,37.721799,1.015455,3.569764,0.046033,...,0.033194,0.088328,0.048167,6.156224,4.6271,2.946264,9.293454,2.78489,1.565701,7.003524
3,2021,180.320998,3.3964,0.031586,0.064507,0.042854,29.159946,2.177466,7.682891,0.044935,...,0.029764,0.079202,0.041954,4.301832,4.491633,2.150413,8.184205,2.66538,1.513315,6.334711
4,2022,179.582509,3.545664,0.029742,0.065648,0.043764,30.740495,2.312753,8.209291,0.042174,...,0.029909,0.06976,0.038845,4.304346,4.22638,2.074955,8.030648,2.541416,1.438603,6.361645
5,2023,184.571199,3.676334,0.031606,0.063117,0.041305,28.793758,2.74167,8.218763,0.042809,...,0.029531,0.073539,0.039247,4.289914,4.297364,2.093446,8.055078,2.490383,1.443281,6.278392
6,2024,181.18419,3.621831,0.028818,0.062619,0.03886,29.390187,2.686632,8.423522,0.041753,...,0.029245,0.0732,0.03932,4.153611,4.191118,2.066515,8.152075,2.604451,1.511959,6.330285


In [57]:
# Scaled z-scores for every stat and player-season, built as a new DataFrame
# (names are added back later). Each value is compared with the mean/std of its
# own season and rescaled so that the season mean maps to 100 and one standard
# deviation equals 10 points.
numeric_cols = df.select_dtypes(include=[np.number]).columns
stat_cols = [col for col in numeric_cols if col not in ('Season', 'MLBAMID')]  # ids are not scaled

# Broadcast each season's mean/std onto the rows of that season with a single
# reindex per table, instead of a boolean-mask lookup for every cell.
season_mean = mean_col_year.set_index('Season')[stat_cols].reindex(df['Season']).to_numpy()
season_std = std_col_year.set_index('Season')[stat_cols].reindex(df['Season']).to_numpy()

# Vectorized over all rows and stats; the result has float columns.
z_scores = 100 + ((df[stat_cols] - season_mean) / season_std * 10)

# Keep the original layout: every numeric column in df order, with the two id
# columns carried over unscaled.
z_scores = z_scores.reindex(columns=numeric_cols)
z_scores['Season'] = df['Season']
z_scores['MLBAMID'] = df['MLBAMID']



In [58]:
# Cast every stat column to a numeric dtype; Season and MLBAMID are left as they are.
id_columns = {'Season', 'MLBAMID'}
for stat in [name for name in z_scores.columns if name not in id_columns]:
    z_scores[stat] = pd.to_numeric(z_scores[stat])

In [59]:
# Work on a copy of the scaled stats and put the player name (aligned on the
# shared row index) in the first column.
normalized_data = z_scores.copy()
normalized_data.insert(loc=0, column='Name', value=df['Name'])
normalized_data.head(10)

Unnamed: 0,Name,Season,PA,Age,BB%,K%,BABIP,wRC+,BsR,Def,...,MLBAMID,xslg,xwoba,sweet_spot_percent,barrel_batted_rate,solidcontact_percent,hard_hit_percent,avg_best_speed,avg_hyper_speed,whiff_percent
3,a.j. ellis,2018,84.795182,124.345683,117.165657,97.572979,112.531141,101.938702,91.753808,91.906965,...,454560.0,93.750565,100.966302,101.411071,92.735671,107.06088,104.019075,99.662657,102.185913,85.35144
10,aaron altherr,2018,90.494856,97.267265,112.293673,115.828285,87.303781,91.009973,101.859408,101.257349,...,571437.0,96.681082,99.95586,92.359632,106.90928,120.258406,111.842324,112.741005,112.398062,113.521108
11,aaron altherr,2019,79.052758,100.464648,87.133515,123.980096,55.354994,58.868623,100.945791,100.974403,...,571437.0,79.834447,72.521223,96.555221,94.442122,96.867752,79.87243,85.505525,87.00948,115.848368
15,aaron hicks,2018,107.035088,99.975107,121.131332,95.837136,91.736905,110.020596,109.263603,97.191876,...,543305.0,108.2636,113.849441,100.732213,104.589962,105.175519,108.365325,103.832895,103.709874,97.907063
16,aaron hicks,2019,89.497855,103.223274,110.755718,108.987709,96.914894,101.018573,93.964565,97.287053,...,543305.0,93.566824,93.428022,91.836036,105.350314,83.848873,99.225248,103.98385,103.24806,110.194729
17,aaron hicks,2020,106.289313,105.741817,125.910538,92.330901,94.038285,106.479739,109.877765,87.504906,...,543305.0,102.324316,110.651462,94.152958,97.953524,99.993229,100.343047,102.050344,100.783121,101.990236
18,aaron hicks,2021,82.967745,108.082955,107.230673,101.749185,84.04925,92.31231,97.965947,101.032812,...,543305.0,100.884178,102.97066,105.868938,103.768173,100.164521,99.320617,98.479392,97.656848,105.915028
19,aaron hicks,2022,101.09516,110.875302,118.567501,102.469447,97.364705,96.344577,102.44354,100.24352,...,543305.0,92.413884,98.472117,80.553122,95.407597,84.659329,93.168415,94.521899,94.092933,100.84497
20,aaron hicks,2023,92.723715,113.66308,115.390852,99.042541,103.434506,103.006113,103.625593,94.680131,...,543305.0,85.101506,90.220396,82.385511,89.912264,86.185358,87.265636,87.131033,87.20061,100.125678
21,aaron hicks,2024,78.764904,116.71642,104.63568,122.213494,79.659582,72.740299,100.519832,99.583929,...,543305.0,82.915099,83.624177,96.385001,94.895841,111.987476,95.240995,99.77144,100.072101,112.118313


### Nearest Neighbors

In [60]:
# Feature matrix: every scaled stat, without the identifier columns.
identifier_columns = ['Name', 'Season', 'MLBAMID']
X = normalized_data.drop(columns=identifier_columns)

# Euclidean 10-nearest-neighbor search; n_jobs=-1 lets scikit-learn use all cores.
nn = NearestNeighbors(metric='euclidean', n_neighbors=10, n_jobs=-1)

In [61]:
# Fit the neighbor search on the scaled feature matrix so it can be queried below.
nn.fit(X)

In [62]:
# Row label (index value in normalized_data) of the player-season to look up.
QUERY_ROW_LABEL = 2113

# Keep the query as a one-row DataFrame with the same feature columns the model
# was fitted on. Converting it to a bare array would discard the column names
# and make scikit-learn warn about missing feature names at query time.
search = normalized_data.loc[[QUERY_ROW_LABEL]].drop(columns=['Season', 'Name', 'MLBAMID'])

In [63]:
# Look up the 10 nearest rows to the query; the results are read per query row below
# (distances[0] / indices[0] belong to the single row in `search`).
distances, indices = nn.kneighbors(search, n_neighbors=10)



In [64]:
# Report each neighbor of the query row. `indices` holds positional row numbers,
# hence the .iloc lookup.
for rank, (dist, row_pos) in enumerate(zip(distances[0], indices[0]), start=1):
    neighbor = normalized_data.iloc[row_pos]
    print(f"\nNeighbor {rank} (Distance: {dist:.2f})")
    print(f"Name: {neighbor['Name']}")
    print(f"Season: {neighbor['Season']}")


Neighbor 1 (Distance: 0.00)
Name: fernando tatis jr.
Season: 2024

Neighbor 2 (Distance: 20.29)
Name: austin riley
Season: 2024

Neighbor 3 (Distance: 23.40)
Name: austin riley
Season: 2022

Neighbor 4 (Distance: 26.33)
Name: austin riley
Season: 2023

Neighbor 5 (Distance: 26.56)
Name: eloy jiménez
Season: 2022

Neighbor 6 (Distance: 27.27)
Name: ronald acuña jr.
Season: 2018

Neighbor 7 (Distance: 27.56)
Name: rafael devers
Season: 2021

Neighbor 8 (Distance: 27.65)
Name: rafael devers
Season: 2022

Neighbor 9 (Distance: 29.47)
Name: eloy jiménez
Season: 2020

Neighbor 10 (Distance: 29.96)
Name: j.d. davis
Season: 2019


On a basic level, this works pretty well for estimating offense. However, for this to be a useful tool for estimating player performance as a whole, it would be important to weight BsR and Def, as well as PA, more heavily than the other features. Figuring out the correct weighting, however, is not a straightforward problem.

##### apply this to every player

In [73]:
# Empty table with one column per neighbor rank (n1 ... n10); rows are added per player below.
kNear = pd.DataFrame(columns=[f'n{rank}' for rank in range(1, 11)])

In [74]:
# Nearest neighbors for every player-season in normalized_data, each stored as
# a "name (season)" label.
features = normalized_data.drop(columns=['Season', 'Name', 'MLBAMID'])

# Query all rows in one batched call rather than one kneighbors call per row.
# The rows are passed explicitly, so (exactly as in a per-row query) a row's
# own entry is eligible and normally comes back as its closest match.
neighbor_distances, neighbor_positions = nn.kneighbors(features)

# "name (season)" label for every row, addressable by positional row number
labels = (
    normalized_data['Name'].astype(str)
    + ' ('
    + normalized_data['Season'].astype(str)
    + ')'
).to_numpy()

# Build the whole table at once instead of growing it one row at a time:
# one row per player-season (same index as normalized_data), one column per
# neighbor rank.
kNear = pd.DataFrame(
    labels[neighbor_positions],
    index=normalized_data.index,
    columns=[f'n{rank}' for rank in range(1, neighbor_positions.shape[1] + 1)],
)

    

In [75]:
# Preview the neighbor table for the first ten rows.
kNear.head(10)

Unnamed: 0,n1,n2,n3,n4,n5,n6,n7,n8,n9,n10
3,a.j. ellis (2018),robbie grossman (2023),ben zobrist (2018),anthony rendon (2023),austin nola (2022),nick martini (2018),brett gardner (2020),mark canha (2023),mike tauchman (2023),abraham almonte (2021)
10,aaron altherr (2018),sam hilliard (2022),miguel sanó (2018),bobby dalbec (2022),willson contreras (2021),teoscar hernández (2019),matt mervis (2023),byron buxton (2023),evan white (2020),tyler austin (2019)
11,aaron altherr (2019),peter bourjos (2019),jason vosler (2023),kaleb cowart (2018),trayce thompson (2018),blake swihart (2019),isaac galloway (2019),chris davis (2020),ryan cordell (2018),jared walsh (2023)
15,aaron hicks (2018),brandon nimmo (2024),max muncy (2019),aaron hicks (2020),jorge polanco (2022),michael conforto (2019),mark canha (2019),ian happ (2023),gregory polanco (2018),michael conforto (2021)
16,aaron hicks (2019),tom murphy (2021),clint frazier (2021),ian happ (2021),mitch haniger (2019),justin upton (2019),jorge polanco (2024),wil myers (2021),brian anderson (2023),dj stewart (2021)
17,aaron hicks (2020),andrew mccutchen (2021),ian happ (2023),aaron hicks (2018),andrew mccutchen (2023),lamonte wade jr. (2023),michael conforto (2021),max muncy (2020),brandon nimmo (2024),michael conforto (2023)
18,aaron hicks (2021),jake lamb (2021),travis shaw (2021),billy mckinney (2023),mitch haniger (2019),jesús aguilar (2019),luke maile (2023),tyler nevin (2024),jorge polanco (2024),dominic smith (2024)
19,aaron hicks (2022),connor joe (2024),aaron hicks (2023),hunter dozier (2020),russell martin (2019),césar hernández (2018),jon berti (2021),brian dozier (2019),brian goodwin (2021),jj bleday (2023)
20,aaron hicks (2023),robbie grossman (2024),connor joe (2024),aaron hicks (2022),rafael ortega (2023),mark canha (2024),p.j. higgins (2022),jason kipnis (2020),jace peterson (2023),alex call (2022)
21,aaron hicks (2024),steve pearce (2019),dustin garneau (2020),sean bouchard (2024),logan morrison (2020),chris young (2018),daniel descalso (2019),jason castro (2022),jackie bradley jr. (2021),ryan noda (2024)
