### Packages / Data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
import numpy as np
from numpy.linalg import pinv # for mah distance 
import os

# Project root. Set the MLB_MODEL_DIR environment variable to run this notebook on another
# machine; without it the original author's local path is used, so existing runs are unchanged.
os.chdir(os.environ.get('MLB_MODEL_DIR', 'C:/Users/dalto/OneDrive/Pictures/Documents/Projects/MLB Season Model/'))

In [None]:
# Read the raw sample once; `og` is kept as the unmodified original frame.
og = pd.read_csv('./data/sample/full_data.csv')
df = og.copy() # working copy used by the rest of the notebook

### Prepare Data for NearestNeighbors

##### remove all nas

In [3]:
# Variables must end up on the same scale, and some unneeded columns are removed first.
# `df` is rebuilt from the untouched raw frame `og` so this cell can be re-run safely: the
# previous version chained on `df` itself and raised KeyError on a second run because the
# dropped columns were already gone.
df = (
    og
    # rows missing xslg or CSW% lack whole families of stats; dropping them reduces the
    # sample size but keeps most key variables complete
    .dropna(subset=['xslg', 'CSW%'])
    # columns the analysis does not use
    .drop(columns=['avg_swing_speed', 'fast_swing_rate', 'n_outs_above_average', 'Unnamed: 0', 'Team', 'sprint_speed','Def', 'BsR'])
)
# keep only seasons strictly before 2024
df = df[df['Season'] < 2024]

##### make all vars the same scale

In [4]:
def mean_year(group, weights=None):
    """Return the weighted average of ``group``.

    Parameters
    ----------
    group : pandas.Series
        Values of a single stat (one season's slice when called through
        ``groupby('Season')[col].apply``).
    weights : array-like, optional
        One weight per value in ``group``. When omitted, the ``PA`` column of the
        notebook-level ``df`` is looked up using ``group``'s index labels.

    Returns
    -------
    float
        ``sum(group * weights) / sum(weights)``.
    """
    if weights is None:
        # One label-based lookup for the whole group instead of a df.loc call per row.
        weights = df.loc[group.index, 'PA'].to_numpy()
    return np.average(group, weights=weights)

In [5]:
# weighted std to account for low-PA outlier performances
def calculate_weighted_std(group, weights=None):
    """Return the weighted (population) standard deviation of ``group``.

    Parameters
    ----------
    group : pandas.Series
        Values of a single stat (one season's slice when called through
        ``groupby('Season')[col].apply``).
    weights : array-like, optional
        One weight per value in ``group``. When omitted, the ``PA`` column of the
        notebook-level ``df`` is looked up using ``group``'s index labels.

    Returns
    -------
    float
        Square root of the weighted mean squared deviation from the weighted mean
        (no degrees-of-freedom correction is applied).
    """
    if weights is None:
        # Look the weights up once; they are reused for both the mean and the variance.
        weights = df.loc[group.index, 'PA'].to_numpy()
    avg = np.average(group, weights=weights)
    variance = np.average((group - avg) ** 2, weights=weights)
    return np.sqrt(variance)

mean for each year

In [6]:
# PA-weighted mean of every numeric stat, one row per season.
# Season and MLBAMID are numeric but are identifiers, so they are not averaged.
stat_names = [
    name
    for name in df.select_dtypes(include=[np.number]).columns
    if name not in ('Season', 'MLBAMID')
]

season_groups = df.groupby('Season')
mean_col_year = pd.DataFrame(index=df['Season'].unique())
for name in stat_names:
    mean_col_year[name] = season_groups[name].apply(mean_year)

# Turn the season index back into a regular 'Season' column.
mean_col_year = mean_col_year.reset_index().rename(columns={'index': 'Season'})

mean_col_year.head(50)

Unnamed: 0,Season,PA,Age,BB%,K%,BABIP,wRC+,WAR,Barrel%,maxEV,...,CSW%,xslg,xwoba,sweet_spot_percent,barrel_batted_rate,solidcontact_percent,hard_hit_percent,avg_best_speed,avg_hyper_speed,whiff_percent
0,2018,455.101763,28.009193,0.086546,0.217756,0.297532,99.785582,1.78918,0.065212,110.256543,...,0.272326,0.405783,0.317175,33.876421,7.018887,6.101957,36.26347,98.888685,93.613011,24.400207
1,2019,445.032281,27.831565,0.086912,0.224542,0.298676,99.486783,1.759262,0.071195,110.408084,...,0.272293,0.430595,0.323034,33.9329,7.692813,6.07366,37.408502,99.25734,93.818437,25.188746
2,2020,170.418286,27.9532,0.092144,0.235097,0.290996,99.564191,0.690637,0.078266,109.314055,...,0.28129,0.41447,0.322695,33.39957,7.846925,6.101995,37.88119,99.167011,93.873928,26.506133
3,2021,433.127326,28.254705,0.088272,0.226812,0.29204,99.556031,1.760448,0.083013,111.027816,...,0.27355,0.413997,0.320537,33.775282,8.307475,6.264621,39.356021,99.794302,94.168802,25.553001
4,2022,433.33284,28.143983,0.081643,0.224407,0.290344,100.229135,1.770046,0.077332,110.582671,...,0.274713,0.388921,0.308935,33.57061,7.740924,6.08312,38.586205,99.470437,93.977007,25.26246
5,2023,446.299263,27.976995,0.085972,0.227197,0.296695,100.249243,1.766768,0.083115,110.870425,...,0.274949,0.413562,0.320382,33.956465,8.335067,6.39202,39.657629,99.865657,94.204865,25.521095


std for each stat for each year

In [7]:
# PA-weighted standard deviation of every numeric stat, one row per season.
# Season and MLBAMID are numeric but are identifiers, so they are skipped.
numeric_names = df.select_dtypes(include=[np.number]).columns
per_season = df.groupby('Season')

std_col_year = pd.DataFrame(index=df['Season'].unique())
for name in numeric_names:
    if name in ('Season', 'MLBAMID'):
        continue
    std_col_year[name] = per_season[name].apply(calculate_weighted_std)

# Turn the season index back into a regular 'Season' column.
std_col_year = std_col_year.reset_index().rename(columns={'index': 'Season'})

std_col_year.head(45)

Unnamed: 0,Season,PA,Age,BB%,K%,BABIP,wRC+,WAR,Barrel%,maxEV,...,CSW%,xslg,xwoba,sweet_spot_percent,barrel_batted_rate,solidcontact_percent,hard_hit_percent,avg_best_speed,avg_hyper_speed,whiff_percent
0,2018,178.957595,3.692978,0.03235,0.064153,0.040135,28.690452,1.993839,0.037737,3.511066,...,0.029461,0.07166,0.039587,4.419187,3.880451,2.12161,8.052921,2.682079,1.443637,6.212356
1,2019,180.946152,3.624993,0.032221,0.064323,0.042012,29.697654,1.976588,0.039697,3.483595,...,0.029999,0.078646,0.041135,4.449921,4.12534,2.150723,7.854153,2.595332,1.408938,6.190703
2,2020,64.524874,3.564725,0.039431,0.071719,0.058701,37.721799,0.869848,0.046033,3.439657,...,0.033194,0.088328,0.048167,6.156224,4.6271,2.946264,9.293454,2.78489,1.565701,7.003524
3,2021,180.320998,3.3964,0.031586,0.064507,0.042854,29.159946,1.858615,0.044935,3.344791,...,0.029764,0.079202,0.041954,4.301832,4.491633,2.150413,8.184205,2.66538,1.513315,6.334711
4,2022,179.582509,3.545664,0.029742,0.065648,0.043764,30.740495,2.019168,0.042174,3.149186,...,0.029909,0.06976,0.038845,4.304346,4.22638,2.074955,8.030648,2.541416,1.438603,6.361645
5,2023,184.571199,3.676334,0.031606,0.063117,0.041305,28.793758,1.936657,0.042809,3.163585,...,0.029531,0.073539,0.039247,4.289914,4.297364,2.093446,8.055078,2.490383,1.443281,6.278392


In [8]:
# Season-relative scores for each stat and player. This starts a new DataFrame flow;
# names are added back later (rows keep df's index, MLBAMID is carried along).
#
# Vectorised: the previous version looped over every (row, column) cell in Python and ran
# two boolean-mask lookups per cell, and it produced object-dtype columns.
numeric_cols = df.select_dtypes(include=[np.number]).columns
stat_cols = [col for col in numeric_cols if col not in ('Season', 'MLBAMID')]  # identifiers are not scaled

# Per-row season mean / std, aligned to df's row order (KeyError if a season is missing).
row_seasons = df['Season'].to_numpy()
row_means = mean_col_year.set_index('Season').loc[row_seasons, stat_cols].to_numpy()
row_stds = std_col_year.set_index('Season').loc[row_seasons, stat_cols].to_numpy()

# Scale so that 100 is the season average and 10 points is one standard deviation.
scaled = 100 + ((df[stat_cols] - row_means) / row_stds * 10)

# Keep the original column order (Season / MLBAMID stay in their numeric-column positions).
z_scores = scaled.reindex(columns=numeric_cols)
z_scores['Season'] = df['Season']
z_scores['MLBAMID'] = df['MLBAMID']


In [9]:
# Coerce every scaled stat column to a numeric dtype; the identifier columns are left as they are.
for stat in (name for name in z_scores.columns if name not in ('Season', 'MLBAMID')):
    z_scores[stat] = pd.to_numeric(z_scores[stat])

In [10]:
normalized_data = z_scores.copy()
normalized_data.insert(0, 'Name', df['Name'])
# The 2024 season must be removed for the loss function to work correctly.
# Fixed: the filtered frame used to be bound to the misspelled name `normalized_datam`,
# so the filter was never applied to `normalized_data`.
normalized_data = normalized_data[normalized_data['Season'] < 2024]
normalized_data.head(10)

Unnamed: 0,Name,Season,PA,Age,BB%,K%,BABIP,wRC+,WAR,Barrel%,...,MLBAMID,xslg,xwoba,sweet_spot_percent,barrel_batted_rate,solidcontact_percent,hard_hit_percent,avg_best_speed,avg_hyper_speed,whiff_percent
3,a.j. ellis,2018,84.795182,124.345683,117.165657,97.572979,112.531141,101.938702,89.569899,93.853313,...,454560.0,93.750565,100.966302,101.411071,92.735671,107.06088,104.019075,99.662657,102.185913,85.35144
10,aaron altherr,2018,90.494856,97.267265,112.293673,115.828285,87.303781,91.009973,91.331289,108.530177,...,571437.0,96.681082,99.95586,92.359632,106.90928,120.258406,111.842324,112.741005,112.398062,113.521108
11,aaron altherr,2019,79.052758,100.464648,87.133515,123.980096,55.354994,58.868623,87.119836,95.682064,...,571437.0,79.834447,72.521223,96.555221,94.442122,96.867752,79.87243,85.505525,87.00948,115.848368
15,aaron hicks,2018,107.035088,99.975107,121.131332,95.837136,91.736905,110.020596,111.115094,105.211941,...,543305.0,108.2636,113.849441,100.732213,104.589962,105.175519,108.365325,103.832895,103.709874,97.907063
16,aaron hicks,2019,89.497855,103.223274,110.755718,108.987709,96.914894,101.018573,93.913806,101.952828,...,543305.0,93.566824,93.428022,91.836036,105.350314,83.848873,99.225248,103.98385,103.24806,110.194729
17,aaron hicks,2020,106.289313,105.741817,125.910538,92.330901,94.038285,106.479739,101.174882,97.922304,...,543305.0,102.324316,110.651462,94.152958,97.953524,99.993229,100.343047,102.050344,100.783121,101.990236
18,aaron hicks,2021,82.967745,108.082955,107.230673,101.749185,84.04925,92.31231,90.842943,103.780407,...,543305.0,100.884178,102.97066,105.868938,103.768173,100.164521,99.320617,98.479392,97.656848,105.915028
19,aaron hicks,2022,101.09516,110.875302,118.567501,102.469447,97.364705,96.344577,95.54461,95.310376,...,543305.0,92.413884,98.472117,80.553122,95.407597,84.659329,93.168415,94.521899,94.092933,100.84497
20,aaron hicks,2023,92.723715,113.66308,115.390852,99.042541,103.434506,103.006113,95.71116,89.882062,...,543305.0,85.101506,90.220396,82.385511,89.912264,86.185358,87.265636,87.131033,87.20061,100.125678
27,aaron judge,2018,102.397117,94.559423,120.42193,113.633928,117.607659,117.426151,116.572235,123.563973,...,592450.0,116.915602,119.154263,104.126503,123.65991,106.58954,122.149143,129.416941,132.449298,118.994072


### Nearest Neighbors

In [11]:
# Feature matrix for the neighbor search: every column except the name and the two identifier columns.
X = normalized_data.drop(columns=['Name', 'Season', 'MLBAMID'])

Inputs needed for the Mahalanobis distance. I was having problems with Euclidean distance, and Mahalanobis distance apparently handles covariance between features much better; various metrics are still being tested.

In [12]:
# Covariance between feature columns (rowvar=False: each column of X is a variable).
cov_matrix = np.cov(X, rowvar=False)
# Pseudo-inverse of the covariance matrix; passed as 'VI' to the mahalanobis metric below.
inv_cov_matrix = pinv(cov_matrix)

In [13]:
# Mahalanobis nearest-neighbor model. n_neighbors=5 is only the default k: any later
# kneighbors() call that needs more neighbors has to pass n_neighbors explicitly.
nn = NearestNeighbors(n_neighbors=5, metric='mahalanobis', n_jobs=-1, metric_params={'VI': inv_cov_matrix})

In [14]:
# Fit on the feature matrix; neighbor indices returned later are row positions in X.
nn.fit(X)

In [15]:
# Feature vector of the player-season stored under index label 32, as a 2-D array (one row).
is_query_row = normalized_data.index == 32
search = (
    normalized_data.loc[is_query_row]
    .drop(columns=['Season', 'Name', 'MLBAMID'])
    .values
)

In [16]:
# Ten nearest neighbors of the query row (overrides the model's default of 5).
distances, indices = nn.kneighbors(search, n_neighbors=10)



In [17]:
# Report each neighbor of the query row; `indices` holds row positions, hence .iloc.
for rank, (dist, pos) in enumerate(zip(distances[0], indices[0]), start=1):
    neighbor = normalized_data.iloc[pos]
    print(f"\nNeighbor {rank} (Distance: {dist:.2f})")
    print(f"Name: {neighbor['Name']}")
    print(f"Season: {neighbor['Season']}")


Neighbor 1 (Distance: 0.00)
Name: aaron judge
Season: 2023

Neighbor 2 (Distance: 4.90)
Name: yordan alvarez
Season: 2022

Neighbor 3 (Distance: 4.96)
Name: shohei ohtani
Season: 2021

Neighbor 4 (Distance: 5.26)
Name: shohei ohtani
Season: 2023

Neighbor 5 (Distance: 5.50)
Name: bryce harper
Season: 2020

Neighbor 6 (Distance: 5.60)
Name: aaron judge
Season: 2018

Neighbor 7 (Distance: 5.60)
Name: josh donaldson
Season: 2021

Neighbor 8 (Distance: 5.67)
Name: j.d. martinez
Season: 2018

Neighbor 9 (Distance: 5.73)
Name: aaron judge
Season: 2019

Neighbor 10 (Distance: 5.78)
Name: joey votto
Season: 2021


On a basic level, to estimate offense, this works pretty well. However, for this to be a useful tool for estimating player performance as a whole, it would be important to weight BsR and Def, as well as PA, more heavily than the other features. Figuring out the correct weighting, however, is not a straightforward problem.

##### apply this to every player

In [22]:
# Empty results table: for each of the 10 neighbor slots a name column (n<k>) followed by a distance column (d<k>).
kNear = pd.DataFrame(columns=[f'{prefix}{slot}' for slot in range(1, 11) for prefix in ('n', 'd')])

weight testing to improve model

In [23]:
# Recreate the table from z_scores (and, on a first run, refit) so that row positions line up after the filter.
normalized_data = z_scores.copy()
normalized_data.insert(loc=0, column='Name', value=df['Name'])
before_2024 = normalized_data['Season'] < 2024
normalized_data = normalized_data[before_2024]

find nn for every index (player) in dataset

In [24]:
# Nearest neighbors for every player-season in normalized_data.
#
# Changes from the previous version:
#  * n_neighbors is passed explicitly. `nn` was built with n_neighbors=5, so the default
#    query returned only 5 neighbors although the table has 10 name/distance slots.
#  * All rows are queried in one kneighbors() call and the table is built once from a list
#    of records, instead of one query per row plus row-by-row DataFrame growth.
#  * The try/except that printed the `Exception` class (not the actual error) is gone;
#    a bad neighbor position now raises instead of being hidden.
N_NEIGHBORS = 10

features = normalized_data.drop(columns=['Season', 'Name', 'MLBAMID'])
all_distances, all_indices = nn.kneighbors(features, n_neighbors=N_NEIGHBORS)

# Neighbor indices are row positions, so read names/seasons positionally.
names = normalized_data['Name'].to_numpy()
seasons = normalized_data['Season'].to_numpy()

records = []
for dist_row, pos_row in zip(all_distances, all_indices):
    record = {}
    for slot, (pos, dist) in enumerate(zip(pos_row, dist_row), start=1):
        record[f'n{slot}'] = f"{names[pos]},{seasons[pos]}"  # "name,season" label
        record[f'd{slot}'] = f"{dist:.2f}"
    records.append(record)

# One row per player-season, keyed by the same index labels as normalized_data.
kNear = pd.DataFrame(
    records,
    index=normalized_data.index,
    columns=[f'{prefix}{slot}' for slot in range(1, N_NEIGHBORS + 1) for prefix in ('n', 'd')],
)

KeyboardInterrupt: 

In [None]:
# Distances are stored as formatted strings; convert every d<k> column to whole numbers
# (string -> float -> int, which truncates the decimals).
distance_columns = [name for name in kNear.columns if name.startswith('d')]
for name in distance_columns:
    kNear[name] = kNear[name].astype(float).astype(int)

kNear.head(10)

Unnamed: 0,n1,d1,n2,d2,n3,d3,n4,d4,n5,d5,n6,d6,n7,d7,n8,d8,n9,d9,n10,d10
3,"albert pujols,2022",258.1,"albert pujols,2021",265.08,"adrian beltré,2018",267.83,"miguel cabrera,2023",268.1,"justin turner,2023",268.36,"a.j. ellis,2018",268.41,"albert pujols,2020",269.12,"albert pujols,2019",269.72,"ben zobrist,2018",269.95,"chase utley,2018",270.08
10,"albert pujols,2022",189.33,"miguel cabrera,2020",194.8,"josé bautista,2018",194.94,"matt holliday,2018",195.19,"joey votto,2021",195.43,"shin-soo choo,2019",196.97,"joey votto,2022",197.44,"albert pujols,2021",198.01,"shin-soo choo,2020",198.56,"miguel cabrera,2021",198.85
11,"robinson chirinos,2022",194.18,"hunter pence,2020",195.02,"joey votto,2023",195.12,"edwin encarnación,2020",196.07,"erik kratz,2019",196.79,"stephen vogt,2022",196.92,"albert pujols,2021",197.23,"miguel cabrera,2023",198.07,"jeff mathis,2020",198.33,"curtis granderson,2019",198.46
15,"albert pujols,2022",201.44,"joey votto,2021",209.74,"carlos santana,2022",214.11,"steve pearce,2018",214.29,"edwin encarnación,2019",214.4,"justin turner,2020",214.74,"miguel cabrera,2020",214.89,"albert pujols,2021",215.44,"joey votto,2020",215.6,"jed lowrie,2021",215.63
16,"albert pujols,2022",196.62,"albert pujols,2021",201.12,"joey votto,2022",201.65,"shin-soo choo,2020",204.72,"joey votto,2023",204.9,"miguel cabrera,2023",204.91,"miguel cabrera,2020",205.47,"adrian beltré,2018",205.58,"matt holliday,2018",206.02,"edwin encarnación,2019",207.16
17,"albert pujols,2022",214.99,"albert pujols,2021",224.8,"joey votto,2021",224.85,"miguel cabrera,2020",226.06,"joey votto,2022",226.28,"carlos santana,2022",226.32,"andrew mccutchen,2023",226.6,"matt holliday,2018",227.03,"steve pearce,2018",227.05,"joey votto,2020",227.15
18,"albert pujols,2022",213.99,"albert pujols,2021",219.85,"adrian beltré,2018",223.85,"joey votto,2022",224.27,"miguel cabrera,2023",225.2,"shin-soo choo,2020",225.32,"albert pujols,2020",225.67,"curtis granderson,2019",226.48,"joey votto,2021",226.61,"miguel cabrera,2020",226.69
19,"albert pujols,2022",221.97,"albert pujols,2021",224.72,"miguel cabrera,2023",228.65,"joey votto,2022",228.99,"albert pujols,2019",229.39,"albert pujols,2020",229.56,"adrian beltré,2018",230.06,"joey votto,2023",231.35,"andrew mccutchen,2023",232.2,"erik kratz,2020",232.39
20,"albert pujols,2022",226.69,"albert pujols,2021",227.42,"miguel cabrera,2023",230.06,"albert pujols,2020",230.59,"albert pujols,2019",231.58,"erik kratz,2020",232.18,"adrian beltré,2018",232.37,"ben zobrist,2019",232.45,"chase utley,2018",233.5,"brett gardner,2021",234.29
27,"joey votto,2021",205.29,"aaron judge,2023",209.44,"albert pujols,2022",209.55,"david freese,2019",212.02,"josh donaldson,2021",212.86,"aaron judge,2022",213.46,"miguel cabrera,2020",214.29,"j.d. martinez,2023",215.16,"matt holliday,2018",215.33,"shin-soo choo,2019",215.78


Take the nearest neighbor that is a different player, look up that neighbor's WAR in the season after the matched one, and subtract the index player's WAR in his own following season.

In [None]:
def calculate_war_diff(row, data=None):
    """Next-season WAR of the closest other player minus the index player's next-season WAR.

    The index player is taken from ``row['n1']`` (the row's nearest neighbor is assumed
    to be the player-season itself). Neighbor slots ``n1``..``n10`` are scanned in order;
    the first neighbor with a different name that has a row for the season after the
    matched one supplies the comparison value.

    Parameters
    ----------
    row : pandas.Series
        One row of the neighbor table; ``n<k>`` cells hold ``"name,season"`` labels.
    data : pandas.DataFrame, optional
        Frame with ``Name``, ``Season`` and ``WAR`` columns. Defaults to the
        notebook-level ``df``.

    Returns
    -------
    float or None
        ``neighbor_next_WAR - player_next_WAR``, or ``None`` when the index player has
        no following season in ``data`` (previously an IndexError) or when no usable
        neighbor exists.
    """
    if data is None:
        data = df

    def _parse(label):
        # Split on the last comma only, so a name containing a comma survives; tolerate "2022.0".
        name, season = str(label).rsplit(',', 1)
        return name.strip(), int(float(season))

    def _next_season_war(name, season):
        hits = data.loc[(data['Name'] == name) & (data['Season'] == season + 1), 'WAR'].values
        return hits[0] if len(hits) > 0 else None

    # index player
    player_name, season = _parse(row['n1'])
    player_war_next = _next_season_war(player_name, season)
    if player_war_next is None:
        return None  # nothing to compare against (e.g. last season in the data)

    # WAR for the nearest player not named the same; only the n<k> columns are read,
    # the d<k> distance columns in between are skipped.
    for i in range(1, 11):
        neighbor_col = f'n{i}'
        if neighbor_col not in row or pd.isna(row[neighbor_col]):
            continue  # slot missing or not filled
        neighbor_name, neighbor_season = _parse(row[neighbor_col])
        if neighbor_name == player_name:
            continue
        neighbor_war_next = _next_season_war(neighbor_name, neighbor_season)
        if neighbor_war_next is not None:
            return neighbor_war_next - player_war_next

    return None

In [None]:
# Per-row difference between the nearest other player's next-season WAR and the index player's.
kNear['war_diff'] = kNear.apply(calculate_war_diff, axis=1)
# Loss: total absolute WAR difference over all rows.
miss = kNear['war_diff'].abs().sum()
print(miss)

3038.9272075506224


baseline 6768.585203621661
<small>age*2 4426.339083247895 age*3 2956.867042393109 age*4 3820.9909826534395 age3, pa2 4548.5247146644215 </small>

### Function to Optimize Weighting

The goal is to optimize the per-column weights so that this war_diff loss is minimized: the optimizer takes one weight per feature column as input and searches for the weights with the lowest loss.

In [None]:
# Feature columns that receive a weight: everything except the name and identifier columns.
cols_opti = normalized_data.columns.drop(['Name', 'Season', 'MLBAMID']).tolist()
# Start every feature at weight 1.
opti_df = pd.DataFrame({'stat':cols_opti, 'weight':1})
opti_df.head(5)

Unnamed: 0,stat,weight
0,PA,1
1,Age,1
2,BB%,1
3,K%,1
4,BABIP,1


In [None]:
def loss_fn(df_to_opti):
    """Total absolute next-season WAR miss for a given set of feature weights.

    Rebuilds the normalized table from the notebook-level ``z_scores`` / ``df``,
    multiplies each feature column by its weight, finds the 10 Euclidean nearest
    neighbors of every player-season, and sums ``|war_diff|`` as computed by
    ``calculate_war_diff``.

    Parameters
    ----------
    df_to_opti : pandas.DataFrame
        Columns ``stat`` (feature column name) and ``weight`` (multiplier). Stats that
        are not feature columns are ignored; features without an entry keep weight 1.

    Returns
    -------
    float
        Sum of absolute WAR differences (rows without a usable difference are skipped
        by the sum).

    Notes
    -----
    All rows are queried in a single ``kneighbors`` call and the neighbor table is
    built once from a list of records. The previous version issued one query per row,
    grew the DataFrame row by row, and silently skipped any neighbor that raised.
    """
    # read df
    normalized_data = z_scores.copy()
    normalized_data.insert(0, 'Name', df['Name'])
    normalized_data = normalized_data[normalized_data['Season'] < 2024]
    X = normalized_data.drop(columns=['Name', 'Season', 'MLBAMID'])

    # Apply weights: one multiplier per feature column, defaulting to 1.
    scale = pd.Series(1.0, index=X.columns)
    for stat, weight in zip(df_to_opti['stat'], df_to_opti['weight']):
        if stat in scale.index:
            scale[stat] *= weight
    X = X * scale  # Series aligns on column names

    nn = NearestNeighbors(n_neighbors=10, metric='euclidean', n_jobs=-1)
    nn.fit(X)
    distances, indices = nn.kneighbors(X)

    # Neighbor indices are row positions in X / normalized_data.
    names = normalized_data['Name'].to_numpy()
    seasons = normalized_data['Season'].to_numpy()

    records = []
    for dist_row, pos_row in zip(distances, indices):
        record = {}
        for slot, (pos, dist) in enumerate(zip(pos_row, dist_row), start=1):
            record[f'n{slot}'] = f"{names[pos]},{seasons[pos]}"
            record[f'd{slot}'] = f"{dist:.2f}"
        records.append(record)
    kNear = pd.DataFrame(records, index=normalized_data.index)

    # Calculate war difference
    kNear['war_diff'] = kNear.apply(calculate_war_diff, axis=1)
    miss = kNear['war_diff'].abs().sum()

    print(f"iter complete with miss of {miss}")

    return miss


In [None]:
from scipy.optimize import minimize

initial_weights = np.ones(len(cols_opti))

# change loss fn to work with weights
def loss_fn_array(weights):
    """Adapter for scipy: wrap a flat weight vector (ordered like ``cols_opti``) in the
    stat/weight DataFrame that ``loss_fn`` expects and return its loss."""
    temp_opti_df = pd.DataFrame({'stat': cols_opti, 'weight': weights})
    return loss_fn(temp_opti_df)

# set bounds for weights (0 to 3 for each weight)
bounds = [(0, 3)] * len(cols_opti)

# Optimize.
# The loss only changes when a nearest-neighbor ranking changes, so it is piecewise constant
# in the weights. L-BFGS-B estimates gradients with tiny finite-difference steps, sees a
# zero gradient almost everywhere and stops at the starting point. Powell is derivative-free
# and honours the bounds (bounds support for Powell assumes a reasonably recent SciPy —
# TODO confirm the installed version).
opt = minimize(fun=loss_fn_array,
               x0=initial_weights,
               method='Powell',
               bounds=bounds,
               options={'maxiter': 10})

KeyboardInterrupt: 