### NN Model for 2023 Data

#### Imports

packages

In [77]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import numpy as np
import os

local import

In [78]:
os.chdir('C:/Users/dalto/OneDrive/Pictures/Documents/Projects/MLB Season Model/')
data_23_24 = pd.read_csv('./data/cleaned/data_23_24.csv')
df = data_23_24.copy() # import data
df = df.dropna()

drop always unneeded cols

In [79]:
df = df.drop(columns=['Unnamed: 0', 'Team', 'Def'])

#### Normalize Data

copied from 'exploration/nearest_neighbors.ipynb'

In [80]:
def mean_year(group):
        avg = np.average(group, weights=group.index.map(lambda x: df.loc[x, 'PA']))
        return avg

In [81]:
# weighted std to account for low PA outlier preformances
def calculate_weighted_std(group):
    avg = np.average(group, weights=group.index.map(lambda x: df.loc[x, 'PA']))
    variance = np.average((group - avg)**2, weights=group.index.map(lambda x: df.loc[x, 'PA']))
    return np.sqrt(variance)

In [82]:
def z_scores(player_df, mean_df, std_df):
    # z scores for each stat and player, also creating a new df flow, add back names based on MLBAMID later
    z_scores = pd.DataFrame(index=player_df.index, columns=player_df.select_dtypes(include=[np.number]).columns)

    # scaled for each col
    for col in player_df.select_dtypes(include=[np.number]).columns:
        if col not in ['Season', 'MLBAMID']:  # not needed
            for idx in player_df.index:
                season = player_df.loc[idx, 'Season']
                value = player_df.loc[idx, col]
                mean = mean_df.loc[mean_df['Season'] == season, col].iloc[0]
                std = std_df.loc[std_df['Season'] == season, col].iloc[0]
                # scale so that 10 is 1 std away
                z_scores.loc[idx, col] = 100 + ((value - mean) / std * 10)

    z_scores['Season'] = player_df['Season']
    
    # cols to numeric
    for col in z_scores.columns:
        if col not in ['Season', 'MLBAMID']:
            z_scores[col] = pd.to_numeric(z_scores[col])
    
    # reinsert player names
    z_scores.insert(0, 'Name', player_df['Name']) 

    return z_scores

mean for each year

In [83]:
# Create empty DataFrame to store results
mean_col_year = pd.DataFrame(index=df['Season'].unique())

# Loop through numeric columns
for col in df.select_dtypes(include=[np.number]).columns:
    if col != 'Season' and col != 'MLBAMID':  # skip meaningless numeric
        yearly_mean = df.groupby('Season')[col].apply(mean_year)
        mean_col_year[col] = yearly_mean
        
# clean up df to make it easier to work with
mean_col_year = mean_col_year.reset_index()
mean_col_year = mean_col_year.rename(columns={'index': 'Season'})
mean_col_year = mean_col_year.sort_values(by=['Season'])
mean_col_year.head()

Unnamed: 0,Season,PA,Age,BB%,K%,BABIP,wRC+,BsR,Off,WAR,...,bb_percent,woba,xwoba,sweet_spot_percent,barrel_batted_rate,hard_hit_percent,avg_best_speed,avg_hyper_speed,whiff_percent,swing_percent
0,2023,446.221213,27.977189,0.085983,0.227243,0.296656,100.236902,0.052217,3.800717,1.766132,...,8.601494,0.318553,0.320352,33.959502,8.335061,39.656425,99.865646,94.204951,25.523627,47.722911
1,2024,447.381885,27.939266,0.081851,0.226063,0.291184,100.084642,0.027876,3.517452,1.786232,...,8.184799,0.310247,0.312341,33.907515,8.037711,39.175173,99.695445,94.114469,25.132269,48.125014


std for each year

In [84]:
# Create empty DataFrame to store results
std_col_year = pd.DataFrame(index=df['Season'].unique())

# Loop through numeric columns
for col in df.select_dtypes(include=[np.number]).columns:
    if col != 'Season' and col != 'MLBAMID':  # skip meaningless numeric
        yearly_std = df.groupby('Season')[col].apply(calculate_weighted_std)
        std_col_year[col] = yearly_std

# clean up df to make it easier to work with
std_col_year = std_col_year.reset_index()
std_col_year = std_col_year.rename(columns={'index': 'Season'})
std_col_year = std_col_year.sort_values(by=['Season'])
std_col_year.head()

Unnamed: 0,Season,PA,Age,BB%,K%,BABIP,wRC+,BsR,Off,WAR,...,bb_percent,woba,xwoba,sweet_spot_percent,barrel_batted_rate,hard_hit_percent,avg_best_speed,avg_hyper_speed,whiff_percent,swing_percent
0,2023,184.640874,3.676013,0.031614,0.0632,0.041398,28.804999,2.741822,15.148537,1.935213,...,3.160195,0.041785,0.039241,4.295183,4.296957,8.05479,2.490148,1.443158,6.280495,5.035636
1,2024,181.345314,3.623815,0.028821,0.062645,0.038889,29.403954,2.684778,15.563986,2.050169,...,2.883528,0.042365,0.039278,4.165549,4.189756,8.161489,2.604874,1.512985,6.330819,4.873296


z_scores

In [85]:
# z scores for each stat and player, applying function above
normalized_data = z_scores(df, mean_col_year, std_col_year)
normalized_data.head()

Unnamed: 0,Name,Season,PA,Age,BB%,K%,BABIP,wRC+,BsR,Off,...,bb_percent,woba,xwoba,sweet_spot_percent,barrel_batted_rate,hard_hit_percent,avg_best_speed,avg_hyper_speed,whiff_percent,swing_percent
0,aaron hicks,2023,92.730688,113.663746,115.383212,99.036504,103.4362,103.009224,103.689726,100.390931,...,115.500641,101.782182,90.226519,82.400047,89.911325,87.266675,87.129859,87.198924,100.121603,85.259238
1,aaron hicks,2024,78.80387,116.724734,104.644954,122.191113,79.669692,72.758068,100.503868,94.138796,...,104.561084,73.033048,83.619107,96.380995,94.897767,95.25188,99.781672,100.079318,112.111753,91.535474
2,aaron judge,2023,100.63793,108.223069,133.579146,108.955677,100.926857,124.953327,91.050337,122.030436,...,133.5375,124.27845,135.841921,107.311674,144.601185,130.470781,130.291418,138.414595,117.636146,87.245085
3,aaron judge,2024,114.150799,111.205688,137.149845,102.687322,119.616021,140.246539,98.231326,158.744597,...,137.16004,139.124692,142.430135,116.546403,145.020012,126.741231,128.811338,132.976083,108.794646,87.431476
4,aaron schunk,2024,80.73389,94.648551,85.762277,114.408724,109.43302,82.786786,97.48617,93.519697,...,85.834024,87.431585,80.563989,113.905694,92.272316,90.84092,85.310293,85.76996,103.897965,121.289464


correlation with WRC+, help feature selection

In [86]:
corr = normalized_data.corr(numeric_only=True)
corr = corr['wRC+']
print(corr)

Season               -0.001886
PA                    0.516948
Age                   0.010575
BB%                   0.342593
K%                   -0.348082
BABIP                 0.666505
wRC+                  1.000000
BsR                   0.005465
Off                   0.617394
WAR                   0.612147
Barrel%               0.473195
maxEV                 0.462960
HardHit%              0.400085
O-Swing%             -0.202483
Z-Swing%              0.001095
O-Contact%            0.141159
Z-Contact%            0.202123
Contact%              0.235714
SwStr%               -0.268000
CSW%                 -0.320614
player_id            -0.045208
pa                    0.516732
k_percent            -0.348130
bb_percent            0.342328
woba                  0.994585
xwoba                 0.802905
sweet_spot_percent    0.401221
barrel_batted_rate    0.474190
hard_hit_percent      0.400273
avg_best_speed        0.426823
avg_hyper_speed       0.401459
whiff_percent        -0.227577
swing_pe

#### NN

function to find the str nearest neighbor based on a found nn index

In [87]:
def df_apply(df, indices):
    num_samples = indices.shape[0]
    output_index = df.index
    rows_data = []
    indices_array = indices.astype(int)

    for i in range(num_samples):
        row_neighbors = {}
        neighbor_positional_indices = indices_array[i]

        for j, neighbor_pos_idx in enumerate(neighbor_positional_indices):
            neighbor_row = df.iloc[neighbor_pos_idx]
            name = neighbor_row['Name']
            season = str(neighbor_row['Season'])
            row_neighbors[f'n{j+1}'] = f"{name},{season}"

        rows_data.append(row_neighbors)

    return pd.DataFrame(rows_data, index=output_index)

diff between target player season plus 1, and closest neighbor season plus 1 WRC+

In [88]:
def single_n_diff(row):
    wrc_current = np.nan
    player_name = None

    try:
        player_info = row['n1'].split(',')
        player_name = player_info[0].strip()
        season = int(player_info[1])

        current_lookup = df[(df['Name'] == player_name) & (df['Season'] == season + 1)]['wRC+']

        if not current_lookup.empty:
            wrc_current = current_lookup.iloc[0]
        else:
            return np.nan
    except (IndexError, ValueError, AttributeError, KeyError):
         return np.nan

    wrc_target_val = np.nan
    for i in range(2, 6):
        neighbor_col = f'n{i}'
        if neighbor_col in row and pd.notna(row[neighbor_col]):
            try:
                neighbor_info = row[neighbor_col].split(',')
                neighbor_name = neighbor_info[0].strip()

                if neighbor_name != player_name:
                    neighbor_season = int(neighbor_info[1])

                    neighbor_lookup = df[(df['Name'] == neighbor_name) & (df['Season'] == neighbor_season + 1)]['wRC+']

                    if not neighbor_lookup.empty:
                        wrc_target_val = neighbor_lookup.iloc[0]
                        break
            except (IndexError, ValueError, AttributeError):
                 continue

    if pd.notna(wrc_current) and pd.notna(wrc_target_val):
        return abs(wrc_target_val - wrc_current)
    else:
        return np.nan

Implementing NN

<small> Removed players below 200 PA (85 is the normalized value) because I was having a ton of issues with small sample size players impacting preformance

In [89]:
X1 = normalized_data[normalized_data['PA'] > 85] # filter pa so we dont get small sample outliers, corresponds to about 200 PA
X1 = normalized_data[normalized_data['Season'] < 2024]
X1 = X1.reset_index(drop=True)
X1 = X1.drop(columns=['BsR', 'Z-Swing%', 'O-Contact%', 'Z-Contact%', 'player_id', 'pa', 'k_percent', 'bb_percent', 'woba']) # filter uneed features, more varience
X = X1.drop(columns=['Name', 'Season']) # same as x1 without names
X = X.to_numpy()

In [90]:
# nearest neighbors and indices
nn = NearestNeighbors(n_neighbors=9).fit(X)
indices_test = nn.kneighbors(X, return_distance=False)

Nearest Neighbor for Every Player in df

In [91]:
kNear = pd.DataFrame(columns=['n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9'])
kNear = df_apply(df = X1, indices=indices_test)
for i in range(1, 7):  # for columns n1 through n6
    kNear[f'n{i}'] = kNear[f'n{i}'].astype(str)

Loss Value

In [92]:
# Calculate wrc_diff and handle missing values
kNear['wrc_diff'] = kNear.apply(single_n_diff, axis=1)

# Calculate sum of wrc_diff, ignoring NaN values
miss = kNear['wrc_diff'].sum(skipna=True)
print(miss)

16171.947460148318


Experiment with how to calc loss
<br>
<small> look at more then one neighbor? </small>

In [93]:
# same as the first loss function except now calculating 8 neighbors and weighted avg wrc+
def wrc_diff_weight(row):
    wrc_current = np.nan
    player_name = None

    try:
        player_info = row['n1'].split(',')
        player_name = player_info[0].strip()
        season = int(player_info[1])

        current_lookup = df[(df['Name'] == player_name) & (df['Season'] == season + 1)]['wRC+']

        if not current_lookup.empty:
            wrc_current = current_lookup.iloc[0]
        else:
            return np.nan
    except (IndexError, ValueError, AttributeError, KeyError):
         return np.nan

    wrc_target_val = []
    for i in range(2, 10):
        neighbor_col = f'n{i}'
        if neighbor_col in row and pd.notna(row[neighbor_col]):
            try:
                neighbor_info = row[neighbor_col].split(',')
                neighbor_name = neighbor_info[0].strip()

                if neighbor_name != player_name:
                    neighbor_season = int(neighbor_info[1])

                    neighbor_lookup = df[(df['Name'] == neighbor_name) & (df['Season'] == neighbor_season + 1)]['wRC+']

                    if not neighbor_lookup.empty:
                        wrc_target_val.append(neighbor_lookup.iloc[0]) # add neighbors

                    # if len of the list is greater then 2 break
                    if len(wrc_target_val) >= 7:
                        break
            except (IndexError, ValueError, AttributeError):
                continue

    if pd.notna(wrc_current) and len(wrc_target_val) > 0:
        weights = [0.25, 0.15, 0.125, 0.125, 0.125, 0.125, 0.1]
        avg_target = sum(w * x for w, x in zip(weights[:len(wrc_target_val)], wrc_target_val)) / sum(weights[:len(wrc_target_val)])
        return abs(avg_target - wrc_current)
    else:
        return np.nan


loss | diffrence in future year WRC+

In [94]:
kNear['wrc_diff'] = kNear.apply(wrc_diff_weight, axis=1)
miss = kNear['wrc_diff'].sum(skipna=True)
print(miss)

12607.289661320436


#### loss function 

<small> just adding all the previous steps together to make it easy to tune col weights

In [95]:
def loss_fn(weights, normalized_data):

    # begin eval function
    weighted_og = normalized_data[normalized_data['Season'] < 2024] # filter for eval metric
    weighted_og = weighted_og[weighted_og['PA'] > 85] # drop if below 200 PA, more stability, less outliers
    weighted_og = weighted_og.reset_index(drop=True)
    weighted_og = weighted_og.drop(columns=['BsR', 'Z-Swing%', 'O-Contact%', 'Z-Contact%'])
    weighted_x = weighted_og.drop(columns=['Name', 'Season'])
    

    # apply weights
    for col in weighted_x.columns:
        if col in weights:
            weighted_x[col] = weighted_x[col] * weights[col]

    # nearest neighbors
    nn_fn = NearestNeighbors(n_neighbors=9).fit(weighted_x)

    # distances and index
    indices_test_fn = nn_fn.kneighbors(weighted_x, return_distance=False)


    # add data using the mapped indices
    kNear_fn = df_apply(df = weighted_og, indices=indices_test_fn) # pass mapped to avoid out of bounds

    for i in range(1, 7):  # for columns n1 through n6
        kNear_fn[f'n{i}'] = kNear_fn[f'n{i}'].astype(str)

    # add miss
    kNear_fn['wrc_diff'] = kNear_fn.apply(wrc_diff_weight, axis=1) # using new wrc calc

    # drop inf or na
    kNear_fn = kNear_fn.dropna(subset=['wrc_diff'])
    kNear_fn['wrc_diff'] = kNear_fn['wrc_diff'].clip(upper=1000)

    # calc miss
    miss_fn = kNear_fn['wrc_diff'].sum(skipna=True)
    print(f"miss of {miss_fn}")
    return miss_fn


#### Manual Weight Tuning

<small> Ditched scipy largely because it was ineffective in finding the weights. Alot of this was likely because it was super slow as I coded everything in pandas and the it had to recompute the df constantly. This was widely inefficent. This is def my fault to a certian extent, however I can tune it just as effectively if not more (I wanted to do scipy just to learn it). Doing it manually also allows me not to recode everything. </small>

creating weight index

In [96]:
cols = normalized_data.drop(columns=['BsR', 'Z-Swing%', 'O-Contact%', 'Z-Contact%', 'player_id', 'pa', 'k_percent', 'bb_percent', 'woba']) # filter uneed features, more varience
cols = cols.drop(columns=['Name', 'Season'])
cols_opti = cols.columns
weights = {}
for col in cols_opti:
    weights[col] = 1
print(weights)

{'PA': 1, 'Age': 1, 'BB%': 1, 'K%': 1, 'BABIP': 1, 'wRC+': 1, 'Off': 1, 'WAR': 1, 'Barrel%': 1, 'maxEV': 1, 'HardHit%': 1, 'O-Swing%': 1, 'Contact%': 1, 'SwStr%': 1, 'CSW%': 1, 'xwoba': 1, 'sweet_spot_percent': 1, 'barrel_batted_rate': 1, 'hard_hit_percent': 1, 'avg_best_speed': 1, 'avg_hyper_speed': 1, 'whiff_percent': 1, 'swing_percent': 1}


manual weight tuining


In [97]:
weights = {'PA': 1, 'Age': 1, 'BB%': 1, 'K%': 1, 'BABIP': 1, 'wRC+': 1, 'Off': 1, 'WAR': 1, 'Barrel%': 1, 
           'maxEV': 1, 'HardHit%': 1, 'O-Swing%': 1, 'Contact%': 1, 'SwStr%': 1, 'CSW%': 1, 'xwoba': 1, 
           'sweet_spot_percent': 1, 'barrel_batted_rate': 1, 'hard_hit_percent': 1, 'avg_best_speed': 1,
             'avg_hyper_speed': 1, 'whiff_percent': 1, 'swing_percent': 1}

loss value

In [98]:
loss_value = loss_fn(weights = weights, normalized_data=normalized_data)

miss of 8171.802612505395


#### 2025 Predections

war predection function

In [99]:
def wrc_predict(row):
    player_info = row['t1'].split(',')
    player_name = player_info[0].strip()

    wrc_target_val = []
    for i in range(1, 9):
        neighbor_col = f'n{i}'
        if neighbor_col in row and pd.notna(row[neighbor_col]):
            try:
                neighbor_info = row[neighbor_col].split(',')
                neighbor_name = neighbor_info[0].strip()

                neighbor_season = int(neighbor_info[1])
                neighbor_lookup = df[(df['Name'] == neighbor_name) & (df['Season'] == neighbor_season + 1)]['wRC+']

                if not neighbor_lookup.empty:
                    wrc_target_val.append(neighbor_lookup.iloc[0]) # add neighbors

                    # if len of the list is greater then 2 break
                    if len(wrc_target_val) >= 7:
                        break

            except (IndexError, ValueError, AttributeError):
                continue

    if len(wrc_target_val) > 0:
        weights = [0.25, 0.15, 0.125, 0.125, 0.125, 0.125, 0.1]
        avg_target = sum(w * x for w, x in zip(weights[:len(wrc_target_val)], wrc_target_val)) / sum(weights[:len(wrc_target_val)])
        return avg_target
    else:
        return np.nan
  


special apply function

In [100]:
def results_apply(player_df, index_df, knn_pd):
    rows_data = []
    for i in range(len(knn_pd)):
        row_data = {}
        for col in knn_pd.columns:
            if col == 'player':
                neighbor_idx = knn_pd.iloc[i][col]
                neighbor_row = player_df.iloc[neighbor_idx]
                name = neighbor_row['Name']
                season = str(neighbor_row['Season'])
                row_data['t1'] = f"{name},{season}"
            else:
                neighbor_idx = knn_pd.iloc[i][col]
                neighbor_row = index_df.iloc[neighbor_idx]
                name = neighbor_row['Name']
                season = str(neighbor_row['Season'])
                row_data[f'n{int(col + 1)}'] = f"{name},{season}"
        rows_data.append(row_data)
    return pd.DataFrame(rows_data)


prepare data for indexed data search

In [108]:
target_data = normalized_data[normalized_data['Season'] >= 2024]
target_data = target_data.reset_index()
target_data = target_data.drop(columns=['BsR', 'Z-Swing%', 'O-Contact%', 'Z-Contact%', 'player_id', 'pa', 'k_percent', 'bb_percent', 'woba', 'index']) # filter uneed features, more varience
index_search = target_data.drop(columns=['Name', 'Season']) # same as x1 without names
index_search = index_search.to_numpy()

search for index

In [109]:
# nearest neighbors and index
nn = NearestNeighbors(n_neighbors=9).fit(X)

In [110]:
# nearest neighbors and indices for predictions
index_25 = nn.kneighbors(index_search, return_distance=False)

index to pandas

In [111]:
# to pd so its easier to work with
index_list = pd.DataFrame(index_25)

# insert player
index_list['player'] = index_list.index

# add index into first col as target player
cols = index_list.columns.tolist()
cols.insert(0, cols.pop(cols.index('player')))
index_list = index_list[cols]

apply results search

In [112]:
preds = pd.DataFrame(columns=['n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9'])
preds = results_apply(player_df = target_data, index_df=X1, knn_pd=index_list)
for i in range(1, 7):  # for columns n1 through n6
    preds[f'n{i}'] = preds[f'n{i}'].astype(str)

calculate WAR

In [113]:
# WAR Predection
preds['proj_wrc+'] = preds.apply(wrc_predict, axis=1)
final_predection = preds[['t1', 'proj_wrc+']]

export to folder

In [114]:
final_predection.to_csv('./models/wrc_models/2025_Predections/nn_23_preds.csv')