### NN Model for 2015-2023 Data

#### Imports

packages

In [None]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import numpy as np
import os
import sys
function_dir = 'C:/Users/dalto/OneDrive/Pictures/Documents/Projects/MLB Season Model/tools'
sys.path.append(function_dir)

local import

In [2]:
os.chdir('C:/Users/dalto/OneDrive/Pictures/Documents/Projects/MLB Season Model/')
data_15_24 = pd.read_csv('./data/cleaned/data_15_24.csv')
df = data_15_24.copy() # import data
df = df.dropna(axis=1)

drop always unneeded cols

In [3]:
df = df.drop(columns=['Unnamed: 0', 'Team', 'Def'])

#### Normalize Data

In [None]:
from tools.zscore import normalize
normalized_data = normalize(df)

Unnamed: 0,Name,Season,PA,Age,BB%,K%,BABIP,wRC+,BsR,Off,...,Barrel%,maxEV,HardHit%,O-Swing%,Z-Swing%,O-Contact%,Z-Contact%,Contact%,SwStr%,CSW%
0,aaron altherr,2015,83.673311,88.854248,106.668284,109.242237,100.019531,108.211937,102.945489,101.636385,...,91.553208,100.149892,111.247874,88.405413,92.88364,88.363989,100.809601,95.826222,99.461747,106.268899
1,aaron altherr,2016,87.364658,91.16134,105.803035,115.760952,96.653426,87.036875,98.792426,90.263165,...,92.278037,104.128481,99.768632,93.710938,94.490944,87.435041,92.925354,89.970469,105.558546,114.162237
2,aaron altherr,2017,97.353487,94.028201,97.141222,106.388453,106.42176,106.871485,95.557806,103.850036,...,109.344091,106.15948,111.284496,97.626523,93.865722,87.390295,94.516551,90.558962,106.211698,112.540041
3,aaron altherr,2018,90.475994,97.133726,112.40988,115.927895,87.274674,90.895454,101.946707,92.161007,...,108.407488,112.339199,111.721561,93.481169,100.895639,75.452787,94.727436,86.216213,109.814056,113.176705
4,aaron altherr,2019,78.878232,100.237125,87.158527,124.067967,55.223476,59.296534,101.055712,91.019701,...,95.667955,108.819853,80.096932,100.057338,107.938992,87.90887,74.355868,81.722979,119.551445,117.709082


correlation with WRC+, help feature selection

In [10]:
corr = normalized_data.corr(numeric_only=True)
corr = corr['wRC+']
print(corr)

Season        0.008982
PA            0.525432
Age           0.013171
BB%           0.348095
K%           -0.348286
BABIP         0.664588
wRC+          1.000000
BsR          -0.002865
Off           0.617007
WAR           0.620593
Barrel%       0.481256
maxEV         0.489666
HardHit%      0.464444
O-Swing%     -0.183812
Z-Swing%      0.018460
O-Contact%    0.170086
Z-Contact%    0.159086
Contact%      0.199976
SwStr%       -0.234586
CSW%         -0.313831
Name: wRC+, dtype: float64


#### NN

function to find the str nearest neighbor based on a found nn index

In [11]:
def df_apply(df, indices):
    num_samples = indices.shape[0]
    output_index = df.index
    rows_data = []
    indices_array = indices.astype(int)

    for i in range(num_samples):
        row_neighbors = {}
        neighbor_positional_indices = indices_array[i]

        for j, neighbor_pos_idx in enumerate(neighbor_positional_indices):
            neighbor_row = df.iloc[neighbor_pos_idx]
            name = neighbor_row['Name']
            season = str(neighbor_row['Season'])
            row_neighbors[f'n{j+1}'] = f"{name},{season}"

        rows_data.append(row_neighbors)

    return pd.DataFrame(rows_data, index=output_index)

diff between target player season plus 1, and closest neighbor season plus 1 WRC+

In [12]:
def single_n_diff(row):
    wrc_current = np.nan
    player_name = None

    try:
        player_info = row['n1'].split(',')
        player_name = player_info[0].strip()
        season = int(player_info[1])

        current_lookup = df[(df['Name'] == player_name) & (df['Season'] == season + 1)]['wRC+']

        if not current_lookup.empty:
            wrc_current = current_lookup.iloc[0]
        else:
            return np.nan
    except (IndexError, ValueError, AttributeError, KeyError):
         return np.nan

    wrc_target_val = np.nan
    for i in range(2, 6):
        neighbor_col = f'n{i}'
        if neighbor_col in row and pd.notna(row[neighbor_col]):
            try:
                neighbor_info = row[neighbor_col].split(',')
                neighbor_name = neighbor_info[0].strip()

                if neighbor_name != player_name:
                    neighbor_season = int(neighbor_info[1])

                    neighbor_lookup = df[(df['Name'] == neighbor_name) & (df['Season'] == neighbor_season + 1)]['wRC+']

                    if not neighbor_lookup.empty:
                        wrc_target_val = neighbor_lookup.iloc[0]
                        break
            except (IndexError, ValueError, AttributeError):
                 continue

    if pd.notna(wrc_current) and pd.notna(wrc_target_val):
        return abs(wrc_target_val - wrc_current)
    else:
        return np.nan

Implementing NN

<small> Removed players below 200 PA (85 is the normalized value) because I was having a ton of issues with small sample size players impacting preformance

In [13]:
X1 = normalized_data[normalized_data['PA'] > 85] # filter pa so we dont get small sample outliers, corresponds to about 200 PA
X1 = normalized_data[normalized_data['Season'] < 2024]
X1 = X1.reset_index(drop=True)
X1 = X1.drop(columns=['BsR', 'Z-Swing%', 'O-Contact%', 'Z-Contact%', 'Age']) # filter uneed features, more varience
X = X1.drop(columns=['Name', 'Season']) # same as x1 without names
X = X.to_numpy()

In [14]:
# nearest neighbors and indices
nn = NearestNeighbors(n_neighbors=9).fit(X)
indices_test = nn.kneighbors(X, return_distance=False)

Nearest Neighbor for Every Player in df

In [15]:
kNear = pd.DataFrame(columns=['n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9'])
kNear = df_apply(df = X1, indices=indices_test)
for i in range(1, 7):  # for columns n1 through n6
    kNear[f'n{i}'] = kNear[f'n{i}'].astype(str)

Loss Value

In [16]:
# Calculate wrc_diff and handle missing values
kNear['wrc_diff'] = kNear.apply(single_n_diff, axis=1)

# Calculate sum of wrc_diff, ignoring NaN values
miss = kNear['wrc_diff'].sum(skipna=True)
print(miss)

155335.4197027843


Experiment with how to calc loss
<br>
<small> look at more then one neighbor? </small>

In [17]:
# same as the first loss function except now calculating 8 neighbors and weighted avg wrc+
def wrc_diff_weight(row):
    wrc_current = np.nan
    player_name = None

    try:
        player_info = row['n1'].split(',')
        player_name = player_info[0].strip()
        season = int(player_info[1])

        current_lookup = df[(df['Name'] == player_name) & (df['Season'] == season + 1)]['wRC+']

        if not current_lookup.empty:
            wrc_current = current_lookup.iloc[0]
        else:
            return np.nan
    except (IndexError, ValueError, AttributeError, KeyError):
         return np.nan

    wrc_target_val = []
    for i in range(2, 10):
        neighbor_col = f'n{i}'
        if neighbor_col in row and pd.notna(row[neighbor_col]):
            try:
                neighbor_info = row[neighbor_col].split(',')
                neighbor_name = neighbor_info[0].strip()

                if neighbor_name != player_name:
                    neighbor_season = int(neighbor_info[1])

                    neighbor_lookup = df[(df['Name'] == neighbor_name) & (df['Season'] == neighbor_season + 1)]['wRC+']

                    if not neighbor_lookup.empty:
                        wrc_target_val.append(neighbor_lookup.iloc[0]) # add neighbors

                    # if len of the list is greater then 2 break
                    if len(wrc_target_val) >= 7:
                        break
            except (IndexError, ValueError, AttributeError):
                continue

    if pd.notna(wrc_current) and len(wrc_target_val) > 0:
        weights = [0.25, 0.15, 0.125, 0.125, 0.125, 0.125, 0.1]
        avg_target = sum(w * x for w, x in zip(weights[:len(wrc_target_val)], wrc_target_val)) / sum(weights[:len(wrc_target_val)])
        return abs(avg_target - wrc_current)
    else:
        return np.nan


loss | diffrence in future year WRC+

In [18]:
kNear['wrc_diff'] = kNear.apply(wrc_diff_weight, axis=1)
miss = kNear['wrc_diff'].sum(skipna=True)
print(miss)

121076.12240797182


#### loss function 

<small> just adding all the previous steps together to make it easy to tune col weights

In [19]:
def loss_fn(weights, normalized_data):

    # begin eval function
    weighted_og = normalized_data[normalized_data['Season'] < 2024] # filter for eval metric
    weighted_og = weighted_og[weighted_og['PA'] > 85] # drop if below 200 PA, more stability, less outliers
    weighted_og = weighted_og.reset_index(drop=True)
    weighted_og = weighted_og.drop(columns=['BsR', 'Z-Swing%', 'O-Contact%', 'Z-Contact%'])
    weighted_x = weighted_og.drop(columns=['Name', 'Season'])
    

    # apply weights
    for col in weighted_x.columns:
        if col in weights:
            weighted_x[col] = weighted_x[col] * weights[col]

    # nearest neighbors
    nn_fn = NearestNeighbors(n_neighbors=9).fit(weighted_x)

    # distances and index
    indices_test_fn = nn_fn.kneighbors(weighted_x, return_distance=False)


    # add data using the mapped indices
    kNear_fn = df_apply(df = weighted_og, indices=indices_test_fn) # pass mapped to avoid out of bounds

    for i in range(1, 7):  # for columns n1 through n6
        kNear_fn[f'n{i}'] = kNear_fn[f'n{i}'].astype(str)

    # add miss
    kNear_fn['wrc_diff'] = kNear_fn.apply(wrc_diff_weight, axis=1) # using new wrc calc

    # drop inf or na
    kNear_fn = kNear_fn.dropna(subset=['wrc_diff'])
    kNear_fn['wrc_diff'] = kNear_fn['wrc_diff'].clip(upper=1000)

    # calc miss
    miss_fn = kNear_fn['wrc_diff'].sum(skipna=True)
    print(f"miss of {miss_fn}")
    return miss_fn


#### Manual Weight Tuning

<small> Ditched scipy largely because it was ineffective in finding the weights. Alot of this was likely because it was super slow as I coded everything in pandas and the it had to recompute the df constantly. This was widely inefficent. This is def my fault to a certian extent, however I can tune it just as effectively if not more (I wanted to do scipy just to learn it). Doing it manually also allows me not to recode everything. </small>

creating weight index

In [20]:
cols = normalized_data.drop(columns=['BsR', 'Z-Swing%', 'O-Contact%', 'Z-Contact%']) # filter uneed features, more varience
cols = cols.drop(columns=['Name', 'Season'])

cols_opti = cols.columns
weights = {}
for col in cols_opti:
    weights[col] = 1
print(weights)

{'PA': 1, 'Age': 1, 'BB%': 1, 'K%': 1, 'BABIP': 1, 'wRC+': 1, 'Off': 1, 'WAR': 1, 'Barrel%': 1, 'maxEV': 1, 'HardHit%': 1, 'O-Swing%': 1, 'Contact%': 1, 'SwStr%': 1, 'CSW%': 1}


manual weight tuining


In [21]:
weights = {'PA': 1, 'Age': 1, 'BB%': 1, 
           'K%': 1, 'BABIP': 1, 'wRC+': 2, 
           'Off': 1, 'WAR': 1, 'Barrel%': 1, 
           'maxEV': 1.5, 'HardHit%': 1, 'O-Swing%': 1, 
           'Contact%': 1, 'SwStr%': 1, 'CSW%': 1}

loss value

In [22]:
loss_value = loss_fn(weights = weights, normalized_data=normalized_data)

miss of 74312.58591902777


#### 2025 Predections

predection function

In [23]:
def wrc_predict(row):
    player_info = row['t1'].split(',')
    player_name = player_info[0].strip()

    wrc_target_val = []
    for i in range(1, 9):
        neighbor_col = f'n{i}'
        if neighbor_col in row and pd.notna(row[neighbor_col]):
            try:
                neighbor_info = row[neighbor_col].split(',')
                neighbor_name = neighbor_info[0].strip()

                neighbor_season = int(neighbor_info[1])
                neighbor_lookup = df[(df['Name'] == neighbor_name) & (df['Season'] == neighbor_season + 1)]['wRC+']

                if not neighbor_lookup.empty:
                    wrc_target_val.append(neighbor_lookup.iloc[0]) # add neighbors

                    # if len of the list is greater then 2 break
                    if len(wrc_target_val) >= 7:
                        break

            except (IndexError, ValueError, AttributeError):
                continue

    if len(wrc_target_val) > 0:
        weights = [0.25, 0.15, 0.125, 0.125, 0.125, 0.125, 0.1]
        avg_target = sum(w * x for w, x in zip(weights[:len(wrc_target_val)], wrc_target_val)) / sum(weights[:len(wrc_target_val)])
        return avg_target
    else:
        return np.nan
  

special apply

In [24]:
def results_apply(player_df, index_df, knn_pd):
    rows_data = []
    for i in range(len(knn_pd)):
        row_data = {}
        for col in knn_pd.columns:
            if col == 'player':
                neighbor_idx = knn_pd.iloc[i][col]
                neighbor_row = player_df.iloc[neighbor_idx]
                name = neighbor_row['Name']
                season = str(neighbor_row['Season'])
                row_data['t1'] = f"{name},{season}"
            else:
                neighbor_idx = knn_pd.iloc[i][col]
                neighbor_row = index_df.iloc[neighbor_idx]
                name = neighbor_row['Name']
                season = str(neighbor_row['Season'])
                row_data[f'n{int(col + 1)}'] = f"{name},{season}"
        rows_data.append(row_data)
    return pd.DataFrame(rows_data)


prepare data for indexed data search

In [25]:
target_data = normalized_data[normalized_data['Season'] >= 2024]
target_data = target_data.reset_index()
target_data = target_data.drop(columns=['BsR', 'Z-Swing%', 'O-Contact%', 'Z-Contact%', 'index', 'Age']) # filter uneed features, more varience
index_search = target_data.drop(columns=['Name', 'Season']) # same as x1 without names
index_search = index_search.to_numpy()

nn

In [26]:
# nearest neighbors and index
nn = NearestNeighbors(n_neighbors=9).fit(X)
# nearest neighbors and indices for predictions
index_25 = nn.kneighbors(index_search, return_distance=False)

In [27]:
# to pd so its easier to work with
index_list = pd.DataFrame(index_25)

# insert player
index_list['player'] = index_list.index

# add index into first col as target player
cols = index_list.columns.tolist()
cols.insert(0, cols.pop(cols.index('player')))
index_list = index_list[cols]

apply predections

In [28]:
preds = pd.DataFrame(columns=['n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9'])
preds = results_apply(player_df = target_data, index_df=X1, knn_pd=index_list)
for i in range(1, 7):  # for columns n1 through n6
    preds[f'n{i}'] = preds[f'n{i}'].astype(str)

war proj

In [29]:
# WAR Predection
preds['proj_wrc+'] = preds.apply(wrc_predict, axis=1)
final_predection = preds[['t1', 'proj_wrc+']]

to csv

In [30]:
final_predection.to_csv('./models/wrc_models/2025_Predections/nn_15_23_preds.csv')