### NN Model for 2002-2024 Data

#### Imports

packages

In [None]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import numpy as np
import os

local import

In [2]:
os.chdir('C:/Users/dalto/OneDrive/Pictures/Documents/Projects/MLB Season Model/')
data_02_24 = pd.read_csv('./data/cleaned/data_02_24.csv')
df = data_02_24.copy() # import data

colab import

In [None]:
from google.colab import drive
drive.mount('/content/drive')
df = pd.read_csv('/content/drive/MyDrive/colab/data_02_24.csv')

drop always unneeded cols

In [4]:
df = df.drop(columns=['Unnamed: 0', 'Team', 'Def'])

#### Normalize Data

copied from 'exploration/nearest_neighbors.ipynb'

In [5]:
def mean_year(group):
        avg = np.average(group, weights=group.index.map(lambda x: df.loc[x, 'PA']))
        return avg

In [6]:
# weighted std to account for low PA outlier preformances
def calculate_weighted_std(group):
    avg = np.average(group, weights=group.index.map(lambda x: df.loc[x, 'PA']))
    variance = np.average((group - avg)**2, weights=group.index.map(lambda x: df.loc[x, 'PA']))
    return np.sqrt(variance)

In [None]:
def z_scores(player_df, mean_df, std_df):
    # z scores for each stat and player, also creating a new df flow, add back names based on MLBAMID later
    z_scores = pd.DataFrame(index=player_df.index, columns=player_df.select_dtypes(include=[np.number]).columns)

    # scaled for each col
    for col in player_df.select_dtypes(include=[np.number]).columns:
        if col not in ['Season', 'MLBAMID']:  # not needed
            for idx in player_df.index:
                season = player_df.loc[idx, 'Season']
                value = player_df.loc[idx, col]
                mean = mean_df.loc[mean_df['Season'] == season, col].iloc[0]
                std = std_df.loc[std_df['Season'] == season, col].iloc[0]
                # scale so that 10 is 1 std away
                z_scores.loc[idx, col] = 100 + ((value - mean) / std * 10)

    z_scores['Season'] = player_df['Season']
    
    # cols to numeric
    for col in z_scores.columns:
        if col not in ['Season', 'MLBAMID']:
            z_scores[col] = pd.to_numeric(z_scores[col])
    
    # reinsert player names
    z_scores.insert(0, 'Name', player_df['Name']) 

    return z_scores

mean for each year

In [None]:
# Create empty DataFrame to store results
mean_col_year = pd.DataFrame(index=df['Season'].unique())

# Loop through numeric columns
for col in df.select_dtypes(include=[np.number]).columns:
    if col != 'Season' and col != 'MLBAMID':  # skip meaningless numeric
        yearly_mean = df.groupby('Season')[col].apply(mean_year)
        mean_col_year[col] = yearly_mean
        
# clean up df to make it easier to work with
mean_col_year = mean_col_year.reset_index()
mean_col_year = mean_col_year.rename(columns={'index': 'Season'})
mean_col_year = mean_col_year.sort_values(by=['Season'])
mean_col_year.head()

Unnamed: 0,Season,PA,Age,BB%,K%,BABIP,wRC+,BsR,Off,WAR,O-Swing%,Z-Swing%,O-Contact%,Z-Contact%,Contact%,SwStr%,CSW%
0,2007,479.32366,29.131879,0.087048,0.165649,0.304928,100.059697,0.215508,3.847416,1.871003,0.237717,0.665527,0.596592,0.8861,0.813455,0.085515,0.253939
1,2009,475.489236,28.87133,0.090529,0.174445,0.301505,99.974603,0.142641,3.68209,1.853189,0.248842,0.664887,0.627009,0.882261,0.810436,0.085482,0.257907
2,2014,453.658304,28.465598,0.077702,0.198467,0.300486,99.992646,0.064191,3.242497,1.824187,0.307677,0.662543,0.663046,0.876915,0.798306,0.093996,0.267385
3,2019,447.323061,27.913035,0.086799,0.22362,0.299295,100.064046,0.093284,4.016924,1.793319,0.316885,0.691812,0.630996,0.853214,0.767166,0.110672,0.271976
4,2020,170.703366,28.026721,0.091721,0.234139,0.291201,100.136435,-0.00395,1.50307,0.695515,0.307861,0.680627,0.615423,0.844059,0.755157,0.113645,0.280891
5,2021,434.346692,28.345488,0.088237,0.225855,0.292583,100.027472,0.076247,3.768059,1.777268,0.314417,0.696508,0.628063,0.850632,0.765565,0.111828,0.273242
6,2018,455.833036,28.065419,0.086321,0.216814,0.297583,100.067295,0.142828,3.543007,1.790976,0.309668,0.679508,0.634281,0.859469,0.774731,0.106147,0.271946
7,2008,467.097081,28.828458,0.088692,0.169905,0.30198,99.982303,0.197216,3.719166,1.831372,0.249494,0.65964,0.624337,0.883456,0.813578,0.085154,0.253608
8,2024,448.020185,27.943522,0.081852,0.225702,0.290938,100.173534,0.053078,3.596287,1.787539,0.319775,0.696517,0.625439,0.859905,0.770404,0.110719,0.273644
9,2012,453.774083,28.526712,0.081452,0.191955,0.29889,100.08843,0.103176,3.615414,1.845739,0.302494,0.65305,0.675014,0.876266,0.802582,0.090579,0.265703


std for each year

In [None]:
# Create empty DataFrame to store results
std_col_year = pd.DataFrame(index=df['Season'].unique())

# Loop through numeric columns
for col in df.select_dtypes(include=[np.number]).columns:
    if col != 'Season' and col != 'MLBAMID':  # skip meaningless numeric
        yearly_std = df.groupby('Season')[col].apply(calculate_weighted_std)
        std_col_year[col] = yearly_std

# clean up df to make it easier to work with
std_col_year = std_col_year.reset_index()
std_col_year = std_col_year.rename(columns={'index': 'Season'})
std_col_year = std_col_year.sort_values(by=['Season'])
std_col_year.head()

Unnamed: 0,Season,PA,Age,BB%,K%,BABIP,wRC+,BsR,Off,WAR,O-Swing%,Z-Swing%,O-Contact%,Z-Contact%,Contact%,SwStr%,CSW%
0,2007,186.438339,4.367115,0.034388,0.05742,0.038677,28.886765,3.851518,17.061512,2.064698,0.059155,0.059302,0.109721,0.049635,0.062472,0.032731,0.027622
1,2009,180.331321,4.019151,0.032891,0.057032,0.038721,28.042407,3.573888,16.125988,2.058909,0.055755,0.059636,0.102275,0.04986,0.061134,0.03156,0.027306
2,2014,182.754092,3.916344,0.030344,0.06333,0.041191,29.311807,3.396804,14.349276,2.024452,0.056597,0.061227,0.095852,0.050305,0.063619,0.033885,0.029648
3,2019,180.535575,3.667468,0.032196,0.064471,0.042027,30.151685,2.354851,16.002425,1.982361,0.062161,0.060492,0.090361,0.04989,0.060312,0.033093,0.029731
4,2020,64.379969,3.654656,0.039277,0.071616,0.058569,37.79624,1.012108,7.408017,0.869445,0.065219,0.064878,0.100243,0.060459,0.068263,0.036236,0.033228
5,2021,180.319863,3.494339,0.031279,0.064601,0.042804,29.169001,2.178201,14.804431,1.867638,0.059613,0.057509,0.088726,0.053418,0.061547,0.033486,0.029649
6,2018,179.370982,3.717087,0.032228,0.064343,0.040083,28.638997,2.471588,15.268044,1.978929,0.061761,0.058405,0.088076,0.050496,0.060659,0.032994,0.029504
7,2008,186.544015,4.222349,0.03379,0.059741,0.03643,28.291714,3.717989,16.034558,2.136428,0.059225,0.057336,0.103243,0.049905,0.06136,0.031749,0.027017
8,2024,180.852846,3.629216,0.028644,0.062498,0.039092,29.46428,2.674628,15.637422,2.048161,0.057921,0.058322,0.093398,0.049193,0.060854,0.033774,0.029235
9,2012,188.300684,4.000032,0.029774,0.061718,0.041901,29.311607,3.461465,14.618813,2.045586,0.056109,0.05864,0.093778,0.049135,0.06251,0.032466,0.026644


z_scores

In [None]:
# z scores for each stat and player, applying function above
normalized_data = z_scores(df, mean_col_year, std_col_year)
normalized_data.head()

correlation with WRC+, help feature selection

In [None]:
corr = normalized_data.corr(numeric_only=True)
corr = corr['wRC+']
print(corr)

Season        0.005616
PA            0.528335
Age           0.034580
BB%           0.390001
K%           -0.310795
BABIP         0.671869
wRC+          1.000000
BsR           0.002394
Off           0.610290
WAR           0.612521
O-Swing%     -0.228714
Z-Swing%      0.003293
O-Contact%    0.124176
Z-Contact%    0.133424
Contact%      0.165336
SwStr%       -0.214951
CSW%         -0.288683
Name: wRC+, dtype: float64


#### NN

function to find the str nearest neighbor based on a found nn index

In [12]:
def df_apply(df, indices):
    num_samples = indices.shape[0]
    output_index = df.index
    rows_data = []
    indices_array = indices.astype(int)

    for i in range(num_samples):
        row_neighbors = {}
        neighbor_positional_indices = indices_array[i]

        for j, neighbor_pos_idx in enumerate(neighbor_positional_indices):
            neighbor_row = df.iloc[neighbor_pos_idx]
            name = neighbor_row['Name']
            season = str(neighbor_row['Season'])
            row_neighbors[f'n{j+1}'] = f"{name},{season}"

        rows_data.append(row_neighbors)

    return pd.DataFrame(rows_data, index=output_index)

diff between target player season plus 1, and closest neighbor season plus 1 WRC+

In [None]:
def single_n_diff(row):
    wrc_current = np.nan
    player_name = None

    try:
        player_info = row['n1'].split(',')
        player_name = player_info[0].strip()
        season = int(player_info[1])

        current_lookup = df[(df['Name'] == player_name) & (df['Season'] == season + 1)]['wRC+']

        if not current_lookup.empty:
            wrc_current = current_lookup.iloc[0]
        else:
            return np.nan
    except (IndexError, ValueError, AttributeError, KeyError):
         return np.nan

    wrc_target_val = np.nan
    for i in range(2, 6):
        neighbor_col = f'n{i}'
        if neighbor_col in row and pd.notna(row[neighbor_col]):
            try:
                neighbor_info = row[neighbor_col].split(',')
                neighbor_name = neighbor_info[0].strip()

                if neighbor_name != player_name:
                    neighbor_season = int(neighbor_info[1])

                    neighbor_lookup = df[(df['Name'] == neighbor_name) & (df['Season'] == neighbor_season + 1)]['wRC+']

                    if not neighbor_lookup.empty:
                        wrc_target_val = neighbor_lookup.iloc[0]
                        break
            except (IndexError, ValueError, AttributeError):
                 continue

    if pd.notna(wrc_current) and pd.notna(wrc_target_val):
        return abs(wrc_target_val - wrc_current)
    else:
        return np.nan

Implementing NN

<small> Dropped 2024 data so that you could calcuate its preformance on future years. Also removed players below 200 PA (85 is the normalized value) because I was having a ton of issues with small sample size players impacting preformance

In [43]:
X1 = normalized_data[normalized_data['Season'] < 2024] # filter for eval metric
X1 = normalized_data[normalized_data['PA'] > 85] # filter pa so we dont get small sample outliers, corresponds to about 200 PA
X1 = X1.reset_index(drop=True)
X1 = X1.drop(columns=['BsR', 'Z-Swing%', 'O-Contact%', 'Z-Contact%']) # filter uneed features, more varience
X = X1.drop(columns=['Name', 'Season']) # same as x1 without names
X = X.to_numpy()

In [None]:
# nearest neighbors and indices
nn = NearestNeighbors(n_neighbors=9).fit(X)
indices_test = nn.kneighbors(X, return_distance=False)

Nearest Neighbor for Every Player in df

In [45]:
kNear = pd.DataFrame(columns=['n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9'])
kNear = df_apply(df = X1, indices=indices_test)
for i in range(1, 7):  # for columns n1 through n6
    kNear[f'n{i}'] = kNear[f'n{i}'].astype(str)

Loss Value

In [None]:
# Calculate wrc_diff and handle missing values
kNear['wrc_diff'] = kNear.apply(single_n_diff, axis=1)

# Calculate sum of wrc_diff, ignoring NaN values
miss = kNear['wrc_diff'].sum(skipna=True)
print(miss)

Experiment with how to calc loss
<br>
<small> look at more then one neighbor? </small>

In [None]:
# same as the first loss function except now calculating 8 neighbors and weighted avg wrc+
def wrc_diff_weight(row):
    wrc_current = np.nan
    player_name = None

    try:
        player_info = row['n1'].split(',')
        player_name = player_info[0].strip()
        season = int(player_info[1])

        current_lookup = df[(df['Name'] == player_name) & (df['Season'] == season + 1)]['wRC+']

        if not current_lookup.empty:
            wrc_current = current_lookup.iloc[0]
        else:
            return np.nan
    except (IndexError, ValueError, AttributeError, KeyError):
         return np.nan

    wrc_target_val = []
    for i in range(2, 10):
        neighbor_col = f'n{i}'
        if neighbor_col in row and pd.notna(row[neighbor_col]):
            try:
                neighbor_info = row[neighbor_col].split(',')
                neighbor_name = neighbor_info[0].strip()

                if neighbor_name != player_name:
                    neighbor_season = int(neighbor_info[1])

                    neighbor_lookup = df[(df['Name'] == neighbor_name) & (df['Season'] == neighbor_season + 1)]['wRC+']

                    if not neighbor_lookup.empty:
                        wrc_target_val.append(neighbor_lookup.iloc[0]) # add neighbors

                    # if len of the list is greater then 2 break
                    if len(wrc_target_val) >= 7:
                        break
            except (IndexError, ValueError, AttributeError):
                continue

    if pd.notna(wrc_current) and len(wrc_target_val) > 0:
        weights = [0.25, 0.15, 0.125, 0.125, 0.125, 0.125, 0.1]
        avg_target = sum(w * x for w, x in zip(weights[:len(wrc_target_val)], wrc_target_val)) / sum(weights[:len(wrc_target_val)])
        return abs(avg_target - wrc_current)
    else:
        return np.nan


loss | diffrence in future year WRC+

In [None]:
kNear['wrc_diff'] = kNear.apply(wrc_diff_weight, axis=1)
miss = kNear['wrc_diff'].sum(skipna=True)
print(miss)

176289.87063116103


#### loss function 

<small> just adding all the previous steps together to make it easy to tune col weights

In [None]:
def loss_fn(weights, normalized_data):

    # begin eval function
    weighted_og = normalized_data[normalized_data['Season'] < 2024] # filter for eval metric
    weighted_og = weighted_og[weighted_og['PA'] > 85] # drop if below 200 PA, more stability, less outliers
    weighted_og = weighted_og.reset_index(drop=True)
    weighted_og = weighted_og.drop(columns=['BsR', 'Z-Swing%', 'O-Contact%', 'Z-Contact%'])
    weighted_x = weighted_og.drop(columns=['Name', 'Season'])
    

    # apply weights
    for col in weighted_x.columns:
        if col in weights:
            weighted_x[col] = weighted_x[col] * weights[col]

    # nearest neighbors
    nn_fn = NearestNeighbors(n_neighbors=6).fit(weighted_x)

    # distances and index
    indices_test_fn = nn_fn.kneighbors(weighted_x, return_distance=False)


    # add data using the mapped indices
    kNear_fn = df_apply(df = weighted_og, indices=indices_test_fn) # pass mapped to avoid out of bounds

    for i in range(1, 7):  # for columns n1 through n6
        kNear_fn[f'n{i}'] = kNear_fn[f'n{i}'].astype(str)

    # add miss
    kNear_fn['wrc_diff'] = kNear_fn.apply(wrc_diff_weight, axis=1) # using new wrc calc

    # drop inf or na
    kNear_fn = kNear_fn.dropna(subset=['wrc_diff'])
    kNear_fn['wrc_diff'] = kNear_fn['wrc_diff'].clip(upper=1000)

    # calc miss
    miss_fn = kNear_fn['wrc_diff'].sum(skipna=True)
    print(f"miss of {miss_fn}")
    return miss_fn


#### Manual Weight Tuning

<small> Ditched scipy largely because it was ineffective in finding the weights. Alot of this was likely because it was super slow as I coded everything in pandas and the it had to recompute the df constantly. This was widely inefficent. This is def my fault to a certian extent, however I can tune it just as effectively if not more (I wanted to do scipy just to learn it). Doing it manually also allows me not to recode everything. </small>

cols to opti and weights

In [None]:
cols_opti = X1.columns
weights = {}
for col in cols_opti:
    weights[col] = 1
print(weights)

{'Name': 0.038294635284743195, 'Season': 0.8394103347175448, 'PA': 0.7977140486020863, 'Age': 0.02763592751284738, 'BB%': 0.3368566824661502, 'K%': 0.5053868667855917, 'BABIP': 0.5846145267752457, 'wRC+': 0.6848649759815939, 'Off': 0.8573050272724841, 'WAR': 0.9242130062308272, 'O-Swing%': 0.9017380942301532, 'Contact%': 0.42469717539250784, 'SwStr%': 0.12269607189686782, 'CSW%': 0.25540433029288023}


manual weight tuining


In [None]:
weights = {}

loss function 
<br>
<small>(untuned loss of 176289.87063116103)

In [23]:
loss_value = loss_fn(weights = weights, normalized_data=normalized_data)

miss of 371208.98961617716
