### Create NN Model for 02-24 Data 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
import numpy as np
from numpy.linalg import pinv # for mah distance 
import os

os.chdir('C:/Users/dalto/OneDrive/Pictures/Documents/Projects/MLB Season Model/')

In [2]:
data_02_24 = pd.read_csv('./data/cleaned/data_02_24.csv')
df = data_02_24.copy() # import data

In [3]:
df = df.drop(columns=['Unnamed: 0', 'Team', 'Def'])

#### Normalize Data 

copied from 'exploration/nearest_neighbors.ipynb'

In [4]:
def mean_year(group):
        avg = np.average(group, weights=group.index.map(lambda x: df.loc[x, 'PA']))
        return avg

In [5]:
# weighted std to account for low PA outlier preformances
def calculate_weighted_std(group):
    avg = np.average(group, weights=group.index.map(lambda x: df.loc[x, 'PA']))
    variance = np.average((group - avg)**2, weights=group.index.map(lambda x: df.loc[x, 'PA']))
    return np.sqrt(variance)

mean for each year

In [6]:
# Create empty DataFrame to store results
mean_col_year = pd.DataFrame(index=df['Season'].unique())

# Loop through numeric columns
for col in df.select_dtypes(include=[np.number]).columns:
    if col != 'Season' and col != 'MLBAMID':  # skip meaningless numeric
        yearly_mean = df.groupby('Season')[col].apply(mean_year)
        mean_col_year[col] = yearly_mean

mean_col_year = mean_col_year.reset_index()
mean_col_year = mean_col_year.rename(columns={'index': 'Season'})

mean_col_year.sort_values(by=['Season'])
mean_col_year.head(50)

Unnamed: 0,Season,PA,Age,BB%,K%,BABIP,wRC+,BsR,Off,WAR,O-Swing%,Z-Swing%,O-Contact%,Z-Contact%,Contact%,SwStr%,CSW%
0,2007,479.32366,29.131879,0.087048,0.165649,0.304928,100.059697,0.215508,3.847416,1.871003,0.237717,0.665527,0.596592,0.8861,0.813455,0.085515,0.253939
1,2009,475.489236,28.87133,0.090529,0.174445,0.301505,99.974603,0.142641,3.68209,1.853189,0.248842,0.664887,0.627009,0.882261,0.810436,0.085482,0.257907
2,2014,453.658304,28.465598,0.077702,0.198467,0.300486,99.992646,0.064191,3.242497,1.824187,0.307677,0.662543,0.663046,0.876915,0.798306,0.093996,0.267385
3,2019,447.323061,27.913035,0.086799,0.22362,0.299295,100.064046,0.093284,4.016924,1.793319,0.316885,0.691812,0.630996,0.853214,0.767166,0.110672,0.271976
4,2020,170.703366,28.026721,0.091721,0.234139,0.291201,100.136435,-0.00395,1.50307,0.695515,0.307861,0.680627,0.615423,0.844059,0.755157,0.113645,0.280891
5,2021,434.346692,28.345488,0.088237,0.225855,0.292583,100.027472,0.076247,3.768059,1.777268,0.314417,0.696508,0.628063,0.850632,0.765565,0.111828,0.273242
6,2018,455.833036,28.065419,0.086321,0.216814,0.297583,100.067295,0.142828,3.543007,1.790976,0.309668,0.679508,0.634281,0.859469,0.774731,0.106147,0.271946
7,2008,467.097081,28.828458,0.088692,0.169905,0.30198,99.982303,0.197216,3.719166,1.831372,0.249494,0.65964,0.624337,0.883456,0.813578,0.085154,0.253608
8,2024,448.020185,27.943522,0.081852,0.225702,0.290938,100.173534,0.053078,3.596287,1.787539,0.319775,0.696517,0.625439,0.859905,0.770404,0.110719,0.273644
9,2012,453.774083,28.526712,0.081452,0.191955,0.29889,100.08843,0.103176,3.615414,1.845739,0.302494,0.65305,0.675014,0.876266,0.802582,0.090579,0.265703


std for each year

In [7]:
# Create empty DataFrame to store results
std_col_year = pd.DataFrame(index=df['Season'].unique())

# Loop through numeric columns
for col in df.select_dtypes(include=[np.number]).columns:
    if col != 'Season' and col != 'MLBAMID':  # skip meaningless numeric
        yearly_std = df.groupby('Season')[col].apply(calculate_weighted_std)
        std_col_year[col] = yearly_std

std_col_year = std_col_year.reset_index()
std_col_year = std_col_year.rename(columns={'index': 'Season'})

std_col_year.sort_values(by=['Season'])
std_col_year.head(45)

Unnamed: 0,Season,PA,Age,BB%,K%,BABIP,wRC+,BsR,Off,WAR,O-Swing%,Z-Swing%,O-Contact%,Z-Contact%,Contact%,SwStr%,CSW%
0,2007,186.438339,4.367115,0.034388,0.05742,0.038677,28.886765,3.851518,17.061512,2.064698,0.059155,0.059302,0.109721,0.049635,0.062472,0.032731,0.027622
1,2009,180.331321,4.019151,0.032891,0.057032,0.038721,28.042407,3.573888,16.125988,2.058909,0.055755,0.059636,0.102275,0.04986,0.061134,0.03156,0.027306
2,2014,182.754092,3.916344,0.030344,0.06333,0.041191,29.311807,3.396804,14.349276,2.024452,0.056597,0.061227,0.095852,0.050305,0.063619,0.033885,0.029648
3,2019,180.535575,3.667468,0.032196,0.064471,0.042027,30.151685,2.354851,16.002425,1.982361,0.062161,0.060492,0.090361,0.04989,0.060312,0.033093,0.029731
4,2020,64.379969,3.654656,0.039277,0.071616,0.058569,37.79624,1.012108,7.408017,0.869445,0.065219,0.064878,0.100243,0.060459,0.068263,0.036236,0.033228
5,2021,180.319863,3.494339,0.031279,0.064601,0.042804,29.169001,2.178201,14.804431,1.867638,0.059613,0.057509,0.088726,0.053418,0.061547,0.033486,0.029649
6,2018,179.370982,3.717087,0.032228,0.064343,0.040083,28.638997,2.471588,15.268044,1.978929,0.061761,0.058405,0.088076,0.050496,0.060659,0.032994,0.029504
7,2008,186.544015,4.222349,0.03379,0.059741,0.03643,28.291714,3.717989,16.034558,2.136428,0.059225,0.057336,0.103243,0.049905,0.06136,0.031749,0.027017
8,2024,180.852846,3.629216,0.028644,0.062498,0.039092,29.46428,2.674628,15.637422,2.048161,0.057921,0.058322,0.093398,0.049193,0.060854,0.033774,0.029235
9,2012,188.300684,4.000032,0.029774,0.061718,0.041901,29.311607,3.461465,14.618813,2.045586,0.056109,0.05864,0.093778,0.049135,0.06251,0.032466,0.026644


z_scores

In [8]:
# z scores for each stat and player, also creating a new df flow, add back names based on MLBAMID later
z_scores = pd.DataFrame(index=df.index, columns=df.select_dtypes(include=[np.number]).columns)

# scaled for each col
for col in df.select_dtypes(include=[np.number]).columns:
    if col not in ['Season', 'MLBAMID']:  # not needed
        for idx in df.index:
            season = df.loc[idx, 'Season']
            value = df.loc[idx, col]
            mean = mean_col_year.loc[mean_col_year['Season'] == season, col].iloc[0]
            std = std_col_year.loc[std_col_year['Season'] == season, col].iloc[0]
            # scale so that 10 is 1 std away
            z_scores.loc[idx, col] = 100 + ((value - mean) / std * 10)

z_scores['Season'] = df['Season']

In [9]:
# cols to numeric
for col in z_scores.columns:
    if col not in ['Season', 'MLBAMID']:
        z_scores[col] = pd.to_numeric(z_scores[col])

reinsert names

In [10]:
normalized_data = z_scores.copy()
normalized_data.insert(0, 'Name', df['Name']) 
normalized_data.head(10)

Unnamed: 0,Name,Season,PA,Age,BB%,K%,BABIP,wRC+,BsR,Off,WAR,O-Swing%,Z-Swing%,O-Contact%,Z-Contact%,Contact%,SwStr%,CSW%
0,brent clevlen,2007,74.82687,85.958971,74.686219,193.060161,107.344122,43.282571,98.65317,96.363387,89.901996,138.703183,88.950287,71.666459,22.211544,37.187201,169.893575,204.59646
1,brett hayes,2009,74.297907,90.367791,72.475999,127.859623,108.219882,117.931506,99.866315,98.235778,91.58048,170.668602,80.735141,60.421763,105.381033,65.57805,147.469352,199.428878
2,carlos peguero,2014,75.723755,96.257739,107.348483,147.612705,148.436984,92.738037,100.377615,97.711317,91.256604,122.457976,73.452584,41.259125,91.336666,33.462464,156.57838,183.278087
3,seby zavala,2019,75.887131,92.057094,73.040518,181.646207,108.098997,44.146682,99.594613,95.874994,90.245625,129.458257,111.272068,55.70808,34.476706,40.158323,179.134016,179.9933
4,franklin barreto,2020,77.834198,88.981943,76.647649,142.109822,78.737137,59.956124,101.610132,90.946521,86.747334,120.942395,87.572409,55.233155,84.442687,56.517822,144.969583,165.94178
5,domingo santana,2014,76.161502,80.937328,92.701724,191.474452,27.049843,35.300547,100.388715,95.262033,88.346507,104.533169,117.006084,37.346862,46.681858,34.97386,163.064364,174.135337
6,khalil lee,2021,76.910658,84.702434,71.790101,176.836023,78.370523,45.201241,99.649956,94.981084,88.131524,103.173196,80.320382,53.36458,65.56127,54.53041,135.519274,172.156207
7,gift ngoepe,2018,75.646393,99.824005,89.546771,164.461808,67.339068,43.622346,99.348982,95.266649,89.33869,97.997082,71.4602,48.628111,78.321189,62.685967,123.659525,168.376763
8,eliezer alfonzo,2008,75.550163,100.406271,73.75179,132.428726,56.320994,41.119256,98.386518,95.994778,89.755634,150.740042,83.057646,74.748966,56.559446,48.8952,160.671646,180.916565
9,scott kingery,2021,76.966115,96.149521,71.790101,162.804735,65.020621,38.454702,99.634244,94.519822,88.636617,120.64705,120.572055,66.782039,34.360819,43.626577,166.147981,165.237711


#### NN

In [11]:
X = normalized_data[normalized_data['Season'] < 2024] # filter for eval metric
X = normalized_data.drop(columns=['Name', 'Season'])

metrics needed for mahalanobis distance. I was having problems with ecluedian and apperantly this handles covarience between features much better, testing various metrics

In [12]:
cov_matrix = np.cov(X, rowvar=False)
inv_cov_matrix = pinv(cov_matrix)

In [13]:
nn = NearestNeighbors(n_neighbors=6, metric='mahalanobis', n_jobs=-1, metric_params={'VI': inv_cov_matrix})

In [14]:
nn.fit(X)

apply to every player

In [15]:
kNear = pd.DataFrame(columns=['n1', 'n2', 'n3', 'n4', 'n5', 'n6'])

In [16]:
def df_apply(indices):
    rows = []
    indices_array = indices.astype(int)
    
    # Get the actual index values from normalized_data
    original_indices = normalized_data.index.values[:len(indices)]
    
    for i, idx in enumerate(original_indices):
        row_data = {}
        neighbor_indices = indices_array[i]
        
        # Get names and seasons combined for neighbors
        for j, neighbor_idx in enumerate(neighbor_indices):
            try:
                name = normalized_data.iloc[neighbor_idx]['Name']
                season = str(normalized_data.iloc[neighbor_idx]['Season'])
                row_data[f'n{j+1}'] = f"{name},{season}"
            except:
                row_data[f'n{j+1}'] = "NA,0"
        rows.append(row_data)
    
    return pd.DataFrame(rows, index=original_indices)

In [17]:
indices_test = nn.kneighbors(X, return_distance=False)
kNear = df_apply(indices=indices_test)
kNear.head()

Unnamed: 0,n1,n2,n3,n4,n5,n6
0,"brent clevlen,2007","michael taylor,2012","kameron misner,2024","scott kingery,2021","victor diaz,2006","rocky gale,2019"
1,"brett hayes,2009","darren baker,2024","denis phipps,2012","jesus montero,2014","justin maxwell,2007","oswald peraza,2024"
2,"carlos peguero,2014","abiatal avelino,2018","darnell mcdonald,2007","guillermo quiroz,2007","jorge polanco,2015","levi jordan,2024"
3,"seby zavala,2019","victor diaz,2006","alex guerrero,2014","colin porter,2003","kory dehaan,2002","andrew brown,2011"
4,"franklin barreto,2020","jared oliva,2020","adalberto mondesi,2017","joe mccarthy,2020","cristian pache,2021","osvaldo martinez,2011"


wrc diffrence, not the best measure ever but it was the best choice with the options had

In [18]:
def calculate_wrc_diff(row):
    # Split the current player's info
    try:
        player_name, season = row['n1'].split(',')
        player_name = player_name.strip()
        season = int(season)
    except Exception as e:
        print(f"{e} at {row}")

    # index player
    try:
        wrc_current= df[(df['Name'] == player_name) & (df['Season'] == season + 1)]['wRC+'].values[0] # get WAR for the player following season
    except:
        return np.nan

    # war for nearest player not named the same
    wrc_target = []
    for i in range(2, 11):
        neighbor_col = f'n{i}'
        if neighbor_col in row:
            neighbor_name, neighbor_season = row[neighbor_col].split(',')
            neighbor_name = neighbor_name.strip()
            neighbor_season = int(neighbor_season)
                
            if neighbor_name != player_name:
                try:
                    wrc_value = df[(df['Name'] == neighbor_name) & (df['Season'] == neighbor_season + 1)]['wRC+'].values[0]
                    wrc_target.append(wrc_value)
                    break
                except:
                    continue
    
    if len(wrc_target) == 0:
        return np.nan

    return abs(wrc_target[0] - wrc_current)

In [19]:
for i in range(1, 7):  # for columns n1 through n6
    kNear[f'n{i}'] = kNear[f'n{i}'].astype(str)

kNear = kNear[kNear['n1'].notna()] # drops nas

In [20]:
# Calculate wrc_diff and handle missing values
kNear['wrc_diff'] = kNear.apply(calculate_wrc_diff, axis=1)

# Calculate sum of wrc_diff, ignoring NaN values
miss = kNear['wrc_diff'].sum(skipna=True)
print(miss)

373504.52639654034


loss function, previous steps combined

In [21]:
def loss_fn(weights):
    normalized_data = z_scores.copy()
    normalized_data.insert(0, 'Name', df['Name'])
    normalized_data = normalized_data[normalized_data['Season'] < 2024]
    X = normalized_data.drop(columns=['Name', 'Season'])

    # Apply weights to each column
    for col in X.columns:
        if col in weights.index:  # Only apply weight if column exists in weights
            X[col] = X[col] * weights.loc[col, 'weight']

    # conv matrix and fit model
    cov_matrix = np.cov(X, rowvar=False)
    inv_cov_matrix = pinv(cov_matrix)
    nn = NearestNeighbors(n_neighbors=6, metric='mahalanobis', n_jobs=-1, metric_params={'VI': inv_cov_matrix})
    nn.fit(X)

    # distances and index 
    indices = nn.kneighbors(X, return_distance=False)

    # drop last one
    indices = indices[:-1]

    # knear
    kNear = pd.DataFrame(columns=['n1', 'n2', 'n3', 'n4', 'n5', 'n6'])

    # add data using the mapped indices
    kNear = df_apply(indices=indices) # pass mapped to avoid out of bounds

    for i in range(1, 7):  # for columns n1 through n6
        kNear[f'n{i}'] = kNear[f'n{i}'].astype(str)

    # add miss
    kNear['wrc_diff'] = kNear.apply(calculate_wrc_diff, axis=1)

    # calc miss
    miss = kNear['wrc_diff'].sum(skipna=True)
    print(f"round complete, miss of {miss}")
    return miss

    


cols to opti and weights

In [22]:
cols_opti = normalized_data.drop(columns=['Name', 'Season']).columns.tolist()
weights_df = pd.DataFrame({'weight': np.random.normal(0, 3, len(cols_opti))}, index=cols_opti)
print(len(cols_opti)) # for population size

16


check if weights inti and func works as expected

In [23]:
weights_df_numpy = weights_df.to_numpy().flatten() 
loss_value = loss_fn(weights_df)
print(f"Current loss value: {loss_value}")

round complete, miss of 407356.61439028766
Current loss value: 407356.61439028766


scipy optimize weights of col

In [24]:
from scipy.optimize import differential_evolution

# loss wrapped for function
def loss_fn_wrapper(x):
    # Update weights_df with new weights from optimizer
    weights_df_temp = pd.DataFrame({'weight': x}, index=cols_opti)
    return loss_fn(weights_df_temp)

# Set bounds for weights (0 to 3 for each weight)
bounds = [(0, 3)] * len(cols_opti)

In [None]:
# Optimize | changed instead of a bfhs alg bc of the complex space of bounds
opt = differential_evolution(func=loss_fn_wrapper, bounds=bounds, maxiter=10, popsize=5, mutation=(0.5, 1.7), disp = True, workers=-1)
weights_df['weight'] = opt.x

In [None]:
print(opt.x)
print(weights_df)