### Create NN Model for 02-24 Data 

In [17]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
import numpy as np
import os

os.chdir('C:/Users/dalto/OneDrive/Pictures/Documents/Projects/MLB Season Model/')

In [18]:
data_02_24 = pd.read_csv('./data/cleaned/data_02_24.csv')
df = data_02_24.copy() # import data

In [19]:
df = df.drop(columns=['Unnamed: 0', 'Team'])

#### Normalize Data 

copied from 'exploration/nearest_neighbors.ipynb'

In [20]:
def mean_year(group):
        avg = np.average(group, weights=group.index.map(lambda x: df.loc[x, 'PA']))
        return avg

In [21]:
# weighted std to account for low PA outlier preformances
def calculate_weighted_std(group):
    avg = np.average(group, weights=group.index.map(lambda x: df.loc[x, 'PA']))
    variance = np.average((group - avg)**2, weights=group.index.map(lambda x: df.loc[x, 'PA']))
    return np.sqrt(variance)

mean for each year

In [22]:
# Create empty DataFrame to store results
mean_col_year = pd.DataFrame(index=df['Season'].unique())

# Loop through numeric columns
for col in df.select_dtypes(include=[np.number]).columns:
    if col != 'Season' and col != 'MLBAMID':  # skip meaningless numeric
        yearly_mean = df.groupby('Season')[col].apply(mean_year)
        mean_col_year[col] = yearly_mean

mean_col_year = mean_col_year.reset_index()
mean_col_year = mean_col_year.rename(columns={'index': 'Season'})

mean_col_year.sort_values(by=['Season'])
mean_col_year.head(50)

Unnamed: 0,Season,PA,Age,BB%,K%,BABIP,wRC+,BsR,Off,Def,WAR,O-Swing%,Z-Swing%,O-Contact%,Z-Contact%,Contact%,SwStr%,CSW%
0,2007,479.32366,29.131879,0.087048,0.165649,0.304928,100.059697,0.215508,3.847416,-0.866266,1.871003,0.237717,0.665527,0.596592,0.8861,0.813455,0.085515,0.253939
1,2009,475.489236,28.87133,0.090529,0.174445,0.301505,99.974603,0.142641,3.68209,-0.916076,1.853189,0.248842,0.664887,0.627009,0.882261,0.810436,0.085482,0.257907
2,2014,453.658304,28.465598,0.077702,0.198467,0.300486,99.992646,0.064191,3.242497,-0.62758,1.824187,0.307677,0.662543,0.663046,0.876915,0.798306,0.093996,0.267385
3,2019,447.323061,27.913035,0.086799,0.22362,0.299295,100.064046,0.093284,4.016924,-0.794203,1.793319,0.316885,0.691812,0.630996,0.853214,0.767166,0.110672,0.271976
4,2020,170.703366,28.026721,0.091721,0.234139,0.291201,100.136435,-0.00395,1.50307,-0.480182,0.695515,0.307861,0.680627,0.615423,0.844059,0.755157,0.113645,0.280891
5,2021,434.346692,28.345488,0.088237,0.225855,0.292583,100.027472,0.076247,3.768059,-0.780495,1.777268,0.314417,0.696508,0.628063,0.850632,0.765565,0.111828,0.273242
6,2018,455.833036,28.065419,0.086321,0.216814,0.297583,100.067295,0.142828,3.543007,-0.908827,1.790976,0.309668,0.679508,0.634281,0.859469,0.774731,0.106147,0.271946
7,2008,467.097081,28.828458,0.088692,0.169905,0.30198,99.982303,0.197216,3.719166,-0.901239,1.831372,0.249494,0.65964,0.624337,0.883456,0.813578,0.085154,0.253608
8,2024,448.020185,27.943522,0.081852,0.225702,0.290938,100.173534,0.053078,3.596287,-1.214286,1.787539,0.319775,0.696517,0.625439,0.859905,0.770404,0.110719,0.273644
9,2012,453.774083,28.526712,0.081452,0.191955,0.29889,100.08843,0.103176,3.615414,-0.550295,1.845739,0.302494,0.65305,0.675014,0.876266,0.802582,0.090579,0.265703


std for each year

In [None]:
# Create empty DataFrame to store results
std_col_year = pd.DataFrame(index=df['Season'].unique())

# Loop through numeric columns
for col in df.select_dtypes(include=[np.number]).columns:
    if col != 'Season' and col != 'MLBAMID':  # skip meaningless numeric
        yearly_std = df.groupby('Season')[col].apply(calculate_weighted_std)
        std_col_year[col] = yearly_std

std_col_year = std_col_year.reset_index()
std_col_year = std_col_year.rename(columns={'index': 'Season'})

std_col_year.sort_values(by=['Season'])
std_col_year.head(45)

z_scores

In [None]:
# z scores for each stat and player, also creating a new df flow, add back names based on MLBAMID later
z_scores = pd.DataFrame(index=df.index, columns=df.select_dtypes(include=[np.number]).columns)

# scaled for each col
for col in df.select_dtypes(include=[np.number]).columns:
    if col not in ['Season', 'MLBAMID']:  # not needed
        for idx in df.index:
            season = df.loc[idx, 'Season']
            value = df.loc[idx, col]
            mean = mean_col_year.loc[mean_col_year['Season'] == season, col].iloc[0]
            std = std_col_year.loc[std_col_year['Season'] == season, col].iloc[0]
            # scale so that 10 is 1 std away
            z_scores.loc[idx, col] = 100 + ((value - mean) / std * 10)

z_scores['Season'] = df['Season']

In [None]:
for col in z_scores.columns:
    if col not in ['Season', 'MLBAMID']:
        z_scores[col] = pd.to_numeric(z_scores[col])

reinsert names

In [None]:
normalized_data = z_scores.copy()
normalized_data.insert(0, 'Name', df['Name']) 
normalized_data.head(10)