In [None]:
%matplotlib inline

import os
import pdb
import argparse
import pickle as pkl
from pathlib import Path

from collections import defaultdict

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error


from six.moves.urllib.request import urlretrieve
import tarfile
import pickle
import sys

import pandas as pd
import seaborn as sns

# Notebook-wide pandas display settings: never truncate columns, cap printed rows at 100
pd.options.display.max_columns = None
pd.options.display.max_rows = 100



In [None]:
######################################################################
# Set path to the folder where your data is stored
######################################################################
# Colab-only setup: this cell requires the google.colab package and will not run elsewhere.
from google.colab import drive
# Mount Google Drive under /content/drive (no forced remount if it is already mounted).
drive.mount('/content/drive', force_remount = False)
# Make the project folder the working directory so the relative 'Data/...' paths used later resolve.
os.chdir('/content/drive/MyDrive/Neural Nets Spring 2025 Project')
# os.chdir('/content/sample_data')
# Sanity checks: only the last expression (the directory listing) is displayed by the notebook.
os.getcwd()
os.listdir()

# Prepare Folds of Data for Training-Validation

In [None]:
### Load the cached datasets and feature tables (feather files under Data/)
### Order of the paths matches the order of the names on the left-hand side
cleandata, gameavgstats, sumstats, teamstats, otherstats = map(
    pd.read_feather,
    (
        'Data/cleandata.feather',
        'Data/gameavgstats.feather',
        'Data/sumstats.feather',
        'Data/teamstats.feather',
        'Data/otherstats.feather',
    ),
)

In [None]:
####Function for returning training and validation data tensors for a fold (window) of the dataset based on seasons
####Can specify the number of seasons to use for validation and how many days after the fold's first game to skip (buffer)
####Combines all long-format features, measures feature coverage for each player-game and removes those at or below a provided threshold
####Imputes missing feature data by forward filling, then fills still-missing FT features with 0s
####Standardizes numeric features and returns training and validation tensors

feature_list = []

def get_data_fold(fold_start, fold_end, validation_years = 1, train_start_buffer = 60,  feature_cover_cutoff = 0.6, feature_list = None, testset = False):
  """Build standardized feature and target tensors for one fold of seasons.

  Reads the module-level frames `cleandata`, `otherstats`, `gameavgstats`,
  `sumstats` and `teamstats`.

  Args:
    fold_start: First season (inclusive) of the fold.
    fold_end: Last season (inclusive) of the fold.
    validation_years: Number of trailing seasons held out for validation.
    train_start_buffer: Days after the fold's first game date before rows are kept.
    feature_cover_cutoff: Player-games whose share of non-missing long-format
      features is not strictly above this value are removed.
    feature_list: Optional list of feature names. When given, both the
      long-format features (matched on `StatName`) and the `otherstats`
      columns are restricted to it. `None` keeps everything.
    testset: When True, every remaining row is returned as both the training
      and the validation set (features and targets alike).

  Returns:
    Tuple `(train_features, valid_features, train_target, valid_target, feature_cols)`
    where the first four are float32 tensors and `feature_cols` is the column
    index of the feature matrix (binary columns first, then scaled numeric ones).

  Raises:
    ValueError: If no player games fall inside the requested seasons.
    AssertionError: If a (PlayerId, Date) pair occurs more than once after merging.
  """

  validation_split = fold_end - validation_years + 1

  print("STARTING DATA PREP FOR FOLD")
  #Get data for seasons in fold
  alldf = cleandata.loc[(cleandata.Season >= fold_start) & (cleandata.Season <= fold_end), ['PlayerId','Playerteam', 'Oppteam', 'Date', 'Season', 'FTSYPTS']]
  if alldf.empty:
    raise ValueError('No player games found for seasons ' + str(fold_start) + ' to ' + str(fold_end))

  first_game_date = alldf.Date.min()
  last_game_date = alldf.Date.max()

  train_start_date = first_game_date + pd.Timedelta(days = train_start_buffer)

  #Fix: the original printed the bound method `last_game_date.date` instead of the date itself
  print('First and Last Game Dates in Fold: ' + str(first_game_date) + ' ' + str(last_game_date))
  print('Start Date after Buffer: ' + str(train_start_date))
  print('Validation Seasons: ' + str(alldf[alldf.Season >= validation_split].Season.unique()))

  #Merge in other stats
  #Fix: the original overwrote the filtered selection unconditionally, so feature_list never applied here
  if feature_list is not None:
    otherlist = [col for col in feature_list if col in otherstats.columns and col not in ('Date', 'PlayerId')]
    otherfeatures = otherstats[['Date', 'PlayerId'] + otherlist]
  else:
    otherfeatures = otherstats

  alldf = alldf.merge(otherfeatures, on = ['Date', 'PlayerId'], how = 'left')

  print('Number of Player Games: ' + str(len(alldf)))

  #Cut off at training start date
  alldf = alldf[alldf.Date >= train_start_date]
  print('Number of Player Games After Buffer: ' + str(len(alldf)))

  #Combine all Input Features (long format: one row per Date / PlayerId / StatName)
  features = pd.concat([gameavgstats[(gameavgstats.Date >= first_game_date) & (gameavgstats.Date <= last_game_date)],
                        sumstats[(sumstats.Date >= first_game_date) & (sumstats.Date <= last_game_date)],
                        teamstats[(teamstats.Date >= first_game_date) & (teamstats.Date <= last_game_date)]])

  #Select features if provided list (copy so the column assignments below never hit a view)
  if feature_list is not None:
    features = features[features.StatName.isin(feature_list)].copy()

  #Number of Features for a Player Game
  features['StatCount'] = features.groupby(['Date', 'PlayerId'])['StatName'].transform('size')
  features['NonNACount'] = features.groupby(['Date', 'PlayerId'])['Data'].transform('count')
  features['Coverage'] = features['NonNACount'] / features['StatCount']

  #Remove rows with not enough feature coverage
  features = features[features.Coverage > feature_cover_cutoff].copy()

  #Impute missing feature data first using last known value (per player and stat, in date order)
  features['Data_Imp'] = features['Data']
  features = features.sort_values('Date')
  features['Data_Imp'] = features.groupby(['PlayerId', 'StatName'])['Data_Imp'].ffill()

  #Impute Missing FT Data with 0
  features.loc[features.StatName.str.contains("FT") & (features.Data_Imp.isna()), 'Data_Imp'] = 0

  #Pivot Features into Wide
  features = features.pivot(index = ['Date', 'PlayerId'], columns = 'StatName', values = 'Data_Imp').reset_index()

  #Merge features back to player data
  alldf = alldf.merge(features, on = ['Date', 'PlayerId'], how = 'left')

  #Drop rows that still contains NAs
  alldf = alldf.dropna()
  print('Number of Player Games After Dropping NAs: ' + str(len(alldf)))

  #Confirm no duplicated games
  assert not alldf.duplicated(subset = ['PlayerId', 'Date'], keep = False).any(), 'Duplicated (PlayerId, Date) rows after merging features'

  #Split into train and validation sets
  #Testing set: both train and validation are the full set of rows
  if testset:
    train_rows, valid_rows = alldf, alldf
  else:
    train_rows, valid_rows = alldf.loc[alldf.Season < validation_split], alldf.loc[alldf.Season >= validation_split]

  #Fix: targets are taken from the same row selections as the features, so their lengths
  #always agree (the original always split the targets by season, even when testset was True)
  train_target = torch.tensor(train_rows['FTSYPTS'].to_numpy(), dtype = torch.float32)
  valid_target = torch.tensor(valid_rows['FTSYPTS'].to_numpy(), dtype = torch.float32)

  ##Drop Not Needed Columns
  id_cols = ['PlayerId', 'Playerteam', 'Oppteam', 'Date', 'Season', 'FTSYPTS']
  train_df = train_rows.drop(columns = id_cols)
  valid_df = valid_rows.drop(columns = id_cols)

  #Split Binary and Numeric Columns (binary columns may be absent when feature_list excludes them)
  binary_cols = [col for col in ['HomeGame', 'C', 'C-F', 'F', 'F-C', 'F-G', 'G', 'G-F'] if col in train_df.columns]
  numeric_cols = [col for col in train_df.columns if col not in binary_cols]

  train_binary = train_df[binary_cols]
  train_numeric = train_df[numeric_cols]
  valid_binary = valid_df[binary_cols]
  valid_numeric = valid_df[numeric_cols]

  ##Standardize numerical features (scaler is fit on the training rows only)
  scaler = StandardScaler().fit(train_numeric)
  train_numeric = pd.DataFrame(scaler.transform(train_numeric), columns = numeric_cols)
  valid_numeric = pd.DataFrame(scaler.transform(valid_numeric), columns = numeric_cols)

  #Recombine binary and standardized numeric features (indices reset so rows align positionally)
  train_df = pd.concat([train_binary.reset_index(drop = True), train_numeric], axis = 1)
  valid_df = pd.concat([valid_binary.reset_index(drop = True), valid_numeric], axis = 1)

  feature_cols = train_df.columns

  train_df = torch.tensor(train_df.astype(float).to_numpy(), dtype = torch.float32)
  valid_df = torch.tensor(valid_df.astype(float).to_numpy(), dtype = torch.float32)

  print('Train Shape', train_df.shape, 'Validation Shape', valid_df.shape, 'Train Target Shape', train_target.shape, 'Valid Target Shape', valid_target.shape)

  return(train_df, valid_df, train_target, valid_target, feature_cols)

In [None]:
# Add Player, Team, and Opp Team indices for embeddings to data
def get_data_embed_fold(fold_start, fold_end, validation_years = 1, train_start_buffer = 60,  feature_cover_cutoff = 0.6, feature_list = None, testset = False):
  """Build feature, target and embedding-index tensors for one fold of seasons.

  Reads the module-level frames `cleandata`, `otherstats`, `gameavgstats`,
  `sumstats` and `teamstats`. Player, team and opponent values are mapped to
  consecutive integer indices (in order of first appearance within the fold)
  before the train/validation split, so both splits share one index space.

  Args:
    fold_start: First season (inclusive) of the fold.
    fold_end: Last season (inclusive) of the fold.
    validation_years: Number of trailing seasons held out for validation.
    train_start_buffer: Days after the fold's first game date before rows are kept.
    feature_cover_cutoff: Player-games whose share of non-missing long-format
      features is not strictly above this value are removed.
    feature_list: Optional list of feature names. When given, both the
      long-format features (matched on `StatName`) and the `otherstats`
      columns are restricted to it. `None` keeps everything.
    testset: When True, every remaining row is returned as both the training
      and the validation set. (The original accepted this flag but ignored it.)

  Returns:
    Tuple `(train_features, valid_features, train_target, valid_target,
    train_player_index, valid_player_index, train_team_index, valid_team_index,
    train_opp_index, valid_opp_index)`. Feature and target tensors are float32;
    index tensors are long with shape (rows, 1).

  Raises:
    ValueError: If no player games fall inside the requested seasons.
    AssertionError: If a (PlayerId, Date) pair occurs more than once after merging.
  """

  validation_split = fold_end - validation_years + 1

  print("STARTING DATA PREP FOR FOLD")
  #Get data for seasons in fold
  alldf = cleandata.loc[(cleandata.Season >= fold_start) & (cleandata.Season <= fold_end), ['PlayerId','Playerteam', 'Oppteam', 'Date', 'Season', 'FTSYPTS']]
  if alldf.empty:
    raise ValueError('No player games found for seasons ' + str(fold_start) + ' to ' + str(fold_end))

  first_game_date = alldf.Date.min()
  last_game_date = alldf.Date.max()

  train_start_date = first_game_date + pd.Timedelta(days = train_start_buffer)

  #Fix: the original printed the bound method `last_game_date.date` instead of the date itself
  print('First and Last Game Dates in Fold: ' + str(first_game_date) + ' ' + str(last_game_date))
  print('Start Date after Buffer: ' + str(train_start_date))
  print('Validation Seasons: ' + str(alldf[alldf.Season >= validation_split].Season.unique()))

  #Merge in other stats
  #Fix: the original overwrote the filtered selection unconditionally, so feature_list never applied here
  if feature_list is not None:
    otherlist = [col for col in feature_list if col in otherstats.columns and col not in ('Date', 'PlayerId')]
    otherfeatures = otherstats[['Date', 'PlayerId'] + otherlist]
  else:
    otherfeatures = otherstats

  alldf = alldf.merge(otherfeatures, on = ['Date', 'PlayerId'], how = 'left')

  print('Number of Player Games: ' + str(len(alldf)))

  #Cut off at training start date
  alldf = alldf[alldf.Date >= train_start_date]
  print('Number of Player Games After Buffer: ' + str(len(alldf)))

  #Combine all Input Features (long format: one row per Date / PlayerId / StatName)
  features = pd.concat([gameavgstats[(gameavgstats.Date >= first_game_date) & (gameavgstats.Date <= last_game_date)],
                        sumstats[(sumstats.Date >= first_game_date) & (sumstats.Date <= last_game_date)],
                        teamstats[(teamstats.Date >= first_game_date) & (teamstats.Date <= last_game_date)]])

  #Select features if provided list (copy so the column assignments below never hit a view)
  if feature_list is not None:
    features = features[features.StatName.isin(feature_list)].copy()

  #Number of Features for a Player Game
  features['StatCount'] = features.groupby(['Date', 'PlayerId'])['StatName'].transform('size')
  features['NonNACount'] = features.groupby(['Date', 'PlayerId'])['Data'].transform('count')
  features['Coverage'] = features['NonNACount'] / features['StatCount']

  #Remove rows with not enough feature coverage
  features = features[features.Coverage > feature_cover_cutoff].copy()

  #Impute missing feature data first using last known value (per player and stat, in date order)
  features['Data_Imp'] = features['Data']
  features = features.sort_values('Date')
  features['Data_Imp'] = features.groupby(['PlayerId', 'StatName'])['Data_Imp'].ffill()

  #Impute Missing FT Data with 0
  features.loc[features.StatName.str.contains("FT") & (features.Data_Imp.isna()), 'Data_Imp'] = 0

  #Pivot Features into Wide
  features = features.pivot(index = ['Date', 'PlayerId'], columns = 'StatName', values = 'Data_Imp').reset_index()

  #Merge features back to player data
  alldf = alldf.merge(features, on = ['Date', 'PlayerId'], how = 'left')

  #Drop rows that still contains NAs
  alldf = alldf.dropna()
  print('Number of Player Games After Dropping NAs: ' + str(len(alldf)))

  #Confirm no duplicated games
  assert not alldf.duplicated(subset = ['PlayerId', 'Date'], keep = False).any(), 'Duplicated (PlayerId, Date) rows after merging features'

  #Create Player, Team, and Opp Indices for embeddings (index = order of first appearance)
  playerid_index = {player: i for i, player in enumerate(alldf.PlayerId.unique())}
  alldf['PlayerId_Index'] = alldf.PlayerId.map(playerid_index)

  team_index = {team: i for i, team in enumerate(alldf.Playerteam.unique())}
  alldf['Team_Index'] = alldf.Playerteam.map(team_index)

  #Opponents get their own mapping, independent of the player-team mapping
  opp_index = {team: i for i, team in enumerate(alldf.Oppteam.unique())}
  alldf['Opp_Index'] = alldf.Oppteam.map(opp_index)

  #Split into train and validation sets
  #Testing set: both train and validation are the full set of rows
  if testset:
    train_rows, valid_rows = alldf, alldf
  else:
    train_rows, valid_rows = alldf.loc[alldf.Season < validation_split], alldf.loc[alldf.Season >= validation_split]

  #Separate embedding columns (double brackets keep a trailing dimension of size 1)
  train_player_index = torch.tensor(train_rows[['PlayerId_Index']].to_numpy(), dtype = torch.long)
  train_team_index = torch.tensor(train_rows[['Team_Index']].to_numpy(), dtype = torch.long)
  train_opp_index = torch.tensor(train_rows[['Opp_Index']].to_numpy(), dtype = torch.long)

  valid_player_index = torch.tensor(valid_rows[['PlayerId_Index']].to_numpy(), dtype = torch.long)
  valid_team_index = torch.tensor(valid_rows[['Team_Index']].to_numpy(), dtype = torch.long)
  valid_opp_index = torch.tensor(valid_rows[['Opp_Index']].to_numpy(), dtype = torch.long)

  #Get target variable tensors from the same row selections as the features
  train_target = torch.tensor(train_rows['FTSYPTS'].to_numpy(), dtype = torch.float32)
  valid_target = torch.tensor(valid_rows['FTSYPTS'].to_numpy(), dtype = torch.float32)

  #Drop Not Needed Columns
  id_cols = ['PlayerId', 'Playerteam', 'Oppteam', 'Date', 'Season', 'FTSYPTS', 'PlayerId_Index', 'Team_Index', 'Opp_Index']
  train_df = train_rows.drop(columns = id_cols)
  valid_df = valid_rows.drop(columns = id_cols)

  #Split Binary and Numeric Columns (binary columns may be absent when feature_list excludes them)
  binary_cols = [col for col in ['HomeGame', 'C', 'C-F', 'F', 'F-C', 'F-G', 'G', 'G-F'] if col in train_df.columns]
  numeric_cols = [col for col in train_df.columns if col not in binary_cols]

  train_binary = train_df[binary_cols]
  train_numeric = train_df[numeric_cols]
  valid_binary = valid_df[binary_cols]
  valid_numeric = valid_df[numeric_cols]

  #Standardize numerical features (scaler is fit on the training rows only)
  scaler = StandardScaler().fit(train_numeric)
  train_numeric = pd.DataFrame(scaler.transform(train_numeric), columns = numeric_cols)
  valid_numeric = pd.DataFrame(scaler.transform(valid_numeric), columns = numeric_cols)

  #Recombine binary and standardized numeric features (indices reset so rows align positionally)
  train_df = pd.concat([train_binary.reset_index(drop = True), train_numeric], axis = 1)
  valid_df = pd.concat([valid_binary.reset_index(drop = True), valid_numeric], axis = 1)

  train_df = torch.tensor(train_df.astype(float).to_numpy(), dtype = torch.float32)
  valid_df = torch.tensor(valid_df.astype(float).to_numpy(), dtype = torch.float32)

  print('Train Shape', train_df.shape, 'Validation Shape', valid_df.shape, 'Train Target Shape', train_target.shape, 'Valid Target Shape', valid_target.shape)
  print('Player Index Shape', train_player_index.shape, 'Team Index Shape', train_team_index.shape, 'Opp Index Shape', train_opp_index.shape)

  return(train_df, valid_df, train_target, valid_target, train_player_index, valid_player_index, train_team_index, valid_team_index, train_opp_index, valid_opp_index)

In [None]:
#Prepares data for LSTM models
def get_data_fold_lstm(fold_start, fold_end, validation_years = 1, train_start_buffer = 60,  feature_cover_cutoff = 0.6, feature_list = None):
  """Build standardized feature and target tensors for one fold, ordered by player and date.

  Reads the module-level frames `cleandata`, `otherstats`, `gameavgstats`,
  `sumstats` and `teamstats`. Unlike the non-LSTM variant, the rows are sorted
  by (`PlayerId`, `Date`) before splitting, so each player's games are
  contiguous and chronological in the returned tensors.

  Args:
    fold_start: First season (inclusive) of the fold.
    fold_end: Last season (inclusive) of the fold.
    validation_years: Number of trailing seasons held out for validation.
    train_start_buffer: Days after the fold's first game date before rows are kept.
    feature_cover_cutoff: Player-games whose share of non-missing long-format
      features is not strictly above this value are removed.
    feature_list: Optional list of feature names. When given, both the
      long-format features (matched on `StatName`) and the `otherstats`
      columns are restricted to it. `None` keeps everything.

  Returns:
    Tuple `(train_features, valid_features, train_target, valid_target)` of
    float32 tensors.

  Raises:
    ValueError: If no player games fall inside the requested seasons.
    AssertionError: If a (PlayerId, Date) pair occurs more than once after merging.
  """

  validation_split = fold_end - validation_years + 1

  print("STARTING DATA PREP FOR FOLD")
  #Get data for seasons in fold
  alldf = cleandata.loc[(cleandata.Season >= fold_start) & (cleandata.Season <= fold_end), ['PlayerId','Playerteam', 'Oppteam', 'Date', 'Season', 'FTSYPTS']]
  if alldf.empty:
    raise ValueError('No player games found for seasons ' + str(fold_start) + ' to ' + str(fold_end))

  first_game_date = alldf.Date.min()
  last_game_date = alldf.Date.max()

  train_start_date = first_game_date + pd.Timedelta(days = train_start_buffer)

  #Fix: the original printed the bound method `last_game_date.date` instead of the date itself
  print('First and Last Game Dates in Fold: ' + str(first_game_date) + ' ' + str(last_game_date))
  print('Start Date after Buffer: ' + str(train_start_date))
  print('Validation Seasons: ' + str(alldf[alldf.Season >= validation_split].Season.unique()))

  #Merge in other stats
  #Fix: the original overwrote the filtered selection unconditionally, so feature_list never applied here
  if feature_list is not None:
    otherlist = [col for col in feature_list if col in otherstats.columns and col not in ('Date', 'PlayerId')]
    otherfeatures = otherstats[['Date', 'PlayerId'] + otherlist]
  else:
    otherfeatures = otherstats

  alldf = alldf.merge(otherfeatures, on = ['Date', 'PlayerId'], how = 'left')

  print('Number of Player Games: ' + str(len(alldf)))

  #Cut off at training start date
  alldf = alldf[alldf.Date >= train_start_date]
  print('Number of Player Games After Buffer: ' + str(len(alldf)))

  #Combine all Input Features (long format: one row per Date / PlayerId / StatName)
  features = pd.concat([gameavgstats[(gameavgstats.Date >= first_game_date) & (gameavgstats.Date <= last_game_date)],
                        sumstats[(sumstats.Date >= first_game_date) & (sumstats.Date <= last_game_date)],
                        teamstats[(teamstats.Date >= first_game_date) & (teamstats.Date <= last_game_date)]])

  #Select features if provided list (copy so the column assignments below never hit a view)
  if feature_list is not None:
    features = features[features.StatName.isin(feature_list)].copy()

  #Number of Features for a Player Game
  features['StatCount'] = features.groupby(['Date', 'PlayerId'])['StatName'].transform('size')
  features['NonNACount'] = features.groupby(['Date', 'PlayerId'])['Data'].transform('count')
  features['Coverage'] = features['NonNACount'] / features['StatCount']

  #Remove rows with not enough feature coverage
  features = features[features.Coverage > feature_cover_cutoff].copy()

  #Impute missing feature data first using last known value (per player and stat, in date order)
  features['Data_Imp'] = features['Data']
  features = features.sort_values('Date')
  features['Data_Imp'] = features.groupby(['PlayerId', 'StatName'])['Data_Imp'].ffill()

  #Impute Missing FT Data with 0
  features.loc[features.StatName.str.contains("FT") & (features.Data_Imp.isna()), 'Data_Imp'] = 0

  #Pivot Features into Wide
  features = features.pivot(index = ['Date', 'PlayerId'], columns = 'StatName', values = 'Data_Imp').reset_index()

  #Merge features back to player data
  alldf = alldf.merge(features, on = ['Date', 'PlayerId'], how = 'left')

  #Drop rows that still contains NAs, then order each player's games chronologically
  alldf = alldf.dropna()
  alldf = alldf.sort_values(['PlayerId','Date']).reset_index(drop=True)
  print('Number of Player Games After Dropping NAs: ' + str(len(alldf)))

  #Confirm no duplicated games
  assert not alldf.duplicated(subset = ['PlayerId', 'Date'], keep = False).any(), 'Duplicated (PlayerId, Date) rows after merging features'

  #Split into train and validation sets
  train_rows, valid_rows = alldf.loc[alldf.Season < validation_split], alldf.loc[alldf.Season >= validation_split]

  #Get target variable tensors from the same row selections as the features
  train_target = torch.tensor(train_rows['FTSYPTS'].to_numpy(), dtype = torch.float32)
  valid_target = torch.tensor(valid_rows['FTSYPTS'].to_numpy(), dtype = torch.float32)

  ##Drop Not Needed Columns
  id_cols = ['PlayerId', 'Playerteam', 'Oppteam', 'Date', 'Season', 'FTSYPTS']
  train_df = train_rows.drop(columns = id_cols)
  valid_df = valid_rows.drop(columns = id_cols)

  #Split Binary and Numeric Columns (binary columns may be absent when feature_list excludes them)
  binary_cols = [col for col in ['HomeGame', 'C', 'C-F', 'F', 'F-C', 'F-G', 'G', 'G-F'] if col in train_df.columns]
  numeric_cols = [col for col in train_df.columns if col not in binary_cols]

  train_binary = train_df[binary_cols]
  train_numeric = train_df[numeric_cols]
  valid_binary = valid_df[binary_cols]
  valid_numeric = valid_df[numeric_cols]

  ##Standardize numerical features (scaler is fit on the training rows only)
  scaler = StandardScaler().fit(train_numeric)
  train_numeric = pd.DataFrame(scaler.transform(train_numeric), columns = numeric_cols)
  valid_numeric = pd.DataFrame(scaler.transform(valid_numeric), columns = numeric_cols)

  #Recombine binary and standardized numeric features (indices reset so rows align positionally)
  train_df = pd.concat([train_binary.reset_index(drop = True), train_numeric], axis = 1)
  valid_df = pd.concat([valid_binary.reset_index(drop = True), valid_numeric], axis = 1)

  train_df = train_df.astype(float)
  valid_df = valid_df.astype(float)

  #Preview only the first rows instead of rendering the whole frames
  display(train_df.head())
  display(valid_df.head())

  train_df = torch.tensor(train_df.to_numpy(), dtype = torch.float32)
  valid_df = torch.tensor(valid_df.to_numpy(), dtype = torch.float32)

  print('Train Shape', train_df.shape, 'Validation Shape', valid_df.shape, 'Train Target Shape', train_target.shape, 'Valid Target Shape', valid_target.shape)

  return(train_df, valid_df, train_target, valid_target)

## Create Data Loaders

In [None]:
###Builds one (train, validation) DataLoader pair per fold of seasons
def folds_data_loader(folds = [(1992, 2000), (1998, 2006), (2003, 2011), (2008, 2016)], valid_years = 2, train_buffer = 60,
                      batch_size = 64, shuffle = True, feature_list = None, testset = True):
  """Wrap each fold's tensors from `get_data_fold` in TensorDatasets and DataLoaders.

  Args:
    folds: Sequence of (first season, last season) pairs.
    valid_years: Passed to `get_data_fold` as `validation_years`.
    train_buffer: Passed to `get_data_fold` as `train_start_buffer`.
    batch_size: Batch size for both loaders.
    shuffle: Whether the training loader shuffles; the validation loader never does.
    feature_list: Passed through to `get_data_fold`.
    testset: Accepted but NOT forwarded to `get_data_fold`; it has no effect here.

  Returns:
    List with one tuple per fold:
    `(first season, last season, train_loader, valid_loader, feature_cols)`.
  """

  def build_entry(season_range):
    first_season, last_season = season_range[0], season_range[1]
    print("Fold Start", first_season, "Fold End Season", last_season)

    fold_data = get_data_fold(fold_start = first_season, fold_end = last_season, feature_list = feature_list,
                              validation_years = valid_years, train_start_buffer = train_buffer,
                              feature_cover_cutoff = 0.6)
    x_train, x_valid, y_train, y_valid, columns = fold_data

    #Training loader uses two worker processes; validation loader keeps the default and a fixed order
    loader_train = DataLoader(TensorDataset(x_train, y_train), batch_size = batch_size, shuffle = shuffle, num_workers=2)
    loader_valid = DataLoader(TensorDataset(x_valid, y_valid), batch_size = batch_size, shuffle = False)

    return (first_season, last_season, loader_train, loader_valid, columns)

  return [build_entry(season_range) for season_range in folds]


In [None]:
# dataloaders_test = folds_data_loader(folds = [(2004, 2019)], valid_years=3, train_buffer = 0,
#                                        batch_size = 64, shuffle = True, feature_list = None)

In [None]:
###Creates list of data loaders with TensorDatasets wrapped around the training and validation data across folds, includes embeddings
def folds_data_embed_loader(folds = [(1992, 2000), (1998, 2006), (2003, 2011), (2008, 2016)], batch_size = 64, shuffle = True, feature_list = None,
                            valid_years = 2, train_buffer = 60):
  """Build one (train, validation) DataLoader pair per fold, including embedding indices.

  Each dataset yields `(features, target, player_index, team_index, opp_index)`,
  built from the tensors returned by `get_data_embed_fold`.

  Args:
    folds: Sequence of (first season, last season) pairs.
    batch_size: Batch size for both loaders.
    shuffle: Whether the training loader shuffles; the validation loader never does.
    feature_list: Passed through to `get_data_embed_fold`.
    valid_years: Passed to `get_data_embed_fold` as `validation_years`. Defaults
      to 2, the value that was previously hard-coded.
    train_buffer: Passed to `get_data_embed_fold` as `train_start_buffer`.
      Defaults to 60, the value that was previously hard-coded.

  Returns:
    List with one tuple per fold: `(first season, last season, train_loader, valid_loader)`.
  """

  df_list = []

  for fold in folds:
    fold_start, fold_end = fold[0], fold[1]
    print("Fold Start", fold_start, "Fold End Season", fold_end)

    (train_df, valid_df, train_target, valid_target,
     train_player_index, valid_player_index,
     train_team_index, valid_team_index,
     train_opp_index, valid_opp_index) = get_data_embed_fold(fold_start = fold_start, fold_end = fold_end, feature_list = feature_list,
                                                             validation_years = valid_years, train_start_buffer = train_buffer,
                                                             feature_cover_cutoff = 0.6)

    train_dataset = TensorDataset(train_df, train_target, train_player_index, train_team_index, train_opp_index)
    valid_dataset = TensorDataset(valid_df, valid_target, valid_player_index, valid_team_index, valid_opp_index)
    train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle = shuffle, num_workers=2)
    valid_loader = DataLoader(valid_dataset, batch_size = batch_size, shuffle = False)

    df_list.append((fold_start, fold_end, train_loader, valid_loader))

  return(df_list)


In [None]:
###Builds one (train, validation) DataLoader pair per fold of seasons for the lstm model
def folds_data_loader_lstm(folds = [(1992, 2000), (1998, 2006), (2003, 2011), (2008, 2016)], valid_years = 2, train_buffer = 60,
                      batch_size = 64, shuffle = True, feature_list = None, testset = True):
  """Wrap each fold's tensors from `get_data_fold_lstm` in TensorDatasets and DataLoaders.

  Args:
    folds: Sequence of (first season, last season) pairs.
    valid_years: Passed to `get_data_fold_lstm` as `validation_years`.
    train_buffer: Passed to `get_data_fold_lstm` as `train_start_buffer`.
    batch_size: Batch size for both loaders.
    shuffle: Whether the training loader shuffles; the validation loader never does.
    feature_list: Passed through to `get_data_fold_lstm`.
    testset: Accepted but NOT forwarded to `get_data_fold_lstm`; it has no effect here.

  Returns:
    List with one tuple per fold: `(first season, last season, train_loader, valid_loader)`.
  """

  def build_entry(season_range):
    first_season, last_season = season_range[0], season_range[1]
    print("Fold Start", first_season, "Fold End Season", last_season)

    x_train, x_valid, y_train, y_valid = get_data_fold_lstm(fold_start = first_season, fold_end = last_season,
                                                            feature_list = feature_list,
                                                            validation_years = valid_years,
                                                            train_start_buffer = train_buffer,
                                                            feature_cover_cutoff = 0.6)

    #Training loader uses two worker processes; validation loader keeps the default and a fixed order
    loader_train = DataLoader(TensorDataset(x_train, y_train), batch_size = batch_size, shuffle = shuffle, num_workers=2)
    loader_valid = DataLoader(TensorDataset(x_valid, y_valid), batch_size = batch_size, shuffle = False)

    return (first_season, last_season, loader_train, loader_valid)

  return [build_entry(season_range) for season_range in folds]

#Single fold over seasons 2004-2019 with the last 3 seasons as validation; order preserved (no shuffling)
dataloaders_lstm = folds_data_loader_lstm(
    folds = [(2004, 2019)],
    valid_years = 3,
    train_buffer = 0,
    batch_size = 32,
    shuffle = False,
    feature_list = None,
    testset = False,
)

In [None]:
#Save the list of fold dfs.
# with open('Data/full_fold_2008_test.pkl', 'wb') as f:
#     pickle.dump(dataloaders_test, f)


In [None]:
# Load list of fold dfs
#full_fold_dfs_embeds
#full_fold_dfs_shuffle_parallel
#full_fold_2008_test.pkl

# with open('Data/full_fold_2008_test.pkl', 'rb') as f:
#     # dataloaders_embed = pkl.load(f)
#     dataloaders = pkl.load(f)



# Training and Validation

## Standard training for fully connected nn

In [None]:
####Training over the folds of data given the list of data loaders
def train_model_folds(dataloaderlist, nepochs=3, lr=0.005, patience = 5):
  """Train a fresh `fc_nn` model on every fold and collect the loss curves.

  Args:
    dataloaderlist: Iterable of fold entries whose first four items are
      `(fold_start, fold_end, train_loader, valid_loader)`. Both loaders must
      wrap a TensorDataset yielding `(features, target)` with 2-D features.
    nepochs: Maximum number of epochs per fold.
    lr: Learning rate for the Adam optimizer.
    patience: Training of a fold stops once the validation loss has failed to
      improve by more than 1e-5 for this many consecutive epochs.

  Returns:
    DataFrame with one row per completed epoch per fold and the columns
    `Epoch`, `Train`, `Valid` (mean L1 loss per sample) and `Fold`
    (the fold's start season). Empty when no folds are given.
  """

  results = []

  #Use GPU if available
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  for dl in dataloaderlist:

    #get train and validation data loaders
    fold_start, fold_end = dl[0], dl[1]
    train_loader, valid_loader = dl[2], dl[3]

    #Get training sample size and feature dimension, and validation sample size
    n, d = train_loader.dataset.tensors[0].size()
    n_valid = valid_loader.dataset.tensors[0].size(0)

    #Batch Size
    batch_size = train_loader.batch_size

    print('NEW FOLD:', n, d, batch_size, fold_start, fold_end)

    #Initiate the FCNN model
    model = fc_nn(input_dim = d).to(device)

    #Get the loss
    loss_func = nn.L1Loss()

    #Specify the optimizer
    optimizer = optim.Adam(model.parameters(), lr=lr)

    train_losses, valid_losses = [], []

    #Early stopping setup
    valid_loss_best = float('inf')
    epochs_noimprovement = 0

    #Training loop over epochs
    for epoch in range(nepochs):

      print('Epoch', epoch)
      model.train()
      train_loss = 0

      #Train over a batch
      for x_batch, y_batch in train_loader:

        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        #Forward pass of model
        optimizer.zero_grad()
        output = model(x_batch).squeeze(dim = 1)

        loss = loss_func(output, y_batch)

        #backward pass of model, backprop
        loss.backward()
        optimizer.step()

        #Accumulate training loss (batch mean times batch size = batch sum)
        train_loss += loss.item() * x_batch.size(0)

      #Compute average training loss per sample
      train_loss /= n
      train_losses.append(train_loss)

      #Run Validation
      model.eval()
      valid_loss = 0.0

      with torch.no_grad():
        for x_batch, y_batch in valid_loader:

          x_batch = x_batch.to(device)
          y_batch = y_batch.to(device)

          #Run forward pass of model
          output = model(x_batch).squeeze(dim = 1)

          #Compute Loss and sum
          loss = loss_func(output, y_batch)

          valid_loss += loss.item() * x_batch.size(0)

      #Compute average loss per sample
      valid_loss /= n_valid
      valid_losses.append(valid_loss)

      #Track the best validation loss; an improvement must exceed 1e-5 to reset the counter
      if valid_loss < valid_loss_best - 1e-5:
        valid_loss_best = valid_loss
        epochs_noimprovement = 0
      else:
        epochs_noimprovement += 1

      #Display losses (printed before the early-stop check so the final epoch is shown too)
      print(f"Epoch {epoch+1:>2}/{nepochs}: Train Loss = {train_loss:.4f}, Val Loss = {valid_loss:.4f}, Best Val Loss = {valid_loss_best:.4f}")

      #Early Stopping, if validation loss does not improve in #patience of epochs, stop training.
      if epochs_noimprovement >= patience:
        print('Early Stopping due to No Improvement')
        break

    #Record the loss curve once per fold, after training ends. The original did this inside
    #the epoch loop, which duplicated earlier epochs and dropped the early-stopped epoch.
    loss_curve = pd.DataFrame({'Train':train_losses, 'Valid':valid_losses}).reset_index(names = 'Epoch')
    loss_curve['Fold'] = fold_start
    results.append(loss_curve)

  #pd.concat raises on an empty list, so return an empty frame with the usual columns instead
  if not results:
    return pd.DataFrame(columns = ['Epoch', 'Train', 'Valid', 'Fold'])

  return pd.concat(results)


## With Embeddings

In [None]:
####Training over the folds of data given the list of data loaders, including embeddings
def train_model_embed_folds(dataloaderlist, nepochs=3, lr=0.005, patience = 5):
  """Train a fresh `fc_nn_embed` model on every fold and collect the loss curves.

  Args:
    dataloaderlist: Iterable of fold entries whose first four items are
      `(fold_start, fold_end, train_loader, valid_loader)`. Both loaders must
      wrap a TensorDataset yielding
      `(features, target, player_ids, team_ids, opp_ids)` with 2-D features.
    nepochs: Maximum number of epochs per fold.
    lr: Learning rate for the Adam optimizer.
    patience: Training of a fold stops once the validation loss has failed to
      improve by more than 1e-5 for this many consecutive epochs.

  Returns:
    DataFrame with one row per completed epoch per fold and the columns
    `Epoch`, `Train`, `Valid` (mean L1 loss per sample) and `Fold`
    (the fold's start season). Empty when no folds are given.
  """

  results = []

  #Training is pinned to the CPU, as in the original, where a CUDA-if-available selection
  #was immediately overridden by this assignment.
  #NOTE(review): confirm whether the CPU pin is intentional or a debugging leftover.
  device = torch.device('cpu')

  for dl in dataloaderlist:

    #get train and validation data loaders
    fold_start, fold_end = dl[0], dl[1]
    train_loader, valid_loader = dl[2], dl[3]

    train_tensors = train_loader.dataset.tensors
    valid_tensors = valid_loader.dataset.tensors

    #Get training sample size and feature dimension, and validation sample size
    n, d = train_tensors[0].size()
    n_valid = valid_tensors[0].size(0)

    #Batch Size
    batch_size = train_loader.batch_size

    #Embedding table size = largest index seen in either split, plus one (as a plain int)
    def num_embeddings(position):
      ids = torch.cat([train_tensors[position].reshape(-1), valid_tensors[position].reshape(-1)])
      return int(ids.max()) + 1

    numplayers = num_embeddings(2)
    numteams = num_embeddings(3)
    numopps = num_embeddings(4)

    print('NEW FOLD:', 'Num Samples', n, 'Input Dim', d, 'Batch Size', batch_size, 'Fold Seasons', fold_start, fold_end)
    print('Players', numplayers, 'Teams', numteams, 'Opps', numopps)

    #Initiate a FCNN model with embeddings
    model = fc_nn_embed(input_dim = d, num_players = numplayers, num_teams = numteams, num_opps = numopps).to(device)

    #Use L1 Loss
    loss_func = nn.L1Loss()

    #Specify the optimizer
    optimizer = optim.Adam(model.parameters(), lr=lr)

    train_losses, valid_losses = [], []

    #Early stopping setup
    valid_loss_best = float('inf')
    epochs_noimprovement = 0

    #Training loop over epochs
    for epoch in range(nepochs):

      print('Epoch', epoch)
      model.train()
      train_loss = 0

      #Take a batch of features, targets, and ids for embeddings
      for x_batch, y_batch, playerids, teamids, oppids in train_loader:

        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        playerids = playerids.to(device)
        teamids = teamids.to(device)
        oppids = oppids.to(device)

        #Forward pass of model
        optimizer.zero_grad()
        output = model(x_batch, player_ids = playerids, team_ids = teamids, opp_ids = oppids).squeeze(dim = 1)

        loss = loss_func(output, y_batch)

        #backward pass of model
        loss.backward()
        optimizer.step()

        #Accumulate training loss (batch mean times batch size = batch sum)
        train_loss += loss.item() * x_batch.size(0)

      #Compute average training loss per sample
      train_loss /= n
      train_losses.append(train_loss)

      #Run Validation
      model.eval()
      valid_loss = 0.0

      with torch.no_grad():
        for x_batch, y_batch, playerids, teamids, oppids in valid_loader:

          x_batch = x_batch.to(device)
          y_batch = y_batch.to(device)
          playerids = playerids.to(device)
          teamids = teamids.to(device)
          oppids = oppids.to(device)

          #Run forward pass of model
          output = model(x_batch, player_ids = playerids, team_ids = teamids, opp_ids = oppids).squeeze(dim = 1)

          #Compute Loss and sum
          loss = loss_func(output, y_batch)

          valid_loss += loss.item() * x_batch.size(0)

      #Compute average loss per sample
      valid_loss /= n_valid
      valid_losses.append(valid_loss)

      #Track the best validation loss; an improvement must exceed 1e-5 to reset the counter
      if valid_loss < valid_loss_best - 1e-5:
        valid_loss_best = valid_loss
        epochs_noimprovement = 0
      else:
        epochs_noimprovement += 1

      #Display losses (printed before the early-stop check so the final epoch is shown too)
      print(f"Epoch {epoch+1:>2}/{nepochs}: Train Loss = {train_loss:.4f}, Val Loss = {valid_loss:.4f}, Best Val Loss = {valid_loss_best:.4f}")

      #Early Stopping
      if epochs_noimprovement >= patience:
        print('Early Stopping due to No Improvement')
        break

    #Record the loss curve once per fold, after training ends. The original did this inside
    #the epoch loop, which duplicated earlier epochs and dropped the early-stopped epoch.
    loss_curve = pd.DataFrame({'Train':train_losses, 'Valid':valid_losses}).reset_index(names = 'Epoch')
    loss_curve['Fold'] = fold_start
    results.append(loss_curve)

  #pd.concat raises on an empty list, so return an empty frame with the usual columns instead
  if not results:
    return pd.DataFrame(columns = ['Epoch', 'Train', 'Valid', 'Fold'])

  return pd.concat(results)


## LSTM Training

In [None]:
####Training over folds of the dataset for the LSTM
def train_model_lstm_folds(dataloaderlist, nepochs=3, lr=0.005, patience = 5):
  """Train and validate a fresh ``lstm_nn`` on each fold and return the loss curves.

  Args:
    dataloaderlist: iterable of fold entries indexed as ``[0]`` fold start,
      ``[1]`` fold end, ``[2]`` training DataLoader, ``[3]`` validation
      DataLoader. Each loader's ``dataset.tensors[0]`` must be the 2-D feature
      matrix, and each loader must yield ``(x_batch, y_batch)`` pairs.
    nepochs: maximum number of epochs to run per fold.
    lr: initial learning rate for Adam.
    patience: number of consecutive epochs without a validation improvement
      larger than 1e-5 after which the fold stops early.

  Returns:
    DataFrame with columns ``Epoch`` (0-based), ``Train``, ``Valid`` and
    ``Fold`` (the fold start), holding one row per completed epoch per fold.
    An empty frame with those columns is returned when no fold was trained.
  """

  #Use GPU if available (does not change between folds)
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  results = []

  for dl in dataloaderlist:

    #get train and validation data loaders
    fold_start, fold_end = dl[0], dl[1]
    train_loader, valid_loader = dl[2], dl[3]

    #Get training sample sizes and feature dimension
    n, d = train_loader.dataset.tensors[0].size()

    #Get validation sample size
    n_valid = valid_loader.dataset.tensors[0].size(0)

    #Batch Size
    batch_size = train_loader.batch_size

    print('NEW FOLD:', n, d, batch_size, fold_start, fold_end)

    #Get the model (re-initialised for every fold)
    model = lstm_nn(input_dim = d).to(device)

    #Get the loss
    loss_func = nn.L1Loss()

    #Specify the optimizer and halve the learning rate when validation loss plateaus
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay = 1e-5)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode='min',
                                                           factor=0.5,
                                                           patience=3)

    train_losses, valid_losses = [], []

    #Early stopping setup
    valid_loss_best = float('inf')
    epochs_noimprovement = 0

    #Training loop over epochs
    for epoch in range(nepochs):

      print('Epoch', epoch)
      model.train()
      train_loss = 0.0

      for x_batch, y_batch in train_loader:

        #Add a sequence axis of length 1: (batch, d) -> (batch, 1, d)
        x_batch = x_batch.to(device).unsqueeze(1)
        #assumes y_batch is (batch,) so it lines up with the model output - TODO confirm
        y_batch = y_batch.to(device)

        #Forward pass of model
        optimizer.zero_grad()
        output = model(x_batch)

        loss = loss_func(output, y_batch)

        #backward pass of model
        loss.backward()
        optimizer.step()

        #Accumulate training loss (batch mean * batch size = batch sum)
        train_loss += loss.item() * x_batch.size(0)

      #Compute average training loss per sample
      train_loss /= n
      train_losses.append(train_loss)

      #Run Validation
      model.eval()
      valid_loss = 0.0

      with torch.no_grad():
        for x_batch, y_batch in valid_loader:

          x_batch = x_batch.to(device).unsqueeze(1)
          y_batch = y_batch.to(device)

          #Run forward pass of model
          output = model(x_batch)

          #Compute Loss and sum
          loss = loss_func(output, y_batch)

          valid_loss += loss.item() * x_batch.size(0)

      #Compute average loss per sample
      valid_loss /= n_valid
      valid_losses.append(valid_loss)

      scheduler.step(valid_loss)

      #Early Stopping bookkeeping
      if valid_loss < valid_loss_best - 1e-5:
        valid_loss_best = valid_loss
        epochs_noimprovement = 0
      else:
        epochs_noimprovement += 1

      #Display losses (done before the early-stop check so the last epoch is reported too)
      print(f"Epoch {epoch+1:>2}/{nepochs}: Train Loss = {train_loss:.4f}, Val Loss = {valid_loss:.4f}, Best Val Loss = {valid_loss_best:.4f}")

      if epochs_noimprovement >= patience:
        print('Early Stopping due to No Improvement')
        break

    #Record the loss curve ONCE per fold, after the last epoch. Appending inside the
    #epoch loop duplicated every earlier epoch and dropped the early-stopped epoch.
    loss_curve = pd.DataFrame({'Epoch': range(len(train_losses)),
                               'Train': train_losses,
                               'Valid': valid_losses})
    loss_curve['Fold'] = fold_start

    results.append(loss_curve)

  #pd.concat raises on an empty list, so return an empty frame with the expected columns
  if not results:
    return pd.DataFrame(columns = ['Epoch', 'Train', 'Valid', 'Fold'])

  results = pd.concat(results, ignore_index = True)

  return(results)


## Regression Baseline Models

In [None]:
#Use the dataloaders to build Lasso linear regression models over folds for baseline comparison
#Each fold entry is indexed as: [0] fold start, [1] fold end, [2] train loader,
#[3] validation loader, [4] labels used for the coefficient table (presumably the
#feature names - verify against the code that builds the dataloaders)

dataloaders_tmp = dataloaders_test
for i in range(len(dataloaders_tmp)):

  #Get train and validation data loaders
  fold_start, fold_end = dataloaders_tmp[i][0], dataloaders_tmp[i][1]
  train_loader, valid_loader = dataloaders_tmp[i][2], dataloaders_tmp[i][3]

  #Get training sample sizes and feature dimension
  n, d = train_loader.dataset.tensors[0].size()

  print("Fold:", fold_start, fold_end)

  ##Extract training and validation features and targets as numpy arrays
  ##(targets are flattened to 1-D, which is what the sklearn estimator is fitted on)
  X_train = train_loader.dataset.tensors[0].cpu().numpy()
  y_train = train_loader.dataset.tensors[1].cpu().numpy().flatten()

  X_val = valid_loader.dataset.tensors[0].cpu().numpy()
  y_val = valid_loader.dataset.tensors[1].cpu().numpy().flatten()

  print(X_train.shape)

  # Fit the L1-regularised linear regression model
  lr_model = Lasso(alpha=0.1)
  lr_model.fit(X_train, y_train)

  # Show how many non zero coefficients there are
  non_zero_count = np.sum(lr_model.coef_ != 0)
  print(non_zero_count)

  # Predict on validation and training sets
  y_pred = lr_model.predict(X_val)
  y_train_pred = lr_model.predict(X_train)

  #Compute MAE
  train_mae = mean_absolute_error(y_train, y_train_pred)
  valid_mae = mean_absolute_error(y_val, y_pred)

  print('Train:', train_mae, 'Valid:', valid_mae)

  #Build the coefficient table for THIS fold. The labels are taken from the same
  #fold entry as the fitted weights (previously always fold 0), so they stay
  #aligned even if the feature set differs between folds.
  #After the loop, coef_df holds the table of the last fold only.
  coef_df = pd.DataFrame({
      'Feature': dataloaders_tmp[i][4],
      'Weight': lr_model.coef_})



# Design Neural Nets

In [None]:
#Fully Connected NN Models
class fc_nn(nn.Module):
    """Feed-forward regressor: input_dim -> 128 -> 64 -> 1, with a ReLU after every linear layer."""

    def __init__(self, input_dim):
        super().__init__()

        # Layers are listed in execution order; their position in the list is
        # the index used in the state_dict keys (net.0, net.2, net.4).
        stack = [
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            # NOTE(review): this final ReLU clamps predictions to >= 0. Presumably
            # intentional for a non-negative target - confirm, since the other
            # models in this notebook end on a plain linear layer.
            nn.ReLU(),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Run the stacked layers on ``x`` and return one value per row."""
        prediction = self.net(x)
        return prediction

## FCNN with Embedding

In [None]:
#Fully connected model with player and team embeddings
class fc_nn_embed(nn.Module):
    """Feed-forward regressor over numeric features plus learned id embeddings.

    Args:
        input_dim: number of numeric features per row of ``x``.
        num_players: size of the player embedding table.
        num_teams: size of the team embedding table.
        num_opps: size of the opponent embedding table.
        player_dim: width of each player embedding (default 16).
        team_dim: width of each team embedding (default 4).
        opp_dim: width of each opponent embedding (default 4).
    """

    def __init__(self, input_dim, num_players, num_teams, num_opps,
                 player_dim=16, team_dim=4, opp_dim=4):
        super(fc_nn_embed, self).__init__()

        #Specify embedding layers with hidden dimension
        self.player_embed = nn.Embedding(num_players, player_dim)
        self.team_embed = nn.Embedding(num_teams, team_dim)
        self.opp_embed = nn.Embedding(num_opps, opp_dim)

        #Width added to the numeric features by the concatenated embeddings
        #(16 + 4 + 4 = 24 with the defaults)
        embed_dim = player_dim + team_dim + opp_dim

        self.net = nn.Sequential(
            nn.Linear(input_dim + embed_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x, player_ids, team_ids, opp_ids):
        """Return one prediction per row of ``x`` with shape ``(batch, 1)``.

        ``x`` is ``(batch, input_dim)``. Each id tensor holds one integer id
        per row and may be shaped ``(batch,)`` or ``(batch, 1)``.
        """

        #Generate embeddings. reshape(-1) flattens the ids to 1-D so the lookup is
        #always (batch, dim). squeeze(-1) would turn a single-sample batch of 1-D
        #ids into a 0-d tensor and break the concatenation below.
        player_embeds = self.player_embed(player_ids.reshape(-1))
        team_embeds = self.team_embed(team_ids.reshape(-1))
        opp_embeds = self.opp_embed(opp_ids.reshape(-1))

        #Concatenate embeddings with numerical features
        x = torch.cat([x, player_embeds, team_embeds, opp_embeds], dim=1)

        return self.net(x)

## LSTM

In [None]:
#LSTM Model
class lstm_nn(nn.Module):
    """LSTM regressor: the top layer's final hidden state goes through dropout and a linear head.

    Args:
        input_dim: number of features per time step.
        hidden_dim: size of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        dropout: probability for the dropout before the linear head; also
            passed to the LSTM itself when ``num_layers`` is greater than 1.
    """

    def __init__(self, input_dim, hidden_dim=64, num_layers=1, dropout=0.2):
        super().__init__()

        # nn.LSTM applies its dropout between stacked layers only, so it is
        # switched off for a single-layer network.
        stacked_dropout = 0.0 if num_layers <= 1 else dropout

        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            dropout=stacked_dropout,
            batch_first=True,
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq, input_dim) to predictions of shape (batch,)."""
        # Only the final hidden states are needed; per-step outputs and cell states are ignored.
        _, (final_hidden, _) = self.lstm(x)

        # final_hidden is indexed by layer; [-1] selects the top layer.
        top_state = self.dropout(final_hidden[-1])

        return self.fc(top_state).squeeze(-1)

# Evaluation and Testing

In [None]:
##Run the training-validation functions
# Train the plain fully connected model on every fold. Swap in one of the
# commented alternatives below to train the embedding or LSTM variant instead.
results = train_model_folds(
    dataloaders_test,
    nepochs=2,
    lr=0.005,
    patience=5,
)
# results = train_model_embed_folds(dataloaders_embed, nepochs=30, lr=0.005, patience = 5)
# results = train_model_lstm_folds(dataloaders_lstm, nepochs=15, lr=0.01, patience = 10)

# Reshape to long format: 'Fold' and 'Epoch' stay as identifiers, every other
# column name moves into 'Type' and its values into 'Loss'.
results = pd.melt(
    results,
    id_vars=['Fold', 'Epoch'],
    var_name='Type',
    value_name='Loss',
)


In [None]:
# results.to_csv('Data/results_base_fcnn_earlystop.csv')
# results = pd.read_csv('Data/results_base_fcnn_dropout.csv')

In [None]:
#Show the best (minimum) loss per fold and loss type
display(results.groupby(['Fold', 'Type'])['Loss'].min().reset_index().pivot(index = 'Type', columns = 'Fold', values = 'Loss'))
#Show the last recorded epoch index per fold and loss type
#(this is the largest value in the 'Epoch' column, not a count)
display(results.groupby(['Fold', 'Type'])['Epoch'].max().reset_index().pivot(index = 'Type', columns = 'Fold', values = 'Epoch'))

#Plot training and validation loss curves, one panel per fold
#(a URL pasted into this rc dict made the whole cell a SyntaxError; it has been removed)
sns.set_context(rc={"font.size": 14, "axes.titlesize": 18, "axes.labelsize": 16})
loss_grid = sns.relplot(data = results, x = 'Epoch', y = 'Loss', hue = 'Type', col = 'Fold', kind = 'line', col_wrap=4)
loss_grid.fig.suptitle('Loss Curves for Fully Connected Network with 3 layers with Early Stopping', fontsize = 16)
#Leave room above the panels for the figure title
loss_grid.fig.subplots_adjust(top=0.8)
plt.show()
