In [None]:
%matplotlib inline

import os
import pdb
import argparse
import pickle as pkl
from pathlib import Path

from collections import defaultdict

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable

from six.moves.urllib.request import urlretrieve
import tarfile
import pickle
import sys

import pandas as pd
import seaborn as sns

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)



In [None]:
######################################################################
# Set path to the folder where your data is stored
######################################################################
# Colab-only setup: mount Google Drive and make the project folder the working
# directory, so the relative paths used by the cells below resolve against it.
from google.colab import drive
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/Neural Nets Spring 2025 Project')
# NOTE(review): only the last expression of a cell is echoed, so the getcwd()
# result is discarded; the listdir() output is what this cell displays.
os.getcwd()
os.listdir()

# Bring in Raw Player Data and Clean and Preprocess

In [None]:
####Bring in Player Game data
# NOTE(review): this file is read from the working directory, whereas the team
# file is read from 'Data/team_games.txt' -- confirm which location is intended.
rawdata = pd.read_csv('player_games.txt', header = 0 )
rawdata.head()

###Glossary
# Rk -- Rank
# PTS -- Points
# Age -- As of the date of the game listed. Age is Years-Days.
# GS -- Games Started
# POS -- position
# MP -- Minutes Played
# FG -- Field Goals
# FGA -- Field Goal Attempts
# FG% -- Field Goal Percentage
# 2P -- 2-Point Field Goals
# 2PA -- 2-Point Field Goal Attempts
# 2P% -- 2-Point Field Goal Percentage
# 3P -- 3-Point Field Goals
# 3PA -- 3-Point Field Goal Attempts
# 3P% -- 3-Point Field Goal Percentage
# FT -- Free Throws
# FTA -- Free Throw Attempts
# FT% -- Free Throw Percentage
# ORB -- Offensive Rebounds
# DRB -- Defensive Rebounds
# TRB -- Total Rebounds
# AST -- Assists
# STL -- Steals
# BLK -- Blocks
# TOV -- Turnovers
# PF -- Personal Fouls
# PTS -- Points
# GmSc -- Game Score
# BPM -- Box Plus/Minus
# A box score estimate of the points per 100 possessions a player contributed above a league-average player, translated to an average team.

In [None]:
#Choose start and end dates of data
# Inclusive bounds, compared against the parsed 'Date' column.
# NOTE(review): these bounds are applied to the team data only; the matching
# filter on the player data is commented out in the cleaning cell.
start_date = '1991-11-01'
end_date = '2020-07-01'

In [None]:
# Cleans the raw player game log into `cleandata`: one row per (Date, player)
# with parsed dates, numeric age, season label, player id, home flag and the
# fantasy-points target.

#Remove Rank column which is not needed
cleandata = rawdata.drop(columns = 'Rk')

#Remove duplicated rows
cleandata = cleandata.drop_duplicates(subset = ['Date', 'Player'])

#Change date to date format
cleandata['Date'] = pd.to_datetime(cleandata['Date'])
# NOTE(review): the date-window filter below is disabled, so player games
# outside [start_date, end_date] are kept.
# cleandata = cleandata[(cleandata['Date'] >= start_date) & (cleandata['Date'] <= end_date)]

##explore categorical variables
display(cleandata.Player.unique())
display(cleandata.Pos.unique())
display(cleandata.Playerteam.unique())
display(cleandata.Oppteam.unique())
display(cleandata.GameOutcome.unique())
display(cleandata.Age.unique())
display(cleandata.Date.unique())
display(cleandata.At.unique())

#Convert Age to numeric
# Age is split on '-' into years and days, then collapsed to a single day
# count using 365-day years (leap days are ignored).
cleandata['AgeYr'] = cleandata['Age'].str.split('-').str[0].astype(int)
cleandata['AgeDays'] = cleandata['Age'].str.split('-').str[1].astype(int)
cleandata['AgeDays'] = cleandata['AgeYr'] * 365 + cleandata['AgeDays']
cleandata.drop(columns = ['Age', 'AgeYr'], inplace = True)

#Compute Ending Year of Season
# Games played in September or later are assigned to the following calendar year.
cleandata['Season'] = cleandata['Date'].dt.year
cleandata.loc[cleandata['Date'].dt.month > 8, 'Season'] = cleandata['Season'] + 1

display(cleandata[['Season', 'Date']].drop_duplicates())

#Get Player Id
# Keeps the text after the first backslash in Player
# (presumably '<name>\<id>' -- TODO confirm against the raw file).
cleandata['PlayerId'] = cleandata['Player'].str.split('\\').str[1]
cleandata.drop(columns = ['Player'], inplace = True)

#Convert At to home/away
# A row is treated as an away game only when At is exactly '@'.
cleandata['HomeGame'] = True
cleandata.loc[cleandata['At'] == '@', 'HomeGame'] = False
cleandata.drop(columns = 'At', inplace = True)
display(cleandata.HomeGame.unique())

#Drop Player Games with less than 2 minutes played
cleandata = cleandata[cleandata['MP'] >= 2]

###Construct Fantasy Points
# Weights: PTS 1, TRB 1.2, AST 1.5, STL 3, BLK 3, TOV -1
# (looks like a FanDuel-style scoring rule -- TODO confirm).
cleandata['FTSYPTS'] = cleandata['PTS'] + cleandata['TRB'] * 1.2 + cleandata['AST'] * 1.5 + cleandata['STL'] * 3 + cleandata['BLK'] * 3 - cleandata['TOV'] * 1

##Explore Player Game data
display(cleandata.head())

display(cleandata.describe())


In [None]:
#Cache Cleaned Player data
# The row index is reset (and dropped) before writing, since rows were filtered above.
cleandata.reset_index(drop = True).to_feather('Data/cleandata.feather')


# Compute Historical Average by Game Features

In [None]:
### Numerical stats list
print(cleandata.columns)
# Box-score columns that are passed to hist_avg_features for the rolling,
# career and season averages computed in the cells below.
num_stats_list = ['MP', '2P', '2PA', '3P', '3PA', 'FT',
       'FT%', 'ORB', 'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS','GmSc', 'BPM']

In [None]:
### Build Historical Average Set of Features
#idcol can be PlayerId or Team
def hist_avg_features(data, idcol = 'PlayerId', type = 'lookback', gameslkback = 3, columns = ['PTS']):
  """Build lagged historical-average features in long format.

  Parameters
  ----------
  data : DataFrame
      Must contain 'Date', 'Season', `idcol` and every name in `columns`.
      It is not modified.
  idcol : str
      Entity column to average within (e.g. 'PlayerId' or 'Tm').
  type : {'lookback', 'career', 'season'}
      'lookback' -- rolling mean over the last `gameslkback` games;
      'career'   -- expanding mean over all games of the entity;
      'season'   -- expanding mean restarted for every Season.
  gameslkback : int
      Window length in games; only used when type == 'lookback'.
  columns : list of str
      Stat columns to average. The default list is never mutated.

  Returns
  -------
  DataFrame with columns ['Date', idcol, 'Season', 'Data', 'StatName'],
  sorted by Date. 'Data' holds the average as it stood *before* the game on
  that row (shifted by one game), so the first game of each (idcol, stat)
  is NaN. The shift ignores Season, so in 'season' mode the first game of a
  season carries the previous season's final average rather than NaN.

  Raises
  ------
  ValueError
      If `type` is not one of the three supported modes.
  """
  if type not in ('lookback', 'career', 'season'):
    raise ValueError("type must be 'lookback', 'career' or 'season', got " + repr(type))

  id_vars = ['Date', idcol, 'Season']
  data = data[id_vars + list(columns)]

  #convert to long format: one row per (game, stat)
  data = data.melt(id_vars = id_vars, value_name = 'Data', var_name = 'Stat')

  #chronological order so the windows below run forward in time
  data = data.sort_values('Date')

  #Pick the grouping, the averaging rule and the feature-name suffix for the requested mode
  if type == 'lookback':
    group_keys = [idcol, 'Stat']
    averager = lambda s: s.rolling(window = gameslkback, min_periods = 1).mean()
    suffix = 'Avg' + str(gameslkback)
  elif type == 'career':
    group_keys = [idcol, 'Stat']
    averager = lambda s: s.expanding().mean()
    suffix = 'CareerAvg'
  else:
    group_keys = [idcol, 'Stat', 'Season']
    averager = lambda s: s.expanding().mean()
    suffix = 'SeasonAvg'

  #transform always returns a Series aligned to data's index, whatever the number of groups
  data['AvgData'] = data.groupby(group_keys)['Data'].transform(averager)
  data['StatName'] = data['Stat'] + suffix

  #Shift data back one game to ensure no look ahead bias
  data['AvgData'] = data.groupby([idcol, 'Stat'])['AvgData'].shift(1)
  data = data.drop(columns = ['Data', 'Stat']).rename(columns = {'AvgData': 'Data'})

  return data

# Smoke test with the default columns (['PTS']): displays the season-mode PTS averages.
hist_avg_features(data = cleandata, type = 'season')


In [None]:
#Generate average stats over past # of games
#One long-format frame per window length (2, 5 and 15 games), stacked at the end
lookback_frames = []
for n_games in (2, 5, 15):
  window_avg = hist_avg_features(data = cleandata, type = 'lookback', gameslkback = n_games, columns = num_stats_list)
  display(window_avg)
  lookback_frames.append(window_avg)

gameavgstats = pd.concat(lookback_frames)


In [None]:
#Get Career and Season Average Stats
# Expanding means over every game of a player (career) and over the games of
# each Season (season); hist_avg_features lags both by one game.
careeravg = hist_avg_features(data = cleandata, type = 'career', columns = num_stats_list)
seasonavg = hist_avg_features(data = cleandata, type = 'season', columns = num_stats_list)


In [None]:
# Stack the rolling, career and season averages into one long frame.
# NOTE(review): gameavgstats is rebound to a concat that includes itself, so
# re-running this cell on its own appends careeravg/seasonavg a second time.
gameavgstats = pd.concat([gameavgstats, careeravg, seasonavg])

In [None]:
#### Bring in average stats from cache
# (uncomment to reload instead of recomputing the cells above)
# gameavgstats = pd.read_feather('Data/gameavgstats.feather')

# Cache avg stats
# The index left over from concatenating the pieces is dropped before writing.
gameavgstats.reset_index(drop = True).to_feather('Data/gameavgstats.feather')

# Compute Historical Sums over Dates

In [None]:
###Function for computing total sum of metrics over a certain lookback number of days
def hist_sum_features(data, idcol = 'PlayerId', type = 'lookback', dayslookback = 1, columns = ['MP']):
  """Sum each stat over the calendar days preceding every game (long format).

  Parameters
  ----------
  data : DataFrame
      Must contain 'Date', 'Season', `idcol` and every name in `columns`.
      It is not modified.
  idcol : str
      Entity column to sum within (e.g. 'PlayerId' or 'Tm').
  type : {'lookback'}
      Only 'lookback' is implemented.
  dayslookback : int
      Number of calendar days before the game date to include.
  columns : list of str
      Stat columns to sum. The default list is never mutated.

  Returns
  -------
  DataFrame with columns [idcol, 'Date', 'Data', 'Season', 'StatName'], where
  'Data' is the sum of the stat over the `dayslookback` days before 'Date'
  (the game on 'Date' itself is excluded) and 'StatName' is
  '<stat>Sum<dayslookback>D'.

  Raises
  ------
  ValueError
      If `type` is anything other than 'lookback'.
  """
  if type != 'lookback':
    raise ValueError("type must be 'lookback', got " + repr(type))

  id_vars = ['Date', 'Season', idcol]

  #assign() builds a new frame, so nothing is written into a slice of the caller's data
  data = data[id_vars + list(columns)]
  data = data.assign(Date = pd.to_datetime(data['Date']))

  #convert to long format
  data = data.melt(id_vars = id_vars, value_name = 'Data', var_name = 'Stat')

  #time-based rolling needs dates in order within every (idcol, Stat) group
  data = data.sort_values([idcol, 'Stat' , 'Date'])

  #window spans the game date plus the dayslookback days before it;
  #the game date's own value is subtracted again below
  roll_lookback = str(dayslookback + 1) + 'D'

  #compute rolling sum based on the date; groupby().rolling() always yields a
  #Series indexed by (idcol, Stat, Date), regardless of how many groups exist
  avgdata = (data.set_index('Date')
                 .groupby([idcol, 'Stat'])['Data']
                 .rolling(window = roll_lookback)
                 .sum()
                 .rename('SumData')
                 .reset_index())

  #remove current game from sum
  avgdata = avgdata.merge(data, on = ['Date', idcol, 'Stat'], how = 'left')
  avgdata['SumData'] = avgdata['SumData'] - avgdata['Data']

  avgdata['StatName'] = avgdata['Stat'] + 'Sum' + str(dayslookback) + 'D'

  avgdata = avgdata.drop(columns = ['Data', 'Stat']).rename(columns = {'SumData': 'Data'})

  return avgdata

# Smoke test: MP and PTS summed over the 10 days before each player game.
tmp = hist_sum_features(data = cleandata, type = 'lookback', dayslookback = 10, columns = ['MP', 'PTS'])
tmp

In [None]:
###Create summed features
#.copy() gives an independent frame, so the indicator columns below are not
#written into a slice of cleandata
summingdf = cleandata[['PlayerId', 'Date','Season', 'MP', 'GameOutcome', 'GS', 'HomeGame']].copy()

#0/1 indicators that can be summed over a date window
summingdf['GP'] = 1
summingdf['Win'] = (summingdf['GameOutcome'] == 'W').astype('int64')
summingdf['Loss'] = (summingdf['GameOutcome'] == 'L').astype('int64')
summingdf['Home'] = (summingdf['HomeGame'] == True).astype('int64')
summingdf['Away'] = (summingdf['HomeGame'] == False).astype('int64')


#Totals over the previous 2, 7 and 14 days for every player game
sumstats = []
for i in [2,7,14]:
  tmp = hist_sum_features(data = summingdf, dayslookback = i, columns = ['MP', 'Win', 'Loss', 'Home', 'Away'])
  display(tmp)
  sumstats.append(tmp)

sumstats = pd.concat(sumstats)

del summingdf

In [None]:
#Cache summed stats
# NOTE(review): the active line writes to the working directory, while every
# other cache in this notebook is written under 'Data/' (see the commented-out
# variant) -- confirm which path the downstream code reads.
# sumstats.reset_index(drop = True).to_feather('Data/sumstats.feather')
sumstats.reset_index(drop = True).to_feather('sumstats.feather')

# Bring in Team Level Data

In [None]:
# Cleans the raw team game log into `cleantmdata`: one row per (Date, Tm, Opp)
# inside [start_date, end_date], with a season label and a home flag.
tmdata = pd.read_csv('Data/team_games.txt')

#Remove Rank column which is not needed
cleantmdata = tmdata.drop(columns = 'Rk')

# display(cleantmdata[cleantmdata.duplicated(subset = ['Date', 'Tm', 'Opp'], keep = False)].sort_values(['Date', 'Tm', 'Opp']))

#Remove duplicated rows
cleantmdata = cleantmdata.drop_duplicates(subset = ['Date', 'Tm', 'Opp'])

#Change date to date format and keep only games inside the chosen window
cleantmdata['Date'] = pd.to_datetime(cleantmdata['Date'])
cleantmdata = cleantmdata[(cleantmdata['Date'] >= start_date) & (cleantmdata['Date'] <= end_date)]

##explore categorical variables
display(cleantmdata.Tm.unique())
display(cleantmdata.At.unique())
display(cleantmdata.Opp.unique())
display(cleantmdata.MP.unique())

#Compute Ending Year of Season
#Same rule as the player data (month > 8), so a game gets the same Season
#label in both tables: September or later counts toward the next calendar year
cleantmdata['Season'] = cleantmdata['Date'].dt.year
cleantmdata.loc[cleantmdata['Date'].dt.month > 8, 'Season'] = cleantmdata['Season'] + 1

display(cleantmdata[['Season', 'Date']].drop_duplicates())

#Convert At to home/away: a row is an away game only when At is exactly '@'
cleantmdata['HomeGame'] = True
cleantmdata.loc[cleantmdata['At'] == '@', 'HomeGame'] = False
cleantmdata.drop(columns = 'At', inplace = True)


##Explore Team Game data
display(cleantmdata.head())

display(cleantmdata.describe())


In [None]:
###Compute Other Total Counting Stats
#Add up the player box-score lines into one row per (Date, team)
othertmdata = (
    cleandata
    .groupby(['Date', 'Playerteam'])[['TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF']]
    .sum()
    .reset_index()
    .rename(columns = {'Playerteam': 'Tm'})
)
cleantmdata = cleantmdata.merge(othertmdata, on = ['Date', 'Tm'], how = 'left')

#Opponent other total counting stats: the same totals, re-keyed on Opp and prefixed with Opp
othertmdata = othertmdata.rename(columns = {'Tm' : 'Opp', 'TRB': 'OppTRB', 'AST': 'OppAST', 'STL': 'OppSTL', 'BLK': 'OppBLK', 'TOV': 'OppTOV', 'PF': 'OppPF'})
cleantmdata = cleantmdata.merge(othertmdata, on = ['Date', 'Opp'], how = 'left')

del othertmdata

# Construct Team Level Features

In [None]:
###Build Each Team's Historical Average Features

#Own and opponent box-score stats averaged for every team (single source for all calls below)
team_stat_cols = ['PTS', 'FG%', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
                  'OppPTS', 'OppFG%', 'OppTRB', 'OppAST', 'OppSTL', 'OppBLK', 'OppTOV', 'OppPF']

#Rolling averages over the last 5 and 20 team games
teamavgstats = []
for i in [5,20]:
  tmp = hist_avg_features(data = cleantmdata, idcol = 'Tm', type = 'lookback',
                          gameslkback = i, columns = team_stat_cols)
  teamavgstats.append(tmp)

#Season averages
teamseasonavg = hist_avg_features(data = cleantmdata, idcol = 'Tm', type = 'season', columns = team_stat_cols)

#hist_avg_features already returns the value column as 'Data', so no rename is needed here
teamavgstats = pd.concat(teamavgstats + [teamseasonavg])

display(teamavgstats)

In [None]:
###Create team summed features
#.copy() gives an independent frame, so the indicator columns below are not
#written into a slice of cleantmdata
summingdf = cleantmdata[['Tm', 'Date', 'Season', 'Result', 'HomeGame']].copy()
display(summingdf)

#0/1 indicators that can be summed over a date window;
#a win is any Result whose first character is 'W'
summingdf['GP'] = 1
summingdf['Win'] = (summingdf['Result'].str[0] == 'W').astype('int64')
summingdf['Home'] = (summingdf['HomeGame'] == True).astype('int64')

#Totals over the previous 2, 7 and 14 days for every team game
tmsumstats = []
for i in [2,7,14]:
  tmp = hist_sum_features(data = summingdf, idcol = 'Tm', dayslookback = i, columns = ['GP', 'Win', 'Home'])
  display(tmp)
  tmsumstats.append(tmp)

tmsumstats = pd.concat(tmsumstats)

tmsumstats

In [None]:
###Build Own Team's Stat Features
#Attach the player's own team averages to every player game via (Date, Tm)
ownteamstats = (
    cleandata[['Date', 'PlayerId', 'Playerteam']]
    .rename(columns = {'Playerteam': 'Tm'})
    .merge(teamavgstats, on = ['Date', 'Tm'], how = 'left')
)

#Keep only the team's own stats (names without 'Opp') and prefix them with 'Tm'
ownteamstats = ownteamstats[~(ownteamstats['StatName'].str.contains('Opp', na = False))]
ownteamstats['StatName'] = 'Tm' + ownteamstats['StatName']

#Rows that found no team stat in the merge have a missing StatName; drop them
ownteamstats = ownteamstats[ownteamstats.StatName.notna()]
display(ownteamstats)


In [None]:
# Opponent Team's Past Stats and Opponent's Past Opponent's stats
#One row per player game keyed by the opposing team, so team-level frames join on (Date, Tm)
oppkeys = cleandata[['Date', 'PlayerId', 'Oppteam']].rename(columns = {'Oppteam': 'Tm'})

oppteamstats = oppkeys.merge(teamavgstats, on = ['Date', 'Tm'], how = 'left')

#Opponent Counting Stats
oppcounts = oppkeys.merge(tmsumstats, on = ['Date', 'Tm'], how = 'left')

#Stack both and drop rows that found no stat in the merge (missing StatName)
oppteamstats = pd.concat([oppteamstats, oppcounts])
oppteamstats = oppteamstats[oppteamstats.StatName.notna()]

#Prefix every stat with 'Opp'; no StatName is missing at this point, so one filter suffices
oppteamstats['StatName'] = 'Opp' + oppteamstats['StatName']

display(oppteamstats)

In [None]:
###Combine Team stats and cache
#Own-team and opponent features stacked; the team key is no longer needed once joined to players
teamstats = pd.concat([ownteamstats, oppteamstats]).drop(columns = 'Tm')
display(teamstats)
teamstats.reset_index(drop = True).to_feather('Data/teamstats.feather')

# Construct Other Categorical Features

In [None]:
###One Hot Encode Position
posohe = pd.get_dummies(cleandata['Pos'])

#Per-game context columns followed by the position dummies
otherstats = pd.concat([cleandata[['Date', 'PlayerId', 'HomeGame', 'AgeDays']], posohe], axis = 1)

###Get Month of Year
otherstats = otherstats.assign(Month = otherstats['Date'].dt.month)

otherstats


In [None]:
#Cache Other Stats
# The row index is reset (and dropped) before writing.
otherstats.reset_index(drop = True).to_feather('Data/otherstats.feather')

In [None]:
#Fantasy points target
# One row per player game: the keys (Date, PlayerId) plus the FTSYPTS target.
cleandata[['Date', 'PlayerId', 'FTSYPTS']].reset_index(drop=True).to_feather('Data/fantasy_points_target.feather')