In [None]:
# import packages
import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, SubsetRandomSampler
from model import RNNModel
import torch.nn as nn
import torch.optim as optim
from train import train
from itertools import product
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# read csv
# The first CSV column is used as the row index. Later cells read the columns
# player_id, player_season, pred_all_nba, player and season_start from this frame.
df = pd.read_csv('results/all_nba_preds.csv', index_col=0)

In [None]:
# define min and max career len, pad player careers to max len
max_len = df.groupby(['player_id'], sort=False)['player_season'].count().max()
min_len = 4
input_list, target_list = [], []

# Walk each player's seasons in a single groupby pass instead of assuming the
# ids are exactly 0..n-1 and re-filtering the whole frame for every player.
# Groups are yielded sorted by player_id, which is the same order the old
# 0..n-1 loop produced, so downstream index-based splits are unchanged.
for _, career in df.groupby('player_id')['pred_all_nba']:
    curr_pred = career.values
    if len(curr_pred) < min_len:
        # too few seasons to be a useful sequence
        continue
    pad_width = max_len - len(curr_pred)
    # input: the whole career, zero-padded on the right up to max_len
    input_list.append(np.pad(curr_pred, (0, pad_width), 'constant'))
    # target: the same career shifted one season ahead, so it is one element
    # shorter and needs one extra pad slot to reach max_len
    target_list.append(np.pad(curr_pred[1:], (0, pad_width + 1), 'constant'))

# Stack into a single ndarray first; converting a Python list of ndarrays
# element by element is the slow path in torch.
input_tensor = torch.as_tensor(np.array(input_list), dtype=torch.float32)
target_tensor = torch.as_tensor(np.array(target_list), dtype=torch.float32)

In [None]:
# function to randomly select which players to use as train, valid, and test
def split_indices(train_perc=0.6, valid_perc=0.2, n_samples=None, seed=0):
    """Return shuffled, non-overlapping index arrays for train/valid/test.

    Parameters
    ----------
    train_perc : float
        Fraction of samples assigned to the training split.
    valid_perc : float
        Fraction of samples assigned to the validation split. Whatever is
        left after train and valid goes to the test split.
    n_samples : int, optional
        Number of samples to split. Defaults to ``len(input_tensor)`` (the
        notebook-level tensor) so existing calls keep working.
    seed : int
        Seed for the permutation; the default of 0 reproduces the original
        fixed split.

    Returns
    -------
    (train_ind, valid_ind, test_ind) : tuple of numpy.ndarray
        Integer index arrays that together cover ``range(n_samples)``.

    Raises
    ------
    ValueError
        If a fraction is negative or the two fractions sum to more than 1.
    """
    # small tolerance so sums such as 0.7 + 0.3 are not rejected by float noise
    if train_perc < 0 or valid_perc < 0 or train_perc + valid_perc > 1 + 1e-9:
        raise ValueError(
            "train_perc and valid_perc must be non-negative and sum to at most 1"
        )

    if n_samples is None:
        n_samples = len(input_tensor)

    # a private RandomState keeps the split independent of the global numpy RNG
    perm = np.random.RandomState(seed=seed).permutation(n_samples)
    n = len(perm)
    # boundaries are truncated (not rounded), matching the original split sizes
    train_end = int(train_perc * n)
    valid_end = int((train_perc + valid_perc) * n)
    train_ind = perm[:train_end]
    valid_ind = perm[train_end:valid_end]
    test_ind = perm[valid_end:]

    return train_ind, valid_ind, test_ind

In [None]:
# wrap the padded inputs/targets in a dataset and build one loader per split
full_data = TensorDataset(input_tensor, target_tensor)

train_ind, valid_ind, test_ind = split_indices()

# every split shares the same dataset and batch size; only the sampled
# subset of indices differs
train_loader, valid_loader, test_loader = (
    DataLoader(full_data, batch_size=4, sampler=SubsetRandomSampler(split_ind))
    for split_ind in (train_ind, valid_ind, test_ind)
)

In [None]:
# define and train model
# The train/valid/test split is seeded, but nothing seeded torch itself, so
# every run trained a different model. Seed torch's global RNG so that whatever
# RNNModel, the samplers and train() draw from it is repeatable run to run.
torch.manual_seed(0)

# 2-layer unidirectional LSTM with dropout; the checkpoint path is handed to
# the model and then passed on to train() below
model = RNNModel(hidden_dim=50, rnn_type='LSTM', bidir=False,
                 checkpoint_name='checkpoints/lstm_d5_n2.pt', dropout=0.5, num_layers=2)

# summed (not averaged) squared error, optimised with Adam at its default settings
criterion = nn.MSELoss(reduction='sum')
optimizer = optim.Adam(model.parameters())

train(model, train_loader, valid_loader, test_loader, optimizer, criterion, n_epochs=50, short_train=False,
      checkpoint_name=model.checkpoint_name, patience=5)

In [None]:
# function that predicts a player's entire career arc using the trained model
def pred_player(player_name):
    """Extend a player's season-by-season values out to ``max_len`` seasons.

    The player's known ``pred_all_nba`` values are read from the notebook-level
    ``df``. While the sequence is shorter than ``max_len``, the whole sequence
    so far is fed to the notebook-level ``model`` with a fresh hidden state and
    the last element of the model output is appended as the next season.

    Parameters
    ----------
    player_name : str
        Value to match against the ``player`` column of ``df``.

    Returns
    -------
    player_progr : numpy.ndarray
        Known seasons followed by the generated seasons.
    start_len : int
        Number of known seasons (index where the generated part begins).
    remaining_all_nba : float
        Sum of the generated values only.

    Raises
    ------
    ValueError
        If ``df`` has no rows for ``player_name``.
    """
    player_progr = df[df['player']==player_name]['pred_all_nba'].values
    start_len = len(player_progr)
    if start_len == 0:
        # previously an empty sequence was passed to the model and failed there
        raise ValueError(f"no seasons found for player {player_name!r}")

    # Predict in eval mode so dropout does not randomise the projections, and
    # restore the previous mode afterwards.
    # NOTE(review): assumes RNNModel is an nn.Module (it already exposes
    # .parameters()), so .training/.eval()/.train() are available.
    was_training = model.training
    model.eval()
    try:
        # no gradients are needed for inference
        with torch.no_grad():
            while len(player_progr) < max_len:
                # batch of one: shape (1, seasons so far)
                player_tens = torch.Tensor(player_progr).unsqueeze(0)
                hidden = model.initHidden(1)
                output, hidden = model(player_tens, hidden)
                # the final output element is used as the next season's value
                next_val = output[-1].item()
                player_progr = np.concatenate((player_progr, np.array([next_val])))
    finally:
        model.train(was_training)

    remaining_all_nba = np.sum(player_progr[start_len:])
    return player_progr, start_len, remaining_all_nba

In [None]:
# predict career arc for all players above minimum length
# Reuse min_len (set where the training sequences are built) instead of a
# second literal 4, so the players projected here always meet the same
# career-length threshold as the players the model was trained on.
min_len_df = df.groupby("player_id").filter(lambda x: len(x) >= min_len)
# only players with a row for the season starting in 2018 are projected
all_players = min_len_df[min_len_df['season_start']==2018]['player'].values
remaining_pred, curr_year, full_prog = [], [], []

for player_name in all_players:
    # history: known + generated seasons; start_len: number of known seasons;
    # total: sum of the generated seasons
    history, start_len, total = pred_player(player_name)
    full_prog.append(history)
    curr_year.append(start_len)
    remaining_pred.append(total)

In [None]:
# convert predictions to csv
# Derive the number of year columns from the projected careers rather than
# hard-coding 21, so the table still builds when the longest career in the
# data changes. With no rows at all, fall back to max_len.
n_years = max((len(history) for history in full_prog), default=max_len)
df_annual = pd.DataFrame(full_prog, columns=['year_%s' %i for i in range(1, n_years + 1)])
# one summary row per player: name, number of known seasons, summed projection
df_pred = pd.DataFrame(list(zip(all_players, curr_year, remaining_pred)),
                       columns=['player', 'current_year', 'remaining_all_nba'])
# both frames share the default positional index, so join lines rows up 1:1
full_df = df_pred.join(df_annual)

In [None]:
# write the combined predictions table to disk; index=False leaves the row index out of the file
full_df.to_csv('results/full_predictions.csv', index=False)

# Visualization function

In [None]:
# Load the saved projections for plotting.
# NOTE: this rebinds `df`, which the cells above use for the table read from
# results/all_nba_preds.csv; reload that file before re-running those cells.
df = pd.read_csv('results/full_predictions.csv')

In [None]:
# function to help visualize any player's career arc
def plot_player(player_name, predictions=None):
    """Plot a player's historical and projected values and save the chart.

    The row for ``player_name`` is expected to hold, after the ``player``
    column: the number of known seasons, the summed projection, and then one
    value per career year. The chart is written to
    ``graphs/<lower_snake_case_name>.png``.

    Parameters
    ----------
    player_name : str
        Value to match against the ``player`` column.
    predictions : pandas.DataFrame, optional
        Table to read the player's row from. Defaults to the notebook-level
        ``df`` so existing calls keep working.

    Raises
    ------
    ValueError
        If no row matches ``player_name``.
    """
    import os  # local import: only needed to make sure the output folder exists

    if predictions is None:
        predictions = df

    player_rows = predictions[predictions['player']==player_name]
    if len(player_rows) == 0:
        # previously surfaced as a bare IndexError from .values[0, 1:]
        raise ValueError(f"no predictions found for player {player_name!r}")

    plt.style.use('fivethirtyeight')
    # drop the name column; everything that is left is numeric
    player_vals = player_rows.values[0, 1:].astype(float)
    player_hist = player_vals[2:]
    player_year = int(player_vals[0])
    player_total = player_vals[1]
    # number of year columns, taken from the data instead of a hard-coded 21
    n_years = len(player_hist)

    fig, ax = plt.subplots()

    sns.lineplot(x=range(1, player_year + 1), y=player_hist[:player_year], ax=ax, label="Historical")
    # the projected line starts at the last known year so the two lines connect
    sns.lineplot(x=range(player_year, n_years + 1), y=player_hist[player_year - 1:], ax=ax, label="Projected")
    ax.axvline(x=player_year, color='black', linestyle='--')

    fig.suptitle(f"LEBRON projection for {player_name}", weight='bold', size=18, y=1.02)
    ax.set_title(f"Remaining All-NBA probability: {player_total:.2f}", size=14, fontname="Rockwell")

    ax.set_xlabel("Year #")
    ax.set_ylabel("P(All-NBA)")
    
    plt.setp(ax.get_legend().get_texts(), fontname='Rockwell')

    # footer: a faint rule and the site credit below the axes
    fig.text(x = -0.02, y = -0.08,
        s = '___________________________________________________________',
        fontsize = 14, color = 'grey', horizontalalignment='left', alpha = .3)

    fig.text(x = -0.02, y = -.14,
        s = 'https://dribbleanalytics.blog                     ',
        fontsize = 14, fontname = 'Rockwell', color = 'dimgrey', horizontalalignment='left')
    
    fname = player_name.lower().replace(" ", "_")
    # savefig does not create missing directories
    os.makedirs('graphs', exist_ok=True)
    fig.savefig(f'graphs/{fname}.png', dpi = 400, bbox_inches = 'tight')

In [None]:
# example: draw the projection chart for one player and save it under graphs/
plot_player('LeBron James')