In [2]:
# general imports
import ast
import os
import pickle
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
# api call imports
from nba_api.stats.endpoints import teamplayerdashboard
from nba_api.stats.library.parameters import SeasonAll
from nba_api.stats.static import teams, players

In [4]:
# model building imports
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error

In [4]:
import warnings
# NOTE(review): with no category/module filter this hides every warning raised
# for the rest of the session (deprecations, sklearn data-conversion warnings,
# pandas SettingWithCopyWarning, ...). Consider narrowing it once the notebook is stable.
warnings.filterwarnings('ignore')

**Read in Training Data**

In [5]:
# player df
dfp = pd.read_csv("./data/full_team_player_v2.csv")

# team df
dft = pd.read_csv("./data/full_teams_data.csv")

# team roster df, indexed by (TEAM, SEASON)
dfr = pd.read_csv("./data/team_rosters.csv").set_index(["TEAM", "SEASON"])

In [6]:
# aggregation data: the frame that is passed to prep_win_pct_pred_data() and
# predict_win_pct() below (it must provide the GROUP_VALUE and W_PCT columns)
wpdf = pd.read_csv("./data/full_team_player_v3.csv")

**Model Prep**

In [7]:
def prep_win_pct_pred_data(wpdf):
    """Add the binary classification target ``W_PCT_BIN`` to ``wpdf``.

    ``W_PCT_BIN`` is 1 where ``W_PCT`` is at least 0.5 and 0 otherwise
    (a missing ``W_PCT`` therefore maps to 0). The column is written onto
    the frame that was passed in, and that same frame is returned.
    """
    wpdf["W_PCT_BIN"] = [1 if win_pct >= 0.5 else 0 for win_pct in wpdf["W_PCT"]]
    return wpdf

In [8]:
def predict_win_pct(wpdf, season, num_prev_seasons, classify=True,
                    n_estimators=12, max_depth=19, n_runs=200, random_state=None):
    """Train random forests on earlier seasons and score them on ``season``.

    Parameters
    ----------
    wpdf : pandas.DataFrame
        Aggregated data with a ``GROUP_VALUE`` (season) column, the target
        column (``W_PCT_BIN`` when classifying, ``W_PCT`` otherwise) and the
        feature columns in positions ``7:-1``.
    season : str
        Season to evaluate on; must be a value of ``GROUP_VALUE``.
    num_prev_seasons : int
        How many seasons immediately before ``season`` to train on. The window
        is clipped at the first season present in the data.
    classify : bool, default True
        True  -> RandomForestClassifier scored with accuracy.
        False -> RandomForestRegressor scored with mean absolute error.
    n_estimators, max_depth : int
        Forest hyperparameters (defaults are the values used so far: 12 / 19).
    n_runs : int, default 200
        Number of independently fitted forests whose scores are averaged.
    random_state : int or None, default None
        When given, run ``i`` is seeded with ``random_state + i`` so the
        averaged score is reproducible. ``None`` keeps the runs unseeded.

    Returns
    -------
    float or str
        Mean score over ``n_runs`` fits, or ``""`` (after printing a notice)
        when ``season`` is the first season in the data and therefore has no
        earlier season to train on.

    Raises
    ------
    ValueError
        If ``season`` is not in the data, or ``num_prev_seasons`` / ``n_runs``
        is smaller than 1.
    """
    # Order of first appearance; assumes the rows of wpdf are in chronological
    # season order -- TODO confirm against how the CSV is produced.
    seasons = wpdf["GROUP_VALUE"].unique().tolist()
    if season not in seasons:
        raise ValueError("Season {!r} is not present in GROUP_VALUE.".format(season))

    indexed_season = seasons.index(season)
    if indexed_season == 0:
        # Generalises the former hard-coded '2013-14' check: whatever the first
        # season in the data is, nothing precedes it. The start year is printed
        # so the message for '2013-14' is unchanged.
        print("Can't test {} data since there is no data before this season.".format(str(season)[:4]))
        return ""

    if num_prev_seasons < 1:
        raise ValueError("num_prev_seasons must be at least 1.")
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1.")

    start_season_index = max(indexed_season - num_prev_seasons, 0)
    training_seasons = seasons[start_season_index:indexed_season]
    print(training_seasons)

    target = 'W_PCT_BIN' if classify else 'W_PCT'

    # Feature columns by position: assumes the first 7 columns are identifiers /
    # outcome columns and the last column is W_PCT_BIN -- TODO confirm.
    # The target columns are excluded explicitly so they can never leak into X.
    feature_cols = [col for col in wpdf.columns[7:-1] if col not in ('W_PCT', 'W_PCT_BIN')]

    # previous seasons -> training set, requested season -> test set
    train_data = wpdf[wpdf["GROUP_VALUE"].isin(training_seasons)]
    test_data = wpdf[wpdf["GROUP_VALUE"] == season]

    X_train = train_data[feature_cols].fillna(0)
    X_test = test_data[feature_cols].fillna(0)
    # 1-D targets: sklearn expects shape (n_samples,), not an (n, 1) frame
    Y_train = train_data[target]
    Y_test = test_data[target]

    # standardize with statistics learned from the training seasons only
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    if classify:
        model_cls, score_fn = RandomForestClassifier, accuracy_score
    else:
        model_cls, score_fn = RandomForestRegressor, mean_absolute_error

    # Random forests are stochastic, so average the score over several fits.
    scores = []
    for run in range(n_runs):
        seed = None if random_state is None else random_state + run
        mod = model_cls(n_estimators=n_estimators, max_depth=max_depth,
                        random_state=seed).fit(X_train, Y_train)
        scores.append(score_fn(Y_test, mod.predict(X_test)))

    return sum(scores) / len(scores)

In [9]:
# add the binary W_PCT_BIN target (1 when W_PCT >= 0.5, else 0) used for classification
wpdf = prep_win_pct_pred_data(wpdf)

In [10]:
wpdf.shape

(210, 323)

**Initial Model Test**

In [11]:
# classify the 2018-19 season, training on up to the 4 seasons before it
metrics = predict_win_pct(wpdf, '2018-19', 4, True)

['2014-15', '2015-16', '2016-17', '2017-18']


In [12]:
# accuracy for predicting 2018-19 season games and looking back 4 seasons
metrics

0.5819999999999995

Below are some additional tests that you can run.
Pass in the dataset (`wpdf`), the season to predict, the number of previous seasons to train on, and whether to classify (`True`) or predict with regression (`False`).

In [13]:
# classification for 2019-20, looking back 4 seasons
predict_win_pct(wpdf, '2019-20', 4, True)

['2015-16', '2016-17', '2017-18', '2018-19']


0.6163333333333337

In [14]:
# classification for 2016-17, looking back 3 seasons
predict_win_pct(wpdf, '2016-17', 3, True)

['2013-14', '2014-15', '2015-16']


0.6818333333333341

In [15]:
# classification for 2018-19, looking back a single season
predict_win_pct(wpdf, '2018-19', 1, True)

['2017-18']


0.6375000000000012

**Validate Model Against All Seasons**

In [16]:
# Evaluate every season in the data, training on the previous 1-5 seasons.
# Set `classification` to False for regression prediction, True for classification.
classification = True

# metrics_seasonal[season][num_prev_seasons] -> score returned by predict_win_pct
metrics_seasonal = {}
for season in wpdf["GROUP_VALUE"].unique().tolist():
    metrics_seasonal[season] = {}
    for num_prev_seasons in range(1, 6):
        print("Training season {}, looking back {} seasons...".format(season, num_prev_seasons))
        metrics_seasonal[season][num_prev_seasons] = predict_win_pct(
            wpdf, season, num_prev_seasons, classification
        )

Training season 2013-14, looking back 1 seasons...
Can't test 2013 data since there is no data before this season.
Training season 2013-14, looking back 2 seasons...
Can't test 2013 data since there is no data before this season.
Training season 2013-14, looking back 3 seasons...
Can't test 2013 data since there is no data before this season.
Training season 2013-14, looking back 4 seasons...
Can't test 2013 data since there is no data before this season.
Training season 2013-14, looking back 5 seasons...
Can't test 2013 data since there is no data before this season.
Training season 2014-15, looking back 1 seasons...
['2013-14']
Training season 2014-15, looking back 2 seasons...
['2013-14']
Training season 2014-15, looking back 3 seasons...
['2013-14']
Training season 2014-15, looking back 4 seasons...
['2013-14']
Training season 2014-15, looking back 5 seasons...
['2013-14']
Training season 2015-16, looking back 1 seasons...
['2014-15']
Training season 2015-16, looking back 2 seasons

In [17]:
# accuracy by current season & # of prior seasons looked at
metrics_seasonal

{'2013-14': {1: '', 2: '', 3: '', 4: '', 5: ''},
 '2014-15': {1: 0.6585000000000008,
  2: 0.6530000000000006,
  3: 0.6500000000000006,
  4: 0.6595000000000008,
  5: 0.6505000000000005},
 '2015-16': {1: 0.7795000000000004,
  2: 0.7536666666666669,
  3: 0.755666666666667,
  4: 0.7593333333333331,
  5: 0.7634999999999996},
 '2016-17': {1: 0.6980000000000004,
  2: 0.6868333333333341,
  3: 0.6796666666666675,
  4: 0.6813333333333341,
  5: 0.6865000000000008},
 '2017-18': {1: 0.7581666666666669,
  2: 0.6976666666666668,
  3: 0.6731666666666674,
  4: 0.6630000000000011,
  5: 0.6650000000000003},
 '2018-19': {1: 0.6390000000000009,
  2: 0.6285000000000006,
  3: 0.5844999999999996,
  4: 0.5811666666666664,
  5: 0.5848333333333334},
 '2019-20': {1: 0.6178333333333335,
  2: 0.5945000000000004,
  3: 0.5676666666666664,
  4: 0.6153333333333336,
  5: 0.6038333333333337}}

In [20]:
# save metrics output to pickle file (one file per prediction mode)
output_type = "classification" if classification else "regression"

results_dir = "./results"
# create the output folder if needed so a fresh checkout does not fail in open()
os.makedirs(results_dir, exist_ok=True)

with open("{}/metrics_seasonal_{}.pkl".format(results_dir, output_type), "wb") as file:
    pickle.dump(metrics_seasonal, file)

**Accuracy By Season**

In [21]:
# distinct seasons, in the order they first appear in wpdf
seasons = wpdf["GROUP_VALUE"].unique().tolist()

In [22]:
# For each season after the first, print the correlation between that season's
# W_PCT and the FGM_<previous season> column, restricted to that season's rows.
for prior_label, current_label in zip(seasons, seasons[1:]):
    print(current_label)
    sdf = wpdf[wpdf["GROUP_VALUE"] == current_label]
    prior_fgm_col = "FGM_{}".format(prior_label)
    print(sdf["W_PCT"].corr(sdf[prior_fgm_col]))

2014-15
0.48758089941582583
2015-16
0.5782554377812922
2016-17
0.6730983793649054
2017-18
0.5304743320639487
2018-19
0.42753005733794813
2019-20
0.16263684010747761


## Win/Loss Prediction

In [5]:
# player game data for the win/loss prediction section
dfwl = pd.read_csv("./data/nba_player_data.csv")

In [7]:
dfwl.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,SEASON_ID,Player_ID,Game_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,...,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE
0,4348,4348,2018-19,203518,21800772,2019-02-01,OKC @ MIA,W,9,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,-8.0,1
1,4349,4349,2018-19,203518,21800751,2019-01-29,OKC @ ORL,W,6,1,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,3,4.0,1
2,4350,4350,2018-19,203518,21800493,2018-12-23,OKC vs. MIN,L,27,2,...,1.0,1.0,0.0,0.0,2.0,0.0,2.0,7,1.0,1
3,4351,4351,2018-19,203518,21800485,2018-12-22,OKC @ UTA,W,16,2,...,1.0,1.0,0.0,0.0,0.0,0.0,4.0,6,13.0,1
4,4352,4352,2018-19,203518,21800466,2018-12-19,OKC @ SAC,W,23,3,...,1.0,1.0,1.0,0.0,0.0,0.0,4.0,9,6.0,1


In [25]:
# Separator between the two team abbreviations in MATCHUP,
# e.g. "OKC @ MIA" or "OKC vs. MIN" (the dot is literal and optional).
_MATCHUP_SEPARATOR = re.compile(r'@|vs\.?')


def _format_season_id(raw_id):
    """Turn a raw SEASON_ID such as 22018 into '2018-19'.

    The first character is dropped and the rest is taken as the start year.
    Values that already look like 'YYYY-YY' are returned unchanged, so the
    conversion is safe to apply to already-prepared data.
    """
    text = str(raw_id)
    if re.fullmatch(r'\d{4}-\d{2}', text):
        return text
    start_year = text[1:]
    # zero-pad and wrap the two-digit end year: 2008 -> '09', 1999 -> '00'
    end_year = (int(start_year[-2:]) + 1) % 100
    return '{}-{:02d}'.format(start_year, end_year)


def _iso_week(dates):
    """Return the ISO week number for a datetime Series."""
    accessor = dates.dt
    if hasattr(accessor, "isocalendar"):
        week = accessor.isocalendar().week
        # keep a plain integer dtype when no date is missing
        return week if week.isna().any() else week.astype("int64")
    # older pandas without isocalendar()
    return accessor.week


def _split_matchup(matchup):
    """Split a MATCHUP string into (team_a, team_b); team_b is '' if no separator is found."""
    parts = _MATCHUP_SEPARATOR.split(matchup, maxsplit=1)
    team_a = parts[0].strip()
    team_b = parts[1].strip() if len(parts) > 1 else ""
    return team_a, team_b


def prep_winloss_data(df, seasons):
    """Clean the player game data and keep only the requested seasons.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns GAME_DATE, SEASON_ID, Player_ID and MATCHUP.
        The frame is not modified; a new frame is returned.
    seasons : list of str
        Season labels ('YYYY-YY') to keep.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose formatted SEASON_ID is in ``seasons``, with
        GAME_DATE parsed to datetime, SEASON_ID formatted as 'YYYY-YY', and the
        added columns GAME_WEEK, PLAYER_NAME, TEAM_A and TEAM_B.
        PLAYER_NAME is '' for ids missing from ``players.get_players()``.
    """
    print("Prepping", df.shape, "...")

    # assign() builds a new frame, so re-running the cell never re-formats
    # an already formatted SEASON_ID on the caller's frame
    df = df.assign(
        GAME_DATE=pd.to_datetime(df["GAME_DATE"]),
        SEASON_ID=df["SEASON_ID"].apply(_format_season_id),
    )
    # explicit copy: the columns added below are written to an independent frame
    df = df[df["SEASON_ID"].isin(seasons)].copy()

    # get week num id
    df["GAME_WEEK"] = _iso_week(df["GAME_DATE"])

    # one dict lookup per row instead of scanning the full player list per row
    name_by_id = {player['id']: player['full_name'] for player in players.get_players()}
    df["PLAYER_NAME"] = df["Player_ID"].map(lambda player_id: name_by_id.get(player_id, ""))

    df["TEAM_A"] = df["MATCHUP"].apply(lambda matchup: _split_matchup(matchup)[0])
    df["TEAM_B"] = df["MATCHUP"].apply(lambda matchup: _split_matchup(matchup)[1])

    return df

In [28]:
# prediction seasons: the distinct SEASON values of the roster index
# NOTE: this rebinds `seasons`, which an earlier cell derived from wpdf["GROUP_VALUE"]
seasons = dfr.reset_index()["SEASON"].unique().tolist()

In [29]:
# cleaned game data restricted to the prediction seasons
dfc = prep_winloss_data(dfwl, seasons)

In [None]:
# Resolve both team abbreviations to full team names.
teams_dict = teams.get_teams()

# Build the abbreviation -> full name lookup once and use it for both columns,
# instead of scanning the whole team list for every row (twice).
team_name_by_abbr = {entry['abbreviation']: entry['full_name'] for entry in teams_dict}

# abbreviations that are not in the lookup become ""
dfc["TEAM_A_NAME"] = dfc["TEAM_A"].map(lambda abbr: team_name_by_abbr.get(abbr, ""))
dfc["TEAM_B_NAME"] = dfc["TEAM_B"].map(lambda abbr: team_name_by_abbr.get(abbr, ""))

In [None]:
dfc.head()

In [None]:
# stat columns that are averaged per (season, team, player) in the groupby further down
interested_vars = ['FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 
                   'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS']

In [None]:
# rows of the 2018-19 season only
dfc_selected = dfc[dfc["SEASON_ID"] == '2018-19']

In [None]:
wpdf.head()

In [None]:
dfc.head()

In [None]:
# NOTE(review): not referenced by any other cell visible here -- looks like a leftover placeholder; confirm before removing
dfc_interested_vars = []

In [None]:
def _team_feature_row(wpdf, season, team_name, suffix):
    """Return the feature columns of one team-season from ``wpdf``, with ``suffix`` appended to each name."""
    rows = wpdf[(wpdf["GROUP_VALUE"] == season) & (wpdf["TEAM_NAME"] == team_name)]
    # Feature columns by position; assumes the first 7 columns and the last
    # column of wpdf are not features -- TODO confirm.
    # The index is reset so the two teams line up side by side in the concat
    # instead of being aligned on their different wpdf row labels.
    features = rows[rows.columns[7:-1]].reset_index(drop=True)
    features.columns = [col + suffix for col in features.columns]
    return features


def inject_player_data(dfc, wpdf, seasons):
    """Build the side-by-side team feature row for the first game in ``dfc``.

    Parameters
    ----------
    dfc : pandas.DataFrame
        Game rows with SEASON_ID, TEAM_A_NAME and TEAM_B_NAME columns.
    wpdf : pandas.DataFrame
        Aggregated team data with GROUP_VALUE, TEAM_NAME and the feature
        columns in positions ``7:-1``.
    seasons : list of str
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        The ``wpdf`` features of team A (suffix ``_A``) next to those of
        team B (suffix ``_B``), for the season of the first row of ``dfc``.
        An empty frame when ``dfc`` has no rows.

    Notes
    -----
    Only the first row of ``dfc`` is processed, as before: the original loop
    returned during its first iteration. The code after that return was
    unreachable and has been removed.
    TODO: it was meant to keep only the columns of the previous season, but
    its ``col.endswith(prev_season)`` test could never match once the
    ``_A``/``_B`` suffixes are appended, and ``seasons[index - 1]`` wraps
    around to the last season for the first one. Decide on the intended
    per-game output before extending this to every row.
    """
    if len(dfc) == 0:
        return pd.DataFrame()

    first_game = dfc.iloc[0]
    current_season = first_game["SEASON_ID"]

    teamA_data = _team_feature_row(wpdf, current_season, first_game["TEAM_A_NAME"], "_A")
    teamB_data = _team_feature_row(wpdf, current_season, first_game["TEAM_B_NAME"], "_B")

    return pd.concat([teamA_data, teamB_data], axis=1)

In [None]:
# keep the returned frame in `test` so the next cell can display it
test=inject_player_data(dfc, wpdf, seasons)

In [None]:
test

In [None]:
# Per (season, team A, player): mean of every stat in interested_vars,
# plus the total of MIN across the same rows.
player_groups = dfc.groupby(["SEASON_ID", "TEAM_A_NAME", "PLAYER_NAME"])
dfcg = player_groups[interested_vars].mean()
dfcg["MIN"] = player_groups['MIN'].sum()

In [None]:
dfcg

In [None]:
dfcg