In [1]:
import json
import os
import warnings
from datetime import datetime
#import pytz

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import scipy
import seaborn as sns

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)

import extract_nba_rolling_stats

# Parameters and Data

In [None]:
# Rolling period and stat categories used for the exploratory run below.
eda_rolling_period = 25
eda_stat_cats = ['pts']

# Bundle the EDA settings into the extraction module's parameter object.
eda_params = extract_nba_rolling_stats.UserParameters(rolling_period = eda_rolling_period,
                                                    stat_cats = eda_stat_cats)

In [None]:
# Step 1: run the extraction and keep only the extracted frame.
extracted_data = (
    extract_nba_rolling_stats
    .run_extract(roll_period = eda_rolling_period, categories = eda_stat_cats)
    .extracted_data
)

# Step 2: compute the rolling statistics from the extracted data.
rolling_data = extract_nba_rolling_stats.get_rolling_stats(
    original_data = extracted_data,
    p = eda_params,
)

# Step 3: standardize the rolling statistics for the selected categories.
standardized_data = extract_nba_rolling_stats.standardize_data(
    rolling_data,
    eda_params.stat_cats,
)

In [None]:
# Show the standardized EDA frame as this cell's output.
standardized_data

In [2]:
def pull_preprocess_data(rolling_period = 62,
                         stat_cats = ['pts']):
    """
    Extract, roll, standardize and label the data for one rolling period.

    For every category in ``stat_cats``:

    * rows whose value lies above the upper fence ``Q3 + 1.5 * IQR`` are
      dropped (only the upper tail is trimmed),
    * a square-root transformed ``<cat>_normed`` column is added,
    * a 0/1 ``<cat>_avg_met`` column flags rows where the value is at least
      the value in ``player_<cat>_mean``.

    Rows containing any missing value are then removed and
    ``player_position`` is one-hot encoded.

    Parameters
    ----------
    rolling_period : int, default 62
        Forwarded to ``extract_nba_rolling_stats`` as the rolling period.
    stat_cats : list of str, default ['pts']
        Stat categories to extract and label. The list is only read, never
        modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned, dummy-encoded modeling data.
    """
    user_params = extract_nba_rolling_stats.UserParameters(rolling_period = rolling_period,
                                                           stat_cats = stat_cats)

    extracted_data = extract_nba_rolling_stats\
                            .run_extract(roll_period = rolling_period,
                                         categories = stat_cats)\
                            .extracted_data

    rolling_data = extract_nba_rolling_stats\
                    .get_rolling_stats(original_data = extracted_data,
                                       p = user_params)

    standardized_data = extract_nba_rolling_stats\
                            .standardize_data(rolling_data,
                                              user_params.stat_cats)

    for cat in stat_cats:
        # Remove upper-tail outliers (Tukey fence on the raw category values)
        third_quart = np.percentile(standardized_data[cat], 75, method='midpoint')
        first_quart = np.percentile(standardized_data[cat], 25, method='midpoint')
        interquartile_range = third_quart - first_quart
        upper = third_quart + (1.5*interquartile_range)

        # Rebind instead of dropping in place so the frame returned by
        # standardize_data is never mutated behind the caller's back.
        outlier_index = standardized_data.index[standardized_data[cat] > upper]
        standardized_data = standardized_data.drop(index = outlier_index)

        # Normalize Data: square root, keeping only the real part so that a
        # negative input does not leave a complex value in the column.
        # Computed once, after the outliers are gone (the earlier duplicate
        # computation was always overwritten by this one).
        standardized_data[cat+'_normed'] = standardized_data[cat].apply(lambda x: (x**(1/2)).real)

        # Categorize if average was met by player. Each category is compared
        # with its own player mean; previously 'fpts' / 'player_fpts_mean'
        # were hard-coded, which mislabels (or fails for) any other category.
        # NOTE(review): assumes the mean column is named 'player_<cat>_mean',
        # matching the former 'player_fpts_mean' -- confirm against
        # extract_nba_rolling_stats.
        player_mean_col = 'player_' + cat + '_mean'
        standardized_data[cat+'_avg_met'] = (standardized_data[cat] >= standardized_data[player_mean_col]).astype(int)

    # Dummify Data
    data_dummied = pd.get_dummies(data = standardized_data.dropna(how = 'any'),
                                  columns = ['player_position'],
                                  dtype = int)

    return data_dummied

In [3]:
# Build the fantasy-points modeling data for each rolling period and store
# every result in its own workbook under ./Data.
for window in (82, 60, 75, 100):
    data = pull_preprocess_data(rolling_period = window,
                                stat_cats = ['fpts'])

    workbook_name = 'modeling_data_%i rp_fpts.xlsx' % window
    excel_path = '/'.join(['.', 'Data', workbook_name])

    with pd.ExcelWriter(excel_path) as writer:
        data.to_excel(writer, index = False)

Calculating stats for ['fpts']
Reading in Stats
Reading in Games
Reading in Players
Calculating individual team rolling statistics...
Shifting data...
Calculating league team rolling statistics...
Shifting data...
Calculating individual player rolling statistics...
Shifting data...
Calculating league player rolling statistics...
Shifting data...
Merging all data...
Standardizing data...
Done!
Calculating individual team rolling statistics...
Shifting data...
Calculating league team rolling statistics...
Shifting data...
Calculating individual player rolling statistics...
Shifting data...
Calculating league player rolling statistics...
Shifting data...
Merging all data...
Standardizing data...
Calculating stats for ['fpts']
Reading in Stats
Reading in Games
Reading in Players
Calculating individual team rolling statistics...
Shifting data...
Calculating league team rolling statistics...
Shifting data...
Calculating individual player rolling statistics...
Shifting data...
Calculating lea

# Pipeline

## Model Metrics

In [4]:
from sklearn.metrics import mean_squared_error, r2_score

In [5]:
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay, roc_auc_score, classification_report

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [6]:
# Seed shared by the classifiers and the randomized search below.
ran_state = 130

In [10]:
# Base (untuned) forest handed to the hyper-parameter searches below.
rf_class = RandomForestClassifier(random_state = ran_state)

<IPython.core.display.Javascript object>

In [7]:
def split_train_test(data):
    """
    Split modeling data chronologically into train and test sets.

    The cut-off is the game date sitting 70% of the way through the sorted,
    distinct game dates on or after 2022-01-01. Every row dated on or before
    the cut-off (rows before 2022 included) goes to the training set; later
    rows go to the test set. Rows with a missing feature value are dropped
    and the targets are aligned to the surviving rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``game_date`` column. Feature columns are selected by
        name (see ``_is_feature``); target columns are those whose name
        contains ``_avg_met``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_test, y_train, y_test)``.
    """
    def _is_feature(col):
        # Non-league '*_mean_stand*' columns, the standardized minutes column,
        # and columns containing 'opponent_team_opp_pos_', '_pg' and '_stand'.
        return ((('_mean_stand' in col) and ('league' not in col))
                or (col == 'min_stand')
                or (('opponent_team_opp_pos_' in col)
                    and ('_pg' in col)
                    and ('_stand' in col)))

    # Only dates from 2022-01-01 onward are used to place the cut-off.
    # The distinct dates are sorted explicitly: unique values come back in
    # order of appearance, so positional indexing on unsorted data would not
    # pick a chronological 70% boundary.
    recent_dates = (data.loc[data['game_date'] >= '2022-01-01', 'game_date']
                        .drop_duplicates()
                        .sort_values())
    split_date = recent_dates.iloc[int(len(recent_dates) * 0.7)]

    train = data[data['game_date'] <= split_date]
    test = data[data['game_date'] > split_date]

    # Independent and Dependent variables
    X_cols = [col for col in data.columns if _is_feature(col)]
    y_cols = [col for col in data.columns if '_avg_met' in col]

    # Training set
    X_train = train[X_cols].dropna()
    y_train = train.loc[X_train.index, y_cols]

    # Test set
    X_test = test[X_cols].dropna()
    y_test = test.loc[X_test.index, y_cols]

    return X_train, X_test, y_train, y_test

## RandomizedSearchCV

In [None]:
# Hyper-parameter space sampled for every rolling period (loop-invariant,
# so it is defined once instead of on every iteration).
ran_num_estimators = [50,100,300]
ran_depth = [5,10,20]
ran_sample_split = [2,3]

ran_rf_params = {'n_estimators':ran_num_estimators,
                 'max_depth':ran_depth,
                 'min_samples_split':ran_sample_split}

# Per-period result tables, concatenated once after the loop.
ran_cv_performance_frames = []

# NOTE(review): the export loop above only writes workbooks for rolling
# periods 82, 60, 75 and 100 -- the files for 15 and 30 must already exist
# in ./Data for this cell to run on a fresh checkout.
for p in [60,15,30,82]:
    print('Rolling period = %i' %p)

    # Read in data
    print('Reading data...')
    excel_path = '/'.join(['.','Data',
                            'modeling_data_%i rp_fpts.xlsx' %p])

    data = pd.read_excel(excel_path,
                         header = 0)

    # Train-test split
    X_train, X_test, y_train, y_test = split_train_test(data)

    print(*X_train.columns, sep = ', ')
    print(*y_train.columns, sep = ', ')

    # Randomized Search CV
    # (the variable keeps its historical 'regress' name because later cells
    # read it; it wraps the classifier `rf_class`)
    print('Performing RandomSearchCV...')
    rf_regress_ranCV = RandomizedSearchCV(rf_class,
                                          ran_rf_params,
                                          n_iter = 9,
                                          cv = 3,
                                          random_state = ran_state,
                                          verbose = 3)

    rf_regress_ranCV.fit(X_train,
                         y_train)

    ran_cv_performance = pd.DataFrame(rf_regress_ranCV.cv_results_).sort_values('rank_test_score')
    ran_cv_performance['rolling_period'] = p

    ran_cv_performance_frames.append(ran_cv_performance)

rf_performances = pd.concat(ran_cv_performance_frames)

# Write RandomSearchCV performance to excel
print('Writing results to Excel...')
ran_cv_performance_excel_path = '/'.join(['.','Data',
                                        'Random Search CV Performance_classifier.xlsx'])

# Append mode needs an existing workbook (and the openpyxl engine) and, by
# default, refuses to overwrite an existing sheet. Create the workbook on the
# first run and replace the sheet on later runs so the cell can be re-run.
if os.path.exists(ran_cv_performance_excel_path):
    ran_writer_options = {'mode': 'a',
                          'engine': 'openpyxl',
                          'if_sheet_exists': 'replace'}
else:
    ran_writer_options = {'mode': 'w'}

with pd.ExcelWriter(ran_cv_performance_excel_path, **ran_writer_options) as writer:
    rf_performances.to_excel(writer,
                             sheet_name = 'FPTS All Stand',
                             index = False)

In [None]:
# Best estimator of the randomized search. The search object is rebound on
# every loop iteration above, so this is the winner for the last rolling
# period processed only.
best_model = rf_regress_ranCV.best_estimator_

In [None]:
# Feature importances of the selected forest (plotted further below).
best_model_importances = best_model.feature_importances_

In [None]:
# Show the hyper-parameters of the winning randomized-search candidate.
rf_regress_ranCV.best_params_

In [None]:
# Label each importance value with the training column it belongs to.
feature_names = list(X_train.columns)

forest_importances = pd.Series(data = best_model_importances,
                               index = feature_names)

In [None]:
# Bar chart of the importances, drawn explicitly on the axes created here.
fig, ax = plt.subplots(figsize = (10,10))
forest_importances.plot.bar(ax = ax)
ax.set(title = "Feature importances using MDI",
       ylabel = "Mean decrease in impurity")
fig.tight_layout()

In [None]:
# Score the selected model on the held-out rows and report it to 2 decimals.
test_score = best_model.score(X_test, y_test)
print('Test score = %.2f' % test_score)

In [None]:
# Hard class predictions and per-class probabilities for the test rows.
y_pred_class = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)

In [None]:
# Confusion matrix of the test labels against the predicted classes.
rf_class_cm = confusion_matrix(y_test,
                                 y_pred_class)

In [None]:
# Render the confusion matrix as a plot.
cm_plot = ConfusionMatrixDisplay(rf_class_cm)
cm_plot.plot()

In [None]:
# ROC AUC on the test set, scored with the second probability column.
roc_auc_score(y_test, y_pred_proba[:, 1])

## GridSearchCV

In [8]:
from sklearn.model_selection import GridSearchCV

In [11]:
# Hyper-parameter grid evaluated for every rolling period (loop-invariant,
# so it is defined once instead of on every iteration).
gs_num_estimators = [35,50,65]
gs_depth = [8,10,12]
gs_sample_split = [3]

gs_rf_params = {'n_estimators':gs_num_estimators,
                'max_depth':gs_depth,
                'min_samples_split':gs_sample_split}

# Per-period result tables, concatenated once after the loop.
gs_cv_performance_frames = []

for p in [82, 60, 75, 100]:
    print('Rolling period = %i' %p)

    # Read in data
    print('Reading data...')
    excel_path = '/'.join(['.','Data',
                            'modeling_data_%i rp_fpts.xlsx' %p])

    data = pd.read_excel(excel_path,
                         header = 0)

    # Train-test split
    X_train, X_test, y_train, y_test = split_train_test(data)

    # Grid Search CV: exhaustive search over the grid above
    print('Performing GridSearchCV...')
    rf_class_gsCV = GridSearchCV(rf_class,
                                 gs_rf_params,
                                 cv = 3,
                                 verbose = 3)

    rf_class_gsCV.fit(X_train,
                      y_train)

    gs_cv_performance = pd.DataFrame(rf_class_gsCV.cv_results_).sort_values('rank_test_score')
    gs_cv_performance['rolling_period'] = p

    gs_cv_performance_frames.append(gs_cv_performance)

gs_performances = pd.concat(gs_cv_performance_frames)

# Write GridSearchCV performance to excel
print('Writing results to Excel...')
gs_cv_performance_excel_path = '/'.join(['.','Data',
                                        'GridSearch CV Performance_classifier.xlsx'])

# Append mode needs an existing workbook (and the openpyxl engine) and, by
# default, refuses to overwrite an existing sheet. Create the workbook on the
# first run and replace the sheet on later runs so the cell can be re-run.
if os.path.exists(gs_cv_performance_excel_path):
    gs_writer_options = {'mode': 'a',
                         'engine': 'openpyxl',
                         'if_sheet_exists': 'replace'}
else:
    gs_writer_options = {'mode': 'w'}

with pd.ExcelWriter(gs_cv_performance_excel_path, **gs_writer_options) as writer:
    gs_performances.to_excel(writer,
                             sheet_name = 'FPTS All Stand Min True',
                             index = False)

Rolling period = 82
Reading data...
Performing GridSearchCV...
Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV 1/3] END max_depth=8, min_samples_split=3, n_estimators=35;, score=0.696 total time=   2.0s
[CV 2/3] END max_depth=8, min_samples_split=3, n_estimators=35;, score=0.730 total time=   1.8s
[CV 3/3] END max_depth=8, min_samples_split=3, n_estimators=35;, score=0.723 total time=   1.9s
[CV 1/3] END max_depth=8, min_samples_split=3, n_estimators=50;, score=0.696 total time=   2.7s
[CV 2/3] END max_depth=8, min_samples_split=3, n_estimators=50;, score=0.731 total time=   2.8s
[CV 3/3] END max_depth=8, min_samples_split=3, n_estimators=50;, score=0.721 total time=   2.7s
[CV 1/3] END max_depth=8, min_samples_split=3, n_estimators=65;, score=0.695 total time=   3.6s
[CV 2/3] END max_depth=8, min_samples_split=3, n_estimators=65;, score=0.729 total time=   3.5s
[CV 3/3] END max_depth=8, min_samples_split=3, n_estimators=65;, score=0.722 total time=   3.5s
[CV 1/3] END 

## Final Params

In [None]:
# For rolling_period = 62
# NOTE(review): the data loaded for the final fit below uses
# final_rolling_period = 60, not 62 -- confirm which rolling period these
# values were tuned for.
final_max_depth = 10
final_min_samples_split = 3
final_n_estimators = 50

In [None]:
# Final classifier configured with the chosen hyper-parameters and the
# notebook-wide seed.
final_rf_class = RandomForestClassifier(max_depth = final_max_depth,
                                         min_samples_split = final_min_samples_split,
                                         n_estimators = final_n_estimators,
                                         random_state = ran_state)

In [None]:
# Rolling period whose exported workbook is used for the final fit.
final_rolling_period = 60

excel_path = '/'.join(['.','Data',
                        'modeling_data_%i rp_fpts.xlsx' %final_rolling_period])

# First spreadsheet row holds the column names.
final_data = pd.read_excel(excel_path,
                     header = 0)

In [None]:
# Re-split using the final data; this rebinds the train/test names left over
# from the search loops above.
X_train, X_test, y_train, y_test = split_train_test(final_data)

In [None]:
# Train the final classifier on the training split.
final_rf_class.fit(X_train,
                     y_train)

### Export Model

In [None]:
# Save trained model
import pickle

# Destination of the pickled classifier, relative to the working directory.
model_file_name = './Models/rf_class_model_fpts.pkl'

In [None]:
# Serialize the fitted classifier to model_file_name in binary mode.
with open(model_file_name, 'wb') as file:
    pickle.dump(final_rf_class,
                file)

## Model Evaluation

In [None]:
# Load the trained model back from disk
import pickle

# Same location the export cell writes to ('./Models/...'); the bare file
# name used here before pointed at the working directory, where the export
# never puts the file.
model_file_name = './Models/rf_class_model_fpts.pkl'

# pickle.load can execute arbitrary code -- only open model files produced
# by this project.
with open(model_file_name, 'rb') as file:
    final_rf_class = pickle.load(file)

In [None]:
# Predicted classes for the test rows as a one-column frame.
y_pred_rf = pd.DataFrame(final_rf_class.predict(X_test),
                      columns = ['fpts_avg_met'])

In [None]:
# Hard class predictions and per-class probabilities from the final model.
y_pred_class = final_rf_class.predict(X_test)
y_pred_proba = final_rf_class.predict_proba(X_test)

In [None]:
# Confusion matrix of the test labels against the final model's predictions.
rf_class_cm = confusion_matrix(y_test,
                                 y_pred_class)

In [None]:
# Render the final model's confusion matrix as a plot.
cm_plot = ConfusionMatrixDisplay(rf_class_cm)
cm_plot.plot()

In [None]:
# Text report of the classification metrics for the test predictions.
print(classification_report(y_test, y_pred_class))

In [None]:
# ROC AUC of the final model, scored with the second probability column.
roc_auc_score(y_test, y_pred_proba[:, 1])

In [None]:
# scikit-learn's confusion matrix has true labels on the rows and predicted
# labels on the columns, so for binary 0/1 labels the flattened order is
# (tn, fp, fn, tp) -- not (tp, fp, fn, tn).
tn, fp, fn, tp = rf_class_cm.ravel()

In [None]:
# Sensitivity: tp over (tp + fn). Specificity: tn over (tn + fp).
sensitivity = tp/(tp+fn)
specificity = tn/(tn+fp)

In [None]:
# Precision: tp over (tp + fp).
precision = tp/(tp+fp)

In [None]:
# Display the sensitivity computed above.
sensitivity

In [None]:
# Display the specificity computed above.
specificity

In [None]:
# Display the precision computed above.
precision