In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
#import pytz
import scipy
import requests
import warnings
import json
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)

import extract_nba_rolling_stats

# Parameters and Data

In [None]:
eda_rolling_period = 25
eda_stat_cats = ['pts']

eda_params = extract_nba_rolling_stats.UserParameters(rolling_period = eda_rolling_period,
                                                    stat_cats = eda_stat_cats)

In [None]:
extracted_data = extract_nba_rolling_stats\
                        .run_extract(roll_period = eda_rolling_period,
                                     categories = eda_stat_cats)\
                        .extracted_data

rolling_data = extract_nba_rolling_stats\
                .get_rolling_stats(original_data = extracted_data,
                                   p = eda_params
                                  )

standardized_data = extract_nba_rolling_stats\
                        .standardize_data(rolling_data,
                                          eda_params.stat_cats)

In [None]:
standardized_data

In [None]:
def pull_preprocess_data(rolling_period = 62,
                         stat_cats = ['pts']):
    user_params = extract_nba_rolling_stats\
                      .UserParameters(rolling_period = rolling_period,
                                     stat_cats = stat_cats)

    script_run_extract = extract_nba_rolling_stats\
                            .run_extract(roll_period = rolling_period,
                                         categories = stat_cats)
    
    extracted_data = script_run_extract.extracted_data
    
    rolling_data = script_run_extract.rolling_shifted_data

    standardized_data = script_run_extract.standardized_data

    
    for cat in stat_cats:
        standardized_data[cat+'_normed'] = standardized_data[cat].apply(lambda x: (x**(1/2)).real)
        
        # Remove Outliers
        third_quart = np.percentile(standardized_data[cat], 75, method='midpoint')
        first_quart = np.percentile(standardized_data[cat], 25, method='midpoint')

        interquartile_range = third_quart - first_quart
        
        upper = third_quart + (1.5*interquartile_range)

        standardized_data.drop(standardized_data[standardized_data[cat] > upper].index,
                 inplace = True)

        # Categorize if average was met by player
        standardized_data[cat+'_avg_met'] = (standardized_data['fpts'] >= standardized_data['player_fpts_mean']).apply(int)

    # Dummify Data
    data_dummied = pd.get_dummies(data = standardized_data.dropna(how = 'any'),
                              columns= ['player_position'],
                              dtype = int)
    
    return data_dummied

In [None]:
for i in [15,30,60,82]:
    data = pull_preprocess_data(rolling_period = i,
                                stat_cats = ['fpts'])

    excel_path = '/'.join(['.','Data',
                            'modeling_data_%i rp_fpts.xlsx' %i])

    with pd.ExcelWriter(excel_path) as writer:
        data.to_excel(writer,
                      index = False)

# Pipeline

## Model Metrics

In [2]:
from sklearn.metrics import mean_squared_error, r2_score

## Random Forest Regressor

In [3]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

In [4]:
ran_state = 130

In [5]:
rf_regress = RandomForestRegressor(random_state = ran_state)

In [6]:
pipeline = Pipeline(steps = [#('pca',pca),
                             ('rf',rf_regress)])

In [16]:
def split_train_test(data):

    unique_game_dates = data[data['game_date'] >= '2023-01-01']['game_date'].unique()

    # Take 2023 and beyond to adjust for the rolling statistic
    split_date = unique_game_dates[int(len(unique_game_dates) * 0.7)]
    
    train = data[data['game_date'] <= split_date]
    test = data[data['game_date'] > split_date]
        
        # Independent and Dependent variables
    ## Complete
    # ((col == 'min')
    #   | (('opponent_team_opp_' in col)
    #      & ('_stand' in col))
    #  )
    # ((col == 'min')
    #   | (('opponent_team_opp_' in col)
    #      & ('_pg' in col))
    #  )
    ## Try
    # ((('player_' in col)
    #    & ('mean' in col)
    #    & ('league' not in col))
    #   | (('opponent_team_opp_' in col)
    #      & ('_pg' in col))
    #  )
    X_cols = [col for col in data.columns if ((('_mean_stand' in col)
                                               & ('league' not in col))
                                              | (col == 'min_stand')
                                              # | (('team_' in col)
                                              #    & ('_pg_stand' in col)
                                              #    & ('opp_' not in col))
                                              | (('opponent_team_opp_pos_' in col)
                                                 & ('_pg' in col)
                                                & ('_stand' in col))
                                             )
             ]
    y_cols = [col for col in data.columns if (('_normed' in col)
                                              & ('_scaled' not in col))
             ]
    
    # Training set
    X_train = train[X_cols].dropna()
    y_train = train.loc[X_train.index,
                        y_cols]
    
    # Test set
    X_test = test[X_cols].dropna()
    y_test = test.loc[X_test.index,
                    y_cols]
    
    return X_train, X_test, y_train, y_test

## RandomizedSearchCV

In [17]:
rf_performances = pd.DataFrame()

for p in [60,15,30,82]:
    print('Rolling period = %i' %p)

    # Read in data
    print('Reading data...')
    excel_path = '/'.join(['.','Data',
                            'modeling_data_%i rp_fpts.xlsx' %p])

    data = pd.read_excel(excel_path,
                         header = 0)

    # Train-test split
    X_train, X_test, y_train, y_test = split_train_test(data)

    print(*X_train.columns, sep = ', ')
    print(*y_train.columns, sep = ', ')
    
    # Randomized Search CV
    print('Performing RandomSearchCV...')
    ran_num_estimators = [50,100,300]
    ran_depth = [5,10,20]
    ran_sample_split = [2,3]
    
    ran_rf_params = {'rf__n_estimators':ran_num_estimators,
                     'rf__max_depth':ran_depth,
                     'rf__min_samples_split':ran_sample_split}

    rf_regress_ranCV = RandomizedSearchCV(pipeline,
                                          ran_rf_params,
                                          n_iter = 9,
                                         cv = 3,
                                         random_state = ran_state,
                                         verbose = 3)

    rf_regress_ranCV.fit(X_train,
                         y_train)

    ran_cv_performance = pd.DataFrame(rf_regress_ranCV.cv_results_).sort_values('rank_test_score')

    ran_cv_performance['rolling_period'] = p

    rf_performances = pd.concat([rf_performances,
                                 ran_cv_performance])

# Writer RandomSearchCV performance to excel
print('Writing results to Excel...')
ran_cv_performance_excel_path = '/'.join(['.','Data',
                                        'Random Search CV Performance.xlsx'])

with pd.ExcelWriter(ran_cv_performance_excel_path, mode = 'a') as writer:
    rf_performances.to_excel(writer,
                                sheet_name = '%i Rolling Period' %p,
                                index = False)

Rolling period = 60
Reading data...
opponent_team_opp_pos_fpts_pg_stand, fpts_mean_stand, min_stand
fpts_normed
Performing RandomSearchCV...
Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV 1/3] END rf__max_depth=5, rf__min_samples_split=3, rf__n_estimators=100;, score=0.640 total time=  11.8s
[CV 2/3] END rf__max_depth=5, rf__min_samples_split=3, rf__n_estimators=100;, score=0.735 total time=  15.4s
[CV 3/3] END rf__max_depth=5, rf__min_samples_split=3, rf__n_estimators=100;, score=0.744 total time=  10.9s
[CV 1/3] END rf__max_depth=5, rf__min_samples_split=3, rf__n_estimators=50;, score=0.640 total time=   5.2s
[CV 2/3] END rf__max_depth=5, rf__min_samples_split=3, rf__n_estimators=50;, score=0.735 total time=   5.4s
[CV 3/3] END rf__max_depth=5, rf__min_samples_split=3, rf__n_estimators=50;, score=0.744 total time=   6.2s
[CV 1/3] END rf__max_depth=20, rf__min_samples_split=3, rf__n_estimators=50;, score=0.605 total time=  20.6s
[CV 2/3] END rf__max_depth=20, rf__min_

In [18]:
best_model = rf_regress_ranCV.best_estimator_

In [19]:
best_model_importances = best_model.feature_importances_

AttributeError: 'Pipeline' object has no attribute 'feature_importances_'

In [None]:
feature_names = [i for i in X_train.columns]

forest_importances = pd.Series(best_model_importances,
                               index=feature_names)

In [None]:
fig, ax = plt.subplots(figsize = (10,10))
forest_importances.plot.bar()
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

## GridSearchCV

In [21]:
from sklearn.model_selection import GridSearchCV

In [22]:
gs_performances = pd.DataFrame()

for p in [30,15,20]:
    print('Rolling period = %i' %p)

    # Read in data
    print('Reading data...')
    excel_path = '/'.join(['.','Data',
                            'modeling_data_%i rp_fpts.xlsx' %p])

    data = pd.read_excel(excel_path,
                         header = 0)

    # Train-test split
    X_train, X_test, y_train, y_test = split_train_test(data)

    # Randomized Search CV
    print('Performing GridSearchCV...')
    gs_num_estimators = [100,200,300]
    gs_depth = [5]
    gs_sample_split = [2,3]
    
    gs_rf_params = {'n_estimators':gs_num_estimators,
                 'max_depth':gs_depth,
                 'min_samples_split':gs_sample_split}

    rf_regress_gsCV = GridSearchCV(rf_regress,
                                          gs_rf_params,
                                         cv = 3,
                                         verbose = 3)

    rf_regress_gsCV.fit(X_train,
                         y_train)

    gs_cv_performance = pd.DataFrame(rf_regress_gsCV.cv_results_).sort_values('rank_test_score')

    gs_cv_performance['rolling_period'] = p

    gs_performances = pd.concat([gs_performances,
                                 gs_cv_performance])

# Writer RandomSearchCV performance to excel
print('Writing results to Excel...')
gs_cv_performance_excel_path = '/'.join(['.','Data',
                                        'GridSearch CV Performance.xlsx'])

with pd.ExcelWriter(gs_cv_performance_excel_path, mode = 'a') as writer:
    gs_performances.to_excel(writer,
                                sheet_name = 'GridSearchCV Perf',
                                index = False)

Rolling period = 30
Reading data...
Performing GridSearchCV...
Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 1/3] END max_depth=5, min_samples_split=2, n_estimators=100;, score=0.640 total time=   7.3s
[CV 2/3] END max_depth=5, min_samples_split=2, n_estimators=100;, score=0.735 total time=   7.7s
[CV 3/3] END max_depth=5, min_samples_split=2, n_estimators=100;, score=0.745 total time=   7.6s
[CV 1/3] END max_depth=5, min_samples_split=2, n_estimators=200;, score=0.640 total time=  17.6s
[CV 2/3] END max_depth=5, min_samples_split=2, n_estimators=200;, score=0.735 total time=  14.4s
[CV 3/3] END max_depth=5, min_samples_split=2, n_estimators=200;, score=0.745 total time=  14.5s
[CV 1/3] END max_depth=5, min_samples_split=2, n_estimators=300;, score=0.640 total time=  22.3s
[CV 2/3] END max_depth=5, min_samples_split=2, n_estimators=300;, score=0.735 total time=  23.3s
[CV 3/3] END max_depth=5, min_samples_split=2, n_estimators=300;, score=0.745 total time=  23.3s
[CV 

## Final Params

In [23]:
# For rolling_period = 62
final_max_depth = 5
final_min_samples_split = 2
final_n_estimators = 200

In [24]:
final_rf_regress = RandomForestRegressor(max_depth = final_max_depth,
                                         min_samples_split = final_min_samples_split,
                                         n_estimators = final_n_estimators,
                                         random_state = ran_state)

In [25]:
final_rolling_period = 30

excel_path = '/'.join(['.','Data',
                        'modeling_data_%i rp_fpts.xlsx' %final_rolling_period])

final_data = pd.read_excel(excel_path,
                     header = 0)

In [26]:
X_train, X_test, y_train, y_test = split_train_test(final_data)

In [27]:
final_rf_regress.fit(X_train,
                     y_train)

### Export Model

In [28]:
# Save trained model
import pickle

model_file_name = './Models/rf_regress_model_fpts.pkl'

In [29]:
with open(model_file_name, 'wb') as file:
    pickle.dump(final_rf_regress,
                file)

## Model Evaluation

In [None]:
# Save trained model
import pickle

model_file_name = './Models/rf_regress_model_fpts.pkl'

with open(model_file_name, 'rb') as file:
    final_rf_regress = pickle.load(file)

In [None]:
y_pred_rf = pd.DataFrame(final_rf_regress.predict(X_test),
                      columns = ['fpts_normed'])

In [None]:
y_test['fpts'] = y_test['fpts_normed'].apply(lambda x: x**2)
y_pred_rf['fpts'] = y_pred_rf['fpts_normed'].apply(lambda x: x**2)

In [None]:
mse = mean_squared_error(y_test[['fpts']],
                         y_pred_rf[['fpts']],
                        squared = False)

r_squared = r2_score(y_test[['fpts']],
                     y_pred_rf[['fpts']])

print('Root Mean Squared Error = %f' %mse)
print('R Squared = %f' %r_squared)

# Residual plot
residuals_rf = y_test['fpts'].values - y_pred_rf['fpts'].values
print('Sum of residuals = %f' %sum(residuals_rf))

In [None]:
fig, ax = plt.subplots(figsize = (10,10))
sns.residplot(x = y_test['fpts'],
             y = y_pred_rf['fpts'])

In [None]:
final_data_X_test = final_data[final_data.index.isin(X_test.index)]

In [None]:
y_pred_rf.columns = ['_'.join([i,'pred']) for i in y_pred_rf.columns]

In [None]:
y_test.columns = ['_'.join([i,'test']) for i in y_test.columns]

In [None]:
y_pred_rf.index = final_data_X_test.index

In [None]:
final_data_Xy_test = pd.concat([final_data_X_test,
                                y_test,
                                y_pred_rf],
                               axis = 1)

In [None]:
final_data_Xy_test.drop([i for i in final_data_Xy_test.columns if 'player_position' in i],
                        axis = 1)

In [None]:
final_data_Xy_test['fpts_residuals'] = final_data_Xy_test['fpts_test'] - final_data_Xy_test['fpts_pred']

In [None]:
fig, ax = plt.subplots(figsize = (10,10))
sns.distplot(final_data_Xy_test['fpts_residuals'])
print(((final_data_Xy_test['fpts_residuals'])).skew())

In [None]:
# skew of residuals
## All Stand Pos-less
### 0.33158

## No Team Stand Pos-less
### 0.33215