In [269]:
# import other files
%run validation.ipynb

# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

In [270]:
from tensorflow import keras
from tensorflow.keras import layers, regularizers

In [271]:
# import data from "nba_data_collection" stage
all_nba_df = pd.read_csv('./data/all_stats_2000_23.csv')

# clean up the unuseful columns
all_nba_df = all_nba_df.drop(['NICKNAME', 'TEAM_ID', 'NBA_FANTASY_PTS', 'WNBA_FANTASY_PTS', 'NBA_FANTASY_PTS_RANK', 'WNBA_FANTASY_PTS_RANK'], axis=1)

# we are trying to predict the NBA 2022-23 MVP, so isolate this season out
nba_2023 = pd.read_csv('./data/all_stats_2000_23.csv')
nba_2023 = nba_2023[(nba_2023['season_id']== '2022-23')].reset_index(drop=True)
# import team stats to get Game Played stats
team_stats_2022_23_df = pd.read_csv('./data/team_stats_2022_23.csv')
team_stats_2022_23_df = team_stats_2022_23_df[['TEAM_ID', 'GP']]
team_stats_2022_23_df = team_stats_2022_23_df.rename({'GP': 'team_GP'}, axis=1)

nba_2023 = pd.merge(nba_2023, team_stats_2022_23_df, left_on = ['TEAM_ID'], right_on = ['TEAM_ID'], how = 'left')
nba_2023['GP_PCT'] = nba_2023['GP']/nba_2023['team_GP']
nba_2023 = nba_2023.drop(['NICKNAME', 'TEAM_ID', 'GP','NBA_FANTASY_PTS', 'WNBA_FANTASY_PTS', 'NBA_FANTASY_PTS_RANK', 'WNBA_FANTASY_PTS_RANK', 'team_GP'], axis=1)

In [272]:
past_mvps = pd.read_csv('./data/past_mvps_2000_22.csv')
past_mvps = past_mvps[['Player', 'season_id', 'Share']]

# left join all nba stats and mvp results
nba_2000_22 = all_nba_df[(all_nba_df['season_id']!= '2022-23')].reset_index(drop=True)
nba_2000_22 = pd.merge(nba_2000_22, past_mvps, left_on = ['PLAYER_NAME', 'season_id'], right_on = ['Player', 'season_id'], how = 'left')
nba_2000_22['GP_PCT'] = nba_2000_22['GP']/82
nba_2000_22 = nba_2000_22.drop(['Player', 'GP'], axis=1)
nba_2000_22['Share'] = nba_2000_22['Share'].fillna(0.000)

In [273]:
# since there are approx. 450 players in the NBA per season, but not all are qualified for MVP selection. We use past MVP's lowest stats to filter players out
# the stats we are using are the ones that show the impact of players on their teams
# Filter:
# winning percentage: lowest winning percentage on MVP's team in the past is 45.3%
# minutes per game: lowest minutes per game for a MVP is 30.4 min
# GP: lowest game played for an MVP per season is 49 per game
# +/-: lowest +/- for an MVP is 3.1
# offensive rating: lowest offensive rating is 104.7
# defensive rating: highest defensive rating is 110.6
# since we might mis-filter past MVPs out so if they had MVP shares, we still keep the players
# &(nba_2000_22['OFF_RATING'] >= 104.7)&(nba_2000_22['DEF_RATING'] <= 110.6)
filtered_nba_2000_22 = nba_2000_22[((nba_2000_22['W_PCT'] >= 0.453)&
                                    (nba_2000_22['MIN'] >= 30.4)&
                                    (nba_2000_22['GP_PCT'] >= 0.5976)&
                                    (nba_2000_22['PLUS_MINUS'] >= 3.1)&
                                    (nba_2000_22['OFF_RATING'] >= 104.7)&
                                    (nba_2000_22['DEF_RATING'] <= 110.6))|(nba_2000_22['Share']>0)].reset_index(drop=True)
nba_2023 = nba_2023[((nba_2023['W_PCT'] >= 0.453)&
                     (nba_2023['MIN'] >= 30.4)&
                     (nba_2023['GP_PCT'] >= 0.5976)&
                     (nba_2023['PLUS_MINUS'] >= 3.1))].reset_index(drop=True)
nba_2023_model = nba_2023.drop(['PLAYER_ID', 'PLAYER_NAME', 'TEAM_ABBREVIATION', 'season_id'], axis = 1)

In [274]:
# split training and testing data set 
# training: 2000-2018, testing: 2019-2022, predict: 2023
# training
nba_2000_18 = filtered_nba_2000_22.loc[(filtered_nba_2000_22['season_id'] != '2019-20') 
                         & (filtered_nba_2000_22['season_id'] != '2020-21')
                         & (filtered_nba_2000_22['season_id'] != '2021-22')]

share_2000_18 = nba_2000_18[['Share']].values.ravel()
nba_2000_18_model = nba_2000_18.drop(['Share', 'PLAYER_ID', 'PLAYER_NAME', 'TEAM_ABBREVIATION', 'season_id'], axis = 1)

# testing
nba_2019_22 = filtered_nba_2000_22.loc[(filtered_nba_2000_22['season_id'] == '2019-20') 
                         | (filtered_nba_2000_22['season_id'] == '2020-21')
                         | (filtered_nba_2000_22['season_id'] == '2021-22')]
share_2019_22 = nba_2019_22[['Share']].values.ravel()
nba_2019_22_model = nba_2019_22.drop(['Share', 'PLAYER_ID', 'PLAYER_NAME', 'TEAM_ABBREVIATION', 'season_id'], axis = 1)

In [275]:
# Assuming that X and y are already defined and X is a pandas DataFrame
# StandardScaler
scaler = StandardScaler()
scaler.fit(nba_2000_18_model)
nba_2000_18_scaled = scaler.transform(nba_2000_18_model)
nba_2019_22_scaled = scaler.transform(nba_2019_22_model)
nba_2023_scaled = scaler.transform(nba_2023_model)

# SelectKBest
k = 20 # number of top features to select
selector = SelectKBest(f_regression, k=k)
selector.fit(nba_2000_18_scaled, share_2000_18)
nba_2000_18_new = selector.transform(nba_2000_18_scaled)
nba_2019_22_new = selector.transform(nba_2019_22_scaled)
nba_2023_new = selector.transform(nba_2023_scaled)

# Get the p-values and F-scores of the selected features
p_values = selector.pvalues_
f_scores = selector.scores_
selected_features = nba_2000_18_model.columns[selector.get_support()]

# Print the selected feature names, F-scores and p-values
# for feature, f_score, p_value in zip(selected_features, f_scores, p_values):
#    print(f"Feature: {feature}, F-score: {f_score:.2f}, p-value: {p_value:.2g}")

In [276]:
# Support Vector Regression 
param_grid = {'C': [0.001,0.01,0.1,0.5,1,2,5,10],
             'kernel': ['linear','rbf','poly'],
             'gamma': ['scale','auto'],
             'degree': [2,3,4],
             'epsilon': [0.1,0.5,1]}
svr_model = SVR()
grid = GridSearchCV(svr_model, param_grid)
grid.fit(nba_2000_18_new, share_2000_18)

# Validation
model_svr.fit(nba_2000_18_new, share_2000_18)
y_pred_svr = model_svr.predict(nba_2019_22_new)


In [277]:
# Gradient Boosting
param_grid = {'n_estimators': [10,20,30,40,50],
                             'learning_rate': [0.01,0.05,0.1,0.2,0.5],
                             'max_depth': [3,4,5]}
gb_model = GradientBoostingRegressor()
grid = GridSearchCV(gb_model, param_grid)
grid.fit(nba_2000_18_new, share_2000_18)
model_gb = GradientBoostingRegressor(**grid.best_params_)


# Validation
model_gb.fit(nba_2000_18_new, share_2000_18)
y_pred_gb = model_gb.predict(nba_2019_22_new)

Unnamed: 0,Season,Model,RMSE,R_Square
0,2019_22,SVR,0.173,0.442
1,2019_22,Gradient Boosting,0.104,0.797
2,2019_22,Random Forest,0.127,0.7
3,2019_22,NN,0.171,0.452
4,2019_22,Gradient Boosting,0.099,0.796


In [278]:
# Random Forest
param_grid = {'n_estimators': [15,25,50,64,100,150,200],
                             'max_features': [2,3,4,5],
                             'bootstrap': [True,False],
                             'oob_score': [True,False]}
rfc = RandomForestRegressor()
grid = GridSearchCV(rfc, param_grid)
grid.fit(nba_2000_18_new, share_2000_18)
model_rf = RandomForestRegressor(**grid.best_params_)
best_params.append(grid.best_params_)

# Validation
model_rf.fit(nba_2000_18_new, share_2000_18)
y_pred_rf = model.predict(nba_2019_22_new)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
140 fits failed out of a total of 560.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
140 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/frankhung/opt/anaconda3/envs/nba/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/frankhung/opt/anaconda3/envs/nba/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 434, in fit
    raise ValueError("Out of bag estimation only available if bootstrap=True")
ValueError: Out of bag estimation only available if bootstrap=True

 0.37927579 0.35503546 0.37898292 0.38764472 0.3782602



In [279]:
# NN
# Define the neural network architecture
df_shape = nba_2000_18_new.shape[1]

# layers of NN
model_nn = keras.models.Sequential([keras.layers.Dense(16, activation='relu',input_shape=(df_shape,), kernel_regularizer=regularizers.l2(0.001))])
model_nn.add(layers.Dense(1, activation='linear'))

# Compile the model
model_nn.compile(optimizer='adam', loss='mse')

# Train the model
model_nn.fit(nba_2000_18_new, share_2000_18, epochs=100)
y_pred_nn = model_nn.predict(nba_2019_22_new)
y_pred_nn = y_pred_nn.reshape((nba_2019_22_new.shape[0],))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [280]:
# set up a metrics dataframe to contain different model results
model_list = ['SVR', ]
metrics = pd.DataFrame()
metrics = testing_metrics(share_2019_22, y_pred_svr, metrics, 'SVR', '2019_22')
metrics = testing_metrics(share_2019_22, y_pred_gb, metrics,'Gradient Boosting', '2019_22')
metrics = testing_metrics(share_2019_22, y_pred_rf, metrics, 'Random Forest', '2019_22')
metrics = testing_metrics(share_2019_22, y_pred_nn, metrics, 'NN', '2019_22')
metrics

ValueError: Found input variables with inconsistent numbers of samples: [78, 89]

In [281]:
results = nba_2023[['PLAYER_NAME', 'TEAM_ABBREVIATION', 'season_id']]
y_pred_nn = model_nn.predict(nba_2023_new)
y_pred_nn = y_pred_nn.reshape((nba_2023_new.shape[0],))
results['Predicted MVP Share'] = pd.Series(y_pred_nn).values
results = results.loc[(results['season_id'] == '2022-23')]
results_sorted = results.sort_values(by='Predicted MVP Share',
                                                ascending=False).reset_index(drop=True)
results_sorted[0:3]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['Predicted MVP Share'] = pd.Series(y_pred_nn).values


Unnamed: 0,PLAYER_NAME,TEAM_ABBREVIATION,season_id,Predicted MVP Share
0,Jarrett Allen,CLE,2022-23,-0.101725


In [None]:
model.fit(nba_2000_18_new, share_2000_18)
y_pred = model.predict(nba_2023_new)
initial_results = nba_2023[['PLAYER_NAME', 'TEAM_ABBREVIATION', 'season_id']]
results_sorted.head(30)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['Predicted MVP Share'] = pd.Series(y_pred_nn).values


Unnamed: 0,PLAYER_NAME,TEAM_ABBREVIATION,season_id,Predicted MVP Share
0,Al Horford,BOS,2022-23,0.216753
1,Anthony Davis,LAL,2022-23,0.292676
2,Darius Garland,CLE,2022-23,0.125151
3,Desmond Bane,MEM,2022-23,0.111674
4,Devin Booker,PHX,2022-23,0.235296
5,Domantas Sabonis,SAC,2022-23,0.231568
6,Donovan Mitchell,CLE,2022-23,0.262479
7,Draymond Green,GSW,2022-23,0.125431
8,Evan Mobley,CLE,2022-23,0.0548
9,Giannis Antetokounmpo,MIL,2022-23,0.580605


In [None]:
# Validation
model.fit(nba_2000_18_new, share_2000_18)
y_pred = model.predict(nba_2019_22_new)
test_result = testing_metrics(share_2019_22, y_pred, metrics, 'Random Forest', '2019_22')
test_result

In [None]:
metrics

In [26]:
y_pred.shape

(78,)

In [34]:
results = initial_results.copy()
results['Predicted MVP Share'] = pd.Series(y_pred_nn).values

In [35]:
results

Unnamed: 0,PLAYER_NAME,TEAM_ABBREVIATION,season_id,Predicted MVP Share
566,Anthony Davis,LAL,2019-20,0.190171
567,Bam Adebayo,MIA,2019-20,0.038179
568,Bojan Bogdanovic,UTA,2019-20,-0.051292
569,Chris Paul,OKC,2019-20,0.071017
570,Damian Lillard,POR,2019-20,0.243400
...,...,...,...,...
639,Nikola Jokic,DEN,2021-22,0.465135
640,Pascal Siakam,TOR,2021-22,0.071968
641,Rudy Gobert,UTA,2021-22,0.118159
642,Stephen Curry,GSW,2021-22,0.192505
