In [2]:
import pandas as pd
import numpy as np
import time
import nba_api
np.set_printoptions(suppress=True)

from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif, chi2, RFE,f_regression, mutual_info_regression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler



In [5]:
team_df = pd.read_parquet('../data/team_stats_by_season.parquet')

In [6]:
team_df

Unnamed: 0,TEAM_ID,TEAM_NAME,GP,W,L,W_PCT,MIN,E_OFF_RATING,OFF_RATING,E_DEF_RATING,...,DREB_PCT_RANK,REB_PCT_RANK,TM_TOV_PCT_RANK,EFG_PCT_RANK,TS_PCT_RANK,PACE_RANK,PIE_RANK,CFID,CFPARAMS,SEASON
0,1610612737,Atlanta Hawks,82,43,39,0.524,3941.0,114.0,115.4,112.1,...,12,14,1,8,6,17,14,10,Atlanta Hawks,2021-22
1,1610612738,Boston Celtics,82,51,31,0.622,3981.0,112.1,113.6,104.0,...,16,10,13,9,9,24,2,10,Boston Celtics,2021-22
2,1610612751,Brooklyn Nets,82,44,38,0.537,3951.0,111.0,113.2,109.6,...,30,15,18,11,11,11,11,10,Brooklyn Nets,2021-22
3,1610612766,Charlotte Hornets,82,43,39,0.524,3976.0,111.7,113.6,111.0,...,29,27,9,7,13,5,15,10,Charlotte Hornets,2021-22
4,1610612741,Chicago Bulls,82,46,36,0.561,3946.0,111.0,112.7,111.1,...,7,17,6,10,8,14,20,10,Chicago Bulls,2021-22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
767,1610612760,Seattle SuperSonics,82,57,25,0.695,3956.0,108.2,109.6,99.5,...,21,15,6,6,4,16,3,10,Seattle SuperSonics,1996-97
768,1610612761,Toronto Raptors,82,30,52,0.366,3966.0,101.3,102.9,105.1,...,20,17,22,22,23,14,21,10,Toronto Raptors,1996-97
769,1610612762,Utah Jazz,82,64,18,0.780,3966.0,110.5,112.4,101.6,...,4,4,10,1,1,18,1,10,Utah Jazz,1996-97
770,1610612763,Vancouver Grizzlies,82,14,68,0.171,3956.0,97.6,99.1,108.9,...,29,29,20,26,28,24,29,10,Vancouver Grizzlies,1996-97


In [4]:
team_df.dtypes

TEAM_ID              int64
TEAM_NAME           object
GP                   int64
W                    int64
L                    int64
W_PCT              float64
MIN                float64
E_OFF_RATING       float64
OFF_RATING         float64
E_DEF_RATING       float64
DEF_RATING         float64
E_NET_RATING       float64
NET_RATING         float64
AST_PCT            float64
AST_TO             float64
AST_RATIO          float64
OREB_PCT           float64
DREB_PCT           float64
REB_PCT            float64
TM_TOV_PCT         float64
EFG_PCT            float64
TS_PCT             float64
E_PACE             float64
PACE               float64
PACE_PER40         float64
POSS                 int64
PIE                float64
GP_RANK              int64
W_RANK               int64
L_RANK               int64
W_PCT_RANK           int64
MIN_RANK             int64
OFF_RATING_RANK      int64
DEF_RATING_RANK      int64
NET_RATING_RANK      int64
AST_PCT_RANK         int64
AST_TO_RANK          int64
A

In [5]:
#Create dataset for modeling
team_model_df = team_df.drop(['GP_RANK',              
'W_RANK',
'L_RANK',
'W_PCT_RANK',
'MIN_RANK',
'OFF_RATING_RANK', 
'DEF_RATING_RANK', 
'NET_RATING_RANK', 
'AST_PCT_RANK',
'AST_TO_RANK',   
'AST_RATIO_RANK', 
'OREB_PCT_RANK', 
'DREB_PCT_RANK', 
'REB_PCT_RANK',  
'TM_TOV_PCT_RANK', 
'EFG_PCT_RANK', 
'TS_PCT_RANK', 
'PACE_RANK',
'PIE_RANK'], axis = 1)

In [6]:
#Convert All total statistics into a per game average
totals_columns = ['FG3M', 'FG3A', 'FTM', 'FTA', 'OREB', 'DREB', 'REB', 'AST', 'PF','STL', 'TOV', 'BLK', 'PTS']

for col_name in totals_columns:
    team_model_df[col_name] = team_model_df[col_name] / team_model_df['GP']
    team_model_df.rename(columns = {col_name: col_name+"_per_game"})
    
#Remove rows with now data    
team_model_df = team_model_df[team_model_df['FGA'] != 0]

KeyError: 'FG3M'

In [11]:
#Create Feature and Target DataSets
features = team_model_df.drop(['CFID', 'CFPARAMS','SEASON', 'GP', 'W_PCT', 'W', 'L', 'TEAM_ID', 'TEAM_NAME', 'OFF_RATING', 'E_NET_RATING', 'DEF_RATING'], axis = 1)
target = team_model_df['W_PCT']

#Splitting data into train/test
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.30, random_state=0)

#Scaling the data
scaler = MinMaxScaler()


X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns, index = X_train.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns, index = X_test.index)

In [12]:
X_train_scaled

Unnamed: 0,MIN,E_OFF_RATING,E_DEF_RATING,NET_RATING,AST_PCT,AST_TO,AST_RATIO,OREB_PCT,DREB_PCT,REB_PCT,TM_TOV_PCT,EFG_PCT,TS_PCT,E_PACE,PACE,PACE_PER40,POSS,PIE
337,0.975171,0.572993,0.730088,0.380,0.368627,0.516949,0.467532,0.584906,0.456647,0.431818,0.333333,0.470199,0.457746,0.297170,0.269155,0.269888,0.713814,0.328042
570,0.968963,0.364964,0.265487,0.504,0.435294,0.305085,0.298701,0.264151,0.514451,0.238636,0.406250,0.304636,0.323944,0.386792,0.375246,0.375368,0.754833,0.412698
34,0.664804,0.722628,0.769912,0.484,0.639216,0.644068,0.779221,0.270440,0.953757,0.750000,0.406250,0.814570,0.753521,0.750000,0.750000,0.750737,0.664781,0.513228
770,0.962756,0.317518,0.725664,0.088,0.741176,0.355932,0.610390,0.635220,0.092486,0.181818,0.666667,0.311258,0.274648,0.268868,0.264735,0.265174,0.702499,0.164021
769,0.968963,0.788321,0.402655,0.920,0.901961,0.627119,0.935065,0.654088,0.491329,0.738636,0.572917,0.708609,0.795775,0.339623,0.333497,0.334119,0.735502,0.994709
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
763,0.972067,0.649635,0.575221,0.564,0.729412,0.627119,0.740260,0.459119,0.352601,0.329545,0.406250,0.556291,0.591549,0.500000,0.498527,0.499116,0.803395,0.566138
192,0.968963,0.642336,0.371681,0.708,0.486275,0.635593,0.545455,0.062893,0.549133,0.215909,0.218750,0.662252,0.619718,0.603774,0.613949,0.614614,0.847478,0.693122
629,0.975171,0.379562,0.168142,0.612,0.811765,0.508475,0.597403,0.572327,0.624277,0.681818,0.458333,0.218543,0.267606,0.320755,0.340373,0.341190,0.746110,0.661376
559,0.968963,0.379562,0.663717,0.232,0.286275,0.338983,0.259740,0.572327,0.317919,0.318182,0.375000,0.245033,0.253521,0.396226,0.384578,0.384797,0.756247,0.169312


In [15]:
#Find the 20 best features using chi2
selected_features_chi = SelectKBest(f_regression, k=9).fit(X_train_scaled, y_train)

#display the scoring for the features
print(pd.DataFrame(selected_features_chi.scores_, index = X_train_scaled.columns))

#Get a dataframe of selected top 10 features using chi2
cols_chi2 = selected_features_chi.get_support(indices=True)
X_train_kbest_chi2 = X_train_scaled.iloc[:,cols_chi2]
X_test_kbest_chi2 = X_test_scaled.iloc[:,cols_chi2]

                        0
MIN              0.010942
E_OFF_RATING   301.013596
E_DEF_RATING   230.975726
NET_RATING    8055.338395
AST_PCT         23.918350
AST_TO         111.432454
AST_RATIO      120.611762
OREB_PCT         1.701454
DREB_PCT        28.519027
REB_PCT        119.175568
TM_TOV_PCT      42.530029
EFG_PCT        175.334952
TS_PCT         214.569958
E_PACE           1.836485
PACE             2.008500
PACE_PER40       2.006767
POSS             0.180389
PIE           4590.318928


In [16]:
list(X_test_kbest_chi2.columns)

['E_OFF_RATING',
 'E_DEF_RATING',
 'NET_RATING',
 'AST_TO',
 'AST_RATIO',
 'REB_PCT',
 'EFG_PCT',
 'TS_PCT',
 'PIE']