In [1]:
import pandas as pd
import numpy as np
import time
import nba_api
import pandas.io.sql as psql

np.set_printoptions(suppress=True)

from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif, chi2, RFE,f_regression, mutual_info_regression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler

import importnb
#from nbformat import read

import import_ipynb
from database_interaction import nba_db

importing Jupyter notebook from database_interaction.ipynb
successfully connected
connection closed


In [2]:
#Create instance of database
db = nba_db()

In [3]:
#Connect to database
db.connect_to_db()

successfully connected


In [4]:
#Pull players data
player_df = db.execute_query("SELECT * FROM players")

In [5]:
#Close out connection
db.close_connection()

connection closed


In [6]:
#Create dataset for modeling
player_model_df = player_df.drop(['gp_rank',
'w_rank'                       ,
'l_rank'                       ,
'w_pct_rank'                   ,
'min_rank'                     ,
'e_off_rating_rank'            ,
'off_rating_rank'              ,
'sp_work_off_rating_rank'      ,
'e_def_rating_rank'            ,
'def_rating_rank'              ,
'sp_work_def_rating_rank'      ,
'e_net_rating_rank'            ,
'net_rating_rank'              ,
'sp_work_net_rating_rank'      ,
'ast_pct_rank'                 ,
'ast_to_rank'                  ,
'ast_ratio_rank'               ,
'oreb_pct_rank'                ,
'dreb_pct_rank'                ,
'reb_pct_rank'                 ,
'tm_tov_pct_rank'              ,
'e_tov_pct_rank'               ,
'efg_pct_rank'                 ,
'ts_pct_rank'                  ,
'usg_pct_rank'                 ,
'e_usg_pct_rank'               ,
'e_pace_rank'                  ,
'pace_rank'                    ,
'sp_work_pace_rank'            ,
'pie_rank'                     ,
'fgm_rank'                     ,
'fga_rank'                     ,
'fgm_pg_rank'                  ,
'fga_pg_rank'                  ,
'fg_pct_rank'                  ,
'sp_work_off_rating',
'sp_work_def_rating',
'sp_work_net_rating',
'off_rating',
'def_rating',
'net_rating',
 'season', 'player_id', 'player_name', 'team_id', 'team_abbreviation', 'w', 'l', 'position', 'salary'], axis = 1)

In [7]:
#Create Feature and Target DataSets
features = player_model_df.drop(['w_pct'], axis = 1)
target = player_model_df['w_pct']

#Splitting data into train/test
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.30, random_state=0)

#Scaling the data
scaler = MinMaxScaler()


X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns, index = X_train.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns, index = X_test.index)

In [8]:
X_train_scaled

Unnamed: 0,index,age,gp,min,e_off_rating,e_def_rating,e_net_rating,ast_pct,ast_to,ast_ratio,...,pace,pace_per40,sp_work_pace,pie,poss,fgm,fga,fgm_pg,fga_pg,fg_pct
5965,0.484762,0.346154,0.071429,0.059633,0.303333,0.4372,0.421091,0.167,0.095238,0.200,...,0.072309,0.072342,0.072309,0.330333,0.005003,0.000000,0.002301,0.000000,0.025180,0.000
8395,0.682243,0.269231,0.059524,0.318807,0.298000,0.4552,0.410182,0.087,0.063333,0.138,...,0.064627,0.064647,0.064627,0.330333,0.021543,0.006135,0.010124,0.081967,0.133094,0.273
1666,0.135392,0.346154,0.583333,0.240826,0.345333,0.4264,0.449091,0.196,0.117143,0.228,...,0.106042,0.106048,0.106042,0.348500,0.156220,0.080777,0.086516,0.131148,0.136691,0.420
8147,0.662089,0.307692,0.416667,0.332569,0.344667,0.4304,0.447091,0.042,0.023810,0.069,...,0.070940,0.070939,0.070940,0.343667,0.138013,0.062372,0.068569,0.139344,0.147482,0.409
313,0.025437,0.269231,0.595238,0.371560,0.342333,0.4468,0.438182,0.105,0.054762,0.125,...,0.101934,0.101960,0.101934,0.344333,0.240723,0.122699,0.138058,0.196721,0.212230,0.400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4859,0.394880,0.230769,0.928571,0.614679,0.344667,0.4300,0.447091,0.243,0.107619,0.225,...,0.089676,0.089696,0.089676,0.351000,0.592078,0.374233,0.382881,0.377049,0.377698,0.440
3264,0.265258,0.461538,0.833333,0.763761,0.343667,0.4252,0.448545,0.097,0.083810,0.134,...,0.081059,0.081079,0.081059,0.346000,0.646282,0.361963,0.367694,0.409836,0.406475,0.443
9845,0.800081,0.538462,0.595238,0.552752,0.336667,0.4108,0.451455,0.144,0.115714,0.220,...,0.067767,0.067813,0.067767,0.346167,0.322724,0.149284,0.150023,0.237705,0.230216,0.448
10799,0.877611,0.384615,0.369048,0.337156,0.296333,0.4160,0.426909,0.065,0.022857,0.069,...,0.087739,0.087772,0.087739,0.345667,0.131897,0.068507,0.072250,0.172131,0.176259,0.427


In [9]:
#Find the 10 best features using chi2
selected_features_chi = SelectKBest(f_regression, k=9).fit(X_train_scaled, y_train)

#display the scoring for the features
print(pd.DataFrame(selected_features_chi.scores_, index = X_train_scaled.columns)[:50])

#Get a dataframe of selected top 10 features using chi2
cols_chi2 = selected_features_chi.get_support(indices=True)
X_train_kbest_chi2 = X_train_scaled.iloc[:,cols_chi2]
X_test_kbest_chi2 = X_test_scaled.iloc[:,cols_chi2]

                       0
index           0.382108
age           200.858574
gp            179.968785
min             2.489419
e_off_rating  297.694720
e_def_rating   94.544820
e_net_rating  397.646222
ast_pct         3.301176
ast_to         29.166009
ast_ratio       5.687433
oreb_pct        3.850490
dreb_pct        0.261538
reb_pct        11.199610
tm_tov_pct      0.082391
e_tov_pct       0.094109
efg_pct       100.364319
ts_pct         87.209375
usg_pct         0.588839
e_usg_pct       4.113160
e_pace          7.537981
pace            1.468300
pace_per40      1.466696
sp_work_pace    1.468400
pie             8.671427
poss           94.241149
fgm           100.844280
fga            69.674167
fgm_pg         23.404335
fga_pg          3.559516
fg_pct         60.470198


In [10]:
list(X_train_kbest_chi2.columns)

['age',
 'gp',
 'e_off_rating',
 'e_def_rating',
 'e_net_rating',
 'efg_pct',
 'ts_pct',
 'poss',
 'fgm']