In [2]:
import pandas as pd
import numpy as np
import time
import nba_api
#Print numpy floats in fixed-point notation instead of scientific notation
np.set_printoptions(suppress=True)
import pandas.io.sql as psql

from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif, chi2, RFE,f_regression, mutual_info_regression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler


import import_ipynb
from database_interaction import nba_db


In [3]:
#Create an instance of the nba_db helper imported from database_interaction
# NOTE(review): presumably this only constructs the helper and does not open a connection yet
# (connecting is a separate call) -- confirm in database_interaction
db = nba_db()

In [4]:
#Open the database connection on the helper
# (the recorded output for this cell is "successfully connected")
db.connect_to_db()

successfully connected


In [5]:
#Pull every row and column of the teams table
# NOTE(review): team_df is used as a pandas DataFrame later on (.drop(..., axis = 1));
# presumably execute_query returns one -- confirm in database_interaction
team_df = db.execute_query("SELECT * FROM teams")

In [6]:
#Close the connection now that the teams data has been pulled into team_df
# (the recorded output for this cell is "connection closed")
db.close_connection()

connection closed


In [7]:
#Build the modeling dataset: team_df without its "*_rank" columns and salary-cap columns.
#The rank column names are generated from their stat stems so the list is easier to audit.
team_model_df = team_df.drop(
    [stat + '_rank' for stat in (
        'gp', 'w', 'l', 'w_pct', 'min',
        'off_rating', 'def_rating', 'net_rating',
        'ast_pct', 'ast_to', 'ast_ratio',
        'oreb_pct', 'dreb_pct', 'reb_pct',
        'tm_tov_pct', 'efg_pct', 'ts_pct',
        'pace', 'pie',
    )] + ['salary_cap', 'salary_cap_adj'],
    axis='columns',
)

In [8]:
# #Convert All total statistics into a per game average
# totals_columns = ['FG3M', 'FG3A', 'FTM', 'FTA', 'OREB', 'DREB', 'REB', 'AST', 'PF','STL', 'TOV', 'BLK', 'PTS']

# for col_name in totals_columns:
#     team_model_df[col_name] = team_model_df[col_name] / team_model_df['GP']
#     team_model_df.rename(columns = {col_name: col_name+"_per_game"})
    
# #Remove rows with no data
# team_model_df = team_model_df[team_model_df['FGA'] != 0]

In [9]:
#Regression target: the 'w_pct' column
target = team_model_df['w_pct']

#Predictors: every remaining column once the ones listed below are removed
# NOTE(review): the recorded outputs further down show an 'index' column among the features;
# it looks like a row identifier carried over from the table -- confirm whether it should be dropped too
features = team_model_df.drop(
    [
        'season', 'gp', 'w_pct', 'w', 'l',
        'team_id', 'team_name',
        'off_rating', 'e_net_rating', 'def_rating',
    ],
    axis='columns',
)

#70/30 train/test split; random_state is fixed so the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.30,
    random_state=0,
)

#Min-max scaling: the scaler is fitted on the training rows only and then applied to both sets,
#so the test rows do not influence the fitted min/max values
scaler = MinMaxScaler().fit(X_train)

#Wrap the scaled arrays back into DataFrames that keep the original row index and column names
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

In [10]:
#Display the scaled training features to inspect the result of the scaling step
X_train_scaled

Unnamed: 0,index,min,e_off_rating,e_def_rating,net_rating,ast_pct,ast_to,ast_ratio,oreb_pct,dreb_pct,reb_pct,tm_tov_pct,efg_pct,ts_pct,e_pace,pace,pace_per40,poss,pie
337,0.437095,0.975171,0.572993,0.730088,0.380,0.368627,0.516949,0.467532,0.584906,0.456647,0.431818,0.333333,0.470199,0.457746,0.297170,0.269155,0.269888,0.713814,0.328042
570,0.739300,0.968963,0.364964,0.265487,0.504,0.435294,0.305085,0.298701,0.264151,0.514451,0.238636,0.406250,0.304636,0.323944,0.386792,0.375246,0.375368,0.754833,0.412698
34,0.044099,0.664804,0.722628,0.769912,0.484,0.639216,0.644068,0.779221,0.270440,0.953757,0.750000,0.406250,0.814570,0.753521,0.750000,0.750000,0.750737,0.664781,0.513228
770,0.998703,0.962756,0.317518,0.725664,0.088,0.741176,0.355932,0.610390,0.635220,0.092486,0.181818,0.666667,0.311258,0.274648,0.268868,0.264735,0.265174,0.702499,0.164021
769,0.997406,0.968963,0.788321,0.402655,0.920,0.901961,0.627119,0.935065,0.654088,0.491329,0.738636,0.572917,0.708609,0.795775,0.339623,0.333497,0.334119,0.735502,0.994709
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
763,0.989624,0.972067,0.649635,0.575221,0.564,0.729412,0.627119,0.740260,0.459119,0.352601,0.329545,0.406250,0.556291,0.591549,0.500000,0.498527,0.499116,0.803395,0.566138
192,0.249027,0.968963,0.642336,0.371681,0.708,0.486275,0.635593,0.545455,0.062893,0.549133,0.215909,0.218750,0.662252,0.619718,0.603774,0.613949,0.614614,0.847478,0.693122
629,0.815824,0.975171,0.379562,0.168142,0.612,0.811765,0.508475,0.597403,0.572327,0.624277,0.681818,0.458333,0.218543,0.267606,0.320755,0.340373,0.341190,0.746110,0.661376
559,0.725032,0.968963,0.379562,0.663717,0.232,0.286275,0.338983,0.259740,0.572327,0.317919,0.318182,0.375000,0.245033,0.253521,0.396226,0.384578,0.384797,0.756247,0.169312


In [11]:
#Score every scaled feature against the target with f_regression and keep the 9 best.
#Despite the "chi"/"chi2" in the variable names, chi2 is not the scoring function used here.
selected_features_chi = SelectKBest(score_func=f_regression, k=9)
selected_features_chi.fit(X_train_scaled, y_train)

#Print the per-feature scores, one row per feature name
print(pd.DataFrame(selected_features_chi.scores_, index = X_train_scaled.columns))

#Positional indices of the 9 selected columns, used to subset both the train and test frames
cols_chi2 = selected_features_chi.get_support(indices=True)
X_train_kbest_chi2, X_test_kbest_chi2 = (
    frame.iloc[:, cols_chi2] for frame in (X_train_scaled, X_test_scaled)
)

                        0
index            0.068509
min              0.010942
e_off_rating   301.013596
e_def_rating   230.975726
net_rating    8055.338395
ast_pct         23.918350
ast_to         111.432454
ast_ratio      120.611762
oreb_pct         1.701454
dreb_pct        28.519027
reb_pct        119.175568
tm_tov_pct      42.530029
efg_pct        175.334952
ts_pct         214.569958
e_pace           1.836485
pace             2.008500
pace_per40       2.006767
poss             0.180389
pie           4590.318928


In [12]:
#List the names of the columns kept by the feature selection step
X_test_kbest_chi2.columns.tolist()

['e_off_rating',
 'e_def_rating',
 'net_rating',
 'ast_to',
 'ast_ratio',
 'reb_pct',
 'efg_pct',
 'ts_pct',
 'pie']