In [41]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [42]:
data=pd.read_csv('nba_final.csv')

In [43]:
data.head()

Unnamed: 0,Rk,Player.x,Player_ID,Pos1,Pos2,Age,Tm,G,GS,MP,...,Conference,Role,Fvot,FRank,Pvot,PRank,Mvot,MRank,Score,Play
0,170,A.J. Hammons,hammoaj01,C,,24,DAL,22,0,7.4,...,West,Front,786,123,,,,,83.5,No
1,58,Aaron Brooks,brookaa01,PG,,32,IND,65,0,13.8,...,Est,Back,2474,64,,,,,48.2,No
2,157,Aaron Gordon,gordoaa01,SF,,21,ORL,80,72,28.7,...,Est,Front,22774,29,,,,,40.0,No
3,352,Adreian Payne,paynead01,PF,,25,MIN,18,0,7.5,...,West,Front,861,120,1.0,52.0,,,75.5,No
4,10,Al-Farouq Aminu,aminual01,PF,,26,POR,61,25,29.1,...,West,Front,4971,69,7.0,23.0,,,42.8,No


In [44]:
data.shape

(1408, 45)

In [None]:
data.dtypes

Rk              int64
Player.x       object
Player_ID      object
Pos1           object
Pos2           object
Age             int64
Tm             object
G               int64
GS              int64
MP            float64
FG            float64
FGA           float64
FG.           float64
X3P           float64
X3PA          float64
X3P.          float64
X2P           float64
X2PA          float64
X2P.          float64
eFG.          float64
FT            float64
FTA           float64
FT.           float64
ORB           float64
DRB           float64
TRB           float64
AST           float64
STL           float64
BLK           float64
TOV           float64
PF            float64
PTS           float64
Salary        float64
mean_views    float64
Season         object
Conference     object
Role           object
Fvot            int64
FRank           int64
Pvot          float64
PRank         float64
Mvot          float64
MRank         float64
Score         float64
Play           object
dtype: obj

In [46]:
data.isnull().sum()

Rk               0
Player.x         0
Player_ID        0
Pos1             0
Pos2          1396
Age              0
Tm               0
G                0
GS               0
MP               0
FG               0
FGA              0
FG.              4
X3P              0
X3PA             0
X3P.            99
X2P              0
X2PA             0
X2P.            15
eFG.             4
FT               0
FTA              0
FT.             47
ORB              0
DRB              0
TRB              0
AST              0
STL              0
BLK              0
TOV              0
PF               0
PTS              0
Salary          62
mean_views     138
Season           0
Conference       0
Role             0
Fvot             0
FRank            0
Pvot           159
PRank          159
Mvot           404
MRank          404
Score            0
Play             0
dtype: int64

In [47]:
data['Pos2']=data['Pos2'].fillna('None') # Player here do not have postion 2 to play. They only play at position 1. So fill with None. 

In [48]:
#data['Pos2']=data['Pos2'].replace(np.nan,'None')

In [49]:
#data['Salary'].fillna(data['Salary'].mean()) We can fill with mean/median only if there is less variation. Here we cannot fill with mean/median. We can perform KNN imputation

In [50]:
data['Salary']

0              NaN
1        2700000.0
2        4351320.0
3        2022240.0
4        7680965.0
           ...    
1403     3628920.0
1404    19500000.0
1405       77250.0
1406     2393887.0
1407     2615160.0
Name: Salary, Length: 1408, dtype: float64

In [51]:
data=data.fillna(0) # You can fill salary  with KNN Imputation.

In [52]:
data.isnull().sum() 

Rk            0
Player.x      0
Player_ID     0
Pos1          0
Pos2          0
Age           0
Tm            0
G             0
GS            0
MP            0
FG            0
FGA           0
FG.           0
X3P           0
X3PA          0
X3P.          0
X2P           0
X2PA          0
X2P.          0
eFG.          0
FT            0
FTA           0
FT.           0
ORB           0
DRB           0
TRB           0
AST           0
STL           0
BLK           0
TOV           0
PF            0
PTS           0
Salary        0
mean_views    0
Season        0
Conference    0
Role          0
Fvot          0
FRank         0
Pvot          0
PRank         0
Mvot          0
MRank         0
Score         0
Play          0
dtype: int64

In [53]:
data.duplicated().sum()

0

In [54]:
data.drop_duplicates(inplace=True)

In [55]:
data.drop(columns=['Player.x','Player_ID'],inplace=True)# Target column is play

In [56]:
obj_cols=data.select_dtypes('object').columns

In [57]:
obj_cols

Index(['Pos1', 'Pos2', 'Tm', 'Season', 'Conference', 'Role', 'Play'], dtype='object')

In [17]:
data.dtypes

Rk              int64
Pos1           object
Pos2           object
Age             int64
Tm             object
G               int64
GS              int64
MP            float64
FG            float64
FGA           float64
FG.           float64
X3P           float64
X3PA          float64
X3P.          float64
X2P           float64
X2PA          float64
X2P.          float64
eFG.          float64
FT            float64
FTA           float64
FT.           float64
ORB           float64
DRB           float64
TRB           float64
AST           float64
STL           float64
BLK           float64
TOV           float64
PF            float64
PTS           float64
Salary        float64
mean_views    float64
Season         object
Conference     object
Role           object
Fvot            int64
FRank           int64
Pvot          float64
PRank         float64
Mvot          float64
MRank         float64
Score         float64
Play           object
dtype: object

In [18]:
obj_cols

Index(['Pos1', 'Pos2', 'Tm', 'Season', 'Conference', 'Role', 'Play'], dtype='object')

In [19]:
data.select_dtypes('number')

Unnamed: 0,Rk,Age,G,GS,MP,FG,FGA,FG.,X3P,X3PA,...,PTS,Salary,mean_views,Fvot,FRank,Pvot,PRank,Mvot,MRank,Score
0,170,24,22,0,7.4,0.8,1.9,0.405,0.2,0.5,...,2.2,,3.320000,786,123,,,,,83.5
1,58,32,65,0,13.8,1.9,4.6,0.403,0.7,2.0,...,5.0,2700000.0,11.155738,2474,64,,,,,48.2
2,157,21,80,72,28.7,4.9,10.8,0.454,1.0,3.3,...,12.7,4351320.0,1713.986339,22774,29,,,,,40.0
3,352,25,18,0,7.5,1.3,3.0,0.426,0.2,0.8,...,3.5,2022240.0,205.855191,861,120,1.0,52.0,,,75.5
4,10,26,61,25,29.1,3.0,7.6,0.393,1.1,3.5,...,8.7,7680965.0,604.341530,4971,69,7.0,23.0,,,42.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1403,109,21,77,0,17.6,2.5,5.2,0.473,0.5,1.6,...,6.6,3628920.0,470.320548,16287,74,1.0,52.0,0.0,8.0,86.8
1404,294,23,63,62,34.5,8.4,18.0,0.467,1.9,5.1,...,23.7,19500000.0,1415.268493,519746,7,17.0,8.0,0.0,8.0,36.5
1405,308,26,1,0,4.0,0.0,1.0,0.000,0.0,1.0,...,0.0,77250.0,,136,114,0.0,61.0,0.0,8.0,132.2
1406,393,34,68,3,12.9,1.3,2.8,0.440,0.0,0.1,...,3.9,2393887.0,1183.576503,21525,50,2.0,41.0,0.0,7.0,69.8


In [20]:
num_cols=data.select_dtypes('number').columns

In [21]:
num_cols

Index(['Rk', 'Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG.', 'X3P', 'X3PA', 'X3P.',
       'X2P', 'X2PA', 'X2P.', 'eFG.', 'FT', 'FTA', 'FT.', 'ORB', 'DRB', 'TRB',
       'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Salary', 'mean_views', 'Fvot',
       'FRank', 'Pvot', 'PRank', 'Mvot', 'MRank', 'Score'],
      dtype='object')

In [22]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
le=LabelEncoder()
#ohe=OneHotEncoder()

In [98]:
#data['Pos1]=le.fit_ransform(data['Pos1']
for col in obj_cols:
    data[col]=le.fit_transform(data[col])

In [99]:
data[obj_cols]

Unnamed: 0,Pos1,Pos2,Tm,Season,Conference,Role,Play
0,0,1,6,0,1,1,0
1,2,1,11,0,0,0,0
2,3,1,21,0,0,1,0
3,1,1,17,0,1,1,0
4,1,1,24,0,1,1,0
...,...,...,...,...,...,...,...
1403,0,1,24,2,1,1,0
1404,4,1,3,2,0,0,0
1405,4,1,8,2,0,0,0
1406,0,1,8,2,0,1,0


In [100]:
data[obj_cols]

Unnamed: 0,Pos1,Pos2,Tm,Season,Conference,Role,Play
0,0,1,6,0,1,1,0
1,2,1,11,0,0,0,0
2,3,1,21,0,0,1,0
3,1,1,17,0,1,1,0
4,1,1,24,0,1,1,0
...,...,...,...,...,...,...,...
1403,0,1,24,2,1,1,0
1404,4,1,3,2,0,0,0
1405,4,1,8,2,0,0,0
1406,0,1,8,2,0,1,0


In [101]:
X=data.drop(columns='Play')
y=data['Play']


In [102]:
X

Unnamed: 0,Rk,Pos1,Pos2,Age,Tm,G,GS,MP,FG,FGA,...,Season,Conference,Role,Fvot,FRank,Pvot,PRank,Mvot,MRank,Score
0,170,0,1,24,6,22,0,7.4,0.8,1.9,...,0,1,1,786,123,0.0,0.0,0.0,0.0,83.5
1,58,2,1,32,11,65,0,13.8,1.9,4.6,...,0,0,0,2474,64,0.0,0.0,0.0,0.0,48.2
2,157,3,1,21,21,80,72,28.7,4.9,10.8,...,0,0,1,22774,29,0.0,0.0,0.0,0.0,40.0
3,352,1,1,25,17,18,0,7.5,1.3,3.0,...,0,1,1,861,120,1.0,52.0,0.0,0.0,75.5
4,10,1,1,26,24,61,25,29.1,3.0,7.6,...,0,1,1,4971,69,7.0,23.0,0.0,0.0,42.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1403,109,0,1,21,24,77,0,17.6,2.5,5.2,...,2,1,1,16287,74,1.0,52.0,0.0,8.0,86.8
1404,294,4,1,23,3,63,62,34.5,8.4,18.0,...,2,0,0,519746,7,17.0,8.0,0.0,8.0,36.5
1405,308,4,1,26,8,1,0,4.0,0.0,1.0,...,2,0,0,136,114,0.0,61.0,0.0,8.0,132.2
1406,393,0,1,34,8,68,3,12.9,1.3,2.8,...,2,0,1,21525,50,2.0,41.0,0.0,7.0,69.8


In [88]:
data['Play'].value_counts() # Imbalance data you can use stratify or tree based algorithm. Do not use linear algorithms.

0    1335
1      73
Name: Play, dtype: int64

In [107]:
#Split data
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,stratify=y,random_state=42)

In [108]:
X

Unnamed: 0,Rk,Pos1,Pos2,Age,Tm,G,GS,MP,FG,FGA,...,Season,Conference,Role,Fvot,FRank,Pvot,PRank,Mvot,MRank,Score
0,170,0,1,24,6,22,0,7.4,0.8,1.9,...,0,1,1,786,123,0.0,0.0,0.0,0.0,83.5
1,58,2,1,32,11,65,0,13.8,1.9,4.6,...,0,0,0,2474,64,0.0,0.0,0.0,0.0,48.2
2,157,3,1,21,21,80,72,28.7,4.9,10.8,...,0,0,1,22774,29,0.0,0.0,0.0,0.0,40.0
3,352,1,1,25,17,18,0,7.5,1.3,3.0,...,0,1,1,861,120,1.0,52.0,0.0,0.0,75.5
4,10,1,1,26,24,61,25,29.1,3.0,7.6,...,0,1,1,4971,69,7.0,23.0,0.0,0.0,42.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1403,109,0,1,21,24,77,0,17.6,2.5,5.2,...,2,1,1,16287,74,1.0,52.0,0.0,8.0,86.8
1404,294,4,1,23,3,63,62,34.5,8.4,18.0,...,2,0,0,519746,7,17.0,8.0,0.0,8.0,36.5
1405,308,4,1,26,8,1,0,4.0,0.0,1.0,...,2,0,0,136,114,0.0,61.0,0.0,8.0,132.2
1406,393,0,1,34,8,68,3,12.9,1.3,2.8,...,2,0,1,21525,50,2.0,41.0,0.0,7.0,69.8


In [109]:
data.head()

Unnamed: 0,Rk,Pos1,Pos2,Age,Tm,G,GS,MP,FG,FGA,...,Conference,Role,Fvot,FRank,Pvot,PRank,Mvot,MRank,Score,Play
0,170,0,1,24,6,22,0,7.4,0.8,1.9,...,1,1,786,123,0.0,0.0,0.0,0.0,83.5,0
1,58,2,1,32,11,65,0,13.8,1.9,4.6,...,0,0,2474,64,0.0,0.0,0.0,0.0,48.2,0
2,157,3,1,21,21,80,72,28.7,4.9,10.8,...,0,1,22774,29,0.0,0.0,0.0,0.0,40.0,0
3,352,1,1,25,17,18,0,7.5,1.3,3.0,...,1,1,861,120,1.0,52.0,0.0,0.0,75.5,0
4,10,1,1,26,24,61,25,29.1,3.0,7.6,...,1,1,4971,69,7.0,23.0,0.0,0.0,42.8,0


In [110]:
X_train.dtypes

Rk              int64
Pos1            int64
Pos2            int64
Age             int64
Tm              int64
G               int64
GS              int64
MP            float64
FG            float64
FGA           float64
FG.           float64
X3P           float64
X3PA          float64
X3P.          float64
X2P           float64
X2PA          float64
X2P.          float64
eFG.          float64
FT            float64
FTA           float64
FT.           float64
ORB           float64
DRB           float64
TRB           float64
AST           float64
STL           float64
BLK           float64
TOV           float64
PF            float64
PTS           float64
Salary        float64
mean_views    float64
Season          int64
Conference      int64
Role            int64
Fvot            int64
FRank           int64
Pvot          float64
PRank         float64
Mvot          float64
MRank         float64
Score         float64
dtype: object

In [111]:
data.describe()

Unnamed: 0,Rk,Pos1,Pos2,Age,Tm,G,GS,MP,FG,FGA,...,Conference,Role,Fvot,FRank,Pvot,PRank,Mvot,MRank,Score,Play
count,1408.0,1408.0,1408.0,1408.0,1408.0,1408.0,1408.0,1408.0,1408.0,1408.0,...,1408.0,1408.0,1408.0,1408.0,1408.0,1408.0,1408.0,1408.0,1408.0,1408.0
mean,257.701705,2.020597,1.014915,26.139205,14.424716,54.099432,25.90696,20.162713,3.262429,7.173153,...,0.5,0.553267,117696.3,61.59517,7.216619,38.385653,2.09517,5.098011,75.578338,0.051847
std,150.765855,1.442674,0.207683,4.28255,8.616938,24.052508,28.664578,9.066041,2.181694,4.591826,...,0.500178,0.497331,428627.2,36.681104,25.022761,26.005078,12.607836,3.473575,41.40498,0.221796
min,1.0,0.0,0.0,19.0,0.0,1.0,0.0,0.7,0.0,0.0,...,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,127.0,1.0,1.0,23.0,7.0,36.0,1.0,13.3,1.6,3.7,...,0.0,0.0,2135.5,30.0,0.0,16.0,0.0,0.0,45.0,0.0
50%,257.0,2.0,1.0,25.0,14.0,62.0,13.0,20.0,2.8,6.15,...,0.5,1.0,6843.0,60.0,1.0,39.0,0.0,7.0,69.9,0.0
75%,385.25,3.0,1.0,29.0,22.0,74.0,52.0,27.5,4.5,9.9,...,1.0,1.0,29252.5,90.25,3.0,60.0,0.0,8.0,109.2,0.0
max,540.0,4.0,4.0,42.0,29.0,82.0,82.0,37.8,10.8,24.5,...,1.0,1.0,4620809.0,145.0,269.0,88.0,100.0,9.0,166.8,1.0


In [112]:
# Standardization /Scaling is done after splitting and 
# of numeric columns. So encoding is done first.


In [113]:
sc=StandardScaler()
X_train_scaled=sc.fit_transform(X_train)
X_test_scaled=sc.transform(X_test) 
# Follow same procedur as X_train for mean and sd)
X_train_scaled
#MinMaxScaler()

array([[ 1.59054607e+00,  1.38741454e+00, -6.89860274e-02, ...,
        -1.66815972e-01,  2.61053640e-01,  4.48574442e-01],
       [-5.28180447e-01,  1.38741454e+00, -6.89860274e-02, ...,
        -1.66815972e-01,  5.48110671e-01, -1.02884746e+00],
       [-5.66143564e-02, -6.91860667e-01, -6.89860274e-02, ...,
         7.50782031e+00, -1.17423151e+00, -1.79652747e+00],
       ...,
       [-6.87583070e-01,  1.23106880e-03, -6.89860274e-02, ...,
        -1.66815972e-01,  5.48110671e-01,  6.41701488e-01],
       [ 1.13890531e+00, -1.38495240e+00, -6.89860274e-02, ...,
        -1.66815972e-01, -1.46128854e+00, -1.55511867e+00],
       [-7.67284381e-01, -6.91860667e-01, -6.89860274e-02, ...,
        -1.66815972e-01,  1.12222473e+00,  1.80046377e+00]])

In [114]:
log_reg=LogisticRegression()
log_reg.fit(X_train_scaled,y_train)
y_pred=log_reg.predict(X_test_scaled)
accuracy_score(y_test,y_pred)

0.975177304964539

In [115]:
#Check for OverFitting using Training data part

In [116]:
y_pred_train=log_reg.predict(X_train_scaled)

In [117]:
accuracy_score(y_train,y_pred_train)

0.9893428063943162

In [118]:
#It is not overfitting. If data is imbalanced, we should not trust accuracy score. . We use ROC_AUC_Score
# It works for both blanced and imbalanced data.
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test,y_pred)

0.7981273408239701

In [113]:
roc_auc_score(y_train,y_pred_train)

0.9210092987214258

In [135]:
# Clear cut case of Overfitting dur to large no.of columns (42)
#Transform the data with PCA
from sklearn.decomposition import PCA
pca=PCA(n_components=0.90)
# PCA Transform input columns scaled version
X_train_pca=pca.fit_transform(X_train_scaled)
X_test_pca=pca.transform(X_test_scaled)

In [137]:
X_train_pca.shape

(1126, 17)

In [139]:
log_reg2=LogisticRegression()
log_reg2.fit(X_train_pca,y_train)
y_pred2=log_reg2.predict(X_test_pca)

In [141]:
roc_auc_score(y_test,y_pred2)

0.9

In [143]:
y_pred_train2=log_reg2.predict(X_train_pca)


In [145]:
roc_auc_score(y_train,y_pred_train2)

0.8855902105127212

In [147]:
#If score difference ismore than 10% it is case of overfitting