# Logestic Regression Model - PRUNE

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine
from config import db_password
import matplotlib.pyplot as plt
import seaborn as sns
import math

In [2]:
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5433/nba_champs"
engine = create_engine(db_string)

In [3]:
stats_df = pd.read_sql("select * from \"stats\"", db_string);
stats_df.head()

Unnamed: 0,index,Position,Season,Tm,G,W,L,Win_pct,MOV,SOS,...,eFG_pct,TS_pct,TOV_pct,ORB_pct,FT_FGA,Opp_eFG_pct,Opp_TS_pct,Opp_TOV_pct,Opp_ORB_pct,Opp_FT_FGA
0,0,0,2020-21,WAS,72,34,38,0.472,-1.83,-0.01,...,0.531,0.569,12.3,21.3,0.221,0.539,0.576,12.5,22.4,0.217
1,1,0,2020-21,UTA,72,52,20,0.722,9.25,-0.29,...,0.563,0.597,12.7,24.5,0.195,0.507,0.537,10.3,20.7,0.159
2,2,0,2020-21,TOR,72,27,45,0.375,-0.47,-0.07,...,0.529,0.567,11.9,20.8,0.196,0.543,0.584,14.4,23.7,0.234
3,3,0,2020-21,SAS,72,33,39,0.458,-1.74,0.15,...,0.517,0.554,10.2,20.0,0.192,0.541,0.57,11.8,22.7,0.174
4,4,0,2020-21,SAC,72,31,41,0.431,-3.68,0.23,...,0.549,0.578,12.0,21.3,0.185,0.557,0.591,12.1,25.0,0.199


In [4]:
test_df= pd.read_sql("select * from \"cy_stats\"", db_string)
test_df.head()

Unnamed: 0,index,Position,Season,Tm,G,W,L,Win_pct,MOV,SOS,...,eFG_pct,TS_pct,TOV_pct,ORB_pct,FT_FGA,Opp_eFG_pct,Opp_TS_pct,Opp_TOV_pct,Opp_ORB_pct,Opp_FT_FGA
0,0,0,2021-22,MIA,6,5,1,0.833,17.0,1.07,...,0.525,0.565,13.6,27.5,0.197,0.464,0.501,13.4,15.9,0.181
1,1,0,2021-22,WAS,7,5,2,0.714,2.57,1.11,...,0.505,0.549,11.5,20.4,0.213,0.492,0.535,11.0,20.5,0.197
2,2,0,2021-22,UTA,6,5,1,0.833,12.83,-3.08,...,0.526,0.574,14.2,24.6,0.236,0.45,0.485,10.8,23.1,0.164
3,3,0,2021-22,TOR,8,5,3,0.625,4.63,1.25,...,0.487,0.518,12.1,29.8,0.158,0.521,0.554,16.8,20.6,0.178
4,4,0,2021-22,SAS,7,2,5,0.286,-0.43,-1.5,...,0.526,0.542,12.2,22.2,0.113,0.549,0.568,15.2,21.4,0.13


In [5]:
# Pull out Season, TM and Team_Name into DF
name_df = stats_df[['Season', 'Tm',]]
name_df.head()

Unnamed: 0,Season,Tm
0,2020-21,WAS
1,2020-21,UTA
2,2020-21,TOR
3,2020-21,SAS
4,2020-21,SAC


In [6]:
#list(stats_df.columns)

In [7]:
all_features_df = stats_df
all_features_df.head()

Unnamed: 0,index,Position,Season,Tm,G,W,L,Win_pct,MOV,SOS,...,eFG_pct,TS_pct,TOV_pct,ORB_pct,FT_FGA,Opp_eFG_pct,Opp_TS_pct,Opp_TOV_pct,Opp_ORB_pct,Opp_FT_FGA
0,0,0,2020-21,WAS,72,34,38,0.472,-1.83,-0.01,...,0.531,0.569,12.3,21.3,0.221,0.539,0.576,12.5,22.4,0.217
1,1,0,2020-21,UTA,72,52,20,0.722,9.25,-0.29,...,0.563,0.597,12.7,24.5,0.195,0.507,0.537,10.3,20.7,0.159
2,2,0,2020-21,TOR,72,27,45,0.375,-0.47,-0.07,...,0.529,0.567,11.9,20.8,0.196,0.543,0.584,14.4,23.7,0.234
3,3,0,2020-21,SAS,72,33,39,0.458,-1.74,0.15,...,0.517,0.554,10.2,20.0,0.192,0.541,0.57,11.8,22.7,0.174
4,4,0,2020-21,SAC,72,31,41,0.431,-3.68,0.23,...,0.549,0.578,12.0,21.3,0.185,0.557,0.591,12.1,25.0,0.199


In [8]:
stats_df.head()

Unnamed: 0,index,Position,Season,Tm,G,W,L,Win_pct,MOV,SOS,...,eFG_pct,TS_pct,TOV_pct,ORB_pct,FT_FGA,Opp_eFG_pct,Opp_TS_pct,Opp_TOV_pct,Opp_ORB_pct,Opp_FT_FGA
0,0,0,2020-21,WAS,72,34,38,0.472,-1.83,-0.01,...,0.531,0.569,12.3,21.3,0.221,0.539,0.576,12.5,22.4,0.217
1,1,0,2020-21,UTA,72,52,20,0.722,9.25,-0.29,...,0.563,0.597,12.7,24.5,0.195,0.507,0.537,10.3,20.7,0.159
2,2,0,2020-21,TOR,72,27,45,0.375,-0.47,-0.07,...,0.529,0.567,11.9,20.8,0.196,0.543,0.584,14.4,23.7,0.234
3,3,0,2020-21,SAS,72,33,39,0.458,-1.74,0.15,...,0.517,0.554,10.2,20.0,0.192,0.541,0.57,11.8,22.7,0.174
4,4,0,2020-21,SAC,72,31,41,0.431,-3.68,0.23,...,0.549,0.578,12.0,21.3,0.185,0.557,0.591,12.1,25.0,0.199


In [9]:
# Drop un needed columns
all_features_df.drop(['index',
 'Season',
 'Tm',
 'G',
 'W',
 'L',
 #'Win_pct',
 #'MOV',
 #'SOS',
 #'SRS',
 #'Pace',
 #'ORtg',
 #'DRtg',
 #'eFG_pct',
 #'TS_pct',
 #'TOV_pct',
 #'ORB_pct',
 #'FT_FGA',
 #'Opp_eFG_pct',
 #'Opp_TS_pct',
 #'Opp_TOV_pct',
 #'Opp_ORB_pct',
 #'Opp_FT_FGA'
              ], axis=1, inplace=True)

In [10]:
all_features_df.head()

Unnamed: 0,Position,Win_pct,MOV,SOS,SRS,Pace,ORtg,DRtg,eFG_pct,TS_pct,TOV_pct,ORB_pct,FT_FGA,Opp_eFG_pct,Opp_TS_pct,Opp_TOV_pct,Opp_ORB_pct,Opp_FT_FGA
0,0,0.472,-1.83,-0.01,-1.85,104.1,111.2,113.0,0.531,0.569,12.3,21.3,0.221,0.539,0.576,12.5,22.4,0.217
1,0,0.722,9.25,-0.29,8.97,98.5,117.6,108.3,0.563,0.597,12.7,24.5,0.195,0.507,0.537,10.3,20.7,0.159
2,0,0.375,-0.47,-0.07,-0.54,99.2,112.0,112.5,0.529,0.567,11.9,20.8,0.196,0.543,0.584,14.4,23.7,0.234
3,0,0.458,-1.74,0.15,-1.58,98.9,111.0,112.8,0.517,0.554,10.2,20.0,0.192,0.541,0.57,11.8,22.7,0.174
4,0,0.431,-3.68,0.23,-3.45,100.0,113.6,117.2,0.549,0.578,12.0,21.3,0.185,0.557,0.591,12.1,25.0,0.199


In [11]:
# Remove Position target from features data
y = all_features_df.Position
X = all_features_df.drop(columns=["Position"])

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)


In [12]:
feature_names = X.columns

In [13]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Scale the data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [14]:
# Create a logistic regression model
classifier_AF = LogisticRegression(max_iter=1000)
classifier_AF

LogisticRegression(max_iter=1000)

In [15]:
# Fit (train) our model by using the training data
classifier_AF.fit(X_train_scaled, y_train)

LogisticRegression(max_iter=1000)

In [16]:
# Validate the model by using the test data
print(f"Logistic Regression Training Data Score: {classifier_AF.score(X_train_scaled, y_train)}")
print(f"Logistic Regression Testing Data Score: {classifier_AF.score(X_test_scaled, y_test)}")

Logistic Regression Training Data Score: 0.9642346208869814
Logistic Regression Testing Data Score: 0.9786324786324786


All features score = 0.97863

Find accuracy using top 10 RFE features

In [17]:
stats_df = pd.read_sql("select * from \"stats\"", db_string);
stats_df.head()

Unnamed: 0,index,Position,Season,Tm,G,W,L,Win_pct,MOV,SOS,...,eFG_pct,TS_pct,TOV_pct,ORB_pct,FT_FGA,Opp_eFG_pct,Opp_TS_pct,Opp_TOV_pct,Opp_ORB_pct,Opp_FT_FGA
0,0,0,2020-21,WAS,72,34,38,0.472,-1.83,-0.01,...,0.531,0.569,12.3,21.3,0.221,0.539,0.576,12.5,22.4,0.217
1,1,0,2020-21,UTA,72,52,20,0.722,9.25,-0.29,...,0.563,0.597,12.7,24.5,0.195,0.507,0.537,10.3,20.7,0.159
2,2,0,2020-21,TOR,72,27,45,0.375,-0.47,-0.07,...,0.529,0.567,11.9,20.8,0.196,0.543,0.584,14.4,23.7,0.234
3,3,0,2020-21,SAS,72,33,39,0.458,-1.74,0.15,...,0.517,0.554,10.2,20.0,0.192,0.541,0.57,11.8,22.7,0.174
4,4,0,2020-21,SAC,72,31,41,0.431,-3.68,0.23,...,0.549,0.578,12.0,21.3,0.185,0.557,0.591,12.1,25.0,0.199


In [18]:
# Drop un needed columns
stats_df.drop(['index',
 'Season',
 'Tm',
 'G',
 'W',
 'L',
 #'Win_pct',
 #'MOV',
 #'SOS',
 #'SRS',
 #'Pace',
 #'ORtg',
 #'DRtg',
 #'eFG_pct',
 #'TS_pct',
 #'TOV_pct',
 #'ORB_pct',
 #'FT_FGA',
 #'Opp_eFG_pct',
 #'Opp_TS_pct',
 #'Opp_TOV_pct',
 #'Opp_ORB_pct',
 #'Opp_FT_FGA'
              ], axis=1, inplace=True)

In [19]:
# Remove Position target from features data
y = stats_df.Position
X = stats_df.drop(columns=["Position"])

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [20]:
# https://towardsdatascience.com/a-look-into-feature-importance-in-logistic-regression-models-a4aa970f9b0f
from sklearn.feature_selection import RFE

predictors = X_train
selector = RFE(classifier_AF, n_features_to_select = 1)
selector = selector.fit(predictors, y_train)

In [21]:
order = selector.ranking_
order

array([ 2,  8,  3,  1, 12,  6,  5, 10, 15, 16, 11,  4, 17, 14,  7, 13,  9])

In [22]:
order_df = pd.DataFrame(order, columns = ['importance'])
order_df.head()

Unnamed: 0,importance
0,2
1,8
2,3
3,1
4,12


In [23]:
feature_importance = pd.DataFrame(feature_names, columns = ["feature"])
feature_importance.head()

Unnamed: 0,feature
0,Win_pct
1,MOV
2,SOS
3,SRS
4,Pace


In [24]:
feature_imp_df = order_df.join(feature_importance)
feature_imp_df

Unnamed: 0,importance,feature
0,2,Win_pct
1,8,MOV
2,3,SOS
3,1,SRS
4,12,Pace
5,6,ORtg
6,5,DRtg
7,10,eFG_pct
8,15,TS_pct
9,16,TOV_pct


In [25]:
rfe_df = stats_df
rfe_df.head()

Unnamed: 0,Position,Win_pct,MOV,SOS,SRS,Pace,ORtg,DRtg,eFG_pct,TS_pct,TOV_pct,ORB_pct,FT_FGA,Opp_eFG_pct,Opp_TS_pct,Opp_TOV_pct,Opp_ORB_pct,Opp_FT_FGA
0,0,0.472,-1.83,-0.01,-1.85,104.1,111.2,113.0,0.531,0.569,12.3,21.3,0.221,0.539,0.576,12.5,22.4,0.217
1,0,0.722,9.25,-0.29,8.97,98.5,117.6,108.3,0.563,0.597,12.7,24.5,0.195,0.507,0.537,10.3,20.7,0.159
2,0,0.375,-0.47,-0.07,-0.54,99.2,112.0,112.5,0.529,0.567,11.9,20.8,0.196,0.543,0.584,14.4,23.7,0.234
3,0,0.458,-1.74,0.15,-1.58,98.9,111.0,112.8,0.517,0.554,10.2,20.0,0.192,0.541,0.57,11.8,22.7,0.174
4,0,0.431,-3.68,0.23,-3.45,100.0,113.6,117.2,0.549,0.578,12.0,21.3,0.185,0.557,0.591,12.1,25.0,0.199


In [26]:
# Drop un needed columns
rfe_df.drop([#'index',
 #'Season',
 #'Tm',
 #'G',
 #'W',
 #'L',
 #'Win_pct',
 #'MOV',
 #'SOS',
 #'SRS',
 'Pace',
 #'ORtg',
 #'DRtg',
 #'eFG_pct',
 'TS_pct',
 'TOV_pct',
 'ORB_pct',
 #'FT_FGA',
 'Opp_eFG_pct',
 'Opp_TS_pct',
 #'Opp_TOV_pct',
 'Opp_ORB_pct',
 #'Opp_FT_FGA'
              ], axis=1, inplace=True)

In [27]:
rfe_df.head()

Unnamed: 0,Position,Win_pct,MOV,SOS,SRS,ORtg,DRtg,eFG_pct,FT_FGA,Opp_TOV_pct,Opp_FT_FGA
0,0,0.472,-1.83,-0.01,-1.85,111.2,113.0,0.531,0.221,12.5,0.217
1,0,0.722,9.25,-0.29,8.97,117.6,108.3,0.563,0.195,10.3,0.159
2,0,0.375,-0.47,-0.07,-0.54,112.0,112.5,0.529,0.196,14.4,0.234
3,0,0.458,-1.74,0.15,-1.58,111.0,112.8,0.517,0.192,11.8,0.174
4,0,0.431,-3.68,0.23,-3.45,113.6,117.2,0.549,0.185,12.1,0.199


In [28]:
stats_df.head()

Unnamed: 0,Position,Win_pct,MOV,SOS,SRS,ORtg,DRtg,eFG_pct,FT_FGA,Opp_TOV_pct,Opp_FT_FGA
0,0,0.472,-1.83,-0.01,-1.85,111.2,113.0,0.531,0.221,12.5,0.217
1,0,0.722,9.25,-0.29,8.97,117.6,108.3,0.563,0.195,10.3,0.159
2,0,0.375,-0.47,-0.07,-0.54,112.0,112.5,0.529,0.196,14.4,0.234
3,0,0.458,-1.74,0.15,-1.58,111.0,112.8,0.517,0.192,11.8,0.174
4,0,0.431,-3.68,0.23,-3.45,113.6,117.2,0.549,0.185,12.1,0.199


In [29]:
# Remove Position target from features data
y = rfe_df.Position
X = rfe_df.drop(columns=["Position"])

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [30]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Scale the data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [31]:
# Create a logistic regression model
classifier_RFE = LogisticRegression(max_iter=1000)
classifier_RFE

LogisticRegression(max_iter=1000)

In [32]:
# Fit (train) our model by using the training data
classifier_RFE.fit(X_train_scaled, y_train)

LogisticRegression(max_iter=1000)

In [34]:
# Validate the model by using the test data
print(f"Logistic Regression with RFE features only Training Data Score: {classifier_RFE.score(X_train_scaled, y_train)}")
print(f"Logistic Regression with RFE features only Testing Data Score: {classifier_RFE.score(X_test_scaled, y_test)}")

Logistic Regression with RFE features only Training Data Score: 0.9656652360515021
Logistic Regression with RFE features only Testing Data Score: 0.9743589743589743


RFE features score = 0.97435

In [35]:
# Pull out Season, TM and Team_Name into DF
test_name_df = test_df[['Season', 'Tm']]
test_name_df.head()

Unnamed: 0,Season,Tm
0,2021-22,MIA
1,2021-22,WAS
2,2021-22,UTA
3,2021-22,TOR
4,2021-22,SAS


In [36]:
test_df_pos= test_df['Position']
test_df.drop(['index',
 'Position',
 'Season',
 'Tm',
 'G',
 'W',
 'L',
 #'Win_pct',
 #'MOV',
 #'SOS',
 #'SRS',
 'Pace',
 #'ORtg',
 #'DRtg',
 #'eFG_pct',
 'TS_pct',
 'TOV_pct',
 'ORB_pct',
 #'FT_FGA',
 'Opp_eFG_pct',
 'Opp_TS_pct',
 #'Opp_TOV_pct',
 'Opp_ORB_pct',
 #'Opp_FT_FGA'
             ], axis=1, inplace=True)
test_df

Unnamed: 0,Win_pct,MOV,SOS,SRS,ORtg,DRtg,eFG_pct,FT_FGA,Opp_TOV_pct,Opp_FT_FGA
0,0.833,17.0,1.07,18.07,112.2,95.5,0.525,0.197,13.4,0.181
1,0.714,2.57,1.11,3.68,108.8,106.3,0.505,0.213,11.0,0.197
2,0.833,12.83,-3.08,9.76,111.9,99.0,0.526,0.236,10.8,0.164
3,0.625,4.63,1.25,5.88,107.4,102.7,0.487,0.158,16.8,0.178
4,0.286,-0.43,-1.5,-1.93,106.6,107.0,0.526,0.113,15.2,0.13
5,0.5,-2.83,-1.06,-3.89,107.8,110.6,0.508,0.177,10.1,0.187
6,0.429,1.86,-0.95,0.91,111.3,109.4,0.54,0.154,12.2,0.146
7,0.4,-5.0,-1.64,-6.64,104.4,109.3,0.511,0.175,13.4,0.122
8,0.714,8.57,-2.13,6.44,116.4,107.5,0.571,0.186,13.0,0.16
9,0.25,-7.63,3.26,-4.37,105.9,113.7,0.518,0.183,10.8,0.174


In [37]:
df_X_test_scaled = scaler.fit_transform(test_df)

In [38]:
df_test_df=classifier_RFE.predict(df_X_test_scaled)
df_test_df

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [39]:
cy_test_df = pd.DataFrame(df_test_df, columns = ['Position'])
cy_test_df.head()

Unnamed: 0,Position
0,1
1,0
2,0
3,0
4,0


In [40]:
joined_df = test_name_df.join(cy_test_df)
joined_df

Unnamed: 0,Season,Tm,Position
0,2021-22,MIA,1
1,2021-22,WAS,0
2,2021-22,UTA,0
3,2021-22,TOR,0
4,2021-22,SAS,0
5,2021-22,SAC,0
6,2021-22,POR,0
7,2021-22,PHO,0
8,2021-22,PHI,0
9,2021-22,ORL,0


In [41]:
classifier_RFE.predict_proba(df_X_test_scaled)

array([[4.15314616e-01, 5.84685384e-01],
       [9.85834280e-01, 1.41657195e-02],
       [8.71370841e-01, 1.28629159e-01],
       [9.66482510e-01, 3.35174897e-02],
       [9.98128903e-01, 1.87109694e-03],
       [9.98733412e-01, 1.26658775e-03],
       [9.96345050e-01, 3.65495030e-03],
       [9.99500093e-01, 4.99906674e-04],
       [9.57602453e-01, 4.23975468e-02],
       [9.99821688e-01, 1.78312459e-04],
       [9.99970251e-01, 2.97485595e-05],
       [9.71534789e-01, 2.84652112e-02],
       [9.99878629e-01, 1.21371366e-04],
       [9.97218910e-01, 2.78109039e-03],
       [9.98057494e-01, 1.94250568e-03],
       [9.93167466e-01, 6.83253432e-03],
       [9.95743559e-01, 4.25644063e-03],
       [9.96060669e-01, 3.93933088e-03],
       [9.98236557e-01, 1.76344341e-03],
       [9.98675975e-01, 1.32402471e-03],
       [9.99950646e-01, 4.93537510e-05],
       [9.19173138e-01, 8.08268620e-02],
       [9.99915624e-01, 8.43759195e-05],
       [9.70382600e-01, 2.96174001e-02],
       [9.956953