### **1. Import the required packages**

In [None]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

### **2. Reading and Exploring the Data**

In [None]:
# Load the player dataset; the relative path means the CSV must sit in the
# notebook's working directory.
data = pd.read_csv('nba_final.csv')

In [None]:
# Preview the first rows to sanity-check the parsed columns.
data.head()

Unnamed: 0,Rk,Player.x,Player_ID,Pos1,Pos2,Age,Tm,G,GS,MP,...,Conference,Role,Fvot,FRank,Pvot,PRank,Mvot,MRank,Score,Play
0,170,A.J. Hammons,hammoaj01,C,,24,DAL,22,0,7.4,...,West,Front,786,123,,,,,83.5,No
1,58,Aaron Brooks,brookaa01,PG,,32,IND,65,0,13.8,...,Est,Back,2474,64,,,,,48.2,No
2,157,Aaron Gordon,gordoaa01,SF,,21,ORL,80,72,28.7,...,Est,Front,22774,29,,,,,40.0,No
3,352,Adreian Payne,paynead01,PF,,25,MIN,18,0,7.5,...,West,Front,861,120,1.0,52.0,,,75.5,No
4,10,Al-Farouq Aminu,aminual01,PF,,26,POR,61,25,29.1,...,West,Front,4971,69,7.0,23.0,,,42.8,No


In [None]:
# Inspect the Season column (the output shows string labels such as '2016-17').
data['Season']

Unnamed: 0,Season
0,2016-17
1,2016-17
2,2016-17
3,2016-17
4,2016-17
...,...
1403,2018-19
1404,2018-19
1405,2018-19
1406,2018-19


In [None]:
data.shape  # (n_rows, n_columns) of the loaded dataframe

(1408, 45)

In [None]:
data.dtypes  # dtype of each column; the 'object' columns are the ones label-encoded further down

Unnamed: 0,0
Rk,int64
Player.x,object
Player_ID,object
Pos1,object
Pos2,object
Age,int64
Tm,object
G,int64
GS,int64
MP,float64


In [None]:
data.isnull().sum()  # number of missing values in each column

Unnamed: 0,0
Rk,0
Player.x,0
Player_ID,0
Pos1,0
Pos2,1396
Age,0
Tm,0
G,0
GS,0
MP,0


In [None]:
# Pos2 is missing for 1396 of the 1408 rows, so it is dropped rather than imputed.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell
# idempotent: re-running it after the column is gone no longer raises KeyError.
data = data.drop(columns = 'Pos2', errors = 'ignore')

In [None]:
# Replace every remaining NaN with 0.
# NOTE(review): this also zero-fills the vote/rank columns that are NaN in the
# preview (Pvot, PRank, Mvot, MRank); confirm 0 is an acceptable stand-in there.
data = data.fillna(0)

In [None]:
data.duplicated().sum()  # number of rows that are exact duplicates of an earlier row

np.int64(0)

In [None]:
# Remove exact duplicate rows (keeps the first occurrence); rebinding the name
# gives the same frame contents as the in-place variant.
data = data.drop_duplicates()

In [None]:
# Player name and player ID are identifiers, not predictors, so remove them.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell
# idempotent: a second run no longer raises KeyError for the missing columns.
data = data.drop(columns = ['Player.x', 'Player_ID'], errors = 'ignore')

#### **Encode the categorical columns using LabelEncoder**

In [None]:
# Names of all columns stored with the generic 'object' dtype (the string-valued ones).
obj_cols = data.select_dtypes(include = 'object').columns

In [None]:
# Show which columns are about to be label-encoded.
obj_cols

Index(['Pos1', 'Tm', 'Season', 'Conference', 'Role', 'Play'], dtype='object')

In [None]:
# Integer-encode every object-dtype column with its own LabelEncoder.
# Each column's fitted encoder is stored in `encoders`, so the mapping from
# integer code back to the original label can be recovered later via
# encoders[col].classes_ / encoders[col].inverse_transform(...).
# Previously a single shared encoder was refit on every pass, which left only
# the last column's mapping available.
# NOTE(review): label codes impose an arbitrary order on nominal features such
# as Tm and Pos1; consider one-hot encoding them for the linear models below.
encoders = {}

for column in obj_cols:
    le = LabelEncoder()  # after the loop `le` is still the encoder of the last column
    data[column] = le.fit_transform(data[column])
    encoders[column] = le

#### **Check for Imbalance**

In [None]:
# Class counts of the encoded target; the output shows a heavy skew towards class 0.
data['Play'].value_counts()

Unnamed: 0_level_0,count
Play,Unnamed: 1_level_1
0,1335
1,73


### **Machine Learning Process**

In [None]:
# Target vector first, then the feature matrix made of every other column.
# NOTE(review): X still contains Rk and the vote-derived columns (Fvot, FRank,
# Pvot, ...); confirm they are legitimate predictors rather than identifiers
# or target leakage.
y = data['Play']
X = data.drop(columns = ['Play'])

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the rare
# positive class at the same proportion in both splits; the fixed seed makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 100,
    stratify = y,
)

#### **Mean Centering / Standardization / Scaling of the data**

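1. Always split the data into training and test sets first, then fit the scaler on the training set only and use that fitted scaler to transform both sets, so that no test-set statistics leak into training.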

In [None]:
# Learn the per-feature mean and standard deviation from the training split only...
scaler = StandardScaler().fit(X_train)

# ...and apply that same transformation to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

#### **Apply Logistic Regression on the data**

In [None]:
# Baseline: logistic regression with default hyper-parameters on all scaled features.
# NOTE(review): no class_weight is set although the target is heavily
# imbalanced (1335 vs 73); consider class_weight='balanced'.
log_reg = LogisticRegression()
log_reg.fit(X_train_scaled, y_train)

In [None]:
# Hard class-label predictions for the held-out split.
y_pred = log_reg.predict(X_test_scaled)

In [None]:
# Accuracy on the test split.
# NOTE(review): with 1335 negatives vs 73 positives, always predicting class 0
# already scores about 0.95, so accuracy alone says little here.
accuracy_score(y_test, y_pred)

0.9787234042553191

In [None]:
# ROC AUC is a ranking metric and must be computed from continuous scores;
# passing the thresholded 0/1 predictions collapses the curve to a single
# operating point. Use the predicted probability of the positive class (1).
# NOTE: the saved output of this cell was produced from hard labels and will
# change on re-run; compare it only with AUCs that are also score-based.
roc_auc_score(y_test, log_reg.predict_proba(X_test_scaled)[:, 1])

np.float64(0.9258426966292135)

### **Apply PCA on the data**

In [None]:
# Keep the first 10 principal components (reported further down to retain ~81% of the variance).
pca = PCA(n_components = 10)

In [None]:
# Fit the projection on the scaled training features only, then reuse the
# fitted components to project the test features.
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [None]:
# Confirm the reduced training matrix has one column per retained component.
X_train_pca.shape

(1126, 10)

In [None]:
# Fraction of the total variance captured by each retained component.
pca.explained_variance_ratio_

array([0.37905378, 0.12027188, 0.06486726, 0.06188172, 0.04860752,
       0.03244805, 0.03094522, 0.02541365, 0.02488225, 0.02116037])

In [None]:
# Convert each component's variance share to a percentage, then add them up
# to get the total share retained by the selected components.
variance_pct = pca.explained_variance_ratio_ * 100
exp_var = sum(variance_pct)
print('Total Variance Explained:', exp_var)

Total Variance Explained: 80.95317067175095


#### **Apply logistic regression on the PCA-transformed data**

In [None]:
# Same default logistic regression, now trained on the 10 PCA components.
log_reg2 = LogisticRegression()
log_reg2.fit(X_train_pca, y_train)

In [None]:
# Hard class-label predictions from the PCA-based model.
y_pred_2 = log_reg2.predict(X_test_pca)

In [None]:
# ROC AUC must be computed from continuous scores, not thresholded 0/1 labels,
# so use the predicted probability of the positive class (1).
# NOTE: the saved output of this cell was produced from hard labels and will
# change on re-run; compare it only with AUCs that are also score-based.
roc_auc_score(y_test, log_reg2.predict_proba(X_test_pca)[:, 1])

np.float64(0.8925093632958802)

### **Apply the LDA on the data**

In [None]:
# Linear Discriminant Analysis with default settings, used below as a supervised projection.
lda = LinearDiscriminantAnalysis()

In [None]:
# Fit the discriminant directions on the training split (LDA is supervised, so
# it needs the labels), then project both splits with the fitted model.
lda.fit(X_train_scaled, y_train)
X_train_lda = lda.transform(X_train_scaled)
X_test_lda = lda.transform(X_test_scaled)

In [None]:
# Same default logistic regression, now trained on the LDA projection.
log_reg_3 = LogisticRegression()
log_reg_3.fit(X_train_lda, y_train)

In [None]:
# Hard class-label predictions from the LDA-based model.
y_pred_3 = log_reg_3.predict(X_test_lda)

In [None]:
# ROC AUC must be computed from continuous scores, not thresholded 0/1 labels,
# so use the predicted probability of the positive class (1).
# NOTE: the saved output of this cell was produced from hard labels and will
# change on re-run; compare it only with AUCs that are also score-based.
roc_auc_score(y_test, log_reg_3.predict_proba(X_test_lda)[:, 1])

np.float64(0.8591760299625467)