In [3]:
#Import the libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()

from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

In [5]:
# Fetch seaborn's bundled copy of the iris data and preview the first five rows
df = sns.load_dataset(name='iris')
df.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [7]:
# Class balance of the target, reported as fractions instead of raw counts.
# More than two classes are present, so this is a multi-class logistic regression problem.
df['species'].value_counts(normalize=True)

versicolor    0.333333
setosa        0.333333
virginica     0.333333
Name: species, dtype: float64

In [8]:
# Size of the dataset as (number of rows, number of columns)
df.shape

(150, 5)

In [12]:
# Summary statistics of the numeric columns, rounded to two decimals for readability
df.describe().round(decimals=2)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.84,3.06,3.76,1.2
std,0.83,0.44,1.77,0.76
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [13]:
# Separate the target (species) from the predictor columns
y = df['species']
x = df.drop(columns=['species'])

In [14]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=101,
)

In [15]:
# Confirm the sizes of the training and test feature sets (one shape per line)
print(x_train.shape, x_test.shape, sep='\n')

(120, 4)
(30, 4)


In [21]:
# Standardise every feature column to its z-score before PCA:
#     z = (value - column_mean) / column_standard_deviation
# PCA looks for directions of maximum variance, so unscaled features with a
# large numeric range would otherwise dominate the components.
# For roughly bell-shaped features, almost all z-scores fall between -3 and 3.
from sklearn.preprocessing import StandardScaler

# Learn the column means / standard deviations from the training rows only ...
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
# ... and apply those same statistics to the test rows (transform only, no fit),
# so that no information from the test set leaks into the preprocessing.
x_test_scaled = scaler.transform(x_test)

In [22]:
# Inspect the standardised training data (NumPy array: one row per sample, one column per feature)
x_train_scaled

array([[ 0.76416119, -0.18596829,  1.1178787 ,  1.2484939 ],
       [-0.42673937, -1.39880497,  0.10968747,  0.09800464],
       [ 0.76416119, -0.18596829,  0.94984683,  0.73716534],
       [-0.0694692 , -0.9136703 ,  0.05367685, -0.0298275 ],
       [ 1.12143136, -0.18596829,  0.94984683,  1.12066176],
       [-0.18855925, -0.67110296,  0.38974059,  0.09800464],
       [ 1.0023413 ,  0.05659904,  0.50176184,  0.35366892],
       [ 0.04962086, -0.18596829,  0.22170872,  0.35366892],
       [-0.30764931, -0.9136703 ,  0.22170872,  0.09800464],
       [ 2.19324186, -0.18596829,  1.28591057,  1.37632604],
       [-0.90309959,  1.51200306, -1.29057812, -1.05248462],
       [-1.49854987,  1.26943572, -1.57063124, -1.3081489 ],
       [-1.37945981,  0.29916638, -1.2345675 , -1.3081489 ],
       [ 1.0023413 , -0.18596829,  0.66979371,  0.6093332 ],
       [-0.30764931, -0.18596829,  0.38974059,  0.35366892],
       [-1.85582003, -0.18596829, -1.51462062, -1.43598104],
       [ 1.47870152, -0.

In [23]:
# Sanity check: index 1 is the SECOND column (sepal_width); its mean should be ~0 after scaling
np.mean(x_train_scaled[:, 1])

-1.0565622451016074e-15

In [24]:
# Sanity check: index 1 is the SECOND column (sepal_width); its standard deviation should be ~1 after scaling
np.std(x_train_scaled[:, 1])

0.9999999999999999

In [26]:
# Baseline model: logistic regression (default settings) on all standardised features
logreg = LogisticRegression().fit(x_train_scaled, y_train)
# Predicted classes for the training rows and for the held-out test rows
train_pred, test_pred = map(logreg.predict, (x_train_scaled, x_test_scaled))

In [27]:
# Accuracy = fraction of rows whose predicted class matches the true class,
# computed separately for the training and the test predictions
from sklearn.metrics import accuracy_score

train_acc = accuracy_score(y_true=y_train, y_pred=train_pred)
test_acc = accuracy_score(y_true=y_test, y_pred=test_pred)
(train_acc, test_acc)

(0.9666666666666667, 0.9666666666666667)

In [28]:
# Reduce dimensionality with PCA.
# A fractional n_components asks PCA to keep the smallest number of components
# whose cumulative explained variance reaches that share: 0.95 means at most
# about 5% of the variance in the data is given up.
pca = PCA(n_components=0.95).fit(x_train_scaled)
# Project both splits onto the components learned from the training data only
x_train_trf = pca.transform(x_train_scaled)
x_test_trf = pca.transform(x_test_scaled)

In [30]:
# Shapes after projection: the number of columns is the number of components PCA kept
x_train_trf.shape,x_test_trf.shape

((120, 2), (30, 2))

In [31]:
# Training rows expressed in principal-component coordinates (one column per retained component)
x_train_trf

array([[ 1.80493828, -0.21426637],
       [ 0.25517327,  1.44497662],
       [ 1.41820595, -0.17141317],
       [ 0.21325761,  0.87781662],
       [ 1.82305213, -0.32669546],
       [ 0.35545092,  0.67683443],
       [ 1.0039873 , -0.44280731],
       [ 0.40276083,  0.12422613],
       [ 0.25779311,  0.94835163],
       [ 2.72656467, -0.73024854],
       [-2.20922505, -0.9852658 ],
       [-2.76710222, -0.52418125],
       [-2.25927778,  0.33209822],
       [ 1.30864723, -0.23969052],
       [ 0.3123476 ,  0.2465219 ],
       [-2.61967987,  0.96765863],
       [ 2.14104661, -0.45689302],
       [ 0.13368207,  0.89529232],
       [-2.10396433,  0.47567551],
       [-0.46513895,  1.6435746 ],
       [ 0.14071226,  1.42394472],
       [-1.82179701, -0.09244361],
       [ 1.10949254,  0.79520394],
       [-2.3864584 , -0.6962729 ],
       [-0.50267584,  1.96007959],
       [ 1.95367128,  0.87259532],
       [ 0.30829558,  1.10292442],
       [-2.52140712, -0.64451214],
       [-2.16624472,

In [34]:
# Coefficients (loadings) of principal components 1 and 2:
# one row per component, one column per original feature
pca.components_

array([[ 0.52606701, -0.25776909,  0.5804543 ,  0.56558059],
       [-0.35178491, -0.93267558, -0.02015382, -0.07718464]])

In [36]:
# Eigenvalues (amount of variance captured) of the retained components PC1 and PC2
pca.explained_variance_
# 2.95 + 0.93 is about 3.87 for the two retained components. Only the eigenvalues of
# ALL four components add up to the total variance of about 4
# (roughly one unit of variance per standardised feature column).

array([2.94793148, 0.92567629])

In [38]:
# Share of the total variance explained by each retained component
pca.explained_variance_ratio_
# PC1 explains about 73% and PC2 about 23%; together about 96%, which clears the 95% threshold requested when PCA was created

array([0.73084135, 0.22949058])

In [40]:
# Train a fresh logistic regression model on the lower-dimensional data produced by PCA
lr_pca = LogisticRegression().fit(x_train_trf, y_train)
# Predicted classes for the projected training rows and projected test rows
train_pred_trf, test_pred_trf = map(lr_pca.predict, (x_train_trf, x_test_trf))

In [41]:
# Accuracy of the PCA-based model on the training and test predictions
train_acc_trf = accuracy_score(y_true=y_train, y_pred=train_pred_trf)
test_acc_trf = accuracy_score(y_true=y_test, y_pred=test_pred_trf)
(train_acc_trf, test_acc_trf)
# Compared with the model trained on all four features, accuracy drops from
# about 96.7% to 92.5% on the training split and from about 96.7% to 90% on the test split

(0.925, 0.9)