In [None]:
# Principal Component Analysis (PCA) Case Study

In [None]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# 1. Read the sample dataset given to you.
# The CSV has no header row: without header=None pandas would consume the
# first observation (5.1, 3.5, 1.4, 0.2, Iris-setosa) as column names and
# silently drop it from the data.
iris1 = pd.read_csv("iris.csv", header=None)

# Preview the first rows to confirm the file was parsed as expected.
iris1.head()

Unnamed: 0,5.1,3.5,1.4,0.2,Iris-setosa
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


In [None]:
# Labels for the five CSV columns: four measurements followed by the class.
columnNames = [
    'sepal-length',
    'sepal-width',
    'petal-length',
    'petal-width',
    'Class',
]

# Relabel the already-loaded frame in place, then preview the result.
iris1.columns = columnNames
iris1.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,Class
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


In [None]:
# Load the CSV again, this time supplying the column labels up front.
# Passing `names` already makes pandas treat the file as header-less;
# header=None just states that explicitly.
irisData = pd.read_csv(
    "iris.csv",
    header=None,
    names=columnNames,
)

irisData.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [None]:
# Feature matrix: the first four columns of the frame (everything before
# the trailing "Class" column), selected by label rather than by position.
x = irisData[irisData.columns[:4]]
x.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [None]:
# Target vector: the last column of the frame, which is named "Class".
y = irisData["Class"]
y.head()

0    Iris-setosa
1    Iris-setosa
2    Iris-setosa
3    Iris-setosa
4    Iris-setosa
Name: Class, dtype: object

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

# Display the shape of each of the four pieces.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((120, 4), (30, 4), (120,), (30,))

In [None]:
# Fit PCA on the training split only, keeping every component
# (n_components=None is the estimator's default), then project both splits
# with that same fitted transform.
pca1 = PCA(n_components=None)
x_train = pca1.fit_transform(x_train)
x_test = pca1.transform(x_test)

# NOTE: x_train / x_test are rebound to their projections here, so running
# this cell a second time would apply PCA to already-projected data.

# Fraction of the total variance explained by each principal component.
var1 = pca1.explained_variance_ratio_
var1

array([0.92859758, 0.0474831 , 0.0193921 , 0.00452721])

<class 'sklearn.decomposition._pca.PCA'>


(array([10.89787455]), None)

In [None]:
# Train a small random forest (5 trees) on x_train / y_train as left by the
# preceding cells and evaluate it on the held-out split.
# random_state is pinned (same seed as the component-count loop further down)
# so the confusion matrix and accuracy are reproducible on re-run; without it
# every execution builds a differently seeded forest.
rfc = RandomForestClassifier(n_estimators=5, random_state=2)
rfc.fit(x_train, y_train)

y_pred = rfc.predict(x_test)

# Rows of the matrix are true classes, columns are predicted classes.
cmRF = confusion_matrix(y_test, y_pred)
print("Random Forest confusion matrix :\n",cmRF)

# Fraction of test samples whose predicted class matches the true class.
aScoreRF = accuracy_score(y_test, y_pred)
print("Random Forest accuracy score :",aScoreRF)


Random Forest confusion matrix :
 [[11  0  0]
 [ 0 12  1]
 [ 0  0  6]]
Random Forest accuracy score : 0.9666666666666667


In [None]:
# Compare random-forest performance when keeping 1, 2, ..., numColumns
# principal components of the original features.
numColumns = len(x.columns)

# The split uses a fixed seed, so it is identical on every iteration: do it
# once and keep the untransformed pieces so each PCA is fitted on raw features.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1
)

for c in range(1, numColumns + 1):
    # Fit the projection on the training rows only, then reuse it for the test rows.
    pca = PCA(n_components=c)
    x_train = pca.fit_transform(x_train_raw)
    x_test = pca.transform(x_test_raw)

    # Fixed seed so differences between iterations come from the number of
    # components, not from forest randomness.
    rfc = RandomForestClassifier(random_state=2)
    rfc.fit(x_train, y_train)

    y_pred = rfc.predict(x_test)

    cmRF = confusion_matrix(y_test, y_pred)
    # str.format must be called on the string itself; passing format(c) as a
    # second print argument left the literal "{0}" in the output.
    print("Confusion Matrix with {0} Principal Components:".format(c))
    print(cmRF)
    aScoreRF = accuracy_score(y_test, y_pred)
    print("Accuracy Score with {0} Principal Components:".format(c))
    print(aScoreRF, "\n")


Confusuion Matrix with {0} Principle Components: 1
[[11  0  0]
 [ 0 12  1]
 [ 0  1  5]]
Accuracy Score with {0} Principle Components: 1
0.9333333333333333 

Confusuion Matrix with {0} Principle Components: 2
[[11  0  0]
 [ 0 12  1]
 [ 0  0  6]]
Accuracy Score with {0} Principle Components: 2
0.9666666666666667 

Confusuion Matrix with {0} Principle Components: 3
[[11  0  0]
 [ 0 11  2]
 [ 0  0  6]]
Accuracy Score with {0} Principle Components: 3
0.9333333333333333 

Confusuion Matrix with {0} Principle Components: 4
[[11  0  0]
 [ 0 12  1]
 [ 0  0  6]]
Accuracy Score with {0} Principle Components: 4
0.9666666666666667 



In [None]:
# x_test = np.expand_dims(x_test, axis=1)
# x_test.shape