# PCA Principal Component Analysis

In [1]:
import numpy as np
import pandas as pd
from collections import Counter

In [2]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'Class']
dataset = pd.read_csv(url, names=names)

In [3]:
X = dataset.drop('Class', 1)
y = dataset['Class']

In [4]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [5]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [6]:
X_train

array([[ 0.61303014,  0.10850105,  0.94751783,  0.73603967],
       [-0.56776627, -0.12400121,  0.38491447,  0.34808318],
       [-0.80392556,  1.03851009, -1.30289562, -1.3330616 ],
       [ 0.25879121, -0.12400121,  0.60995581,  0.73603967],
       [ 0.61303014, -0.58900572,  1.00377816,  1.25331499],
       [-0.80392556, -0.82150798,  0.04735245,  0.21876435],
       [-0.21352735,  1.73601687, -1.19037495, -1.20374277],
       [ 0.14071157, -0.82150798,  0.72247648,  0.47740201],
       [ 0.02263193, -0.12400121,  0.21613346,  0.34808318],
       [-0.09544771, -1.05401024,  0.10361279, -0.03987331],
       [ 1.0853487 , -0.12400121,  0.94751783,  1.12399616],
       [-1.39432376,  0.34100331, -1.41541629, -1.3330616 ],
       [ 1.20342834,  0.10850105,  0.72247648,  1.38263382],
       [-1.04008484,  1.03851009, -1.24663528, -0.81578628],
       [-0.56776627,  1.50351461, -1.30289562, -1.3330616 ],
       [-1.04008484, -2.4490238 , -0.1776889 , -0.29851096],
       [ 0.73110978, -0.

In [7]:
from sklearn.decomposition import PCA

pca = PCA()
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [8]:
explained_variance = pca.explained_variance_ratio_

In [9]:
explained_variance

array([0.72226528, 0.23974795, 0.03338117, 0.0046056 ])

In [10]:
from sklearn.decomposition import PCA

pca = PCA(n_components=1)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [11]:
X_train

array([[ 1.27228206e+00],
       [ 1.52231770e-01],
       [-2.18764183e+00],
       [ 9.41913396e-01],
       [ 1.76227837e+00],
       [-8.16806401e-02],
       [-1.89794863e+00],
       [ 9.59395493e-01],
       [ 3.65661204e-01],
       [ 2.31845912e-01],
       [ 1.79730127e+00],
       [-2.40350676e+00],
       [ 1.82199968e+00],
       [-1.98427118e+00],
       [-2.17073306e+00],
       [-2.55598076e-01],
       [ 1.38862320e+00],
       [ 1.93409268e+00],
       [ 1.12243099e+00],
       [ 1.88169499e+00],
       [ 5.43123767e-02],
       [ 2.70365641e+00],
       [ 6.91516012e-01],
       [ 1.30048660e-02],
       [ 1.22853905e+00],
       [ 1.01881714e+00],
       [ 2.57616370e-01],
       [ 1.44310720e+00],
       [ 1.03458720e+00],
       [ 1.43772035e+00],
       [ 3.84673096e-01],
       [-2.18860624e+00],
       [ 1.05608476e+00],
       [ 1.92641524e-01],
       [-1.12363985e-01],
       [ 1.72424265e-01],
       [ 5.48040969e-01],
       [ 1.88178641e+00],
       [-2.2

In [12]:
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(max_depth=2, random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

In [13]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy', accuracy_score(y_test, y_pred))

[[11  0  0]
 [ 0 12  1]
 [ 0  1  5]]
Accuracy 0.9333333333333333


### Principal Component Analysis with HP Waiting Times

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler, SMOTE

hp_me = pd.read_csv("harryPotterClean.csv")

In [15]:
smote = SMOTE(random_state=42)

#Function to perform oversampling
def overSampling(X_train, y_train, y_test, method):
    X_train_os, y_train_os= method.fit_resample(X_train, y_train)
    # Check the number of records after over sampling
    print(sorted(Counter(y_train_os).items())) 
    return(X_train_os, y_train_os)

#Fucntion to split into X and Y
def getXandY(df):
    df.drop(df.tail(20).index,inplace=True) 
    x = df.drop(['Harry_Potter_and_the_Forbidden'],axis=1)
    y = df.Harry_Potter_and_the_Forbidden
    return(x,y)

#Function to split in test and train
def trainTest(x,y):
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.30, shuffle=True)
    return(X_train, X_test, y_train, y_test)

Here I will be trimming the outliers, and also reducing the number of classes such as in the past notebook

In [16]:
#Delete all rows which contain outliers
hp_clean = hp_me[hp_me.Harry_Potter_and_the_Forbidden != 0] #delete rows with 0 min
hp_clean = hp_clean[hp_clean.Harry_Potter_and_the_Forbidden != 180]
hp_clean = hp_clean[hp_clean.Harry_Potter_and_the_Forbidden != 150]
hp_clean = hp_clean[hp_clean.Harry_Potter_and_the_Forbidden != 145]
hp_clean = hp_clean[hp_clean.Harry_Potter_and_the_Forbidden != 135]
hp_clean = hp_clean[hp_clean.Harry_Potter_and_the_Forbidden != 130]
hp_clean = hp_clean[hp_clean.Harry_Potter_and_the_Forbidden != 11]
hp_fin_clean = hp_clean[hp_clean.Harry_Potter_and_the_Forbidden != 125]

#Replace times ending in 5 and also compacting the rest to have only 6 classes
a=hp_fin_clean.Harry_Potter_and_the_Forbidden.replace([5, 15, 25, 35, 45, 55, 65, 75, 85, 95, 105, 115, 120, 110, 150,80, 90, 50,70], 
                                                       [10,10, 20, 30, 40, 50, 60, 60, 100, 100, 100, 100, 100, 100, 100,100, 100, 60,100])
#Create the final dataframe
df=pd.DataFrame(a)
hp2=hp_fin_clean.drop('Harry_Potter_and_the_Forbidden',axis=1)
hp3=pd.concat([hp2, df], axis=1)
hp4=hp3.drop('Unnamed: 0',axis=1)
hp4.Harry_Potter_and_the_Forbidden.unique()

array([ 40.,  20.,  10.,  30.,  60., 100.,  50.,  nan])

In [17]:
#create a csv of the compressed dataset
hp4.to_csv('hp_waitTime_6vars.csv')

In [18]:
#Split the compact dataframe into X and Y and train and test
x,y=getXandY(hp4)
X_train, X_test, y_train, y_test = trainTest(x,y)

In [19]:
print(sorted(Counter(y_train).items()))

[(10.0, 3557), (20.0, 1891), (30.0, 1095), (40.0, 1274), (50.0, 197), (60.0, 1870), (100.0, 1134)]


In [20]:
X_train_os, y_train_os=overSampling(X_train, y_train, y_test, smote)

[(10.0, 3557), (20.0, 3557), (30.0, 3557), (40.0, 3557), (50.0, 3557), (60.0, 3557), (100.0, 3557)]


In [21]:
from sklearn.decomposition import PCA

pca = PCA()
X_train_pca = pca.fit_transform(X_train_os)
X_test_pca = pca.transform(X_test)

explained_variance = pca.explained_variance_ratio_
explained_variance

array([3.97896184e-01, 1.36498318e-01, 1.21754728e-01, 9.01434349e-02,
       8.43473958e-02, 5.28936831e-02, 4.08584914e-02, 2.71684597e-02,
       2.35586558e-02, 1.38268444e-02, 1.10538054e-02, 1.16819983e-35])

Metrics with all the variables

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report

lg = LogisticRegression(max_iter=20000)
lg.fit(X_train_pca, y_train_os).decision_function(X_test_pca)
y_pred=lg.predict(X_test_pca)

In [23]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        10.0       0.57      0.53      0.55      1509
        20.0       0.22      0.13      0.17       814
        30.0       0.14      0.12      0.13       477
        40.0       0.25      0.17      0.20       554
        50.0       0.05      0.36      0.09       104
        60.0       0.27      0.11      0.16       785
       100.0       0.22      0.46      0.30       480

    accuracy                           0.30      4723
   macro avg       0.25      0.27      0.23      4723
weighted avg       0.33      0.30      0.30      4723



Lets make a function to try out the list 

In [24]:
def myPCA(n,X_train,X_test,y_train):
    print("-------------------------number of components = ", n)
    pca = PCA(n_components=n)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    lg = LogisticRegression(max_iter=20000)
    lg.fit(X_train_pca, y_train).decision_function(X_test_pca)
    y_pred=lg.predict(X_test_pca)
    print(classification_report(y_test, y_pred))

In [25]:
l = list(range(1,13))

for i in l:
    print(i)
    myPCA(i,X_train,X_test,y_train)

1
-------------------------number of components =  1
              precision    recall  f1-score   support

        10.0       0.32      1.00      0.48      1509
        20.0       0.00      0.00      0.00       814
        30.0       0.00      0.00      0.00       477
        40.0       0.00      0.00      0.00       554
        50.0       0.00      0.00      0.00       104
        60.0       0.00      0.00      0.00       785
       100.0       0.00      0.00      0.00       480

    accuracy                           0.32      4723
   macro avg       0.05      0.14      0.07      4723
weighted avg       0.10      0.32      0.15      4723

2
-------------------------number of components =  2


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

        10.0       0.32      1.00      0.48      1509
        20.0       0.00      0.00      0.00       814
        30.0       0.00      0.00      0.00       477
        40.0       0.00      0.00      0.00       554
        50.0       0.00      0.00      0.00       104
        60.0       0.00      0.00      0.00       785
       100.0       0.00      0.00      0.00       480

    accuracy                           0.32      4723
   macro avg       0.05      0.14      0.07      4723
weighted avg       0.10      0.32      0.15      4723

3
-------------------------number of components =  3
              precision    recall  f1-score   support

        10.0       0.32      1.00      0.48      1509
        20.0       0.00      0.00      0.00       814
        30.0       0.00      0.00      0.00       477
        40.0       0.00      0.00      0.00       554
        50.0       0.00      0.00      0.00       104
        60.0       0.00  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

        10.0       0.33      0.96      0.50      1509
        20.0       0.00      0.00      0.00       814
        30.0       0.00      0.00      0.00       477
        40.0       0.00      0.00      0.00       554
        50.0       0.00      0.00      0.00       104
        60.0       0.17      0.06      0.09       785
       100.0       0.22      0.05      0.08       480

    accuracy                           0.32      4723
   macro avg       0.10      0.15      0.10      4723
weighted avg       0.16      0.32      0.18      4723

5
-------------------------number of components =  5
              precision    recall  f1-score   support

        10.0       0.33      0.94      0.49      1509
        20.0       0.00      0.00      0.00       814
        30.0       0.00      0.00      0.00       477
        40.0       0.00      0.00      0.00       554
        50.0       0.00      0.00      0.00       104
        60.0       0.16  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

        10.0       0.35      0.93      0.51      1509
        20.0       0.20      0.05      0.08       814
        30.0       0.00      0.00      0.00       477
        40.0       0.00      0.00      0.00       554
        50.0       0.00      0.00      0.00       104
        60.0       0.18      0.08      0.11       785
       100.0       0.27      0.10      0.15       480

    accuracy                           0.33      4723
   macro avg       0.14      0.17      0.12      4723
weighted avg       0.20      0.33      0.21      4723

7
-------------------------number of components =  7
              precision    recall  f1-score   support

        10.0       0.36      0.88      0.51      1509
        20.0       0.20      0.09      0.12       814
        30.0       0.00      0.00      0.00       477
        40.0       0.00      0.00      0.00       554
        50.0       0.00      0.00      0.00       104
        60.0       0.19  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

        10.0       0.41      0.80      0.54      1509
        20.0       0.22      0.09      0.13       814
        30.0       0.00      0.00      0.00       477
        40.0       0.23      0.02      0.03       554
        50.0       0.00      0.00      0.00       104
        60.0       0.23      0.33      0.27       785
       100.0       0.37      0.18      0.25       480

    accuracy                           0.35      4723
   macro avg       0.21      0.20      0.17      4723
weighted avg       0.27      0.35      0.27      4723

9
-------------------------number of components =  9


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

        10.0       0.43      0.80      0.56      1509
        20.0       0.20      0.10      0.13       814
        30.0       0.00      0.00      0.00       477
        40.0       0.17      0.01      0.02       554
        50.0       0.00      0.00      0.00       104
        60.0       0.24      0.38      0.30       785
       100.0       0.33      0.18      0.23       480

    accuracy                           0.36      4723
   macro avg       0.20      0.21      0.18      4723
weighted avg       0.27      0.36      0.28      4723

10
-------------------------number of components =  10


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

        10.0       0.44      0.79      0.56      1509
        20.0       0.25      0.14      0.18       814
        30.0       0.00      0.00      0.00       477
        40.0       0.22      0.03      0.06       554
        50.0       0.00      0.00      0.00       104
        60.0       0.25      0.37      0.30       785
       100.0       0.32      0.19      0.24       480

    accuracy                           0.36      4723
   macro avg       0.21      0.22      0.19      4723
weighted avg       0.28      0.36      0.29      4723

11
-------------------------number of components =  11


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

        10.0       0.46      0.86      0.60      1509
        20.0       0.23      0.13      0.17       814
        30.0       1.00      0.00      0.00       477
        40.0       0.31      0.06      0.09       554
        50.0       0.00      0.00      0.00       104
        60.0       0.28      0.37      0.32       785
       100.0       0.32      0.21      0.26       480

    accuracy                           0.39      4723
   macro avg       0.37      0.23      0.21      4723
weighted avg       0.40      0.39      0.31      4723

12
-------------------------number of components =  12
              precision    recall  f1-score   support

        10.0       0.46      0.86      0.60      1509
        20.0       0.23      0.13      0.17       814
        30.0       1.00      0.00      0.00       477
        40.0       0.31      0.06      0.09       554
        50.0       0.00      0.00      0.00       104
        60.0       0.28

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
