# Principal Component Analysis (PCA)


In [8]:
# Imports
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from tabulate import tabulate

import sys
sys.path.append("../utils")
from utils import PCA_reduction

Let's prepare the data for PCA. PCA is affected by scale, so we need to scale the features in the data before applying it. We can transform the data onto unit scale (mean = 0 and variance = 1), which is a requirement for the optimal performance of many machine learning algorithms. StandardScaler helps standardize the dataset’s features.
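Notice that the code below passes a fraction (e.g. 0.95) as the number-of-components parameter. It means that scikit-learn chooses the minimum number of principal components such that this share of the variance (e.g. 95%) is retained. We try several fractions to compare accuracy, number of components and timing.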

In [9]:
# Load the dump and separate the feature columns from the 'label' target.
gt = pd.read_csv('../../dumps/2020.03.11-17.39.csv')
cols = [col for col in gt.columns if col not in ['label']]
data = gt[cols]
target = gt['label']

# Hold out 20% of the samples; random_state pins the split so every run sees the same rows.
data_train, data_test, target_train, target_test = train_test_split(data,target, test_size = 0.20, random_state = 0)

# Fit the scaler on the training split only, then apply it to both splits,
# so no statistics from the test split leak into the preprocessing.
scaler = StandardScaler()
scaler.fit(data_train)
data_train_raw = scaler.transform(data_train)
data_test_raw = scaler.transform(data_test)

# One table row per retained-variance setting.
cell_text = []
for variance in [1,0.99,0.95,0.90,0.85]:
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # short durations; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    # PCA treats an integer n_components as a component count, so the
    # "keep everything" case uses the default constructor instead of PCA(1).
    pca = PCA(variance) if variance != 1 else PCA()
    pca.fit(data_train_raw)
    # Always project from the scaled arrays so each iteration starts from the same input.
    data_train = pca.transform(data_train_raw)
    data_test = pca.transform(data_test_raw)
    clf = KNeighborsClassifier(n_neighbors=6,p=2)
    clf.fit(data_train, target_train)
    # The timed region covers PCA fit/transform and the KNN fit; scoring below is not timed.
    elapsed = time.perf_counter() - start
    cell_text.append([
        variance,
        clf.score(data_train, target_train),
        clf.score(data_test, target_test),
        pca.n_components_,
        elapsed,
    ])
print(tabulate(cell_text, headers = ['Variance','Training acc','Test acc','Components','Time (s)']))

  Variance    Training acc    Test acc    Components    Time (s)
----------  --------------  ----------  ------------  ----------
      1           0.984867    0.979939           119    0.175548
      0.99        0.984867    0.977899            98    0.171339
      0.95        0.984186    0.977219            77    0.170872
      0.9         0.984526    0.978579            60    0.137358
      0.85        0.984696    0.978239            47    0.128116


It's interesting to notice that we can keep nearly the same accuracies while reducing our features from a set of 119 to 47, and that doing so clearly improves the timing (measured here over the PCA fit/transform and the classifier fit). Here we're using quite a small dataset (14k samples), but there is no doubt that the time saved becomes significant with even more samples.

Let's just test without standardization and with normalization in order to see how performances are impacted.

In [11]:
# Baseline run: PCA followed by KNN on the raw features, with no scaling step.
gt = pd.read_csv('../../dumps/2020.03.11-17.39.csv')
cols = [name for name in gt.columns if name != 'label']
data, target = gt[cols], gt['label']

data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.20, random_state=0
)

# Fit the projection on the training split only, then apply it to both splits.
pca = PCA(0.95).fit(data_train)
data_train, data_test = pca.transform(data_train), pca.transform(data_test)

clf = KNeighborsClassifier(n_neighbors=6, p=2).fit(data_train, target_train)

train_accuracy = clf.score(data_train, target_train)
test_accuracy = clf.score(data_test, target_test)
print("Training set accuracy: {:.2f}".format(train_accuracy))
print("Test set accuracy: {:.2f}".format(test_accuracy))

Training set accuracy: 0.96
Test set accuracy: 0.96


In [12]:
# Variant run: rescale with Normalizer before PCA + KNN.
# Normalizer must be imported in the notebook's import cell
# (from sklearn.preprocessing import Normalizer); without it this cell
# raises NameError on a fresh kernel.
gt = pd.read_csv('../../dumps/2020.03.11-17.39.csv')
cols = [col for col in gt.columns if col not in ['label']]
data = gt[cols]
target = gt['label']

pca = PCA(0.95)

data_train, data_test, target_train, target_test = train_test_split(data,target, test_size = 0.20, random_state = 0)

# Normalizer rescales each sample (row) to unit norm; unlike StandardScaler it
# learns nothing from the data, so fit() is kept only for API symmetry.
scaler = Normalizer()
scaler.fit(data_train)
data_train = scaler.transform(data_train)
data_test = scaler.transform(data_test)

# Fit the projection on the training split only, then apply it to both splits.
pca.fit(data_train)
data_train = pca.transform(data_train)
data_test = pca.transform(data_test)

clf = KNeighborsClassifier(n_neighbors=6,p=2)
clf.fit(data_train, target_train)
print("Training set accuracy: {:.2f}".format(clf.score(data_train, target_train)))
print("Test set accuracy: {:.2f}".format(clf.score(data_test, target_test)))

Training set accuracy: 0.97
Test set accuracy: 0.96


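Performance is slightly better with normalization than with no scaling at all (training accuracy 0.97 vs 0.96; test accuracy 0.96 in both cases), but both remain below the roughly 0.98 obtained with standardization above.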