PCA Implementation
(ref: https://www.youtube.com/watch?v=Rjr62b_h7S4&t=394s&ab_channel=AssemblyAI)

In [49]:
import numpy as np

class PCA:
    """Principal component analysis via eigendecomposition of the covariance matrix.

    Parameters
    ----------
    num_components : int
        How many leading principal directions to keep. It is used as a slice
        bound on the sorted eigenvectors, so a value larger than the number of
        features simply keeps every direction.

    Attributes
    ----------
    components : ndarray or None
        One principal direction per row, ordered by decreasing eigenvalue.
        ``None`` until ``fit`` has been called.
    mean : ndarray or None
        Per-feature mean of the data passed to ``fit``; subtracted again in
        ``transform``. ``None`` until ``fit`` has been called.
    explained_variance : ndarray or None
        Eigenvalues of the covariance matrix that belong to ``components``
        (same order). ``None`` until ``fit`` has been called.
    """

    def __init__(self, num_components):
        self.num_components = num_components
        self.components = None
        self.mean = None
        self.explained_variance = None

    def fit(self, X):
        """Learn the feature means and principal directions of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, one sample per row.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or has fewer than two rows (the covariance
            estimate divides by ``n_samples - 1``).
        """
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError(
                f"PCA.fit expects a 2-D array (n_samples, n_features), got ndim={X.ndim}"
            )
        if X.shape[0] < 2:
            raise ValueError(
                "PCA.fit needs at least 2 samples to estimate the covariance matrix"
            )

        # Centre the data; the same mean is reused by transform().
        self.mean = np.mean(X, axis=0)
        X = X - self.mean

        # Sample covariance (n - 1 in the denominator).
        cov_mat = np.dot(X.T, X) / (X.shape[0] - 1)
        # The covariance matrix is symmetric, so eigh is the appropriate solver.
        eigenvalues, eigenvectors = np.linalg.eigh(cov_mat)

        # eigh returns eigenvectors as columns; transpose so each row is one
        # direction, then sort explicitly from largest to smallest eigenvalue.
        eigenvectors = eigenvectors.T
        arg_ind = np.argsort(eigenvalues)[::-1]
        eigenvalues = eigenvalues[arg_ind]
        eigenvectors = eigenvectors[arg_ind]

        self.components = eigenvectors[:self.num_components]
        self.explained_variance = eigenvalues[:self.num_components]

    def transform(self, X):
        """Project ``X`` onto the fitted principal directions.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data with the same feature layout as the data given to ``fit``.

        Returns
        -------
        ndarray of shape (n_samples, number of kept components)

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.components is None or self.mean is None:
            raise RuntimeError("PCA.transform was called before PCA.fit")
        X = np.asarray(X) - self.mean
        return np.dot(X, self.components.T)

Running PCA on MNIST data

In [50]:
import numpy as np
import keras
import random
from keras.datasets import mnist
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Tunable constants for this cell.
SEED = 42                # shared by the subsample draw and the train/validation split
NUM_PIXELS = 784         # length of each flattened image row
SUBSAMPLE_SIZE = 40000   # number of training rows kept for the reduced experiment

(train_images, train_labels), (test_images, test_labels) = keras.datasets.mnist.load_data()

# Flatten every image into a single row of NUM_PIXELS values.
train_images = np.reshape(train_images, (-1, NUM_PIXELS))
test_images = np.reshape(test_images, (-1, NUM_PIXELS))

# Scale pixel values by 1/255 and store them as float32.
train_images = train_images.astype('float32') / 255
test_images = test_images.astype('float32') / 255

# Draw the subsample from a dedicated, seeded generator so that the selected
# rows (and every number computed from them) are identical on each run, without
# touching the global `random` state.
random_sample_indices = random.Random(SEED).sample(range(train_images.shape[0]), SUBSAMPLE_SIZE)
train_images_25 = train_images[random_sample_indices]
train_labels_25 = train_labels[random_sample_indices]

# NOTE: the `_25` / `_80` / `_10` suffixes do not describe the actual sizes --
# the subsample has SUBSAMPLE_SIZE rows and the split below is 70% / 30%.
# The names are kept because later cells refer to them.
train_images_final_80, validation_images_final_10, train_labels_final_80, validation_labels_final_10 = train_test_split(
    train_images_25, train_labels_25, test_size=0.3, random_state=SEED
)

print("Final train dataset size: ", train_images_final_80.shape)
print("Final validation dataset size: ", validation_images_final_10.shape)


Final train dataset size:  (28000, 784)
Final validation dataset size:  (12000, 784)


In [51]:

# Reduce MNIST to 5 principal components (fitted on the full training set),
# then score a logistic-regression classifier on the projected test set.
pca = PCA(num_components=5)
pca.fit(train_images)
projection_train_mnist, projection_test_mnist = (
    pca.transform(split) for split in (train_images, test_images)
)

model = LogisticRegression(max_iter=2000).fit(projection_train_mnist, train_labels)

# Classification accuracy on the projected test images.
accuracy = accuracy_score(test_labels, model.predict(projection_test_mnist))
print("LR Accuracy MNIST for D=5:", accuracy)



LR Accuracy MNIST for D=5: 0.6875


In [52]:
# Same experiment with 20 components, this time fitting PCA and the classifier
# on the reduced training split only; evaluation still uses the MNIST test set.
pca = PCA(num_components=20)
pca.fit(train_images_final_80)

projection_train_mnist = pca.transform(train_images_final_80)
projection_test_mnist = pca.transform(test_images)

print("original shape:", train_images_final_80.shape)
print("Projection shape: ", projection_train_mnist.shape)

model = LogisticRegression(max_iter=2500).fit(projection_train_mnist, train_labels_final_80)

# Classification accuracy on the projected test images.
accuracy = accuracy_score(test_labels, model.predict(projection_test_mnist))
print("LR Accuracy MNIST for D=20:", accuracy)



original shape: (28000, 784)
Projection shape:  (28000, 20)
LR Accuracy MNIST for D=20: 0.8802


In [53]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Read the Spambase table; the file is parsed without a header row.
dataset_path = 'spambase.data'
df = pd.read_csv(dataset_path, header=None)

# The final column is the target; every column before it is a feature.
y = df.iloc[:, -1]
X = df.iloc[:, :-1]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Learn the scaling statistics from the training split only, then apply the
# same scaling to both splits.
scaler = StandardScaler().fit(X_train)
spambase_train_norm = scaler.transform(X_train)
spambase_test_norm = scaler.transform(X_test)

print(spambase_test_norm.shape)

(921, 57)


In [54]:
# Project the standardized Spambase features onto 5 principal components and
# score a logistic-regression classifier on the projected test split.
pca = PCA(5)
pca.fit(spambase_train_norm)

# NOTE: these two names carry an `_mnist` suffix but hold Spambase projections
# in this cell; they are kept unchanged so nothing that reads them breaks.
projection_train_mnist = pca.transform(spambase_train_norm)
projection_test_mnist = pca.transform(spambase_test_norm)

print(projection_train_mnist.shape)

# `saga` is a stochastic solver; fixing random_state makes the fitted model
# (and the accuracy printed below) reproducible across runs.
# NOTE(review): `multi_class` is reported as deprecated by recent scikit-learn
# releases -- confirm against the installed version before upgrading.
model = LogisticRegression(solver='saga', multi_class='multinomial', max_iter=200, random_state=42)
model.fit(projection_train_mnist, y_train)

accuracy_spambase_pca = model.score(projection_test_mnist, y_test)
print("LR Accuracy Spambase with D=5:", accuracy_spambase_pca)

(3680, 5)
LR Accuracy Spambase with D=5: 0.8783930510314875


In [56]:
# Baseline: logistic regression on all standardized Spambase features.
model = LogisticRegression()
model.fit(spambase_train_norm, y_train)

accuracy_spambase = model.score(spambase_test_norm, y_test)
print("LR Accuracy Spambase with logistic reg:", accuracy_spambase)

# Search D = 19 .. 1 for the PCA dimensionality whose test accuracy is closest
# to the baseline. Start from "nothing found yet" instead of magic sentinels.
smallest_d = None
min_delta = float("inf")

for d in reversed(range(1, 20)):
    pca = PCA(d)
    pca.fit(spambase_train_norm)
    projection_train = pca.transform(spambase_train_norm)
    projection_test = pca.transform(spambase_test_norm)

    # `saga` is a stochastic solver; a fixed random_state keeps the per-D
    # accuracies, and therefore the selected D, reproducible across runs.
    model = LogisticRegression(solver='saga', multi_class='multinomial', max_iter=1500, random_state=42)
    model.fit(projection_train, y_train)

    accuracy_d = model.score(projection_test, y_test)
    print("Accuracy for D = ", d, " :", accuracy_d)

    delta = abs(accuracy_spambase - accuracy_d)
    # D is visited from large to small, so `<=` lets a later (smaller) D win a
    # tie. With a strict `<` the larger D would be kept, contradicting the
    # "Smallest D" that is reported below.
    if delta <= min_delta:
        min_delta = delta
        smallest_d = d

print("Smallest D: ", smallest_d)

LR Accuracy Spambase with logistic reg: 0.9196525515743756
Accuracy for D =  19  : 0.9077090119435396
Accuracy for D =  18  : 0.9011943539630836
Accuracy for D =  17  : 0.9022801302931596
Accuracy for D =  16  : 0.9011943539630836
Accuracy for D =  15  : 0.9044516829533116
Accuracy for D =  14  : 0.8979370249728555
Accuracy for D =  13  : 0.8968512486427795
Accuracy for D =  12  : 0.8979370249728555
Accuracy for D =  11  : 0.9001085776330076
Accuracy for D =  10  : 0.8979370249728555
Accuracy for D =  9  : 0.8914223669923995
Accuracy for D =  8  : 0.8870792616720955
Accuracy for D =  7  : 0.8794788273615635
Accuracy for D =  6  : 0.8773072747014115
Accuracy for D =  5  : 0.8783930510314875
Accuracy for D =  4  : 0.8762214983713354
Accuracy for D =  3  : 0.8675352877307275
Accuracy for D =  2  : 0.8686210640608035
Accuracy for D =  1  : 0.8447339847991314
Smallest D:  19
