In [1]:
import os
import numpy as np
import codecs
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from dotenv import load_dotenv
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier


# Load variables from a local .env file into the process environment;
# MNIST_DATAPATH is read from os.environ in the next cell.
load_dotenv()

True

In [2]:
# Directory holding the raw MNIST IDX files, taken from the environment.
path_to_data = os.environ.get("MNIST_DATAPATH")
if not path_to_data:
    # Without this check os.listdir(None) would silently list the current
    # working directory instead of failing.
    raise RuntimeError(
        "MNIST_DATAPATH is not set; point it at the directory containing the MNIST *-ubyte files."
    )
# Guarantee a trailing separator so that `path_to_data + filename` is a valid
# path whether or not the variable was written with one.
path_to_data = os.path.join(path_to_data, "")

# Keep only the uncompressed IDX files; sort them so the read order does not
# depend on whatever order os.listdir returns.
mnist_files = sorted(x for x in os.listdir(path_to_data) if x.endswith("ubyte"))


def convert_to_int(byte):
    """Interpret a byte string as an unsigned big-endian integer.

    Args:
        byte: Bytes-like object (e.g. a 4-byte slice of an IDX header).

    Returns:
        The integer value, with the first byte as the most significant one.

    Raises:
        ValueError: If ``byte`` is empty (e.g. a slice of a truncated file).
    """
    # int.from_bytes would quietly return 0 for empty input; keep failing
    # loudly so a truncated header is not mistaken for a real value.
    if len(byte) == 0:
        raise ValueError("cannot convert an empty byte string to an integer")
    return int.from_bytes(byte, byteorder="big")


# Magic numbers found in the first four header bytes of each file.
IDX_IMAGES_MAGIC = 2051
IDX_LABELS_MAGIC = 2049
# The record count in the header tells the two splits apart.
SPLIT_BY_LENGTH = {60000: "train", 10000: "test"}

# Maps "<split>_<category>" (e.g. "train_images") to the parsed uint8 array.
dataset = {}
for file in mnist_files:
    print("Reading", file)
    # os.path.join works whether or not path_to_data ends with a separator.
    with open(os.path.join(path_to_data, file), "rb") as f:
        data = f.read()

    # Header layout: bytes 0-3 magic number, bytes 4-7 number of records.
    type_of_data = convert_to_int(data[:4])
    length = convert_to_int(data[4:8])

    if type_of_data == IDX_IMAGES_MAGIC:
        category = "images"
        # Image files carry two extra header fields: rows and columns.
        number_of_rows = convert_to_int(data[8:12])
        number_of_columns = convert_to_int(data[12:16])
        parsed = np.frombuffer(data, dtype=np.uint8, offset=16)
        parsed = parsed.reshape(length, number_of_rows, number_of_columns)
    elif type_of_data == IDX_LABELS_MAGIC:
        category = "labels"
        parsed = np.frombuffer(data, dtype=np.uint8, offset=8)
        parsed = parsed.reshape(length)
    else:
        # Fail instead of storing this file under the previous file's category.
        raise ValueError(f"{file}: unexpected IDX magic number {type_of_data}")

    split = SPLIT_BY_LENGTH.get(length)
    if split is None:
        # Fail instead of reusing the split name left over from the previous file.
        raise ValueError(f"{file}: unexpected number of records {length}")

    dataset[split + '_' + category] = parsed

print(dataset["train_images"][0, :, :])

Reading t10k-images-idx3-ubyte
Reading t10k-labels-idx1-ubyte
Reading train-images-idx3-ubyte
Reading train-labels-idx1-ubyte
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   3  18  18  18 126 136
  175  26 166 255 247 127   0   0   0   0]
 [  0   0   0   0   0   0   0   0  30  36  94 154 170 253 253 253 253 253
  225 172 253 242 195  64   0   0   0   0]
 [  0   0   0   0   0   0   0  49 238 253 253 253 253 2

In [3]:
# Pixels were parsed as uint8, so dividing by 255 scales them into [0, 1].
PIXEL_MAX = 255

train_images = dataset['train_images']
train_labels = dataset['train_labels']
# One row per image; the sample count comes from the array itself rather than
# a hard-coded 60000, so the cell also works on a subset of the data.
train_images_flattened = train_images.reshape(train_images.shape[0], -1) / PIXEL_MAX

test_images = dataset['test_images']
test_labels = dataset['test_labels']
test_images_flattened = test_images.reshape(test_images.shape[0], -1) / PIXEL_MAX

## Self-implemented PCA

In [4]:
def self_implemented_pca(original_data, d) -> tuple:
    """Project data onto the ``d`` leading eigenvectors of its covariance matrix.

    Args:
        original_data: 2-D array with one observation per row and one feature
            per column (the covariance is computed with ``rowvar=False``).
        d: Number of leading eigenvectors to keep.

    Returns:
        A ``(projected, top_eigenvectors)`` tuple. ``top_eigenvectors`` holds
        the selected eigenvectors as columns, ordered by decreasing eigenvalue,
        and ``projected`` is ``original_data`` multiplied by that matrix.

    Note:
        The projection is applied to ``original_data`` as given; the feature
        means are not subtracted first. New data projected with the returned
        eigenvectors should be treated the same way to stay comparable.
    """
    feature_cov = np.cov(original_data, rowvar=False)

    # eigh is the solver for symmetric matrices, which a covariance matrix is.
    eigvals, eigvecs = np.linalg.eigh(feature_cov)

    # Indices of the eigenvalues from largest to smallest, truncated to d.
    descending = np.argsort(eigvals)[::-1]
    leading = descending[:d]
    top_eigenvectors = eigvecs[:, leading]

    return np.dot(original_data, top_eigenvectors), top_eigenvectors

In [5]:
# The eigenvectors are derived from the training images only; the test images
# are then projected with those same eigenvectors.
mnist_train_d5, eigen_vectors_d5 = self_implemented_pca(train_images_flattened, d=5)
mnist_test_d5 = np.dot(test_images_flattened, eigen_vectors_d5)

mnist_train_d20, eigen_vectors_d20 = self_implemented_pca(train_images_flattened, d=20)
mnist_test_d20 = np.dot(test_images_flattened, eigen_vectors_d20)

# Sanity check: the sample count is unchanged, the feature count drops to D.
shape_report = (
    f"Original shape: {train_images_flattened.shape}, "
    f"PCA (D=5) shape: {mnist_train_d5.shape}, "
    f"PCA (D=20) shape: {mnist_train_d20.shape}"
)
print(shape_report)

Original shape: (60000, 784), PCA (D=5) shape: (60000, 5), PCA (D=20) shape: (60000, 20)


### L2-regularized logistic regression

In [6]:
# The deprecated `multi_class` argument is intentionally omitted: with the
# lbfgs solver and more than two classes, scikit-learn already fits a
# multinomial model by default, so the fit is unchanged and the deprecation
# warning disappears.
log_reg_pca5 = LogisticRegression(penalty="l2", solver="lbfgs", max_iter=1000)
log_reg_pca5.fit(mnist_train_d5, train_labels)
y_pred_pca5 = log_reg_pca5.predict(mnist_test_d5)

acc_pca5 = accuracy_score(test_labels, y_pred_pca5)
print(f'Logistic Regression Accuracy with PCA (D=5): {acc_pca5:.4f}')

# Same model configuration, trained on the 20-component projection.
log_reg_pca20 = LogisticRegression(penalty="l2", solver="lbfgs", max_iter=1000)
log_reg_pca20.fit(mnist_train_d20, train_labels)
y_pred_pca20 = log_reg_pca20.predict(mnist_test_d20)

acc_pca20 = accuracy_score(test_labels, y_pred_pca20)
print(f'Logistic Regression Accuracy with PCA (D=20): {acc_pca20:.4f}')



Logistic Regression Accuracy with PCA (D=5): 0.6873




Logistic Regression Accuracy with PCA (D=20): 0.8803


### Decision tree

In [7]:
# Both trees use the same settings, so only the input projection differs.
tree_settings = {"max_depth": 10, "random_state": 42}

# Tree on the 5-component projection.
dt_pca5 = DecisionTreeClassifier(**tree_settings).fit(mnist_train_d5, train_labels)
y_pred_dt_pca5 = dt_pca5.predict(mnist_test_d5)
acc_dt_pca5 = accuracy_score(y_true=test_labels, y_pred=y_pred_dt_pca5)
print(f'Decision Tree Accuracy with PCA (D=5): {acc_dt_pca5:.4f}')

# Tree on the 20-component projection.
dt_pca20 = DecisionTreeClassifier(**tree_settings).fit(mnist_train_d20, train_labels)
y_pred_dt_pca20 = dt_pca20.predict(mnist_test_d20)
acc_dt_pca20 = accuracy_score(y_true=test_labels, y_pred=y_pred_dt_pca20)
print(f'Decision Tree Accuracy with PCA (D=20): {acc_dt_pca20:.4f}')

Decision Tree Accuracy with PCA (D=5): 0.6957
Decision Tree Accuracy with PCA (D=20): 0.7954


In [8]:
# Recap of the four test-set accuracies computed in the cells above.
summary_lines = (
    f'Logistic Regression Accuracy with PCA (D=5): {acc_pca5:.4f}',
    f'Logistic Regression Accuracy with PCA (D=20): {acc_pca20:.4f}',
    f'Decision Tree Accuracy with PCA (D=5): {acc_dt_pca5:.4f}',
    f'Decision Tree Accuracy with PCA (D=20): {acc_dt_pca20:.4f}',
)
for summary_line in summary_lines:
    print(summary_line)

Logistic Regression Accuracy with PCA (D=5): 0.6873
Logistic Regression Accuracy with PCA (D=20): 0.8803
Decision Tree Accuracy with PCA (D=5): 0.6957
Decision Tree Accuracy with PCA (D=20): 0.7954


#### Original scores:
Logistic Regression Accuracy with PCA (D=5): 0.6800
Logistic Regression Accuracy with PCA (D=20): 0.8721
Decision Tree Accuracy with PCA (D=5): 0.6943
Decision Tree Accuracy with PCA (D=20): 0.8027