# MNIST 

In [None]:
import os
import numpy as np
import codecs
from dotenv import load_dotenv
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split


# Populate os.environ from a local .env file (python-dotenv) so that
# MNIST_DATAPATH can be read from the environment in the data-loading cell.
load_dotenv()

True

In [2]:
# Directory holding the raw MNIST IDX files, configured through the
# MNIST_DATAPATH environment variable.
path_to_data = os.environ.get("MNIST_DATAPATH")
if not path_to_data:
    # os.listdir(None) would silently list the current working directory,
    # so fail early with an actionable message instead.
    raise RuntimeError(
        "MNIST_DATAPATH is not set; point it at the directory containing the MNIST *ubyte files."
    )

# Keep only the uncompressed IDX files. os.listdir returns entries in arbitrary
# order, so sort them to make the read order (and the log output) deterministic.
mnist_files = sorted(x for x in os.listdir(path_to_data) if x.endswith("ubyte"))


def convert_to_int(byte):
    """Interpret a bytes-like object as an unsigned big-endian integer.

    Used to decode the 4-byte header fields of the IDX files (magic number,
    item count, rows, columns).

    Parameters
    ----------
    byte : bytes-like
        The raw bytes to decode; most significant byte first.

    Returns
    -------
    int
        The non-negative integer encoded by ``byte``.

    Raises
    ------
    ValueError
        If ``byte`` is empty (e.g. the header slice came from a truncated file).
    """
    if not byte:
        # An empty slice means the header is missing; refuse to invent a value.
        raise ValueError("cannot convert an empty byte string to an integer")
    # Direct decoding replaces the hex-encode / int(..., 16) round trip.
    return int.from_bytes(byte, "big")


# Parse every IDX file into a NumPy array stored under "<split>_<category>",
# e.g. "train_images" or "test_labels".
#
# Header layout as decoded below (each field is a 4-byte big-endian integer):
#   bytes 0-3   magic number: 2051 marks an image file, 2049 a label file
#   bytes 4-7   number of items
#   image files: bytes 8-11 rows, bytes 12-15 columns, pixel bytes from offset 16
#   label files: label bytes from offset 8
dataset = {}
for file in mnist_files:
    print("Reading", file)
    # os.path.join inserts the separator when MNIST_DATAPATH has no trailing slash.
    with open(os.path.join(path_to_data, file), "rb") as f:
        data = f.read()

    # The file is fully in memory, so parsing happens after the handle is closed.
    type_of_data = convert_to_int(data[:4])
    length = convert_to_int(data[4:8])

    if type_of_data == 2051:
        category = "images"
        number_of_rows = convert_to_int(data[8:12])
        number_of_columns = convert_to_int(data[12:16])
        parsed = np.frombuffer(data, dtype=np.uint8, offset=16)
        parsed = parsed.reshape(length, number_of_rows, number_of_columns)
    elif type_of_data == 2049:
        category = "labels"
        parsed = np.frombuffer(data, dtype=np.uint8, offset=8)
        parsed = parsed.reshape(length)
    else:
        # Without this branch an unknown file would be stored using the
        # `category` and `parsed` values left over from the previous file.
        print("Skipping", file, "- unrecognised magic number", type_of_data)
        continue

    # The split is inferred from the item count: 60000 -> train, 10000 -> test.
    # The name `split` is used so the builtin `set` is not shadowed.
    if length == 60000:
        split = "train"
    elif length == 10000:
        split = "test"
    else:
        print("Skipping", file, "- unexpected item count", length)
        continue

    dataset[split + '_' + category] = parsed

# Sanity check: show the pixel grid of the first training image.
print(dataset["train_images"][0, :, :])

Reading t10k-images-idx3-ubyte
Reading t10k-labels-idx1-ubyte
Reading train-images-idx3-ubyte
Reading train-labels-idx1-ubyte
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   3  18  18  18 126 136
  175  26 166 255 247 127   0   0   0   0]
 [  0   0   0   0   0   0   0   0  30  36  94 154 170 253 253 253 253 253
  225 172 253 242 195  64   0   0   0   0]
 [  0   0   0   0   0   0   0  49 238 253 253 253 253 2

In [3]:
# Flatten each image into one feature row and rescale the pixel values by
# 1/255. The row count is taken from the array itself instead of hard-coding
# 60000 / 10000, so the cell also works on a subset of the data.
train_images = dataset['train_images']
train_labels = dataset['train_labels']
assert len(train_images) == len(train_labels), "train images/labels length mismatch"
train_images_flattened = train_images.reshape(len(train_images), -1) / 255

test_images = dataset['test_images']
test_labels = dataset['test_labels']
assert len(test_images) == len(test_labels), "test images/labels length mismatch"
test_images_flattened = test_images.reshape(len(test_images), -1) / 255

In [5]:
# Standardise the features using statistics from the training set only, then
# apply the same transform to the test set.
scaler = StandardScaler()
mnist_features_train = scaler.fit_transform(train_images_flattened)
mnist_features_test = scaler.transform(test_images_flattened)

# Project onto the first 5 and first 20 principal components. Each PCA is fitted
# on the training features and reused for the test features. random_state is
# fixed so the projection is reproducible when a randomized SVD solver is
# selected (same seed as the decision trees further down).
pca_5 = PCA(n_components=5, random_state=42)
mnist_features_train_pca5 = pca_5.fit_transform(mnist_features_train)
mnist_features_test_pca5 = pca_5.transform(mnist_features_test)

pca_20 = PCA(n_components=20, random_state=42)
mnist_features_train_pca20 = pca_20.fit_transform(mnist_features_train)
mnist_features_test_pca20 = pca_20.transform(mnist_features_test)

# Same three output lines as before, one print per line for readability.
print(f"Original Shape: train - {mnist_features_train.shape}, test - {mnist_features_test.shape}")
print(f"PCA_5 shape: train - {mnist_features_train_pca5.shape}, test - {mnist_features_test_pca5.shape}")
print(f"PCA_20 shape: train - {mnist_features_train_pca20.shape}, test - {mnist_features_test_pca20.shape}")

Original Shape: train - (60000, 784), test - (10000, 784)
PCA_5 shape: train - (60000, 5), test - (10000, 5)
PCA_20 shape: train - (60000, 20), test - (10000, 20)


### L2-regularized logistic regression

#### PCA D = 5

In [None]:
# L2-penalised logistic regression on the 5-component PCA features.
# `multi_class` is deprecated in recent scikit-learn releases; with the lbfgs
# solver and more than two classes the default already fits a multinomial
# model, so the argument is omitted to avoid the FutureWarning.
log_reg_pca5 = LogisticRegression(penalty="l2", solver="lbfgs", max_iter=1000)
log_reg_pca5.fit(mnist_features_train_pca5, train_labels)
y_pred_pca5 = log_reg_pca5.predict(mnist_features_test_pca5)

# Accuracy on the held-out MNIST test set.
acc_pca5 = accuracy_score(test_labels, y_pred_pca5)
print(f'Logistic Regression Accuracy with PCA (D=5): {acc_pca5:.4f}')



Logistic Regression Accuracy with PCA (D=5): 0.6800


#### PCA D = 20

In [None]:
# L2-penalised logistic regression on the 20-component PCA features.
# `multi_class` is deprecated in recent scikit-learn releases; with the lbfgs
# solver and more than two classes the default already fits a multinomial
# model, so the argument is omitted to avoid the FutureWarning.
log_reg_pca20 = LogisticRegression(penalty="l2", solver="lbfgs", max_iter=1000)
log_reg_pca20.fit(mnist_features_train_pca20, train_labels)
y_pred_pca20 = log_reg_pca20.predict(mnist_features_test_pca20)

# Accuracy on the held-out MNIST test set.
acc_pca20 = accuracy_score(test_labels, y_pred_pca20)
print(f'Logistic Regression Accuracy with PCA (D=20): {acc_pca20:.4f}')



Logistic Regression Accuracy with PCA (D=20): 0.8721


### Decision Tree

#### PCA D = 5

In [8]:
# Depth-limited decision tree (fixed seed) trained on the 5-component PCA
# features; fit() returns the estimator, so construction and fitting are chained.
dt_pca5 = DecisionTreeClassifier(max_depth=10, random_state=42).fit(
    mnist_features_train_pca5,
    train_labels,
)

# Predict the test set and report the share of correctly classified digits.
y_pred_dt_pca5 = dt_pca5.predict(mnist_features_test_pca5)
acc_dt_pca5 = accuracy_score(y_true=test_labels, y_pred=y_pred_dt_pca5)
print(f'Decision Tree Accuracy with PCA (D=5): {acc_dt_pca5:.4f}')

Decision Tree Accuracy with PCA (D=5): 0.6943


#### PCA D = 20

In [9]:
# Depth-limited decision tree (fixed seed) trained on the 20-component PCA
# features; fit() returns the estimator, so construction and fitting are chained.
dt_pca20 = DecisionTreeClassifier(max_depth=10, random_state=42).fit(
    mnist_features_train_pca20,
    train_labels,
)

# Predict the test set and report the share of correctly classified digits.
y_pred_dt_pca20 = dt_pca20.predict(mnist_features_test_pca20)
acc_dt_pca20 = accuracy_score(y_true=test_labels, y_pred=y_pred_dt_pca20)
print(f'Decision Tree Accuracy with PCA (D=20): {acc_dt_pca20:.4f}')

Decision Tree Accuracy with PCA (D=20): 0.8027


### Results for MNIST

In [10]:
# Recap of the four MNIST test accuracies computed above, one per line.
print(
    "\n".join(
        (
            f'Logistic Regression Accuracy with PCA (D=5): {acc_pca5:.4f}',
            f'Logistic Regression Accuracy with PCA (D=20): {acc_pca20:.4f}',
            f'Decision Tree Accuracy with PCA (D=5): {acc_dt_pca5:.4f}',
            f'Decision Tree Accuracy with PCA (D=20): {acc_dt_pca20:.4f}',
        )
    )
)

Logistic Regression Accuracy with PCA (D=5): 0.6800
Logistic Regression Accuracy with PCA (D=20): 0.8721
Decision Tree Accuracy with PCA (D=5): 0.6943
Decision Tree Accuracy with PCA (D=20): 0.8027


# Spambase Dataset

In [11]:
# Fetch Spambase (version 1) from OpenML using the pandas parser, then split
# the bunch into the feature table and the target cast to integers.
spambase = fetch_openml(name="Spambase", version=1, parser="pandas")
features_sb, labels_sb = spambase.data, spambase.target.astype(int)

# Report the number of samples and how many fall into each class.
print(f"Dataset shape: {len(labels_sb)}")
print("Label distribution:", labels_sb.value_counts())

Dataset shape: 4601
Label distribution: class
0    2788
1    1813
Name: count, dtype: int64


In [12]:
# Hold out 20% of Spambase for testing; the seed keeps the split reproducible.
(
    features_sb_train,
    features_sb_test,
    labels_sb_train,
    labels_sb_test,
) = train_test_split(
    features_sb,
    labels_sb,
    test_size=0.2,
    random_state=97,
)

print(f"Training set shape: {features_sb_train.shape}")
print(f"Test set shape: {features_sb_test.shape}")

Training set shape: (3680, 57)
Test set shape: (921, 57)


In [13]:
# Standardise the Spambase features using statistics from the training split
# only, then apply the same transform to the test split.
scaler_sb = StandardScaler()
spambase_features_train = scaler_sb.fit_transform(features_sb_train)
spambase_features_test = scaler_sb.transform(features_sb_test)

# Project onto the first 10 principal components (fitted on the training split).
# random_state is fixed so the projection is reproducible when a randomized
# SVD solver is selected.
pca_10 = PCA(n_components=10, random_state=42)
spambase_features_train_pca10 = pca_10.fit_transform(spambase_features_train)
spambase_features_test_pca10 = pca_10.transform(spambase_features_test)

# Total share of the variance retained by the 10 components.
print("Explained Variance Ratio:", sum(pca_10.explained_variance_ratio_))

Explained Variance Ratio: 0.38877770578183435


In [None]:
# L2-penalised logistic regression on the 10-component Spambase projection;
# fit() returns the estimator, so construction and fitting are chained.
log_reg_spam_pca10 = LogisticRegression(
    penalty="l2",
    solver="lbfgs",
    max_iter=1000,
).fit(spambase_features_train_pca10, labels_sb_train)

# Score the model on the held-out Spambase test split.
y_pred_spam_pca = log_reg_spam_pca10.predict(spambase_features_test_pca10)
acc_spam_pca = accuracy_score(y_true=labels_sb_test, y_pred=y_pred_spam_pca)
print(f'Logistic Regression Accuracy with PCA (D=10) on Spambase: {acc_spam_pca:.4f}')

Logistic Regression Accuracy with PCA (D=10) on Spambase: 0.8936


In [None]:
# Baseline accuracy to compare against. NOTE(review): presumably the logistic
# regression accuracy on the original (un-projected) features; it is not
# computed in the visible cells, so confirm the value before relying on it.
original_spam_acc = 0.9121
# A PCA model counts as "comparable" when it is at most this far below the baseline.
acc_tolerance = 0.02

# Sweep D = 1, 4, 7, ..., 49 and stop at the first D on this grid whose test
# accuracy is comparable to the baseline. Because of the step of 3, the
# reported D is the smallest *tested* value, not necessarily the smallest overall.
for d in range(1, 51, 3):
    # Fixed seed so the sweep is reproducible when a randomized SVD solver is used.
    pca_d = PCA(n_components=d, random_state=42)
    X_train_spam_pca_d = pca_d.fit_transform(spambase_features_train)
    X_test_spam_pca_d = pca_d.transform(spambase_features_test)

    log_reg_spam_pca_d = LogisticRegression(penalty="l2", solver="lbfgs", max_iter=1000)
    log_reg_spam_pca_d.fit(X_train_spam_pca_d, labels_sb_train)
    # Separate name so the D=10 predictions from the previous cell are not overwritten.
    y_pred_spam_pca_d = log_reg_spam_pca_d.predict(X_test_spam_pca_d)

    acc_pca_d = accuracy_score(labels_sb_test, y_pred_spam_pca_d)
    print(f'PCA (D={d}): Accuracy = {acc_pca_d:.4f}')

    # Only a lower bound is needed: a projection that beats the baseline is
    # certainly "comparable" and must also stop the search.
    if acc_pca_d >= original_spam_acc - acc_tolerance:
        print(f"Smallest D for comparable accuracy: {d}")
        break
else:
    # Runs only when the loop finished without hitting `break`.
    print("No tested D reached comparable accuracy")

PCA (D=1): Accuracy = 0.8284
PCA (D=4): Accuracy = 0.8730
PCA (D=7): Accuracy = 0.8708
PCA (D=10): Accuracy = 0.8936
Smallest D for comparable accuracy: 10
