In [2]:
import numpy as np
import keras
import random
from keras.datasets import mnist
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load MNIST as (train, test) pairs of image arrays and label arrays.
(train_images, train_labels), (test_images, test_labels) = keras.datasets.mnist.load_data()

# Flatten every image into a 784-element row so it can be fed to PCA / LR.
train_images = np.reshape(train_images, (-1, 784))
test_images = np.reshape(test_images, (-1, 784))

# Convert to float32 and divide by 255.
train_images = train_images.astype('float32') / 255
test_images = test_images.astype('float32') / 255

# Draw 20,000 distinct training rows without replacement.
# A dedicated, seeded generator makes the subsample (and therefore every
# accuracy reported further down) identical on each Restart & Run All, and it
# does not disturb the global `random` state.
random_sample_indices = random.Random(42).sample(range(train_images.shape[0]), 20000)
train_images_25 = train_images[random_sample_indices]
train_labels_25 = train_labels[random_sample_indices]

# Hold out 10% of the subsample for validation (test_size=0.1, i.e. a 90/10
# split). NOTE: the `_80` / `_10` suffixes in the names below do not match that
# ratio; the names are left unchanged because later cells reference them.
(
    train_images_final_80,
    validation_images_final_10,
    train_labels_final_80,
    validation_labels_final_10,
) = train_test_split(train_images_25, train_labels_25, test_size=0.1, random_state=42)

print("Final train dataset size: ", train_images_final_80.shape)
print("Final validation dataset size: ", validation_images_final_10.shape)


Final train dataset size:  (18000, 784)
Final validation dataset size:  (2000, 784)


In [3]:
from sklearn.decomposition import PCA

# Project MNIST onto its first 5 principal components. The projection is
# fitted on the training split only and then applied unchanged to the test set.
# random_state pins the result whenever PCA selects a randomized SVD solver.
pca = PCA(n_components=5, random_state=42)
projection_train = pca.fit_transform(train_images_final_80)
projection_test = pca.transform(test_images)

print(projection_train.shape)

# `saga` is a stochastic solver: without a fixed random_state the fitted
# coefficients, and hence the accuracy printed below, vary from run to run.
model = LogisticRegression(solver='saga', multi_class='multinomial', random_state=42)
model.fit(projection_train, train_labels_final_80)

# Mean accuracy on the held-out MNIST test set.
accuracy = model.score(projection_test, test_labels)
print("LR Accuracy MNIST with D=5:", accuracy)  # recorded (unseeded) run: 0.6874

(18000, 5)
LR Accuracy MNIST with D=5: 0.6874


In [4]:
# Same pipeline as the D=5 experiment, now keeping 20 principal components.
# The projection is fitted on the training split only; random_state pins the
# result whenever PCA selects a randomized SVD solver.
pca = PCA(n_components=20, random_state=42)
projection_train = pca.fit_transform(train_images_final_80)
projection_test = pca.transform(test_images)

print(projection_train.shape)

# `saga` is a stochastic solver: fix its seed so the reported accuracy is
# reproducible.
model = LogisticRegression(solver='saga', multi_class='multinomial', random_state=42)
model.fit(projection_train, train_labels_final_80)

# Mean accuracy on the held-out MNIST test set.
accuracy = model.score(projection_test, test_labels)
# The recorded (unseeded) run printed 0.8784; the earlier "66%" note was stale.
print("LR Accuracy MNIST with D=20:", accuracy)

(18000, 20)
LR Accuracy MNIST with D=20: 0.8784


Spambase

In [5]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Read the Spambase table; the file is parsed without a header row.
dataset_path = 'spambase.data'
df = pd.read_csv(dataset_path, header=None)

# Every column but the last is a feature; the last column is used as the label.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
spambase_train_norm = scaler.transform(X_train)
spambase_test_norm = scaler.transform(X_test)

print(spambase_test_norm.shape)

(921, 57)


In [6]:
# Project the standardized Spambase features onto 5 principal components.
# The projection is fitted on the training split only and reused for the test
# split; random_state pins the result whenever a randomized SVD solver is used.
pca = PCA(n_components=5, random_state=42)
projection_train = pca.fit_transform(spambase_train_norm)
projection_test = pca.transform(spambase_test_norm)

print(projection_train.shape)

# `saga` is a stochastic solver: fix its seed so the reported accuracy is
# reproducible across runs.
model = LogisticRegression(
    solver='saga',
    multi_class='multinomial',
    max_iter=200,
    random_state=42,
)
model.fit(projection_train, y_train)

# Mean accuracy on the held-out Spambase test split.
accuracy_spambase_pca = model.score(projection_test, y_test)
print("LR Accuracy Spambase with D=5:", accuracy_spambase_pca)

(3680, 5)
LR Accuracy Spambase with D=5: 0.8773072747014115


In [7]:
# Baseline: logistic regression (default settings) on all standardized features.
model = LogisticRegression()
model.fit(spambase_train_norm, y_train)

accuracy_spambase = model.score(spambase_test_norm, y_test)
print("LR Accuracy Spambase with logistic reg:", accuracy_spambase)

# Search D = 39 .. 1 for the PCA dimension whose test accuracy is closest to
# the baseline accuracy above.
smallest_d = None          # set on the first iteration (any finite delta <= inf)
min_delta = float("inf")

for d in reversed(range(1, 40)):
    # Seeded so that the selected D does not change between runs.
    pca = PCA(n_components=d, random_state=42)
    projection_train = pca.fit_transform(spambase_train_norm)
    projection_test = pca.transform(spambase_test_norm)

    model = LogisticRegression(
        solver='saga',
        multi_class='multinomial',
        max_iter=1500,
        random_state=42,
    )
    model.fit(projection_train, y_train)

    accuracy_d = model.score(projection_test, y_test)
    delta = abs(accuracy_spambase - accuracy_d)
    # D is visited in descending order, so `<=` lets a later (smaller) D replace
    # an earlier one on a tie. A strict `<` would keep the LARGEST tied D, which
    # contradicts the "Smallest D" that is reported.
    if delta <= min_delta:
        min_delta = delta
        smallest_d = d

print("Smallest D: ", smallest_d)

LR Accuracy Spambase with logistic reg: 0.9196525515743756




Smallest D:  26
