In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler # Preprocessing recommended by: https://www.geeksforgeeks.org/data-analysis/principal-component-analysis-with-python/
from sklearn.decomposition import PCA
import umap
from sklearn.manifold import TSNE
from sklearn.metrics import confusion_matrix
from NM_classifier import find_nearest_mean
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns

In [None]:
def _load_csv(path):
    """Read a comma-separated numeric file into a NumPy array."""
    return np.genfromtxt(path, delimiter=",")


# Inputs and targets of the training and test splits. The `*_out` arrays are
# used as class labels by the cells below.
train_in = _load_csv("train_in.csv")
train_out = _load_csv("train_out.csv")
test_in = _load_csv("test_in.csv")
test_out = _load_csv("test_out.csv")

In [None]:
#Part 1 Problem 1

def dist(x, y):
    """Return the norm of ``x - y`` as computed by ``np.linalg.norm``.

    With the default settings used here this is the Euclidean length of the
    difference when ``x`` and ``y`` are 1-D vectors.
    """
    return np.linalg.norm(x - y)

# One "cloud" per label value 0-9: the training rows whose label equals it.
cloud = [train_in[np.nonzero(train_out == label)] for label in range(10)]

# The center of a cloud is the feature-wise mean of its rows.
center = [np.mean(points, axis=0) for points in cloud]

# Written to disk; the classification cells reload the centers from this file.
np.savetxt("cloud_centers.csv", center, delimiter=",")

# Pairwise distances between the ten centers (row k, column l = dist(k, l)).
distance = np.array([[dist(first, second) for second in center] for first in center])

print(distance)

In [None]:
#Part 1 Problem 2 PCA

# Preprocessing recommended by: https://www.geeksforgeeks.org/data-analysis/principal-component-analysis-with-python/
# Based on example in: https://scikit-learn.org/stable/auto_examples/decomposition/plot_pca_iris.html#sphx-glr-auto-examples-decomposition-plot-pca-iris-py

# Standardize the features before PCA. The result gets its own name:
# rebinding `train_in` here would silently hand standardized inputs to every
# later cell that uses `train_in`, including the nearest-mean cells whose
# centers were computed (and saved to disk) from the unscaled data.
sc = StandardScaler()
train_in_scaled = sc.fit_transform(train_in)

X_reduced = PCA(n_components=2).fit_transform(train_in_scaled)

fig, ax = plt.subplots()

# `train_out` comes from np.genfromtxt, so it is already a NumPy array and is
# passed as-is (ndarrays have no `.to_numpy()` method). No `label=` is given:
# the legend below is built from the scatter's own legend elements.
PCA_reduced = ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=train_out, alpha=0.6, cmap="Paired")

legend = ax.legend(*PCA_reduced.legend_elements(num=10),
                    loc="upper right", title="Number")
ax.add_artist(legend)

ax.set_title("PCA")

plt.show()

In [None]:
#Part 1 Problem 2 U-MAP

# UMAP is stochastic. A fixed seed makes the embedding (and therefore the
# figure) identical on every Restart & Run All; per the umap-learn docs this
# comes at the cost of UMAP's multi-threading.
reducer = umap.UMAP(random_state=42)

embedding = reducer.fit_transform(train_in)

fig, ax = plt.subplots()
scatter = ax.scatter(embedding[:, 0],
                     embedding[:, 1],
                     c = train_out,
                     cmap = "Paired",
                     s = 5)
legend = ax.legend(*scatter.legend_elements(), title="Classes")
ax.add_artist(legend)
ax.set_title("U-MAP")
plt.show()

In [None]:
#Part 1 Problem 2 T-SNE

# t-SNE with a random initialisation is stochastic; the fixed seed keeps the
# embedding reproducible across runs.
train_in_embedded = TSNE(n_components=2, learning_rate='auto',
                  init='random', perplexity=10,
                  random_state=42).fit_transform(train_in)

fig, ax = plt.subplots()

# Own handle name so the PCA cell's `PCA_reduced` scatter is not overwritten.
# No `label=` is given: the legend is built from the scatter's legend elements.
tsne_scatter = ax.scatter(train_in_embedded[:, 0], train_in_embedded[:, 1], c=train_out, alpha=0.6, cmap="Paired")

legend = ax.legend(*tsne_scatter.legend_elements(num=10),
                    loc="upper right", title="Number")
ax.add_artist(legend)

ax.set_title("T-SNE")

plt.show()

In [None]:
#Part 1 Problem 3

# Same computation as dist, but against every center at once.
def dist_center(x, c):
    """Return the index of the row of ``c`` that is closest to ``x``.

    Parameters
    ----------
    x : array_like, shape (n_features,)
        A single sample.
    c : array_like, shape (n_centers, n_features)
        One center per row. Any number of centers is accepted; a single 1-D
        center is treated as one row.

    Returns
    -------
    numpy integer scalar
        Row index of the nearest center (Euclidean distance). On ties the
        lowest index is returned. Callers may use ``.item()`` to get a plain
        ``int``.
    """
    # Broadcasting subtracts every center from x, so no tiling to a fixed
    # number of rows is needed.
    offsets = np.asarray(x) - np.atleast_2d(c)
    distances = np.linalg.norm(offsets, axis=1)
    # argmin returns the first index of the minimum, matching the previous
    # "first position where the distance equals the minimum" behaviour.
    return np.argmin(distances)

def find_nearest_mean(X, c):
    """Classify every sample in ``X`` by its nearest center.

    Iterates over ``X`` and, for each sample, asks ``dist_center`` for the
    index of the closest row of ``c``.

    Returns a list of plain Python ints, one per sample, in input order.

    NOTE: this definition shadows the ``find_nearest_mean`` imported from
    ``NM_classifier`` at the top of the notebook.
    """
    return [dist_center(sample, c).item() for sample in X]

centers = np.genfromtxt("cloud_centers.csv", delimiter=",")

# NOTE(review): assumes `train_in` / `test_in` are on the same scale as the
# data the saved centers were computed from - verify that no earlier cell has
# rescaled them.

# Training split: predicted class per sample, then the share of matches.
train_nearest_mean = find_nearest_mean(train_in, centers)
train_correct = np.flatnonzero(np.asarray(train_nearest_mean) == train_out)
train_percentage = 100 * len(train_correct) / len(train_in)
print(f"{train_percentage:.2f}% from the train set is correctly classified using the nearest mean method.")

# Test split: same procedure with the same centers.
test_nearest_mean = find_nearest_mean(test_in, centers)
test_correct = np.flatnonzero(np.asarray(test_nearest_mean) == test_out)
test_percentage = 100 * len(test_correct) / len(test_in)
print(f"{test_percentage:.2f}% from the test set is correctly classified using the nearest mean method.")

In [None]:
#Part 1 Problem 4
# Based on example given in: https://medium.com/@dtuk81/confusion-matrix-visualization-fc31e3f30fea

centers = np.genfromtxt("cloud_centers.csv", delimiter=",")

# NOTE(review): assumes `train_in` and `test_in` are on the same scale (and on
# the scale the saved centers were computed from) - verify that no earlier
# cell has rescaled only one of them.

# We only fit the KNN classifier with the training data.
neigh = KNeighborsClassifier()
neigh.fit(train_in, train_out)

# predict() takes a whole matrix of samples, so each split is classified in a
# single call instead of one predict() call per row. The predictions are the
# same; they are kept as lists, as before.
KNN_train_classification = list(neigh.predict(train_in))
KNN_test_classification = list(neigh.predict(test_in))

nearest_mean_train_classification = find_nearest_mean(train_in, centers)
nearest_mean_test_classification = find_nearest_mean(test_in, centers)

def plot_cf_matrix(ground_truth, prediction, title):
    """Draw a row-normalized confusion matrix as an annotated heatmap.

    Every row (true label) is divided by its total, so a cell shows the
    fraction of samples of that true class that received the column's
    predicted label. The figure is created but not shown; the caller decides
    when to call ``plt.show()``.

    Parameters
    ----------
    ground_truth : array_like
        True labels.
    prediction : array_like
        Predicted labels, in the same order as ``ground_truth``.
    title : str
        Title placed above the heatmap.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the heatmap was drawn on.
    """
    # Ground truth values go first!
    # normalize="true" divides each row by its total and yields 0 (instead of
    # the NaN a manual 0/0 division produces) for a class that has no
    # ground-truth samples.
    cf_matrix = confusion_matrix(ground_truth, prediction, normalize="true")

    _, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(cf_matrix, annot=True, fmt='.1%', linewidths=0.5, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")
    return ax

# One confusion matrix per classifier (NM, KNN) and data split (train, test).
# Titles are shown on the figures, hence the spelling fix ("Confusion").
plot_cf_matrix(train_out, nearest_mean_train_classification, "Confusion Matrix for NM Classification on Training Data")
plot_cf_matrix(train_out, KNN_train_classification, "Confusion Matrix for KNN Classification on Training Data")
plot_cf_matrix(test_out, nearest_mean_test_classification, "Confusion Matrix for NM Classification on Test Data")
plot_cf_matrix(test_out, KNN_test_classification, "Confusion Matrix for KNN Classification on Test Data")

plt.show()


In [None]:
#Part 2

