In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def safelog(x):
    """Return the natural logarithm of ``x`` after adding a tiny positive offset.

    The offset (1e-100) keeps an input of exactly zero from producing ``-inf``.
    ``x`` may be a scalar or a NumPy array; the result has the shape that
    ``np.log`` gives for ``x + 1e-100``.
    """
    shifted = x + 1e-100
    return np.log(shifted)

In [2]:
# Load the feature matrix and the class labels from their CSV files.
# Labels are parsed as floats by genfromtxt and then cast to integers.
images_data_set = np.genfromtxt(fname="hw02_images.csv", delimiter=",")
labels_data_set = np.genfromtxt(fname="hw02_labels.csv", delimiter=",").astype(int)

In [3]:
# Split by position: rows 0-29999 form the training set,
# rows 30000-34999 form the test set (features and labels alike).
x_training, x_test = images_data_set[:30000, :], images_data_set[30000:35000, :]
y_training, y_test = labels_data_set[:30000], labels_data_set[30000:35000]

In [4]:
# Number of classes, taken as the largest label value in the training set.
# The cells below address classes as labels 1..K.
K = y_training.max()

In [5]:
# Per-class feature means: row c holds the mean vector of the training
# samples whose label is c + 1.
sample_means = np.array(
    [x_training[y_training == label].mean(axis=0) for label in range(1, K + 1)]
)

In [6]:
# Show the per-class feature means (one row per class).
print(sample_means)

[[254.99866667 254.98416667 254.85616667 ... 254.679      254.87816667
  254.95933333]
 [254.99733333 254.99733333 254.9965     ... 254.96883333 254.99216667
  254.98866667]
 [254.99933333 254.99933333 254.99233333 ... 251.52483333 254.4725
  254.97483333]
 [254.99666667 254.98983333 254.91416667 ... 252.39516667 254.44166667
  254.93666667]
 [254.999      254.98433333 254.93783333 ... 250.673      253.23333333
  254.79083333]]


In [7]:
# Per-class feature standard deviations: row c holds the deviation vector of
# the training samples whose label is c + 1 (np.std default, i.e. ddof=0).
sample_deviations = np.array(
    [x_training[y_training == label].std(axis=0) for label in range(1, K + 1)]
)

In [8]:
# Show the per-class feature standard deviations (one row per class).
print(sample_deviations)

[[ 0.09127736  0.25609108  1.31090756 ...  5.29826629  3.9117332
   1.93959091]
 [ 0.2065419   0.2065419   0.2163818  ...  1.04076669  0.47057267
   0.70062226]
 [ 0.05163547  0.04081939  0.16002465 ... 18.43665868  6.7881694
   1.1061344 ]
 [ 0.18436076  0.21617116  1.81046936 ... 15.67799977  6.34549162
   1.79971911]
 [ 0.04471018  0.64582342  3.03248555 ... 23.62576428 13.9167006
   4.4727787 ]]


In [9]:
# Prior probability of each class: the fraction of training labels equal to
# that class label (entry c corresponds to label c + 1).
class_priors = np.array(
    [(y_training == label).mean() for label in range(1, K + 1)]
)

In [10]:
# Show the estimated prior probability of each class.
print(class_priors)

[0.2 0.2 0.2 0.2 0.2]


In [11]:
# score value calculations
def score(x):
    """Compute the discriminant score of every sample for every class.

    For sample ``x_i`` and class ``c`` the score is::

        -1/2 * sum_d [((x_id - mean_cd) / std_cd)**2 + 2 * log(std_cd)] + log(prior_c)

    The per-class statistics are read from the module-level ``sample_means``,
    ``sample_deviations`` and ``class_priors`` arrays, and the number of
    classes from ``K``. Logarithms go through ``safelog``.

    Parameters
    ----------
    x : np.ndarray
        Two-dimensional array with one sample per row and one feature per
        column; the column count must match the per-class statistics.

    Returns
    -------
    np.ndarray
        Array of shape ``(n_samples, K)``; column ``c`` holds the score for
        class label ``c + 1``. An input with zero rows yields shape ``(0, K)``.

    Raises
    ------
    ValueError
        If ``x`` is not two-dimensional.
    """
    if x.ndim != 2:
        raise ValueError(
            "score expects a 2-D array of shape (n_samples, n_features), "
            "got %d dimension(s)" % x.ndim
        )

    scores = np.empty((x.shape[0], K))
    for c in range(K):
        deviations = sample_deviations[c]
        # Broadcasting applies the class statistics to all samples at once, so
        # the class-only terms are evaluated once per class rather than once
        # per sample.
        # NOTE(review): safelog only guards the log term; a feature whose
        # deviation is exactly zero still divides by zero here (inf/nan).
        standardized = (x - sample_means[c]) / deviations
        log_likelihood = -0.5 * np.sum(
            np.square(standardized) + 2 * safelog(deviations), axis=1
        )
        scores[:, c] = log_likelihood + safelog(class_priors[c])
    return scores


In [12]:
# Score the training samples, pick the best-scoring class for each one
# (column index + 1 gives the 1-based label), and tabulate predictions (rows)
# against true labels (columns).
y_score_training = score(x_training)
y_pred_training = y_score_training.argmax(axis=1) + 1
confusion_matrix_training = pd.crosstab(
    index=y_pred_training,
    columns=y_training,
    rownames=['y_pred'],
    colnames=['y_truth'],
)

In [13]:
# Show the training-set confusion matrix (rows: predicted label, columns: true label).
print(confusion_matrix_training)

y_truth     1     2     3     4     5
y_pred                               
1        3685    49     4   679     6
2        1430  5667  1140  1380   532
3         508   208  4670  2948   893
4         234    60   123   687   180
5         143    16    63   306  4389


In [14]:
# Score the held-out test samples, pick the best-scoring class for each one
# (column index + 1 gives the 1-based label), and tabulate predictions (rows)
# against true labels (columns).
y_score_test = score(x_test)
y_pred_test = y_score_test.argmax(axis=1) + 1
confusion_matrix_test = pd.crosstab(
    index=y_pred_test,
    columns=y_test,
    rownames=['y_pred'],
    colnames=['y_truth'],
)

In [15]:
# Show the test-set confusion matrix (rows: predicted label, columns: true label).
print(confusion_matrix_test)

y_truth    1    2    3    4    5
y_pred                          
1        597    6    0  114    1
2        237  955  188  267   81
3         92   25  785  462  167
4         34   11   16  109   29
5         40    3   11   48  722
