In [11]:
from sklearn.cluster import KMeans
from collections import OrderedDict
from sklearn import datasets, metrics
import numpy as np
from sklearn.model_selection import train_test_split
import math
import matplotlib.pyplot as plt
import Handout_MNIST.MNIST as mn


In [12]:
# in this cell you will have to add a suitable intialisation for the EM-algorithm to find good "clusters"
# note that "cluster" is a bit misleading, as only the degree of belonging to a distribution is evaluated 
# for each sample, and not an absolute assignment to a cluster is made

class EMClusters:

    def __init__(self):
        self.__numOfClasses = 0
        self.__numOfAttributes = 0

    # Initialises for EM to work
    # should be called with a suitable smoothing factor blur, but the default does at least something.
    def initialise(self, data, num_classes, num_attributes, blur=0.1):
        self.__blur = blur
        self.__data = data
        self.__numOfAttributes = num_attributes
  
        self.__numOfClasses = num_classes

        self.__clusterMeansAndCovs = np.ones((self.__numOfClasses, self.__numOfAttributes, 2), dtype=float)
        self.__priors = np.ones(self.__numOfClasses)

        # start of your code
        ###
        #
        # Here, you should now fill in something more sensible, as the code above is only a placeholder solution...
        # You need to find initial clusters and set start values for the priors, means and (co)variances that would
        # end up in the following arrays:
        #
        # self.__priors              - is of dimension K x 1 and holds K values, K the number of clusters / classes
        # self.__clusterMeansAndCovs - is of dimensions K x I x 2 and holds for each combination of class k
        #                              and attribute j the mean in [k, j, 0] and the covariance (only for jj,
        #                              as we assume conditional independence given the class) in [k, j, 1]
        #
        ###
        self.__priors = np.random.rand(self.__numOfClasses)
        self.__clusterMeansAndCovs = np.random.rand(self.__numOfClasses, self.__numOfAttributes, 2)
        # end of your code

        
    # the actual EM algorithm, should be called with a suitable eps for a stop criterion
    # feel free to change (optimize) the implementation, but you do not have to do that
    def fit(self, eps=1.0):
        epsilon = eps

        print(self.__numOfAttributes)

        # print(self.__clusterMeansAndCovs[:,1,:])

        resp = np.zeros((len(self.__data), self.__numOfClasses))
        r_k = np.zeros(self.__numOfClasses)
        notDone = True
        while notDone:
            # print(self.__clusterMeansAndCovs)
            # E-step
            for i in range(len(self.__data)):
                probs = self.__priors.copy()
                for k in range(self.__numOfClasses):

                    for attr in range(self.__numOfAttributes):
                        probs[k] *= 1.0 / np.sqrt(
                            2 * math.pi * (self.__clusterMeansAndCovs[k, attr, 1] + self.__blur)) * math.exp(
                            -1.0 * math.pow((self.__clusterMeansAndCovs[k, attr, 0] - self.__data[i, attr]), 2) / (
                                        self.__clusterMeansAndCovs[k, attr, 1] + self.__blur))

                resp[i, :] = probs / sum(probs)
                # print(resp[i,:])
            
            # M-step
            newClusterMeansAndCovs = np.zeros((self.__numOfClasses, self.__numOfAttributes, 2))
            for k in range(self.__numOfClasses):
                r_k[k] = sum(resp[:, k])
                self.__priors[k] = 1 / len(self.__data) * r_k[k]

                for j in range(self.__numOfAttributes):
                    means = sum(resp[:, k] * self.__data[:, j]) / r_k[k]

                    newClusterMeansAndCovs[k, j, 0] = means
                    newClusterMeansAndCovs[k, j, 1] = sum(resp[:, k] * self.__data[:, j] * self.__data[:, j]) \
                                                      / r_k[k] - newClusterMeansAndCovs[k, j, 0] ** 2

            # print(newClusterMeansAndCovs[:, :, 1])

            err = np.linalg.norm(self.__clusterMeansAndCovs[:, :, 0] - newClusterMeansAndCovs[:, :, 0])

            self.__clusterMeansAndCovs = newClusterMeansAndCovs.copy()
            print("err = " + str(err))
            if err <= epsilon:
                notDone = False
                
                
        # produce "clusters", i.e, assign the samples to "their" gaussian
        clustered = 10 * np.ones(len(self.__data), dtype=int)
        for i in range(len(self.__data)):
            probs = self.__priors.copy()
            for k in range(self.__numOfClasses):

                for attr in range(self.__numOfAttributes):
                    probs[k] *= 1.0 / np.sqrt(2 * math.pi * (self.__clusterMeansAndCovs[k, attr, 1] + self.__blur)) \
                                * math.exp(
                        -1.0 * math.pow((self.__clusterMeansAndCovs[k, attr, 0] - self.__data[i, attr]), 2) \
                        / (self.__clusterMeansAndCovs[k, attr, 1] + self.__blur))

            sumProb = np.sum(probs)
            if (sumProb > 0.0):
                probs = probs / sumProb
                clustered[i] = np.argmax(probs)

        return clustered, self.__clusterMeansAndCovs

    # prediction uses the found gaussians to compute the likelihoods in a GNBC
    def predict(self, samples):
        predicted = [None for i in range(len(samples))]
        prob = [[self.__priors[cls] for cls in range(self.__numOfClasses)] for i in range(len(samples))]
        # print(prob)
        for i in range(len(samples)):
            for cls in range(self.__numOfClasses):
                for attr in range(self.__numOfAttributes):
                    (mean, var) = (
                    self.__clusterMeansAndCovs[cls, attr, 0], self.__clusterMeansAndCovs[cls, attr, 1] + self.__blur)
                    # print( mean, var)
                    prob[i][cls] *= 1.0 / np.sqrt(2 * math.pi * var) * math.exp(
                        -1.0 * math.pow((mean - samples[i, attr]), 2) / var)
                # print(i, cls, prob[i][cls])

            sumProb = np.sum(prob[i])
            if (sumProb > 0.0):
                prob[i] = prob[i] / sumProb
                predicted[i] = np.argmax(prob[i])
            else:
                predicted[i] = 10

            # print(prob[i])

        return predicted

In [13]:
# use this to rearrange the outcome of a clustering to match the order of the classes in the training data
# note that this is only something to make it easier for you to inspect the results, but probably not a method
# you want to integrate in a final system

def correctClusters( confM, classes) :
    clusterMapping = -1 * np.ones_like(classes)
    for k in classes:
        temp = list(confM[:, k])
        #print(temp)
        notDone = True
        while notDone:
            amax = np.argmax(temp)
            if (confM[amax, k] == np.max(confM[amax, :])):
                clusterMapping[k] = amax
                notDone = False
            elif (sum(temp) == 0):
                notDone = False
            else:
                temp[amax] = 0

    for k in classes:
        if (clusterMapping[k] == -1):
            for j in classes:
                if j not in clusterMapping:
                    clusterMapping[k] = j

    return clusterMapping


In [14]:
# loading the digits data and normalising to values between 0 and 1

digits = datasets.load_digits()


num_examples = len(digits.data)
num_split = int(0.7 * num_examples)
train_features = 1/16. * digits.data[:num_split]
train_labels = digits.target[:num_split]
test_features = 1/16. *digits.data[num_split:]
test_labels = digits.target[num_split:]

num_attributes = 64

num_classes = 10

# alternatively, loading the MNIST_Light data, which is normalised in the get_data() method already
# OBS: this will take a while to run!

#mnist = mn.MNISTData('Handout_MNIST/MNIST_Light/*/*.png')
#train_features, test_features, train_labels, test_labels = mnist.get_data()
#num_attributes = 400
#num_classes = 10

In [15]:
# some dummy call to the EM implementation, here you will have to change things (add parameters, for example)

emClusters = EMClusters()
emClusters.initialise(train_features, num_classes, num_attributes)
clustered, clusterMeansVars = emClusters.fit()

completeness_score = metrics.completeness_score(train_labels, clustered)
homogeneity_score = metrics.homogeneity_score(train_labels, clustered)
mutual_info_score = metrics.adjusted_mutual_info_score(train_labels, clustered)
print( "Completeness, homogeneity, adj mutual info EM vs labels", completeness_score, homogeneity_score, mutual_info_score)


# for index, center in enumerate(clusterMeansVars[:,:,0]):
#     img = center.reshape(8, 8)
#     plt.figure()
#     plt.axis('off')
#     plt.imshow(img, cmap=plt.cm.gray_r)
#     plt.show()
    


64
err = 10.76402577683994
err = 1.727904974706115
err = 1.07209799629523
err = 0.6859939745014375
Completeness, homogeneity, adj mutual info EM vs labels 0.6650584734269263 0.6320846243363692 0.6429076359534875


The cells below contain a breakdown for what was meant to be executed in a single cell in the handout skeleton. The single cell contained the following description:

"In this cell you should add the k-Means clustering to be able to compare to what you got with EM
If you apply "correctClusters" from above (works with any confusion matrix), you can even test the results 
against your or SKLearn's results in a classification attempt."

Define a utility function for making use of the provided `correctClusters` function output.

In [16]:
def map_correct_labels(predicted_labels, correct_clusters):
    kmeans_predicted_corrected = []
    for pred in predicted_labels:
        kmeans_predicted_corrected.append(correct_clusters[pred])
    
    return kmeans_predicted_corrected

Define the set of classes.

In [34]:
classes = [i for i in range(num_classes)]

Do K-Means clustering.

In [18]:
clustering = KMeans(n_clusters=num_classes)
clustering.fit(train_features, train_labels)
kmeans_predicted = clustering.predict(train_features)
kmeans_confusion_matrix = metrics.confusion_matrix(train_labels, kmeans_predicted)
print('CONFUSION MATRIX BEFORE CORRECTION\n', kmeans_confusion_matrix)

CONFUSION MATRIX BEFORE CORRECTION
 [[  0   0   0   0   0   0   0   0   0 125]
 [ 26   0  63   0   0   0   0  39   1   0]
 [107   0   9   1   3   3   0   1   0   0]
 [  0   2   0  13   2 113   0   0   0   0]
 [  0   0   2   0   7   0 109   6   0   0]
 [  0  94   0  28   0   2   1   0   1   0]
 [  0   0   2   0   0   0   0   0 124   1]
 [  0   1   0   0 123   0   0   1   0   0]
 [  2   8  61  44   1   1   0   4   1   0]
 [  0   3   0  97   7   3   0  15   0   0]]


Print the metrics of K-Means.

In [21]:
kmeans_correct_clusters = correctClusters(metrics.confusion_matrix(train_labels, kmeans_predicted), classes)
kmeans_predicted_corrected = map_correct_labels(kmeans_predicted, kmeans_correct_clusters)

kmeans_confusion_matrix_corrected = metrics.confusion_matrix(train_labels, kmeans_predicted_corrected)

print('K-MEANS CLASSIFICATION REPORT\n', metrics.classification_report(train_labels, kmeans_predicted_corrected))
print('CONFUSION MATRIX CORRECT KMEANS\n', kmeans_confusion_matrix_corrected)

K-MEANS CLASSIFICATION REPORT
               precision    recall  f1-score   support

           0       0.99      1.00      1.00       125
           1       0.46      0.49      0.47       129
           2       0.79      0.86      0.83       124
           3       0.93      0.87      0.90       130
           4       0.99      0.88      0.93       124
           5       0.87      0.75      0.80       126
           6       0.98      0.98      0.98       127
           7       0.86      0.98      0.92       125
           8       0.06      0.03      0.04       122
           9       0.53      0.78      0.63       125

    accuracy                           0.76      1257
   macro avg       0.75      0.76      0.75      1257
weighted avg       0.75      0.76      0.75      1257

CONFUSION MATRIX CORRECT KMEANS
 [[125   0   0   0   0   0   0   0   0   0]
 [  0  63  26   0   0   0   1   0  39   0]
 [  0   9 107   3   0   0   0   3   1   1]
 [  0   0   0 113   0   2   0   2   0  13]
 [  0

In [32]:
# # print('EM CLASSIFICATION REPORT\n', metrics.classification_report(test_labels, em_predicted))
# em_confusion_matrix = metrics.confusion_matrix(train_labels, clustered)
# em_correct_clusters = correctClusters(metrics.confusion_matrix(train_labels, clustered), classes)
# clusters_corrected = map_correct_labels(clustered, em_correct_clusters)
# kmeans_confusion_matrix_corrected = metrics.confusion_matrix(train_labels, clusters_corrected)
# print('K-MEANS CLASSIFICATION REPORT\n', metrics.classification_report(train_labels, clusters_corrected))
# print('CONFUSION MATRIX CORRECT EM\n', kmeans_confusion_matrix_corrected)

In [33]:
# In this cell you should add the k-Means clustering to be able to compare to what you got with EM
# If you apply "correctClusters" from above (works with any confusion matrix), you can even test the results 
# against your or SKLearn's results in a classification attempt.

# clustering = KMeans(n_clusters=num_classes)
# clustering.fit(train_features, train_labels)
# kmeans_predicted = clustering.predict(test_features)
# # print('K-MEANS CLASSIFICATION REPORT\n', metrics.classification_report(test_labels, kmeans_predicted))
# kmeans_confusion_matrix = metrics.confusion_matrix(test_labels, kmeans_predicted)
# kmeans_correct_clusters = correctClusters(metrics.confusion_matrix(test_labels, kmeans_predicted), classes)
# kmeans_confusion_matrix_correct = swap_rows(kmeans_confusion_matrix, kmeans_correct_clusters)
# print('CONFUSION MATRIX CORRECT KMEANS\n', kmeans_confusion_matrix_correct)

emClusters = EMClusters()
emClusters.initialise(train_features, num_classes, num_attributes, blur=0.2)
clustered, clusterMeansVars = emClusters.fit()
# print('EM CLASSIFICATION REPORT\n', metrics.classification_report(test_labels, em_predicted))
em_confusion_matrix = metrics.confusion_matrix(train_labels, clustered)

em_correct_clusters = correctClusters(metrics.confusion_matrix(train_labels, clustered), classes)
em_corrected_labels = map_correct_labels(clustered, em_correct_clusters)
em_confusion_matrix_corrected = metrics.confusion_matrix(train_labels, em_corrected_labels)
# em_confusion_matrix_correct = swap_rows(em_confusion_matrix, em_correct_clusters)
print('EM CLASSIFICATION REPORT\n', metrics.classification_report(train_labels, em_corrected_labels))
print('CONFUSION MATRIX CORRECT EM\n', em_confusion_matrix_corrected)

64
err = 10.751480292756897
err = 1.1139926726917484
err = 1.5287083492756972
err = 1.0844638758776464
err = 0.842535303247776
EM CLASSIFICATION REPORT
               precision    recall  f1-score   support

           0       1.00      0.98      0.99       125
           1       0.78      0.33      0.46       129
           2       0.60      0.86      0.71       124
           3       0.39      0.93      0.55       130
           4       0.00      0.00      0.00       124
           5       0.90      0.67      0.77       126
           6       0.42      0.98      0.58       127
           7       0.70      0.90      0.78       125
           8       0.00      0.00      0.00       122
           9       0.39      0.10      0.15       125

    accuracy                           0.58      1257
   macro avg       0.52      0.57      0.50      1257
weighted avg       0.52      0.58      0.50      1257

CONFUSION MATRIX CORRECT EM
 [[123   0   0   0   2   0   0   0   0   0]
 [  0  42  35   