# 1. Imports and dataset


In [433]:
from sklearn import datasets, metrics
import numpy as np

In [434]:
# Load the digits dataset and cut it, in dataset order, into a 70 % training
# part and a 30 % test part (features and the matching labels).
digits = datasets.load_digits()
num_split = int(0.7 * len(digits.data))
train_features, test_features = digits.data[:num_split], digits.data[num_split:]
train_labels, test_labels = digits.target[:num_split], digits.target[num_split:]

In [435]:
# Normalizing the train dataset to values between 0 and 1.
# The division builds a new array from the raw pixel values instead of
# dividing in place, so re-running this cell never rescales the data twice
# and digits.data itself stays untouched.
train_features = digits.data[:num_split] / 16

# 2. EM-algorithm to find a Gaussian NBC
- Assume conditional independence (covariance = variance for the attribute)
- Normalize the data to avoid multiplications with very small values in the likelihoods
- You can use the overall change in the cluster centers between two iterations as a stopping criterion: once the centers barely move, assume convergence
- Make sure dimensions of covariance matrix are correct, possible to only calculate the diagonal

In [444]:
class EM_:
    """Gaussian naive-Bayes mixture model fitted with the EM algorithm.

    Every component k has a prior probability, one mean per feature and one
    variance per feature (features are treated as conditionally independent,
    so only the diagonal of the covariance matrix is stored).

    Attributes:
        prior: array of shape (K,), mixing weights.
        means: array of shape (K, n_features), per-feature means.
        cov:   array of shape (K, n_features), per-feature variances.
        r:     array of shape (n_samples, K), responsibilities from the most
               recent E-step.
    """

    # Constant added to the subset variances when the model is initialised.
    INIT_VAR_OFFSET = 0.1
    # Constant added to every variance in the M-step so that no variance can
    # collapse to zero.
    VAR_SMOOTHING = 0.01

    def __init__(self, X, K):
        """Initialise the parameters from K consecutive slices of X.

        Args:
            X: samples, indexable as (n_samples, n_features).
            K: number of mixture components.
        """
        X = np.asarray(X, dtype=float)
        n_samples = len(X)
        n_features = len(X[0])

        # Initialize prior probs, means and covariances
        self.prior = np.ones(K) / K
        self.means = np.zeros([K, n_features])
        self.cov = np.ones([K, n_features]) * self.INIT_VAR_OFFSET
        self.r = np.zeros([n_samples, K])

        # Component k starts from the mean and variance of the k-th
        # consecutive slice of X.
        chunk = n_samples / K
        start = 0
        for k in range(K):
            stop = int((k + 1) * chunk)
            subset = X[start:stop]
            start = stop
            if len(subset) == 0:
                # More components than samples: keep the default parameters
                # instead of averaging over an empty slice.
                continue
            self.means[k] = subset.mean(axis=0)
            self.cov[k] += subset.var(axis=0)

    def fit(self, X, K, tol=0.001, max_iter=1000):
        """Alternate E- and M-steps until the means stop moving.

        Args:
            X: samples, shape (n_samples, n_features).
            K: number of mixture components.
            tol: stop once the norm of the change in the means between two
                iterations is no larger than this value.
            max_iter: upper bound on the number of iterations, so a run that
                never reaches `tol` still terminates.

        Returns:
            None. The fitted parameters are stored on the instance.
        """
        diff = np.inf
        nr_it = 0
        mean_last_step = self.means.copy()

        while diff > tol and nr_it < max_iter:
            nr_it += 1

            self.E_step_(X, K)
            self.M_step_(X, K)

            diff = np.linalg.norm(mean_last_step - self.means)
            mean_last_step = self.means.copy()
            print('Iteration: ' , nr_it, 'Diff: ', diff)

        return

    def _log_joint(self, X, K=None):
        """Return log(prior_k) + log N(x | mean_k, var_k) for every sample.

        Working with logarithms replaces the product of per-feature densities
        by a sum, which cannot underflow to 0 or overflow to inf the way the
        product does when there are many features.

        Args:
            X: samples, shape (n_samples, n_features).
            K: number of leading components to evaluate; all when None.

        Returns:
            Array of shape (n_samples, K).
        """
        X = np.asarray(X, dtype=float)
        n_components = len(self.prior) if K is None else K
        log_joint = np.empty([len(X), n_components])

        # A component whose prior is exactly 0 gets log-prior -inf, i.e. it
        # can never win; silence the numpy warning for that case.
        with np.errstate(divide='ignore'):
            log_prior = np.log(self.prior[:n_components])

        for k in range(n_components):
            var = self.cov[k]
            sq_dist = (X - self.means[k]) ** 2
            log_pdf = -0.5 * (np.log(2 * np.pi * var) + sq_dist / var)
            log_joint[:, k] = log_prior[k] + log_pdf.sum(axis=1)

        return log_joint

    # Expectation step
    def E_step_(self, X, K):
        """Compute the responsibilities self.r for the current parameters.

        Row i of self.r holds the posterior probability of each component
        for sample i; every row sums to 1.
        """
        if len(X) == 0:
            self.r = np.zeros([0, K])
            return

        log_joint = self._log_joint(X, K)
        # Subtracting the row maximum before exponentiating keeps the largest
        # term at exp(0) = 1, so the normaliser is never 0 or inf.
        log_joint -= log_joint.max(axis=1, keepdims=True)
        resp = np.exp(log_joint)
        resp /= resp.sum(axis=1, keepdims=True)
        self.r = resp

        return

    def M_step_(self, X, K):
        """Re-estimate priors, means and variances from self.r."""
        X = np.asarray(X, dtype=float)
        resp = self.r[:, :K]

        # Computing r_k: the total responsibility of each component
        r_k = resp.sum(axis=0)

        # Computing new prior probabilities
        self.prior = r_k / len(X)

        # Updating means and covariances
        for k in range(K):
            if not r_k[k] > 0:
                # No sample is assigned to this component: dividing by r_k
                # would give nan, so its previous parameters are kept.
                continue
            weights = resp[:, k]
            mean_k = (weights @ X) / r_k[k]
            self.means[k] = mean_k
            # Weighted variance around the new mean (equal to
            # E[x^2] - mean^2, but it cannot turn negative through rounding).
            self.cov[k] = (weights @ ((X - mean_k) ** 2)) / r_k[k] + self.VAR_SMOOTHING

        return

    def predict_(self, X):
        """Return, for every sample in X, the index of the most probable component.

        Args:
            X: samples, shape (n_samples, n_features).

        Returns:
            List with one component index per sample.
        """
        if len(X) == 0:
            return []

        # The logarithm is monotonic, so the argmax of the log joint equals
        # the argmax of prior * likelihood.
        return list(np.argmax(self._log_joint(X), axis=1))
            
        

In [445]:
# Build the mixture model with 10 components; the constructor derives the
# initial means and variances from consecutive slices of train_features.
EM = EM_(train_features, 10)

In [446]:
# Run the EM iterations on the training features; fit prints the iteration
# number and the change in the means after every iteration.
EM.fit(train_features,10)

Iteration:  1 Diff:  1.210593924688377
Iteration:  2 Diff:  1.7719440934318422
Iteration:  3 Diff:  0.9828110096261158
Iteration:  4 Diff:  0.6963874001363991
Iteration:  5 Diff:  0.6033863352751674
Iteration:  6 Diff:  0.595243587094797
Iteration:  7 Diff:  0.5057529789511146
Iteration:  8 Diff:  0.29200166890061213
Iteration:  9 Diff:  0.25497934411289463
Iteration:  10 Diff:  0.24553961456549103
Iteration:  11 Diff:  0.16175876647675494
Iteration:  12 Diff:  0.12538869202540928
Iteration:  13 Diff:  0.138604099854422
Iteration:  14 Diff:  0.11986529155183093
Iteration:  15 Diff:  0.08816107610681838
Iteration:  16 Diff:  0.06237092016726197
Iteration:  17 Diff:  0.0519778059873326
Iteration:  18 Diff:  0.03441388246594154
Iteration:  19 Diff:  0.03235733810680268
Iteration:  20 Diff:  0.01822314976005127
Iteration:  21 Diff:  0.012603391239107878
Iteration:  22 Diff:  0.01171935367481656
Iteration:  23 Diff:  0.009199813546660088
Iteration:  24 Diff:  0.005768195013525328
Iteration:

# 3. Clustering



In [447]:
# Assign every training image to the mixture component with the highest score.
predictions = EM.predict_(train_features)

# 4. k-means

In [448]:
from sklearn.cluster import KMeans

In [449]:
# k-means starts from random centroids; fixing the seed makes the cluster
# assignment (and the scores computed from it) reproducible across runs.
KM = KMeans(n_clusters=10, random_state=0)
clusters = KM.fit(train_features)  # fit returns the fitted estimator itself
predictions_KMeans = KM.predict(train_features)

# 5. Comparison

- Completeness score: all members of a given class are assigned to the same cluster.
- Homogeneity score: each cluster contains only members of a single class
- Adjusted mutual information score: how much knowing the cluster assignment tells us about the true class (and vice versa), corrected for chance

In [450]:
# Compare the k-means cluster assignment with the true training labels.
kmeans_confusion = metrics.confusion_matrix(train_labels, predictions_KMeans)
print('KMeans clustering\nConfusion matrix:\n', kmeans_confusion)
for caption, scorer in (
    ('Completeness score: ', metrics.completeness_score),
    ('Homogeneity score: ', metrics.homogeneity_score),
    ('Mutual information score: ', metrics.adjusted_mutual_info_score),
):
    print(caption, scorer(train_labels, predictions_KMeans))

KMeans clustering
Confusion matrix:
 [[  0   0   0   0   0   0   0 125   0   0]
 [  0   0  63   1  39   0   0   0   0  26]
 [  0   3  10   0   3   3   0   0   1 104]
 [  0 113   0   0   0   2   2   0  13   0]
 [109   0   2   0   6   7   0   0   0   0]
 [  1   2   0   1   0   0  93   0  29   0]
 [  0   0   2 124   0   0   0   1   0   0]
 [  0   0   0   0   2 123   0   0   0   0]
 [  0   2  70   1   4   1   4   0  38   2]
 [  0   3   2   0  15   6   2   0  97   0]]
Completeness score:  0.7564104368068433
Homogeneity score:  0.7481511041008959
Mutual information score:  0.7486554642198753


In [451]:
# Compare the EM cluster assignment with the true training labels.
em_confusion = metrics.confusion_matrix(train_labels, predictions)
print('EM algorithm \nConfusion matrix:\n', em_confusion)
for caption, scorer in (
    ('Completeness score: ', metrics.completeness_score),
    ('Homogeneity score: ', metrics.homogeneity_score),
    ('Mutual information score: ', metrics.adjusted_mutual_info_score),
):
    print(caption, scorer(train_labels, predictions))

EM algorithm 
Confusion matrix:
 [[  1   0   0 124   0   0   0   0   0   0]
 [ 41  60   0   0   0   0   0  27   0   1]
 [  6   0  19   0   0   0   0  99   0   0]
 [  0   0  32   0   0   0   2   2  94   0]
 [ 67   0   0   0   0  48   8   0   0   1]
 [  0   0   1   0   0   2   0   0  28  95]
 [  1   1   0   0 125   0   0   0   0   0]
 [  0   0   0   0   0  24 101   0   0   0]
 [  2  10 100   0   0   0   0   4   2   4]
 [  2   0   8   0   0  16   4   0  95   0]]
Completeness score:  0.7273059589167915
Homogeneity score:  0.71335151689085
Mutual information score:  0.7161737273016219
