#### Assignment 3
###### Federico Matteoni

The task for this assignment is to implement a Restricted Boltzmann Machine (RBM) from scratch.
The implementation is given in the following block of code.

In [None]:
import numpy as np


def logistic(x):
    """Return the element-wise logistic sigmoid ``1 / (1 + exp(-x))`` of ``x``.

    The value is evaluated as ``exp(-log(1 + exp(-x)))`` through
    ``np.logaddexp``. The naive formula overflows inside ``np.exp`` for large
    negative inputs and emits a RuntimeWarning; this form saturates cleanly
    to 0.0 and 1.0 instead.

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        A NumPy float for scalar input, otherwise an ndarray with the same
        shape as ``x``; every value lies in [0, 1].
    """
    x = np.asarray(x, dtype=float)
    # logaddexp(0, -x) == log(1 + exp(-x)) computed without overflow.
    return np.exp(-np.logaddexp(0.0, -x))
  

class RBM:
    """Restricted Boltzmann Machine with binary stochastic units.

    The model holds a weight matrix between visible and hidden units plus one
    bias per unit, and is trained with one step of contrastive divergence
    (CD-1) on random mini-batches.
    """

    def __init__(self, hidden_units, visible_units):
        """Create an RBM with random weights and zero biases.

        Args:
            hidden_units: Number of hidden units.
            visible_units: Number of visible units (input dimensionality).
        """
        self.nh = hidden_units  # number of hidden units
        self.nv = visible_units  # number of visible units
        # Weights have shape (nv, nh) and are drawn uniformly from [-1/nv, 1/nv).
        self.weights = np.random.uniform(-1 / self.nv, 1 / self.nv, (self.nv, self.nh))
        self.bias_h = np.zeros(self.nh)  # one bias per hidden unit
        self.bias_v = np.zeros(self.nv)  # one bias per visible unit
        print("Built a RBM with " + str(self.nv) + " visible units and " + str(self.nh) + " hidden units")

    def train(self, Xtr, epochs=100, learning_rate=0.1, batch_size=6000):
        """Fit the RBM with the CD-1 algorithm.

        At every epoch a random mini-batch is drawn (with replacement) from
        ``Xtr``, one wake/dream step is performed and the parameters are
        updated in place. The per-sample squared reconstruction error of the
        batch is printed on a single, continuously overwritten line.

        Args:
            Xtr: 2-D NumPy array of shape (samples, visible_units).
                NOTE: the visible units are sampled as binary states, so the
                data is presumably expected to lie in [0, 1].
            epochs: Number of mini-batch updates to perform.
            learning_rate: Step size applied to every parameter update.
            batch_size: Number of samples drawn per epoch.
        """
        n = batch_size

        print("Training on " + str(n) + " random elements for " + str(epochs) + " epochs")
        for _ in range(epochs):
            # Clamp data: random batch, sampled with replacement.
            idx = np.random.randint(0, Xtr.shape[0], size=n)
            # Work in floating point: integer inputs such as uint8 images
            # would wrap around in the subtraction used for the error below.
            cXtr = Xtr[idx, :].astype(float)

            # Wake phase: hidden probabilities given the data.
            h_prob = logistic(np.dot(cXtr, self.weights) + self.bias_h)
            wake = np.dot(cXtr.T, h_prob)

            # Dream phase: sample hidden states, reconstruct the visible
            # layer, then recompute the hidden probabilities.
            h_state = (h_prob > np.random.rand(n, self.nh)).astype(float)
            reconstruction_data_prob = logistic(np.dot(h_state, self.weights.T) + self.bias_v)
            reconstruction_data = (reconstruction_data_prob > np.random.rand(n, self.nv)).astype(float)
            h_neg_prob = logistic(np.dot(reconstruction_data, self.weights) + self.bias_h)
            dream = np.dot(reconstruction_data.T, h_neg_prob)

            # Learning phase.
            error = np.sum((cXtr - reconstruction_data) ** 2) / n
            dW = (wake - dream) / n
            # Sum over the batch axis only, so that every unit receives its
            # own bias gradient instead of one scalar shared by all units.
            dBh = (h_prob.sum(axis=0) - h_neg_prob.sum(axis=0)) / n
            dBv = (cXtr.sum(axis=0) - reconstruction_data.sum(axis=0)) / n
            self.weights += learning_rate * dW
            self.bias_h += learning_rate * dBh
            self.bias_v += learning_rate * dBv
            print("\rError:\t" + "{:.5f}".format(error), end="")
        print("")

    def get_hidden_activations(self, Xtr):
        """Sample binary hidden states for every row of ``Xtr``.

        Args:
            Xtr: 2-D NumPy array of shape (samples, visible_units).

        Returns:
            Float array of shape (samples, hidden_units) containing 0.0/1.0
            states drawn from the hidden activation probabilities.
        """
        n = Xtr.shape[0]

        print("Computing hidden activations for " + str(n) + " elements")
        h_prob = logistic(np.dot(Xtr, self.weights) + self.bias_h)
        # A unit is on when its probability exceeds a fresh uniform sample.
        return (h_prob > np.random.rand(n, self.nh)).astype(float)

The MNIST data is loaded from the Keras library

In [None]:
from keras.datasets import mnist

# Load MNIST: each split is unpacked into its images (X) and its labels (y).
train_split, test_split = mnist.load_data()
Xtr, ytr = train_split
Xts, yts = test_split

The data is then flattened

In [None]:
def flatten(x):
    """Reshape ``x`` to 2-D, keeping the first axis and merging all the others.

    A stack of images of shape (n, h, w) becomes an array of shape (n, h * w).
    """
    n_samples = x.shape[0]
    flat = x.reshape(n_samples, -1)
    return flat


# Report the image shape before flattening and the vector shape after it,
# both on the same output line.
print("{} images of {}x{} pixels =>".format(*Xtr.shape[:3]), end=" ")
Xtr, Xts = flatten(Xtr), flatten(Xts)
print("{} vectors {} elements".format(*Xtr.shape[:2]))

60000 images of 28x28 pixels => 60000 vectors 784 elements


Then we create and train the RBM

In [None]:
# Build an RBM with one visible unit per pixel and 500 hidden units, then
# train it on the flattened images.
# NOTE(review): this rebinds the name `RBM` from the class to the instance, so
# the class is no longer reachable through that name in the cells that follow.
n_pixels = 28 * 28
RBM = RBM(hidden_units=500, visible_units=n_pixels)
RBM.train(Xtr, learning_rate=0.01, epochs=100)

Built a RBM with 784 visible units and 500 hidden units
Training on 6000 random elements for 100 epochs
Error:	11336.78300


And we compute the hidden activations that will be fed into the classifiers.

In [None]:
# Encode the training split first and the test split second; the resulting
# hidden states are the features given to the classifiers.
Xtr_h, Xts_h = (RBM.get_hidden_activations(split) for split in (Xtr, Xts))

Computing hidden activations for 60000 elements
Computing hidden activations for 10000 elements


As the classifier, I tested two common solutions.

First, a MLP classifier from the Scikit-Learn library

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score


# Fit an MLP with two hidden layers (300 and 200 units) on the RBM hidden
# activations and report its accuracy on the test split.
print("Building the MLP classifier")
classifier = MLPClassifier(learning_rate_init=0.001, hidden_layer_sizes=(300, 200))

print("Classifier training started")
classifier.fit(X=Xtr_h, y=ytr)
print("Classifier training ended")

print("Gathering classifier predictions")
predicted = classifier.predict(X=Xts_h)

print("Computing accuracy score")
accuracy = accuracy_score(y_true=yts, y_pred=predicted)
print("Accuracy on test set of {:.2f}%".format(100 * accuracy))

Building the MLP classifier
Classifier training started
Classifier training ended
Gathering classifier predictions
Computing accuracy score
Accuracy on test set of 11.35%


Then a K-NN classifier, from the Scikit-Learn library as well

In [None]:
from sklearn.neighbors import KNeighborsClassifier


# Fit a 5-nearest-neighbours classifier on the RBM hidden activations and
# report its accuracy on the test split.
print("Building the K-NN classifier")
classifier = KNeighborsClassifier(n_neighbors=5)

print("Classifier training started")
classifier.fit(X=Xtr_h, y=ytr)
print("Classifier training ended")

print("Gathering classifier predictions")
predicted = classifier.predict(X=Xts_h)

print("Computing accuracy score")
accuracy = accuracy_score(y_true=yts, y_pred=predicted)
print("Accuracy on test set of {:.2f}%".format(100 * accuracy))

Building the K-NN classifier
Classifier training started
Classifier training ended
Gathering classifier predictions
Computing accuracy score
Accuracy on test set of 10.28%


As a comparison, I tried a random classifier too

In [None]:
from sklearn.dummy import DummyClassifier


# Baseline: a dummy classifier using the 'uniform' strategy, evaluated exactly
# like the real classifiers.
print("Building the random classifier")
classifier = DummyClassifier(strategy="uniform")

print("Classifier training started")
classifier.fit(X=Xtr_h, y=ytr)
print("Classifier training ended")

print("Gathering classifier predictions")
predicted = classifier.predict(X=Xts_h)

print("Computing accuracy score")
accuracy = accuracy_score(y_true=yts, y_pred=predicted)
print("Accuracy on test set of {:.2f}%".format(100 * accuracy))

Building the random classifier
Classifier training started
Classifier training ended
Gathering classifier predictions
Computing accuracy score
Accuracy on test set of 9.86%


The random classifier picks a class with uniform probability among the unique classes it has seen in the training set and assigns the selected class to the example during the prediction phase. In the MNIST dataset there are 10 classes, one for each digit, so a random classification will be correct in 10% of the cases, which is about the result of the random classifier.

The MLP and K-NN classifiers, though, do not behave much better, with both achieving about 10% accuracy. This means that both models are as good as random guessing in classifying MNIST data using the hidden activations from my implementation of the RBM. This can have multiple causes: the CD-1 algorithm implemented by me could be flawed (and in general we know that it isn't the best state-of-the-art algorithm available), and also the experimental setup surely played a role in the low generalization capability of the models.

Inspired by [this paper](https://christian-igel.github.io/paper/TRBMAI.pdf), I tried binarizing the MNIST data: each pixel value is set to 1 if it is greater than 127 and to 0 otherwise. This is done while preprocessing the data, in a new version of the `flatten` function:

In [None]:
def flatten(x, threshold=127):
    """Binarize a stack of images and flatten each image into a vector.

    Args:
        x: Array-like whose first axis indexes the samples, e.g. (n, h, w).
        threshold: Values strictly greater than this become 1, every other
            value becomes 0. Defaults to 127.

    Returns:
        Integer array of shape (n, -1) that contains only 0 and 1.
    """
    binary = np.where(np.asarray(x) > threshold, 1, 0)
    return binary.reshape(binary.shape[0], -1)

The training of the RBM is the same as before

In [None]:
# Reload the raw images so that the binarizing `flatten` is applied to the
# original pixel values rather than to the already-flattened arrays.
(Xtr, ytr), (Xts, yts) = mnist.load_data()
print(str(Xtr.shape[0]) + " images of " + str(Xtr.shape[1]) + "x" + str(Xtr.shape[2]) + " pixels =>", end = " ")
Xtr = flatten(Xtr)
Xts = flatten(Xts)
print(str(Xtr.shape[0]) + " vectors " + str(Xtr.shape[1]) + " elements")

# The earlier `RBM = RBM(...)` cell rebinds `RBM` to an instance, so calling
# `RBM(...)` again raises "TypeError: 'RBM' object is not callable" when the
# notebook is run from top to bottom. Recover the class from whichever object
# the name currently holds before building the new machine.
rbm_class = RBM if isinstance(RBM, type) else type(RBM)
RBM = rbm_class(visible_units = (28*28), hidden_units = 500)
RBM.train(Xtr, epochs=100, learning_rate=0.01)

# Hidden states of both splits, used as features by the classifiers.
Xtr_h = RBM.get_hidden_activations(Xtr)
Xts_h = RBM.get_hidden_activations(Xts)

60000 images of 28x28 pixels => 60000 vectors 784 elements
Built a RBM with 784 visible units and 500 hidden units
Training on 6000 random elements for 100 epochs
Error:	134.32850
Computing hidden activations for 60000 elements
Computing hidden activations for 10000 elements


The classifiers too

In [None]:
# Same MLP setup (hidden layers of 300 and 200 units), now fitted on the
# hidden activations currently stored in Xtr_h / Xts_h.
print("Building the MLP classifier")
classifier = MLPClassifier(learning_rate_init=0.001, hidden_layer_sizes=(300, 200))

print("Classifier training started")
classifier.fit(X=Xtr_h, y=ytr)
print("Classifier training ended")

print("Gathering classifier predictions")
predicted = classifier.predict(X=Xts_h)

print("Computing accuracy score")
accuracy = accuracy_score(y_true=yts, y_pred=predicted)
print("Accuracy on test set of {:.2f}%".format(100 * accuracy))

Building the MLP classifier
Classifier training started
Classifier training ended
Gathering classifier predictions
Computing accuracy score
Accuracy on test set of 16.43%


In [None]:
# Same 5-nearest-neighbours setup, now fitted on the hidden activations
# currently stored in Xtr_h / Xts_h.
print("Building the K-NN classifier")
classifier = KNeighborsClassifier(n_neighbors=5)

print("Classifier training started")
classifier.fit(X=Xtr_h, y=ytr)
print("Classifier training ended")

print("Gathering classifier predictions")
predicted = classifier.predict(X=Xts_h)

print("Computing accuracy score")
accuracy = accuracy_score(y_true=yts, y_pred=predicted)
print("Accuracy on test set of {:.2f}%".format(100 * accuracy))

Building the K-NN classifier
Classifier training started
Classifier training ended
Gathering classifier predictions
Computing accuracy score
Accuracy on test set of 9.38%


In [None]:
# Uniform-strategy dummy baseline, evaluated on the hidden activations
# currently stored in Xtr_h / Xts_h.
print("Building the random classifier")
classifier = DummyClassifier(strategy="uniform")

print("Classifier training started")
classifier.fit(X=Xtr_h, y=ytr)
print("Classifier training ended")

print("Gathering classifier predictions")
predicted = classifier.predict(X=Xts_h)

print("Computing accuracy score")
accuracy = accuracy_score(y_true=yts, y_pred=predicted)
print("Accuracy on test set of {:.2f}%".format(100 * accuracy))

Building the random classifier
Classifier training started
Classifier training ended
Gathering classifier predictions
Computing accuracy score
Accuracy on test set of 10.03%


By binarizing the MNIST data, the MLP classifier increases its accuracy, while the K-NN classifier still has around 10% accuracy. This may be due to the K=5, or other factors.

In both cases, a model selection process would probably improve the performance of both models. Given a fair amount of fine-tuning, using a Restricted Boltzmann Machine to preprocess the data may be useful for handling complex data: the RBM extracts the important features, so the process acts as an automatic feature selection.