In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.parallel
import torch.utils.data
from torch.autograd import Variable

  from .autonotebook import tqdm as notebook_tqdm


Importing the dataset

In [3]:
movies = pd.read_csv('D:\\Courses\\deep_learning\\Boltzmann_Machines\\ml-1m\\movies.dat',sep='::',header=None,engine='python',encoding='latin-1')

- sep: since separator in this case isn't a comma
- encoding : in order to account for special characters
- header : since there is no header in the file
- engine: to specify the language in which the parsers are written. Can be either C or python

In [4]:
movies.head()

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
users = pd.read_csv('D:\\Courses\\deep_learning\\Boltzmann_Machines\\ml-1m\\users.dat',sep='::',header=None,engine='python',encoding='latin-1')

In [8]:
users.head()

Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [9]:
ratings = pd.read_csv('D:\\Courses\\deep_learning\\Boltzmann_Machines\\ml-1m\\ratings.dat',sep='::',header=None,engine='python',encoding='latin-1')

In [10]:
ratings.head()

Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


### Creating training and test sets

We have multiple train-test splits to perform K fold train test splits

In [12]:
training_set = pd.read_csv('D:\\Courses\\deep_learning\\Boltzmann_Machines\\ml-100k\\u1.base',delimiter='\t')

Converting dataframe to numpy array

In [13]:
training_set = np.array(training_set,dtype='int')

we are specifying the desired datatype(dtype) of the array elements

We do the same for the test set

In [14]:
test_set = pd.read_csv('D:\\Courses\\deep_learning\\Boltzmann_Machines\\ml-100k\\u1.test',delimiter='\t')
test_set = np.array(test_set,dtype='int')

### Getting the number of users and movies

To obtain the number of users, we get the maximum value present in the test and the training set

In [15]:
nb_users = int(max(max(training_set[:,0]),max(test_set[:,0])))

Similarly we can obtain the number of users

In [16]:
nb_movies = int(max(max(training_set[:,1]),max(test_set[:,1])))

### Creating a matrix of features

Since we will be using pytorch, instead of creating 2d numpy array, we create a list of lists

- We will have one list per user

In [17]:
def convert(data):
    new_data = []
    for id_users in range(1,nb_users+1):
        #Obtain the movies rated by the user
        id_movies = data[:,1][data[:,0] == id_users]

        #Obtain the user ratings
        id_ratings = data[:,2][data[:,0] == id_users]

        #For movies the user has not given ratings for, we need to enter a 0
        ratings = np.zeros(nb_movies)
        ratings[id_movies-1] = id_ratings

        #Add this list of one user to the central list
        new_data.append(ratings)

    return new_data    

Takeaway - filtering method for numpy arrays

In [18]:
training_set = convert(training_set)
test_set =  convert(test_set)

In [21]:
len(training_set)

943

Creating data into Pytorch tensors

In [22]:
training_set = torch.FloatTensor(training_set)

  training_set = torch.FloatTensor(training_set)


In [23]:
test_set = torch.FloatTensor(test_set)

### Converting ratings to binary ratings(like or not like)

Replace the 0 ratings(movies which werent rated by the user but filled randomly) with -1.

In [24]:
training_set[training_set == 0] = -1

Now we replace the 1 and 2 star reviews with 0.(user didn't like those movies).

The reviews with more than 3 stars must be replaced by 1

In [25]:
training_set[training_set == 1] = 0
training_set[training_set == 2] = 0
training_set[training_set >= 3] = 1

In [26]:
test_set[test_set == 1] = 0
test_set[test_set == 2] = 0
test_set[test_set >= 3] = 1

### Creating the architecture of neural networks

In [37]:
class RBM():
    def __init__(self,nv,nh):
        self.W = torch.randn(nh,nv)
        self.a = torch.randn(1,nh)
        self.b = torch.randn(1,nv)

    def sample_h(self,x):
        wx = torch.mm(x,self.W.t()) #take transpose so that the dimensions match
        activation = wx + self.a.expand_as(wx)
        p_h_given_v = torch.sigmoid(activation)
        return p_h_given_v,torch.bernoulli(p_h_given_v)

    def sample_v(self,y):
        wy = torch.mm(y,self.W) #no need to take the transpose(why???)
        activation = wy + self.b.expand_as(wy)
        p_v_given_h = torch.sigmoid(activation)
        return p_v_given_h,torch.bernoulli(p_v_given_h)
    
    def train(self,v0,vk,ph0,phk):
        self.W += (torch.mm(v0.t(),ph0) - torch.mm(vk.t(),phk)).t()
        self.b += torch.sum((v0-vk),0) #Done to maintain the dimension of b
        self.a += torch.sum((ph0-phk),0)

nv = len(training_set[0])
nh = 100
batch_size = 100

rbm = RBM(nv, nh)

Init function
* nv- number of visible nodes
* nh -number of hidden nodes
* W - we initiliaze weights in this step and it has to be random. By convention it is matrix of size nv*nh 
* Next ,we initialize the bias. This has to be a two dimensional vector with the first dimension being the batch and the second dimension being the bias
   - a: bias for hidden nodes
   - b : bias for visible nodes


Probability of hidden nodes getting activated given visible nodes

* Second function we need involves sampling the nodes given the probability P(H/V). H- hidden node, V - visible node. This is analogous to the forward pass in neural networks.

* This is basically the sigmoid function applied to W*X + b

* The function will activate hidden nodes based on probabilities it calculates based on the input
* This step is essential since we need to maximise our log likelihood function and gibbs sampling is an integral part of it. Gibbs sampling needs these probabilities

* x- visible neurons
* torch.nn - used to multiply two tensors
* expand_as(wx): When adding the bias, we need to make sure that the bias gets added to each entry in the wx array. Hence we use expand_as(wx)
* We pass this to the sigmoid function and obtain the probability of the hidden nodes getting activated given the visible nodes.
* bernoulli(p_h_given_v): What we have is an array/tensor of probability values and we are hoping to predict a binary output(like or didn't like) based on this. For this we need to use the bernoulli distribution.torch.bernoulli(p_h_given_v)
* So basically probabilities below a certain threshold will be set to 0 while the rest will be set to 1


Probability of visible nodes getting activated given hidden nodes

* y - hidden nodes
* Here, we won't need to take transpose while multiplying the weights(check why???)

Contrastive Divergence
* Aim is to minimize energy and maximize log likelihood
* We achieve this through Gibbs Sampling. We create a Gibbs Chain of k samples by sampling the hidden and visibles nodes k times.
 - v0, vk- visible nodes obtained after first and kth iteration
 - ph0,phk - probabilities given v0 and vk


### Initializing the model

The number of visible nodes will be the number of movies we have in this dataset. The number of hidden nodes however is upto us.

### Training the RBM

In [41]:
nb_epoch = 10
for epoch in range(1,nb_epoch+1):
    train_loss = 0
    s = 0.
    for id_user in range(0, nb_users - batch_size,batch_size):
        #The following are the weights that will be updated every epoch
        #We initially set it to the initial non-updated weights v0
        vk = training_set[id_user:id_user+batch_size]
        #This is the actual rating given by the users. Will be fixed.
        v0 = training_set[id_user:id_user+batch_size]
        #the sample_h function returns the probability and the samples. We only need the former
        ph0,_ = rbm.sample_h(v0)
        for k in range(10):
            #First step is to sample the hidden nodes based on the visible node values
            _,hk = rbm.sample_h(vk)
            #Next, we obtain the updated visible nodes based on the first hidden nodes
            _,vk = rbm.sample_v(hk)
            #We have to make sure the entires with -1 ratings(the ones not rated by the users) aren't updated
            vk[v0<0]  = v0[v0<0]
        phk,_ = rbm.sample_h(vk)
        rbm.train(v0,vk,ph0,phk)
        #Measuring the loss
        #We make use of the mean absolute distance between the predicted value and the actual value to obtain our test loss
        train_loss += torch.mean(torch.abs(v0[v0>=0]-vk[v0>=0]))
        #Incrementing the counter
        s += 1.
    #We divide the train_loss because it will be cumulative(ex 0.2+0.2 over 2 cycles). We will need to average it out
    print('epoch: '+str(epoch)+' loss: '+str(train_loss/s))

epoch: 1 loss: tensor(0.2474)
epoch: 2 loss: tensor(0.2456)
epoch: 3 loss: tensor(0.2448)
epoch: 4 loss: tensor(0.2496)
epoch: 5 loss: tensor(0.2426)
epoch: 6 loss: tensor(0.2503)
epoch: 7 loss: tensor(0.2433)
epoch: 8 loss: tensor(0.2475)
epoch: 9 loss: tensor(0.2462)
epoch: 10 loss: tensor(0.2435)


### Measuring test loss

In [48]:
test_loss = 0
s = 0.
for id_user in range(nb_users):
    #We  don't change it to training set since we will be using these weights to activate the hidden neurons
    v = training_set[id_user:id_user+1]
    vt = test_set[id_user:id_user+1]
    # Now, we won't need K-fold contrastive divergence. Since the model has already been trained 10 times , it will just take 1 pass to update the parameters
    #If loop to check if there are valid ratings. Will be true in most cases
    if len(vt[v>0]) > 0:
        _,h = rbm.sample_h(v)
        _,v = rbm.sample_v(h)
        test_loss += torch.mean(torch.abs(vt[vt>=0]-v[vt>=0]))
        s += 1.
print('test loss: '+str(test_loss/s))

test loss: tensor(0.6583)
