# Boltzmann Machine

## Downloading the dataset

### ML-100K

In [None]:
!wget "http://files.grouplens.org/datasets/movielens/ml-100k.zip"
!unzip ml-100k.zip
!ls

--2020-06-08 15:18:04--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2020-06-08 15:18:04 (15.6 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]

Archive:  ml-100k.zip
   creating: ml-100k/
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base    

### ML-1M

In [None]:
!wget "http://files.grouplens.org/datasets/movielens/ml-1m.zip"
!unzip ml-1m.zip
!ls

--2020-06-08 15:18:16--  http://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: ‘ml-1m.zip’


2020-06-08 15:18:19 (2.44 MB/s) - ‘ml-1m.zip’ saved [5917549/5917549]

Archive:  ml-1m.zip
   creating: ml-1m/
  inflating: ml-1m/movies.dat        
  inflating: ml-1m/ratings.dat       
  inflating: ml-1m/README            
  inflating: ml-1m/users.dat         
ml-100k  ml-100k.zip  ml-1m  ml-1m.zip	sample_data


## Importing the libraries

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
#Parallel computations
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
#SGD
from torch.autograd import Variable

## Importing the dataset


In [None]:
# The ML-1M .dat files separate fields with '::' rather than a comma, because
# some movie titles contain commas themselves.
# They have no header row (header=None), the multi-character separator is
# handled by the python parsing engine, and latin-1 decodes the special
# characters that appear in some titles.

# Movie table. Original author's note: "We won't be using this dataset."
movies = pd.read_csv(
  'ml-1m/movies.dat',
  sep='::',
  header=None,
  engine='python',
  encoding='latin-1',
)
# User table: includes gender, age and job.
users = pd.read_csv(
  'ml-1m/users.dat',
  sep='::',
  header=None,
  engine='python',
  encoding='latin-1',
)
# Rating table: user IDs, movie IDs and ratings.
ratings = pd.read_csv(
  'ml-1m/ratings.dat',
  sep='::',
  header=None,
  engine='python',
  encoding='latin-1',
)

## Preparing the training set and the test set


In [None]:
# The ml-100k folder contains several ready-made 80% train / test splits
# (they could also be used for k-fold cross-validation); the u1 split is used.
# The files are tab-separated; columns 0, 1 and 2 hold the user ID, the movie
# ID and the rating, which is how the later cells index them.
# The files have no header row, so header=None is required: without it pandas
# takes the first rating line of each file as column names and silently
# drops that rating from the data.
training_set = pd.read_csv('ml-100k/u1.base', delimiter = '\t', header = None)
# IDs and ratings are all integers; a NumPy array is the starting point for
# the later conversion to PyTorch tensors.
training_set = np.array(training_set, dtype = 'int')
test_set = pd.read_csv('ml-100k/u1.test', delimiter = '\t', header = None)
test_set = np.array(test_set, dtype = 'int')

## Getting the number of users and movies


In [None]:
# Plan: build two matrices (train and test) with the same number of users and
# movies. Cell (user, movie) holds the rating given by that user, or 0 when
# the user did not rate the movie.
#
# The splits are random, so the highest user ID or movie ID may sit in either
# set: take the maximum of each set, then the larger of the two.

# Column 0 holds the user IDs.
nb_users = int(max(training_set[:, 0].max(), test_set[:, 0].max()))
# Column 1 holds the movie IDs.
nb_movies = int(max(training_set[:, 1].max(), test_set[:, 1].max()))

## Converting the data into an array with users in lines and movies in columns


In [None]:
#Corresponds to what RBM expects in input
#Create sturcture that will obtain observations in lines and features in columns

def convert(data, n_users = None, n_movies = None):
  """Turn rating rows into a dense user-by-movie list of lists.

  Args:
    data: 2-D integer NumPy array whose columns 0, 1 and 2 hold the user ID,
      the movie ID and the rating. IDs are 1-based.
    n_users: number of rows to produce, one per user ID in 1..n_users.
      Defaults to the notebook-level `nb_users`.
    n_movies: number of entries per row. Defaults to the notebook-level
      `nb_movies`.

  Returns:
    A list with one entry per user. Each entry is a list of `n_movies`
    floats holding that user's ratings, with 0.0 for movies the user has
    not rated.
  """
  # Fall back to the notebook globals so existing convert(data) calls still work.
  if n_users is None:
    n_users = nb_users
  if n_movies is None:
    n_movies = nb_movies
  new_data = []
  for id_users in range(1, n_users + 1):
    # Select this user's rows once and reuse the mask for both columns.
    user_rows = data[:, 0] == id_users
    id_movies = data[user_rows, 1]
    id_ratings = data[user_rows, 2]
    # Start from all zeros so unrated movies stay at 0.
    ratings = np.zeros(n_movies)
    # Movie IDs start at 1 while array indices start at 0.
    ratings[id_movies - 1] = id_ratings
    new_data.append(list(ratings))
  return new_data

# Replace both rating tables with their dense user x movie list-of-lists form.
training_set, test_set = convert(training_set), convert(test_set)

## Converting the data into Torch tensors


In [None]:
# A tensor is a multi-dimensional matrix whose elements share one data type.
# torch.FloatTensor is built here from the list of lists that the conversion
# step produced for each set.
training_set, test_set = (
  torch.FloatTensor(training_set),
  torch.FloatTensor(test_set),
)

## Converting the ratings into binary ratings 1 (Liked) or 0 (Not Liked)


In [None]:
# Map the 1-5 ratings to binary labels so the inputs have the same format as
# the binary predictions the RBM produces:
#   0      -> -1  (not rated)
#   1 or 2 ->  0  (did not like)
#   3 to 5 ->  1  (liked)
# Both tensors are modified in place. The order matters: the zeros must become
# -1 before any rating is mapped to 0.
for split_ratings in (training_set, test_set):
  split_ratings[split_ratings == 0] = -1
  split_ratings[(split_ratings == 1) | (split_ratings == 2)] = 0
  split_ratings[split_ratings >= 3] = 1

## Creating the architecture of the Neural Network


In [None]:
#Class is ensemble of instructions, model of something we want to build


class RBM:
  """Bernoulli restricted Boltzmann machine updated by contrastive divergence.

  Parameters are stored as plain tensors:
    W: weights, shape (nh, nv).
    a: bias used for P(h | v), shape (1, nh).
    b: bias used for P(v | h), shape (1, nv).
  """

  def __init__(self, nv, nh):
    """Initialise every parameter from a standard normal distribution.

    Args:
      nv: number of visible nodes.
      nh: number of hidden nodes.
    """
    # Weights linking the nh hidden nodes to the nv visible nodes.
    self.W = torch.randn(nh, nv)
    # The biases keep a leading dimension of size 1 (the batch axis) so they
    # broadcast over every row of a mini-batch.
    self.a = torch.randn(1, nh)
    self.b = torch.randn(1, nv)

  def sample_h(self, x):
    """Sample the hidden nodes given the visible nodes.

    Args:
      x: visible values, shape (batch, nv).

    Returns:
      A pair (p_h_given_v, sample), both of shape (batch, nh): the activation
      probabilities sigmoid(x W^T + a) and a Bernoulli draw of 0s and 1s
      taken with those probabilities.
    """
    # W is (nh, nv), so it is transposed to multiply a (batch, nv) input.
    pre_activation = torch.mm(x, self.W.t()) + self.a
    hidden_probs = torch.sigmoid(pre_activation)
    # Each hidden node is switched on with its own probability.
    hidden_sample = torch.bernoulli(hidden_probs)
    return hidden_probs, hidden_sample

  def sample_v(self, y):
    """Sample the visible nodes given the hidden nodes.

    Args:
      y: hidden values, shape (batch, nh).

    Returns:
      A pair (p_v_given_h, sample), both of shape (batch, nv): the activation
      probabilities sigmoid(y W + b) and a Bernoulli draw of 0s and 1s taken
      with those probabilities.
    """
    pre_activation = torch.mm(y, self.W) + self.b
    visible_probs = torch.sigmoid(pre_activation)
    visible_sample = torch.bernoulli(visible_probs)
    return visible_probs, visible_sample

  def train(self, v0, vk, ph0, phk):
    """Apply one contrastive-divergence update to W, b and a in place.

    The update is the raw difference between the statistics at the start and
    at the end of the Gibbs chain, summed over the batch. No learning rate or
    batch-size normalisation is applied.

    Args:
      v0: input visible values, shape (batch, nv).
      vk: visible values after k Gibbs steps, shape (batch, nv).
      ph0: hidden activation probabilities given v0, shape (batch, nh).
      phk: hidden activation probabilities given vk, shape (batch, nh).
    """
    # Both products are (nv, nh); their difference is transposed to match W.
    start_stats = torch.mm(v0.t(), ph0)
    end_stats = torch.mm(vk.t(), phk)
    self.W += (start_stats - end_stats).t()
    # Bias updates: sum over the batch axis, broadcast onto the (1, n) biases.
    self.b += torch.sum(v0 - vk, 0)
    self.a += torch.sum(ph0 - phk, 0)
# Visible nodes: one per movie, i.e. the width of a user's rating row
# (nb_movies would give the same number).
nv = training_set.shape[1]
# Hidden nodes: a free choice, set to 100 here. They stand for the features
# the model may detect (e.g. actors, awards, genres).
nh = 100
# Parameters are updated once per batch of users rather than once per user,
# which makes training faster.
batch_size = 100
rbm = RBM(nv, nh)

## Training the RBM


In [None]:
nb_epoch = 10
# Each epoch sends every user through the network once, in mini-batches.
# The parameters are updated after each batch.
for epoch in range(1, nb_epoch + 1):
  # Running sum of the per-batch mean absolute errors (predicted vs. real).
  train_loss = 0
  # Number of batches processed, used to normalise train_loss.
  s = 0.
  # Walk over the users one batch at a time (0-99, 100-199, ...).
  # The range runs up to nb_users, not nb_users - batch_size: slicing past the
  # end just yields a shorter final batch. Stopping early left the trailing
  # users (or a whole final batch when nb_users is a multiple of batch_size)
  # out of training.
  for id_user in range(0, nb_users, batch_size):
    # v0 is the target: the batch's real ratings, never modified.
    v0 = training_set[id_user : id_user + batch_size]
    # vk starts as the same ratings and is replaced by Gibbs samples below.
    vk = v0
    # Probabilities that the hidden nodes are 1 given the original ratings.
    # Only the probabilities are needed, so the sample is discarded.
    ph0,_ = rbm.sample_h(v0)
    # k steps of contrastive divergence: round trips from the visible nodes
    # to the hidden nodes and back.
    for k in range(10):
      # Sample the hidden nodes from the current visible nodes.
      _,hk = rbm.sample_h(vk)
      # Sample new visible nodes from those hidden nodes. This rebinds vk to
      # a fresh tensor, so training_set itself is never modified.
      _,vk = rbm.sample_v(hk)
      # Freeze the unrated cells (-1): they keep their original value and so
      # cancel out in the update terms that use v0 - vk.
      vk[v0<0] = v0[v0<0]
    # Hidden probabilities given the visible nodes at the end of the chain.
    phk,_ = rbm.sample_h(vk)
    # Update the weights and both biases.
    rbm.train(v0, vk, ph0, phk)
    # Loss: mean absolute error over the movies that were actually rated.
    train_loss += torch.mean(torch.abs(v0[v0 >= 0] - vk[v0 >= 0]))
    s += 1.
  print('epoch: '+str(epoch)+' loss: '+str(train_loss/s))


epoch: 1 loss: tensor(0.3446)
epoch: 2 loss: tensor(0.2168)
epoch: 3 loss: tensor(0.2445)
epoch: 4 loss: tensor(0.2458)
epoch: 5 loss: tensor(0.2490)
epoch: 6 loss: tensor(0.2498)
epoch: 7 loss: tensor(0.2468)
epoch: 8 loss: tensor(0.2488)
epoch: 9 loss: tensor(0.2497)
epoch: 10 loss: tensor(0.2462)


## Testing the RBM


In [None]:
# Running sum of the per-user errors and the number of users scored.
test_loss = 0
s = 0.
# Score the users one at a time (no batching).
for id_user in range(nb_users):
  # The user's training ratings are the input that activates the hidden
  # nodes; the test ratings are the targets to compare against.
  v = training_set[id_user:id_user+1]
  vt = test_set[id_user:id_user+1]
  rated = vt >= 0
  # Skip users who have no rated movie in the test set.
  if len(vt[rated]) == 0:
    continue
  # A single visible -> hidden -> visible round trip gives the predictions.
  _,hidden = rbm.sample_h(v)
  _,v = rbm.sample_v(hidden)
  # Mean absolute error over the movies rated in the test set.
  test_loss += torch.mean(torch.abs(vt[rated] - v[rated]))
  s += 1.
print('test loss: '+str(test_loss/s))

test loss: tensor(0.2359)
