# Movie Recommendation with Restricted Boltzmann Machine

# Imports

In [66]:
import torch 
import torch.nn as nn # The torch module to implement the Neural Networks
import torch.nn.parallel # For parallel computations
import torch.optim as optim # For optimizers
import torch.utils.data # Tools
from torch.autograd import Variable # For Stochatic Gradient Descent
import numpy as np
import pandas as pd
from datetime import datetime

# Load the dataset

In [67]:
# The three ml-1m files are read with identical parser settings:
# '::' as separator, no header row, python engine, latin-1 encoding.
_DAT_OPTIONS = {'sep': '::', 'header': None, 'engine': 'python', 'encoding': 'latin-1'}

movies = pd.read_csv('ml-1m/movies.dat', **_DAT_OPTIONS)
users = pd.read_csv('ml-1m/users.dat', **_DAT_OPTIONS)
ratings = pd.read_csv('ml-1m/ratings.dat', **_DAT_OPTIONS)

In [68]:
# Report the (rows, columns) shape of each raw table.
print(f'Movies Shape: {movies.shape}')
print(f'Users Shape: {users.shape}')
print(f'Ratings Shape: {ratings.shape}', '\n')  # the extra '\n' argument leaves blank space after the report

Movies Shape: (3883, 3)
Users Shape: (6040, 5)
Ratings Shape: (1000209, 4) 



In [69]:
movies.head()

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [70]:
users.head()

Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


# Data Preprocessing

In [71]:
# Movies dataset: keep the first two columns (id, title) and discard the third (genres).
# Selecting by position into a fresh copy - instead of dropping a label in place - means
# re-running this cell after the columns were renamed no longer raises a KeyError.
movies = movies.iloc[:, :2].copy()
movies.columns = ['movie_id', 'movie']

# Ratings dataset: keep the first three columns; the fourth is not used in this notebook.
ratings = ratings.iloc[:, :3].copy()
ratings.columns = ['user_id', 'movie_id', 'rating']

# Min-max normalisation of the ratings to the [0, 1] range, vectorised over the column
# rather than looping over every row in Python.
max_rating = ratings['rating'].max()  # 5 on the raw data
min_rating = ratings['rating'].min()  # 1 on the raw data
rating_span = max_rating - min_rating
if rating_span == 0:
    # A constant column cannot be rescaled; fail loudly instead of producing NaN.
    raise ValueError('Cannot normalise ratings: all ratings have the same value')
ratings['rating'] = (ratings['rating'] - min_rating) / rating_span

## Train-Test Split

In [100]:
%%time

# Train-test split by row position: the first `training_size` rating rows form the
# training set and the remaining rows form the test set.
training_size = 800090
training_set = ratings.iloc[:training_size, :].values  # original note: until userID = 4794
test_set = ratings.iloc[training_size:, :].values  # original note: starting at userID = 4795

print(f'Train Shape: {training_set.shape}')
print(f'Test Shape: {test_set.shape}', '\n')

# The largest id found in either split is used as the number of users / movies.
highest_user_id = max(training_set[:, 0].max(), test_set[:, 0].max())
highest_movie_id = max(training_set[:, 1].max(), test_set[:, 1].max())
nb_users = int(highest_user_id)  # number of users
nb_movies = int(highest_movie_id)  # number of movies

print(f'Users: {nb_users}')
print(f'Movies: {nb_movies}')

Train Shape: (800090, 3)
Test Shape: (200119, 3) 

Users: 6040
Movies: 3952
Wall time: 316 ms


In [101]:
%%time

def convert(data, nr_observations, nr_entities, fill_value=0.0):
    '''
    Pivots a long (observation id, entity id, rating) table into a dense list of lists.

    Row i of the result (0-based) belongs to observation / user id i + 1. Each row holds one
    value per entity: position j carries the rating given to entity id j + 1, or fill_value
    when that observation has no rating for the entity.

    data: Input table (numpy array); columns 0, 1 and 2 are read as observation id, entity id and rating
    nr_observations: Number of observations (rows of the result); rows of data whose id is not an
                     integer in 1..nr_observations are ignored
    nr_entities: Number of entities rating in each observation (columns of the result)
    fill_value: Value stored where no rating exists. The default of 0.0 keeps the historical
                behaviour; pass a negative value (e.g. -1) when a genuine rating can itself be 0
                and missing entries must stay distinguishable from it.
    Returns: A list of nr_observations lists, each containing nr_entities floats
    '''
    converted_data = np.full((nr_observations, nr_entities), fill_value, dtype=float)

    observation_ids = data[:, 0]
    row_index = observation_ids.astype(int)
    # Keep only rows whose id is exactly one of the integers 1..nr_observations.
    keep = (row_index == observation_ids) & (row_index >= 1) & (row_index <= nr_observations)

    entity_index = data[:, 1][keep].astype(int)
    # One vectorised scatter replaces the per-user rescans of the whole table.
    converted_data[row_index[keep] - 1, entity_index - 1] = data[:, 2][keep]

    return converted_data.tolist()
    
# Pivot each split into a dense users x movies table, then wrap it as a torch FloatTensor.
training_set, test_set = (
    torch.FloatTensor(convert(split, nb_users, nb_movies))
    for split in (training_set, test_set)
)

Wall time: 28.7 s


In [102]:
training_set.device

device(type='cpu')

## Restricted Boltzmann Machine

In [103]:
class RestrictedBoltzmannMachine():
    '''
    Restricted Boltzmann Machine (RBM) with 'nh' hidden nodes and 'nv' visible nodes.

    Visible values below zero are treated as missing: during training they are copied back
    from the input after every Gibbs step, and they are excluded from every loss computation.
    '''
    _METRICS = ('MAE', 'RMSE')

    def __init__(self, nv, nh):
        '''
        RBM initialization module where three tensors are defined:
        W: Weight tensor of shape (nh, nv)
        a: Hidden node bias tensor of shape (1, nh), added to the hidden activations in sample_h
        b: Visible node bias tensor of shape (1, nv), added to the visible activations in sample_v
        a and b are created as two-dimensional tensors to accommodate batches of observations over training.
        '''
        self.W = torch.randn(nh, nv)
        self.a = torch.randn(1, nh)
        self.b = torch.randn(1, nv)


    @classmethod
    def _check_metric(cls, metric):
        '''Rejects metric names other than 'MAE' and 'RMSE' instead of silently reporting a zero loss.'''
        if metric not in cls._METRICS:
            raise ValueError(f"Unknown metric {metric!r}; expected one of {cls._METRICS}")


    @staticmethod
    def _error(metric, target, reconstruction):
        '''Returns the MAE or RMSE (as a Python float) over the entries where target >= 0.'''
        known = target >= 0
        difference = target[known] - reconstruction[known]
        if metric == 'MAE':
            return torch.mean(torch.abs(difference)).item()
        # torch.sqrt keeps the computation in torch rather than passing a tensor to numpy.
        return torch.sqrt(torch.mean(difference ** 2)).item()


    def sample_h(self, vx):
        '''
        Method devoted to Gibbs sampling probabilities of hidden nodes given visible nodes - p (h|v)
        vx: Input visible node tensor, one row per observation
        Returns: (probabilities of the hidden nodes, Bernoulli samples drawn from those probabilities)
        '''
        w_vx = torch.mm(vx, self.W.t())
        activation = w_vx + self.a.expand_as(w_vx)
        c_p_h_given_v = torch.sigmoid(activation)
        return c_p_h_given_v, torch.bernoulli(c_p_h_given_v)


    def sample_v(self, hx):
        '''
        Method devoted to Gibbs sampling probabilities of visible nodes given hidden nodes - p (v|h)
        hx: Input hidden node tensor, one row per observation
        Returns: (probabilities of the visible nodes, Bernoulli samples drawn from those probabilities)
        '''
        w_hx = torch.mm(hx, self.W)
        activation = w_hx + self.b.expand_as(w_hx)
        p_v_given_h = torch.sigmoid(activation)

        return p_v_given_h, torch.bernoulli(p_v_given_h)


    def train(self, nr_observations, nr_epoch, batch_size, train_tensor, metric, cd_steps=10):
        '''
        Method through which contrastive divergence-based training is performed.
        W, a and b are updated in place after every batch; one progress line is printed per epoch.
        nr_observations: Number of observations used for training (the first rows of train_tensor)
        nr_epoch: Number of training epochs
        batch_size: Batch size; the final batch of an epoch may be smaller
        train_tensor: Tensor containing training observations (not modified)
        metric: Training performance metric of choice ('MAE' for Mean Absolute Error, 'RMSE' for Root Mean Square Error)
        cd_steps: Number of Gibbs sampling steps per batch (default 10)
        Raises: ValueError if metric is neither 'MAE' nor 'RMSE'
        '''
        self._check_metric(metric)
        print('Training...')
        for epoch in range(1, nr_epoch + 1):
            start_time = datetime.now()
            print(f'Epoch {epoch} of {nr_epoch} ', end='')
            train_loss = 0.
            s = 0

            # Step up to nr_observations itself so the trailing (possibly partial) batch is
            # trained on too; stopping at nr_observations - batch_size skipped the last rows.
            for first in range(0, nr_observations, batch_size):
                last = min(first + batch_size, nr_observations)
                v0 = train_tensor[first:last]
                vk = v0
                missing = v0 < 0
                ph0, _ = self.sample_h(v0)
                for _step in range(cd_steps):
                    _, hk = self.sample_h(vk)
                    # vk is rebound to a freshly sampled tensor here, so the in-place
                    # write below never touches train_tensor.
                    _, vk = self.sample_v(hk)
                    vk[missing] = v0[missing]

                phk, _ = self.sample_h(vk)
                self.W += (torch.mm(v0.t(), ph0) - torch.mm(vk.t(), phk)).t()
                self.b += torch.sum((v0 - vk), 0)
                self.a += torch.sum((ph0 - phk), 0)

                train_loss += self._error(metric, v0, vk)
                s += 1

            time_elapsed = (datetime.now() - start_time).total_seconds()
            # An epoch without any batch reports nan instead of dividing by zero.
            mean_loss = train_loss / s if s else float('nan')
            print(f'- Loss ({metric}): {mean_loss:.8f} ({time_elapsed:.2f} seconds)')


    def test(self, nr_observations, train_tensor, test_tensor, metric):
        '''
        Method through which testing is performed.
        Each observation's training row is passed through one hidden / visible sampling step and
        the sampled visible values are compared with the test row. Observations whose test row
        has no entry >= 0 are skipped. The mean loss is printed.
        nr_observations: Number of observations used for testing
        train_tensor: Tensor containing training observations
        test_tensor: Tensor containing testing observations
        metric: Training performance metric of choice ('MAE' for Mean Absolute Error, 'RMSE' for Root Mean Square Error)
        Raises: ValueError if metric is neither 'MAE' nor 'RMSE'
        '''
        self._check_metric(metric)
        print('Testing...')
        test_loss = 0.
        s = 0
        for id_user in range(nr_observations):
            c_v = train_tensor[id_user:id_user+1]
            vt = test_tensor[id_user:id_user+1]
            if bool((vt >= 0).any()):
                _, c_h = self.sample_h(c_v)
                _, c_v = self.sample_v(c_h)
                test_loss += self._error(metric, vt, c_v)
                s += 1
        # No scored observation: report nan instead of dividing by zero.
        mean_loss = test_loss / s if s else float('nan')
        print(f'Test loss ({metric}): {mean_loss:.8f}')


    def predict(self, visible_nodes):
        '''
        Method through which predictions for one specific observation are derived.
        The hidden probabilities (not samples) are fed back to obtain visible probabilities.
        visible_nodes: Tensor containing one particular observation (set of values for each visible node)
        Returns: Tensor of visible node probabilities with the same shape as visible_nodes
        '''
        h_v, _ = self.sample_h(visible_nodes)
        v_h, _ = self.sample_v(h_v)

        return v_h
    
def movie_recommender(movie_list, train_set, test_set, model, user_id, first_test_user_id=4795, top_n=5):
    '''
    Prints a user's favourite movies followed by movie recommendations for that user.

    Favourites are the top_n highest values in the user's own row. Recommendations are the
    top_n highest model predictions that are not among those favourites; other movies the
    user has already rated are not filtered out. Columns that have no matching row in
    movie_list are skipped, and fewer than top_n titles are printed when candidates run out.

    movie_list: DataFrame of movies; column 'movie_id' holds the ID and the second column the title
    train_set: Tensor containing training observations
    test_set: Tensor containing testing observations
    model: A RBM machine learning model previously instantiated (its predict method is used)
    user_id: The (1-based) user for which preferred movies will be assessed and recommendations will be provided
    first_test_user_id: Users with an id below this value are read from train_set, the others from test_set
    top_n: Number of favourites and of recommendations to print
    Returns: None (results are printed)
    '''
    def title_of(column_index):
        # Column j of the rating table corresponds to movie id j + 1.
        rows = movie_list[movie_list.movie_id == column_index + 1]
        # Explicit positional access avoids the deprecated integer lookup on a labelled Series.
        return None if rows.empty else rows.iloc[0, 1]

    source = train_set if user_id < first_test_user_id else test_set
    user_sample = source[user_id - 1:user_id]

    pred = model.predict(user_sample).numpy()

    own_ratings = pd.Series(user_sample.numpy()[0]).sort_values(ascending=False)
    movie_indices = own_ratings.iloc[:top_n].index.values.tolist()
    print('Favourite movies of User {}\n'.format(user_id))

    for fav_movie_id in movie_indices:
        title = title_of(fav_movie_id)
        if title is not None:
            print(title)

    pred_list = pd.Series(pred[0]).sort_values(ascending=False).index.values.tolist()
    print('\nUser {} may also like these movies\n'.format(user_id))

    favourites = set(movie_indices)
    nb_recommendations = 0
    # Iterating over the ranking (rather than indexing inside a while loop) guarantees
    # termination even when fewer than top_n candidates exist.
    for pred_movie in pred_list:
        if nb_recommendations >= top_n:
            break
        if pred_movie in favourites:
            continue
        title = title_of(pred_movie)
        if title is None:
            continue
        print(title)
        nb_recommendations += 1

## Training RBM

In [104]:
%%time

# Hyper-parameters
nv = len(training_set[0])  # one visible node per column of the training tensor
nh = 100  # number of hidden nodes
batch_size = 5
epoch = 20  # number of training epochs
metric = 'MAE'

# The RBM draws its initial weights with torch.randn and samples with torch.bernoulli;
# seeding the torch RNG makes a Restart & Run All reproduce the same losses.
torch.manual_seed(0)

rbm = RestrictedBoltzmannMachine(nv, nh)
rbm.train(nb_users, epoch, batch_size, training_set, metric)
rbm.test(nb_users, training_set, test_set, metric)

Training...
Epoch 1 of 20 - Loss (MAE): 0.03920628 (33.90 seconds)
Epoch 2 of 20 - Loss (MAE): 0.03742308 (36.51 seconds)
Epoch 3 of 20 - Loss (MAE): 0.03740079 (47.57 seconds)
Epoch 4 of 20 - Loss (MAE): 0.03739331 (84.77 seconds)
Epoch 5 of 20 - Loss (MAE): 0.03719060 (62.54 seconds)
Epoch 6 of 20 - Loss (MAE): 0.03698669 (39.32 seconds)
Epoch 7 of 20 - Loss (MAE): 0.03692999 (52.98 seconds)
Epoch 8 of 20 - Loss (MAE): 0.03676546 (84.24 seconds)
Epoch 9 of 20 - Loss (MAE): 0.03670770 (31.47 seconds)
Epoch 10 of 20 - Loss (MAE): 0.03663136 (34.96 seconds)
Epoch 11 of 20 - Loss (MAE): 0.03655441 (30.71 seconds)
Epoch 12 of 20 - Loss (MAE): 0.03652532 (31.88 seconds)
Epoch 13 of 20 - Loss (MAE): 0.03656529 (29.88 seconds)
Epoch 14 of 20 - Loss (MAE): 0.03662478 (29.91 seconds)
Epoch 15 of 20 - Loss (MAE): 0.03654826 (30.00 seconds)
Epoch 16 of 20 - Loss (MAE): 0.03652560 (29.68 seconds)
Epoch 17 of 20 - Loss (MAE): 0.03651884 (29.79 seconds)
Epoch 18 of 20 - Loss (MAE): 0.03650099 (29.4

In [106]:
# movie_recommender prints its report itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" to the output.
movie_recommender(movies, training_set, test_set, rbm, 12)

Favourite movies of User 12

Raiders of the Lost Ark (1981)
Taxi Driver (1976)
Godfather: Part II, The (1974)
Christmas Story, A (1983)
Silence of the Lambs, The (1991)

User 12 may also like these movies

Casablanca (1942)
American Beauty (1999)
Sixth Sense, The (1999)
Hustler, The (1961)
8 1/2 (1963)
None


In [107]:
movie_recommender(movies, training_set, test_set, rbm, 10)

Favourite movies of User 10

Toy Story (1995)
Mary Poppins (1964)
Cinderella (1950)
Shaggy Dog, The (1959)
Escape to Witch Mountain (1975)

User 10 may also like these movies

Sixth Sense, The (1999)
American Beauty (1999)
Raiders of the Lost Ark (1981)
Hunt for Red October, The (1990)
Fargo (1996)


In [108]:
movie_recommender(movies, training_set, test_set, rbm, 4888)

Favourite movies of User 4888

Cabaret (1972)
Lawrence of Arabia (1962)
Little Big Man (1970)
Blade Runner (1982)
Apocalypse Now (1979)

User 4888 may also like these movies

Casablanca (1942)
Hustler, The (1961)
Jules and Jim (Jules et Jim) (1961)
Midnight Cowboy (1969)
8 1/2 (1963)
