<a href="https://colab.research.google.com/github/guiraposo/DataScienceProjects/blob/main/Movie_Recommendation/Movie_recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Get datafiles

!wget "http://files.grouplens.org/datasets/movielens/ml-100k.zip"
!wget "http://files.grouplens.org/datasets/movielens/ml-1m.zip"
!unzip ml-100k.zip
!unzip ml-1m.zip
!ls

--2024-02-25 10:28:05--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2024-02-25 10:28:05 (22.6 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]

--2024-02-25 10:28:05--  http://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: ‘ml-1m.zip’


2024-02-25 10:28:05 (23.5 MB/s) - ‘ml-1m.zip’ saved [5917549/5917549]

Archive:  ml-100k.zip
   creating: ml-100k/
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/READM

In [2]:
#Import relevant libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

In [3]:
#Load the three MovieLens 1M tables (movies, users, ratings)
#Fields are separated by the two-character token '::' and the files are read
#without a header row (header = None), so columns get integer labels 0, 1, 2, ...
#The python parsing engine is requested explicitly for the multi-character separator.
movies = pd.read_csv(
    'ml-1m/movies.dat',
    sep = '::',
    header = None,
    engine = 'python',
    encoding = 'latin-1',
)
users = pd.read_csv(
    'ml-1m/users.dat',
    sep = '::',
    header = None,
    engine = 'python',
    encoding = 'latin-1',
)
ratings = pd.read_csv(
    'ml-1m/ratings.dat',
    sep = '::',
    header = None,
    engine = 'python',
    encoding = 'latin-1',
)


In [4]:
#Preparing the training set and testing set

#The u1.base / u1.test splits are tab-separated, hence the delimiter parameter.
#They have no header row: header = None keeps the first rating as data instead
#of letting pandas consume it as the column names (which silently dropped one
#rating from each split).
training_set = pd.read_csv('ml-100k/u1.base', delimiter = '\t', header = None)
test_set = pd.read_csv('ml-100k/u1.test', delimiter = '\t', header = None)
#Convert to integer NumPy arrays so the columns can be indexed positionally
#and later turned into PyTorch tensors
training_set = np.array(training_set, dtype = 'int')
test_set = np.array(test_set, dtype = 'int')

In [5]:
# Largest user id (column 0) and movie id (column 1) seen in either split.
# These are used as the number of users and movies when sizing the rating matrix.
nb_users = int(max(max(split[:, 0]) for split in (training_set, test_set)))
nb_movies = int(max(max(split[:, 1]) for split in (training_set, test_set)))

In [6]:
#Convert the data to an array with users in rows and movies in columns.
#In this form the data will be suitable as input to a neural network.

def convert(dataset, n_users=None, n_movies=None):
    """Build a dense user-by-movie rating table from rating rows.

    Args:
        dataset: 2-D integer NumPy array. Column 0 is read as a 1-based user
            id, column 1 as a 1-based movie id and column 2 as the rating;
            any further columns are ignored.
        n_users: number of rows to produce (user ids 1..n_users). Defaults to
            the notebook-level ``nb_users``.
        n_movies: number of columns per row (movie ids 1..n_movies). Defaults
            to the notebook-level ``nb_movies``.

    Returns:
        A list with one entry per user, each a list of ``n_movies`` floats:
        the user's rating for every movie, or 0.0 where the user has no
        rating in ``dataset``.
    """
    # Fall back to the notebook globals so existing convert(dataset) calls
    # behave exactly as before.
    if n_users is None:
        n_users = nb_users
    if n_movies is None:
        n_movies = nb_movies

    user_column = dataset[:, 0]
    new_data = []
    for id_user in range(1, n_users + 1):
        # Row mask for this user, computed once and reused for both columns.
        user_rows = user_column == id_user
        id_movies = dataset[user_rows, 1]
        id_ratings = dataset[user_rows, 2]
        rating_list = np.zeros(n_movies)
        # Movie ids are 1-based, array positions are 0-based.
        rating_list[id_movies - 1] = id_ratings
        new_data.append(list(rating_list))
    return new_data

In [7]:
# Reshape both splits into per-user rating lists (one row per user).
training_set, test_set = convert(training_set), convert(test_set)

In [8]:
# Turn the nested rating lists into float tensors, training split first.
training_set, test_set = (
    torch.FloatTensor(split) for split in (training_set, test_set)
)

In [9]:
# Convert to binary classification
# Encoding after this cell: -1 = no rating, 0 = rated 1 or 2, 1 = rated 3 or more.
# The steps are order-dependent: the original zeros (no rating) must be moved
# to -1 before ratings 1 and 2 are collapsed onto 0.
# The tensors are modified in place, so this cell must run exactly once.
for rating_tensor in (training_set, test_set):
    rating_tensor[rating_tensor == 0] = -1
    rating_tensor[(rating_tensor == 1) | (rating_tensor == 2)] = 0
    rating_tensor[rating_tensor >= 3] = 1

In [76]:
#Creating architecture for the Neural Network. In this case, RBM.
class RBM():
    """Restricted Boltzmann machine with sigmoid/Bernoulli units.

    Attributes:
        W: weight matrix of shape (hidden_nodes, visible_nodes).
        a: hidden-unit bias of shape (1, hidden_nodes).
        b: visible-unit bias of shape (1, visible_nodes).
    """

    def __init__(self, visible_nodes, hidden_nodes):
        """Draw all parameters from a standard normal distribution."""
        self.W = torch.randn(hidden_nodes, visible_nodes)
        self.a = torch.randn(1, hidden_nodes)
        self.b = torch.randn(1, visible_nodes)

    def sample_hidden(self, x):
        """Return (p(h=1 | x), Bernoulli sample of h) for visible rows ``x``.

        ``x`` is a 2-D tensor with visible_nodes columns; both results have
        one row per row of ``x`` and hidden_nodes columns.
        """
        # The (1, hidden_nodes) bias broadcasts over the rows of the product.
        pre_activation = torch.mm(x, self.W.t()) + self.a
        prob_hidden = torch.sigmoid(pre_activation)
        hidden_sample = torch.bernoulli(prob_hidden)
        return prob_hidden, hidden_sample

    def sample_visible(self, y):
        """Return (p(v=1 | y), Bernoulli sample of v) for hidden rows ``y``.

        ``y`` is a 2-D tensor with hidden_nodes columns; both results have
        one row per row of ``y`` and visible_nodes columns.
        """
        # W is already (hidden, visible), so no transpose is needed here.
        pre_activation = torch.mm(y, self.W) + self.b
        prob_visible = torch.sigmoid(pre_activation)
        visible_sample = torch.bernoulli(prob_visible)
        return prob_visible, visible_sample

    def train (self, v0, vk, ph0, phk):
        """Update W, b and a in place from one batch.

        Args:
            v0: visible rows the chain started from.
            vk: visible rows after the sampling steps.
            ph0: hidden probabilities given ``v0``.
            phk: hidden probabilities given ``vk``.
        """
        data_term = torch.mm(v0.t(), ph0)
        model_term = torch.mm(vk.t(), phk)
        # Both products are (visible, hidden); transpose to match W's layout.
        self.W += (data_term - model_term).t()
        # Bias updates are summed over the batch dimension.
        self.b += torch.sum(v0 - vk, 0)
        self.a += torch.sum(ph0 - phk, 0)

In [77]:
# One visible unit per column of the training rating tensor.
visible_nodes = training_set.shape[1]
# Number of features to detect. Change accordingly.
hidden_nodes = 100
# Number of user rows taken per training batch. Change accordingly.
batch_size = 100

In [78]:
# Create the RBM object with the configured layer sizes.
rbm = RBM(visible_nodes=visible_nodes, hidden_nodes=hidden_nodes)

In [79]:
#Training the RBM model.
# Each epoch sweeps the users in batches of batch_size rows. For every batch a
# 10-step Gibbs chain is run from the data and rbm.train() is called once with
# the start and end states of the chain.
nb_epoch = 10
for epoch in range(1,nb_epoch +1):
    train_loss = 0
    loss_counter = 0. #Counter to normalize the loss
    # Step through *all* users. The previous bound (nb_users - batch_size)
    # skipped the trailing users, and a whole batch when nb_users is a multiple
    # of batch_size. Slicing clips at the end of the tensor, so the last batch
    # may simply hold fewer rows.
    for id_user in range(0, nb_users, batch_size):
        vk = training_set[id_user:id_user + batch_size]
        v0 = training_set[id_user:id_user + batch_size] #Our target.
        # Masks depend only on v0, so build them once per batch.
        missing = v0 == -1
        observed = v0 >= 0
        ph0,_ = rbm.sample_hidden(v0)
        for k_step in range(10):
            _,hk = rbm.sample_hidden(vk)
            # vk is rebound to a fresh tensor here, so the in-place write below
            # never touches training_set.
            _,vk = rbm.sample_visible(hk)
            vk[missing] = v0[missing] # Avoid training the missing values
        phk,_ = rbm.sample_hidden(vk)
        rbm.train(v0, vk, ph0, phk)
        # Mean absolute difference over the entries that carry a rating.
        train_loss += torch.mean(torch.abs(v0[observed]-vk[observed])) #Simple distance error
        loss_counter += 1.
    print('epoch= ' +  str(epoch) + ' loss:' + str(train_loss/loss_counter))


epoch= 1 loss:tensor(0.3369)
epoch= 2 loss:tensor(0.2533)
epoch= 3 loss:tensor(0.2501)
epoch= 4 loss:tensor(0.2456)
epoch= 5 loss:tensor(0.2509)
epoch= 6 loss:tensor(0.2476)
epoch= 7 loss:tensor(0.2464)
epoch= 8 loss:tensor(0.2498)
epoch= 9 loss:tensor(0.2491)
epoch= 10 loss:tensor(0.2492)
