<a href="https://colab.research.google.com/github/forlinarthur/deep-learning-tensorflow/blob/master/Movie_Rating_Auto_Encoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
import torch
from torch import nn, optim
from torch.nn import parallel
from torch.utils import data
from torch.autograd import Variable

# Prepare training and test set 
- 1 Million registers divided as 20% test and 80% training

In [0]:
training_set = pd.read_csv("drive/My Drive/ML - AZ Course/Data/ml-1m/training_set.csv")
test_set = pd.read_csv("drive/My Drive/ML - AZ Course/Data/ml-1m/test_set.csv")

In [0]:
training_set.head()

Unnamed: 0,User,Movie,Rating,Timestamp
0,1,661,3,978302109
1,1,914,3,978301968
2,1,3408,4,978300275
3,1,2355,5,978824291
4,1,1287,5,978302039


# We need to convert our data into numPy arrays because that's the expected format to be able to transform them into tensors.

In [0]:
training_set = np.array(training_set, dtype = 'int')
test_set = np.array(test_set, dtype = 'int')

In [0]:
print(len(training_set))
print(len(test_set))

750121
250088


In [0]:
training_set[:5]

array([[        1,       661,         3, 978302109],
       [        1,       914,         3, 978301968],
       [        1,      3408,         4, 978300275],
       [        1,      2355,         5, 978824291],
       [        1,      1287,         5, 978302039]])

# Get number of users and movies

In [0]:
nb_users = int(max(training_set[:,0]))
nb_movies = int(max(training_set[:,1]))
print(f'Number of users: ', nb_users)
print(f'Number of movies: ', nb_movies)

Number of users:  6040
Number of movies:  3952


# Convert data into array of users and movies (in order to transform in tensors)
- Get all movieId and ratings from each user and create a list of ratings indexed by movieId.
- Create a list of lists containing all ratings of each user.
- Return the new data list.

In [0]:
def convert(data):
  new_data = []
  for id_users in range(1, nb_users + 1):
      id_movies = data[:,1][data[:,0] == id_users]
      id_ratings = data[:,2][data[:,0] == id_users]
      ratings = np.zeros(nb_movies)
      ratings[id_movies - 1] = id_ratings
      new_data.append(list(ratings))
  return new_data

In [0]:
training_set_1 = convert(training_set)
test_set_1 = convert(test_set)

In [0]:
len(training_set_1)

6040

# Convert data into Torch tensor

In [0]:

training_set_1 = torch.FloatTensor(training_set_1)
test_set_1 = torch.FloatTensor(test_set_1)

In [0]:
training_set_1

tensor([[5., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [3., 0., 0.,  ..., 0., 0., 0.]])

# Architecture of Auto-Encoder neural network
## Create the fully connected layers containing:
* Encoder with a node for each movie (nb_movies) that connects to a 20 nodes layer.
* The previous layer will then be connected (shrinked) to a 10 nodes layer.
* A Decoder that takes the 10 nodes layer and connects it to a 20 nodes layer.
* A last layer that restores the inital movies list size.
* Activation function (Sigmoid).
* Finally, create a function that will apply the activation over each layer (forward propagation), except the last one (output).

In [0]:

class SAE(nn.Module):
  def __init__(self, ):
    super(SAE, self).__init__()
    self.fc1 = nn.Linear(nb_movies, 20)
    self.fc2 = nn.Linear(20, 10)
    self.fc3 = nn.Linear(10, 20)
    self.fc4 = nn.Linear(20, nb_movies)
    self.activation = nn.Sigmoid()
  def forward(self, x):
    x = self.activation(self.fc1(x))
    x = self.activation(self.fc2(x))
    x = self.activation(self.fc3(x))
    x = self.fc4(x)
    return x

In [0]:
sae = SAE()
criterion = nn.MSELoss()
optimizer = optim.RMSprop(sae.parameters(), lr = 0.01, weight_decay = 0.5)

#Train Neural Network
We will loop over 200 epochs and for each one we take the information from each user and extract it from the tensor, then we check if there are valid ratings (at least one non zero rating), if true we send the data to our sae class. After that we calculate loss and apply a mean corrector. We are now able to call backpropagation, add our loss metric (RMSE) and finally call the RMSprop optimizer.

In [0]:
# Train NN
nb_epoch = 200
for epoch in range(1, nb_epoch + 1):
  train_loss = 0
  s = 0.
  for id_user in range(nb_users):
    inpt = Variable(training_set_1[id_user]).unsqueeze(0)
    target = inpt.clone()
    if torch.sum(target.data > 0) > 0:
      output = sae(inpt)
      target.require_grad = False
      output[target == 0] = 0
      loss = criterion(output, target)
      mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
      loss.backward()
      train_loss += np.sqrt(loss.data.item() * mean_corrector)
      s += 1.
      optimizer.step()
  print('epoch: ' + str(epoch) + ' loss: ' + str(train_loss/s))

epoch: 1 loss: 1.3472274646257396
epoch: 2 loss: 1.0100685584508857
epoch: 3 loss: 0.9899486759401004
epoch: 4 loss: 0.9832039737762297
epoch: 5 loss: 0.9801433544190066
epoch: 6 loss: 0.9783893666767414
epoch: 7 loss: 0.9774148415264873
epoch: 8 loss: 0.9765183444392906
epoch: 9 loss: 0.9759904665796292
epoch: 10 loss: 0.9755635603824311
epoch: 11 loss: 0.9752304424767345
epoch: 12 loss: 0.9751184393630885
epoch: 13 loss: 0.9747343990640828
epoch: 14 loss: 0.9745999175112476
epoch: 15 loss: 0.9743667555254177
epoch: 16 loss: 0.9743899248009193
epoch: 17 loss: 0.9741234986375745
epoch: 18 loss: 0.9740008441424362
epoch: 19 loss: 0.9738845408671623
epoch: 20 loss: 0.9739739082931812
epoch: 21 loss: 0.9737233644109785
epoch: 22 loss: 0.9735608893380282
epoch: 23 loss: 0.9734317003837023
epoch: 24 loss: 0.9732252147067989
epoch: 25 loss: 0.9727567734499101
epoch: 26 loss: 0.9719347022547415
epoch: 27 loss: 0.9712102859536008
epoch: 28 loss: 0.970487125752764
epoch: 29 loss: 0.970106379046

Training over 1M - epoch: 200 loss: 0.8728275413825726

# Test NN (final result)
For testing the network we do almost the same as we did for training except the epochs looping.

In [0]:

test_loss = 0
s = 0.
for id_user in range(nb_users):
    inpt = Variable(training_set_1[id_user]).unsqueeze(0)
    target = Variable(test_set_1[id_user]).unsqueeze(0)
    if torch.sum(target.data > 0) > 0:
        output = sae(inpt)
        target.require_grad = False
        output[target == 0] = 0
        loss = criterion(output, target)
        mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
        test_loss += np.sqrt(loss.data.item() * mean_corrector)
        s += 1.
print('Test loss: ' + str(test_loss/s))

Test loss: 0.9083699782989956


Test over 1M - Test loss: 0.9083699782989956