<a href="https://colab.research.google.com/github/finardi/tutos/blob/master/AutoEncoder_MovieLens.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

Mon Jan 11 22:01:42 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.27.04    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    24W / 300W |      0MiB / 16130MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import os
import numpy as np
import pandas as pd
import torch


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
manual_seed = 2357 # only primers ;)
 
def deterministic(rep=True):
    if rep:
        np.random.seed(manual_seed)
        torch.manual_seed(manual_seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(manual_seed)
            torch.cuda.manual_seed_all(manual_seed)
        torch.backends.cudnn.enabled = False 
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
        print(f'Experimento deterministico, seed: {manual_seed} -- ', end = '')
        print(f'Existe {torch.cuda.device_count()} GPU\
 {torch.cuda.get_device_name(0)} disponível.')
    else:
        print('Experimento randomico')
deterministic()

Experimento deterministico, seed: 2357 -- Existe 1 GPU Tesla V100-SXM2-16GB disponível.


# Read the data

In [3]:
!unzip /content/drive/MyDrive/Colab\ Notebooks/RecSys/movielens_datasets/ml100k/ml-100k.zip

Archive:  /content/drive/MyDrive/Colab Notebooks/RecSys/movielens_datasets/ml100k/ml-100k.zip
   creating: ml-100k/
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base         
  inflating: ml-100k/u3.test         
  inflating: ml-100k/u4.base         
  inflating: ml-100k/u4.test         
  inflating: ml-100k/u5.base         
  inflating: ml-100k/u5.test         
  inflating: ml-100k/ua.base         
  inflating: ml-100k/ua.test         
  inflating: ml-100k/ub.base         
  inflating: ml-100k/ub.test         


In [4]:
path_data_dir = '/content/drive/MyDrive/Colab Notebooks/RecSys/movielens_datasets/ml-1m'

movies = pd.read_csv(os.path.join(path_data_dir, 'movies.dat') , sep = '::', header = None, engine = 'python', encoding = 'latin-1')
users = pd.read_csv(os.path.join(path_data_dir, 'users.dat'), sep = '::', header = None, engine = 'python', encoding = 'latin-1')
ratings = pd.read_csv(os.path.join(path_data_dir, 'ratings.dat'), sep = '::', header = None, engine = 'python', encoding = 'latin-1')

In [5]:
movies.head()

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
users.head()

Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [7]:
ratings.head()

Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## Read in train and test sets¶


In [8]:
#  training_set: user ID, movie ID, rating, timestampes

training_set = pd.read_csv('ml-100k/u1.base', delimiter = '\t', header = None)
print(training_set.head(3))

   0  1  2          3
0  1  1  5  874965758
1  1  2  3  876893171
2  1  3  4  878542960


In [9]:
##convert it to array
training_set = np.array(training_set, dtype = 'int')
print(training_set.shape)

(80000, 4)


In [10]:
test_set = pd.read_csv('ml-100k/u1.test', delimiter = '\t', header = None)
##convert it to array
test_set = np.array(test_set, dtype = 'int')
print(test_set.shape)

(20000, 4)


# Data prep

In [11]:
#take max users id in train and test data
nb_users = int(max(max(training_set[:,0]), max(test_set[:,0])))
nb_movies = int(max(max(training_set[:,1]), max(test_set[:,1])))

nb_users, nb_movies 

(943, 1682)

In [12]:
def convert(data):
    new_data = []
    for id_users in range(1, nb_users + 1):
        ##id of movies that is rated by current users
        id_movies = data[:,1][data[:,0] == id_users]
        
        ##rate of movies that is given by current user
        id_ratings = data[:,2][data[:,0] == id_users]
        
        #inialize ratings for all movies
        #set 0 for movies that are not rated by current users
        ratings = np.zeros(nb_movies)
        #movie id starts from 1, 1st movie will be 1st element in rating with index as 0
        ratings[id_movies - 1] = id_ratings
        new_data.append(list(ratings))
    return new_data

In [13]:
training_set = convert(training_set)
test_set = convert(test_set)

#to tensor
training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set)

In [14]:
training_set

tensor([[5., 3., 4.,  ..., 0., 0., 0.],
        [4., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [5., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 5., 0.,  ..., 0., 0., 0.]])

# Model

In [15]:
class Net(torch.nn.Module):
    def __init__(self, ):
        super().__init__()
        
        #encoding
        self.fc1 = torch.nn.Linear(nb_movies, 20)
        self.fc2 = torch.nn.Linear(20, 10)
        
        #decoding
        self.fc3 = torch.nn.Linear(10, 20)
        self.fc4 = torch.nn.Linear(20, nb_movies)
        self.activation = torch.nn.Sigmoid()
    
    def forward(self, x):
        o = self.activation(self.fc1(x))
        o = self.activation(self.fc2(o))
        o = self.activation(self.fc3(o))
        return self.fc4(o)

In [16]:
net = Net().to(device)

criterion = torch.nn.MSELoss()

#create optimizer object
#parameters of all auto-encoders defined in the class
optimizer = torch.optim.RMSprop(net.parameters(), lr = 0.01, weight_decay = 0.5)

# Training

In [20]:
#loop all epochs
nb_epoch = 200
for epoch in range(1, nb_epoch + 1):
    train_loss = 0
    #exclude users who did not rate any movies
    #define a float
    s = 0.
    #loop through each users
    for id_user in range(nb_users):
        #get all rating for current user from training_set
        #nn does not take single dimension vector, so add a batch dimension
        #a batch of sinlge inptu vector, update weigths after each vector
        input = torch.autograd.Variable(training_set[id_user]).unsqueeze(0).to(device)
        #create target by copying input
        target = input.clone().to(device)
        #only look at users who rated at least 1 movie
        if torch.sum(target.data > 0) > 0:
            #get output from the network, a vector of predicted value
            output = net(input)
            #do not compute gradient with respect to target
            target.require_grad = False
            #don't account the output whose initial input is 0
            output[target == 0] = 0
            loss = criterion(output, target)
            #make demonitor is not zero, to add a small number
            mean_corrector = nb_movies / float(torch.sum(target.data>0) + 1e-10)
            #backward method to determine which direction 
            loss.backward()
            #access the data of loss object .data[0]
            #adjust the loss to compute relevant mean for all movies for current user
            train_loss += np.sqrt(loss.item() * mean_corrector)
            s += 1.
            #apply optimizer to update weights, decides the amount of weight udpates
            optimizer.step()
    if epoch % 10 == 0:
        print(f'Epoch: {epoch}: Train loss: {train_loss/s:.4}')

epoch: 10: Train loss: 0.9801
epoch: 20: Train loss: 0.9664
epoch: 30: Train loss: 0.9553
epoch: 40: Train loss: 0.9546
epoch: 50: Train loss: 0.9473
epoch: 60: Train loss: 0.9395
epoch: 70: Train loss: 0.9335
epoch: 80: Train loss: 0.9309
epoch: 90: Train loss: 0.9268
epoch: 100: Train loss: 0.9235
epoch: 110: Train loss: 0.9208
epoch: 120: Train loss: 0.9188
epoch: 130: Train loss: 0.9167
epoch: 140: Train loss: 0.9158
epoch: 150: Train loss: 0.9144
epoch: 160: Train loss: 0.9134
epoch: 170: Train loss: 0.9119
epoch: 180: Train loss: 0.9112
epoch: 190: Train loss: 0.9097
epoch: 200: Train loss: 0.909


# Evaluate

In [23]:
#loop through each users
test_loss, s = 0, 0.
for id_user in range(nb_users):
    #keep using training set
    input = torch.autograd.Variable(training_set[id_user]).unsqueeze(0).to(device)
    #create target by copying input
    target = torch.autograd.Variable(test_set[id_user]).unsqueeze(0).to(device)
    #only look at users who rated at least 1 movie
    if torch.sum(target.data > 0) > 0:
        #get output from the network, a vector of predicted value
        output = net(input)
        #do not compute gradient with respect to target
        target.require_grad = False
        #don't account the output whose initial input is 0
        output[target == 0] = 0
        loss = criterion(output, target)
        #make demonitor is not zero, to add a small number
        mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
        
        
        #access the data of loss object .data[0]
        #adjust the loss to compute relevant mean for all movies for current user
        test_loss += np.sqrt(loss.item() * mean_corrector)
        s += 1.
print(f'Test loss: {test_loss/s:.4}')

Test loss: 0.9523


# Print Predictions

In [29]:
user_id = 0
movie_title = movies.iloc[:nb_movies, 1:2]
user_rating = training_set.data.numpy()[user_id, :].reshape(-1,1)
user_target = test_set.data.numpy()[user_id, :].reshape(-1,1)

user_input = torch.autograd.Variable(training_set[user_id]).unsqueeze(0).to(device)
print(training_set[user_id])
predicted = net(user_input)
predicted = predicted.data.cpu().numpy().reshape(-1,1)
print(predicted)
result_array = np.hstack([movie_title, user_target, predicted])
result_array = result_array[result_array[:, 1] > 0]
result_df = pd.DataFrame(data=result_array, columns=['Movie', 'Target Rating', 'Predicted'])

tensor([5., 3., 4.,  ..., 0., 0., 0.])
[[3.8871922]
 [3.377602 ]
 [2.9045982]
 ...
 [2.0260649]
 [3.099457 ]
 [3.1787095]]


In [30]:
result_df.head()

Unnamed: 0,Movie,Target Rating,Predicted
0,Heat (1995),5,3.78763
1,GoldenEye (1995),3,3.86266
2,Dracula: Dead and Loving It (1995),5,4.46161
3,Nixon (1995),5,3.82889
4,Sense and Sensibility (1995),3,3.28658
