# AutoEncoders

### Importing the Libraries

In [1]:
import numpy as np
import pandas as pd
import torch 
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable # for stochastic gradient decent

### Importing the dataset

In [2]:
# Using same dataset as boltzmann.ipynb

movies = pd.read_csv('data/rbm/ml-1m/movies.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1') # :: is the separator in the file
users = pd.read_csv('data/rbm/ml-1m/users.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1') # :: is the separator in the file
ratings = pd.read_csv('data/rbm/ml-1m/ratings.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')

### Create the training and test sets

In [3]:
training_set = pd.read_csv('data/rbm/ml-100k/u1.base', delimiter = '\t')
training_set = np.array(training_set, dtype = 'int')
test_set = pd.read_csv('data/rbm/ml-100k/u1.test', delimiter = '\t')
test_set = np.array(test_set, dtype = 'int')

### Getting the number of users and movies

In [4]:
# We need to get the largest user id and movie id so it works for training and testing for any of the datasets
nb_users = int(max(max(training_set[:,0]), max(test_set[:,0]))) 
nb_movies = int(max(max(training_set[:,1]), max(test_set[:,1])))

### Convert the data into an array with users in rows and movies in columns

In [5]:
# We will have a list of lists that has every user and every movie so they will always be the same size
# Torch expects a list of lists

def convert(data):
    new_data = []
    for id_user in range(1, nb_users + 1):
        id_movies = data[:,1][data[:,0] == id_user] # Get all movies
        id_ratings = data[:,2][data[:,0] == id_user] # Get all ratings
        ratings = np.zeros(nb_movies)
        ratings[id_movies - 1] = id_ratings
        new_data.append(list(ratings))
    return new_data

In [6]:
# Example final state
# User 1 - [0,1,1,2]
# User 2 - [1,1,0,2]
# User 3 - [1,0,0,2]

training_set_converted = convert(training_set)
test_set_converted = convert(test_set)

### Converting the data into Torch tensors

In [7]:
training_set_torch = torch.FloatTensor(training_set_converted) # Expects the list of lists
test_set_torch = torch.FloatTensor(test_set_converted)

### Creating the architecture of the Neural Network

In [33]:
# Stacked AutoEncoder class
class SAE(nn.Module): 
    def __init__(self):
        super(SAE, self).__init__() # Gets all the inherited classes from nn.Module
        self.fc1 = nn.Linear(nb_movies, 20) # First full connection
        self.fc2 = nn.Linear(20, 10) # 20 input and 10 in current layer
        self.fc3 = nn.Linear(10, 20) # final layer 10 in and 20 out
        self.fc4 = nn.Linear(20, nb_movies)
        self.activation = nn.Sigmoid()
        self.middle_activation = nn.ReLU()
    def forward(self, x): # x is input vector
        x = self.activation(self.fc1(x))
        x = self.activation(self.fc2(x))
        x = self.activation(self.fc3(x))
        x = self.fc4(x)
        return x
    

In [34]:
sae = SAE() # No need to incorporate arguments, they are all training
criterion  = nn.MSELoss()
optimizer = optim.RMSprop(sae.parameters(), lr=0.01, weight_decay=0.5) # LR and learning weighss



### Training the SAE

In [35]:
nb_epoch = 200
for epoch in range(1, nb_epoch + 1):
    train_loss = 0
    s = 0. # Counts the number of users that rated at least one movie (since RMSE needs a float, this is a float)
    for id_user in range(nb_users):
        input = Variable(training_set_torch[id_user]).unsqueeze(0)  # need to add a fake dimension since it needs more than a vector, this creates a batch of one input vector
        target = input.clone() # both start the same

        # Save memory
        if torch.sum(target.data > 0) > 0: # If so there is at least one observation greater than 0
            output = sae(input)
            target.require_grad = False # don't compute a gradient with respect to the target
            output[target == 0] = 0 # skip any movies not rated
            loss = criterion(output, target)
            mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10) # the last bit just makes sure the denominator is not null (small enough not biasing)
            loss.backward() # backward method
            train_loss += np.sqrt(loss.item()*mean_corrector) # getting the RMSE
            s += 1.
            optimizer.step()
    print('epoch: ' + str(epoch) + ' loss: ' + str(train_loss/s))
    

epoch: 1 loss: 1.7709465313485453
epoch: 2 loss: 1.096653913269114
epoch: 3 loss: 1.053416851111914
epoch: 4 loss: 1.0383075805967934
epoch: 5 loss: 1.0307350394471906
epoch: 6 loss: 1.0267263880409452
epoch: 7 loss: 1.0238254320756048
epoch: 8 loss: 1.0219327236564144
epoch: 9 loss: 1.0207599870567021
epoch: 10 loss: 1.0196084696846628
epoch: 11 loss: 1.0189396450635617
epoch: 12 loss: 1.018377518832397
epoch: 13 loss: 1.018229079021296
epoch: 14 loss: 1.0173168366011036
epoch: 15 loss: 1.0171252985512569
epoch: 16 loss: 1.016852010576024
epoch: 17 loss: 1.0166340912902576
epoch: 18 loss: 1.0166775080611203
epoch: 19 loss: 1.0164989148583898
epoch: 20 loss: 1.016070933329149
epoch: 21 loss: 1.0159080950827064
epoch: 22 loss: 1.0161957175959226
epoch: 23 loss: 1.0155785427527704
epoch: 24 loss: 1.0157936661687794
epoch: 25 loss: 1.0156699656704469
epoch: 26 loss: 1.0155927680500103
epoch: 27 loss: 1.0152816137363398
epoch: 28 loss: 1.0151378054854858
epoch: 29 loss: 1.0130887193310119


### Testing the Stacked AutoEncoder (SAE)

In [36]:
test_loss = 0
s = 0. 
for id_user in range(nb_users):
    input = Variable(training_set_torch[id_user]).unsqueeze(0) # Need this to still be training since we use a user's input to predict
    target = Variable(test_set_torch[id_user]).unsqueeze(0) # This is the "future" truth that is created once the user has watched the movies 
    
    if torch.sum(target.data > 0) > 0: 
        output = sae(input)
        target.require_grad = False 
        output[target == 0] = 0 
        loss = criterion(output, target) # This compares the input to output
        mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)             
        test_loss += np.sqrt(loss.item()*mean_corrector)
        s += 1.            
print('test loss: ' + str(test_loss/s))

test loss: 0.951293135435618


In [37]:
# test loss: 0.9584480307484321 this means it is off by just than less than 1 star
# Out of the box value