In [1]:
# from google.colab import drive
# drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [2]:
import torch
import torch.nn as nn
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import time
import datetime
import math
from torch.autograd import Variable
import random

In [3]:
# Without softmax
class AutoEncoder(nn.Module):
  """Fully connected autoencoder: input -> 1024 -> 512 -> 512 -> 256 -> 512 -> 512 -> 1024 -> input.

  Every layer except the last is followed by ReLU. Dropout is applied after
  each of the three encoder hidden layers only; the output layer is linear.

  Args:
    dropOutProb: probability passed to ``nn.Dropout``.
    **kwargs: must contain ``input_shape``, the number of input (and output) features.
  """

  def __init__(self,dropOutProb = 0.4,**kwargs):
    super().__init__()
    n_features = kwargs["input_shape"]

    # Encoder: progressively narrower layers down to the 256-wide code.
    self.encoder_hidden_layer1 = nn.Linear(n_features,1024)
    self.encoder_hidden_layer2 = nn.Linear(1024,512)
    self.encoder_hidden_layer3 = nn.Linear(512,512)
    self.representation_layer = nn.Linear(512,256)

    # Decoder: mirror image of the encoder.
    self.decoder_hidden_layer1 = nn.Linear(256,512)
    self.decoder_hidden_layer2 = nn.Linear(512,512)
    self.decoder_hidden_layer3 = nn.Linear(512,1024)
    self.decoder_output_layer = nn.Linear(1024,n_features)

    self.Dropout = nn.Dropout(dropOutProb)

  def forward(self,features):
    """Encode ``features`` and return the reconstruction (same width as the input)."""
    relu = nn.functional.relu

    hidden = features
    encoder_layers = (
      self.encoder_hidden_layer1,
      self.encoder_hidden_layer2,
      self.encoder_hidden_layer3,
    )
    for layer in encoder_layers:
      hidden = self.Dropout(relu(layer(hidden)))

    hidden = relu(self.representation_layer(hidden))

    decoder_layers = (
      self.decoder_hidden_layer1,
      self.decoder_hidden_layer2,
      self.decoder_hidden_layer3,
    )
    for layer in decoder_layers:
      hidden = relu(layer(hidden))

    return self.decoder_output_layer(hidden)

In [4]:
# Directory holding the input files, and the preprocessed ratings table inside it.
data_path = 'Data/'
ratings_csv = data_path + 'Preprocessed_data.csv'
data = pd.read_csv(ratings_csv)

In [5]:
# Collect the distinct Movie_Id values present in the ratings table.
movies_list = data['Movie_Id'].unique()

789

In [6]:
# Flag movies that have fewer than 100 ratings so they can be removed.
# The counts are taken from `data` (the frame loaded above); the previous
# version read an undefined `df`, which fails on a fresh kernel.
# value_counts() tallies the whole column at once instead of doing a
# per-row .iloc lookup.
movies_counter = data['Movie_Id'].value_counts().to_dict()

del_movies = [movie for movie, count in movies_counter.items() if count < 100]

45

In [7]:
# Drop the flagged movies, then map each remaining movie ID to its position
# in the filtered list.
kept_movies = [movie for movie in movies_list if movie not in del_movies]
movies_list = {movie: position for position, movie in enumerate(kept_movies)}

In [23]:
# Build one rating vector per user: userData[cust_id][column] = rating,
# with 0 meaning "not rated". Columns follow the indices stored in movies_list.
num_movies = len(movies_list)
userData = {}
# Iterate the three columns directly; indexing data.iloc[i] builds a new
# Series for every access and is far slower over the full table.
for cust_id, movie_id, rating in zip(data['Cust_Id'], data['Movie_Id'], data['Rating']):
  user = int(cust_id)
  # Every user gets a row, even if none of their movies survived filtering.
  row = userData.get(user)
  if row is None:
    row = userData[user] = [0] * num_movies
  column = movies_list.get(movie_id)
  if column is not None:
    # movies_list values are already 0-based, so use them as-is. The earlier
    # "- 1" sent movie 0 to the last column and shifted all others by one.
    row[column] = int(rating)

len(userData)

393393

In [9]:
def format_time(elapsed):
  """Render a duration given in seconds as an ``h:mm:ss`` string.

  The value is rounded to the nearest whole second first; the text is whatever
  ``str(datetime.timedelta)`` produces for that many seconds.
  """
  whole_seconds = int(round(elapsed))
  duration = datetime.timedelta(seconds=whole_seconds)
  return str(duration)

In [10]:
# Train model
def trainModel(X_train,X_test,model,optimizer,lossFunc,weights,batch_size=64,num_epochs=10,lr=0.01,num_re_feeding=1):
  """Train ``model`` with optional output re-feeding and validate after every epoch.

  Args:
    X_train: 2-D tensor of training rows, consumed in consecutive slices of
      ``batch_size`` (rows are not shuffled).
    X_test: 2-D tensor of validation rows.
    model: module invoked as ``model(features=batch)``.
    optimizer: optimizer already constructed over ``model``'s parameters.
    lossFunc: callable ``(outputs, targets, weights) -> (elementwise_loss, num_ratings)``.
    weights: tensor with one row per training sample, sliced in step with ``X_train``.
    batch_size: number of rows per batch.
    num_epochs: number of passes over ``X_train``.
    lr: unused; kept so existing callers keep working. The learning rate is
      whatever ``optimizer`` was built with.
    num_re_feeding: extra update steps per batch in which the detached model
      output is fed back as both the input and the target.

  Returns:
    ``(model, loss_values)`` where ``loss_values[i]`` is the mean per-batch
    training loss of epoch ``i`` (re-feeding steps are not included).
  """
  # Run batches on the device the model lives on rather than relying on a
  # notebook-level global.
  first_param = next(model.parameters(), None)
  run_device = first_param.device if first_param is not None else X_train.device

  num_train_batches = math.ceil(X_train.shape[0]/batch_size)
  num_test_batches = math.ceil(X_test.shape[0]/batch_size)
  loss_values = []

  for epoch_i in range(0, num_epochs):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, num_epochs))
    print('Training...')
    t0 = time.time()
    total_loss = 0
    model.train(True)
    for step in range(num_train_batches):
      if step % 500 == 0 and not step == 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}   Elapsed: {:}.'.format(step, elapsed))

      start = step*batch_size
      b_inputs = X_train[start:start+batch_size].float().to(run_device)
      b_weights = weights[start:start+batch_size].float().to(run_device)

      optimizer.zero_grad()
      outputs = model(features = b_inputs)
      loss,num_ratings = lossFunc(outputs,b_inputs,b_weights)
      # Guard the normaliser: a batch with no observed ratings would give
      # 0/0 = NaN, and the resulting NaN gradients would corrupt the weights.
      loss = torch.sum(loss)/max(num_ratings,1)
      total_loss += loss.item()
      # Perform a backward pass to calculate the gradients.
      loss.backward()
      optimizer.step()

      for refeed_i in range(num_re_feeding):
        # Re-feeding: the previous prediction, cut from the graph, becomes
        # both the next input and its reconstruction target.
        b_inputs = outputs.detach()
        optimizer.zero_grad()
        outputs = model(features = b_inputs)
        loss,num_ratings = lossFunc(outputs,b_inputs,b_weights)
        loss = torch.sum(loss)/max(num_ratings,1)
        loss.backward()
        optimizer.step()

    # max(..., 1) keeps an empty training set from raising ZeroDivisionError.
    avg_train_loss = total_loss / max(num_train_batches,1)
    loss_values.append(avg_train_loss)
    print("")
    print("  Total MMSE: {0:.2f}".format(total_loss))
    print("  Average MMSE: {0:.2f}".format(avg_train_loss))
    print("  Average RMSE: {0:.2f}".format(math.sqrt(avg_train_loss)))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================
    print("")
    print("Running Validation...")
    t0 = time.time()
    model.eval()
    test_loss = 0
    for validationStep in range(num_test_batches):
      start = validationStep*batch_size
      b_inputs = X_test[start:start+batch_size].float().to(run_device)
      # Validation rows are all weighted equally.
      b_weights = torch.ones(b_inputs.size()[0],1).to(run_device)

      with torch.no_grad():
        outputs = model(features = b_inputs)
        loss,num_ratings = lossFunc(outputs,b_inputs,b_weights)
        loss = torch.sqrt(torch.sum(loss)/max(num_ratings,1))
      test_loss += loss.item()

    # The reported figure is the mean of the per-batch RMSE values.
    avg_test_loss = test_loss / max(num_test_batches,1)
    print("")
    print("  Average test RMSE: {0:.2f}".format(avg_test_loss))
    print("  Validation epoch took: {:}".format(format_time(time.time() - t0)))

  print("")
  print("Training complete!")
  return model, loss_values

In [11]:
# Get error for computing Adaboost weights
def getError(X_train,model,lossFunc,weights,batch_size=64):
  """Compute per-sample reconstruction errors, with and without sample weights.

  Args:
    X_train: 2-D tensor of input rows, processed in consecutive slices of
      ``batch_size``.
    model: module invoked as ``model(features=batch)``; it is put in eval mode.
    lossFunc: callable ``(outputs, targets, weights) -> (elementwise_loss, num_ratings)``
      where ``num_ratings`` supports ``.item()``.
    weights: tensor with one row per sample, sliced in step with ``X_train``.
    batch_size: number of rows per batch.

  Returns:
    ``(loss_values, weighted_loss, num_ratings)``:
      * ``loss_values``   - per row, sqrt of the summed loss with unit weights;
      * ``weighted_loss`` - per row, sqrt of the summed loss scaled by ``weights``;
      * ``num_ratings``   - sum over batches of the count reported by ``lossFunc``.
  """
  # Run batches on the device the model lives on rather than relying on a
  # notebook-level global.
  first_param = next(model.parameters(), None)
  run_device = first_param.device if first_param is not None else X_train.device

  loss_values = []
  weighted_loss = []
  model.eval()
  num_ratings = 0
  for step in range(math.ceil(X_train.shape[0]/batch_size)):
    if step % 500 == 0 and not step == 0:
      print('  Batch {:>5,}'.format(step))

    start = step*batch_size
    b_inputs = X_train[start:start+batch_size].float().to(run_device)
    b_weights = weights[start:start+batch_size].float().to(run_device)
    b_ones = torch.ones(b_inputs.size()[0],1).to(run_device)

    with torch.no_grad():
      outputs = model(features = b_inputs)
      weighted,ratings = lossFunc(outputs,b_inputs,b_weights)
      # The unweighted error must use unit weights. Passing b_weights here
      # (as before) made loss_values a duplicate of weighted_loss.
      loss,_ = lossFunc(outputs,b_inputs,b_ones)
    num_ratings += ratings.item()

    # Row-wise totals, then square roots taken in float64 on the host.
    weighted_rows = torch.sum(weighted,dim=1).cpu().numpy().astype(np.float64)
    weighted_loss.extend(np.sqrt(weighted_rows).tolist())
    loss_rows = torch.sum(loss,dim=1).cpu().numpy().astype(np.float64)
    loss_values.extend(np.sqrt(loss_rows).tolist())

  return loss_values,weighted_loss,num_ratings

In [12]:
# Custom loss function
def MMSE(y_pred,y,weights):
  """Masked, weighted squared error.

  Entries of ``y`` equal to 0 are treated as missing and contribute nothing.

  Returns:
    ``(elementwise_loss, num_ratings)`` - the un-reduced loss tensor and the
    number of non-zero entries in ``y`` (as a tensor).
  """
  observed = (y != 0).float()
  squared_error = torch.square(y_pred - y)
  rating_count = torch.sum(observed)
  return weights*observed*squared_error,rating_count

In [13]:
# Keep only the users who have at least 5 non-zero ratings.
samples = [
  ratings
  for ratings in userData.values()
  if sum(1 for value in ratings if value != 0) >= 5
]

len(samples)

222828

In [14]:
# Stack the retained per-user rating lists into one 2-D array (one row per user).
# Note: this rebinds `userData` from the dict built earlier to a NumPy array.
userData = np.array(samples)
userData.shape

(222828, 744)

In [15]:
# 80/20 split of the user rows. The matrix is passed once: the split indices
# depend only on the row count and random_state, so this yields the same
# X_train / X_test as passing it twice, without materialising two extra copies
# or overwriting the notebook's `_` variable.
X_train,X_test = train_test_split(userData,test_size=0.2,random_state=42)

X_train.shape,X_test.shape

((178262, 744), (44566, 744))

In [16]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)

cuda


In [None]:
# Without Adaboost
N = X_train.shape[0]
# Uniform per-sample weights: every training row counts equally.
weights = np.ones((N,1))
model = AutoEncoder(input_shape=X_train.shape[1])
# Place the model on the device chosen above. A hard-coded model.cuda() raises
# on machines without a GPU even though `device` falls back to the CPU.
model.to(device)
optimizer = torch.optim.SGD(model.parameters(),lr=0.001,momentum=0.9)

X_train = torch.tensor(X_train)
X_test = torch.tensor(X_test)
weights = torch.tensor(weights)
model,loss_values = trainModel(X_train,X_test,model,optimizer,MMSE,weights=weights,num_epochs=15)

In [18]:
# Take an independent copy: .numpy() on a CPU tensor returns a view of the
# tensor's storage, so masking ratings in test_samples would otherwise also
# overwrite X_test.
test_samples = X_test.detach().cpu().numpy().copy()

In [None]:
# Testing
# For every test user, hide k of their known ratings, reconstruct the row,
# and score the predictions for the hidden entries.
k = 2
# Seeded generator so the held-out ratings (and the reported RMSE) are
# reproducible across runs.
rng = random.Random(42)
model.eval()
results = []
for sample in test_samples:
  rated_columns = np.flatnonzero(sample > 0).tolist()
  held_out = rng.sample(rated_columns,k)
  actual = sample[held_out]

  # Mask a copy: zeroing `sample` in place would permanently erase ratings
  # from test_samples, so re-running this cell would give different results.
  masked = sample.copy()
  masked[held_out] = 0

  with torch.no_grad():
    output = model(torch.tensor(masked).float().to(device))
  predictions = output.cpu().numpy()[held_out]
  rmse = np.sqrt(np.mean((predictions-actual)**2))
  results.append(rmse)

In [None]:
# Mean of the per-user RMSE values collected in `results`.
np.mean(results)