In [1]:
import time
import typing

import numpy as np
import pandas as pd
import torch
import scipy

import os
import json

In [2]:
from google.colab import drive

In [3]:
drive.mount("/content/gdrive", force_remount=True)

Mounted at /content/gdrive


In [4]:
file_dir = '/content/gdrive/My Drive/msci720_prj/dataset/train-dataset/ml-implicit/training_set_user11k'
file = 'ratings.csv'
file_path = os.path.join(file_dir, file)

columns = ["userId", "movieId", "rating", "timestamp"]
ratings_train = pd.read_csv(file_path, sep=',')

In [5]:
file_dir = '/content/gdrive/My Drive/msci720_prj/'
file = 'userIds.json'
file_path = os.path.join(file_dir, file)

In [6]:
with open(file_path, "r") as json_file:
    userIds = json.load(json_file)

In [7]:
training_set = ratings_train
training_set

Unnamed: 0,userId,movieId,rating,timestamp
0,5,10,4.0,840768638
1,5,110,4.0,840768763
2,5,161,4.0,840764183
3,5,165,4.0,840764017
4,5,349,4.0,840764017
...,...,...,...,...
862942,200959,176371,4.5,1663748485
862943,200959,192365,5.0,1663748485
862944,200959,194474,5.0,1663748485
862945,200959,204698,4.0,1663748485


In [8]:
user_item_binary_matrix = pd.crosstab(training_set["userId"], training_set["movieId"]).clip(upper=1)
user_item_binary_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,288979,289253,289727,289797,290213,290263,290383,290767,291419,291857
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200955,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
200956,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
200957,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
200958,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [9]:
user_item = torch.FloatTensor(user_item_binary_matrix.values)
user_item

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [29]:
class ELSA_vari(torch.nn.Module):
  def __init__(self, dim_user=10925, dim_latent1=500, dim_item=18221, dim_latent2=300, lr=0.1, weight_decay=1e-3):
    super(ELSA_vari, self).__init__()
    W1 = torch.nn.Parameter(torch.nn.init.xavier_uniform_(torch.empty([dim_user, dim_latent1])).detach().clone())
    W2 = torch.nn.Parameter(torch.nn.init.xavier_uniform_(torch.empty([dim_item, dim_latent2])).detach().clone())
    self.W_list = torch.nn.ParameterList([W1, W2])
    self.dim_user = dim_user
    self.dim_item = dim_item
    self.optimizer = torch.optim.NAdam(self.parameters(), lr=lr, weight_decay=weight_decay)
    # normalizaton is done in the training step
    self.criterion = torch.nn.MSELoss()

  def train_step(self, x, y):
    self.zero_grad()
    output = self(x)
    loss = self.criterion(torch.nn.functional.normalize(output, dim=-1), torch.nn.functional.normalize(y, dim=-1))
    loss.backward()
    self.optimizer.step()
    return loss, output

  def fit(self, train_data, batch_size=10925, shuffle=True, epochs=10):
    train_dataloader = torch.utils.data.dataloader.DataLoader(train_data, batch_size=batch_size, shuffle=shuffle)
    total_steps = len(train_dataloader)
    print("****************** START TRAINING ******************")
    print("Total steps {:n}".format(total_steps))

    losses = {"nmse_train": []}

    for epoch_index in range(1, epochs + 1):
      epoch_start = time.time()
      nmse_losses_per_epoch = []
      for step, io_batch in enumerate(train_dataloader, start=1):
        loss, predictions = self.train_step(io_batch, io_batch)
        nmse_losses_per_epoch.append(loss.item())
        # log_dict = {
        #   "Epoch": f"{epoch_index} / {epochs}",
        #   "Step": f"{step}/{total_steps}",
        #   "Loss(nmse)": round(np.mean(nmse_losses_per_epoch), 4),
        #   "Training time": f"{(time.time() - epoch_start):2f}s",
        # }
        # for key, val in log_dict.items():
        #   print(f"{key}: {val}\n")

      losses["nmse_train"].append(np.mean(nmse_losses_per_epoch))

      train_end = time.time()
      log_dict = {
        "Epoch": f"{epoch_index} / {epochs}",
        "Loss(nmse)": round(losses["nmse_train"][-1], 4),
        "Training time": f"{(train_end - epoch_start):2f}s",
      }

      for key, val in log_dict.items():
          print(f"{key}: {val}\n")

    print("****************** [END] ******************")
    return losses

  def forward(self, x):
    A = torch.nn.functional.normalize(self.W_list[0], dim=-1)
    B = torch.nn.functional.normalize(self.W_list[1], dim=-1)

    x = x.float()

    # print(A.shape)
    # print(B.shape)
    # print(x.shape)

    xB = x @ B
    xBBT = xB @ B.T
    Right = xBBT - x
    # print(Right.shape)

    ATRight = A.T @ Right
    Left = A @ ATRight
    # print(Left.shape)
    return Left - Right

  def predict(self, predict_data, batch_size=10925):
    shuffle = False
    predict_dataloader = torch.utils.data.dataloader.DataLoader(predict_data, batch_size=batch_size, shuffle=shuffle)
    prediction = torch.vstack(list(self.predict_generator(predict_dataloader)))

    rm_seen_factor = 10000000
    prediction_rm_seen = prediction - rm_seen_factor * predict_data
    return prediction_rm_seen

  def predict_generator(self, predict_dataloader):
    for input_batch in predict_dataloader:
        yield self.forward(input_batch).detach()


In [30]:
model = ELSA_vari()

In [31]:
model.fit(user_item)

****************** START TRAINING ******************
Total steps 1
Epoch: 1 / 10

Loss(nmse): 0.0001

Training time: 16.408335s

Epoch: 2 / 10

Loss(nmse): 0.0001

Training time: 16.481786s

Epoch: 3 / 10

Loss(nmse): 0.0001

Training time: 16.450382s

Epoch: 4 / 10

Loss(nmse): 0.0001

Training time: 15.632263s

Epoch: 5 / 10

Loss(nmse): 0.0001

Training time: 16.692401s

Epoch: 6 / 10

Loss(nmse): 0.0001

Training time: 16.278631s

Epoch: 7 / 10

Loss(nmse): 0.0001

Training time: 16.445610s

Epoch: 8 / 10

Loss(nmse): 0.0001

Training time: 15.546890s

Epoch: 9 / 10

Loss(nmse): 0.0001

Training time: 16.342177s

Epoch: 10 / 10

Loss(nmse): 0.0001

Training time: 16.182567s

****************** [END] ******************


{'nmse_train': [0.00010977873898809776,
  0.00010975539044011384,
  0.00010976797057082877,
  0.00010976184421451762,
  0.00010974241740768775,
  0.00010977005149470642,
  0.00010977259080391377,
  0.0001097617787308991,
  0.00010976139310514554,
  0.00010975325130857527]}

In [32]:
# for prediction
file_dir = '/content/gdrive/My Drive/msci720_prj/'
file = 'userIds.json'
file_path = os.path.join(file_dir, file)

with open(file_path, "r") as json_file:
    userIds_predict = json.load(json_file)

In [33]:
user_predict = pd.DataFrame({'userId': userIds_predict})
# user_item_binary_matrix_predict = user_item_binary_matrix[user_item_binary_matrix.index.isin(user_predict['userId'])]
# user_item_binary_matrix_predict

In [34]:
# predict all users in taining set, and then get the predictions for user_predict
user_item_predict = torch.FloatTensor(user_item)
predict = model.predict(user_item_predict)

In [35]:
predict

tensor([[-0.0829, -2.8386,  3.3417,  ...,  0.0691, -1.6546,  5.2116],
        [-0.8831,  0.7790, -2.8560,  ..., -4.9272,  3.3944, -5.2544],
        [ 1.6306,  0.9293, -1.4313,  ..., -0.9456, -2.1487,  1.1137],
        ...,
        [-0.8492, -0.8803,  2.8783,  ...,  2.2753, -0.8744, -4.2884],
        [ 2.8715, -5.3560,  0.5227,  ...,  0.6218,  0.4575,  2.1831],
        [ 0.0120, -2.5584, -3.2253,  ...,  7.6732,  1.7670,  1.4032]])

In [36]:
user = "userId"
item = "movieId"
user_index = user + "_index"
item_index = item + "_index"

# user_item_binary_matrix = pd.crosstab(training_set[user], training_set[item]).clip(upper=1)
# convert pd dataframe to np array
# data = user_item_binary_matrix.values
# user_item = torch.FloatTensor(data)
# user_item

In [37]:
user_idMapIndex = training_set[[user]].drop_duplicates()
user_idMapIndex[user_index] = user_idMapIndex.loc[:, user].astype("category").cat.codes

item_idMapIndex = training_set[[item]].drop_duplicates()
item_idMapIndex[item_index] = item_idMapIndex.loc[:, item].astype("category").cat.codes

training_set = pd.merge(training_set, user_idMapIndex, on=[user], how="inner")
training_set = pd.merge(training_set, item_idMapIndex, on=[item], how="inner")

item_idMapIndex_map = {}
for itemId, itemIndex in item_idMapIndex.values:
  item_idMapIndex_map[itemIndex] = itemId

In [38]:
num_recs = 100

_output_itemId = []
_output_rating = []
for _preds in predict:
  for _index in _preds.topk(num_recs).indices.tolist():
    _output_itemId.append(item_idMapIndex_map[_index])
  for _rating in _preds.topk(num_recs).values.tolist():
    _output_rating.append(_rating)

In [39]:
user_train = training_set[["userId"]].drop_duplicates()
user_train

Unnamed: 0,userId
0,5
1,130
2,1190
3,1734
4,1785
...,...
631490,123075
659341,130288
677296,65103
724006,180029


In [40]:
_output_userId = np.repeat(user_train.values, num_recs)
Q0 = ["Q0"] * len(_output_userId)
ranking = [i for _ in range(len(user_train)) for i in range(1, num_recs + 1)]
algo = ["elsa_vari"] * len(_output_userId)
output = pd.DataFrame({'userId': _output_userId, "Q0" : Q0, 'itemId': _output_itemId, 'ranking': ranking, 'rating': _output_rating, 'algo': algo})

In [41]:
output

Unnamed: 0,userId,Q0,itemId,ranking,rating,algo
0,5,Q0,1904,1,8.608417,elsa_vari
1,5,Q0,95583,2,8.332220,elsa_vari
2,5,Q0,106062,3,8.245851,elsa_vari
3,5,Q0,8974,4,8.161308,elsa_vari
4,5,Q0,1621,5,7.862520,elsa_vari
...,...,...,...,...,...,...
1092495,30124,Q0,4236,96,7.131208,elsa_vari
1092496,30124,Q0,6803,97,7.124311,elsa_vari
1092497,30124,Q0,2826,98,7.117578,elsa_vari
1092498,30124,Q0,87408,99,7.108712,elsa_vari


In [42]:
output_user_predict = pd.merge(output, user_predict, on=["userId"], how="inner")
output_user_predict

Unnamed: 0,userId,Q0,itemId,ranking,rating,algo
0,1190,Q0,993,1,8.700368,elsa_vari
1,1190,Q0,155288,2,7.433038,elsa_vari
2,1190,Q0,161956,3,7.164965,elsa_vari
3,1190,Q0,26770,4,7.092326,elsa_vari
4,1190,Q0,3206,5,7.072025,elsa_vari
...,...,...,...,...,...,...
101095,123075,Q0,191799,96,5.944958,elsa_vari
101096,123075,Q0,164606,97,5.929909,elsa_vari
101097,123075,Q0,141058,98,5.927854,elsa_vari
101098,123075,Q0,5745,99,5.925770,elsa_vari


In [44]:
result = "elsa_vari.results"
file_dir = "/content/gdrive/My Drive/msci720_prj/result/"
file_save_path = os.path.join(file_dir, result)

output_user_predict.to_csv(file_save_path, sep=' ', header=None, index=False)