In [1]:
import time
import typing

import numpy as np
import pandas as pd
import torch
import scipy

import os
import json

In [2]:
from google.colab import drive

In [3]:
# Mount Google Drive under /content/gdrive; the dataset and result paths used
# later in this notebook all live below that mount point.
drive.mount("/content/gdrive", force_remount=True)

Mounted at /content/gdrive


In [4]:
# Location of the training ratings CSV on the mounted Drive.
file_dir = '/content/gdrive/My Drive/msci720_prj/dataset/train-dataset/ml-implicit/training_set_user11k'
file = 'ratings.csv'
file_path = os.path.join(file_dir, file)

# NOTE(review): `columns` is not passed to read_csv and is not referenced by any
# later cell shown here; the frame's column names come from the CSV itself.
columns = ["userId", "movieId", "rating", "timestamp"]
ratings_train = pd.read_csv(file_path, sep=',')

In [5]:
# Re-point file_dir / file / file_path at the JSON file of user ids.
# These names are reused from the ratings cell above, so file_path no longer
# refers to ratings.csv after this cell runs.
file_dir = '/content/gdrive/My Drive/msci720_prj/'
file = 'userIds.json'
file_path = os.path.join(file_dir, file)

In [6]:
# Parse the JSON file at file_path into `userIds`.
with open(file_path, "r") as json_file:
    userIds = json.load(json_file)

In [7]:
# `training_set` is a second name for the same DataFrame object as
# `ratings_train` (no copy is made here).
training_set = ratings_train
training_set

Unnamed: 0,userId,movieId,rating,timestamp
0,5,10,4.0,840768638
1,5,110,4.0,840768763
2,5,161,4.0,840764183
3,5,165,4.0,840764017
4,5,349,4.0,840764017
...,...,...,...,...
862942,200959,176371,4.5,1663748485
862943,200959,192365,5.0,1663748485
862944,200959,194474,5.0,1663748485
862945,200959,204698,4.0,1663748485


In [8]:
# Count the (userId, movieId) co-occurrences and cap every count at 1, giving a
# user x item matrix of 0/1 interaction flags (rows: userId, columns: movieId).
user_item_binary_matrix = pd.crosstab(training_set["userId"], training_set["movieId"]).clip(upper=1)
user_item_binary_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,288979,289253,289727,289797,290213,290263,290383,290767,291419,291857
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200955,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
200956,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
200957,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
200958,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [9]:
# Number of rows of the binary matrix, i.e. distinct userId values in training_set.
user_item_binary_matrix.shape[0]

10925

In [10]:
# Number of columns of the binary matrix, i.e. distinct movieId values in training_set.
user_item_binary_matrix.shape[1]

18221

In [11]:
# ndarray
# data = user_item_binary_matrix.values
# data

In [12]:
# Dense float32 copy of the 0/1 interaction matrix, used as the training data.
user_item = torch.tensor(user_item_binary_matrix.values, dtype=torch.float32)
user_item

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [None]:
# train_dataloader = torch.utils.data.dataloader.DataLoader(user_item, batch_size=128, shuffle=True)

In [None]:
# len(train_dataloader)

8

In [13]:
class ELSA(torch.nn.Module):
  """Linear autoencoder over item-interaction vectors with one weight matrix.

  The model owns a single ``(dim_item, dim_latent)`` parameter. ``forward``
  L2-normalizes each row of it to obtain ``A`` and returns ``x @ A @ A.T - x``.
  Training minimizes the MSE between the row-normalized output and the
  row-normalized target using a NAdam optimizer owned by the model itself.
  """

  def __init__(self, dim_item, dim_latent, lr=0.1, weight_decay=1e-8):
    """Create the weight matrix, the optimizer and the loss.

    Args:
      dim_item: number of items (width of the input vectors).
      dim_latent: number of latent dimensions per item.
      lr: learning rate passed to NAdam.
      weight_decay: weight decay passed to NAdam.
    """
    super().__init__()
    W = torch.nn.Parameter(torch.nn.init.xavier_uniform_(torch.empty([dim_item, dim_latent])).detach().clone())
    self.W_list = torch.nn.ParameterList([W])
    self.dim_item = dim_item
    self.optimizer = torch.optim.NAdam(self.parameters(), lr=lr, weight_decay=weight_decay)
    # Normalization of output and target is done in train_step, not in the loss.
    self.criterion = torch.nn.MSELoss()

  def train_step(self, x, y):
    """Run one optimizer step on the batch ``x`` with target ``y``.

    Returns:
      ``(loss, output)`` where ``loss`` is the MSE between the row-normalized
      model output and the row-normalized target, and ``output`` is the raw
      (un-normalized) model output for ``x``.
    """
    self.zero_grad()
    output = self(x)
    loss = self.criterion(
      torch.nn.functional.normalize(output, dim=-1),
      torch.nn.functional.normalize(y, dim=-1),
    )
    loss.backward()
    self.optimizer.step()
    return loss, output

  def fit(self, train_data, batch_size, shuffle=True, epochs=20):
    """Train on ``train_data`` using each batch as both input and target.

    Args:
      train_data: dataset accepted by ``torch.utils.data.DataLoader``.
      batch_size: number of rows per training step.
      shuffle: whether the loader reshuffles the rows every epoch.
      epochs: number of passes over ``train_data``.

    Returns:
      ``{"nmse_train": [...]}`` with the mean step loss of every epoch.
    """
    train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=shuffle)
    total_steps = len(train_dataloader)
    print("****************** START TRAINING ******************")
    print("Total steps {:n}".format(total_steps))

    losses = {"nmse_train": []}

    for epoch_index in range(1, epochs + 1):
      epoch_start = time.time()
      nmse_losses_per_epoch = []
      for io_batch in train_dataloader:
        # Autoencoder objective: the batch is its own target.
        loss, _ = self.train_step(io_batch, io_batch)
        nmse_losses_per_epoch.append(loss.item())

      losses["nmse_train"].append(np.mean(nmse_losses_per_epoch))

      train_end = time.time()
      log_dict = {
        "Epoch": f"{epoch_index} / {epochs}",
        "Loss(nmse)": round(losses["nmse_train"][-1], 4),
        # Two decimals; the previous spec ":2f" meant "minimum width 2".
        "Training time": f"{(train_end - epoch_start):.2f}s",
      }

      for key, val in log_dict.items():
        print(f"{key}: {val}\n")

    print("****************** [END] ******************")
    return losses

  def forward(self, x):
    """Return ``x @ A @ A.T - x`` where ``A`` is the row-normalized weight matrix."""
    weights = torch.vstack([param for param in self.W_list])
    A = torch.nn.functional.normalize(weights, dim=-1)

    x = x.float()
    xA = x @ A
    xAAT = xA @ A.T
    return xAAT - x

  def predict(self, predict_data, batch_size=128, rm_seen_factor=1000):
    """Score every item for every row of ``predict_data``.

    Args:
      predict_data: tensor of interaction vectors; it is batched through a
        DataLoader and also used directly to penalize already-seen items.
      batch_size: number of rows scored per forward pass.
      rm_seen_factor: value multiplied by ``predict_data`` and subtracted from
        the scores, so entries that are 1 in the input are pushed down.

    Returns:
      Tensor with the same shape as ``predict_data``; it carries no autograd
      history.
    """
    predict_dataloader = torch.utils.data.DataLoader(predict_data, batch_size=batch_size, shuffle=False)
    prediction = torch.vstack(list(self.predict_generator(predict_dataloader)))
    return prediction - rm_seen_factor * predict_data

  def predict_generator(self, predict_dataloader):
    """Yield the model output for each batch without recording gradients."""
    for input_batch in predict_dataloader:
      # no_grad is closed before yielding so that the caller's grad mode is
      # never affected while the generator is suspended.
      with torch.no_grad():
        batch_scores = self.forward(input_batch)
      yield batch_scores


In [35]:
# One input dimension per movieId column of the binary matrix; 300 latent dimensions.
# NOTE(review): no random seed is set before this, so the Xavier initialization
# (and therefore the trained model) differs from run to run.
model = ELSA(dim_item=user_item_binary_matrix.shape[1], dim_latent=300)

In [43]:
# Train for 25 epochs in batches of 128 users; the returned per-epoch loss dict
# is the cell's displayed value.
model.fit(user_item, batch_size=128, epochs=25)

****************** START TRAINING ******************
Total steps 86
Epoch: 1 / 25

Loss(nmse): 0.0001

Training time: 36.286956s

Epoch: 2 / 25

Loss(nmse): 0.0001

Training time: 35.417258s

Epoch: 3 / 25

Loss(nmse): 0.0001

Training time: 35.921731s

Epoch: 4 / 25

Loss(nmse): 0.0001

Training time: 35.430151s

Epoch: 5 / 25

Loss(nmse): 0.0001

Training time: 35.628816s

Epoch: 6 / 25

Loss(nmse): 0.0001

Training time: 35.641565s

Epoch: 7 / 25

Loss(nmse): 0.0001

Training time: 34.891833s

Epoch: 8 / 25

Loss(nmse): 0.0001

Training time: 35.514000s

Epoch: 9 / 25

Loss(nmse): 0.0001

Training time: 35.265253s

Epoch: 10 / 25

Loss(nmse): 0.0001

Training time: 35.312531s

Epoch: 11 / 25

Loss(nmse): 0.0001

Training time: 35.667077s

Epoch: 12 / 25

Loss(nmse): 0.0001

Training time: 35.130312s

Epoch: 13 / 25

Loss(nmse): 0.0001

Training time: 35.658239s

Epoch: 14 / 25

Loss(nmse): 0.0001

Training time: 35.265142s

Epoch: 15 / 25

Loss(nmse): 0.0001

Training time: 35.50799

{'nmse_train': [8.224331590057308e-05,
  8.228595842685832e-05,
  8.227033821492304e-05,
  8.214047353925876e-05,
  8.207756273318516e-05,
  8.194018089345136e-05,
  8.195828931368238e-05,
  8.183471808754189e-05,
  8.20576878627329e-05,
  8.183194163287882e-05,
  8.180776244126867e-05,
  8.187104609786793e-05,
  8.171986633516458e-05,
  8.172899825419949e-05,
  8.138414358295132e-05,
  8.146144318743609e-05,
  8.166881381712724e-05,
  8.149497477651769e-05,
  8.15390121715609e-05,
  8.142514969560665e-05,
  8.12920844446337e-05,
  8.137212852340971e-05,
  8.127616989483168e-05,
  8.122911602312828e-05,
  8.095671779132275e-05]}

In [44]:
# for prediction
# Load the user ids to generate recommendations for. This reads the same
# userIds.json path that was loaded into `userIds` earlier in the notebook.
file_dir = '/content/gdrive/My Drive/msci720_prj/'
file = 'userIds.json'
file_path = os.path.join(file_dir, file)

with open(file_path, "r") as json_file:
    userIds_predict = json.load(json_file)

In [29]:
# user_predict = pd.DataFrame({'userId': userIds_predict})

In [18]:
# user_predict = pd.merge(ratings_train, user_predict, on=["userId"], how="inner")
# item_binary_matrix_predict = pd.crosstab(user_predict["userId"], user_predict["movieId"]).clip(upper=1)
# user_item_predict = torch.FloatTensor(item_binary_matrix_predict.values)
# predict = model.predict(user_item_predict)

In [45]:
# One-column frame of the user ids to predict for.
user_predict = pd.DataFrame({'userId': userIds_predict})
# Keep the rows of the binary matrix whose userId is in that list; rows stay in
# the matrix's own index order.
user_item_binary_matrix_predict = user_item_binary_matrix.loc[
    user_item_binary_matrix.index.isin(user_predict['userId'])
]
user_item_binary_matrix_predict

movieId,1,2,3,4,5,6,7,8,9,10,...,288979,289253,289727,289797,290213,290263,290383,290767,291419,291857
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
269,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
516,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
776,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
864,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1190,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200955,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
200956,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
200957,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
200958,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [46]:
# Float32 tensor of the selected users' interaction rows, then one score per
# (user, item) with already-seen items pushed down by the model's predict().
user_item_predict = torch.tensor(user_item_binary_matrix_predict.values, dtype=torch.float32)
predict = model.predict(user_item_predict)

In [47]:
# Column names of the raw ids and of their derived 0-based index columns.
user, item = "userId", "movieId"
user_index = f"{user}_index"
item_index = f"{item}_index"

# user_item_binary_matrix = pd.crosstab(training_set[user], training_set[item]).clip(upper=1)
# convert pd dataframe to np array
# data = user_item_binary_matrix.values
# user_item = torch.FloatTensor(data)
# user_item

In [22]:
# Drop index columns left behind by a previous run of this cell. Without this,
# re-running the cell merges a second copy of them and pandas renames the
# clashes to `*_index_x` / `*_index_y`.
training_set = training_set.drop(columns=[user_index, item_index], errors="ignore")

# Distinct user ids with a 0-based code each. pandas orders the categories of a
# numeric column ascending, so code i belongs to the i-th smallest id.
user_idMapIndex = training_set[[user]].drop_duplicates()
user_idMapIndex[user_index] = user_idMapIndex[user].astype("category").cat.codes

# Same for item ids.
item_idMapIndex = training_set[[item]].drop_duplicates()
item_idMapIndex[item_index] = item_idMapIndex[item].astype("category").cat.codes

# Attach both index columns to every rating row.
training_set = pd.merge(training_set, user_idMapIndex, on=[user], how="inner")
training_set = pd.merge(training_set, item_idMapIndex, on=[item], how="inner")

# Reverse lookup used when decoding predictions: item index -> original item id.
item_idMapIndex_map = dict(
    zip(item_idMapIndex[item_index].tolist(), item_idMapIndex[item].tolist())
)

In [48]:
num_recs = 100

# Top-`num_recs` scores and their column positions for every scored user, from a
# single batched call (each row of `predict` is one user). Flattening is
# row-major, so the lists hold user 0's items in rank order, then user 1's, ...
_topk = predict.topk(num_recs, dim=1)
_output_itemId = [item_idMapIndex_map[_index] for _index in _topk.indices.flatten().tolist()]
_output_rating = _topk.values.flatten().tolist()

# Label the rows with the ids of the users that were actually scored, in the row
# order of the matrix fed to the model. Taking them from the JSON list instead
# would mislabel rows whenever that list is unsorted or contains duplicates, or
# contains ids that are absent from the training matrix.
_scored_userIds = user_item_binary_matrix_predict.index.to_numpy()
assert len(_scored_userIds) == predict.shape[0], "expected one prediction row per scored user"

_output_userId = np.repeat(_scored_userIds, num_recs)
Q0 = ["Q0"] * len(_output_userId)
ranking = list(range(1, num_recs + 1)) * len(_scored_userIds)
algo = ["elsa"] * len(_output_userId)
output = pd.DataFrame({'userId': _output_userId, "Q0" : Q0, 'itemId': _output_itemId, 'ranking': ranking, 'rating': _output_rating, 'algo': algo})

In [49]:
# Display the recommendation table: `num_recs` ranked rows per user.
output

Unnamed: 0,userId,Q0,itemId,ranking,rating,algo
0,269,Q0,1291,1,7.909534,elsa
1,269,Q0,1270,2,7.842189,elsa
2,269,Q0,1196,3,7.789792,elsa
3,269,Q0,1210,4,7.753688,elsa
4,269,Q0,1198,5,7.730851,elsa
...,...,...,...,...,...,...
101095,200959,Q0,51255,96,19.636681,elsa
101096,200959,Q0,6378,97,19.605192,elsa
101097,200959,Q0,5445,98,19.578651,elsa
101098,200959,Q0,260,99,19.564156,elsa


In [50]:
result = "elsa.results"
file_dir = "/content/gdrive/My Drive/msci720_prj/result/"
file_save_path = os.path.join(file_dir, result)

# to_csv does not create missing directories; make sure the target exists.
os.makedirs(file_dir, exist_ok=True)

# Space-separated, no header row and no index column:
# userId Q0 itemId ranking rating algo
output.to_csv(file_save_path, sep=' ', header=False, index=False)