In [None]:
!pip install --upgrade torch torchvision

Collecting torch
  Downloading torch-2.2.2-cp310-cp310-manylinux1_x86_64.whl (755.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m755.5/755.5 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting torchvision
  Downloading torchvision-0.17.2-cp310-cp310-manylinux1_x86_64.whl (6.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m66.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB

In [1]:
import numpy as np
import pandas as pd
import torch
import os
import json

In [2]:
from google.colab import drive

In [3]:
# Mount Google Drive under /content/gdrive so the dataset paths used in the following cells resolve.
drive.mount("/content/gdrive", force_remount=True)

Mounted at /content/gdrive


In [4]:
# List the dataset folder on the mounted Drive to check which files are available.
!ls "/content/gdrive/My Drive/msci720_prj/dataset/train-dataset/ml-implicit"

movies.csv  ratings.csv  training_set_user11k


In [5]:
# Load the ratings table of the 11k-user training split from the mounted Drive.
file_dir = '/content/gdrive/My Drive/msci720_prj/dataset/train-dataset/ml-implicit/training_set_user11k'
file = 'ratings.csv'
file_path = os.path.join(file_dir, file)

# Columns expected in the ratings file; verified right after loading so a wrong
# or truncated file fails here with a clear message instead of later in the model.
columns = ["userId", "movieId", "rating", "timestamp"]
ratings_train = pd.read_csv(file_path, sep=',')

missing_columns = [col for col in columns if col not in ratings_train.columns]
if missing_columns:
  raise ValueError(f"{file_path} is missing expected columns: {missing_columns}")

In [6]:
# Use the first CUDA device when one is visible, otherwise fall back to the CPU,
# and report which of the two was picked.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("GPU" if device.type == "cuda" else "CPU")

CPU


In [7]:
# Number of distinct users in the loaded ratings (a missing id, if any, counts as one value).
ratings_train["userId"].nunique(dropna=False)

10925

In [8]:
# Path of userIds.json in the project folder on Drive (presumably a list of user ids - confirm).
# NOTE: this rebinds file_dir / file / file_path from the ratings-loading cell; the next cell reads file_path.
file_dir = '/content/gdrive/My Drive/msci720_prj/'
file = 'userIds.json'
file_path = os.path.join(file_dir, file)

In [9]:
# Parse the JSON file into `userIds`; in this notebook it is only referenced by
# the commented-out user-trimming cells that follow.
with open(file_path, "r") as json_file:
    userIds = json.load(json_file)

In [10]:
# user_train_trimmed = pd.DataFrame({'userId': userIds})

In [11]:
# training_set = pd.merge(ratings_train, user_train_trimmed, on=["userId"], how="inner")

In [12]:
# Train on the full ratings table (the user-trimming merge is commented out above).
# This binds a second name to the same DataFrame; it is not a copy.
training_set = ratings_train
training_set

Unnamed: 0,userId,movieId,rating,timestamp
0,5,10,4.0,840768638
1,5,110,4.0,840768763
2,5,161,4.0,840764183
3,5,165,4.0,840764017
4,5,349,4.0,840764017
...,...,...,...,...
862942,200959,176371,4.5,1663748485
862943,200959,192365,5.0,1663748485
862944,200959,194474,5.0,1663748485
862945,200959,204698,4.0,1663748485


In [13]:
# Number of distinct users in the training set (a missing id, if any, counts as one value).
training_set["userId"].nunique(dropna=False)

10925

In [14]:
# User x movie table of interaction counts, capped at 1 so each cell is a 0/1 flag
# (1 = the user has at least one row for that movie in training_set).
item_binary_matrix = pd.crosstab(training_set["userId"], training_set["movieId"]).clip(upper=1)

In [15]:
# (number of distinct users, number of distinct movies)
item_binary_matrix.shape

(10925, 18221)

In [16]:
class EASE():
  """Closed-form item-item recommender (EASE-style) on a binary user x item matrix.

  `__init__` builds the binary interaction matrix and the id <-> position maps,
  `fit` solves the item-item weight matrix ``B`` in closed form, and `predict`
  scores items for a set of users and returns their top recommendations.

  Attributes:
    user, item: names of the user-id and item-id columns.
    lamda: value added to the diagonal of the item-item Gram matrix before inversion.
    user_index, item_index: names of the position columns in the two map frames.
    user_item: float32 tensor of shape (n_users, n_items) holding 0/1 interactions.
    user_idMapIndex: DataFrame mapping each user id to its row in `user_item`.
    item_idMapIndex: DataFrame mapping each item id to its column in `user_item`.
    item_idMapIndex_map: dict from column position to item id.
    B: item-item weight tensor (n_items, n_items); only available after `fit`.
  """

  def __init__(self, training_set, user="userId", item="movieId", lamda=500):
    """Build the binary interaction matrix and id maps from `training_set`.

    Args:
      training_set: DataFrame with one row per interaction, containing at
        least the `user` and `item` columns.
      user: name of the user-id column.
      item: name of the item-id column.
      lamda: diagonal regularisation strength used by `fit`.
    """
    self.user = user
    self.item = item
    self.lamda = lamda

    self.user_index = self.user + "_index"
    self.item_index = self.item + "_index"

    # Counts per (user, item) pair, capped at 1 -> binary matrix.
    user_item_binary_matrix = pd.crosstab(training_set[self.user], training_set[self.item]).clip(upper=1)
    self.user_item = torch.tensor(user_item_binary_matrix.to_numpy(), dtype=torch.float32)

    # Take the id -> position maps from the crosstab's own (sorted) index and
    # columns so they are aligned with the matrix rows/columns by construction.
    user_ids = user_item_binary_matrix.index.to_numpy()
    item_ids = user_item_binary_matrix.columns.to_numpy()
    self.user_idMapIndex = pd.DataFrame({self.user: user_ids, self.user_index: np.arange(len(user_ids))})
    self.item_idMapIndex = pd.DataFrame({self.item: item_ids, self.item_index: np.arange(len(item_ids))})

    self.item_idMapIndex_map = dict(enumerate(item_ids.tolist()))


  def fit(self):
    """Compute the item-item weight matrix and store it in `self.B`.

    With G = X^T X + lamda * I and P = G^-1, sets B[i, j] = -P[i, j] / P[j, j]
    and then forces the diagonal of B to zero.
    """
    G = self.user_item.t() @ self.user_item
    # Add lamda to the diagonal through a view instead of allocating a dense identity.
    G.diagonal().add_(self.lamda)
    P = torch.linalg.inv(G)
    # Broadcasting divides column j by -P[j, j].
    B = P / (-1 * P.diag())
    B.fill_diagonal_(0)
    self.B = B
    return

  def predict(self, user_predict, num_recs=100):
    """Return the top recommendations for the users listed in `user_predict`.

    Args:
      user_predict: DataFrame with a column named `self.user`. Users that do
        not appear in the training data are dropped by the inner merge.
      num_recs: recommendations per user; capped at the number of items.

    Returns:
      DataFrame with columns userId, Q0, itemId, ranking, rating, algo and
      one row per (user, recommended item), ranked from 1 within each user.
      Items the user already interacted with have 1000 subtracted from their
      score, so they only show up once the unseen items are exhausted.
    """
    rm_seen_factor = 1000
    user_predict = pd.merge(user_predict, self.user_idMapIndex, on=[self.user], how="inner")
    user_rows = torch.as_tensor(user_predict[self.user_index].to_numpy(), dtype=torch.long)
    filtered_user_predict = self.user_item[user_rows]

    prediction = filtered_user_predict @ self.B
    prediction_rm_seen = prediction - rm_seen_factor * filtered_user_predict

    # topk raises when k exceeds the number of items, so cap it.
    k = min(num_recs, prediction_rm_seen.shape[1])
    # One batched top-k over all users; flattening is row-major, i.e. user by user.
    top = prediction_rm_seen.topk(k, dim=1)
    _output_itemId = [self.item_idMapIndex_map[_index] for _index in top.indices.flatten().tolist()]
    _output_rating = top.values.flatten().tolist()

    # Read the ids from the configured user column (not a hard-coded "userId").
    _output_userId = np.repeat(user_predict[self.user].to_numpy(), k)
    n_rows = len(_output_userId)
    ranking = np.tile(np.arange(1, k + 1), len(user_predict))
    output = pd.DataFrame({
      'userId': _output_userId,
      "Q0": ["Q0"] * n_rows,
      'itemId': _output_itemId,
      'ranking': ranking,
      'rating': _output_rating,
      'algo': ["EASE"] * n_rows,
    })
    return output

In [23]:
# Build the model on the full training set with lamda=400 (the class default is 500).
model = EASE(training_set, lamda=400)

In [24]:
# Solve for the item-item weights; the result is stored on model.B (fit returns nothing).
model.fit()

In [25]:
# for prediction
file_dir = '/content/gdrive/My Drive/msci720_prj/'
file = 'userIds.json'
file_path = os.path.join(file_dir, file)

with open(file_path, "r") as json_file:
    userIds_predict = json.load(json_file)

In [26]:
# One-column frame of target users; the column name must match the model's user column ("userId").
user_predict = pd.DataFrame({'userId': userIds_predict})

In [27]:
# Recommendations per target user with the default num_recs=100; users absent
# from the training data are dropped inside predict (inner merge).
output = model.predict(user_predict)



In [28]:
# Name the run file after the lamda the model was actually built with, so the
# label in the filename cannot drift from the fitted configuration.
result = f"ease_{model.lamda}.results"
file_dir = "/content/gdrive/My Drive/msci720_prj/result/"
# Make sure the target folder exists so the write cannot fail after the expensive fit/predict.
os.makedirs(file_dir, exist_ok=True)
file_save_path = os.path.join(file_dir, result)

# Space-separated, no header and no index: one
# "userId Q0 itemId ranking rating algo" line per recommendation.
output.to_csv(file_save_path, sep=' ', header=False, index=False)
