In [1]:
## DO NOT ERASE THIS. IMPORTANT TO CORRECTLY IMPORT MODULES
import sys

# Put the project root on the import path so the `src.*` imports further down resolve.
# NOTE(review): this is a machine-specific absolute path; it must be adjusted
# whenever the notebook is run from a different checkout.
sys.path.append("/Users/kristina/Desktop/University/COURSE_WORK/Project/RecSys")
# Last expression of the cell: shows which Python interpreter the kernel is using.
sys.executable

'/Users/kristina/Desktop/University/COURSE_WORK/Project/recsysvenv/bin/python3.10'

In [26]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# from scipy.sparse import csr_matrix
# from scipy.sparse.linalg import svds
from surprise import SVDpp, Reader, Dataset
from surprise.model_selection import cross_validate

from typing import Union
from tqdm.notebook import tqdm

import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch

from src.metrics import reccomendation_report
from src.utils import surprise_predict
from src.utils import split_test_df
from src.utils import seed_everything
from src.utils import trainDataset

from src.models import NCF
from src.models import NeuMF

## Constants

In [3]:
# Folder the movies/ratings/users CSV files are read from (relative to the working directory).
DATA_FOLDER = "../../data/ml-1m/"
# Seed used for NumPy's global RNG and for the test/validation split.
RANDOM_STATE = 7

In [4]:
# Seed every RNG the notebook relies on, not only NumPy's:
#   - NumPy's global RNG drives the negative sampling below;
#   - PyTorch's global RNG drives weight initialisation and DataLoader shuffling,
#     so without it two runs of the training cells give different models.
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

## Data

In [5]:
# The three tables share one on-disk format (semicolon-separated, Latin-1, no header
# row); only the file name and the column names differ.
_CSV_FORMAT = {"encoding": "iso-8859-1", "sep": ";"}

df_movies = pd.read_csv(
    DATA_FOLDER + "movies.csv", names=["movieId", "name", "genre"], **_CSV_FORMAT
)
df_ratings = pd.read_csv(
    DATA_FOLDER + "ratings.csv",
    names=["userId", "movieId", "rating", "timestamp"],
    **_CSV_FORMAT,
)
df_users = pd.read_csv(
    DATA_FOLDER + "users.csv",
    names=["userId", "gender", "age", "occupation", "zip-code"],
    **_CSV_FORMAT,
)

In [6]:
## Encode userId, movieId
# Each encoder is fitted on its reference table (movies / users) and then reused for
# the ratings, so a raw id is mapped to the same integer label in every frame.
movie_encoder = LabelEncoder().fit(df_movies["movieId"])
user_encoder = LabelEncoder().fit(df_users["userId"])

df_movies["movieId"] = movie_encoder.transform(df_movies["movieId"])
df_users["userId"] = user_encoder.transform(df_users["userId"])

df_ratings["userId"] = user_encoder.transform(df_ratings["userId"])
df_ratings["movieId"] = movie_encoder.transform(df_ratings["movieId"])

### Train-test split
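Methodology: each user's last interaction (by timestamp) is held out; all earlier interactions form the training set. The held-out interactions are then split by user into test (80%) and validation (20%).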

In [7]:
# Recency rank of every interaction within its user: rank 1 is the user's latest
# timestamp; method="first" breaks timestamp ties by row order, so exactly one row
# per user receives rank 1.
timestamps_by_user = df_ratings[["userId", "timestamp"]].groupby(
    "userId", as_index=False
)["timestamp"]
df_ratings["rank"] = timestamps_by_user.rank(method="first", ascending=False)

In [8]:
# Leave-one-out: the latest interaction of every user (rank 1) is held out as a
# positive example (action=1); all other interactions are used for training.
is_latest = df_ratings["rank"] == 1
df_train = df_ratings.loc[~is_latest].reset_index(drop=True)
df_holdout = df_ratings.loc[is_latest].reset_index(drop=True).assign(action=1)

# The held-out rows are divided 80/20 into test and validation.
df_test, df_val = train_test_split(
    df_holdout, test_size=0.2, random_state=RANDOM_STATE
)

In [9]:
# Enrich the test/validation data with 100 random movies the user has not interacted with.
def enrich_with_negatives(df_holdout, df_train, n_items, n_negatives=100, desc=None):
    """Append randomly sampled negative items to every user's held-out interaction.

    Parameters
    ----------
    df_holdout : pd.DataFrame
        Held-out interactions; must contain ``userId``, ``movieId``, ``timestamp``,
        ``rating``, ``rank`` and ``action``.
    df_train : pd.DataFrame
        Training interactions (``userId``, ``movieId``). A user's training items are
        never sampled as negatives for that user.
    n_items : int
        Catalogue size; candidate item ids are ``0 .. n_items - 1``.
    n_negatives : int, default 100
        Number of negatives drawn per user, without replacement.
    desc : str, optional
        Progress-bar label.

    Returns
    -------
    pd.DataFrame
        The held-out rows followed by the sampled rows (``action == 0``), without the
        ``timestamp``, ``rating`` and ``rank`` columns.

    Raises
    ------
    ValueError
        From ``np.random.choice`` when a user has fewer than ``n_negatives`` unseen items.
    """
    all_items = np.arange(n_items)
    no_items = all_items[:0]  # empty array with the same dtype as the item ids

    # Group once up front instead of running a boolean scan over the whole frame
    # for every single user.
    train_items = {
        user: items.to_numpy() for user, items in df_train.groupby("userId")["movieId"]
    }
    holdout_items = {
        user: items.to_numpy()
        for user, items in df_holdout.groupby("userId")["movieId"]
    }

    users = df_holdout["userId"].unique()
    sampled = []
    for user in tqdm(users, desc=desc):
        watched_movies = np.append(
            holdout_items[user], train_items.get(user, no_items)
        )
        not_watched_movies = np.setdiff1d(all_items, watched_movies)
        # Same call (and call order) as the per-user loop it replaces, so a given
        # NumPy seed yields the same negatives.
        sampled.append(np.random.choice(not_watched_movies, n_negatives, replace=False))

    # Build the negatives frame once: growing a DataFrame with pd.concat inside the
    # loop copies all previous rows on every iteration (quadratic).
    df_negatives = pd.DataFrame(
        {
            "movieId": np.concatenate(sampled) if sampled else no_items,
            "userId": np.repeat(users, n_negatives),
            "action": 0,
        }
    )
    return pd.concat([df_holdout, df_negatives], ignore_index=True).drop(
        columns=["timestamp", "rating", "rank"]
    )


n_catalogue_items = df_movies["movieId"].max() + 1
df_test = enrich_with_negatives(
    df_test, df_train, n_catalogue_items, desc="Enriching test"
)
df_val = enrich_with_negatives(
    df_val, df_train, n_catalogue_items, desc="Enriching val"
)

Enriching test:   0%|          | 0/4832 [00:00<?, ?it/s]

Enriching val:   0%|          | 0/1208 [00:00<?, ?it/s]

In [13]:
# Training batches. `trainDataset` (src.utils) receives the training interactions and
# the number of distinct movies; the loader reshuffles the samples every epoch.
train_dataset = trainDataset(df_train, df_movies["movieId"].nunique())
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=2048)

### NCF

In [14]:
# Number of distinct users / movies, passed to the model constructor below.
num_users = df_users["userId"].nunique()
num_items = df_movies["movieId"].nunique()

In [22]:
# NCF model from src.models. NOTE(review): judging by the printed structure,
# mlp_layer_sizes lists the MLP layer widths, the first being the concatenated
# user+item embedding size — confirm against src.models.
ncf = NCF(num_users, num_items, mlp_layer_sizes=[16, 64, 32])
# Show the module structure as the cell output.
display(ncf)

NCF(
  (user_embedding): Embedding(6040, 8)
  (item_embedding): Embedding(3883, 8)
  (mlp): Sequential(
    (MLP_layer_1): Linear(in_features=16, out_features=64, bias=True)
    (Activation_layer_1): ReLU()
    (MLP_layer_2): Linear(in_features=64, out_features=32, bias=True)
    (Activation_layer_2): ReLU()
  )
  (final): Linear(in_features=32, out_features=1, bias=True)
)

In [23]:
# Adam over all NCF parameters.
optimizer = torch.optim.Adam(ncf.parameters(), lr=3e-4)
# BCELoss works on probabilities, so the model output must already lie in [0, 1].
criterion = nn.BCELoss()
n_epochs = 5
# Device used for the model and the training batches.
device = "cpu"

In [24]:
### Train NCF
ncf.to(device)
num_iterations = len(train_loader)  # batches per epoch; used to average the loss

for epoch in tqdm(range(n_epochs), desc="Epochs"):
    ncf.train()
    epoch_loss_sum = 0
    with tqdm(train_loader, unit="batch") as tepoch:
        for user_batch, movie_batch, label_batch in tepoch:
            # Clear gradients left over from the previous step before the new pass.
            optimizer.zero_grad()

            scores = ncf(user_batch.to(device), movie_batch.to(device))
            batch_loss = criterion(scores.flatten(), label_batch.to(device))
            batch_loss.backward()
            optimizer.step()

            batch_loss_value = batch_loss.item()
            epoch_loss_sum += batch_loss_value
            tepoch.set_postfix(loss=batch_loss_value)
    # Mean batch loss of the epoch that just finished.
    print("Epoch:", epoch)
    print("Train loss", round(epoch_loss_sum / num_iterations, 5))

Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 0
Train loss 0.46244


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 1
Train loss 0.3717


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 2
Train loss 0.35627


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 3
Train loss 0.35307


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 4
Train loss 0.35188


In [28]:
# Evaluation batches in df_test row order (shuffle=False), so that predictions can be
# written back to df_test positionally.
# The id columns are converted through NumPy first: torch.tensor() on a pandas Series
# handles it as a generic Python sequence, which is slow for a frame of this size and
# depends on the Series index.
test_user_ids = torch.tensor(df_test["userId"].to_numpy())
test_movie_ids = torch.tensor(df_test["movieId"].to_numpy())
test_loader = DataLoader(
    TensorDataset(test_user_ids, test_movie_ids),
    batch_size=4096,
    shuffle=False,
)

In [33]:
# Number of (user, movie) pairs that will be scored.
len(test_loader.dataset)

488032

In [36]:
# predict ncf
ncf.eval()
# Pre-allocated on the CPU; batch i fills the slice [i * batch_size, (i + 1) * batch_size).
total_preds = torch.zeros(len(test_loader.dataset))
batch_size = test_loader.batch_size
# Gradients are not needed for inference, so the whole loop runs under no_grad.
with torch.no_grad():
    for i, (userIds, movieIds) in enumerate(
        tqdm(test_loader, desc="Inference", unit="batch")
    ):
        # Move the inputs to the model's device, as the training loop does, and bring
        # the scores back to the CPU buffer; otherwise this cell breaks as soon as
        # `device` is anything other than "cpu".
        batch_preds = ncf(userIds.to(device), movieIds.to(device)).flatten()
        total_preds[i * batch_size : (i + 1) * batch_size] = batch_preds.cpu()

Inference:   0%|          | 0/120 [00:00<?, ?batch/s]

In [40]:
# Attach the scores to df_test; this relies on test_loader iterating df_test in row
# order (shuffle=False).
df_test["rating_pred"] = total_preds.numpy()
# split_test_df (src.utils) turns the scored frame into the prediction/target pair that
# reccomendation_report consumes — exact layout is not visible here.
pred, target = split_test_df(df_test)

In [41]:
# Ranking metrics at cut-off k=50.
reccomendation_report(pred, target, k=50)

{'Hit rate @ K': tensor(0.9112), 'NDCG @ K': tensor(0.3574)}

NCF scores better than the SVD++ baseline.

### NeuMF

In [43]:
# Number of distinct users / movies, passed to the NeuMF constructor below.
num_users = df_users["userId"].nunique()
num_items = df_movies["movieId"].nunique()

In [44]:
# NeuMF model from src.models. NOTE(review): judging by the printed structure, mf_dim is
# the size of the matrix-factorisation embeddings and mlp_layer_sizes lists the MLP
# layer widths — confirm against src.models.
neumf = NeuMF(num_users, num_items, mf_dim=32, mlp_layer_sizes=[16, 64, 32])
# Show the module structure as the cell output.
display(neumf)

NeuMF(
  (mf_user_embed): Embedding(6040, 32)
  (mf_item_embed): Embedding(3883, 32)
  (mlp_user_embed): Embedding(6040, 8)
  (mlp_item_embed): Embedding(3883, 8)
  (mlp): Sequential(
    (MLP_layer_1): Linear(in_features=16, out_features=64, bias=True)
    (Activation_layer_1): ReLU()
    (MLP_layer_2): Linear(in_features=64, out_features=32, bias=True)
    (Activation_layer_2): ReLU()
  )
  (final): Linear(in_features=64, out_features=1, bias=True)
)

In [46]:
# Fresh Adam optimizer bound to the NeuMF parameters (replaces the NCF optimizer).
optimizer = torch.optim.Adam(neumf.parameters(), lr=3e-4)
# BCELoss works on probabilities, so the model output must already lie in [0, 1].
criterion = nn.BCELoss()
n_epochs = 5
# Device used for the model and the training batches.
device = "cpu"

In [47]:
### Train NeuMF
neumf.to(device)
num_iterations = len(train_loader)  # batches per epoch; used to average the loss

for epoch in tqdm(range(n_epochs), desc="Epochs"):
    neumf.train()
    epoch_loss_sum = 0
    with tqdm(train_loader, unit="batch") as tepoch:
        for user_batch, movie_batch, label_batch in tepoch:
            # Clear gradients left over from the previous step before the new pass.
            optimizer.zero_grad()

            # sigmoid=True is passed so the output can be fed to BCELoss
            # (presumably it enables the final sigmoid — see src.models).
            scores = neumf(
                user_batch.to(device), movie_batch.to(device), sigmoid=True
            )
            batch_loss = criterion(scores.flatten(), label_batch.to(device))
            batch_loss.backward()
            optimizer.step()

            batch_loss_value = batch_loss.item()
            epoch_loss_sum += batch_loss_value
            tepoch.set_postfix(loss=batch_loss_value)
    # Mean batch loss of the epoch that just finished.
    print("Epoch:", epoch)
    print("Train loss", round(epoch_loss_sum / num_iterations, 5))

Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 0
Train loss 0.37588


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 1
Train loss 0.30779


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 2
Train loss 0.28551


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 3
Train loss 0.27419


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 4
Train loss 0.26642


In [48]:
# Evaluation batches in df_test row order (shuffle=False), so that predictions can be
# written back to df_test positionally.
# The id columns are converted through NumPy first: torch.tensor() on a pandas Series
# handles it as a generic Python sequence, which is slow for a frame of this size and
# depends on the Series index.
test_user_ids = torch.tensor(df_test["userId"].to_numpy())
test_movie_ids = torch.tensor(df_test["movieId"].to_numpy())
test_loader = DataLoader(
    TensorDataset(test_user_ids, test_movie_ids),
    batch_size=4096,
    shuffle=False,
)

In [52]:
# predict NeuMF
neumf.eval()
# Pre-allocated on the CPU; batch i fills the slice [i * batch_size, (i + 1) * batch_size).
total_preds = torch.zeros(len(test_loader.dataset))
batch_size = test_loader.batch_size
# Gradients are not needed for inference, so the whole loop runs under no_grad.
with torch.no_grad():
    for i, (userIds, movieIds) in enumerate(
        tqdm(test_loader, desc="Inference", unit="batch")
    ):
        # Move the inputs to the model's device, as the training loop does, and bring
        # the scores back to the CPU buffer; otherwise this cell breaks as soon as
        # `device` is anything other than "cpu".
        batch_preds = neumf(
            userIds.to(device), movieIds.to(device), sigmoid=True
        ).flatten()
        total_preds[i * batch_size : (i + 1) * batch_size] = batch_preds.cpu()

Inference:   0%|          | 0/120 [00:00<?, ?batch/s]

In [53]:
# Overwrite the score column with the NeuMF predictions; this relies on test_loader
# iterating df_test in row order (shuffle=False).
df_test["rating_pred"] = total_preds.numpy()
# split_test_df (src.utils) turns the scored frame into the prediction/target pair that
# reccomendation_report consumes — exact layout is not visible here.
pred, target = split_test_df(df_test)

In [55]:
# Ranking metrics at cut-off k=50.
reccomendation_report(pred, target, k=50)

{'Hit rate @ K': tensor(0.9671), 'NDCG @ K': tensor(0.4366)}

Great result: NeuMF improves on NCF on both metrics (Hit rate @ 50: 0.9112 → 0.9671, NDCG @ 50: 0.3574 → 0.4366).