In [1]:
## DO NOT ERASE THIS. IMPORTANT TO CORRECTLY IMPORT MODULES
import sys

# Put the parent directory on the import path so the `src.*` imports in the next cell resolve.
sys.path.append("../")
# Last expression of the cell: displays the path of the interpreter the kernel is running.
sys.executable

'/Users/kristina/Desktop/University/COURSE_WORK/RecSys_thesis/recsysvenv/bin/python3.12'

In [38]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm

import torch
from torch.utils.data import DataLoader, TensorDataset

from src.utils import (
    create_test_user,
    train_test_val_split,
    seed_everything,
    trainDataset,
    split_test_df,
)
from src.models import NCF, NeuMF
from src.metrics import reccomendation_report

## Constants

In [3]:
# Folder (relative to this notebook) that holds movies.csv, ratings.csv and users.csv.
DATA_FOLDER = "../../data/ml-1m/"
# Seed handed to seed_everything() and train_test_val_split() throughout the notebook.
RANDOM_STATE = 7

In [4]:
# Seed once up front. NOTE(review): seed_everything comes from src.utils; which RNGs it covers is not visible here — confirm.
seed_everything(RANDOM_STATE)

## Data

In [5]:
def _read_table(file_name, columns):
    """Read one ';'-separated, ISO-8859-1 encoded CSV from DATA_FOLDER.

    Column names are supplied explicitly through ``names`` because the files
    are read without taking a header row from the data.
    """
    return pd.read_csv(
        DATA_FOLDER + file_name,
        encoding="iso-8859-1",
        sep=";",
        names=columns,
    )


# The three raw tables share the same format and differ only in file name and columns.
df_movies = _read_table("movies.csv", ["movieId", "name", "genre"])
df_ratings = _read_table("ratings.csv", ["userId", "movieId", "rating", "timestamp"])
df_users = _read_table(
    "users.csv", ["userId", "gender", "age", "occupation", "zip-code"]
)

In [6]:
## Encode userId, movieId
# One encoder per id space; each is fitted on the table that lists every id
# (df_movies / df_users) and maps the raw ids to 0..n-1.
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

df_movies["movieId"] = movie_encoder.fit_transform(df_movies["movieId"])
df_users["userId"] = user_encoder.fit_transform(df_users["userId"])

# Ratings reuse the already-fitted encoders so their ids line up with
# df_movies / df_users; transform() fails on an id the encoder has not seen.
df_ratings["movieId"] = movie_encoder.transform(df_ratings["movieId"])
df_ratings["userId"] = user_encoder.transform(df_ratings["userId"])

### Creating a test_user

In [7]:
# Add a hand-made user whose watch list is the given (already encoded) movie ids.
# create_test_user returns the updated users/ratings frames plus the new user's id.
# NOTE(review): the cell overwrites its own inputs (df_users, df_ratings), so
# re-running it presumably adds yet another user — confirm before re-executing.
df_users, df_ratings, new_user_id = create_test_user(
    df_users, df_ratings, [6, 16, 3192, 1461, 827, 887, 593]
)
print("Test user watch list:")
# Join the new user's ratings with df_movies to show titles and genres.
display(df_ratings[df_ratings.userId == new_user_id].merge(df_movies, on="movieId"))

Test user watch list:


Unnamed: 0,userId,movieId,rating,timestamp,name,genre
0,6040,6,5,0,Sabrina (1995),Comedy|Romance
1,6040,16,5,1,Sense and Sensibility (1995),Drama|Romance
2,6040,3192,5,2,Singles (1992),Comedy|Drama|Romance
3,6040,1461,5,3,Love and Other Catastrophes (1996),Romance
4,6040,827,5,4,Emma (1996),Comedy|Drama|Romance
5,6040,887,5,5,Singin' in the Rain (1952),Musical|Romance
6,6040,593,5,6,Pretty Woman (1990),Comedy|Romance


### Train-test split
Methodology: each user's last interaction is held out as the test item; the rest is used for training. The validation part is 10% of test.

In [8]:
# Produce the train / test / validation frames; the split logic itself lives in src.utils.train_test_val_split.
df_train, df_test, df_val = train_test_val_split(df_ratings, df_movies, RANDOM_STATE)

Enriching test:   0%|          | 0/4832 [00:00<?, ?it/s]

Enriching val:   0%|          | 0/1209 [00:00<?, ?it/s]

## Loading additional data

In [11]:
# Load the precomputed tensor that is later passed to reccomendation_report as `cos_dist`.
# NOTE(review): torch.load may unpickle arbitrary objects depending on its
# `weights_only` setting — only load files from a trusted source.
with open("../../data/cos_dist.pt", "rb") as f:
    cos_dist = torch.load(f)
# Count how often each movie id occurs in the training interactions; minlength
# pads the result so there is one entry per movie id in df_movies.
popularity = torch.bincount(
    torch.tensor(df_train.movieId.values), minlength=df_movies["movieId"].nunique()
)
# Normalise by the largest count so the most frequent movie gets popularity 1.0.
popularity = popularity / torch.max(popularity)

### NCF

In [51]:
# Number of distinct users / movies; also passed to the model constructors later.
num_users = df_users["userId"].nunique()
num_items = df_movies["movieId"].nunique()

# Seed before the training dataset is built.
seed_everything(RANDOM_STATE)

# Training batches are reshuffled on every pass over the loader.
train_dataset = trainDataset(df_train, num_items)
train_loader = DataLoader(train_dataset, batch_size=2048, shuffle=True)

# Validation rows keep the order of df_val so predictions can be written back by position.
val_users = torch.tensor(df_val["userId"])
val_movies = torch.tensor(df_val["movieId"])
val_loader = DataLoader(
    TensorDataset(val_users, val_movies), batch_size=4096, shuffle=False
)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 994175/994175 [00:27<00:00, 35528.49it/s]


In [61]:
# Re-seed right before construction — presumably so the weight initialisation
# does not depend on what was executed earlier in the session.
seed_everything(RANDOM_STATE)
# Five MLP blocks of width 64; per the printed module each block is
# Linear -> ReLU -> Dropout(p=0.6) -> BatchNorm1d.
ncf = NCF(
    num_users,
    num_items,
    n_mlp_layers=5,
    mlp_layers_dim=64,
    mlp_kwargs={
        "activation": True,
        "dropout": True,
        "batchnorm": True,
        "dropout_rate": 0.6,
    },
)
# Print the module tree as the cell output.
display(ncf)

NCF(
  (user_embedding): Embedding(6041, 32)
  (item_embedding): Embedding(3883, 32)
  (mlp): Sequential(
    (MLP_layer_1): MLP(
      (block): Sequential(
        (Linear): Linear(in_features=64, out_features=64, bias=True)
        (Activation): ReLU()
        (Dropout): Dropout(p=0.6, inplace=False)
        (BatchNorm): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (MLP_layer_2): MLP(
      (block): Sequential(
        (Linear): Linear(in_features=64, out_features=64, bias=True)
        (Activation): ReLU()
        (Dropout): Dropout(p=0.6, inplace=False)
        (BatchNorm): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (MLP_layer_3): MLP(
      (block): Sequential(
        (Linear): Linear(in_features=64, out_features=64, bias=True)
        (Activation): ReLU()
        (Dropout): Dropout(p=0.6, inplace=False)
        (BatchNorm): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=T

In [63]:
optimizer = torch.optim.Adam(ncf.parameters(), lr=3e-3)
# NOTE(review): train() calls scheduler.step() after every batch, and one epoch
# is 2428 batches, so the schedule runs far beyond T_max=500 steps. Confirm that
# the resulting learning-rate curve is the intended one.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, T_max=500, eta_min=1e-4
)
# Binary cross-entropy on probabilities (BCELoss, not the logits variant).
criterion = torch.nn.BCELoss()
n_epochs = 10
device = "cpu"

In [64]:
def train(
    model,
    optimizer,
    scheduler,
    criterion,
    n_epochs,
    device,
    train_loader,
    val_loader,
    df_val,
    k=15,
):
    """Train ``model`` and print a recommendation report on the validation set after each epoch.

    Args:
        model: Module called as ``model(user_ids, movie_ids)``; its output is
            flattened before being passed to ``criterion``.
        optimizer: Optimizer over the parameters of ``model``.
        scheduler: Learning-rate scheduler; stepped once per batch, not per epoch.
        criterion: Loss called as ``criterion(predictions, ratings)``.
        n_epochs: Number of passes over ``train_loader``.
        device: Device the model and every batch are moved to.
        train_loader: Iterable of ``(userIds, movieIds, ratings)`` batches.
        val_loader: Iterable of ``(userIds, movieIds)`` batches whose row order
            matches ``df_val`` (i.e. built from it without shuffling).
        df_val: Validation frame. A ``rating_pred`` column is written into it in
            place on every epoch. It is handed to ``split_test_df`` together with
            the column names ``userId``, ``movieId``, ``rating_pred`` and ``action``.
        k: Cut-off forwarded to ``reccomendation_report`` (default 15).

    Returns:
        None. The mean train loss and the report are printed per epoch.

    Note:
        Reads the notebook-level globals ``cos_dist`` and ``popularity``.
    """
    # Always operate on the `model` argument, never on a notebook global.
    model.to(device)
    # Guard the mean-loss division against an empty loader.
    num_iterations = max(len(train_loader), 1)

    for epoch in tqdm(range(n_epochs), desc="Epochs"):
        # train
        total_train_loss = 0
        model.train()
        with tqdm(train_loader, unit="batch") as tepoch:
            for userIds, movieIds, ratings in tepoch:
                pred_train = model(userIds.to(device), movieIds.to(device))
                loss_train = criterion(pred_train.flatten(), ratings.to(device))

                optimizer.zero_grad()
                loss_train.backward()
                optimizer.step()
                scheduler.step()

                batch_loss = loss_train.item()
                total_train_loss += batch_loss
                tepoch.set_postfix(
                    loss=batch_loss, lr=round(scheduler.get_last_lr()[0], 7)
                )
        print("Epoch:", epoch)
        print("Train loss", round(total_train_loss / num_iterations, 5))

        # val
        model.eval()
        total_preds = torch.zeros(len(val_loader.dataset))
        # Track the write position explicitly so the loop does not depend on
        # val_loader.batch_size.
        offset = 0
        with torch.no_grad():
            for userIds, movieIds in tqdm(val_loader, desc="Inference", unit="batch"):
                batch_preds = model(
                    userIds.to(device), movieIds.to(device)
                ).flatten()
                # Bring predictions back to the CPU buffer regardless of `device`.
                total_preds[offset : offset + len(batch_preds)] = batch_preds.cpu()
                offset += len(batch_preds)

        df_val["rating_pred"] = total_preds.numpy()
        pred, target, pred_items = split_test_df(
            df_val, "userId", "movieId", "rating_pred", "action"
        )
        print(
            reccomendation_report(pred, target, pred_items, cos_dist, popularity, k=k)
        )

In [65]:
# Run the NCF training loop; keyword arguments keep the nine-argument call readable.
train(
    model=ncf,
    optimizer=optimizer,
    scheduler=scheduler,
    criterion=criterion,
    n_epochs=n_epochs,
    device=device,
    train_loader=train_loader,
    val_loader=val_loader,
    df_val=df_val,
)

Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 0
Train loss 0.43168


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ K': tensor(0.5749), 'NDCG @ K': tensor(0.2783), 'Diversity (ILD)': tensor(0.1443), 'Novelty (EPC)': tensor(0.6958)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 1
Train loss 0.36923


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ K': tensor(0.5707), 'NDCG @ K': tensor(0.2955), 'Diversity (ILD)': tensor(0.1442), 'Novelty (EPC)': tensor(0.6926)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 2
Train loss 0.36464


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ K': tensor(0.5757), 'NDCG @ K': tensor(0.2904), 'Diversity (ILD)': tensor(0.1441), 'Novelty (EPC)': tensor(0.6914)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 3
Train loss 0.36201


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ K': tensor(0.5823), 'NDCG @ K': tensor(0.2994), 'Diversity (ILD)': tensor(0.1442), 'Novelty (EPC)': tensor(0.6909)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 4
Train loss 0.3598


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ K': tensor(0.5848), 'NDCG @ K': tensor(0.2933), 'Diversity (ILD)': tensor(0.1440), 'Novelty (EPC)': tensor(0.6913)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 5
Train loss 0.35837


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ K': tensor(0.5831), 'NDCG @ K': tensor(0.2965), 'Diversity (ILD)': tensor(0.1440), 'Novelty (EPC)': tensor(0.6915)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 6
Train loss 0.35738


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ K': tensor(0.5873), 'NDCG @ K': tensor(0.2992), 'Diversity (ILD)': tensor(0.1435), 'Novelty (EPC)': tensor(0.6935)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 7
Train loss 0.35596


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ K': tensor(0.5815), 'NDCG @ K': tensor(0.3041), 'Diversity (ILD)': tensor(0.1429), 'Novelty (EPC)': tensor(0.6955)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 8
Train loss 0.35378


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ K': tensor(0.5931), 'NDCG @ K': tensor(0.3080), 'Diversity (ILD)': tensor(0.1422), 'Novelty (EPC)': tensor(0.7003)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 9
Train loss 0.35111


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ K': tensor(0.6063), 'NDCG @ K': tensor(0.3119), 'Diversity (ILD)': tensor(0.1395), 'Novelty (EPC)': tensor(0.7040)}


In [66]:
# Unshuffled loader over the (userId, movieId) pairs of df_test, so predictions
# can be written back to the frame by position.
test_users = torch.tensor(df_test["userId"])
test_movies = torch.tensor(df_test["movieId"])
test_loader = DataLoader(
    TensorDataset(test_users, test_movies), batch_size=4096, shuffle=False
)

In [67]:
# Score every (user, movie) row of the test loader with the trained NCF model.
ncf.eval()
batch_size = test_loader.batch_size
total_preds = torch.zeros(len(test_loader.dataset))
with torch.no_grad():
    for i, (userIds, movieIds) in enumerate(
        tqdm(test_loader, desc="Inference", unit="batch")
    ):
        # The loader is unshuffled, so batch i fills rows [start, start + batch_size).
        start = i * batch_size
        batch_scores = ncf(userIds, movieIds).flatten()
        total_preds[start : start + batch_size] = batch_scores

Inference:   0%|          | 0/120 [00:00<?, ?batch/s]

In [68]:
# Attach the predictions to df_test by position (the loader was not shuffled).
df_test["rating_pred"] = total_preds.numpy()
# Regroup the frame into the inputs the report expects (helper from src.utils).
pred, target, pred_items = split_test_df(
    df_test, "userId", "movieId", "rating_pred", "action"
)
# Last expression: the metric dict is shown as the cell output.
reccomendation_report(pred, target, pred_items, cos_dist, popularity, k=15)

{'Hit rate @ K': tensor(0.5859),
 'NDCG @ K': tensor(0.3104),
 'Diversity (ILD)': tensor(0.1403),
 'Novelty (EPC)': tensor(0.7015)}

In [58]:
# NOTE(review): this cell repeats the NCF evaluation cell above verbatim and
# carries an out-of-order execution count (58), so its saved output comes from
# an earlier state of `total_preds`. Consider deleting it or recording the
# result in markdown instead.
df_test["rating_pred"] = total_preds.numpy()
pred, target, pred_items = split_test_df(
    df_test, "userId", "movieId", "rating_pred", "action"
)
reccomendation_report(pred, target, pred_items, cos_dist, popularity, k=15)

{'Hit rate @ K': tensor(0.5584),
 'NDCG @ K': tensor(0.2812),
 'Diversity (ILD)': tensor(0.1451),
 'Novelty (EPC)': tensor(0.6865)}

In [18]:
# NOTE(review): another verbatim repeat of the NCF evaluation cell, with an
# out-of-order execution count (18); its saved output reflects an earlier state
# of `total_preds`. Consider deleting it or recording the result in markdown
# instead.
df_test["rating_pred"] = total_preds.numpy()
pred, target, pred_items = split_test_df(
    df_test, "userId", "movieId", "rating_pred", "action"
)
reccomendation_report(pred, target, pred_items, cos_dist, popularity, k=15)

{'Hit rate @ K': tensor(0.5629),
 'NDCG @ K': tensor(0.2839),
 'Diversity (ILD)': tensor(0.1459),
 'Novelty (EPC)': tensor(0.6845)}

### NeuMF

In [20]:
# Number of distinct users / movies, used for the NeuMF embedding tables.
num_users = df_users["userId"].nunique()
num_items = df_movies["movieId"].nunique()

# Re-seed and rebuild the training loader for the NeuMF run.
seed_everything(RANDOM_STATE)
train_dataset = trainDataset(df_train, num_items)
train_loader = DataLoader(train_dataset, batch_size=2048, shuffle=True)

100%|████████████████████████████████| 994175/994175 [00:29<00:00, 33863.24it/s]


In [21]:
# Re-seed right before construction — presumably to make the initial weights reproducible.
seed_everything(RANDOM_STATE)
# Per the printed module: 32-dim MF embeddings, 8-dim MLP embeddings per side
# (16 concatenated), an MLP 16 -> 64 -> 32, and a final Linear(64 -> 1).
neumf = NeuMF(num_users, num_items, mf_dim=32, mlp_layer_sizes=[16, 64, 32])
display(neumf)

NeuMF(
  (mf_user_embed): Embedding(6041, 32)
  (mf_item_embed): Embedding(3883, 32)
  (mlp_user_embed): Embedding(6041, 8)
  (mlp_item_embed): Embedding(3883, 8)
  (mlp): Sequential(
    (MLP_layer_1): Linear(in_features=16, out_features=64, bias=True)
    (Activation_layer_1): ReLU()
    (MLP_layer_2): Linear(in_features=64, out_features=32, bias=True)
    (Activation_layer_2): ReLU()
  )
  (final): Linear(in_features=64, out_features=1, bias=True)
)

In [22]:
# Optimizer and loss for NeuMF (no LR scheduler is used for this model).
optimizer = torch.optim.Adam(neumf.parameters(), lr=3e-4)
# Use the fully qualified torch.nn path: the notebook imports `torch` but never
# binds a bare `nn` name, so `nn.BCELoss()` fails on a fresh kernel.
criterion = torch.nn.BCELoss()
n_epochs = 5
device = "cpu"

In [23]:
### Train NeuMF
neumf.to(device)
num_iterations = len(train_loader)

for epoch in tqdm(range(n_epochs), desc="Epochs"):
    # One full pass over the training batches.
    neumf.train()
    total_train_loss = 0
    with tqdm(train_loader, unit="batch") as tepoch:
        for userIds, movieIds, ratings in tepoch:
            # sigmoid=True so the output matches what BCELoss expects.
            scores = neumf(userIds.to(device), movieIds.to(device), sigmoid=True)
            loss_train = criterion(scores.flatten(), ratings.to(device))

            optimizer.zero_grad()
            loss_train.backward()
            optimizer.step()

            # Read the scalar once; it feeds both the running sum and the progress bar.
            batch_loss = loss_train.item()
            total_train_loss += batch_loss
            tepoch.set_postfix(loss=batch_loss)

    mean_train_loss = round(total_train_loss / num_iterations, 5)
    print("Epoch:", epoch)
    print("Train loss", mean_train_loss)

Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 0
Train loss 0.46242


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 1
Train loss 0.3704


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 2
Train loss 0.35588


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 3
Train loss 0.35302


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 4
Train loss 0.35179


In [24]:
# Unshuffled loader over the (userId, movieId) pairs of df_test.
# NOTE(review): identical to the test_loader built in the NCF section; kept
# here so the NeuMF section can be run on its own.
test_loader = DataLoader(
    TensorDataset(torch.tensor(df_test["userId"]), torch.tensor(df_test["movieId"])),
    batch_size=4096,
    shuffle=False,
)

In [25]:
# Score every (user, movie) row of the test loader with the trained NeuMF model.
neumf.eval()
batch_size = test_loader.batch_size
total_preds = torch.zeros(len(test_loader.dataset))
with torch.no_grad():
    for i, (userIds, movieIds) in enumerate(
        tqdm(test_loader, desc="Inference", unit="batch")
    ):
        # The loader is unshuffled, so batch i fills rows [start, start + batch_size).
        start = i * batch_size
        batch_scores = neumf(userIds, movieIds, sigmoid=True).flatten()
        total_preds[start : start + batch_size] = batch_scores

Inference:   0%|          | 0/120 [00:00<?, ?batch/s]

In [26]:
# Overwrite df_test["rating_pred"] with the NeuMF predictions (the NCF ones are replaced).
df_test["rating_pred"] = total_preds.numpy()
pred, target, pred_items = split_test_df(
    df_test, "userId", "movieId", "rating_pred", "action"
)
# Last expression: the metric dict is shown as the cell output.
reccomendation_report(pred, target, pred_items, cos_dist, popularity, k=15)

{'Hit rate @ K': tensor(0.5584),
 'NDCG @ K': tensor(0.2824),
 'Diversity (ILD)': tensor(0.1459),
 'Novelty (EPC)': tensor(0.6847)}

## Test user recommendations

In [27]:
# Allow up to 102 rows to be rendered, enough for the `.loc[:100]` slices displayed below.
pd.set_option("display.max_rows", 102)

### NCF

In [28]:
# NOTE(review): `df_test_user` is not created in any cell visible in this
# notebook, so this cell relies on leftover kernel state and will raise
# NameError after Restart & Run All. Add the cell that builds it.
test_user_loader = DataLoader(
    TensorDataset(
        torch.tensor(df_test_user["userId"]), torch.tensor(df_test_user["movieId"])
    ),
    batch_size=4096,
    shuffle=False,
)
# predict ncf
ncf.eval()
total_preds = torch.zeros(len(test_user_loader.dataset))
batch_size = test_user_loader.batch_size
for i, (userIds, movieIds) in enumerate(
    tqdm(test_user_loader, desc="Inference", unit="batch")
):
    with torch.no_grad():
        # The loader is unshuffled, so batch i maps to rows [i*batch_size, (i+1)*batch_size).
        total_preds[i * batch_size : (i + 1) * batch_size] = ncf(
            userIds, movieIds
        ).flatten()

# Attach the scores to the frame by position.
df_test_user["rating_pred"] = total_preds.numpy()

Inference:   0%|          | 0/1 [00:00<?, ?batch/s]

In [29]:
# Show the highest-scored movies for the user with the largest userId in df_test_user.
# After the merge the index is 0..n-1, and `.loc[:100]` is label-based and
# inclusive, so at most 101 rows are kept.
display(
    df_test_user.loc[df_test_user.userId == df_test_user["userId"].max()]
    .sort_values(by="rating_pred", ascending=False)
    .merge(df_movies, on="movieId")
    .loc[:100, ["userId", "movieId", "name", "genre"]]
)

Unnamed: 0,userId,movieId,name,genre
0,6040,315,"Shawshank Redemption, The (1994)",Drama
1,6040,847,"Godfather, The (1972)",Action|Crime|Drama
2,6040,1575,L.A. Confidential (1997),Crime|Film-Noir|Mystery|Thriller
3,6040,2847,Total Recall (1990),Action|Adventure|Sci-Fi|Thriller
4,6040,2728,Big (1988),Comedy|Fantasy
5,6040,1366,Jaws (1975),Action|Horror
6,6040,49,"Usual Suspects, The (1995)",Crime|Thriller
7,6040,3412,High Fidelity (2000),Comedy
8,6040,1372,Jerry Maguire (1996),Drama|Romance
9,6040,1202,"Blues Brothers, The (1980)",Action|Comedy|Musical


### NeuMF

In [30]:
# NOTE(review): `df_test_user` is not created in any cell visible in this
# notebook, so this cell relies on leftover kernel state and will raise
# NameError after Restart & Run All. Add the cell that builds it.
test_user_loader = DataLoader(
    TensorDataset(
        torch.tensor(df_test_user["userId"]), torch.tensor(df_test_user["movieId"])
    ),
    batch_size=4096,
    shuffle=False,
)
# predict neumf
neumf.eval()
total_preds = torch.zeros(len(test_user_loader.dataset))
batch_size = test_user_loader.batch_size
for i, (userIds, movieIds) in enumerate(
    tqdm(test_user_loader, desc="Inference", unit="batch")
):
    with torch.no_grad():
        # Pass sigmoid=True, as the NeuMF training and test-set inference calls
        # do, so "rating_pred" is on the same scale everywhere in the notebook.
        total_preds[i * batch_size : (i + 1) * batch_size] = neumf(
            userIds, movieIds, sigmoid=True
        ).flatten()

# Attach the scores to the frame by position (the loader is unshuffled).
df_test_user["rating_pred"] = total_preds.numpy()

# Show the highest-scored movies for the user with the largest userId.
# `.loc[:100]` is label-based and inclusive, so at most 101 rows are kept.
display(
    df_test_user.loc[df_test_user.userId == df_test_user["userId"].max()]
    .sort_values(by="rating_pred", ascending=False)
    .merge(df_movies, on="movieId")
    .loc[:100, ["userId", "movieId", "name", "genre"]]
)

Inference:   0%|          | 0/1 [00:00<?, ?batch/s]

Unnamed: 0,userId,movieId,name,genre
0,6040,315,"Shawshank Redemption, The (1994)",Drama
1,6040,1575,L.A. Confidential (1997),Crime|Film-Noir|Mystery|Thriller
2,6040,847,"Godfather, The (1972)",Action|Crime|Drama
3,6040,2847,Total Recall (1990),Action|Adventure|Sci-Fi|Thriller
4,6040,49,"Usual Suspects, The (1995)",Crime|Thriller
5,6040,1366,Jaws (1975),Action|Horror
6,6040,2728,Big (1988),Comedy|Fantasy
7,6040,1202,"Blues Brothers, The (1980)",Action|Comedy|Musical
8,6040,724,"Rock, The (1996)",Action|Adventure|Thriller
9,6040,1372,Jerry Maguire (1996),Drama|Romance
