In [1]:
## DO NOT ERASE THIS. IMPORTANT TO CORRECTLY IMPORT MODULES
import sys

# Put the parent directory on the import path so that the `src` package used
# by the following cells can be imported. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate "../" entry.
if "../" not in sys.path:
    sys.path.append("../")
# Last expression of the cell: shows which interpreter the kernel runs on.
sys.executable

'/Users/kristina/Desktop/University/COURSE_WORK/RecSys_thesis/recsysvenv/bin/python3.12'

In [2]:
import pandas as pd

pd.set_option("display.max_rows", 102)
from tqdm.auto import tqdm

import torch
from torch.utils.data import DataLoader, TensorDataset

from src.utils import (
    load_MovieLens,
    create_test_user,
    train_test_val_split,
    seed_everything,
    UserMovieDataset,
    split_test_df,
    add_not_watched_movies,
    create_test_user_display_df,
    train,
    predict,
)
from src.models import NeuMF
from src.metrics import reccomendation_report

## Constants

In [3]:
# Seed value handed to seed_everything() and to the train/test/val split.
RANDOM_STATE = 7
# Folder (relative to this notebook) passed to load_MovieLens().
DATA_FOLDER = "../../data/ml-1m/"

In [4]:
# Fix the random state before any data processing so a fresh
# "Restart & Run All" reproduces the same results.
# NOTE(review): presumably seeds the Python / NumPy / torch RNGs — confirm in src.utils.
seed_everything(RANDOM_STATE)

## Data

In [5]:
# Load the three MovieLens tables (users, movies, ratings) from DATA_FOLDER.
df_users, df_movies, df_ratings = load_MovieLens(DATA_FOLDER)

### Creating a test_user

In [6]:
# Movie ids that make up the watch list of the hand-crafted test user.
test_user_movie_ids = [6, 16, 3192, 1461, 827, 887, 593]

# NOTE(review): df_users / df_ratings are rebound to the helper's return
# values, so re-running this cell without reloading the data presumably adds
# a second test user — confirm against create_test_user.
df_users, df_ratings, new_user_id = create_test_user(
    df_users, df_ratings, test_user_movie_ids
)

# Show the new user's ratings joined with the movie metadata.
print("Test user watch list:")
test_user_ratings = df_ratings[df_ratings.userId == new_user_id]
display(test_user_ratings.merge(df_movies, on="movieId"))

Test user watch list:


Unnamed: 0,userId,movieId,rating,timestamp,name,genre
0,6040,6,5,0,Sabrina (1995),Comedy|Romance
1,6040,16,5,1,Sense and Sensibility (1995),Drama|Romance
2,6040,3192,5,2,Singles (1992),Comedy|Drama|Romance
3,6040,1461,5,3,Love and Other Catastrophes (1996),Romance
4,6040,827,5,4,Emma (1996),Comedy|Drama|Romance
5,6040,887,5,5,Singin' in the Rain (1952),Musical|Romance
6,6040,593,5,6,Pretty Woman (1990),Comedy|Romance


### Train-test split
Methodology: each user's last interaction is held out as the test item, and all of their earlier interactions form the training set. The validation set is a 20% split of the held-out test data.

In [7]:
# Produce the train / test / validation frames from the ratings table.
# RANDOM_STATE is passed so the split is reproducible.
# NOTE(review): df_movies is presumably used to enrich test/val with extra
# candidate movies (see the "Enriching ..." progress bars) — confirm in src.utils.
df_train, df_test, df_val = train_test_val_split(df_ratings, df_movies, RANDOM_STATE)

Enriching test:   0%|          | 0/4832 [00:00<?, ?it/s]

Enriching val:   0%|          | 0/1209 [00:00<?, ?it/s]

### Loading additional data

In [8]:
# Provenance of the cached file loaded below (kept for reference; run once to
# regenerate ../../data/cos_dist.pt):
#
# cos_dist = computeCosineSimilarities(df_train, "userId", "movieId", df_movies["movieId"].nunique())
#
# with open("../../data/cos_dist.pt", "wb") as f:
#     torch.save(cos_dist, f)

# torch.load unpickles the file. weights_only=True restricts unpickling to
# tensors and plain containers, so a tampered cache file cannot execute
# arbitrary code; map_location pins the result to the CPU regardless of the
# device it was saved from.
# NOTE(review): assumes the cache holds a plain tensor — confirm.
cos_dist = torch.load(
    "../../data/cos_dist.pt", map_location="cpu", weights_only=True
)

# Per-movie interaction counts over the training set, scaled by the largest
# count. minlength guarantees at least one slot per movie in df_movies, even
# for movies that never occur in df_train.
popularity = torch.bincount(
    torch.tensor(df_train.movieId.values), minlength=df_movies["movieId"].nunique()
)
popularity = popularity / torch.max(popularity)

## NeuMF

In [9]:
# Re-seed right before the training dataset is built so this cell gives the
# same result no matter how often the cells above were re-run.
seed_everything(RANDOM_STATE)

# Number of distinct users / movies; computed once and reused below and when
# the model is constructed.
num_users = df_users["userId"].nunique()
num_items = df_movies["movieId"].nunique()

# Training batches are shuffled every epoch.
train_loader = DataLoader(
    UserMovieDataset(df_train, num_items),
    batch_size=2048,
    shuffle=True,
)

# Validation / test loaders yield (userId, movieId) batches in frame order
# (shuffle=False) so predictions can be written back next to their rows.
# Columns are converted with .to_numpy() first: building a tensor straight
# from a pandas Series depends on the Series index and can fail when the
# index is not the default 0..n-1 range.
val_loader = DataLoader(
    TensorDataset(
        torch.tensor(df_val["userId"].to_numpy()),
        torch.tensor(df_val["movieId"].to_numpy()),
    ),
    batch_size=4096,
    shuffle=False,
)
test_loader = DataLoader(
    TensorDataset(
        torch.tensor(df_test["userId"].to_numpy()),
        torch.tensor(df_test["movieId"].to_numpy()),
    ),
    batch_size=4096,
    shuffle=False,
)

  0%|          | 0/994175 [00:00<?, ?it/s]

In [10]:
# Re-seed immediately before the model is created.
# NOTE(review): presumably this makes weight initialisation reproducible — confirm.
seed_everything(RANDOM_STATE)

# Options forwarded to the MLP blocks via NeuMF's mlp_kwargs argument.
mlp_block_options = dict(
    activation=True,
    dropout=True,
    batchnorm=True,
    dropout_rate=0.6,
)

neumf = NeuMF(
    num_users,
    num_items,
    mf_dim=32,
    n_mlp_layers=4,
    mlp_layers_dim=32,
    mlp_kwargs=mlp_block_options,
)

# Print the module tree as the cell output.
display(neumf)

NeuMF(
  (mf_user_embed): Embedding(6041, 32)
  (mf_item_embed): Embedding(3883, 32)
  (mlp_user_embed): Embedding(6041, 16)
  (mlp_item_embed): Embedding(3883, 16)
  (mlp): Sequential(
    (MLP_layer_0): MLP(
      (block): Sequential(
        (Linear): Linear(in_features=32, out_features=32, bias=True)
        (Activation): ReLU()
        (Dropout): Dropout(p=0.6, inplace=False)
        (BatchNorm): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (MLP_layer_1): MLP(
      (block): Sequential(
        (Linear): Linear(in_features=32, out_features=32, bias=True)
        (Activation): ReLU()
        (Dropout): Dropout(p=0.6, inplace=False)
        (BatchNorm): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (MLP_layer_2): MLP(
      (block): Sequential(
        (Linear): Linear(in_features=32, out_features=32, bias=True)
        (Activation): ReLU()
        (Dropout): Dropout(p=0.6, inpla

In [11]:
# Training hyper-parameters.
n_epochs = 10
# NOTE(review): `device` is not passed to train() in the cell that follows —
# confirm whether it is still needed.
device = "cpu"

# Binary cross-entropy on the model's output.
criterion = torch.nn.BCELoss()

# Adam optimiser with an exponential learning-rate schedule (gamma=0.8).
optimizer = torch.optim.Adam(neumf.parameters(), lr=3e-3)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.8)

In [12]:
%%time
train(
    neumf,
    train_loader,
    optimizer,
    scheduler,
    criterion,
    n_epochs,
    val_loader=val_loader,
    df_val=df_val,
    cos_dist=cos_dist,
    popularity=popularity,
    verbose=True,
)

Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 0
Train loss: 0.43623


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.5633),
 'NDCG @ 15': tensor(0.2401),
 'Diversity (ILD)': tensor(0.1432),
 'Novelty (EPC)': tensor(0.7215)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 1
Train loss: 0.38271


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.5699),
 'NDCG @ 15': tensor(0.2506),
 'Diversity (ILD)': tensor(0.1434),
 'Novelty (EPC)': tensor(0.7160)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 2
Train loss: 0.37652


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.5666),
 'NDCG @ 15': tensor(0.2492),
 'Diversity (ILD)': tensor(0.1381),
 'Novelty (EPC)': tensor(0.7255)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 3
Train loss: 0.36561


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.5691),
 'NDCG @ 15': tensor(0.2573),
 'Diversity (ILD)': tensor(0.1304),
 'Novelty (EPC)': tensor(0.7422)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 4
Train loss: 0.3508


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.5889),
 'NDCG @ 15': tensor(0.2689),
 'Diversity (ILD)': tensor(0.1269),
 'Novelty (EPC)': tensor(0.7463)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 5
Train loss: 0.33618


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.5840),
 'NDCG @ 15': tensor(0.2666),
 'Diversity (ILD)': tensor(0.1196),
 'Novelty (EPC)': tensor(0.7667)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 6
Train loss: 0.32355


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.5897),
 'NDCG @ 15': tensor(0.2735),
 'Diversity (ILD)': tensor(0.1165),
 'Novelty (EPC)': tensor(0.7726)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 7
Train loss: 0.3134


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.5906),
 'NDCG @ 15': tensor(0.2786),
 'Diversity (ILD)': tensor(0.1140),
 'Novelty (EPC)': tensor(0.7781)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 8
Train loss: 0.30572


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.5955),
 'NDCG @ 15': tensor(0.2826),
 'Diversity (ILD)': tensor(0.1133),
 'Novelty (EPC)': tensor(0.7793)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 9
Train loss: 0.2998


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.5955),
 'NDCG @ 15': tensor(0.2847),
 'Diversity (ILD)': tensor(0.1118),
 'Novelty (EPC)': tensor(0.7839)}
CPU times: user 17min 22s, sys: 46min 39s, total: 1h 4min 1s
Wall time: 7min 32s


In [13]:
# Score every row of the test frame and store the scores in a "pred" column.
test_scores = predict(neumf, test_loader, verbose=False)
df_test["pred"] = test_scores.numpy()

# Regroup the flat frame into the (pred, target, pred_items) triple that the
# report expects, then compute the metrics at k=15 as the cell output.
pred, target, pred_items = split_test_df(
    df_test, "userId", "movieId", "pred", "action"
)
reccomendation_report(pred, target, pred_items, cos_dist, popularity, k=15)

{'Hit rate @ 15': tensor(0.5919),
 'NDCG @ 15': tensor(0.2877),
 'Diversity (ILD)': tensor(0.1116),
 'Novelty (EPC)': tensor(0.7843)}

## Test user recommendations

In [14]:
# Build the candidate frame for the hand-crafted test user.
# NOTE(review): presumably one row per movie the user has not watched —
# confirm in add_not_watched_movies.
df_test_user = add_not_watched_movies(new_user_id, df_test, df_train, df_movies)

# The frame comes from a helper, so its index is not guaranteed to be the
# default 0..n-1 range. Convert the columns with .to_numpy() before building
# tensors: constructing a tensor straight from a pandas Series depends on the
# Series index and can fail for a non-default one.
test_user_loader = DataLoader(
    TensorDataset(
        torch.tensor(df_test_user["userId"].to_numpy()),
        torch.tensor(df_test_user["movieId"].to_numpy()),
    ),
    batch_size=4096,
    shuffle=False,  # keep frame order so scores line up with rows
)

# Score the candidates and display the result table.
df_test_user["pred"] = predict(neumf, test_user_loader, verbose=False).numpy()
display(create_test_user_display_df(df_test_user, df_movies, "pred"))

Unnamed: 0,userId,movieId,name,genre,pred
0,6040,2061,Atlantic City (1980),Crime|Drama|Romance,0.925224
1,6040,1227,"Graduate, The (1967)",Drama|Romance,0.865895
2,6040,3476,Cabaret (1972),Musical|War,0.814038
3,6040,2401,Crocodile Dundee (1986),Adventure|Comedy,0.79864
4,6040,1211,Annie Hall (1977),Comedy|Romance,0.786102
5,6040,1067,"Great Race, The (1965)",Comedy|Musical,0.783455
6,6040,1202,"Blues Brothers, The (1980)",Action|Comedy|Musical,0.783216
7,6040,1243,"Deer Hunter, The (1978)",Drama|War,0.773092
8,6040,1081,E.T. the Extra-Terrestrial (1982),Children's|Drama|Fantasy|Sci-Fi,0.764825
9,6040,1220,"Terminator, The (1984)",Action|Sci-Fi|Thriller,0.764572
