In [1]:
## DO NOT ERASE THIS. IMPORTANT TO CORRECTLY IMPORT MODULES
# Appends the parent directory to the module search path; the `src.*` imports
# in the next cell presumably rely on it (the `src` package is not visible
# from this notebook's own directory -- TODO confirm the repo layout).
import sys

sys.path.append("../")
# Last expression of the cell: displays the interpreter used by the kernel.
# NOTE(review): the rendered output contains an absolute local path; consider
# clearing it before sharing the notebook.
sys.executable

'/Users/kristina/Desktop/University/COURSE_WORK/RecSys_thesis/recsysvenv/bin/python3.12'

In [2]:
import pandas as pd

pd.set_option("display.max_rows", 102)
from tqdm.auto import tqdm

import torch
from torch.utils.data import DataLoader, TensorDataset

from sklearn.preprocessing import OrdinalEncoder, StandardScaler

from src.utils import (
    load_MovieLens,
    create_test_user,
    train_test_val_split,
    seed_everything,
    trainDatasetWithNumCatFeatures,
    split_test_df,
    add_not_watched_movies,
    create_test_user_display_df,
    train,
    predict,
)
from src.models import DCNv2
from src.metrics import reccomendation_report

## Constants

In [3]:
# Folder handed to load_MovieLens; relative to the notebook's working directory.
DATA_FOLDER = "../../data/ml-1m/"
# Seed forwarded to seed_everything and train_test_val_split.
RANDOM_STATE = 7

In [4]:
# Fix the seed before any data processing (presumably seeds every RNG the
# project uses -- see src.utils.seed_everything).
seed_everything(RANDOM_STATE)

## Data

In [5]:
# Load the users, movies and ratings tables from DATA_FOLDER.
df_users, df_movies, df_ratings = load_MovieLens(DATA_FOLDER)

### Creating a test_user

In [6]:
# Movie ids passed to create_test_user as the synthetic user's watch list.
watched_movie_ids = [6, 16, 3192, 1461, 827, 887, 593]
df_users, df_ratings, new_user_id = create_test_user(
    df_users, df_ratings, watched_movie_ids
)

# Show the new user's ratings joined with the movie metadata.
test_user_ratings = df_ratings[df_ratings.userId == new_user_id]
print("Test user watch list:")
display(test_user_ratings.merge(df_movies, on="movieId"))

Test user watch list:


Unnamed: 0,userId,movieId,rating,timestamp,name,genre
0,6040,6,5,0,Sabrina (1995),Comedy|Romance
1,6040,16,5,1,Sense and Sensibility (1995),Drama|Romance
2,6040,3192,5,2,Singles (1992),Comedy|Drama|Romance
3,6040,1461,5,3,Love and Other Catastrophes (1996),Romance
4,6040,827,5,4,Emma (1996),Comedy|Drama|Romance
5,6040,887,5,5,Singin' in the Rain (1952),Musical|Romance
6,6040,593,5,6,Pretty Woman (1990),Comedy|Romance


### Train-test split
Methodology: each user's last interaction is the test item; the rest of the interactions form the training set. The validation part is 20% of the test part.

In [7]:
# Split the ratings into train / test / validation frames; RANDOM_STATE is
# forwarded to the splitter.
df_train, df_test, df_val = train_test_val_split(df_ratings, df_movies, RANDOM_STATE)

Enriching test:   0%|          | 0/4832 [00:00<?, ?it/s]

Enriching val:   0%|          | 0/1209 [00:00<?, ?it/s]

### Loading additional data

In [8]:
# The commented-out lines below show how the cached similarity file was
# (presumably) produced; uncomment them to regenerate it.
# cos_dist = computeCosineSimilarities(df_train, "userId", "movieId", df_movies["movieId"].nunique())

# with open("../../data/cos_dist.pt", "wb") as f:
#     torch.save(cos_dist, f)

# torch.load unpickles the file. weights_only=True restricts unpickling to
# tensors and plain containers, so a tampered cache file cannot run arbitrary
# code. map_location="cpu" keeps the tensor on the CPU regardless of the
# device it was saved from, matching `popularity` below.
with open("../../data/cos_dist.pt", "rb") as f:
    cos_dist = torch.load(f, map_location="cpu", weights_only=True)

# Number of training interactions per movie id, divided by the largest count
# so the most popular movie gets popularity 1.0.
popularity = torch.bincount(
    torch.tensor(df_train.movieId.values), minlength=df_movies["movieId"].nunique()
)
popularity = popularity / torch.max(popularity)

### Building ordinal-encoded features and normalizing continuous features

In [9]:
# Fit the encoders / scaler on the full user and movie tables. Each resulting
# tensor has one row per row of the source frame.
ord_user = OrdinalEncoder()
user_cat = torch.tensor(ord_user.fit_transform(df_users[["gender", "occupation"]]))

ord_movie = OrdinalEncoder()
movie_cat = torch.tensor(ord_movie.fit_transform(df_movies[["genre"]]))

ss_user = StandardScaler()
user_num = torch.tensor(ss_user.fit_transform(df_users[["age"]]))


def _rows_as(feature_table, row_ids, dtype):
    """Return a detached copy of ``feature_table[row_ids]`` cast to ``dtype``."""
    return feature_table[row_ids].clone().detach().to(dtype)


# NOTE(review): the lookups below index the feature tensors directly by
# userId / movieId, which assumes an id equals the row position in
# df_users / df_movies -- confirm against load_MovieLens.

# features for val
val_user_ids = df_val["userId"].values
val_movie_ids = df_val["movieId"].values
user_cat_val = _rows_as(user_cat, val_user_ids, torch.long)
user_num_val = _rows_as(user_num, val_user_ids, torch.float)
movie_cat_val = _rows_as(movie_cat, val_movie_ids, torch.long)
val_cat = torch.hstack((user_cat_val, movie_cat_val))

# features for test
test_user_ids = df_test["userId"].values
test_movie_ids = df_test["movieId"].values
user_cat_test = _rows_as(user_cat, test_user_ids, torch.long)
user_num_test = _rows_as(user_num, test_user_ids, torch.float)
movie_cat_test = _rows_as(movie_cat, test_movie_ids, torch.long)
test_cat = torch.hstack((user_cat_test, movie_cat_test))


# One numeric feature (age); vocabulary sizes follow the column order used in
# the hstack above: user categorical columns first, then movie columns.
num_numeric_feats = 1
cat_feature_vocab = [
    len(categories)
    for encoder in (ord_user, ord_movie)
    for categories in encoder.categories_
]

## DCN V2 Best

In [10]:
seed_everything(RANDOM_STATE)

# Computed once up front so the training dataset and the model built in the
# next cell share the same counts.
num_users = df_users["userId"].nunique()
num_items = df_movies["movieId"].nunique()


def _make_eval_loader(df, user_num_feats, cat_feats, batch_size=4096):
    """Build a non-shuffled DataLoader over (userId, movieId, numeric, categorical).

    The id columns are converted through ``.values`` (the underlying NumPy
    array) rather than passing the pandas Series to ``torch.tensor``: the
    conversion then does not depend on the frame's index and matches how ids
    are extracted elsewhere in this notebook.
    """
    return DataLoader(
        TensorDataset(
            torch.tensor(df["userId"].values),
            torch.tensor(df["movieId"].values),
            user_num_feats,
            cat_feats,
        ),
        batch_size=batch_size,
        shuffle=False,
    )


# The training loader is created first, directly after seeding, as before.
train_loader = DataLoader(
    trainDatasetWithNumCatFeatures(
        df_train, num_items, user_cat, user_num, movie_cat
    ),
    batch_size=2048,
    shuffle=True,
)
val_loader = _make_eval_loader(df_val, user_num_val, val_cat)
test_loader = _make_eval_loader(df_test, user_num_test, test_cat)

  0%|          | 0/994175 [00:00<?, ?it/s]

IOStream.flush timed out


**Best hyperparameter combination after tuning:**

Trial 6 finished with value: 0.6274834275245667 and parameters: {

'l': 1, 

'n_mlp_layers': 10, 

'mlp_layers_dim': 304.0, 

'mlp_kwargs_dropout': False, 

'mlp_kwargs_batchnorm': False, 

'mlp_kwargs_dropout_rate': 0.2299671130204035, 

'lr': 0.003144141618479194, 

'n_epochs': 19.0
}

2024-05-14 18:49:10,185 | INFO | Trial 5 finished with value: 0.6214128136634827 and parameters: {'l': 10, 'n_mlp_layers': 3, 'mlp_layers_dim': 224.0, 'mlp_kwargs_dropout': True, 'mlp_kwargs_batchnorm': True, 'mlp_kwargs_dropout_rate': 0.4719799629637327, 'lr': 0.004796725329128699, 'n_epochs': 9.0}. Best is trial 5 with value: 0.6214128136634827.

In [11]:
# Re-seed so the weight initialisation does not depend on earlier cells.
seed_everything(RANDOM_STATE)

# Options for the MLP blocks; the values follow the tuned trial listed above
# (dropout rate rounded to two decimals).
mlp_options = dict(
    activation=True,
    dropout=False,
    batchnorm=False,
    dropout_rate=0.23,
)
dcn = DCNv2(
    num_users,
    num_items,
    num_numeric_feats,
    cat_feature_vocab,
    l=1,
    n_mlp_layers=10,
    mlp_layers_dim=304,
    mlp_kwargs=mlp_options,
)
display(dcn)

DCNv2(
  (user_embedding): Embedding(6041, 5)
  (item_embedding): Embedding(3883, 5)
  (numerical_embeddings): ModuleList(
    (0): MLP(
      (block): Sequential(
        (Linear): Linear(in_features=1, out_features=5, bias=True)
      )
    )
  )
  (categorical_embeddings): ModuleList(
    (0): Embedding(2, 5)
    (1): Embedding(21, 5)
    (2): Embedding(301, 5)
  )
  (mlp): Sequential(
    (MLP_layer_0): MLP(
      (block): Sequential(
        (Linear): Linear(in_features=30, out_features=304, bias=True)
        (Activation): ReLU()
        (Dropout): Dropout(p=0.2, inplace=False)
        (BatchNorm): BatchNorm1d(304, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (MLP_layer_1): MLP(
      (block): Sequential(
        (Linear): Linear(in_features=304, out_features=304, bias=True)
        (Activation): ReLU()
      )
    )
    (MLP_layer_2): MLP(
      (block): Sequential(
        (Linear): Linear(in_features=304, out_features=304, bias=True)
      

In [12]:
# Training configuration (learning rate and epoch count follow the tuned
# trial listed above, rounded).
device = "cpu"
n_epochs = 19

criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(params=dcn.parameters(), lr=0.0031)
# ExponentialLR multiplies the learning rate by gamma on every scheduler.step().
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=0.8)

In [13]:
%%time
# Inputs forwarded to train() for validation; the captured output shows a
# metrics report after every epoch.
validation_kwargs = dict(
    val_loader=val_loader,
    df_val=df_val,
    cos_dist=cos_dist,
    popularity=popularity,
)
train(
    dcn,
    train_loader,
    optimizer,
    scheduler,
    criterion,
    n_epochs,
    verbose=True,
    **validation_kwargs,
)

Epochs:   0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 0
Train loss: 0.36857


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.5815),
 'NDCG @ 15': tensor(0.2806),
 'Diversity (ILD)': tensor(0.1413),
 'Novelty (EPC)': tensor(0.6992)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 1
Train loss: 0.33789


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.5897),
 'NDCG @ 15': tensor(0.2961),
 'Diversity (ILD)': tensor(0.1378),
 'Novelty (EPC)': tensor(0.7093)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 2
Train loss: 0.32976


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.6005),
 'NDCG @ 15': tensor(0.3065),
 'Diversity (ILD)': tensor(0.1360),
 'Novelty (EPC)': tensor(0.7138)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 3
Train loss: 0.32514


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.6055),
 'NDCG @ 15': tensor(0.3099),
 'Diversity (ILD)': tensor(0.1345),
 'Novelty (EPC)': tensor(0.7184)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 4
Train loss: 0.32128


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.6079),
 'NDCG @ 15': tensor(0.3109),
 'Diversity (ILD)': tensor(0.1334),
 'Novelty (EPC)': tensor(0.7212)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 5
Train loss: 0.31828


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.6187),
 'NDCG @ 15': tensor(0.3143),
 'Diversity (ILD)': tensor(0.1325),
 'Novelty (EPC)': tensor(0.7241)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 6
Train loss: 0.316


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.6154),
 'NDCG @ 15': tensor(0.3118),
 'Diversity (ILD)': tensor(0.1320),
 'Novelty (EPC)': tensor(0.7258)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 7
Train loss: 0.31427


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.6270),
 'NDCG @ 15': tensor(0.3147),
 'Diversity (ILD)': tensor(0.1321),
 'Novelty (EPC)': tensor(0.7257)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 8
Train loss: 0.31296


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.6336),
 'NDCG @ 15': tensor(0.3151),
 'Diversity (ILD)': tensor(0.1314),
 'Novelty (EPC)': tensor(0.7277)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 9
Train loss: 0.31194


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.6344),
 'NDCG @ 15': tensor(0.3167),
 'Diversity (ILD)': tensor(0.1312),
 'Novelty (EPC)': tensor(0.7282)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 10
Train loss: 0.31117


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.6328),
 'NDCG @ 15': tensor(0.3168),
 'Diversity (ILD)': tensor(0.1311),
 'Novelty (EPC)': tensor(0.7287)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 11
Train loss: 0.31052


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.6328),
 'NDCG @ 15': tensor(0.3172),
 'Diversity (ILD)': tensor(0.1312),
 'Novelty (EPC)': tensor(0.7288)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 12
Train loss: 0.31005


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.6344),
 'NDCG @ 15': tensor(0.3168),
 'Diversity (ILD)': tensor(0.1310),
 'Novelty (EPC)': tensor(0.7290)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 13
Train loss: 0.30969


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.6352),
 'NDCG @ 15': tensor(0.3172),
 'Diversity (ILD)': tensor(0.1313),
 'Novelty (EPC)': tensor(0.7286)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 14
Train loss: 0.30938


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.6344),
 'NDCG @ 15': tensor(0.3180),
 'Diversity (ILD)': tensor(0.1310),
 'Novelty (EPC)': tensor(0.7292)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 15
Train loss: 0.30914


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.6352),
 'NDCG @ 15': tensor(0.3186),
 'Diversity (ILD)': tensor(0.1311),
 'Novelty (EPC)': tensor(0.7290)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 16
Train loss: 0.30895


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.6361),
 'NDCG @ 15': tensor(0.3180),
 'Diversity (ILD)': tensor(0.1309),
 'Novelty (EPC)': tensor(0.7297)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 17
Train loss: 0.3088


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.6361),
 'NDCG @ 15': tensor(0.3172),
 'Diversity (ILD)': tensor(0.1310),
 'Novelty (EPC)': tensor(0.7296)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 18
Train loss: 0.30868


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.6361),
 'NDCG @ 15': tensor(0.3181),
 'Diversity (ILD)': tensor(0.1310),
 'Novelty (EPC)': tensor(0.7296)}
CPU times: user 1h 55min 19s, sys: 5h 32min 23s, total: 7h 27min 42s
Wall time: 48min 10s


In [14]:
# Score every test row and store the scores next to the interactions.
test_scores = predict(dcn, test_loader, verbose=False)
df_test["pred"] = test_scores.numpy()

# Regroup the scored frame into the inputs expected by the report, then
# display the metrics at k=15 as the cell's output.
pred, target, pred_items = split_test_df(
    df_test, "userId", "movieId", "pred", "action"
)
reccomendation_report(pred, target, pred_items, cos_dist, popularity, k=15)

{'Hit rate @ 15': tensor(0.6287),
 'NDCG @ 15': tensor(0.3186),
 'Diversity (ILD)': tensor(0.1316),
 'Novelty (EPC)': tensor(0.7278)}

## Test user recommendations

In [15]:
df_test_user = add_not_watched_movies(new_user_id, df_test, df_train, df_movies)

# Id arrays are extracted once via `.values` and reused for both the feature
# lookups and the loader. Converting the NumPy arrays (instead of handing the
# pandas Series to torch.tensor) does not depend on the frame's index.
test_user_ids = df_test_user["userId"].values
candidate_movie_ids = df_test_user["movieId"].values

# features for test user
user_cat_test_user = user_cat[test_user_ids].clone().detach().to(torch.long)
user_num_test_user = user_num[test_user_ids].clone().detach().to(torch.float)
movie_cat_test_user = movie_cat[candidate_movie_ids].clone().detach().to(torch.long)
# Same column order as for val / test: user categorical columns, then movie.
test_user_cat = torch.hstack((user_cat_test_user, movie_cat_test_user))

test_user_loader = DataLoader(
    TensorDataset(
        torch.tensor(test_user_ids),
        torch.tensor(candidate_movie_ids),
        user_num_test_user,
        test_user_cat,
    ),
    batch_size=4096,
    shuffle=False,
)

In [16]:
# Score the test user's candidate rows, then display them with movie metadata.
test_user_scores = predict(dcn, test_user_loader, verbose=True)
df_test_user["pred"] = test_user_scores.numpy()
display(create_test_user_display_df(df_test_user, df_movies, "pred"))

Inference:   0%|          | 0/1 [00:00<?, ?batch/s]

Unnamed: 0,userId,movieId,name,genre,pred
0,6040,1720,"Wedding Singer, The (1998)",Comedy|Romance,0.953705
1,6040,1899,"Breakfast Club, The (1985)",Comedy|Drama,0.938518
2,6040,1854,There's Something About Mary (1998),Comedy,0.923268
3,6040,2222,Edward Scissorhands (1990),Drama|Romance,0.916178
4,6040,102,Happy Gilmore (1996),Comedy,0.890119
5,6040,2252,Pleasantville (1998),Comedy,0.87397
6,6040,10,"American President, The (1995)",Comedy|Drama|Romance,0.870376
7,6040,2625,Big Daddy (1999),Comedy,0.862656
8,6040,2631,"South Park: Bigger, Longer and Uncut (1999)",Animation|Comedy,0.846404
9,6040,368,Reality Bites (1994),Comedy|Drama,0.804024
