In [17]:
## DO NOT ERASE THIS. IMPORTANT TO CORRECTLY IMPORT MODULES
# Appends the parent directory to the module search path; the `src.*` imports
# in the next cell rely on it being importable from there.
import sys

sys.path.append("../")
# Last expression of the cell: displays the interpreter path the kernel is using.
sys.executable

'/Users/kristina/Desktop/University/COURSE_WORK/RecSys_thesis/recsysvenv/bin/python3.12'

In [18]:
import pandas as pd

pd.set_option("display.max_rows", 102)

from tqdm.auto import tqdm
import mmh3

import torch
from torch.utils.data import DataLoader, TensorDataset

from src.utils import (
    load_MovieLens,
    create_test_user,
    train_test_val_split,
    seed_everything,
    trainDatasetWithCrossFeatures,
    split_test_df,
    add_not_watched_movies,
    create_test_user_display_df,
    train,
    predict,
)
from src.models import wideAndDeep
from src.metrics import reccomendation_report

## Constants

In [19]:
# Folder handed to load_MovieLens; the path points at the "ml-1m" data directory.
DATA_FOLDER = "../../data/ml-1m/"
# Seed shared by seed_everything and train_test_val_split in the cells below.
RANDOM_STATE = 7

In [20]:
# Seed with the shared constant before any data handling.
# NOTE(review): which RNGs seed_everything covers is defined in src.utils — confirm there.
seed_everything(RANDOM_STATE)

## Data

In [21]:
# Load the three tables (users, movies, ratings) from DATA_FOLDER.
df_users, df_movies, df_ratings = load_MovieLens(DATA_FOLDER)

### Creating a test_user

In [22]:
# Movie ids that make up the synthetic test user's watch history.
test_user_movie_ids = [6, 16, 3192, 1461, 827, 887, 593]

# NOTE(review): df_users / df_ratings are rebound to the helper's return values,
# so re-running this cell feeds the already-extended frames back in — confirm
# that create_test_user tolerates that before re-executing.
df_users, df_ratings, new_user_id = create_test_user(
    df_users, df_ratings, test_user_movie_ids
)

print("Test user watch list:")
test_user_ratings = df_ratings.loc[df_ratings["userId"] == new_user_id]
display(test_user_ratings.merge(df_movies, on="movieId"))

Test user watch list:


Unnamed: 0,userId,movieId,rating,timestamp,name,genre
0,6040,6,5,0,Sabrina (1995),Comedy|Romance
1,6040,16,5,1,Sense and Sensibility (1995),Drama|Romance
2,6040,3192,5,2,Singles (1992),Comedy|Drama|Romance
3,6040,1461,5,3,Love and Other Catastrophes (1996),Romance
4,6040,827,5,4,Emma (1996),Comedy|Drama|Romance
5,6040,887,5,5,Singin' in the Rain (1952),Musical|Romance
6,6040,593,5,6,Pretty Woman (1990),Comedy|Romance


### Train-test split
Methodology: each user's last interaction is held out as the test item; the rest is used for training. The validation part is 20% of the test part.

In [23]:
# Produce the train / test / val frames (returned in that order); RANDOM_STATE
# is passed so the split is tied to the notebook's seed.
df_train, df_test, df_val = train_test_val_split(df_ratings, df_movies, RANDOM_STATE)

Enriching test:   0%|          | 0/4832 [00:00<?, ?it/s]

Enriching val:   0%|          | 0/1209 [00:00<?, ?it/s]

### Loading additional data

In [24]:
# How the cached similarity tensor was produced (kept for provenance):
# cos_dist = computeCosineSimilarities(df_train, "userId", "movieId", df_movies["movieId"].nunique())

# with open("../../data/cos_dist.pt", "wb") as f:
#     torch.save(cos_dist, f)

# Read the precomputed tensor back from disk.
with open("../../data/cos_dist.pt", "rb") as cos_dist_file:
    cos_dist = torch.load(cos_dist_file)

# Count how often each movie id occurs in the training frame, padded so there
# is at least one slot per distinct movie id, then scale by the largest count.
n_distinct_movies = df_movies["movieId"].nunique()
train_movie_ids = torch.tensor(df_train["movieId"].values)
popularity = torch.bincount(train_movie_ids, minlength=n_distinct_movies)
popularity = popularity / popularity.max()

### Building cross features

In [25]:
# Number of hash buckets (width of the one-hot cross-feature vector).
cross_feats_dim = 30

# User-side key: gender, age and occupation concatenated as strings.
df_users["combined_feat"] = (
    df_users["gender"].astype(str)
    + df_users["age"].astype(str)
    + df_users["occupation"].astype(str)
)

# Build the id -> feature-string lookups in one pass over each frame instead of
# filtering the whole frame once per id. drop_duplicates keeps the first row per
# id, i.e. the same row a per-id `.values[0]` lookup would select.
_users_by_id = df_users.drop_duplicates("userId")
user_features_mapping = dict(
    zip(_users_by_id["userId"].to_numpy(), _users_by_id["combined_feat"].to_numpy())
)
_movies_by_id = df_movies.drop_duplicates("movieId")
movie_features_mapping = dict(
    zip(_movies_by_id["movieId"].to_numpy(), _movies_by_id["genre"].to_numpy())
)


def build_cross_features(df, user_features, movie_features, n_buckets):
    """Hash each (user, movie) row of ``df`` into a one-hot cross feature.

    For every row the user's feature string and the movie's feature string are
    concatenated, hashed with ``mmh3.hash`` and reduced modulo ``n_buckets``.

    Args:
        df: frame with ``userId`` and ``movieId`` columns.
        user_features: mapping from user id to its feature string.
        movie_features: mapping from movie id to its feature string.
        n_buckets: number of hash buckets (columns of the result).

    Returns:
        Tuple ``(bucket_idx, one_hot)``: the per-row bucket indices as a NumPy
        array, and a ``(len(df), n_buckets)`` tensor with a single 1 per row.

    Raises:
        KeyError: if an id in ``df`` is missing from either mapping.
    """
    crossed_keys = pd.Series(
        [
            user_features[user_id] + movie_features[movie_id]
            for user_id, movie_id in zip(df["userId"], df["movieId"])
        ]
    )
    bucket_idx = crossed_keys.apply(lambda key: mmh3.hash(key) % n_buckets).values
    one_hot = torch.zeros(df.shape[0], n_buckets)
    # Rows are addressed positionally, so the result lines up with df's row order.
    one_hot[torch.arange(one_hot.shape[0]), bucket_idx] = 1
    return bucket_idx, one_hot


# cross features for test
test_idx, cross_test = build_cross_features(
    df_test, user_features_mapping, movie_features_mapping, cross_feats_dim
)

# cross features for val
val_idx, cross_val = build_cross_features(
    df_val, user_features_mapping, movie_features_mapping, cross_feats_dim
)

## Wide-and-Deep Best

In [26]:
# Re-seed before building the training dataset and loaders.
seed_everything(RANDOM_STATE)

# Distinct-id counts, computed once and reused below and by the model cell.
num_users = df_users["userId"].nunique()
num_items = df_movies["movieId"].nunique()

# Training data comes from the project dataset class, which receives the same
# feature mappings and bucket count used for the val/test cross features.
train_loader = DataLoader(
    trainDatasetWithCrossFeatures(
        df_train,
        num_items,
        user_features_mapping,
        movie_features_mapping,
        hash_bucket_size=cross_feats_dim,
    ),
    batch_size=2048,
    shuffle=True,
)

# Id columns are converted through NumPy before torch.tensor, as is done for
# the popularity counts: the values are taken in row order, independent of the
# frames' index labels, which keeps them aligned with the positional
# cross_val / cross_test rows.
val_loader = DataLoader(
    TensorDataset(
        torch.tensor(df_val["userId"].to_numpy()),
        torch.tensor(df_val["movieId"].to_numpy()),
        cross_val,
    ),
    batch_size=4096,
    shuffle=False,
)
test_loader = DataLoader(
    TensorDataset(
        torch.tensor(df_test["userId"].to_numpy()),
        torch.tensor(df_test["movieId"].to_numpy()),
        cross_test,
    ),
    batch_size=4096,
    shuffle=False,
)

  0%|          | 0/994175 [00:00<?, ?it/s]

IOStream.flush timed out
IOStream.flush timed out


  0%|          | 0/4970875 [00:00<?, ?it/s]

**Best hyperparameter combination after tuning:**

Trial 4 finished with value: 0.6230684518814087 and parameters: {

'n_mlp_layers': 9, 

'mlp_layers_dim': 400.0, 

'mlp_kwargs_dropout': False, 

'mlp_kwargs_batchnorm': False, 

'mlp_kwargs_dropout_rate': 0.10651146716504352, 

'lr': 0.0027311465340585516, 

'n_epochs': 7.0

}

In [27]:
seed_everything(RANDOM_STATE)
wd = wideAndDeep(
    num_users,
    num_items,
    cross_feats_dim=cross_feats_dim,
    n_mlp_layers=9,
    mlp_layers_dim=400,
    mlp_kwargs={
        "activation": True,
        "dropout": False,
        "batchnorm": False,
        "dropout_rate": 0.11,
    },
)
display(wd)

wideAndDeep(
  (user_embedding): Embedding(6041, 200)
  (item_embedding): Embedding(3883, 200)
  (mlp): Sequential(
    (MLP_layer_0): MLP(
      (block): Sequential(
        (Linear): Linear(in_features=400, out_features=400, bias=True)
        (Activation): ReLU()
      )
    )
    (MLP_layer_1): MLP(
      (block): Sequential(
        (Linear): Linear(in_features=400, out_features=400, bias=True)
        (Activation): ReLU()
      )
    )
    (MLP_layer_2): MLP(
      (block): Sequential(
        (Linear): Linear(in_features=400, out_features=400, bias=True)
        (Activation): ReLU()
      )
    )
    (MLP_layer_3): MLP(
      (block): Sequential(
        (Linear): Linear(in_features=400, out_features=400, bias=True)
        (Activation): ReLU()
      )
    )
    (MLP_layer_4): MLP(
      (block): Sequential(
        (Linear): Linear(in_features=400, out_features=400, bias=True)
        (Activation): ReLU()
      )
    )
    (MLP_layer_5): MLP(
      (block): Sequential(
        

In [28]:
# Training configuration: epoch count and learning rate follow the tuned trial
# reported above (lr rounded to 0.0027).
n_epochs = 7
# Not passed to any call in the cells shown here.
device = "cpu"

criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(params=wd.parameters(), lr=0.0027)
# Multiplies the learning rate by 0.8 on every scheduler step.
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=0.8)

In [29]:
%%time
# Fit the model for n_epochs. The validation loader/frame plus cos_dist and
# popularity are passed in as well; with verbose=True the output below shows
# the train loss and a metrics report after every epoch.
train(
    wd,
    train_loader,
    optimizer,
    scheduler,
    criterion,
    n_epochs,
    val_loader=val_loader,
    df_val=df_val,
    cos_dist=cos_dist,
    popularity=popularity,
    verbose=True,
)

Epochs:   0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 0
Train loss: 0.35754


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.5798),
 'NDCG @ 15': tensor(0.2867),
 'Diversity (ILD)': tensor(0.1418),
 'Novelty (EPC)': tensor(0.6940)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 1
Train loss: 0.32558


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.6443),
 'NDCG @ 15': tensor(0.3111),
 'Diversity (ILD)': tensor(0.1305),
 'Novelty (EPC)': tensor(0.7245)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 2
Train loss: 0.30056


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.6625),
 'NDCG @ 15': tensor(0.3270),
 'Diversity (ILD)': tensor(0.1254),
 'Novelty (EPC)': tensor(0.7396)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 3
Train loss: 0.28165


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.6642),
 'NDCG @ 15': tensor(0.3356),
 'Diversity (ILD)': tensor(0.1205),
 'Novelty (EPC)': tensor(0.7521)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 4
Train loss: 0.26622


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.6807),
 'NDCG @ 15': tensor(0.3456),
 'Diversity (ILD)': tensor(0.1181),
 'Novelty (EPC)': tensor(0.7575)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 5
Train loss: 0.25339


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.6956),
 'NDCG @ 15': tensor(0.3498),
 'Diversity (ILD)': tensor(0.1150),
 'Novelty (EPC)': tensor(0.7644)}


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 6
Train loss: 0.2423


Inference:   0%|          | 0/30 [00:00<?, ?batch/s]

{'Hit rate @ 15': tensor(0.6898),
 'NDCG @ 15': tensor(0.3470),
 'Diversity (ILD)': tensor(0.1135),
 'Novelty (EPC)': tensor(0.7686)}
CPU times: user 43min 31s, sys: 1h 52min 1s, total: 2h 35min 32s
Wall time: 17min 12s


In [30]:
# Score the held-out test rows and store the scores next to them.
test_scores = predict(wd, test_loader, verbose=False)
df_test["pred"] = test_scores.numpy()

# Regroup the frame into the pieces the report expects, then compute the
# metrics at k=15 (the report is the cell's displayed value).
pred, target, pred_items = split_test_df(
    df_test, "userId", "movieId", "pred", "action"
)
reccomendation_report(pred, target, pred_items, cos_dist, popularity, k=15)

{'Hit rate @ 15': tensor(0.7121),
 'NDCG @ 15': tensor(0.3656),
 'Diversity (ILD)': tensor(0.1132),
 'Novelty (EPC)': tensor(0.7665)}

## Test user recommendations

In [31]:
# Candidate rows for the synthetic test user.
# NOTE(review): the exact contents/index of this frame come from
# add_not_watched_movies in src.utils — confirm there.
df_test_user = add_not_watched_movies(new_user_id, df_test, df_train, df_movies)

# cross features for test user
test_user_keys = pd.Series(
    [
        user_features_mapping[user_id] + movie_features_mapping[movie_id]
        for user_id, movie_id in zip(df_test_user["userId"], df_test_user["movieId"])
    ]
)
test_idx = test_user_keys.apply(lambda key: mmh3.hash(key) % cross_feats_dim).values
cross_test_user = torch.zeros(df_test_user.shape[0], cross_feats_dim)
cross_test_user[torch.arange(cross_test_user.shape[0]), test_idx] = 1

# Id columns go through NumPy before torch.tensor so the values are taken in
# row order regardless of the frame's index labels, keeping them aligned with
# the positionally built cross_test_user rows.
test_user_loader = DataLoader(
    TensorDataset(
        torch.tensor(df_test_user["userId"].to_numpy()),
        torch.tensor(df_test_user["movieId"].to_numpy()),
        cross_test_user,
    ),
    batch_size=4096,
    shuffle=False,
)

In [32]:
# Score every candidate row for the test user, then render the recommendation
# table (the output below lists rows in descending score order).
df_test_user["pred"] = predict(wd, test_user_loader, verbose=False).numpy()
display(create_test_user_display_df(df_test_user, df_movies, "pred"))

Unnamed: 0,userId,movieId,name,genre,pred
0,6040,10,"American President, The (1995)",Comedy|Drama|Romance,0.980934
1,6040,1720,"Wedding Singer, The (1998)",Comedy|Romance,0.954767
2,6040,2252,Pleasantville (1998),Comedy,0.923652
3,6040,582,Home Alone (1990),Children's|Comedy,0.886759
4,6040,591,Beauty and the Beast (1991),Animation|Children's|Musical,0.878612
5,6040,139,"Birdcage, The (1996)",Comedy,0.861535
6,6040,1854,There's Something About Mary (1998),Comedy,0.853699
7,6040,2075,Sixteen Candles (1984),Comedy,0.842035
8,6040,558,Welcome to the Dollhouse (1995),Comedy|Drama,0.83691
9,6040,464,"Englishman Who Went Up a Hill, But Came Down a...",Comedy|Romance,0.831498
