In [1]:
## DO NOT ERASE THIS. IMPORTANT TO CORRECTLY IMPORT MODULES
import sys

sys.path.append("../")
sys.executable

'/Users/kristina/Desktop/University/COURSE_WORK/RecSys_thesis/recsysvenv/bin/python3.12'

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# from scipy.sparse import csr_matrix
# from scipy.sparse.linalg import svds
# from surprise import SVDpp, Reader, Dataset
# from surprise.model_selection import cross_validate

from typing import Union
from tqdm.notebook import tqdm

import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch

from src.metrics import reccomendation_report
from src.utils import surprise_predict
from src.utils import split_test_df
from src.utils import seed_everything
from src.utils import trainDataset
from src.utils import trainDatasetWithNumCatFeatures

from src.models import DeepFM, DeepFMImp

import mmh3

## Constants

In [3]:
DATA_FOLDER = "../../data/ml-1m/"
RANDOM_STATE = 7

In [4]:
seed_everything(RANDOM_STATE)

## Data

In [5]:
df_movies = pd.read_csv(
    DATA_FOLDER + "movies.csv",
    encoding="iso-8859-1",
    sep=";",
    names=["movieId", "name", "genre"],
)
df_ratings = pd.read_csv(
    DATA_FOLDER + "ratings.csv",
    encoding="iso-8859-1",
    sep=";",
    names=["userId", "movieId", "rating", "timestamp"],
)
df_users = pd.read_csv(
    DATA_FOLDER + "users.csv",
    encoding="iso-8859-1",
    sep=";",
    names=["userId", "gender", "age", "occupation", "zip-code"],
)

In [6]:
## Encode usedId, movieId
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

df_movies["movieId"] = movie_encoder.fit_transform(df_movies["movieId"])
df_users["userId"] = user_encoder.fit_transform(df_users["userId"])

df_ratings["movieId"] = movie_encoder.transform(df_ratings["movieId"])
df_ratings["userId"] = user_encoder.transform(df_ratings["userId"])

#### Creating a test_user

In [7]:
df_users.loc[-1] = [df_users["userId"].max() + 1, "F", 21, 5, 777777]
df_users = df_users.reset_index(drop=True)
df_test_user = pd.DataFrame(columns=["userId", "movieId", "rating", "timestamp"])
df_test_user = df_test_user.assign(movieId=[6, 16, 3192, 1461, 827, 887, 593]).assign(
    userId=df_users["userId"].max(), rating=4, timestamp=lambda x: np.arange(x.shape[0])
)
print("Test user watch list:")
display(df_test_user.merge(df_movies, on="movieId"))
df_ratings = pd.concat([df_ratings, df_test_user], ignore_index=True)

Test user watch list:


Unnamed: 0,userId,movieId,rating,timestamp,name,genre
0,6040,6,4,0,Sabrina (1995),Comedy|Romance
1,6040,16,4,1,Sense and Sensibility (1995),Drama|Romance
2,6040,3192,4,2,Singles (1992),Comedy|Drama|Romance
3,6040,1461,4,3,Love and Other Catastrophes (1996),Romance
4,6040,827,4,4,Emma (1996),Comedy|Drama|Romance
5,6040,887,4,5,Singin' in the Rain (1952),Musical|Romance
6,6040,593,4,6,Pretty Woman (1990),Comedy|Romance


### Train-test split
Methodology: Last user interaction is a test item. The rest is train. Validation part is 10% of test.

In [8]:
df_ratings["rank"] = (
    df_ratings[["userId", "timestamp"]]
    .groupby("userId", as_index=False)["timestamp"]
    .rank(method="first", ascending=False)
)

In [9]:
# leave one out
df_train = df_ratings.loc[df_ratings["rank"] != 1].reset_index(drop=True)
df_test = (
    df_ratings.loc[df_ratings["rank"] == 1].reset_index(drop=True).assign(action=1)
)
df_test, df_val = train_test_split(df_test, test_size=0.2, random_state=RANDOM_STATE)

In [10]:
# enrich test data with 100 random movies from the ones not intercated by user
df_add = pd.DataFrame()
for user in tqdm(df_test.userId.unique(), desc="Enriching test"):
    movie = df_test.loc[df_test.userId == user, "movieId"]
    watched_movies = np.append(
        movie, df_train.loc[df_train.userId == user, "movieId"].values
    )
    not_wathed_movies = np.setdiff1d(
        np.arange(df_movies["movieId"].max() + 1), watched_movies
    )
    random_100 = np.random.choice(not_wathed_movies, 100, replace=False)

    df_temp = pd.DataFrame().assign(movieId=random_100, userId=user, action=0)
    df_add = pd.concat([df_add, df_temp], ignore_index=True)

df_test = pd.concat([df_test, df_add], ignore_index=True).drop(
    columns=["timestamp", "rating", "rank"]
)

df_add = pd.DataFrame()
for user in tqdm(df_val.userId.unique(), desc="Enriching val"):
    movie = df_val.loc[df_val.userId == user, "movieId"]
    watched_movies = np.append(
        movie, df_train.loc[df_train.userId == user, "movieId"].values
    )
    not_wathed_movies = np.setdiff1d(
        np.arange(df_movies["movieId"].max() + 1), watched_movies
    )
    random_100 = np.random.choice(not_wathed_movies, 100, replace=False)

    df_temp = pd.DataFrame().assign(movieId=random_100, userId=user, action=0)
    df_add = pd.concat([df_add, df_temp], ignore_index=True)

df_val = pd.concat([df_val, df_add], ignore_index=True).drop(
    columns=["timestamp", "rating", "rank"]
)

# adding data for test user
df_add = pd.DataFrame()
user = df_users["userId"].max()
movie = df_test.loc[df_test.userId == user, "movieId"]
watched_movies = np.append(
    movie, df_train.loc[df_train.userId == user, "movieId"].values
)
not_wathed_movies = np.setdiff1d(
    np.arange(df_movies["movieId"].max() + 1), watched_movies
)
random_500 = np.random.choice(not_wathed_movies, 500, replace=False)

df_temp = pd.DataFrame().assign(movieId=random_500, userId=user, action=0)
df_add = pd.concat([df_add, df_temp], ignore_index=True)
df_test_user = df_add

Enriching test:   0%|          | 0/4832 [00:00<?, ?it/s]

Enriching val:   0%|          | 0/1209 [00:00<?, ?it/s]

In [20]:
with open("../../data/cos_dist.pt", "rb") as f:
    cos_dist = torch.load(f)
popularity = torch.bincount(
    torch.tensor(df_train.movieId.values), minlength=df_movies["movieId"].nunique()
)
popularity = popularity / torch.max(popularity)

## Building ordinal encoded features and normalizing continious features

In [11]:
ord_user = OrdinalEncoder()
user_cat = torch.tensor(ord_user.fit_transform(df_users[["gender", "occupation"]]))

ord_movie = OrdinalEncoder()
movie_cat = torch.tensor(ord_movie.fit_transform(df_movies[["genre"]]))

ss_user = StandardScaler()
user_num = torch.tensor(ss_user.fit_transform(df_users[["age"]]))

In [12]:
test_user_cat = user_cat[df_test["userId"].values].clone().detach().to(torch.long)
test_user_num = user_num[df_test["userId"].values].clone().detach().to(torch.float)
test_movie_cat = movie_cat[df_test["movieId"].values].clone().detach().to(torch.long)
test_cat = torch.hstack((test_user_cat, test_movie_cat))

### DeepFM

In [13]:
seed_everything(RANDOM_STATE)
train_loader = DataLoader(
    trainDatasetWithNumCatFeatures(
        df_train, df_movies["movieId"].nunique(), user_cat, user_num, movie_cat
    ),
    batch_size=2048,
    shuffle=True,
)

num_users = df_users["userId"].nunique()
num_items = df_movies["movieId"].nunique()
num_numeric_feats = 1
cat_feature_vocab = [len(i) for i in ord_user.categories_] + [
    len(i) for i in ord_movie.categories_
]

100%|████████████████████████████████| 994175/994175 [00:27<00:00, 35889.22it/s]


In [14]:
seed_everything(RANDOM_STATE)
dF = DeepFM(num_users, num_items, num_numeric_feats, cat_feature_vocab)
display(dF)

DeepFM(
  (user_embedding): Embedding(6041, 5)
  (item_embedding): Embedding(3883, 5)
  (numerical_embeddings): ModuleList(
    (0): Linear(in_features=1, out_features=5, bias=True)
  )
  (categorical_embeddings): ModuleList(
    (0): Embedding(2, 5)
    (1): Embedding(21, 5)
    (2): Embedding(301, 5)
  )
  (mlp): Sequential(
    (MLP_layer_0): Linear(in_features=30, out_features=16, bias=True)
    (Activation_layer_0): ReLU()
    (MLP_layer_1): Linear(in_features=16, out_features=32, bias=True)
    (Activation_layer_1): ReLU()
    (MLP_layer_2): Linear(in_features=32, out_features=64, bias=True)
    (Activation_layer_2): ReLU()
    (MLP_layer_3): Linear(in_features=64, out_features=1, bias=True)
    (Activation_layer_3): ReLU()
  )
  (fm_linear): Linear(in_features=325, out_features=1, bias=True)
  (final_sigmoid): Sigmoid()
)

In [15]:
optimizer = torch.optim.Adam(dF.parameters(), lr=3e-3)
# scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=0.0003, max_lr=0.001, step_size_up=100, mode="triangular2", cycle_momentum=False)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, T_max=500, eta_min=1e-4
)
criterion = nn.BCELoss()
n_epochs = 10
device = "cpu"

In [16]:
### Train DeepFM
dF.to(device)
num_iterations = len(train_loader)

for epoch in tqdm(range(n_epochs), desc="Epochs"):
    # train
    total_train_loss = 0
    dF.train()
    with tqdm(train_loader, unit="batch") as tepoch:
        for userIds, movieIds, num_feats, cat_feats, ratings in tepoch:
            pred_train = dF(
                userIds.to(device),
                movieIds.to(device),
                num_feats.to(device),
                cat_feats.to(device),
            )
            loss_train = criterion(pred_train.flatten(), ratings.to(device))

            optimizer.zero_grad()
            loss_train.backward()
            optimizer.step()
            scheduler.step()
            total_train_loss += loss_train.item()
            tepoch.set_postfix(
                loss=loss_train.item(), lr=round(scheduler.get_last_lr()[0], 7)
            )
    print("Epoch:", epoch)
    print("Train loss", round(total_train_loss / num_iterations, 5))

Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 0
Train loss 0.81371


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 1
Train loss 0.43739


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 2
Train loss 0.3684


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 3
Train loss 0.34411


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 4
Train loss 0.32969


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 5
Train loss 0.31831


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 6
Train loss 0.3101


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 7
Train loss 0.30372


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 8
Train loss 0.29894


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 9
Train loss 0.29595


In [17]:
test_loader = DataLoader(
    TensorDataset(
        torch.tensor(df_test["userId"]),
        torch.tensor(df_test["movieId"]),
        test_user_num,
        test_cat,
    ),
    batch_size=4096,
    shuffle=False,
)

In [18]:
# predict dF
dF.eval()
total_preds = torch.zeros(len(test_loader.dataset))
batch_size = test_loader.batch_size
for i, (userIds, movieIds, num_feats, cat_feats) in enumerate(
    tqdm(test_loader, desc="Inference", unit="batch")
):
    with torch.no_grad():
        total_preds[i * batch_size : (i + 1) * batch_size] = dF(
            userIds.to(device),
            movieIds.to(device),
            num_feats.to(device),
            cat_feats.to(device),
        ).flatten()

Inference:   0%|          | 0/120 [00:00<?, ?batch/s]

In [21]:
df_test["rating_pred"] = total_preds.numpy()
pred, target, pred_items = split_test_df(
    df_test, "userId", "movieId", "rating_pred", "action"
)
reccomendation_report(pred, target, pred_items, cos_dist, popularity, k=15)

{'Hit rate @ K': tensor(0.6697),
 'NDCG @ K': tensor(0.3392),
 'Diversity (ILD)': tensor(0.1269),
 'Novelty (EPC)': tensor(0.7391)}

### DeepFMImp

In [22]:
seed_everything(RANDOM_STATE)
train_loader = DataLoader(
    trainDatasetWithNumCatFeatures(
        df_train, df_movies["movieId"].nunique(), user_cat, user_num, movie_cat
    ),
    batch_size=2048,
    shuffle=True,
)

num_users = df_users["userId"].nunique()
num_items = df_movies["movieId"].nunique()
num_numeric_feats = 1
cat_feature_vocab = [len(i) for i in ord_user.categories_] + [
    len(i) for i in ord_movie.categories_
]

100%|████████████████████████████████| 994175/994175 [00:27<00:00, 36198.93it/s]


In [23]:
seed_everything(RANDOM_STATE)
dFI = DeepFMImp(num_users, num_items, num_numeric_feats, cat_feature_vocab)
display(dFI)

DeepFMImp(
  (user_embedding): Embedding(6041, 5)
  (item_embedding): Embedding(3883, 5)
  (numerical_embeddings): ModuleList(
    (0): Linear(in_features=1, out_features=5, bias=True)
  )
  (categorical_embeddings): ModuleList(
    (0): Embedding(2, 5)
    (1): Embedding(21, 5)
    (2): Embedding(301, 5)
  )
  (mlp): Sequential(
    (MLP_layer_0): Linear(in_features=30, out_features=16, bias=True)
    (Activation_layer_0): ReLU()
    (MLP_layer_1): Linear(in_features=16, out_features=32, bias=True)
    (Activation_layer_1): ReLU()
    (MLP_layer_2): Linear(in_features=32, out_features=64, bias=True)
    (Activation_layer_2): ReLU()
    (MLP_layer_3): Linear(in_features=64, out_features=1, bias=True)
    (Activation_layer_3): ReLU()
  )
  (fm_sparse): Linear(in_features=325, out_features=1, bias=True)
  (fm_linear): Linear(in_features=2, out_features=1, bias=True)
  (final_linear): Linear(in_features=2, out_features=1, bias=True)
  (final_sigmoid): Sigmoid()
)

In [24]:
optimizer = torch.optim.Adam(dFI.parameters(), lr=1e-3)
# scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=0.0003, max_lr=0.001, step_size_up=100, mode="triangular2", cycle_momentum=False)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, T_max=1000, eta_min=1e-4
)
criterion = nn.BCELoss()
n_epochs = 10
device = "cpu"

In [25]:
### Train DeepFM
dFI.to(device)
num_iterations = len(train_loader)

for epoch in tqdm(range(n_epochs), desc="Epochs"):
    # train
    total_train_loss = 0
    dFI.train()
    with tqdm(train_loader, unit="batch") as tepoch:
        for userIds, movieIds, num_feats, cat_feats, ratings in tepoch:
            pred_train = dFI(
                userIds.to(device),
                movieIds.to(device),
                num_feats.to(device),
                cat_feats.to(device),
                device,
            )
            loss_train = criterion(pred_train.flatten(), ratings.to(device))

            optimizer.zero_grad()
            loss_train.backward()
            optimizer.step()
            scheduler.step()
            total_train_loss += loss_train.item()
            tepoch.set_postfix(
                loss=loss_train.item(), lr=round(scheduler.get_last_lr()[0], 7)
            )
    print("Epoch:", epoch)
    print("Train loss", round(total_train_loss / num_iterations, 5))

Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 0
Train loss 0.50295


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 1
Train loss 0.43988


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 2
Train loss 0.40126


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 3
Train loss 0.3753


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 4
Train loss 0.36155


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 5
Train loss 0.35354


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 6
Train loss 0.34831


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 7
Train loss 0.34447


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 8
Train loss 0.34123


  0%|          | 0/2428 [00:00<?, ?batch/s]

Epoch: 9
Train loss 0.33824


In [26]:
test_loader = DataLoader(
    TensorDataset(
        torch.tensor(df_test["userId"]),
        torch.tensor(df_test["movieId"]),
        test_user_num,
        test_cat,
    ),
    batch_size=4096,
    shuffle=False,
)

In [27]:
# predict dF
dFI.eval()
total_preds = torch.zeros(len(test_loader.dataset))
batch_size = test_loader.batch_size
for i, (userIds, movieIds, num_feats, cat_feats) in enumerate(
    tqdm(test_loader, desc="Inference", unit="batch")
):
    with torch.no_grad():
        total_preds[i * batch_size : (i + 1) * batch_size] = dFI(
            userIds.to(device),
            movieIds.to(device),
            num_feats.to(device),
            cat_feats.to(device),
            device,
        ).flatten()

Inference:   0%|          | 0/120 [00:00<?, ?batch/s]

In [28]:
df_test["rating_pred"] = total_preds.numpy()
pred, target, pred_items = split_test_df(
    df_test, "userId", "movieId", "rating_pred", "action"
)
reccomendation_report(pred, target, pred_items, cos_dist, popularity, k=15)

{'Hit rate @ K': tensor(0.5770),
 'NDCG @ K': tensor(0.2841),
 'Diversity (ILD)': tensor(0.1395),
 'Novelty (EPC)': tensor(0.7054)}

## Test user recommendations

In [29]:
pd.set_option("display.max_rows", 102)

In [31]:
test_user_cat = user_cat[df_test_user["userId"].values].clone().detach().to(torch.long)
test_user_num = user_num[df_test_user["userId"].values].clone().detach().to(torch.float)
test_movie_cat = (
    movie_cat[df_test_user["movieId"].values].clone().detach().to(torch.long)
)
test_cat = torch.hstack((test_user_cat, test_movie_cat))

test_loader = DataLoader(
    TensorDataset(
        torch.tensor(df_test_user["userId"]),
        torch.tensor(df_test_user["movieId"]),
        test_user_num,
        test_cat,
    ),
    batch_size=4096,
    shuffle=False,
)

# predict dF
dF.eval()
total_preds = torch.zeros(len(test_loader.dataset))
batch_size = test_loader.batch_size
for i, (userIds, movieIds, num_feats, cat_feats) in enumerate(
    tqdm(test_loader, desc="Inference", unit="batch")
):
    with torch.no_grad():
        total_preds[i * batch_size : (i + 1) * batch_size] = dF(
            userIds.to(device),
            movieIds.to(device),
            num_feats.to(device),
            cat_feats.to(device),
        ).flatten()

df_test_user["rating_pred"] = total_preds.numpy()

display(
    df_test_user.loc[df_test_user.userId == df_test_user["userId"].max()]
    .sort_values(by="rating_pred", ascending=False)
    .merge(df_movies, on="movieId")
    .loc[:100, ["userId", "movieId", "name", "genre"]]
)

Inference:   0%|          | 0/1 [00:00<?, ?batch/s]

Unnamed: 0,userId,movieId,name,genre
0,6040,1227,"Graduate, The (1967)",Drama|Romance
1,6040,1372,Jerry Maguire (1996),Drama|Romance
2,6040,2016,101 Dalmatians (1961),Animation|Children's
3,6040,2728,Big (1988),Comedy|Fantasy
4,6040,2027,Sleeping Beauty (1959),Animation|Children's|Musical
5,6040,2692,"Iron Giant, The (1999)",Animation|Children's
6,6040,2401,Crocodile Dundee (1986),Adventure|Comedy
7,6040,2872,South Pacific (1958),Musical|Romance|War
8,6040,1946,"Absent Minded Professor, The (1961)",Children's|Comedy|Fantasy
9,6040,1000,"Parent Trap, The (1961)",Children's|Drama
