In [1]:
## DO NOT ERASE THIS. IMPORTANT TO CORRECTLY IMPORT MODULES
# Puts the parent directory on the import path so that the `src` package used by
# the imports below can be found. The entry is relative, so it only resolves
# correctly when the kernel's working directory is this notebook's folder.
import sys

# Guarded so that re-running this cell does not stack duplicate entries.
if "../" not in sys.path:
    sys.path.append("../")

# Last expression of the cell: shows which interpreter the kernel is running.
sys.executable

'/Users/kristina/Desktop/University/COURSE_WORK/RecSys_thesis/recsysvenv/bin/python3.12'

In [2]:
import pandas as pd

pd.set_option("display.max_rows", 102)
from tqdm.auto import tqdm

import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

from src.utils import (
    load_MovieLens,
    create_test_user,
    train_test_val_split,
    seed_everything,
    split_test_df,
    add_not_watched_movies,
    create_test_user_display_df,
    enrich_train_with_negatives,
)
from src.models import AutoInt
from src.metrics import reccomendation_report

## Constants

In [3]:
# Folder handed to load_MovieLens. The path is relative, so it is resolved
# against the kernel's working directory.
DATA_FOLDER = "../../data/ml-1m/"
# Seed shared by every seed_everything call and by train_test_val_split.
RANDOM_STATE = 7

In [4]:
# Seed once up front with the shared RANDOM_STATE. seed_everything comes from
# src.utils; presumably it seeds every RNG the notebook relies on — confirm there.
seed_everything(RANDOM_STATE)

## Data

In [5]:
# Load the users, movies and ratings tables from DATA_FOLDER.
df_users, df_movies, df_ratings = load_MovieLens(DATA_FOLDER)

### Creating a test_user

In [6]:
# Movie ids that make up the synthetic test user's watch history.
test_user_movie_ids = [6, 16, 3192, 1461, 827, 887, 593]

# create_test_user hands back the users and ratings tables together with the id
# of the newly created user.
# NOTE(review): df_users / df_ratings are overwritten here, so re-running this
# cell presumably adds another test user — restart and run all instead.
df_users, df_ratings, new_user_id = create_test_user(
    df_users, df_ratings, test_user_movie_ids
)

# Sanity check: the new user's ratings joined with the movie metadata.
test_user_ratings = df_ratings[df_ratings["userId"] == new_user_id]
print("Test user watch list:")
display(test_user_ratings.merge(df_movies, on="movieId"))

Test user watch list:


Unnamed: 0,userId,movieId,rating,timestamp,name,genre
0,6040,6,5,0,Sabrina (1995),Comedy|Romance
1,6040,16,5,1,Sense and Sensibility (1995),Drama|Romance
2,6040,3192,5,2,Singles (1992),Comedy|Drama|Romance
3,6040,1461,5,3,Love and Other Catastrophes (1996),Romance
4,6040,827,5,4,Emma (1996),Comedy|Drama|Romance
5,6040,887,5,5,Singin' in the Rain (1952),Musical|Romance
6,6040,593,5,6,Pretty Woman (1990),Comedy|Romance


### Train-test split
Methodology: each user's last interaction is held out as a test item, and all of their earlier interactions form the training set. 20% of the held-out part is then set aside as the validation set.

In [7]:
# Split the ratings into train / test / validation frames.
# RANDOM_STATE is passed through, presumably so that the test/validation
# partition is repeatable — confirm in src.utils.train_test_val_split.
df_train, df_test, df_val = train_test_val_split(df_ratings, df_movies, RANDOM_STATE)

Enriching test:   0%|          | 0/4832 [00:00<?, ?it/s]

Enriching val:   0%|          | 0/1209 [00:00<?, ?it/s]

### Enriching train with negatives

In [8]:
# Re-seed right before enriching; presumably the negative sampling is random and
# should not depend on how much randomness earlier cells consumed — confirm.
seed_everything(RANDOM_STATE)
# NOTE(review): df_train is overwritten with its enriched version, so running this
# cell twice would presumably enrich an already-enriched frame. Restart the kernel
# and run all cells instead of re-running this one.
df_train = enrich_train_with_negatives(df_train, df_movies)

  0%|          | 0/994175 [00:00<?, ?it/s]

### Loading additional data

In [9]:
# One-off generation of the cached similarity file, kept for provenance
# (computeCosineSimilarities is not among the names imported at the top):
# cos_dist = computeCosineSimilarities(df_train, "userId", "movieId", df_movies["movieId"].nunique())
#
# with open("../../data/cos_dist.pt", "wb") as f:
#     torch.save(cos_dist, f)

# NOTE(review): torch.load unpickles the file, so only load a cos_dist.pt that you
# produced yourself; consider weights_only=True if the installed torch supports it.
with open("../../data/cos_dist.pt", "rb") as cache_file:
    cos_dist = torch.load(cache_file)

# Count how many df_train rows reference each movie id, with at least one slot
# per distinct movie in df_movies.
# NOTE(review): df_train already contains the rows added by
# enrich_train_with_negatives, so those rows are counted too — confirm that is intended.
train_movie_ids = torch.tensor(df_train.movieId.values)
movie_counts = torch.bincount(
    train_movie_ids, minlength=df_movies["movieId"].nunique()
)
# Rescale by the largest count so the most frequent movie gets popularity 1.
popularity = movie_counts / movie_counts.max()

### Building ordinal-encoded features and normalizing continuous features

In [10]:
def _fit_feature_table(transformer, source, feature_cols, id_col):
    """Fit ``transformer`` on ``source[feature_cols]`` and return the result keyed by ``id_col``.

    Args:
        transformer: Unfitted scikit-learn transformer (an OrdinalEncoder or a
            StandardScaler in this notebook). It is fitted in place.
        source: DataFrame holding both the raw feature columns and the id column.
        feature_cols: Names of the columns to transform.
        id_col: Name of the id column to carry over into the result.

    Returns:
        DataFrame with the transformed ``feature_cols`` plus ``id_col``, one row
        per row of ``source`` and in the same order.
    """
    features = pd.DataFrame(
        transformer.fit_transform(source[feature_cols]),
        columns=transformer.feature_names_in_,
    )
    # `features` carries a fresh 0..n-1 index, so the ids are attached by position.
    # Assigning the Series itself would align on index labels and silently
    # mismatch (or NaN) ids whenever `source` does not have a default index.
    return features.assign(**{id_col: source[id_col].to_numpy()})


def _with_features(interactions, target_col="action"):
    """Join user and movie features onto the id/target columns of ``interactions``.

    Reads the module-level ``user_cat``, ``user_num`` and ``movie_cat`` tables.

    Args:
        interactions: DataFrame with ``userId``, ``movieId`` and ``target_col``.
        target_col: Name of the target column in ``interactions``; it is exposed
            as ``action`` in the result.

    Returns:
        New DataFrame with columns userId, movieId, action followed by the
        feature columns. The merges use pandas' default inner join, so rows whose
        id is missing from a feature table are dropped.
    """
    return (
        interactions[["userId", "movieId", target_col]]
        .merge(user_cat, on="userId")
        .merge(user_num, on="userId")
        .merge(movie_cat, on="movieId")
        .rename(columns={target_col: "action"})
    )


# Categorical user features, ordinal-encoded.
ord_user = OrdinalEncoder()
user_cat = _fit_feature_table(ord_user, df_users, ["gender", "occupation"], "userId")

# Categorical movie feature; the genre string is encoded as a single category.
ord_movie = OrdinalEncoder()
movie_cat = _fit_feature_table(ord_movie, df_movies, ["genre"], "movieId")

# Continuous user feature, standardized.
ss_user = StandardScaler()
user_num = _fit_feature_table(ss_user, df_users, ["age"], "userId")

# The training frame names its target "label" while val/test use "action".
# Detecting the column keeps this cell re-runnable after df_train was converted.
train_target_col = "label" if "label" in df_train.columns else "action"
df_train = _with_features(df_train, target_col=train_target_col)
df_val = _with_features(df_val)
df_test = _with_features(df_test)

## AutoInt

In [11]:
# Re-seed immediately before the model is built.
# NOTE(review): the training log prints "Seed set to 42", so the training stack
# appears to apply its own seed rather than RANDOM_STATE — confirm in src.models.AutoInt.
seed_everything(RANDOM_STATE)

# Hyper-parameters gathered in one place so they are easy to tweak and report.
autoint_params = dict(
    task="classification",
    target=["action"],
    learning_rate=0.0005,
    head="LinearHead",
    max_epochs=10,
    num_attn_blocks=3,
    layers="32-32-32-32",
)
autoint = AutoInt(**autoint_params)

In [None]:
%%time
# Train on the feature-enriched training frame; %%time reports how long the fit takes.
# NOTE(review): the captured log shows "GPU available: True (mps), used: False" and
# 10 distributed processes — check the trainer settings in src.models.AutoInt if
# that is not intended.
autoint.fit(df_train)

Seed set to 42


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)


GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/kristina/Desktop/University/COURSE_WORK/RecSys_thesis/recsysvenv/lib/python3.12/site-packages/pytorch_lightning/trainer/setup.py:187: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.


[rank: 0] Seed set to 42
[rank: 1] Seed set to 42
[rank: 2] Seed set to 42
[rank: 3] Seed set to 42
Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/10
Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/10
Initializing distributed: GLOBAL_RANK: 2, MEMBER: 3/10
[rank: 4] Seed set to 42
Initializing distributed: GLOBAL_RANK: 3, MEMBER: 4/10
[rank: 6] Seed set to 42
[rank: 5] Seed set to 42
Initializing distributed: GLOBAL_RANK: 4, MEMBER: 5/10
Initializing distributed: GLOBAL_RANK: 5, MEMBER: 6/10
Initializing distributed: GLOBAL_RANK: 6, MEMBER: 7/10
[rank: 7] Seed set to 42
Initializing distributed: GLOBAL_RANK: 7, MEMBER: 8/10
[rank: 8] Seed set to 42
Initializing distributed: GLOBAL_RANK: 8, MEMBER: 9/10
[rank: 9] Seed set to 42
Initializing distributed: GLOBAL_RANK: 9, MEMBER: 10/10
----------------------------------------------------------------------------------------------------
distributed_backend=gloo
All distributed processes registered. Starting with 10 processes
--------

Output()

In [None]:
# Score every test row and store the scores as a new "pred" column of df_test.
# NOTE(review): this mutates df_test in place, so on a re-run the frame passed to
# predict already contains "pred" — confirm the model ignores extra columns.
df_test["pred"] = autoint.predict(df_test)
# Rearrange the scored frame into the three structures the report consumes
# (their exact form is defined by src.utils.split_test_df).
pred, target, pred_items = split_test_df(df_test, "userId", "movieId", "pred", "action")
# Report at cut-off k=15. cos_dist and popularity are passed along, presumably for
# similarity- and popularity-based metrics — confirm in src.metrics.
reccomendation_report(pred, target, pred_items, cos_dist, popularity, k=15)

## Test user recommendations

In [None]:
# Candidate rows for the synthetic test user, built by add_not_watched_movies
# (presumably the movies the user has not interacted with — confirm in src.utils).
test_user_candidates = add_not_watched_movies(new_user_id, df_test, df_train, df_movies)

# Attach the same user/movie feature tables used for the train, val and test frames.
df_test_user = test_user_candidates[["userId", "movieId", "action"]]
df_test_user = df_test_user.merge(user_cat, on="userId")
df_test_user = df_test_user.merge(user_num, on="userId")
df_test_user = df_test_user.merge(movie_cat, on="movieId")

# Score every candidate with the trained model.
df_test_user["pred"] = autoint.predict(df_test_user)

In [None]:
# Build the presentation table from the scored candidates and the movie metadata,
# using "pred" as the score column, then render it.
test_user_scores = df_test_user[["userId", "movieId", "pred"]]
test_user_recommendations = create_test_user_display_df(
    test_user_scores, df_movies, "pred"
)
display(test_user_recommendations)