In [1]:
## DO NOT ERASE THIS. IMPORTANT TO CORRECTLY IMPORT MODULES
import sys

sys.path.append("../")
sys.executable

'/Users/kristina/Desktop/University/COURSE_WORK/RecSys_thesis/recsysvenv/bin/python3.12'

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# from scipy.sparse import csr_matrix
# from scipy.sparse.linalg import svds
from surprise import SVDpp, Reader, Dataset
from surprise.model_selection import cross_validate

from typing import Union
from tqdm.notebook import tqdm

import torch.nn as nn

# from torch.utils.data import Dataset, DataLoader
import torch

from src.metrics import reccomendation_report
from src.utils import computeCosineSimilarities
from src.utils import surprise_predict
from src.utils import split_test_df

## Constants

In [3]:
DATA_FOLDER = "../../data/ml-1m/"
RANDOM_STATE = 7

In [4]:
np.random.seed(RANDOM_STATE)

## Data

In [5]:
df_movies = pd.read_csv(
    DATA_FOLDER + "movies.csv",
    encoding="iso-8859-1",
    sep=";",
    names=["movieId", "name", "genre"],
)
df_ratings = pd.read_csv(
    DATA_FOLDER + "ratings.csv",
    encoding="iso-8859-1",
    sep=";",
    names=["userId", "movieId", "rating", "timestamp"],
)
df_users = pd.read_csv(
    DATA_FOLDER + "users.csv",
    encoding="iso-8859-1",
    sep=";",
    names=["userId", "gender", "age", "occupation", "zip-code"],
)

In [6]:
## Encode usedId, movieId
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

df_movies["movieId"] = movie_encoder.fit_transform(df_movies["movieId"])
df_users["userId"] = user_encoder.fit_transform(df_users["userId"])

df_ratings["movieId"] = movie_encoder.transform(df_ratings["movieId"])
df_ratings["userId"] = user_encoder.transform(df_ratings["userId"])

#### Creating a test_user

In [7]:
df_users.loc[-1] = [df_users["userId"].max() + 1, "F", 21, 5, 777777]
df_users = df_users.reset_index(drop=True)
df_test_user = pd.DataFrame(columns=["userId", "movieId", "rating", "timestamp"])
df_test_user = df_test_user.assign(movieId=[6, 16, 3192, 1461, 827, 887, 593]).assign(
    userId=df_users["userId"].max(), rating=4, timestamp=lambda x: np.arange(x.shape[0])
)
print("Test user watch list:")
display(df_test_user.merge(df_movies, on="movieId"))
df_ratings = pd.concat([df_ratings, df_test_user], ignore_index=True)

Test user watch list:


Unnamed: 0,userId,movieId,rating,timestamp,name,genre
0,6040,6,4,0,Sabrina (1995),Comedy|Romance
1,6040,16,4,1,Sense and Sensibility (1995),Drama|Romance
2,6040,3192,4,2,Singles (1992),Comedy|Drama|Romance
3,6040,1461,4,3,Love and Other Catastrophes (1996),Romance
4,6040,827,4,4,Emma (1996),Comedy|Drama|Romance
5,6040,887,4,5,Singin' in the Rain (1952),Musical|Romance
6,6040,593,4,6,Pretty Woman (1990),Comedy|Romance


### Train-test split
Methodology: Last user interaction is a test item. The rest is train. Validation part is 20% of test.

In [41]:
df_ratings["rank"] = (
    df_ratings[["userId", "timestamp"]]
    .groupby("userId", as_index=False)["timestamp"]
    .rank(method="first", ascending=False)
)

In [45]:
# leave one out
df_train = df_ratings.loc[df_ratings["rank"] != 1].reset_index(drop=True)
df_test = (
    df_ratings.loc[df_ratings["rank"] == 1].reset_index(drop=True).assign(action=1)
)
df_test, df_val = train_test_split(df_test, test_size=0.2, random_state=RANDOM_STATE)

In [46]:
# enrich test data with 100 random movies from the ones not intercated by user
df_add = pd.DataFrame()
for user in tqdm(df_test.userId.unique(), desc="Enriching test"):
    movie = df_test.loc[df_test.userId == user, "movieId"]
    watched_movies = np.append(
        movie, df_train.loc[df_train.userId == user, "movieId"].values
    )
    not_wathed_movies = np.setdiff1d(
        np.arange(df_movies["movieId"].max() + 1), watched_movies
    )
    random_100 = np.random.choice(not_wathed_movies, 100, replace=False)

    df_temp = pd.DataFrame().assign(movieId=random_100, userId=user, action=0)
    df_add = pd.concat([df_add, df_temp], ignore_index=True)

df_test = pd.concat([df_test, df_add], ignore_index=True).drop(
    columns=["timestamp", "rating", "rank"]
)

df_add = pd.DataFrame()
for user in tqdm(df_val.userId.unique(), desc="Enriching val"):
    movie = df_val.loc[df_val.userId == user, "movieId"]
    watched_movies = np.append(
        movie, df_train.loc[df_train.userId == user, "movieId"].values
    )
    not_wathed_movies = np.setdiff1d(
        np.arange(df_movies["movieId"].max() + 1), watched_movies
    )
    random_100 = np.random.choice(not_wathed_movies, 100, replace=False)

    df_temp = pd.DataFrame().assign(movieId=random_100, userId=user, action=0)
    df_add = pd.concat([df_add, df_temp], ignore_index=True)

df_val = pd.concat([df_val, df_add], ignore_index=True).drop(
    columns=["timestamp", "rating", "rank"]
)

# adding data for test user
df_add = pd.DataFrame()
user = df_users["userId"].max()
movie = df_test.loc[df_test.userId == user, "movieId"]
watched_movies = np.append(
    movie, df_train.loc[df_train.userId == user, "movieId"].values
)
not_wathed_movies = np.setdiff1d(
    np.arange(df_movies["movieId"].max() + 1), watched_movies
)
random_500 = np.random.choice(not_wathed_movies, 500, replace=False)

df_temp = pd.DataFrame().assign(movieId=random_500, userId=user, action=0)
df_add = pd.concat([df_add, df_temp], ignore_index=True)
df_test_user = df_add

Enriching test:   0%|          | 0/4832 [00:00<?, ?it/s]

Enriching val:   0%|          | 0/1209 [00:00<?, ?it/s]

In [11]:
# cos_dist = computeCosineSimilarities(df_train, "userId", "movieId", df_movies["movieId"].nunique())

# with open("../../data/cos_dist.pt", "wb") as f:
#     torch.save(cos_dist, f)

In [13]:
with open("../../data/cos_dist.pt", "rb") as f:
    cos_dist = torch.load(f)

In [14]:
popularity = torch.bincount(
    torch.tensor(df_train.movieId.values), minlength=df_movies["movieId"].nunique()
)
popularity = popularity / torch.max(popularity)

## SVD++

In [15]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(
    df_ratings[["userId", "movieId", "rating"]], reader
).build_full_trainset()

In [16]:
%%time
svdpp = SVDpp(random_state=RANDOM_STATE, verbose=True, cache_ratings=True)
svdpp.fit(data)

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
CPU times: user 2min 47s, sys: 386 ms, total: 2min 48s
Wall time: 2min 48s


<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x28c9b1070>

In [17]:
%%time
df_test["rating_pred"] = surprise_predict(
    svdpp, df_test, usercol="userId", itemcol="movieId", predcol="rating"
)["rating"].values

CPU times: user 39.3 s, sys: 853 ms, total: 40.1 s
Wall time: 39.4 s


In [18]:
pred, target, pred_items = split_test_df(
    df_test, "userId", "movieId", "rating_pred", "action"
)

In [19]:
reccomendation_report(pred, target, pred_items, cos_dist, popularity, k=15)

{'Hit rate @ K': tensor(0.3721),
 'NDCG @ K': tensor(0.1810),
 'Diversity (ILD)': tensor(0.0725),
 'Novelty (EPC)': tensor(0.8483)}

## Test user recommendations

In [30]:
pd.set_option("display.max_rows", 102)

In [47]:
df_test_user["rating_pred"] = surprise_predict(
    svdpp, df_test_user, usercol="userId", itemcol="movieId", predcol="rating"
)["rating"].values

In [51]:
display(
    df_test_user.loc[df_test_user.userId == df_test_user["userId"].max()]
    .sort_values(by="rating_pred", ascending=False)
    .merge(df_movies, on="movieId")
    .loc[:100, ["userId", "movieId", "name", "genre"]]
)

Unnamed: 0,userId,movieId,name,genre
0,6040,315,"Shawshank Redemption, The (1994)",Drama
1,6040,2836,Sanjuro (1962),Action|Adventure
2,6040,1132,"Wrong Trousers, The (1993)",Animation|Comedy
3,6040,3238,City Lights (1931),Comedy|Drama|Romance
4,6040,1656,Good Will Hunting (1997),Drama
5,6040,893,It Happened One Night (1934),Comedy
6,6040,1249,Arsenic and Old Lace (1944),Comedy|Mystery|Thriller
7,6040,2953,"General, The (1927)",Comedy
8,6040,2789,American Beauty (1999),Comedy|Drama
9,6040,1258,Young Frankenstein (1974),Comedy|Horror


In [53]:
display(
    df_test_user.loc[df_test_user.userId == df_test_user["userId"].max()]
    .sort_values(by="rating_pred", ascending=False)
    .merge(df_movies, on="movieId")
    .loc[400:, ["userId", "movieId", "name", "genre"]]
)

Unnamed: 0,userId,movieId,name,genre
400,6040,980,"Rich Man's Wife, The (1996)",Thriller
401,6040,175,Lord of Illusions (1995),Horror
402,6040,3319,Harry and the Hendersons (1987),Comedy
403,6040,3695,F/X 2 (1992),Action|Crime|Thriller
404,6040,2126,Dirty Work (1998),Comedy
405,6040,3431,Mr. Saturday Night (1992),Comedy|Drama
406,6040,1505,"Lost World: Jurassic Park, The (1997)",Action|Adventure|Sci-Fi|Thriller
407,6040,2723,Airplane II: The Sequel (1982),Comedy
408,6040,1409,"Machine, The (1994)",Comedy|Horror
409,6040,1398,Murder at 1600 (1997),Mystery|Thriller
