Load dataset

In [1]:
import pandas as pd

# Location of the joined ratings + movie metadata table, relative to this notebook.
RATINGS_PATH = "../data/processed/ratings_joined.parquet"

df = pd.read_parquet(RATINGS_PATH)

# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,movie_id,title,release_date,video_release_date,IMDB_URL,unknown,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196,242,3,881250949,242,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,...,0,0,0,0,0,0,0,0,0,0
1,186,302,3,891717742,302,L.A. Confidential (1997),01-Jan-1997,,http://us.imdb.com/M/title-exact?L%2EA%2E+Conf...,0,...,0,1,0,0,1,0,0,1,0,0
2,22,377,1,878887116,377,Heavyweights (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Heavyweights%...,0,...,0,0,0,0,0,0,0,0,0,0
3,244,51,2,880606923,51,Legends of the Fall (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Legends%20of%...,0,...,0,0,0,0,0,1,0,0,1,1
4,166,346,1,886397596,346,Jackie Brown (1997),01-Jan-1997,,http://us.imdb.com/M/title-exact?imdb-title-11...,0,...,0,0,0,0,0,0,0,0,0,0


- Prepare Train/Test Split
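- Make a per-user last-item (leave-one-out) split: each user's most recent rating is held out as the test item and all of their other ratings form the training set. This mimics predicting a user's next interaction and is cheap to compute.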

In [2]:
# Order ratings chronologically. A stable sort (mergesort) keeps the existing
# row order for ratings that share a timestamp, so which rating counts as a
# user's "last" does not depend on the sort implementation's arbitrary
# tie-breaking and does not change when this cell is re-run.
df = df.sort_values("timestamp", kind="mergesort")

# The last (most recent) rating of each user is test, the rest are train.
# A boolean mask is used instead of dropping by index label, so the split
# stays correct even if the index contains duplicate labels.
has_later_rating = df.duplicated(subset="user_id", keep="last")
test = df[~has_later_rating]
train = df[has_later_rating]

# Sanity checks: every rating lands in exactly one split, and each user
# contributes exactly one test row.
assert len(train) + len(test) == len(df)
assert test["user_id"].is_unique


In [3]:
# Row counts of the two splits, displayed as (n_train, n_test).
train.shape[0], test.shape[0]


(99057, 943)

- Baseline #1: Random Recommender
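- Discuss uplift: the random recommender is the floor against which the improvement (uplift) of every other model is measured.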

In [4]:
import numpy as np

# Catalogue of distinct movie ids that appear anywhere in the ratings table.
all_movies = df["movie_id"].unique()

# Seeded generator used by default, so a fresh Restart & Run All reproduces
# the same sequence of random recommendations.
_random_rec_rng = np.random.default_rng(42)


def recommend_random(user_id, k=10, rng=None):
    """Recommend up to ``k`` distinct movies drawn uniformly at random.

    Args:
        user_id: Accepted so the signature matches the other recommenders;
            it is not used, so the draw is not personalised.
        k: Number of movies requested. Capped at the catalogue size so an
            oversized request returns the whole catalogue (in random order)
            instead of raising.
        rng: Optional ``numpy.random.Generator`` to draw from. Defaults to
            the notebook-level seeded generator.

    Returns:
        A list of distinct movie ids taken from ``all_movies``.
    """
    generator = _random_rec_rng if rng is None else rng
    n_recs = min(k, len(all_movies))
    # replace=False guarantees no movie is recommended twice.
    return generator.choice(all_movies, size=n_recs, replace=False).tolist()


Test the Baseline model

In [5]:
# Draw ten random recommendations for an arbitrary user id as a smoke test.
sample_recs = recommend_random(42, k=10)
print(sample_recs)

[880, 796, 444, 1207, 1337, 1535, 216, 1632, 66, 1150]


Baseline #2: Popularity Recommender

In [6]:
# Number of training ratings per movie, most-rated first. The stable sort
# keeps equally popular movies in the order produced by the groupby, so the
# ranking does not depend on arbitrary tie-breaking.
popularity = (
    train.groupby("movie_id")
    .size()
    .sort_values(ascending=False, kind="mergesort")
)
popular_movies = popularity.index.tolist()

# Movies each user has already rated in train, built once here so that
# recommend_popularity does not rescan the whole training frame per call.
watched_by_user = {
    uid: set(movie_ids)
    for uid, movie_ids in train.groupby("user_id")["movie_id"]
}


def recommend_popularity(user_id, k=10):
    """Recommend the ``k`` most-rated movies the user has not rated in train.

    Args:
        user_id: User to recommend for. A user with no training ratings is
            treated as having watched nothing.
        k: Maximum number of movies to return. Values of zero or below
            return an empty list.

    Returns:
        A list of at most ``k`` movie ids in descending popularity order.
        It is shorter than ``k`` only when fewer unseen movies exist.
    """
    watched = watched_by_user.get(user_id, set())
    recs = []
    for movie in popular_movies:
        # Stop as soon as enough recommendations are collected instead of
        # filtering the entire ranked catalogue first.
        if len(recs) >= k:
            break
        if movie not in watched:
            recs.append(movie)
    return recs


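Compare the performance of the baseline models against the test set using Hit@k: the fraction of test users whose held-out movie appears in their top-k recommendations.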

In [7]:
def hit_at_k(recommender_fn, k=10):
    """Return the fraction of test rows whose held-out movie is in the top-k.

    Args:
        recommender_fn: Callable invoked as ``recommender_fn(user_id, k)``
            that returns a container of recommended movie ids.
        k: Number of recommendations requested for each user.

    Returns:
        Number of hits divided by the number of rows in the notebook-level
        ``test`` frame, or NaN when ``test`` is empty (the rate is undefined
        there, and this avoids a ZeroDivisionError).
    """
    n_cases = len(test)
    if n_cases == 0:
        return float("nan")

    # Only two columns are needed, so iterate them directly rather than
    # building a full Series per row as iterrows() does.
    hits = sum(
        true_item in recommender_fn(user, k)
        for user, true_item in zip(test["user_id"], test["movie_id"])
    )
    return hits / n_cases


In [8]:
# Evaluate each baseline on the held-out test items and report Hit@10.
random_hit_rate = hit_at_k(recommend_random, 10)
print("Random Hit@10:", random_hit_rate)

popularity_hit_rate = hit_at_k(recommend_popularity, 10)
print("Popularity Hit@10:", popularity_hit_rate)


Random Hit@10: 0.0
Popularity Hit@10: 0.09013785790031813
