In [1]:
# Install Kaggle
! pip install -q kaggle

from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"giuseppevenuto","key":"<REDACTED>"}'}

In [2]:
# Move the Kaggle API Token in the correct folder, test it works
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets list

ref                                                             title                                               size  lastUpdated          downloadCount  voteCount  usabilityRating  
--------------------------------------------------------------  -------------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
ahsan81/hotel-reservations-classification-dataset               Hotel Reservations Dataset                         480KB  2023-01-04 12:50:31           8485        291  1.0              
googleai/musiccaps                                              MusicCaps                                          793KB  2023-01-25 09:25:48           1633        172  0.9411765        
themrityunjaypathak/most-subscribed-1000-youtube-channels       Most Subscribed 1000 Youtube Channels               28KB  2023-01-21 14:42:05           2044         67  1.0              
nitishsharma01/olympics-124-years-datasettill-2020              O

In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np

In [4]:
# Download the dataset archive from Kaggle with the Kaggle CLI
# (dataset slug: samlearner/letterboxd-movie-ratings-data).
! kaggle datasets download samlearner/letterboxd-movie-ratings-data

Downloading letterboxd-movie-ratings-data.zip to /content
 94% 177M/188M [00:01<00:00, 115MB/s]
100% 188M/188M [00:02<00:00, 95.7MB/s]


In [5]:
# Unzip the data
! unzip letterboxd-movie-ratings-data.zip -d dataset

Archive:  letterboxd-movie-ratings-data.zip
  inflating: dataset/movie_data.csv  
  inflating: dataset/ratings_export.csv  
  inflating: dataset/users_export.csv  


In [6]:
# Load the three CSV exports (movies, ratings, users) into Pandas dataframes.
# lineterminator="\n" makes the parser treat only "\n" as the end of a row.
# NOTE(review): presumably needed because some text fields contain other
# line-break characters — confirm against the raw files.
movie_data = pd.read_csv("dataset/movie_data.csv", lineterminator="\n")
ratings_data = pd.read_csv("dataset/ratings_export.csv", lineterminator="\n")
user_data = pd.read_csv("dataset/users_export.csv", lineterminator="\n")

In [7]:
# Show the first movie rows as a sample
from google.colab import data_table
data_table.enable_dataframe_formatter()  # switch dataframe display to Colab's data_table formatter
df = movie_data.head()
df.loc[:, df.columns!='overview']  # Boolean column mask: keep every column except the long movie description

Unnamed: 0,_id,genres,image_url,imdb_id,imdb_link,movie_id,movie_title,original_language,popularity,production_countries,release_date,runtime,spoken_languages,tmdb_id,tmdb_link,vote_average,vote_count,year_released
0,5fc85f606758f69634496fd3,"[""Music"",""Animation""]",film-poster/4/6/4/4/4/0/464440-football-freaks...,,,football-freaks,Football Freaks,en,0.6,"[""United Kingdom""]",1971-12-05,0.0,[],535272.0,https://www.themoviedb.org/movie/535272/,0.0,0.0,1971.0
1,5fc85ff26758f696344ace0c,[],film-poster/2/4/5/5/0/0/245500-aftermath-0-230...,tt0586129,http://www.imdb.com/title/tt0586129/maindetails,aftermath-1960,Aftermath,en,0.6,[],1960-04-17,22.0,[],318331.0,https://www.themoviedb.org/movie/318331/,8.0,1.0,1960.0
2,5fc85f606758f69634496fcd,"[""Drama""]",film-poster/9/3/3/1/8/93318-where-chimneys-are...,tt0045731,http://www.imdb.com/title/tt0045731/maindetails,where-chimneys-are-seen,Where Chimneys Are Seen,ja,1.568,"[""Japan""]",1953-03-05,108.0,"[""日本語""]",117779.0,https://www.themoviedb.org/movie/117779/,6.6,10.0,1953.0
3,5fc85f606758f69634496fd1,"[""Drama""]",,tt0187327,http://www.imdb.com/title/tt0187327/maindetails,the-musicians-daughter,The Musician's Daughter,en,0.6,"[""United States of America""]",1911-12-12,15.0,[],560377.0,https://www.themoviedb.org/movie/560377/,0.0,0.0,1911.0
4,5fc85f606758f69634496fd4,"[""Documentary""]",film-poster/4/5/4/6/0/3/454603-50-years-of-fab...,tt4769914,http://www.imdb.com/title/tt4769914/maindetails,50-years-of-fabulous,50 Years of Fabulous,en,0.6,[],2018-05-17,75.0,[],525187.0,https://www.themoviedb.org/movie/525187/,0.0,0.0,2018.0


In [8]:
# Show the first rows of the ratings table (one row per user/movie rating)
ratings_data.head()

Unnamed: 0,_id,movie_id,rating_val,user_id
0,5fc57c5d6758f6963451a07f,feast-2014,7,deathproof
1,5fc57c5d6758f6963451a063,loving-2016,7,deathproof
2,5fc57c5d6758f6963451a0ef,scripted-content,7,deathproof
3,5fc57c5d6758f6963451a060,the-future,4,deathproof
4,5fc57c5c6758f69634519398,mank,5,deathproof


In [9]:
# Show the first rows of the users table.
# NOTE(review): the rendered output contains display names and usernames from
# the dataset — consider clearing it before sharing the notebook.
user_data.head()

Unnamed: 0,_id,display_name,num_ratings_pages,num_reviews,username
0,5fc4172ec6cd28ebd99dd0e2,Lucy,32.0,1650,deathproof
1,5fc4172ec6cd28ebd99dd0ea,Matt Singer,52.0,1915,superpulse
2,5fc4172ec6cd28ebd99dd0ed,Sean Baker,21.0,1283,lilfilm
3,5fc4172ec6cd28ebd99dd0ee,iana,37.0,1177,ianamurray
4,5fc419171ebf67b9fbe48615,Lizzy,57.0,1810,punchdrunklizzy


In [10]:
# Check number of ratings (row count of the ratings dataframe)
len(ratings_data)

11078167

In [11]:
! mkdir ./processed

In [12]:
import pickle

def get_and_save_or_load_maps(d):
  """Add the ``movie_to_index`` and ``user_to_index`` lookups to ``d``.

  Both maps are first read from their pickle caches under ``./processed``.
  If either cache cannot be read, both maps are rebuilt by enumerating
  ``d["all_movies"]`` and ``d["all_users"]`` (id -> position) and the caches
  are rewritten.

  Args:
    d: dict updated in place; must contain ``all_movies`` and ``all_users``
      whenever the maps have to be rebuilt.

  Returns:
    The same dict ``d``.
  """
  import os  # function-scope import; only used to create the cache folder

  mm = "./processed/movie_to_index.pkl"
  uu = "./processed/user_to_index.pkl"
  try:
    # NOTE: pickle.load can execute arbitrary code from the file; only load
    # caches that this notebook produced.
    with open(mm, 'rb') as f: d["movie_to_index"] = pickle.load(f)
    with open(uu, 'rb') as f: d["user_to_index"] = pickle.load(f)
  except Exception:
    # `except Exception` instead of a bare `except`: a missing or unreadable
    # cache still falls back to a rebuild, but KeyboardInterrupt/SystemExit
    # are no longer swallowed (a bare except would rebuild and overwrite the
    # cache when the user interrupts a slow load).
    print("COULD NOT LOAD MAPS")
    d["movie_to_index"] = {m: i for i, m in enumerate(d["all_movies"])}
    d["user_to_index"] = {u: i for i, u in enumerate(d["all_users"])}
    # Create the cache folder if needed so saving cannot fail on a fresh runtime.
    os.makedirs(os.path.dirname(mm), exist_ok=True)
    with open(mm, 'wb') as f: pickle.dump(d["movie_to_index"], f)
    with open(uu, 'wb') as f: pickle.dump(d["user_to_index"], f)
  return d

In [13]:
def get_and_save_or_load_movies(d):
  """Add ``all_movies`` (the distinct movie ids) to ``d`` and return it.

  The array is read from ``./processed/all_movies.pkl`` when that cache can
  be loaded; otherwise it is computed as the unique values of the
  module-level ``ratings_data.movie_id`` column and the cache is written.

  Args:
    d: dict updated in place.

  Returns:
    The same dict ``d``.
  """
  import os  # function-scope import; only used to create the cache folder

  mm = "./processed/all_movies.pkl"
  try:
    # NOTE: pickle.load can execute arbitrary code from the file; only load
    # caches that this notebook produced.
    with open(mm, 'rb') as f: d["all_movies"] = pickle.load(f)
  except Exception:
    # Not a bare `except`: KeyboardInterrupt/SystemExit must propagate
    # instead of triggering a recompute that overwrites the cache.
    print("COULD NOT LOAD MOVIES")
    d["all_movies"] = ratings_data.movie_id.unique()
    # Create the cache folder if needed so saving cannot fail on a fresh runtime.
    os.makedirs(os.path.dirname(mm), exist_ok=True)
    with open(mm, 'wb') as f: pickle.dump(d["all_movies"], f)
  return d


In [14]:
from sklearn.model_selection import train_test_split

def get_and_save_or_load_users(d, train_ratio, test_ratio, random_state=None):
  """Add ``all_users``, ``train_users`` and ``test_users`` to ``d``.

  The three arrays are read from their pickle caches under ``./processed``.
  If any of them cannot be loaded, ``all_users`` is recomputed as the unique
  values of the module-level ``ratings_data.user_id`` column, split with
  ``train_test_split``, and all three caches are rewritten.

  Note that when the caches load successfully, ``train_ratio``,
  ``test_ratio`` and ``random_state`` are not used: the stored split is
  returned as is.

  Args:
    d: dict updated in place.
    train_ratio: forwarded to ``train_test_split`` as ``train_size``.
    test_ratio: forwarded to ``train_test_split`` as ``test_size``.
    random_state: optional seed forwarded to ``train_test_split`` so a fresh
      split can be reproduced; ``None`` keeps the previous unseeded behaviour.

  Returns:
    The same dict ``d``.
  """
  import os  # function-scope import; only used to create the cache folder

  uu = "./processed/all_users.pkl"
  tt1 = "./processed/train_users.pkl"
  tt2 = "./processed/test_users.pkl"
  try:
    # NOTE: pickle.load can execute arbitrary code from the file; only load
    # caches that this notebook produced.
    with open(uu, 'rb') as f: d["all_users"] = pickle.load(f)
    with open(tt1, 'rb') as f: d["train_users"] = pickle.load(f)
    with open(tt2, 'rb') as f: d["test_users"] = pickle.load(f)
  except Exception:
    # Not a bare `except`: KeyboardInterrupt/SystemExit must propagate
    # instead of triggering a new random split that overwrites the cache.
    print("COULD NOT LOAD USERS")
    d["all_users"] = ratings_data.user_id.unique()
    d["train_users"], d["test_users"] = train_test_split(
        d["all_users"], train_size=train_ratio, test_size=test_ratio,
        random_state=random_state)
    # Create the cache folder if needed so saving cannot fail on a fresh runtime.
    os.makedirs(os.path.dirname(uu), exist_ok=True)
    with open(uu, 'wb') as f: pickle.dump(d["all_users"], f)
    with open(tt1, 'wb') as f: pickle.dump(d["train_users"], f)
    with open(tt2, 'wb') as f: pickle.dump(d["test_users"], f)
  return d


In [15]:
from scipy.sparse import coo_matrix

def get_and_save_or_load_matrix(d, test_or_train):
  """Add the sparse user x movie rating matrix for one split to ``d``.

  The matrix is stored under the key ``"<test_or_train>_matrix"``. It is read
  from ``./processed/<test_or_train>_matrix.pkl`` when that cache can be
  loaded; otherwise it is built from the module-level ``ratings_data`` frame
  and the cache is written.

  The matrix has one row per entry of ``d["all_users"]`` and one column per
  entry of ``d["all_movies"]``. Only ratings of the users listed in
  ``d["<test_or_train>_users"]`` are filled in, so the rows of all other
  users stay empty. The result is returned in CSR format.

  Args:
    d: dict updated in place; for a rebuild it must contain ``all_users``,
      ``all_movies``, ``user_to_index``, ``movie_to_index`` and
      ``<test_or_train>_users``.
    test_or_train: split name used to build the dict keys and file name
      (the notebook calls it with ``"test"`` and ``"train"``).

  Returns:
    The same dict ``d``.
  """
  import os  # function-scope import; only used to create the cache folder

  s = test_or_train + "_matrix"
  tt = "./processed/" + s + ".pkl"
  try:
    # NOTE: pickle.load can execute arbitrary code from the file; only load
    # caches that this notebook produced.
    with open(tt, 'rb') as f: d[s] = pickle.load(f)
  except Exception:
    # Not a bare `except`: KeyboardInterrupt/SystemExit must propagate
    # instead of triggering a rebuild that overwrites the cache.
    print("COULD NOT LOAD MATRIX: " + test_or_train)
    shape = (len(d["all_users"]), len(d["all_movies"]))
    df = ratings_data[ratings_data["user_id"].isin(d[test_or_train + "_users"])]
    # Converting COO to CSR sums entries that share the same (row, col), so a
    # repeated (user, movie) pair would turn into an out-of-scale rating.
    # Keep only the last occurrence of each pair.
    df = df.drop_duplicates(subset=["user_id", "movie_id"], keep="last")
    row = df['user_id'].map(d['user_to_index']).values
    col = df['movie_id'].map(d['movie_to_index']).values
    data = df['rating_val'].values
    d[s] = coo_matrix((data, (row, col)), shape=shape).tocsr()
    # Create the cache folder if needed so saving cannot fail on a fresh runtime.
    os.makedirs(os.path.dirname(tt), exist_ok=True)
    with open(tt, 'wb') as f: pickle.dump(d[s], f)
  return d


In [16]:
def get_and_save_or_load_sample(train_ratio=0.8, test_ratio=0.2):
  """Assemble the full sample dict by running every cache-or-build step.

  Steps, in order: movie ids, user ids with their train/test split, the
  id -> index maps, then the "test" and "train" rating matrices.

  Args:
    train_ratio: passed on to get_and_save_or_load_users.
    test_ratio: passed on to get_and_save_or_load_users.

  Returns:
    The dict filled in by the individual steps.
  """
  sample = get_and_save_or_load_movies({})
  sample = get_and_save_or_load_users(sample, train_ratio, test_ratio)
  sample = get_and_save_or_load_maps(sample)
  for split_name in ("test", "train"):
    sample = get_and_save_or_load_matrix(sample, split_name)
  return sample

In [24]:
import warnings
from sklearn.metrics.pairwise import cosine_similarity

def predict_scores(d, uname, num_neighbors, return_dict=False):
    """Predict a score for every movie from the most similar training users.

    The user's row of ``d["test_matrix"]`` is compared with every row of
    ``d["train_matrix"]`` by cosine similarity. The ``num_neighbors`` rows
    with the highest similarity are selected and, per movie, their ratings
    are averaged; entries below 0.5 are treated as "not rated" and ignored.
    Movies that none of the selected neighbours rated get the fallback 6.0.

    Args:
        d: sample dict with ``test_matrix``, ``train_matrix``,
            ``user_to_index`` and (for ``return_dict``) ``all_movies``.
        uname: user id; looked up in ``d["user_to_index"]``.
        num_neighbors: number of most similar training rows to average.
        return_dict: when True, return ``{movie id: predicted score}``
            instead of the raw array.

    Returns:
        A 1-D float array with one prediction per movie column, or a dict
        keyed by the entries of ``d["all_movies"]``.
    """
    # Get the score array for the user
    user_scores = d["test_matrix"][d["user_to_index"][uname], :].toarray().reshape(1, -1)

    # Calculate the cosine similarities against every training row
    sims = cosine_similarity(user_scores, d["train_matrix"]).flatten()

    # Get the indices of the most similar users, then get their ratings
    neighbor_indices = sims.argsort()[::-1][:num_neighbors]
    neighbor_ratings = d["train_matrix"][neighbor_indices, :].toarray().astype('float')
    # Mark unrated cells with a real NaN (not the string 'nan') so nanmean skips them
    neighbor_ratings[neighbor_ratings < 0.5] = np.nan
    with warnings.catch_warnings():
        # nanmean warns for columns that are entirely NaN; those are handled below
        warnings.simplefilter("ignore", category=RuntimeWarning)
        movie_ratings = np.nanmean(neighbor_ratings, axis=0)
    movie_ratings[np.isnan(movie_ratings)] = 6.0  # Predict middle score when forced

    if return_dict:
        return dict(zip(d["all_movies"], movie_ratings))
    return movie_ratings


In [25]:
import random
# Build the sample dict (or load it from the caches), pick one random test
# user and predict a score for every movie from that user's 100 most similar
# training users.
d = get_and_save_or_load_sample()
uname = random.choice(d["test_users"])
print(uname)
print(d["user_to_index"][uname])  # index used to select this user's row in the rating matrices
pred_dict = predict_scores(d, uname, 100, return_dict=True)
pred_dict  # NOTE(review): displays one entry per movie, which makes for a very long cell output


manilazic
686


{'feast-2014': 7.052631578947368,
 'loving-2016': 6.420289855072464,
 'scripted-content': 6.0,
 'the-future': 5.96,
 'mank': 6.584269662921348,
 'embers-2015': 6.0,
 'the-social-network': 9.175257731958762,
 'miss-you-already': 6.666666666666667,
 'saw-iii': 4.224489795918367,
 'recess-schools-out': 5.083333333333333,
 'the-girl-on-the-train-2016': 4.0,
 'yes-man': 4.653061224489796,
 'insidious': 6.095238095238095,
 'the-alphabet': 6.842105263157895,
 'first-girl-i-loved': 6.111111111111111,
 'red-mist': 6.0,
 'sydney-white': 2.8,
 'bo-burnham-what': 6.45,
 'all-cheerleaders-die': 5.4,
 'the-wave-2015': 5.4,
 'next-floor': 6.6,
 'nerve-2016': 5.627906976744186,
 'the-disappearance-of-eleanor-rigby-her': 6.181818181818182,
 'mud': 7.390243902439025,
 'doubt': 7.140625,
 'walk-hard-the-dewey-cox-story': 7.77027027027027,
 'hush-2016': 5.96875,
 'deadpool': 4.967032967032967,
 'the-invitation-2015': 6.589041095890411,
 'trick-r-treat': 6.468085106382978,
 'queen-of-earth': 6.879310344827

In [26]:
def test_scores(d, uname, num_neighbors, func):
    target = d["test_matrix"][d["user_to_index"][uname], :].toarray().reshape(1, -1)
    pred = func(d, uname, num_neighbors)
    target = target.flatten().astype('float')
    target[target < 0.5] = 'nan'
    mse = np.nanmean((target - pred) ** 2)
    mae = np.nanmean(abs(target - pred))
    return mse, mae


In [27]:
import random

NUM_NEIGHBORS = 100
NUM_ITERATIONS = 100

# Evaluate predict_scores on NUM_ITERATIONS test users. Each user is drawn
# independently with random.choice, so users can repeat; this cell sets no seed.
d = get_and_save_or_load_sample()
mse_arr, mae_arr = [], []
for i in range(NUM_ITERATIONS):
  uname = random.choice(d["test_users"])
  mse, mae = test_scores(d, uname, NUM_NEIGHBORS, func=predict_scores)
  mse_arr += [mse]
  mae_arr += [mae]
# Report each metric averaged over all iterations
print("Average mean squared error for the iterations", np.mean(mse_arr), sep="\n")
print("Average mean absolute error for the iterations", np.mean(mae_arr), sep="\n")

Average mean squared error for the iterations
2.7932658260792884
Average mean absolute error for the iterations
1.2740231298773699


In [21]:
# SANITY CHECK
# GET PREDICTIONS FROM THE LEAST SIMILAR USERS
# SHOULD HAVE BAD PERFORMANCE
import warnings
from sklearn.metrics.pairwise import cosine_similarity

def antipredict_scores(d, uname, num_neighbors, return_dict=False):
    """Sanity-check predictor: average the ratings of the LEAST similar users.

    Same procedure as ``predict_scores`` except that the ``num_neighbors``
    training rows with the lowest cosine similarity are selected. Entries
    below 0.5 are treated as "not rated" and ignored; movies none of the
    selected rows rated get the fallback 6.0.

    Args:
        d: sample dict with ``test_matrix``, ``train_matrix``,
            ``user_to_index`` and (for ``return_dict``) ``all_movies``.
        uname: user id; looked up in ``d["user_to_index"]``.
        num_neighbors: number of least similar training rows to average.
        return_dict: when True, return ``{movie id: predicted score}``
            instead of the raw array.

    Returns:
        A 1-D float array with one prediction per movie column, or a dict
        keyed by the entries of ``d["all_movies"]``.
    """
    # Get the score array for the user
    user_scores = d["test_matrix"][d["user_to_index"][uname], :].toarray().reshape(1, -1)

    # Calculate the cosine similarities against every training row
    sims = cosine_similarity(user_scores, d["train_matrix"]).flatten()

    # argsort is ascending, so the first entries are the least similar users
    neighbor_indices = sims.argsort()[:num_neighbors]
    neighbor_ratings = d["train_matrix"][neighbor_indices, :].toarray().astype('float')
    # Mark unrated cells with a real NaN (not the string 'nan') so nanmean skips them
    neighbor_ratings[neighbor_ratings < 0.5] = np.nan
    with warnings.catch_warnings():
        # nanmean warns for columns that are entirely NaN; those are handled below
        warnings.simplefilter("ignore", category=RuntimeWarning)
        movie_ratings = np.nanmean(neighbor_ratings, axis=0)
    movie_ratings[np.isnan(movie_ratings)] = 6.0  # Predict avg score when forced

    if return_dict:
        return dict(zip(d["all_movies"], movie_ratings))
    return movie_ratings

import random

NUM_NEIGHBORS = 100
NUM_ITERATIONS = 100

# Evaluate antipredict_scores on NUM_ITERATIONS test users. Each user is drawn
# independently with random.choice, so users can repeat; this cell sets no seed.
d = get_and_save_or_load_sample()
mse_arr, mae_arr = [], []
for i in range(NUM_ITERATIONS):
  uname = random.choice(d["test_users"])
  mse, mae = test_scores(d, uname, NUM_NEIGHBORS, func=antipredict_scores)
  mse_arr += [mse]
  mae_arr += [mae]
# Report each metric averaged over all iterations
print("Average mean squared error for the iterations", np.mean(mse_arr), sep="\n")
print("Average mean absolute error for the iterations", np.mean(mae_arr), sep="\n")

Average mean squared error for the iterations
7.851886374052705
Average mean absolute error for the iterations
2.400698514778677


In [23]:
# SANITY CHECK
# COMPARE WITH ALWAYS PREDICTING MIDDLE VALUE

def dumb_predict_scores(d, uname, num_neighbors, return_dict=False):
    """Baseline predictor: answer 6.0 for every movie.

    ``uname`` and ``num_neighbors`` are accepted only so the signature matches
    the other predictors; they are not used.

    Returns:
        A float array with one 6.0 per entry of ``d["all_movies"]``, or, when
        ``return_dict`` is True, a dict mapping each movie id to 6.0.
    """
    titles = d["all_movies"]
    movie_ratings = np.array([6.0 for _ in range(len(titles))])
    if not return_dict:
        return movie_ratings
    return {titles[idx]: movie_ratings[idx] for idx in range(len(movie_ratings))}

import random

NUM_NEIGHBORS = 100
NUM_ITERATIONS = 100

# Evaluate dumb_predict_scores on NUM_ITERATIONS test users. Each user is drawn
# independently with random.choice, so users can repeat; this cell sets no seed.
d = get_and_save_or_load_sample()
mse_arr, mae_arr = [], []
for i in range(NUM_ITERATIONS):
  uname = random.choice(d["test_users"])
  mse, mae = test_scores(d, uname, NUM_NEIGHBORS, func=dumb_predict_scores)
  mse_arr += [mse]
  mae_arr += [mae]
# Report each metric averaged over all iterations
print("Average mean squared error for the iterations", np.mean(mse_arr), sep="\n")
print("Average mean absolute error for the iterations", np.mean(mae_arr), sep="\n")


Average mean squared error for the iterations
4.961595496432921
Average mean absolute error for the iterations
1.8138974984660328


In [None]:
# Archive the pickled caches and hand the zip to files.download.
# NOTE(review): the helpers write to "./processed" while this cell zips
# "/content/processed" — this assumes the working directory is /content; confirm.
! zip -r /content/processed.zip /content/processed
from google.colab import files
files.download("/content/processed.zip")