In [1]:
# Mount Google Drive at /content/drive (Colab-only API). The training cell
# later writes its checkpoint archive under "/content/drive/My Drive/...".
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install Kaggle
! pip install -q kaggle

from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"giuseppevenuto","key":"<REDACTED>"}'}

In [3]:
# Move the Kaggle API Token in the correct folder, test it works
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets list

ref                                                             title                                               size  lastUpdated          downloadCount  voteCount  usabilityRating  
--------------------------------------------------------------  -------------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
ahsan81/hotel-reservations-classification-dataset               Hotel Reservations Dataset                         480KB  2023-01-04 12:50:31           8471        291  1.0              
googleai/musiccaps                                              MusicCaps                                          793KB  2023-01-25 09:25:48           1626        172  0.9411765        
themrityunjaypathak/most-subscribed-1000-youtube-channels       Most Subscribed 1000 Youtube Channels               28KB  2023-01-21 14:42:05           2034         67  1.0              
nitishsharma01/olympics-124-years-datasettill-2020              O

In [4]:
# Download the Letterboxd ratings dataset archive from Kaggle into the
# working directory (/content in the recorded run)
! kaggle datasets download samlearner/letterboxd-movie-ratings-data

Downloading letterboxd-movie-ratings-data.zip to /content
 98% 185M/188M [00:08<00:00, 32.5MB/s]
100% 188M/188M [00:08<00:00, 22.1MB/s]


In [5]:
# Unzip the data
! unzip letterboxd-movie-ratings-data.zip -d dataset

Archive:  letterboxd-movie-ratings-data.zip
  inflating: dataset/movie_data.csv  
  inflating: dataset/ratings_export.csv  
  inflating: dataset/users_export.csv  


In [6]:
import numpy as np
import pandas as pd

In [7]:
# Read the three CSV exports (movies, ratings, users) into DataFrames.
# lineterminator="\n" is passed explicitly for every file.
movie_data, ratings_data, user_data = (
    pd.read_csv(csv_path, lineterminator="\n")
    for csv_path in (
        "dataset/movie_data.csv",
        "dataset/ratings_export.csv",
        "dataset/users_export.csv",
    )
)

In [8]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" after each summary.
movie_data.info()
user_data.info()
ratings_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285504 entries, 0 to 285503
Data columns (total 19 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   _id                   285504 non-null  object 
 1   genres                274872 non-null  object 
 2   image_url             264407 non-null  object 
 3   imdb_id               243802 non-null  object 
 4   imdb_link             243802 non-null  object 
 5   movie_id              285502 non-null  object 
 6   movie_title           283340 non-null  object 
 7   original_language     274872 non-null  object 
 8   overview              261248 non-null  object 
 9   popularity            274872 non-null  float64
 10  production_countries  274872 non-null  object 
 11  release_date          271050 non-null  object 
 12  runtime               270055 non-null  float64
 13  spoken_languages      274872 non-null  object 
 14  tmdb_id               279917 non-null  float64
 15  

In [9]:
# Minimum number of ratings a movie / a user needs in order to be kept.
MIN_RATINGS = 8

def filter_out_sparse(users=None, movies=None, ratings=None, min_ratings=None):
  """Drop users and movies that have fewer than `min_ratings` ratings.

  Counts are computed once, on the unfiltered `ratings`; the function does not
  iterate, so after filtering some kept users/movies may have fewer than
  `min_ratings` ratings among the remaining rows.

  Args:
    users: frame with a `username` column. Defaults to the module-level `user_data`.
    movies: frame with a `movie_id` column. Defaults to the module-level `movie_data`.
    ratings: frame with `user_id` and `movie_id` columns. Defaults to the
      module-level `ratings_data`.
    min_ratings: inclusive threshold. Defaults to `MIN_RATINGS`.

  Returns:
    Tuple `(users, movies, ratings)` of filtered frames, in that order.
  """
  # Fall back to the notebook globals so the original no-argument call still works.
  users = user_data if users is None else users
  movies = movie_data if movies is None else movies
  ratings = ratings_data if ratings is None else ratings
  min_ratings = MIN_RATINGS if min_ratings is None else min_ratings

  movie_counts = ratings.groupby("movie_id").size()
  kept_movie_ids = movie_counts[movie_counts >= min_ratings].index
  user_counts = ratings.groupby("user_id").size()
  kept_user_ids = user_counts[user_counts >= min_ratings].index

  # A rating survives only if both its movie and its user survive.
  kept_ratings = ratings[ratings.movie_id.isin(kept_movie_ids) & ratings.user_id.isin(kept_user_ids)]
  kept_users = users[users.username.isin(kept_user_ids)]
  kept_movies = movies[movies.movie_id.isin(kept_movie_ids)]
  return kept_users, kept_movies, kept_ratings

# Rebinds the three module-level frames to their filtered versions, so
# re-running this cell filters the already-filtered data again.
user_data, movie_data, ratings_data = filter_out_sparse()
# Number of ratings left per user (shown as the cell output).
ratings_data.groupby("user_id").size()


user_id
007filmreviwer    2242
007hertzrumble    1959
0o0o0o0o           100
11122001           314
127gbh            1519
                  ... 
zwangsdemokrat    1778
zwergimbikini     2121
zxols              372
zxvs               553
zyopy             2632
Length: 7396, dtype: int64

In [10]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" after each summary.
movie_data.info()
user_data.info()
ratings_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78575 entries, 2 to 283279
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   _id                   78575 non-null  object 
 1   genres                76408 non-null  object 
 2   image_url             78126 non-null  object 
 3   imdb_id               76428 non-null  object 
 4   imdb_link             76428 non-null  object 
 5   movie_id              78575 non-null  object 
 6   movie_title           78283 non-null  object 
 7   original_language     76408 non-null  object 
 8   overview              76227 non-null  object 
 9   popularity            76408 non-null  float64
 10  production_countries  76408 non-null  object 
 11  release_date          76383 non-null  object 
 12  runtime               76399 non-null  float64
 13  spoken_languages      76408 non-null  object 
 14  tmdb_id               77212 non-null  float64
 15  tmdb_link         

In [11]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test partition (and every metric derived from it)
# is the same after a kernel restart; without it each run shuffles differently.
SPLIT_RANDOM_STATE = 42

train_ratings_data, test_ratings_data = train_test_split(ratings_data, random_state=SPLIT_RANDOM_STATE)

In [12]:
def movies_enrich(movies=None):
  """Add quantile-bin and release-year-bucket columns to a movie frame, in place.

  For each of `vote_average`, `vote_count`, `runtime` and `popularity` two
  columns are added: `<name>_sextile` (pd.qcut with 6 bins) and
  `<name>_percentile` (pd.qcut with 100 bins). Because duplicate bin edges
  are dropped, a column may end up with fewer distinct bins than requested.
  `year_released` is floored to multiples of 25, 10 and 3 years in
  `year_bucket_25`, `year_bucket_10` and `year_bucket_3`.

  Args:
    movies: frame to enrich. Defaults to the module-level `movie_data`, which
      keeps the original no-argument call working.

  Returns:
    The same frame object that was modified (the original returned None).
  """
  if movies is None:
    movies = movie_data
  for column in ("vote_average", "vote_count", "runtime", "popularity"):
    for suffix, n_bins in (("_sextile", 6), ("_percentile", 100)):
      # labels=False yields the integer bin index instead of an Interval.
      movies[column + suffix] = pd.qcut(movies[column], n_bins, labels=False, duplicates='drop')
  for width in (25, 10, 3):
    movies["year_bucket_" + str(width)] = movies["year_released"] // width * width
  return movies

# Adds the derived columns to the module-level movie_data in place.
movies_enrich()
movie_data.head()

Unnamed: 0,_id,genres,image_url,imdb_id,imdb_link,movie_id,movie_title,original_language,overview,popularity,...,vote_average_percentile,vote_count_sextile,vote_count_percentile,runtime_sextile,runtime_percentile,popularity_sextile,popularity_percentile,year_bucket_25,year_bucket_10,year_bucket_3
2,5fc85f606758f69634496fcd,"[""Drama""]",film-poster/9/3/3/1/8/93318-where-chimneys-are...,tt0045731,http://www.imdb.com/title/tt0045731/maindetails,where-chimneys-are-seen,Where Chimneys Are Seen,ja,Gosho’s most celebrated film both in Japan and...,1.568,...,28.0,2.0,9.0,4.0,58.0,1.0,21.0,1950.0,1950.0,1953.0
5,5fc85ff26758f696344ace3a,"[""Romance""]",film-poster/5/6/1/5/2/56152-where-love-has-gon...,tt0058745,http://www.imdb.com/title/tt0058745/maindetails,where-love-has-gone,Where Love Has Gone,en,A divorced couple's teen-age daughter stands t...,2.304,...,23.0,2.0,10.0,5.0,61.0,2.0,35.0,1950.0,1960.0,1962.0
10,5fc85ff26758f696344ace9d,"[""Horror"",""Thriller""]",film-poster/1/5/8/4/5/7/158457-streets-of-deat...,tt0247735,http://www.imdb.com/title/tt0247735/maindetails,streets-of-death,Streets of Death,en,"Hookers are being killed all over the place, t...",0.6,...,0.0,0.0,0.0,3.0,43.0,0.0,0.0,1975.0,1980.0,1986.0
12,5fc85ff26758f696344aced8,"[""Drama""]",film-poster/7/8/4/1/5/78415-the-christine-jorg...,tt0065549,http://www.imdb.com/title/tt0065549/maindetails,the-christine-jorgensen-story,The Christine Jorgensen Story,en,George Jorgensen goes to 1950s Denmark and mak...,1.708,...,5.0,1.0,4.0,4.0,48.0,2.0,24.0,1950.0,1970.0,1968.0
14,5fc85ff26758f696344acf40,"[""Romance"",""TV Movie""]",film-poster/6/7/7/6/5/67765-desperately-seekin...,tt1967688,http://www.imdb.com/title/tt1967688/maindetails,desperately-seeking-santa,Desperately Seeking Santa,en,"Jennifer, a young, ambitious executive running...",4.151,...,23.0,4.0,35.0,2.0,36.0,4.0,57.0,2000.0,2010.0,2010.0


In [38]:
from sklearn.preprocessing import MultiLabelBinarizer

# Identifier / free-text columns that carry no model features.
# NOTE(review): not referenced anywhere else in the visible notebook.
MOVIE_FIELDS_TO_DROP = [
    "_id",
    "movie_id",
    "movie_title",
    "imdb_id",
    "imdb_link",
    "image_url",
    "tmdb_id",
    "tmdb_link",
    "overview",
    "release_date",
]

# Columns holding a stringified list of labels; encoded as multi-hot vectors.
MOVIE_BAG_CATEGORIES = [
    "genres",
    "spoken_languages",
    "production_countries",
]

# Single-label column; it goes through the same binarizer as the bag columns.
MOVIE_ONE_HOT_CATEGORIES = [
    "original_language",
]

# Numeric columns copied as-is into the movie representation. The order here
# is the order of the leading feature columns, so do not reorder.
MOVIE_NUMERICAL_VALUES = [
    "year_released",
    "vote_count",
    "vote_average",
    "popularity",
    "runtime",
    "year_bucket_3",
    "year_bucket_10",
    "year_bucket_25",
    "runtime_sextile",
    "runtime_percentile",
    "vote_count_sextile",
    "vote_count_percentile",
    "vote_average_sextile",
    "vote_average_percentile",
    "popularity_sextile",
    "popularity_percentile",
]

def string_list_to_actual_list(x):
  """Split a stringified list such as '["Drama","Romance"]' into its items.

  The characters [ ] ' " ? are deleted, then the remainder is split on commas.
  Whitespace is not stripped, and an empty list string ("[]") yields [""]
  (a single empty label), not an empty list.
  """
  unwanted = str.maketrans("", "", "[]'?\"")
  return x.translate(unwanted).split(",")

def prepare_encoders(movie_data):
  """Fit one MultiLabelBinarizer per categorical movie column.

  Covers every column in MOVIE_BAG_CATEGORIES and MOVIE_ONE_HOT_CATEGORIES.
  Missing values are treated as the string "[]", which
  string_list_to_actual_list turns into [""], so each encoder also learns an
  empty-string class.

  Returns:
    dict mapping column name -> fitted MultiLabelBinarizer.
  """
  def _fit(column):
    label_lists = movie_data[column].fillna("[]").apply(string_list_to_actual_list)
    return MultiLabelBinarizer().fit(label_lists)

  return {
      column: _fit(column)
      for column in MOVIE_BAG_CATEGORIES + MOVIE_ONE_HOT_CATEGORIES
  }

def create_movie_representation(movie_data, encoders):
  """Build the numeric feature table for movies.

  The result has one row per row of `movie_data`, a `movie_id` column, and
  feature columns named 0..N-1: first the MOVIE_NUMERICAL_VALUES columns, then
  the multi-hot encodings of MOVIE_BAG_CATEGORIES and MOVIE_ONE_HOT_CATEGORIES
  in that order. Missing values are replaced by -1.0.

  Args:
    movie_data: movie frame containing `movie_id`, the numerical columns and
      the categorical columns.
    encoders: dict column name -> fitted encoder, as returned by prepare_encoders.

  Returns:
    DataFrame with a fresh 0..n-1 row index.
  """
  base = movie_data[["movie_id"] + MOVIE_NUMERICAL_VALUES].reset_index(drop=True)
  encoded_blocks = [
      pd.DataFrame(
          encoders[category].transform(
              movie_data[category].fillna("[]").apply(string_list_to_actual_list)))
      for category in MOVIE_BAG_CATEGORIES + MOVIE_ONE_HOT_CATEGORIES
  ]
  # Every block has a 0..n-1 row index, so a column-wise concat lines rows up
  # positionally. The previous chain of index merges relied on _x/_y suffixes
  # for the repeated integer column names; those suffixes collide from the
  # fourth block on, which pandas 1.x only warns about and pandas 2 rejects.
  movie_representation = pd.concat([base] + encoded_blocks, axis=1)

  # Rename every feature column to its position; keep movie_id as the key.
  feature_ids = iter(range(movie_representation.shape[1]))
  movie_representation.columns = [
      "movie_id" if col == "movie_id" else next(feature_ids)
      for col in movie_representation.columns
  ]
  return movie_representation.fillna(-1.0)

# Fit the encoders on the filtered movie table, then encode that same table.
encoders = prepare_encoders(movie_data)
movie_representation = create_movie_representation(movie_data, encoders)
movie_representation

  return merge(


Unnamed: 0,movie_id,0,1,2,3,4,5,6,7,8,...,429,430,431,432,433,434,435,436,437,438
0,where-chimneys-are-seen,1953.0,10.0,6.6,1.568,108.0,1953.0,1950.0,1950.0,4.0,...,0,0,0,0,0,0,0,0,0,0
1,where-love-has-gone,1964.0,11.0,6.1,2.304,111.0,1962.0,1960.0,1950.0,5.0,...,0,0,0,0,0,0,0,0,0,0
2,streets-of-death,1987.0,0.0,0.0,0.600,93.0,1986.0,1980.0,1975.0,3.0,...,0,0,0,0,0,0,0,0,0,0
3,the-christine-jorgensen-story,1970.0,5.0,4.2,1.708,98.0,1968.0,1970.0,1950.0,4.0,...,0,0,0,0,0,0,0,0,0,0
4,desperately-seeking-santa,2011.0,43.0,6.1,4.151,86.0,2010.0,2010.0,2000.0,2.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78570,heart-shot,2022.0,21.0,5.7,279.771,19.0,2022.0,2020.0,2000.0,0.0,...,0,0,0,0,0,0,0,0,0,0
78571,erax,2022.0,14.0,5.4,392.611,14.0,2022.0,2020.0,2000.0,0.0,...,0,0,0,0,0,0,0,0,0,0
78572,this-much-i-know-to-be-true,2022.0,0.0,0.0,2.141,105.0,2022.0,2020.0,2000.0,4.0,...,0,0,0,0,0,0,0,0,0,0
78573,those-who-walk-away,2022.0,5.0,5.1,8.943,92.0,2022.0,2020.0,2000.0,3.0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
def create_user_representation(full_ratings_data, movie_features=None, extreme_fraction=0.1):
  """Build one feature row per user from their ratings.

  Columns of the result, after `user_id`, are numbered 0..N-1 and hold:
  the user's mean rating, the user's rating count, the column-wise mean
  (rating and movie features) over the user's highest-rated
  `extreme_fraction` of movies, and the same over the lowest-rated fraction.
  Users with too few ratings to select any movie (int(count * fraction) == 0)
  get -1.0 in those columns.

  Args:
    full_ratings_data: frame with `user_id`, `movie_id` and `rating_val` columns.
    movie_features: frame with a `movie_id` column plus numeric feature
      columns. Defaults to the module-level `movie_representation`.
    extreme_fraction: share of a user's ratings taken as "highest"/"lowest".

  Returns:
    DataFrame with `user_id` plus integer-named feature columns, NaN -> -1.0.
  """
  if movie_features is None:
    movie_features = movie_representation

  def _mean_features_of_extremes(select):
    # Per-user mean over the rows chosen by `select(group, n)`.
    picked = full_ratings_data.groupby("user_id").apply(
        lambda group: select(group, int(len(group) * extreme_fraction)))
    # The group key comes back as an index level and, depending on the pandas
    # version, also as a column; normalise to a single user_id column.
    picked = picked.drop(columns="user_id", errors="ignore").reset_index(level="user_id")
    picked = picked.merge(movie_features, on="movie_id")
    # numeric_only=True: string columns (movie_id, ...) cannot be averaged and
    # make mean() raise on pandas >= 2.
    return picked.groupby("user_id").mean(numeric_only=True)

  ratings_by_user = full_ratings_data.groupby("user_id")
  avg_global = ratings_by_user["rating_val"].mean().rename("avg_global").reset_index()
  count_global = ratings_by_user.size().rename("count_global").reset_index()
  user_representation = avg_global.merge(count_global, on="user_id")

  top = _mean_features_of_extremes(lambda group, n: group.nlargest(n, "rating_val"))
  bottom = _mean_features_of_extremes(lambda group, n: group.nsmallest(n, "rating_val"))
  for extremes in (top, bottom):
    # Outer join keeps users for whom no extreme movies were selected.
    user_representation = user_representation.merge(extremes, on="user_id", how="outer")

  # Rename every feature column to its position; keep user_id as the key.
  feature_ids = iter(range(user_representation.shape[1]))
  user_representation.columns = [
      "user_id" if col == "user_id" else next(feature_ids)
      for col in user_representation.columns
  ]
  return user_representation.fillna(-1.0)

# NOTE(review): `user_ids` is not used anywhere else in the visible notebook.
user_ids = train_ratings_data["user_id"].sample(100)
# NOTE(review): the profiles are built from the full `ratings_data`, which
# includes the rows held out in `test_ratings_data`, so the per-user averages
# and top/bottom features see test ratings. Confirm this is intended.
user_representation = create_user_representation(ratings_data)
user_representation

Unnamed: 0,user_id,0,1,2,3,4,5,6,7,8,...,872,873,874,875,876,877,878,879,880,881
0,007filmreviwer,6.639607,2242,9.607143,1996.424107,5654.098214,7.328125,33.821277,125.410714,1995.455357,...,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,007hertzrumble,6.885656,1959,9.225641,1971.620513,2655.323077,7.413333,17.789462,107.871795,1970.692308,...,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0o0o0o0o,6.290000,100,8.600000,1987.200000,7129.400000,8.140000,31.340500,135.200000,1986.600000,...,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,11122001,6.162420,314,9.645161,2007.741935,5868.967742,7.387097,336.065806,114.903226,2006.806452,...,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,127gbh,6.971692,1519,10.000000,1992.099338,1059.615894,6.300000,12.439662,84.947020,1991.105960,...,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7391,zwangsdemokrat,5.204162,1778,8.649718,1993.683616,1696.062147,6.821469,20.374130,106.067797,1992.644068,...,0.00565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7392,zwergimbikini,6.139085,2121,8.698113,2003.047170,5624.443396,7.198113,31.168175,106.438679,2002.033019,...,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7393,zxols,8.172043,372,10.000000,1998.351351,3712.405405,7.429730,17.991135,111.891892,1997.270270,...,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7394,zxvs,7.515371,553,9.854545,1996.254545,5193.527273,7.789091,33.246473,121.218182,1995.272727,...,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
def extract_random_rating_batch(batch_size, train_or_test_ratings_data,
                                user_features=None, movie_features=None,
                                random_state=None):
  """Sample ratings and assemble the matching (features, labels) arrays.

  Each feature row is the user's feature vector followed by the movie's
  feature vector; the label is the rating value of that same sampled row.
  Sampled ratings whose user or movie has no feature row are dropped, so the
  batch can contain fewer than `batch_size` rows. (Previously the two
  independent inner merges could drop different rows, leaving the user part,
  the movie part and the labels out of step.)

  Args:
    batch_size: number of ratings to sample.
    train_or_test_ratings_data: frame with `user_id`, `movie_id`, `rating_val`.
    user_features: frame keyed by a unique `user_id` column. Defaults to the
      module-level `user_representation`.
    movie_features: frame keyed by a unique `movie_id` column. Defaults to the
      module-level `movie_representation`.
    random_state: forwarded to DataFrame.sample for reproducible batches.

  Returns:
    Tuple (X, y): X is a 2-D array of user then movie features, y a 1-D array
    of ratings, row-aligned with X.

  Raises:
    pandas.errors.MergeError: if a key is duplicated in either feature table.
  """
  if user_features is None:
    user_features = user_representation
  if movie_features is None:
    movie_features = movie_representation

  sampled = train_or_test_ratings_data.sample(batch_size, random_state=random_state)
  has_features = (sampled["user_id"].isin(user_features["user_id"])
                  & sampled["movie_id"].isin(movie_features["movie_id"]))
  sampled = sampled[has_features]

  # Left joins against unique keys keep exactly one output row per sampled
  # rating, in sampled order, so both halves stay aligned with the labels.
  user_part = sampled[["user_id"]].merge(
      user_features, on="user_id", how="left", validate="many_to_one"
  ).drop("user_id", axis=1)
  movie_part = sampled[["movie_id"]].merge(
      movie_features, on="movie_id", how="left", validate="many_to_one"
  ).drop("movie_id", axis=1)
  return np.concatenate([user_part, movie_part], axis=1), sampled["rating_val"].values

# Smoke test: draw one batch of 100 training ratings and inspect the feature
# matrix shape (rows, user features + movie features).
X, y = extract_random_rating_batch(100, train_ratings_data)
X.shape

(100, 1321)

In [62]:
import tensorflow as tf
from tensorflow.keras import layers
import keras.backend as K

DROPOUT_RATE = 0.5  # rate of both Dropout layers in build_model
HIDDEN_LAYER_SIZE = 128  # units in each of the two hidden Dense layers
# Width of the model input. Hard-coded: it has to equal the number of columns
# produced by extract_random_rating_batch (the recorded X.shape is (100, 1321)),
# so update it whenever the user/movie feature set changes.
X_WIDTH = 1321

def custom_final_activation(x):
    """Output activation: a hard sigmoid scaled by 10."""
    unit_interval = K.hard_sigmoid(x)
    return unit_interval * 10

def build_model():
    """Build the (uncompiled) rating regressor.

    Architecture: input of width X_WIDTH -> batch normalisation -> two blocks
    of Dense(HIDDEN_LAYER_SIZE, selu) + Dropout(DROPOUT_RATE) -> a single
    output unit using custom_final_activation.
    """
    inp = layers.Input(shape=(X_WIDTH,), name="inp")

    pipeline = (
        layers.BatchNormalization(name="bn1"),
        layers.Dense(HIDDEN_LAYER_SIZE, name="d1", activation="selu"),
        layers.Dropout(DROPOUT_RATE, name="dr1"),
        layers.Dense(HIDDEN_LAYER_SIZE, name="d2", activation="selu"),
        layers.Dropout(DROPOUT_RATE, name="dr2"),
        layers.Dense(1, activation=custom_final_activation, name="predicted_rating"),
    )

    # Thread the tensor through the layers in the order listed above.
    tensor = inp
    for layer in pipeline:
        tensor = layer(tensor)

    return tf.keras.Model(inputs=inp, outputs=tensor)

model = build_model()
# Regression on the rating value: optimise mean squared error, report MAE alongside.
model.compile(optimizer="adam", loss="mean_squared_error", metrics=["mae"])

In [None]:
import pickle
import shutil
import numpy as np
import tensorflow as tf
from keras.models import load_model
from keras.utils import Sequence
from google.colab import files

BATCH_SIZE = 100  # ratings per batch
NUM_ITERATIONS = 1000  # outer train -> save -> evaluate cycles
NUM_EPOCHS_PER_ITERATION = 10  # epochs per model.fit call
NUM_SAMPLES_PER_EPOCH = 15000  # each generator yields NUM_SAMPLES_PER_EPOCH / BATCH_SIZE batches per epoch
SAVED_MODEL_PATH = "/content/models/checkpoint"  # where model.save writes the checkpoint
# Destination of the zipped checkpoint; lives on the mounted Google Drive.
ZIP_PATHNAME = "/content/drive/My Drive/letterboxd_content_based_model_checkpoint/letterboxd_content_based_model_checkpoint.zip"

class TrainingDataGenerator(Sequence):
    """Keras Sequence that serves freshly sampled batches of training ratings.

    Batches are drawn at random on every access, so the `idx` passed to
    __getitem__ does not select a particular slice of the data.
    """

    def __init__(self, batch_size=BATCH_SIZE):
        # Initialise the Keras base class; newer Keras versions warn when a
        # Sequence subclass skips this call.
        super().__init__()
        # Number of ratings that make up one "epoch" (the attribute name is historical).
        self.num_movies = NUM_SAMPLES_PER_EPOCH
        self.batch_size = batch_size

    def __len__(self):
        """Number of batches per epoch."""
        return self.num_movies // self.batch_size

    def __getitem__(self, idx):
        """Return a random (X, y) batch from the training split; `idx` is ignored."""
        return extract_random_rating_batch(self.batch_size, train_ratings_data)

class EvaluationDataGenerator(Sequence):
    """Keras Sequence that serves freshly sampled batches of held-out ratings.

    Batches are drawn at random on every access, so the `idx` passed to
    __getitem__ does not select a particular slice of the data and two
    evaluations see different samples.
    """

    def __init__(self, batch_size=BATCH_SIZE):
        # Initialise the Keras base class; newer Keras versions warn when a
        # Sequence subclass skips this call.
        super().__init__()
        # Number of ratings that make up one evaluation pass (the attribute name is historical).
        self.num_movies = NUM_SAMPLES_PER_EPOCH
        self.batch_size = batch_size

    def __len__(self):
        """Number of batches per evaluation pass."""
        return self.num_movies // self.batch_size

    def __getitem__(self, idx):
        """Return a random (X, y) batch from the test split; `idx` is ignored."""
        return extract_random_rating_batch(self.batch_size, test_ratings_data)

training_generator = TrainingDataGenerator(batch_size=BATCH_SIZE)
evaluation_generator = EvaluationDataGenerator(batch_size=BATCH_SIZE)
# Each iteration: train for NUM_EPOCHS_PER_ITERATION epochs, checkpoint the
# model locally, zip the checkpoint to ZIP_PATHNAME, then evaluate on
# randomly sampled test batches.
for i in range(NUM_ITERATIONS):
    print("TRAINING")
    model.fit(training_generator, epochs=NUM_EPOCHS_PER_ITERATION, verbose=1)
    model.save(SAVED_MODEL_PATH)
    # make_archive appends ".zip" itself, hence the [:-4] that strips it from ZIP_PATHNAME.
    shutil.make_archive(ZIP_PATHNAME[:-4], 'zip', SAVED_MODEL_PATH)
    # NOTE(review): reloads the model that was just saved. The model uses the
    # custom function custom_final_activation; confirm the reload is needed and
    # whether load_model should be given custom_objects for it.
    model = load_model(SAVED_MODEL_PATH)
    print("EVALUATION")
    # The returned loss/MAE values are not stored; they only appear in the cell output.
    model.evaluate(evaluation_generator)

TRAINING
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
EVALUATION
TRAINING
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




EVALUATION
TRAINING
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




EVALUATION
TRAINING
Epoch 1/10
Epoch 2/10