# Setup

In [60]:
import os

os.environ["KERAS_BACKEND"] = "tensorflow"

import math
import zipfile
import gzip
import shutil
from urllib.request import urlretrieve
import requests
import keras
import numpy as np
import pandas as pd
import tensorflow as tf
from keras import layers
from keras.layers import StringLookup
from sklearn.preprocessing import LabelEncoder
import random



# Prepare the data

## Download and prepare the DataFrames


In [61]:
URL_MOVIELENS = "https://files.grouplens.org/datasets/movielens/ml-1m.zip"
URL_IMBD_NAMES_BASICS = "https://datasets.imdbws.com/name.basics.tsv.gz"
URL_IMBD_TITLE_BASICS = "https://datasets.imdbws.com/title.basics.tsv.gz"
URL_IMBD_TITLE_RATINGS = "https://datasets.imdbws.com/title.ratings.tsv.gz"
# Local path where the file will be saved
LOCAL_MOVIELENS_PATH = "ml-1m.zip"
LOCAL_IMBD_NAMES_BASICS_PATH = "name.basics.tsv.gz"
LOCAL_IMBD_TITLE_BASICS_PATH = "title.basics.tsv.gz"
LOCAL_IMBD_TITLE_RATINGS_PATH = "title.ratings.tsv.gz"
# Directory where the dataset will be extracted
EXTRACT_DIR = "dataset"

In [62]:
# Function to download the file
def download_file(url, local_filename):
    print(f"Downloading {url} to {local_filename}")
    # Check if the file already exists
    if os.path.exists(local_filename):
        print(f"File {local_filename} already exists")
        return local_filename
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192): 
                f.write(chunk)
    print(f"Downloaded {url} to {local_filename}")
    return local_filename

# Function to unzip the file
def unzip_file(zip_path, extract_to):
    print(f"Unzipping {zip_path} to {extract_to}")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
    print(f"Unzipped {zip_path} to {extract_to}")

def gunzip_file(gz_path, extract_to):
    print(f"Gunzipping {gz_path} to {extract_to}")
    with gzip.open(gz_path, 'rb') as f_in:
        with open(extract_to, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    print(f"Gunzipped {gz_path} to {extract_to}")

In [63]:
# Ensure the dataset directory exists
os.makedirs(EXTRACT_DIR, exist_ok=True)
# Download the file
download_file(URL_MOVIELENS, LOCAL_MOVIELENS_PATH)
download_file(URL_IMBD_NAMES_BASICS, LOCAL_IMBD_NAMES_BASICS_PATH)
download_file(URL_IMBD_TITLE_BASICS, LOCAL_IMBD_TITLE_BASICS_PATH)
download_file(URL_IMBD_TITLE_RATINGS, LOCAL_IMBD_TITLE_RATINGS_PATH)

Downloading https://files.grouplens.org/datasets/movielens/ml-1m.zip to ml-1m.zip
File ml-1m.zip already exists
Downloading https://datasets.imdbws.com/name.basics.tsv.gz to name.basics.tsv.gz
File name.basics.tsv.gz already exists
Downloading https://datasets.imdbws.com/title.basics.tsv.gz to title.basics.tsv.gz
File title.basics.tsv.gz already exists
Downloading https://datasets.imdbws.com/title.ratings.tsv.gz to title.ratings.tsv.gz
File title.ratings.tsv.gz already exists


'title.ratings.tsv.gz'

In [64]:
# Extract the files
print("Unzipping file...")
unzip_file(LOCAL_MOVIELENS_PATH, EXTRACT_DIR)

print("Gunzipping files...")
gunzip_file(LOCAL_IMBD_NAMES_BASICS_PATH, os.path.join(EXTRACT_DIR, "name.basics.tsv"))
gunzip_file(LOCAL_IMBD_TITLE_BASICS_PATH, os.path.join(EXTRACT_DIR, "title.basics.tsv"))
gunzip_file(LOCAL_IMBD_TITLE_RATINGS_PATH, os.path.join(EXTRACT_DIR, "title.ratings.tsv"))

print("Extraction complete.")

Unzipping file...
Unzipping ml-1m.zip to dataset
Unzipped ml-1m.zip to dataset
Gunzipping files...
Gunzipping name.basics.tsv.gz to dataset\name.basics.tsv
Gunzipped name.basics.tsv.gz to dataset\name.basics.tsv
Gunzipping title.basics.tsv.gz to dataset\title.basics.tsv
Gunzipped title.basics.tsv.gz to dataset\title.basics.tsv
Gunzipping title.ratings.tsv.gz to dataset\title.ratings.tsv
Gunzipped title.ratings.tsv.gz to dataset\title.ratings.tsv
Extraction complete.


In [65]:
movies = pd.read_csv('dataset/ml-1m/movies.dat', sep='::', header=None, engine='python', names=['movie_id', 'title', 'genres'], encoding='ISO-8859-1')
ratings = pd.read_csv('dataset/ml-1m/ratings.dat', sep='::', header=None, engine='python', names=['user_id', 'movie_id', 'rating', 'unix_timestamp'], encoding='ISO-8859-1')
users = pd.read_csv('dataset/ml-1m/users.dat', sep='::', header=None, engine='python', names=['user_id', 'sex', 'age_group', 'occupation', 'zip_code'], encoding='ISO-8859-1')

In [66]:
title_basics_df = pd.read_csv('dataset/title.basics.tsv', sep='\t', header=0)
title_basics_df = title_basics_df[title_basics_df['titleType'] == 'movie']
name_basics_df = pd.read_csv('dataset/name.basics.tsv', sep='\t', header=0)

  title_basics_df = pd.read_csv('dataset/title.basics.tsv', sep='\t', header=0)


Here, we do some simple data processing to fix the data types of the columns.

In [67]:
users["user_id"] = users["user_id"].apply(lambda x: f"user_{x}")
users["age_group"] = users["age_group"].apply(lambda x: f"group_{x}")
users["occupation"] = users["occupation"].apply(lambda x: f"occupation_{x}")

movies["movie_id"] = movies["movie_id"].apply(lambda x: f"movie_{x}")

ratings["movie_id"] = ratings["movie_id"].apply(lambda x: f"movie_{x}")
ratings["user_id"] = ratings["user_id"].apply(lambda x: f"user_{x}")
ratings["rating"] = ratings["rating"].apply(lambda x: float(x))

movies['title'] = movies['title'].str.lower()
movies['title'] = movies['title'].str.replace(r"\(.*\)", "", regex=True).str.strip()

title_basics_df['primaryTitle'] = title_basics_df['primaryTitle'].str.lower().str.strip()
title_basics_df.rename(columns={"genres": "genres_title"}, inplace=True)
title_basics_df = title_basics_df.drop_duplicates(subset='primaryTitle')

# Convert primaryName to string
name_basics_df['primaryName'] = name_basics_df['primaryName'].astype(str)

name_basics_df = name_basics_df.assign(knownForTitles=name_basics_df['knownForTitles'].str.split(','))
name_basics_df = name_basics_df.explode('knownForTitles')

# Group by knownForTitles and aggregate primaryName
name_basics_grouped = name_basics_df.groupby('knownForTitles')['primaryName'].agg(lambda x: ','.join(set(x))).reset_index()

# Merge with title_basics_df
title_basics_df = pd.merge(title_basics_df, name_basics_grouped, left_on='tconst', right_on='knownForTitles', how='left')

title_basics_df.drop(columns='knownForTitles', inplace=True)
title_basics_df['primaryName'].fillna('unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  title_basics_df['primaryName'].fillna('unknown', inplace=True)


In [68]:
# Merge the dataframes movies and title_basics_df, keep only the the genres_title column and merge on the title column. If the title is not found in title_basics_df, the row is not kept
movies = movies.merge(title_basics_df[['primaryTitle', 'runtimeMinutes','genres_title', 'primaryName']], left_on='title', right_on='primaryTitle', how='left')
# Drop the primaryTitle column
movies.drop(columns=['primaryTitle'], inplace=True)
movies['runtimeMinutes'] = pd.to_numeric(movies['runtimeMinutes'], errors='coerce')
mean_runtime = movies['runtimeMinutes'].mean()
movies['runtimeMinutes'].fillna(mean_runtime, inplace=True)
movies = movies.drop_duplicates()
movies

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  movies['runtimeMinutes'].fillna(mean_runtime, inplace=True)


Unnamed: 0,movie_id,title,genres,runtimeMinutes,genres_title,primaryName
0,movie_1,toy story,Animation|Children's|Comedy,81.000000,"Adventure,Animation,Comedy","Don Conway,Molly,William Reeves,Jack Angel,Kyô..."
1,movie_2,jumanji,Adventure|Children's|Fantasy,104.000000,"Adventure,Comedy,Family","Greg Taylor,Elizabeth A. Brown,Cynthia T. Lewi..."
2,movie_3,grumpier old men,Comedy|Romance,101.000000,"Comedy,Romance","John J. Smith,Phillip Goodrich,Valerie Charles..."
3,movie_4,waiting to exhale,Comedy|Drama,124.000000,"Comedy,Drama,Romance","Brenda L. Felix,Fred Unger,Mary S. Gray,Michae..."
4,movie_5,father of the bride part ii,Comedy,106.000000,"Comedy,Family,Romance","Rocky Paolone,Danielle Gantner,Randy Waldman,D..."
...,...,...,...,...,...,...
3878,movie_3948,meet the parents,Comedy,75.000000,Comedy,"Dick Galloway,Greg Glienna,Carol Wayland,Scott..."
3879,movie_3949,requiem for a dream,Drama,102.000000,Drama,"Marcia Jean Kurtz,Ellen Burke,Brian Costello,B..."
3880,movie_3950,tigerland,Drama,101.000000,"Drama,War","Jack Newman,Ronnie Schafer,Russell Richardson,..."
3881,movie_3951,two family house,Drama,108.000000,"Comedy,Drama,Romance","Bryan Wachtel,Dawn Weisberg,Walter Burns,Al Kl..."


In [69]:
# Merge the genres column (seperated with | ) with the genres_title column (seperated with ','). Do not keep duplicates
movies['genres'] = movies['genres'].str.split('|')
movies['genres_title'] = movies['genres_title'].str.split(',')
# If the genres_title column is NaN, replace it with an empty list
movies['genres_title'] = movies['genres_title'].apply(lambda x: [] if type(x) == float else x)
movies['genres'] = movies['genres'] + movies['genres_title']
movies['genres'] = movies['genres'].apply(lambda x: list(set(x)))
movies['genres'] = movies['genres'].apply(lambda x: '|'.join(x))
movies.drop(columns=['genres_title'], inplace=True)

In [70]:
movies

Unnamed: 0,movie_id,title,genres,runtimeMinutes,primaryName
0,movie_1,toy story,Adventure|Animation|Children's|Comedy,81.000000,"Don Conway,Molly,William Reeves,Jack Angel,Kyô..."
1,movie_2,jumanji,Fantasy|Comedy|Adventure|Children's|Family,104.000000,"Greg Taylor,Elizabeth A. Brown,Cynthia T. Lewi..."
2,movie_3,grumpier old men,Romance|Comedy,101.000000,"John J. Smith,Phillip Goodrich,Valerie Charles..."
3,movie_4,waiting to exhale,Drama|Romance|Comedy,124.000000,"Brenda L. Felix,Fred Unger,Mary S. Gray,Michae..."
4,movie_5,father of the bride part ii,Romance|Family|Comedy,106.000000,"Rocky Paolone,Danielle Gantner,Randy Waldman,D..."
...,...,...,...,...,...
3878,movie_3948,meet the parents,Comedy,75.000000,"Dick Galloway,Greg Glienna,Carol Wayland,Scott..."
3879,movie_3949,requiem for a dream,Drama,102.000000,"Marcia Jean Kurtz,Ellen Burke,Brian Costello,B..."
3880,movie_3950,tigerland,Drama|War,101.000000,"Jack Newman,Ronnie Schafer,Russell Richardson,..."
3881,movie_3951,two family house,Drama|Romance|Comedy,108.000000,"Bryan Wachtel,Dawn Weisberg,Walter Burns,Al Kl..."


In [71]:
# First, ensure primaryName is a string
movies['primaryName'] = movies['primaryName'].fillna('unknown')
movies['primaryName'] = movies['primaryName'].astype(str)

# Create a LabelEncoder
le = LabelEncoder()

# Fit and transform the primaryName column
movies['primaryName_encoded'] = le.fit_transform(movies['primaryName'])

# Normalize the encoded values to be between 0 and 1
movies['primaryName_encoded'] = movies['primaryName_encoded'] / movies['primaryName_encoded'].max()

# Drop the original primaryName and genres_title columns
movies.drop(columns=['primaryName'], inplace=True)

In [72]:
movies

Unnamed: 0,movie_id,title,genres,runtimeMinutes,primaryName_encoded
0,movie_1,toy story,Adventure|Animation|Children's|Comedy,81.000000,0.231313
1,movie_2,jumanji,Fantasy|Comedy|Adventure|Children's|Family,104.000000,0.344738
2,movie_3,grumpier old men,Romance|Comedy,101.000000,0.477129
3,movie_4,waiting to exhale,Drama|Romance|Comedy,124.000000,0.100781
4,movie_5,father of the bride part ii,Romance|Family|Comedy,106.000000,0.822239
...,...,...,...,...,...
3878,movie_3948,meet the parents,Comedy,75.000000,0.223131
3879,movie_3949,requiem for a dream,Drama,102.000000,0.613611
3880,movie_3950,tigerland,Drama|War,101.000000,0.397174
3881,movie_3951,two family house,Drama|Romance|Comedy,108.000000,0.112681


In [73]:
users

Unnamed: 0,user_id,sex,age_group,occupation,zip_code
0,user_1,F,group_1,occupation_10,48067
1,user_2,M,group_56,occupation_16,70072
2,user_3,M,group_25,occupation_15,55117
3,user_4,M,group_45,occupation_7,02460
4,user_5,M,group_25,occupation_20,55455
...,...,...,...,...,...
6035,user_6036,F,group_25,occupation_15,32603
6036,user_6037,F,group_45,occupation_1,76006
6037,user_6038,F,group_56,occupation_1,14706
6038,user_6039,F,group_45,occupation_0,01060


Each movie has multiple genres. We split them into separate columns in the movies DataFrame.

In [74]:
ratings

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,user_1,movie_1193,5.0,978300760
1,user_1,movie_661,3.0,978302109
2,user_1,movie_914,3.0,978301968
3,user_1,movie_3408,4.0,978300275
4,user_1,movie_2355,5.0,978824291
...,...,...,...,...
1000204,user_6040,movie_1091,1.0,956716541
1000205,user_6040,movie_1094,5.0,956704887
1000206,user_6040,movie_562,5.0,956704746
1000207,user_6040,movie_1096,4.0,956715648


In [75]:
movie_features = set()
for actual_genres in movies['genres'].str.split('|'):
    movie_features.update(actual_genres)
movie_features = list(movie_features)

for genre in movie_features:
    movies[genre] = movies["genres"].apply(
        lambda values: int(genre in values.split("|"))
    )

# Add runtimeMinutes and primaryName_encoded to movie_features
movie_features.extend(["runtimeMinutes", "primaryName_encoded"])

In [76]:
movies

Unnamed: 0,movie_id,title,genres,runtimeMinutes,primaryName_encoded,Fantasy,Drama,Horror,Thriller,Crime,...,Animation,Children's,Family,\N,Sport,Biography,Romance,Music,Action,History
0,movie_1,toy story,Adventure|Animation|Children's|Comedy,81.000000,0.231313,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
1,movie_2,jumanji,Fantasy|Comedy|Adventure|Children's|Family,104.000000,0.344738,1,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
2,movie_3,grumpier old men,Romance|Comedy,101.000000,0.477129,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,movie_4,waiting to exhale,Drama|Romance|Comedy,124.000000,0.100781,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,movie_5,father of the bride part ii,Romance|Family|Comedy,106.000000,0.822239,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,movie_3948,meet the parents,Comedy,75.000000,0.223131,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3879,movie_3949,requiem for a dream,Drama,102.000000,0.613611,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3880,movie_3950,tigerland,Drama|War,101.000000,0.397174,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3881,movie_3951,two family house,Drama|Romance|Comedy,108.000000,0.112681,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0


### Transform the movie ratings data into sequences
First, let's sort the the ratings data using the unix_timestamp, and then group the movie_id values and the rating values by user_id.

The output DataFrame will have a record for each user_id, with two ordered lists (sorted by rating datetime): the movies they have rated, and their ratings of these movies.

In [77]:
ratings_group = ratings.sort_values(by=["unix_timestamp"]).groupby("user_id")

ratings_data = pd.DataFrame(
    data={
        "user_id": list(ratings_group.groups.keys()),
        "movie_ids": list(ratings_group.movie_id.apply(list)),
        "ratings": list(ratings_group.rating.apply(list)),
        "timestamps": list(ratings_group.unix_timestamp.apply(list)),
    }
)

In [78]:
ratings_data

Unnamed: 0,user_id,movie_ids,ratings,timestamps
0,user_1,"[movie_3186, movie_1721, movie_1270, movie_102...","[4.0, 4.0, 5.0, 5.0, 3.0, 5.0, 4.0, 4.0, 5.0, ...","[978300019, 978300055, 978300055, 978300055, 9..."
1,user_10,"[movie_597, movie_858, movie_743, movie_1210, ...","[4.0, 3.0, 3.0, 4.0, 4.0, 5.0, 5.0, 5.0, 3.0, ...","[978224375, 978224375, 978224375, 978224400, 9..."
2,user_100,"[movie_260, movie_1676, movie_1198, movie_541,...","[4.0, 3.0, 4.0, 3.0, 4.0, 3.0, 1.0, 1.0, 5.0, ...","[977593595, 977593595, 977593607, 977593624, 9..."
3,user_1000,"[movie_971, movie_260, movie_2990, movie_2973,...","[4.0, 5.0, 4.0, 3.0, 5.0, 5.0, 2.0, 5.0, 5.0, ...","[975040566, 975040566, 975040566, 975040629, 9..."
4,user_1001,"[movie_1198, movie_1617, movie_2885, movie_390...","[4.0, 4.0, 4.0, 2.0, 2.0, 1.0, 4.0, 5.0, 5.0, ...","[975039591, 975039702, 975039702, 975039898, 9..."
...,...,...,...,...
6035,user_995,"[movie_1894, movie_260, movie_247, movie_433, ...","[2.0, 4.0, 5.0, 3.0, 3.0, 4.0, 4.0, 4.0, 3.0, ...","[975054785, 975054785, 975054785, 975054853, 9..."
6036,user_996,"[movie_1347, movie_2146, movie_1961, movie_274...","[4.0, 3.0, 5.0, 3.0, 5.0, 5.0, 5.0, 5.0, 4.0, ...","[975052132, 975052132, 975052195, 975052284, 9..."
6037,user_997,"[movie_1196, movie_2082, movie_3247, movie_244...","[4.0, 3.0, 3.0, 3.0, 2.0, 5.0, 5.0, 5.0, 4.0, ...","[975044235, 975044425, 975044426, 975044426, 9..."
6038,user_998,"[movie_2266, movie_1264, movie_1097, movie_164...","[3.0, 4.0, 5.0, 5.0, 4.0, 3.0, 4.0, 3.0, 4.0, ...","[975043499, 975043593, 975043593, 975043593, 9..."


Now, let's split the movie_ids list into a set of sequences of a fixed length. We do the same for the ratings. Set the sequence_length variable to change the length of the input sequence to the model. You can also change the step_size to control the number of sequences to generate for each user.

In [79]:
sequence_length = 4
step_size = 2


def create_sequences(values, window_size, step_size):
    sequences = []
    start_index = 0
    while True:
        end_index = start_index + window_size
        seq = values[start_index:end_index]
        if len(seq) < window_size:
            # Pad the sequence if it's shorter than window_size
            seq = seq + [None] * (window_size - len(seq))
        sequences.append(seq)
        if end_index >= len(values):
            break
        start_index += step_size
    return sequences


ratings_data.movie_ids = ratings_data.movie_ids.apply(
    lambda ids: create_sequences(ids, sequence_length, step_size)
)

ratings_data.ratings = ratings_data.ratings.apply(
    lambda ids: create_sequences(ids, sequence_length, step_size)
)

del ratings_data["timestamps"]

After that, we process the output to have each sequence in a separate records in the DataFrame. In addition, we join the user features with the ratings data.

In [80]:
ratings_data_movies = ratings_data[["user_id", "movie_ids"]].explode(
    "movie_ids", ignore_index=True
)
ratings_data_rating = ratings_data[["ratings"]].explode("ratings", ignore_index=True)
ratings_data_transformed = pd.concat([ratings_data_movies, ratings_data_rating], axis=1)
ratings_data_transformed = ratings_data_transformed.join(
    users.set_index("user_id"), on="user_id"
)
ratings_data_transformed.movie_ids = ratings_data_transformed.movie_ids.apply(
    lambda x: ",".join([str(v) if v is not None else "" for v in x])
)
ratings_data_transformed.ratings = ratings_data_transformed.ratings.apply(
    lambda x: ",".join([str(v) if v is not None else "" for v in x])
)

del ratings_data_transformed["zip_code"]

ratings_data_transformed.rename(
    columns={"movie_ids": "sequence_movie_ids", "ratings": "sequence_ratings"},
    inplace=True,
)

In [81]:
ratings_data_transformed

Unnamed: 0,user_id,sequence_movie_ids,sequence_ratings,sex,age_group,occupation
0,user_1,"movie_3186,movie_1721,movie_1270,movie_1022","4.0,4.0,5.0,5.0",F,group_1,occupation_10
1,user_1,"movie_1270,movie_1022,movie_2340,movie_1836","5.0,5.0,3.0,5.0",F,group_1,occupation_10
2,user_1,"movie_2340,movie_1836,movie_3408,movie_1207","3.0,5.0,4.0,4.0",F,group_1,occupation_10
3,user_1,"movie_3408,movie_1207,movie_2804,movie_260","4.0,4.0,5.0,4.0",F,group_1,occupation_10
4,user_1,"movie_2804,movie_260,movie_720,movie_1193","5.0,4.0,3.0,5.0",F,group_1,occupation_10
...,...,...,...,...,...,...
495541,user_999,"movie_2264,movie_1959,movie_2676,movie_2540","2.0,1.0,3.0,2.0",M,group_25,occupation_15
495542,user_999,"movie_2676,movie_2540,movie_1363,movie_765","3.0,2.0,3.0,3.0",M,group_25,occupation_15
495543,user_999,"movie_1363,movie_765,movie_3565,movie_1410","3.0,3.0,4.0,2.0",M,group_25,occupation_15
495544,user_999,"movie_3565,movie_1410,movie_2269,movie_2504","4.0,2.0,3.0,3.0",M,group_25,occupation_15


In [82]:
def pair_users(ratings_data_transformed):
    paired_data = []
    for _, row in ratings_data_transformed.iterrows():
        sequence_movie_ids = row['sequence_movie_ids'].split(',')
        sequence_ratings = row['sequence_ratings'].split(',')
        
        # Check if we have a valid target movie and rating
        if len(sequence_movie_ids) > 0 and len(sequence_ratings) > 0 and sequence_ratings[-1] != '':
            target_movie_id = sequence_movie_ids[-1]
            target_rating = float(sequence_ratings[-1])
            
            paired_data.append({
                'user_id': row['user_id'],
                'sequence_movie_ids': ','.join(sequence_movie_ids[:-1]),
                'sequence_ratings': ','.join(sequence_ratings[:-1]),
                'sex': row['sex'],
                'age_group': row['age_group'],
                'occupation': row['occupation'],
                'target_movie_id': target_movie_id,
                'target_rating': target_rating
            })
    
    # Group by target_movie_id
    from collections import defaultdict
    movie_groups = defaultdict(list)
    for entry in paired_data:
        movie_groups[entry['target_movie_id']].append(entry)
    
    # Pair users who rated the same movie
    paired_data_final = []
    for movie_id, group in movie_groups.items():
        for i in range(0, len(group) - 1, 2):
            pair1 = group[i]
            pair2 = group[i + 1]
            paired_data_final.append({
                'user_id_1': pair1['user_id'],
                'sequence_movie_ids_1': pair1['sequence_movie_ids'],
                'sequence_ratings_1': pair1['sequence_ratings'],
                'sex_1': pair1['sex'],
                'age_group_1': pair1['age_group'],
                'occupation_1': pair1['occupation'],
                'user_id_2': pair2['user_id'],
                'sequence_movie_ids_2': pair2['sequence_movie_ids'],
                'sequence_ratings_2': pair2['sequence_ratings'],
                'sex_2': pair2['sex'],
                'age_group_2': pair2['age_group'],
                'occupation_2': pair2['occupation'],
                'target_movie_id': movie_id,
                'target_rating': (pair1['target_rating'] + pair2['target_rating']) / 2
            })
    
    return pd.DataFrame(paired_data_final)

# Now you can use the function
paired_data = pair_users(ratings_data_transformed)
print(f"Number of paired data points: {len(paired_data)}")

Number of paired data points: 245349


In [83]:
paired_data

Unnamed: 0,user_id_1,sequence_movie_ids_1,sequence_ratings_1,sex_1,age_group_1,occupation_1,user_id_2,sequence_movie_ids_2,sequence_ratings_2,sex_2,age_group_2,occupation_2,target_movie_id,target_rating
0,user_1,"movie_3186,movie_1721,movie_1270","4.0,4.0,5.0",F,group_1,occupation_10,user_10,"movie_3347,movie_1292,movie_527","5.0,5.0,4.0",F,group_35,occupation_1,movie_1022,5.0
1,user_1057,"movie_3114,movie_2096,movie_2080","5.0,4.0,4.0",M,group_45,occupation_17,user_1087,"movie_3114,movie_2018,movie_594","4.0,5.0,5.0",M,group_25,occupation_16,movie_1022,4.5
2,user_1088,"movie_1059,movie_971,movie_3897","4.0,4.0,4.0",F,group_1,occupation_10,user_1092,"movie_2081,movie_1947,movie_1031","4.0,4.0,3.0",F,group_18,occupation_4,movie_1022,4.0
3,user_1100,"movie_2294,movie_3615,movie_1029","3.0,1.0,1.0",M,group_25,occupation_0,user_112,"movie_2139,movie_2096,movie_596","3.0,4.0,5.0",M,group_25,occupation_16,movie_1022,3.5
4,user_1120,"movie_3213,movie_2080,movie_2085","4.0,4.0,4.0",M,group_18,occupation_4,user_1141,"movie_1992,movie_1991,movie_2804","1.0,1.0,5.0",F,group_25,occupation_3,movie_1022,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245344,user_5582,"movie_910,movie_903,movie_1929","5.0,4.0,5.0",M,group_45,occupation_1,user_6036,"movie_3350,movie_2355,movie_2852","4.0,4.0,3.0",F,group_25,occupation_15,movie_3282,3.5
245345,user_5682,"movie_1515,movie_748,movie_2713","3.0,1.0,2.0",M,group_18,occupation_0,user_881,"movie_2353,movie_2763,movie_1858","3.0,2.0,2.0",M,group_18,occupation_14,movie_1636,3.0
245346,user_660,"movie_2798,movie_102,movie_3268","1.0,1.0,2.0",M,group_45,occupation_16,user_699,"movie_2036,movie_102,movie_2799","3.0,1.0,2.0",M,group_18,occupation_0,movie_634,1.0
245347,user_696,"movie_2357,movie_994,movie_1094","4.0,4.0,5.0",M,group_25,occupation_12,user_889,"movie_1079,movie_3462,movie_3341","4.0,3.0,3.0",M,group_45,occupation_20,movie_2999,4.0


With sequence_length of 4 and step_size of 2, we end up with 498,623 sequences.

Finally, we split the data into training and testing splits, with 85% and 15% of the instances, respectively, and store them to CSV files.

In [84]:
random_selection = np.random.rand(len(ratings_data_transformed.index)) <= 0.85
train_data = ratings_data_transformed[random_selection]
test_data = ratings_data_transformed[~random_selection]

train_data.to_csv(os.path.join(EXTRACT_DIR, "train_data.csv"), index=False, sep="|", header=False)
test_data.to_csv(os.path.join(EXTRACT_DIR, "test_data.csv"), index=False, sep="|", header=False)

## Define metadata

In [118]:
CSV_HEADER = [
    "user_id_1", "sequence_movie_ids_1", "sequence_ratings_1", "sex_1", "age_group_1", "occupation_1",
    "user_id_2", "sequence_movie_ids_2", "sequence_ratings_2", "sex_2", "age_group_2", "occupation_2",
    "target_movie_id", "target_rating"
]

CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    "user_id": list(users.user_id.unique()),
    "movie_id": list(movies.movie_id.unique()),
    "sex": list(users.sex.unique()),
    "age_group": list(users.age_group.unique()),
    "occupation": list(users.occupation.unique()),
}

USER_FEATURES = ["sex", "age_group", "occupation"]

MOVIE_FEATURES = ["genres", "runtimeMinutes", "primaryName_encoded"]

## Create tf.data.Dataset for training and evaluation

In [119]:
class SequenceProcessor(layers.Layer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, sequence_movies_ids, sequence_ratings):
        mask = tf.not_equal(sequence_movies_ids, '')
        sequence_movies_ids = tf.where(mask, sequence_movies_ids, 'unknown_movie')
        sequence_ratings = tf.where(mask, sequence_ratings, 0.0)
        return sequence_movies_ids, sequence_ratings

In [120]:
def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=32):
    print(f"Loading dataset from {csv_file_path}")
    print(f"Shuffle: {shuffle}, Batch size: {batch_size}")

    # Read the CSV file
    df = pd.read_csv(csv_file_path, sep='|', header=None, names=CSV_HEADER)
    
    # Convert DataFrame to tensor
    dataset = tf.data.Dataset.from_tensor_slices(dict(df))

    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(df))

    dataset = dataset.batch(batch_size)

    def process(features):
        processed_features = {}
        for i in range(1, 3):
            movie_ids_string = features[f"sequence_movie_ids_{i}"]
            sequence_movie_ids = tf.strings.split(movie_ids_string, ",").to_tensor()

            processed_features[f"target_movie_id_{i}"] = features["target_movie_id"]
            processed_features[f"sequence_movie_ids_{i}"] = sequence_movie_ids

            ratings_string = features[f"sequence_ratings_{i}"]
            sequence_ratings = tf.strings.to_number(
                tf.strings.split(ratings_string, ","), tf.dtypes.float32
            ).to_tensor()

            processed_features[f"sequence_ratings_{i}"] = sequence_ratings

            # Copy other features
            for feature in ["user_id", "sex", "age_group", "occupation"]:
                processed_features[f"{feature}_{i}"] = features[f"{feature}_{i}"]

        target = features["target_rating"]
        return processed_features, target

    dataset = dataset.map(process)

    # Print dataset info
    dataset_size = sum(1 for _ in dataset)
    print(f"Dataset size (number of batches): {dataset_size}")

    return dataset

## Create model inputs

In [121]:
def create_model_inputs():
    inputs = {}
    for i in range(1, 3):
        inputs.update({
            f"user_id_{i}": keras.Input(name=f"user_id_{i}", shape=(1,), dtype="string"),
            f"sequence_movie_ids_{i}": keras.Input(
                name=f"sequence_movie_ids_{i}", shape=(sequence_length - 1,), dtype="string"
            ),
            f"sequence_ratings_{i}": keras.Input(
                name=f"sequence_ratings_{i}", shape=(sequence_length - 1,), dtype=tf.float32
            ),
            f"target_movie_id_{i}": keras.Input(name=f"target_movie_id_{i}", shape=(1,), dtype="string"),
            f"sex_{i}": keras.Input(name=f"sex_{i}", shape=(1,), dtype="string"),
            f"age_group_{i}": keras.Input(name=f"age_group_{i}", shape=(1,), dtype="string"),
            f"occupation_{i}": keras.Input(name=f"occupation_{i}", shape=(1,), dtype="string"),
        })
    return inputs

## Encode input features
The encode_input_features method works as follows:

1. Each categorical user feature is encoded using layers.Embedding, with embedding dimension equals to the square root of the vocabulary size of the feature. The embeddings of these features are concatenated to form a single input tensor.

2. Each movie in the movie sequence and the target movie is encoded layers.Embedding, where the dimension size is the square root of the number of movies.

3. A multi-hot genres vector for each movie is concatenated with its embedding vector, and processed using a non-linear layers.Dense to output a vector of the same movie embedding dimensions.

4. A positional embedding is added to each movie embedding in the sequence, and then multiplied by its rating from the ratings sequence.

5. The target movie embedding is concatenated to the sequence movie embeddings, producing a tensor with the shape of [batch size, sequence length, embedding size], as expected by the attention layer for the transformer architecture.

6. The method returns a tuple of two elements: encoded_transformer_features and encoded_other_features.

In [122]:
def encode_input_features(
    inputs,
    include_user_id=True,
    include_user_features=True,
    include_movie_features=True,
    name_prefix=""
):
    encoded_transformer_features = []
    encoded_other_features = []

    other_feature_names = []
    if include_user_id:
        other_feature_names.append("user_id")
    if include_user_features:
        other_feature_names.extend(USER_FEATURES)

    ## Encode user features
    for feature_name in other_feature_names:
        vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
        idx = StringLookup(vocabulary=vocabulary, mask_token=None, num_oov_indices=0)(
            inputs[feature_name]
        )
        embedding_dims = int(math.sqrt(len(vocabulary)))
        embedding_encoder = layers.Embedding(
            input_dim=len(vocabulary),
            output_dim=embedding_dims,
            name=f"{name_prefix}{feature_name}_embedding",
        )
        encoded_other_features.append(embedding_encoder(idx))

    ## Create a single embedding vector for the user features
    if len(encoded_other_features) > 1:
        encoded_other_features = layers.concatenate(encoded_other_features)
    elif len(encoded_other_features) == 1:
        encoded_other_features = encoded_other_features[0]
    else:
        encoded_other_features = None

    ## Create a movie embedding encoder
    movie_vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY["movie_id"]
    movie_embedding_dims = int(math.sqrt(len(movie_vocabulary)))
    movie_index_lookup = StringLookup(
        vocabulary=movie_vocabulary,
        mask_token=None,
        num_oov_indices=0,
        name=f"{name_prefix}movie_index_lookup",
    )
    movie_embedding_encoder = layers.Embedding(
        input_dim=len(movie_vocabulary),
        output_dim=movie_embedding_dims,
        name=f"{name_prefix}movie_embedding",
    )
    movie_feature_vectors = movies[movie_features].to_numpy()
    movie_features_lookup = layers.Embedding(
        input_dim=movie_feature_vectors.shape[0],
        output_dim=movie_feature_vectors.shape[1],
        embeddings_initializer=keras.initializers.Constant(movie_feature_vectors),
        trainable=False,
        name=f"{name_prefix}features_vector",
    )
    movie_embedding_processor = layers.Dense(
        units=movie_embedding_dims,
        activation="relu",
        name=f"{name_prefix}process_movie_embedding_with_genres",
    )

    ## Define a function to encode a given movie id.
    def encode_movie(movie_id):
        movie_idx = movie_index_lookup(movie_id)
        movie_embedding = movie_embedding_encoder(movie_idx)
        encoded_movie = movie_embedding
        if include_movie_features:
            movie_genres_vector = movie_features_lookup(movie_idx)
            encoded_movie = movie_embedding_processor(
                layers.concatenate([movie_embedding, movie_genres_vector])
            )
        return encoded_movie

    ## Encoding target_movie_id
    target_movie_id = inputs["target_movie_id"]
    encoded_target_movie = encode_movie(target_movie_id)

    ## Encoding sequence movie_ids.
    sequence_movies_ids = inputs["sequence_movie_ids"]
    sequence_ratings = inputs["sequence_ratings"]
    
    # Use the custom layer for sequence processing
    sequence_processor = SequenceProcessor(name=f"{name_prefix}sequence_processor")
    sequence_movies_ids, sequence_ratings = sequence_processor(sequence_movies_ids, sequence_ratings)
    
    encoded_sequence_movies = encode_movie(sequence_movies_ids)
    
    position_embedding_encoder = layers.Embedding(
        input_dim=sequence_length,
        output_dim=movie_embedding_dims,
        name=f"{name_prefix}position_embedding",
    )
    positions = keras.ops.arange(start=0, stop=sequence_length - 1, step=1)
    encoded_positions = position_embedding_encoder(positions)
    
    sequence_ratings = layers.Reshape((-1, 1))(sequence_ratings)
    
    encoded_sequence_movies_with_position_and_rating = layers.Multiply()(
        [encoded_sequence_movies + encoded_positions, sequence_ratings]
    )

    # Reshape encoded_sequence_movies_with_position_and_rating to match the target shape
    encoded_sequence_movies_with_position_and_rating = layers.Reshape(
        (sequence_length - 1, 1, movie_embedding_dims)
    )(encoded_sequence_movies_with_position_and_rating)

    # Reshape encoded_target_movie to match the target shape
    encoded_target_movie = layers.Reshape((1, 1, movie_embedding_dims))(encoded_target_movie)

    encoded_transformer_features = layers.Concatenate(axis=1)(
        [encoded_sequence_movies_with_position_and_rating, encoded_target_movie]
    )

    return encoded_transformer_features, encoded_other_features

In [123]:
USER_FEATURES

['sex', 'age_group', 'occupation']

## Create a BST model

In [124]:
include_user_id = False
include_user_features = True
include_movie_features = True

hidden_units = [256, 128]
dropout_rate = 0.1
num_heads = 3


def create_model():
    inputs = create_model_inputs()
    
    user_features = []
    for i in range(1, 3):
        user_input = {k.replace(f"_{i}", ""): v for k, v in inputs.items() if f"_{i}" in k}
        
        # Use the correct key for target_movie_id
        user_input["target_movie_id"] = inputs[f"target_movie_id_{i}"]
        
        transformer_features, other_features = encode_input_features(
            user_input, include_user_id, include_user_features, include_movie_features,
            name_prefix=f"user_{i}_"
        )

        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=transformer_features.shape[2], dropout=dropout_rate,
            name=f"user_{i}_attention"
        )(transformer_features, transformer_features)

        attention_output = layers.Dropout(dropout_rate, name=f"user_{i}_dropout_1")(attention_output)
        x1 = layers.Add(name=f"user_{i}_add_1")([transformer_features, attention_output])
        x1 = layers.LayerNormalization(name=f"user_{i}_layer_norm_1")(x1)
        x2 = layers.LeakyReLU(name=f"user_{i}_leaky_relu")(x1)
        x2 = layers.Dense(units=x2.shape[-1], name=f"user_{i}_dense_1")(x2)
        x2 = layers.Dropout(dropout_rate, name=f"user_{i}_dropout_2")(x2)
        transformer_features = layers.Add(name=f"user_{i}_add_2")([x1, x2])
        transformer_features = layers.LayerNormalization(name=f"user_{i}_layer_norm_2")(transformer_features)
        features = layers.Flatten(name=f"user_{i}_flatten")(transformer_features)

        if other_features is not None:
            features = layers.concatenate(
                [features, layers.Reshape([other_features.shape[-1]], name=f"user_{i}_reshape")(other_features)],
                name=f"user_{i}_concatenate"
            )
        
        user_features.append(features)
    
    # Combine features from both users
    combined_features = layers.Average(name="average_user_features")(user_features)

    # Fully-connected layers
    for i, num_units in enumerate(hidden_units):
        combined_features = layers.Dense(num_units, name=f"dense_{i+1}")(combined_features)
        combined_features = layers.BatchNormalization(name=f"batch_norm_{i+1}")(combined_features)
        combined_features = layers.LeakyReLU(name=f"leaky_relu_{i+1}")(combined_features)
        combined_features = layers.Dropout(dropout_rate, name=f"dropout_{i+1}")(combined_features)

    outputs = layers.Dense(units=1, name="output")(combined_features)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model

model = create_model()

## Run training and evaluation experiment

In [126]:
# Compile the model.
model.compile(
    optimizer=keras.optimizers.Adagrad(learning_rate=0.01),
    loss=keras.losses.MeanSquaredError(),
    metrics=[keras.metrics.MeanAbsoluteError()],
)
def count_batches(dataset):
    return sum(1 for _ in dataset)
# Read the training data.
# Read the training data
train_dataset = get_dataset_from_csv(os.path.join(EXTRACT_DIR, "train_paired_data.csv"), shuffle=True, batch_size=32)

# Fit the model with the training data
history = model.fit(train_dataset, epochs=10, verbose=1)

# Read the test data.
test_dataset = get_dataset_from_csv(os.path.join(EXTRACT_DIR, "test_paired_data.csv"), shuffle=True, batch_size=32)

# Evaluate the model on the test data.
_, rmse = model.evaluate(test_dataset, verbose=0)
print(f"Test MAE: {round(rmse, 3)}")

Loading dataset from dataset\train_paired_data.csv
Shuffle: True, Batch size: 32
Dataset size (number of batches): 6
Epoch 1/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 32ms/step - loss: 1.1359 - mean_absolute_error: 0.8294
Epoch 2/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - loss: 1.3754 - mean_absolute_error: 0.9368
Epoch 3/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - loss: 1.2302 - mean_absolute_error: 0.8743
Epoch 4/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step - loss: 1.1743 - mean_absolute_error: 0.8800
Epoch 5/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 60ms/step - loss: 1.5389 - mean_absolute_error: 1.0086
Epoch 6/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - loss: 1.1918 - mean_absolute_error: 0.8386
Epoch 7/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - loss: 1.2061 - mean_absolute_

In [108]:
with open(os.path.join(EXTRACT_DIR, "train_paired_data.csv"), 'r') as f:
    line_count = sum(1 for line in f)
print(f"Number of lines in CSV file: {line_count}")

Number of lines in CSV file: 168


In [None]:
for data, target in train_dataset.take(1):
    print(f"Data batch: {data}")
    print(f"Target batch: {target}")
    break

#Print the shape of the batch and the target batch
print(f"Shape of the batch: {data['user_id_1'].shape}")
print(f"Shape of the target batch: {target.shape}")


Data batch: OrderedDict({'user_id_1': <tf.Tensor: shape=(168,), dtype=string, numpy=
array([b'user_864', b'user_1749', b'user_5968', b'user_1803',
       b'user_3563', b'user_735', b'user_2382', b'user_5458',
       b'user_3454', b'user_4034', b'user_5413', b'user_2540',
       b'user_1279', b'user_3081', b'user_5053', b'user_591',
       b'user_2175', b'user_2294', b'user_3734', b'user_3416',
       b'user_5749', b'user_4411', b'user_1757', b'user_2589',
       b'user_1698', b'user_1010', b'user_2824', b'user_3308',
       b'user_1592', b'user_881', b'user_3242', b'user_5677',
       b'user_5135', b'user_3682', b'user_415', b'user_2839',
       b'user_5277', b'user_3677', b'user_2967', b'user_4141',
       b'user_1598', b'user_3749', b'user_3481', b'user_6011',
       b'user_4892', b'user_1587', b'user_5185', b'user_3183',
       b'user_3190', b'user_412', b'user_643', b'user_1974', b'user_4139',
       b'user_2888', b'user_4317', b'user_1193', b'user_1059',
       b'user_4675', b'use

## We can now make prediction of the ratings

In [127]:
# Use the model to get predictions.
predictions = model.predict(test_dataset)
print(predictions[:1000])
# Print the real target values.
for batch in test_dataset.take(1):
    print(batch[1][:1000])

# Compute the model's MAE
mae = np.mean(np.abs(predictions - batch[1]))
print(f"Mean Absolute Error: {round(mae, 3)}")

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2s/step
[[3.1166832]
 [3.2093947]
 [2.8160112]
 [3.173418 ]
 [3.2258918]
 [3.2054703]
 [3.2762897]
 [3.1553035]
 [2.9931672]
 [3.161397 ]
 [3.0899923]
 [3.1620018]
 [3.1326396]
 [3.036026 ]
 [3.4601958]
 [3.1086967]
 [3.1149664]
 [3.0082424]
 [3.2496047]
 [2.9622958]
 [3.0648463]
 [3.158933 ]
 [3.2974222]
 [2.8759012]
 [2.618849 ]
 [3.1793735]
 [2.936443 ]
 [3.2637842]
 [3.2261758]
 [2.9208415]
 [3.292204 ]
 [2.9934883]
 [2.6179812]
 [3.3914008]
 [3.1755524]
 [3.230599 ]
 [3.2872312]
 [3.344542 ]
 [3.0405748]
 [3.132909 ]
 [3.1377656]
 [2.735123 ]
 [3.2238836]
 [3.1138923]]
tf.Tensor(
[4.5 4.  4.  4.5 4.  3.  2.5 5.  4.5 4.5 2.  4.5 3.5 3.5 5.  4.5 4.  2.
 5.  4.  4.5 3.5 3.5 3.5 4.5 4.5 3.5 4.  4.5 5.  3.5 4. ], shape=(32,), dtype=float64)
Mean Absolute Error: 1.043
