# Import libraries

In [1]:
# Import libraries
import os
import math
import zipfile
from urllib.request import urlretrieve
import requests
import gzip
import shutil
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import random
from typing import List, Tuple
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from scipy.sparse import hstack
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import StringLookup


# Import data

In [2]:
URL_MOVIELENS = "https://files.grouplens.org/datasets/movielens/ml-1m.zip"
URL_IMBD_NAMES_BASICS = "https://datasets.imdbws.com/name.basics.tsv.gz"
URL_IMBD_TITLE_BASICS = "https://datasets.imdbws.com/title.basics.tsv.gz"
URL_IMBD_TITLE_RATINGS = "https://datasets.imdbws.com/title.ratings.tsv.gz"
# Local path where the file will be saved
LOCAL_MOVIELENS_PATH = "ml-1m.zip"
LOCAL_IMBD_NAMES_BASICS_PATH = "name.basics.tsv.gz"
LOCAL_IMBD_TITLE_BASICS_PATH = "title.basics.tsv.gz"
LOCAL_IMBD_TITLE_RATINGS_PATH = "title.ratings.tsv.gz"
# Directory where the dataset will be extracted
EXTRACT_DIR = "dataset"

In [3]:
# Function to download the file
def download_file(url, local_filename):
    print(f"Downloading {url} to {local_filename}")
    # Check if the file already exists
    if os.path.exists(local_filename):
        print(f"File {local_filename} already exists")
        return local_filename
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192): 
                f.write(chunk)
    print(f"Downloaded {url} to {local_filename}")
    return local_filename

# Function to unzip the file
def unzip_file(zip_path, extract_to):
    print(f"Unzipping {zip_path} to {extract_to}")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
    print(f"Unzipped {zip_path} to {extract_to}")

def gunzip_file(gz_path, extract_to):
    print(f"Gunzipping {gz_path} to {extract_to}")
    with gzip.open(gz_path, 'rb') as f_in:
        with open(extract_to, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    print(f"Gunzipped {gz_path} to {extract_to}")

In [4]:
# Ensure the dataset directory exists
os.makedirs(EXTRACT_DIR, exist_ok=True)
# Download the file
download_file(URL_MOVIELENS, LOCAL_MOVIELENS_PATH)
download_file(URL_IMBD_NAMES_BASICS, LOCAL_IMBD_NAMES_BASICS_PATH)
download_file(URL_IMBD_TITLE_BASICS, LOCAL_IMBD_TITLE_BASICS_PATH)
download_file(URL_IMBD_TITLE_RATINGS, LOCAL_IMBD_TITLE_RATINGS_PATH)

Downloading https://files.grouplens.org/datasets/movielens/ml-1m.zip to ml-1m.zip
File ml-1m.zip already exists
Downloading https://datasets.imdbws.com/name.basics.tsv.gz to name.basics.tsv.gz
File name.basics.tsv.gz already exists
Downloading https://datasets.imdbws.com/title.basics.tsv.gz to title.basics.tsv.gz
File title.basics.tsv.gz already exists
Downloading https://datasets.imdbws.com/title.ratings.tsv.gz to title.ratings.tsv.gz
File title.ratings.tsv.gz already exists


'title.ratings.tsv.gz'

In [5]:
# Extract the files
print("Unzipping file...")
unzip_file(LOCAL_MOVIELENS_PATH, EXTRACT_DIR)

print("Gunzipping files...")
gunzip_file(LOCAL_IMBD_NAMES_BASICS_PATH, os.path.join(EXTRACT_DIR, "name.basics.tsv"))
gunzip_file(LOCAL_IMBD_TITLE_BASICS_PATH, os.path.join(EXTRACT_DIR, "title.basics.tsv"))
gunzip_file(LOCAL_IMBD_TITLE_RATINGS_PATH, os.path.join(EXTRACT_DIR, "title.ratings.tsv"))

print("Extraction complete.")

Unzipping file...
Unzipping ml-1m.zip to dataset
Unzipped ml-1m.zip to dataset
Gunzipping files...
Gunzipping name.basics.tsv.gz to dataset\name.basics.tsv
Gunzipped name.basics.tsv.gz to dataset\name.basics.tsv
Gunzipping title.basics.tsv.gz to dataset\title.basics.tsv
Gunzipped title.basics.tsv.gz to dataset\title.basics.tsv
Gunzipping title.ratings.tsv.gz to dataset\title.ratings.tsv
Gunzipped title.ratings.tsv.gz to dataset\title.ratings.tsv
Extraction complete.


# Preprocess data

In [71]:
movies_dat_df = pd.read_csv('dataset/ml-1m/movies.dat', sep='::', header=None, engine='python', names=['MovieID', 'Title', 'Genres'], encoding='ISO-8859-1')
ratings = pd.read_csv('dataset/ml-1m/ratings.dat', sep='::', header=None, engine='python', names=['UserID', 'MovieID', 'Rating', 'Timestamp'], encoding='ISO-8859-1')
users = pd.read_csv('dataset/ml-1m/users.dat', sep='::', header=None, engine='python', names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], encoding='ISO-8859-1')

name_basics_df = pd.read_csv('dataset/name.basics.tsv', sep='\t', header=0)
title_basics_df = pd.read_csv('dataset/title.basics.tsv', sep='\t', header=0)
title_ratings_df = pd.read_csv('dataset/title.ratings.tsv', sep='\t', header=0)

# Filter only movies from title_basics_df
title_basics_df = title_basics_df[title_basics_df['titleType'] == 'movie']

# Preprocess the Title columns
movies_dat_df['Title'] = movies_dat_df['Title'].str.lower()
movies_dat_df['Title'] = movies_dat_df['Title'].str.replace(r"\(.*\)", "", regex=True).str.strip()

title_basics_df['primaryTitle'] = title_basics_df['primaryTitle'].str.lower().str.strip()

# ratings.drop(columns=['Timestamp'], inplace=True)



  title_basics_df = pd.read_csv('dataset/title.basics.tsv', sep='\t', header=0)


In [85]:
title_ratings_df

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2058
1,tt0000002,5.7,276
2,tt0000003,6.5,2023
3,tt0000004,5.4,179
4,tt0000005,6.2,2788
...,...,...,...
1447920,tt9916730,7.0,12
1447921,tt9916766,7.1,23
1447922,tt9916778,7.2,36
1447923,tt9916840,7.2,10


In [84]:
name_basics_df

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"actor,miscellaneous,producer","tt0072308,tt0050419,tt0053137,tt0027125"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack,archive_footage","tt0037382,tt0075213,tt0117057,tt0038355"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,music_department,producer","tt0057345,tt0049189,tt0056404,tt0054452"
3,nm0000004,John Belushi,1949,1982,"actor,writer,music_department","tt0072562,tt0077975,tt0080455,tt0078723"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050986,tt0083922,tt0050976,tt0069467"
...,...,...,...,...,...,...
13575521,nm9993714,Romeo del Rosario,\N,\N,"animation_department,art_department","tt11657662,tt14069590,tt2455546"
13575522,nm9993716,Essias Loberg,\N,\N,\N,\N
13575523,nm9993717,Harikrishnan Rajan,\N,\N,cinematographer,tt8736744
13575524,nm9993718,Aayush Nair,\N,\N,cinematographer,tt8736744


In [83]:
title_basics_df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,miss jerry,Miss Jerry,0,1894,\N,45,Romance
144,tt0000147,movie,the corbett-fitzsimmons fight,The Corbett-Fitzsimmons Fight,0,1897,\N,100,"Documentary,News,Sport"
498,tt0000502,movie,bohemios,Bohemios,0,1905,\N,100,\N
570,tt0000574,movie,the story of the kelly gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
587,tt0000591,movie,the prodigal son,L'enfant prodigue,0,1907,\N,90,Drama
...,...,...,...,...,...,...,...,...,...
10855986,tt9916622,movie,rodolpho teóphilo - o legado de um pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,\N,57,Documentary
10856013,tt9916680,movie,de la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,\N,100,Documentary
10856025,tt9916706,movie,dankyavar danka,Dankyavar Danka,0,2013,\N,\N,Comedy
10856035,tt9916730,movie,6 gunn,6 Gunn,0,2017,\N,116,Drama


In [81]:
movies_dat_df.columns

Index(['MovieID', 'Title', 'Genres'], dtype='object')

Here, we do some simple data processing to fix the data types of the columns.

In [7]:
users["UserID"] = users["UserID"].apply(lambda x: f"user_{x}")
users["Age"] = users["Age"].apply(lambda x: f"group_{x}")
users["Occupation"] = users["Occupation"].apply(lambda x: f"occupation_{x}")
movies = pd.merge(movies_dat_df, title_basics_df, left_on='Title', right_on='primaryTitle', how='left')

# Replace NaN values with empty strings
movies['Genres'] = movies['Genres'].fillna('')
movies['genres'] = movies['genres'].fillna('')

# Replace '|' with ',' in Genres column and convert to lowercase
movies['Genres'] = movies['Genres'].str.replace('|', ',')
movies['Genres'] = movies['Genres'].str.lower()

# Convert genres column to lowercase
movies['genres'] = movies['genres'].str.lower()

#Delete the \\N values in the genres column
movies = movies[movies['genres'] != '\\n']

# Function to combine and clean genre columns
def combine_genres(row):
    genres = set(row['genres'].split(',')) | set(row['Genres'].split(','))
    genres.discard('')  # Remove empty strings
    return ','.join(sorted(genres))

# Apply the function to combine the genres
movies['combined_genres'] = movies.apply(combine_genres, axis=1)

# Drop the original genre columns
movies = movies.drop(columns=['genres', 'Genres', 'primaryTitle'])
movies = pd.merge(movies, title_ratings_df, on='tconst', how='left')
movies = movies.drop(columns=['tconst'])
# Drop the column endYear as it is always NaN
movies = movies.drop(columns=['endYear'])
# Drop also the column titleType as we will keep only the rated movies
movies = movies.drop(columns=['titleType'])
# Drop the original Title column as we will use the cleaned one
movies = movies.drop(columns=['Title'])
# For the runtimeMinutes column, we will replace the NaN values with the median
movies['runtimeMinutes'] = pd.to_numeric(movies['runtimeMinutes'], errors='coerce')
movies['runtimeMinutes'] = movies['runtimeMinutes'].fillna(movies['runtimeMinutes'].median())
movies["MovieID"] = movies["MovieID"].apply(lambda x: f"movie_{x}")

ratings["MovieID"] = ratings["MovieID"].apply(lambda x: f"movie_{x}")
ratings["UserID"] = ratings["UserID"].apply(lambda x: f"user_{x}")
ratings["Rating"] = ratings["Rating"].apply(lambda x: float(x))


Each movie has multiple genres. We split them into separate columns in the `movies`
DataFrame.

In [8]:
all_genres = set()
for genres in movies['combined_genres'].str.split(','):
    all_genres.update(genres)
all_genres = list(all_genres)
for genre in all_genres:
    movies[genre] = movies["combined_genres"].apply(
        lambda values: int(genre in values.split(","))
    )

### Transform the movie ratings data into sequences

First, let's sort the the ratings data using the `unix_timestamp`, and then group the
`movie_id` values and the `rating` values by `user_id`.

The output DataFrame will have a record for each `user_id`, with two ordered lists
(sorted by rating datetime): the movies they have rated, and their ratings of these movies.


In [9]:
ratings_group = ratings.sort_values(by=["Timestamp"]).groupby("UserID")
ratings_data = pd.DataFrame(
    data={
        "UserID": list(ratings_group.groups.keys()),
        "MovieIDs": list(ratings_group.MovieID.apply(list)),
        "Ratings": list(ratings_group.Rating.apply(list)),
        "Timestamps": list(ratings_group.Timestamp.apply(list)),
    }
)

Now, let's split the `movie_ids` list into a set of sequences of a fixed length.
We do the same for the `ratings`. Set the `sequence_length` variable to change the length
of the input sequence to the model. You can also change the `step_size` to control the
number of sequences to generate for each user.


In [10]:
sequence_length = 4
step_size = 2


def create_sequences(values, window_size, step_size):
    sequences = []
    start_index = 0
    while True:
        end_index = start_index + window_size
        seq = values[start_index:end_index]
        if len(seq) < window_size:
            seq = values[-window_size:]
            if len(seq) == window_size:
                sequences.append(seq)
            break
        sequences.append(seq)
        start_index += step_size
    return sequences


ratings_data.MovieIDs = ratings_data.MovieIDs.apply(
    lambda ids: create_sequences(ids, sequence_length, step_size)
)

ratings_data.Ratings = ratings_data.Ratings.apply(
    lambda ids: create_sequences(ids, sequence_length, step_size)
)

del ratings_data["Timestamps"]


After that, we process the output to have each sequence in a separate records in
the DataFrame. In addition, we join the user features with the ratings data.


In [11]:
ratings_data_movies = ratings_data[["UserID", "MovieIDs"]].explode(
    "MovieIDs", ignore_index=True
)
ratings_data_rating = ratings_data[["Ratings"]].explode("Ratings", ignore_index=True)
ratings_data_transformed = pd.concat([ratings_data_movies, ratings_data_rating], axis=1)
ratings_data_transformed = ratings_data_transformed.join(
    users.set_index("UserID"), on="UserID"
)
ratings_data_transformed.MovieIDs = ratings_data_transformed.MovieIDs.apply(
    lambda x: ",".join(x)
)
ratings_data_transformed.Ratings = ratings_data_transformed.Ratings.apply(
    lambda x: ",".join([str(v) for v in x])
)

del ratings_data_transformed["Zip-code"]

ratings_data_transformed.rename(
    columns={"MovieIDs": "sequence_movie_ids", "Ratings": "sequence_ratings"},
    inplace=True,
)


With `sequence_length` of 4 and `step_size` of 2, we end up with 498,623 sequences.

Finally, we split the data into training and testing splits, with 85% and 15% of
the instances, respectively, and store them to CSV files.


In [12]:
random_selection = np.random.rand(len(ratings_data_transformed.index)) <= 0.85
train_data = ratings_data_transformed[random_selection]
test_data = ratings_data_transformed[~random_selection]

train_data.to_csv(os.path.join(EXTRACT_DIR , "train_data.csv"), index=False, sep="|", header=False)
test_data.to_csv(os.path.join(EXTRACT_DIR , "test_data.csv"), index=False, sep="|", header=False)


In [13]:
users

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,user_1,F,group_1,occupation_10,48067
1,user_2,M,group_56,occupation_16,70072
2,user_3,M,group_25,occupation_15,55117
3,user_4,M,group_45,occupation_7,02460
4,user_5,M,group_25,occupation_20,55455
...,...,...,...,...,...
6035,user_6036,F,group_25,occupation_15,32603
6036,user_6037,F,group_45,occupation_1,76006
6037,user_6038,F,group_56,occupation_1,14706
6038,user_6039,F,group_45,occupation_0,01060


## Define metadata

In [None]:
movies

In [14]:
CSV_HEADER = list(ratings_data_transformed.columns)

CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    "UserID": list(users.UserID.unique()),
    "MovieID": list(movies.MovieID.unique()),
    "Gender": list(users.Gender.unique()),
    "Age": list(users.Age.unique()),
    "Occupation": list(users.Occupation.unique()),
}

USER_FEATURES = ["Gender", "Age", "Occupation"]

MOVIE_FEATURES = ["genres"]


In [15]:
movies

Unnamed: 0,MovieID,originalTitle,isAdult,startYear,runtimeMinutes,combined_genres,averageRating,numVotes,action,thriller,...,reality-tv,adventure,mystery,news,fantasy,sport,western,animation,history,drama
0,movie_1,Toy Story,0,1995,81.0,"adventure,animation,children's,comedy",8.3,1074033.0,0,0,...,0,1,0,0,0,0,0,1,0,0
1,movie_2,Jumanji,0,1995,104.0,"adventure,children's,comedy,family,fantasy",7.1,379284.0,0,0,...,0,1,0,0,1,0,0,0,0,0
2,movie_3,Grumpier Old Men,0,1995,101.0,"comedy,romance",6.6,29842.0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,movie_4,Waiting to Exhale,0,1995,124.0,"comedy,drama,romance",6.0,12281.0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,movie_5,Father of the Bride Part II,0,1995,106.0,"comedy,family,romance",6.1,41883.0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8372,movie_3950,Tigerland,0,2000,101.0,"drama,war",6.9,43453.0,0,0,...,0,0,0,0,0,0,0,0,0,1
8373,movie_3950,Taken by the Tiger,0,2019,91.0,"documentary,drama",6.9,72.0,0,0,...,0,0,0,0,0,0,0,0,0,1
8374,movie_3950,Tigerland,0,\N,97.0,drama,,,0,0,...,0,0,0,0,0,0,0,0,0,1
8375,movie_3951,Two Family House,0,2000,108.0,"comedy,drama,romance",7.2,1732.0,0,0,...,0,0,0,0,0,0,0,0,0,1


## Create `tf.data.Dataset` for training and evaluation

In [16]:
def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
    def process(features):
        movie_ids_string = features["sequence_movie_ids"]
        sequence_movie_ids = tf.strings.split(movie_ids_string, ",").to_tensor()

        # The last movie id in the sequence is the target movie.
        features["target_movie_id"] = sequence_movie_ids[:, -1]
        features["sequence_movie_ids"] = sequence_movie_ids[:, :-1]

        ratings_string = features["sequence_ratings"]
        sequence_ratings = tf.strings.to_number(
            tf.strings.split(ratings_string, ","), tf.dtypes.float32
        ).to_tensor()

        # The last rating in the sequence is the target for the model to predict.
        target = sequence_ratings[:, -1]
        features["sequence_ratings"] = sequence_ratings[:, :-1]

        return features, target

    dataset = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=CSV_HEADER,
        num_epochs=1,
        header=False,
        field_delim="|",
        shuffle=shuffle,
    ).map(process)

    return dataset

## Create model inputs

In [17]:
def create_model_inputs():
    return {
        "UserID": layers.Input(name="UserID", shape=(1,), dtype=tf.string),
        "sequence_movie_ids": layers.Input(
            name="sequence_movie_ids", shape=(sequence_length - 1,), dtype=tf.string
        ),
        "target_movie_id": layers.Input(
            name="target_movie_id", shape=(1,), dtype=tf.string
        ),
        "sequence_ratings": layers.Input(
            name="sequence_ratings", shape=(sequence_length - 1,), dtype=tf.float32
        ),
        "Gender": layers.Input(name="Gender", shape=(1,), dtype=tf.string),
        "Age": layers.Input(name="Age", shape=(1,), dtype=tf.string),
        "Occupation": layers.Input(name="Occupation", shape=(1,), dtype=tf.string),
    }

## Encode input features

The `encode_input_features` method works as follows:

1. Each categorical user feature is encoded using `layers.Embedding`, with embedding
   dimension equals to the square root of the vocabulary size of the feature.
   The embeddings of these features are concatenated to form a single input tensor.

2. Each movie in the movie sequence and the target movie is encoded `layers.Embedding`,
   where the dimension size is the square root of the number of movies.

3. A multi-hot genres vector for each movie is concatenated with its embedding vector,
   and processed using a non-linear `layers.Dense` to output a vector of the same movie
   embedding dimensions.

4. A positional embedding is added to each movie embedding in the sequence, and then
   multiplied by its rating from the ratings sequence.

5. The target movie embedding is concatenated to the sequence movie embeddings, producing
   a tensor with the shape of `[batch size, sequence length, embedding size]`, as expected
   by the attention layer for the transformer architecture.

6. The method returns a tuple of two elements: `encoded_transformer_features` and
   `encoded_other_features`.


In [32]:
class ExpandDimsLayer(layers.Layer):
    def __init__(self, axis, **kwargs):
        super(ExpandDimsLayer, self).__init__(**kwargs)
        self.axis = axis

    def call(self, inputs):
        return tf.expand_dims(inputs, axis=self.axis)

class SplitLayer(layers.Layer):
    def __init__(self, num_or_size_splits, axis=0, **kwargs):
        super(SplitLayer, self).__init__(**kwargs)
        self.num_or_size_splits = num_or_size_splits
        self.axis = axis

    def call(self, inputs):
        return tf.split(inputs, num_or_size_splits=self.num_or_size_splits, axis=self.axis)

In [64]:
class ExpandDimsLayer(layers.Layer):
    def __init__(self, axis, **kwargs):
        super(ExpandDimsLayer, self).__init__(**kwargs)
        self.axis = axis

    def call(self, inputs):
        return tf.expand_dims(inputs, axis=self.axis)

class SplitLayer(layers.Layer):
    def __init__(self, num_or_size_splits, axis=0, **kwargs):
        super(SplitLayer, self).__init__(**kwargs)
        self.num_or_size_splits = num_or_size_splits
        self.axis = axis

    def call(self, inputs):
        return tf.split(inputs, num_or_size_splits=self.num_or_size_splits, axis=self.axis)

class SqueezeLayer(layers.Layer):
    def __init__(self, axis, **kwargs):
        super(SqueezeLayer, self).__init__(**kwargs)
        self.axis = axis

    def call(self, inputs):
        return tf.squeeze(inputs, axis=self.axis)

def encode_input_features(
    inputs,
    include_user_id=True,
    include_user_features=True,
    include_movie_features=True,
    sequence_length=3
):
    encoded_transformer_features = []
    encoded_other_features = []

    other_feature_names = []
    if include_user_id:
        other_feature_names.append("UserID")
    if include_user_features:
        other_feature_names.extend(USER_FEATURES)

    for feature_name in other_feature_names:
        vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
        idx = layers.StringLookup(vocabulary=vocabulary, mask_token=None, num_oov_indices=0)(
            inputs[feature_name]
        )
        embedding_dims = int(math.sqrt(len(vocabulary)))
        embedding_encoder = layers.Embedding(
            input_dim=len(vocabulary),
            output_dim=embedding_dims,
            name=f"{feature_name}_embedding",
        )
        encoded_other_features.append(embedding_encoder(idx))

    if len(encoded_other_features) > 1:
        encoded_other_features = layers.Concatenate()(encoded_other_features)
    elif len(encoded_other_features) == 1:
        encoded_other_features = encoded_other_features[0]
    else:
        encoded_other_features = None

    movie_vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY["MovieID"]
    movie_embedding_dims = int(math.sqrt(len(movie_vocabulary)))
    max_movie_id = 'movie_3952'
    movie_index_lookup = layers.StringLookup(
        max_tokens=max_movie_id + 1,
        vocabulary=movie_vocabulary,
        mask_token=None,
        num_oov_indices=1,
        oov_token='[UNK]',  # Handle OOV values explicitly
        name="movie_index_lookup",
    )
    movie_embedding_encoder = layers.Embedding(
        input_dim=len(movie_vocabulary),
        output_dim=movie_embedding_dims,
        name="movie_embedding",
    )
    genre_vectors = movies[genres].to_numpy()
    movie_genres_lookup = layers.Embedding(
        input_dim=genre_vectors.shape[0],
        output_dim=genre_vectors.shape[1],
        embeddings_initializer=tf.keras.initializers.Constant(genre_vectors),
        trainable=False,
        name="genres_vector",
    )
    movie_embedding_processor = layers.Dense(
        units=movie_embedding_dims,
        activation="relu",
        name="process_movie_embedding_with_genres",
    )

    def encode_movie(movie_id):
        movie_idx = movie_index_lookup(movie_id)
        movie_embedding = movie_embedding_encoder(movie_idx)
        encoded_movie = movie_embedding
        if include_movie_features:
            movie_genres_vector = movie_genres_lookup(movie_idx)
            encoded_movie = movie_embedding_processor(
                layers.Concatenate()([movie_embedding, movie_genres_vector])
            )
        return encoded_movie

    target_movie_id = inputs["target_movie_id"]
    encoded_target_movie = encode_movie(target_movie_id)

    sequence_movies_ids = inputs["sequence_movie_ids"]
    encoded_sequence_movies = encode_movie(sequence_movies_ids)
    position_embedding_encoder = layers.Embedding(
        input_dim=sequence_length,
        output_dim=movie_embedding_dims,
        name="position_embedding",
    )
    positions = tf.range(start=0, limit=sequence_length, delta=1)
    encoded_positions = position_embedding_encoder(positions)
    sequence_ratings = ExpandDimsLayer(axis=-1)(inputs["sequence_ratings"])
    encoded_sequence_movies_with_position_and_rating = layers.Multiply()(
        [(encoded_sequence_movies + encoded_positions), sequence_ratings]
    )

    split_layer = SplitLayer(num_or_size_splits=sequence_length, axis=1)
    split_movies = split_layer(encoded_sequence_movies_with_position_and_rating)

    for encoded_movie in split_movies:
        encoded_transformer_features.append(encoded_movie)

    expand_dims_layer = ExpandDimsLayer(axis=1)
    expanded_target_movie = expand_dims_layer(encoded_target_movie)

    # Remove the extra dimension from expanded_target_movie using a custom layer
    squeeze_layer = SqueezeLayer(axis=2)
    expanded_target_movie = squeeze_layer(expanded_target_movie)
    
    encoded_transformer_features.append(expanded_target_movie)

    encoded_transformer_features = layers.Concatenate(axis=1)(encoded_transformer_features)

    return encoded_transformer_features, encoded_other_features

## Create a BST model

In [65]:
include_user_id = False
include_user_features = False
include_movie_features = False

hidden_units = [256, 128]
dropout_rate = 0.1
num_heads = 3

def create_model():
    inputs = create_model_inputs()
    transformer_features, other_features = encode_input_features(
        inputs, include_user_id, include_user_features, include_movie_features
    )

    # Create a multi-headed attention layer.
    attention_output = layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=transformer_features.shape[2], dropout=dropout_rate
    )(transformer_features, transformer_features)

    # Transformer block.
    attention_output = layers.Dropout(dropout_rate)(attention_output)
    x1 = layers.Add()([transformer_features, attention_output])
    x1 = layers.LayerNormalization()(x1)
    x2 = layers.LeakyReLU()(x1)
    x2 = layers.Dense(units=x2.shape[-1])(x2)
    x2 = layers.Dropout(dropout_rate)(x2)
    transformer_features = layers.Add()([x1, x2])
    transformer_features = layers.LayerNormalization()(transformer_features)
    features = layers.Flatten()(transformer_features)

    # Included the other features.
    if other_features is not None:
        features = layers.concatenate(
            [features, layers.Reshape([other_features.shape[-1]])(other_features)]
        )

    # Fully-connected layers.
    for num_units in hidden_units:
        features = layers.Dense(num_units)(features)
        features = layers.BatchNormalization()(features)
        features = layers.LeakyReLU()(features)
        features = layers.Dropout(dropout_rate)(features)

    outputs = layers.Dense(units=1)(features)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model

model = create_model()

TypeError: can only concatenate str (not "int") to str

In [66]:
model.summary()

## Run training and evaluation experiment


In [67]:
# Compile the model.
model.compile(
    optimizer=keras.optimizers.Adagrad(learning_rate=0.01),
    loss=keras.losses.MeanSquaredError(),
    metrics=[keras.metrics.MeanAbsoluteError()],
)

# Read the training data.
train_dataset = get_dataset_from_csv(
    os.path.join(EXTRACT_DIR , "train_data.csv"), shuffle=True, batch_size=265
)

# Fit the model with the training data.
model.fit(train_dataset, epochs=5)

# Read the test data.
test_dataset = get_dataset_from_csv(os.path.join(EXTRACT_DIR , "test_data.csv"), batch_size=265)

# Evaluate the model on the test data.
_, mae = model.evaluate(test_dataset, verbose=0)
print(f"Test MAE: {round(mae, 3)}")


Epoch 1/5
      1/Unknown [1m6s[0m 6s/step - loss: 6.7570 - mean_absolute_error: 2.3071

InvalidArgumentError: Graph execution error:

Detected at node functional_9_1/movie_embedding_1/GatherV2 defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\tornado\platform\asyncio.py", line 205, in start

  File "C:\Python312\Lib\asyncio\base_events.py", line 639, in run_forever

  File "C:\Python312\Lib\asyncio\base_events.py", line 1985, in _run_once

  File "C:\Python312\Lib\asyncio\events.py", line 88, in _run

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\ipykernel\kernelbase.py", line 545, in dispatch_queue

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\ipykernel\kernelbase.py", line 534, in process_one

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\ipykernel\kernelbase.py", line 437, in dispatch_shell

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\ipykernel\ipkernel.py", line 362, in execute_request

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\ipykernel\kernelbase.py", line 778, in execute_request

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\ipykernel\ipkernel.py", line 449, in do_execute

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\ipykernel\zmqshell.py", line 549, in run_cell

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3075, in run_cell

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3130, in _run_cell

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3334, in run_cell_async

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3517, in run_ast_nodes

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code

  File "C:\Users\marti\AppData\Local\Temp\ipykernel_6264\1875749509.py", line 14, in <module>

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 314, in fit

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 117, in one_step_on_iterator

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 104, in one_step_on_data

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 51, in train_step

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\keras\src\layers\layer.py", line 846, in __call__

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\keras\src\ops\operation.py", line 48, in __call__

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\keras\src\utils\traceback_utils.py", line 156, in error_handler

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\keras\src\models\functional.py", line 202, in call

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\keras\src\ops\function.py", line 155, in _run_through_graph

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\keras\src\models\functional.py", line 592, in call

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\keras\src\layers\layer.py", line 846, in __call__

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\keras\src\ops\operation.py", line 48, in __call__

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\keras\src\utils\traceback_utils.py", line 156, in error_handler

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\keras\src\layers\core\embedding.py", line 146, in call

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\keras\src\ops\numpy.py", line 4850, in take

  File "c:\Users\marti\Documents\Epita\SCIA\Recommender\recommender_system\venv\Lib\site-packages\keras\src\backend\tensorflow\numpy.py", line 1940, in take

indices[148,2] = 3879 is not in [0, 3879)
	 [[{{node functional_9_1/movie_embedding_1/GatherV2}}]] [Op:__inference_one_step_on_iterator_47284]

In [58]:
movie_vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY["MovieID"]
movie_embedding_dims = int(math.sqrt(len(movie_vocabulary)))
movie_vocabulary

['movie_1',
 'movie_2',
 'movie_3',
 'movie_4',
 'movie_5',
 'movie_6',
 'movie_7',
 'movie_8',
 'movie_9',
 'movie_10',
 'movie_11',
 'movie_12',
 'movie_13',
 'movie_14',
 'movie_15',
 'movie_16',
 'movie_17',
 'movie_18',
 'movie_19',
 'movie_20',
 'movie_21',
 'movie_22',
 'movie_23',
 'movie_24',
 'movie_25',
 'movie_26',
 'movie_27',
 'movie_28',
 'movie_29',
 'movie_30',
 'movie_31',
 'movie_32',
 'movie_33',
 'movie_34',
 'movie_35',
 'movie_36',
 'movie_37',
 'movie_38',
 'movie_39',
 'movie_40',
 'movie_41',
 'movie_42',
 'movie_43',
 'movie_44',
 'movie_45',
 'movie_46',
 'movie_47',
 'movie_48',
 'movie_49',
 'movie_50',
 'movie_51',
 'movie_52',
 'movie_53',
 'movie_54',
 'movie_55',
 'movie_56',
 'movie_57',
 'movie_58',
 'movie_59',
 'movie_60',
 'movie_61',
 'movie_62',
 'movie_63',
 'movie_64',
 'movie_65',
 'movie_66',
 'movie_67',
 'movie_68',
 'movie_69',
 'movie_70',
 'movie_71',
 'movie_72',
 'movie_73',
 'movie_74',
 'movie_75',
 'movie_76',
 'movie_77',
 'movie_

In [59]:
train_dataset

<_MapDataset element_spec=(OrderedDict({'UserID': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'sequence_movie_ids': TensorSpec(shape=(None, None), dtype=tf.string, name=None), 'sequence_ratings': TensorSpec(shape=(None, None), dtype=tf.float32, name=None), 'Gender': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'Age': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'Occupation': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'target_movie_id': TensorSpec(shape=(None,), dtype=tf.string, name=None)}), TensorSpec(shape=(None,), dtype=tf.float32, name=None))>