## Objective - Information Retrieval:: Retrieve relevant movie candidates

In [1]:
# load some libraries
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
import tensorflow_recommenders as tfrs
import datetime
import faiss
import warnings
import tempfile
from typing import Dict, Text
import os
warnings.filterwarnings('ignore')

In [2]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

# Clear any logs from previous runs
! rm -rf ./logs/

In [3]:
# load the data
ratings_data = pd.read_csv('../data_files/title.basics.csv')
users_data = pd.read_csv('../data_files/title.user-rating.csv')

#select needed features from ratings
ratings_data = ratings_data[['tconst', 'originalTitle', 'genres', 'runtimeMinutes']]

# rename column
ratings_data.rename(columns={'tconst': 'movieID'}, inplace=True)

data = users_data.merge(ratings_data, on='movieID')
data.head()

Unnamed: 0,userID,movieID,rating,review date,originalTitle,genres,runtimeMinutes
0,ur4592644,tt0120884,10,16 January 2005,When the Light Comes,"Adventure,Drama,Romance",115
1,ur3174947,tt0118688,3,16 January 2005,Batman & Robin,"Action,Sci-Fi",125
2,ur3780035,tt0387887,8,16 January 2005,Bottom Live 2003: Weapons Grade Y-Fronts Tour,Comedy,93
3,ur4592628,tt0346491,1,16 January 2005,Alexander,"Action,Biography,Drama",175
4,ur3174947,tt0094721,8,16 January 2005,Beetlejuice,"Comedy,Fantasy",92


In [4]:
data.shape

(2400016, 7)

## Step 1: Basic data preprocessing

In [5]:
# create a new movie title list that enforces type uniformity in its values
# might come in handy when creaying the movie title vocalbulary
updated_movie_titles = []
for i in data['originalTitle']:
    if type(i) != 'str':
        a = str(i)
        updated_movie_titles.append(a)
    else:
        updated_movie_titles.append(i)

In [6]:
data['originalTitle'] = updated_movie_titles
data['originalTitle']

0                                   When the Light Comes
1                                         Batman & Robin
2          Bottom Live 2003: Weapons Grade Y-Fronts Tour
3                                              Alexander
4                                            Beetlejuice
                               ...                      
2400011                        Robin Hood: Men in Tights
2400012                                   Batman Returns
2400013                          Lipstick on Your Collar
2400014                                  Out for Justice
2400015                                     The I Inside
Name: originalTitle, Length: 2400016, dtype: object

In [7]:
# Due to data size and training time, we will use some records
df = data.sample(n = 100000, random_state = 50, axis = 0)

In [8]:
df.shape

(100000, 7)

### Step1.1: convert the string typed review date to datetime format

In [9]:
# convert the review date to datetime
df['review date'] = pd.to_datetime(df['review date'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 235927 to 575509
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   userID          100000 non-null  object        
 1   movieID         100000 non-null  object        
 2   rating          100000 non-null  int64         
 3   review date     100000 non-null  datetime64[ns]
 4   originalTitle   100000 non-null  object        
 5   genres          100000 non-null  object        
 6   runtimeMinutes  100000 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 6.1+ MB


In [10]:
# convert the datetime to just unix timestamp
df['review date in unix'] = [datetime.datetime.timestamp(time) for time in df['review date']]
df.head()

Unnamed: 0,userID,movieID,rating,review date,originalTitle,genres,runtimeMinutes,review date in unix
235927,ur9821471,tt0434409,10,2006-03-18,V for Vendetta,"Action,Drama,Sci-Fi",132,1142640000.0
553747,ur15651776,tt0076070,6,2007-08-23,The Gauntlet,"Action,Crime,Thriller",109,1187824000.0
448345,ur13504572,tt0300015,8,2007-01-28,I Capture the Castle,"Drama,Romance",113,1169942000.0
612775,ur4481891,tt0401383,9,2008-01-06,Le scaphandre et le papillon,"Biography,Drama",112,1199578000.0
989315,ur0565973,tt0110413,10,2002-11-03,L√©on,"Action,Crime,Drama",110,1036282000.0


In [11]:
# sort the data by review date so that we can split the data into training, testing and validation sets
df.sort_values(by='review date', inplace=True)
df.reset_index(drop=True, inplace=True)
df.head(20)

Unnamed: 0,userID,movieID,rating,review date,originalTitle,genres,runtimeMinutes,review date in unix
0,ur0000002,tt0134619,4,1998-07-27,Disturbing Behavior,"Horror,Mystery,Sci-Fi",84,901494000.0
1,ur0000059,tt0119822,8,1998-07-29,As Good as It Gets,"Comedy,Drama,Romance",139,901666800.0
2,ur0053637,tt0120746,10,1998-07-29,The Mask of Zorro,"Action,Adventure,Comedy",136,901666800.0
3,ur0086088,tt0119668,1,1998-07-31,Midnight in the Garden of Good and Evil,"Crime,Drama,Mystery",155,901839600.0
4,ur0087139,tt0134619,7,1998-07-31,Disturbing Behavior,"Horror,Mystery,Sci-Fi",84,901839600.0
5,ur0089183,tt0120591,10,1998-08-01,Armageddon,"Action,Adventure,Sci-Fi",151,901926000.0
6,ur0087349,tt0120685,3,1998-08-02,Godzilla,"Action,Sci-Fi,Thriller",139,902012400.0
7,ur0086207,tt0120591,10,1998-08-02,Armageddon,"Action,Adventure,Sci-Fi",151,902012400.0
8,ur0068895,tt0120609,9,1998-08-02,The Big Hit,"Action,Comedy,Crime",91,902012400.0
9,ur0085471,tt0114369,9,1998-08-02,Se7en,"Crime,Drama,Mystery",127,902012400.0


### Step1.2: split the data(into train, test and val) based on date so as you use past records for training 

In [12]:
# split the data into training, testing and validation sets 
train = df[:int(0.7*len(df))]
test = df[int(0.7*len(df)):int(0.85*len(df))]
val = df[int(0.85*len(df)):]
print('train:', train.shape)
print('test:', test.shape)
print('val:', val.shape)    

train: (70000, 8)
test: (15000, 8)
val: (15000, 8)


In [13]:
# drop the review date column
train = train.drop('review date', axis=1)
test = test.drop('review date', axis=1)
val = val.drop('review date', axis=1)

### Step1.3: Convert the pandas dataset to tensor dataset

In [14]:
# convert the train, test and val into a tensorflow dataset
train_df = tf.data.Dataset.from_tensor_slices(train.to_dict('list'))
test_df = tf.data.Dataset.from_tensor_slices(test.to_dict('list'))
val_df = tf.data.Dataset.from_tensor_slices(val.to_dict('list'))


2025-04-08 16:30:53.804933: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2025-04-08 16:30:53.804968: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-04-08 16:30:53.804979: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-04-08 16:30:53.805011: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-04-08 16:30:53.805028: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


### Step 1.4: Extract the needed features and create a json object for use in our training

In [15]:
# selecting just the user id, movie title, ratings, genres, runtimemins and reviewdate
train_df = train_df.map(lambda x : {
    'userID': x['userID'],
    'originalTitle': x['originalTitle'],
    'rating':x['rating'],
    'genres': x['genres'],
    'runtimeMinutes': x['runtimeMinutes'],
    'review date in unix': x['review date in unix']
    
})
test_df = test_df.map(lambda x : {
    'userID': x['userID'],
    'originalTitle': x['originalTitle'],
    'rating':x['rating'],
    'genres': x['genres'],
    'runtimeMinutes': x['runtimeMinutes'],
    'review date in unix': x['review date in unix']
})
val_df = val_df.map(lambda x : {
    'userID': x['userID'],
    'originalTitle': x['originalTitle'],
    'rating':x['rating'],
    'genres': x['genres'],
    'runtimeMinutes': x['runtimeMinutes'],
    'review date in unix': x['review date in unix']
})

In [16]:
for data in val_df.take(1).as_numpy_iterator(): # to check if our selection worked
    print(data)

{'userID': b'ur0482513', 'originalTitle': b'Psyche 59', 'rating': 3, 'genres': b'Drama,Mystery,Romance', 'runtimeMinutes': b'94', 'review date in unix': 1533769200.0}


### Step 1.5: Obtain a movie title and user id tensor items

In [17]:
# obtaining a movie title list

# first convert df to a tensorflow dataset
df = df.drop('review date', axis= 1)

df_tensor = tf.data.Dataset.from_tensor_slices(df.to_dict('list'))


## ===== these will come in handy when creating our vocabularies
# select just the movie titles
movie_titles =  df_tensor.map(lambda x: x['originalTitle'])

# select just the genre
genres =  df_tensor.map(lambda x: x['genres'])

movietitle_genres = df_tensor.map( lambda x: {
        'originalTitle' : x['originalTitle'],
        'genres': x['genres']
    }
)

# select just the review date unix timestamp
timestamp =  df_tensor.map(lambda x: x['review date in unix'])

# selecting just the user id
user_ids = df_tensor.map(lambda x: x['userID'])

# view the movie titles
movie_titles

<_MapDataset element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [18]:
for val in movie_titles.take(5).as_numpy_iterator():
    print(val)

b'Disturbing Behavior'
b'As Good as It Gets'
b'The Mask of Zorro'
b'Midnight in the Garden of Good and Evil'
b'Disturbing Behavior'


In [19]:
for val in movietitle_genres.take(5).as_numpy_iterator():
    print(val)

{'originalTitle': b'Disturbing Behavior', 'genres': b'Horror,Mystery,Sci-Fi'}
{'originalTitle': b'As Good as It Gets', 'genres': b'Comedy,Drama,Romance'}
{'originalTitle': b'The Mask of Zorro', 'genres': b'Action,Adventure,Comedy'}
{'originalTitle': b'Midnight in the Garden of Good and Evil', 'genres': b'Crime,Drama,Mystery'}
{'originalTitle': b'Disturbing Behavior', 'genres': b'Horror,Mystery,Sci-Fi'}


### Step 1.6: Create a movie, timestamp and user vocabulary. While for the genres, we will apply text vectorization technique later

In [20]:
# NB: batching the mapdataset introduces dimension.
movie_titles_batched = movie_titles.batch(1000)
user_ids_batched = user_ids.batch(1000)
timestamp_batched=timestamp.batch(1000)
genres_batched = genres.batch(1000)


# obtain the unique movie titles and user ids
unique_movie_titles = np.unique(np.concatenate(list(movie_titles_batched)))
unique_user_ids = np.unique(np.concatenate(list(user_ids_batched)))
timestamp =  np.concatenate(list(timestamp_batched)) # no need to obtain unique timestamp. it most likely wiil be different
unique_genres = np.unique(np.concatenate(list(genres_batched)))

# for our timestamp we will apply standardization or normalization technique,
# We will also create its embeddings. Its embeddings can be gotten by first creating a bucketed timestamp which then acts more categorical 
#Hence, we obtain the bucketed timestamp to be used later

max_timestamp = timestamp.max()
min_timestamp = timestamp.min()

timestamp_bucket = np.linspace(
    min_timestamp, max_timestamp, num=1000)

In [21]:
unique_movie_titles.shape

(34132,)

In [22]:
timestamp_bucket[:50]

array([9.01494016e+08, 9.02200182e+08, 9.02906347e+08, 9.03612513e+08,
       9.04318679e+08, 9.05024845e+08, 9.05731010e+08, 9.06437176e+08,
       9.07143342e+08, 9.07849508e+08, 9.08555673e+08, 9.09261839e+08,
       9.09968005e+08, 9.10674171e+08, 9.11380336e+08, 9.12086502e+08,
       9.12792668e+08, 9.13498834e+08, 9.14204999e+08, 9.14911165e+08,
       9.15617331e+08, 9.16323497e+08, 9.17029662e+08, 9.17735828e+08,
       9.18441994e+08, 9.19148160e+08, 9.19854325e+08, 9.20560491e+08,
       9.21266657e+08, 9.21972823e+08, 9.22678988e+08, 9.23385154e+08,
       9.24091320e+08, 9.24797486e+08, 9.25503651e+08, 9.26209817e+08,
       9.26915983e+08, 9.27622149e+08, 9.28328314e+08, 9.29034480e+08,
       9.29740646e+08, 9.30446812e+08, 9.31152977e+08, 9.31859143e+08,
       9.32565309e+08, 9.33271475e+08, 9.33977640e+08, 9.34683806e+08,
       9.35389972e+08, 9.36096138e+08])

## Step 2: Model development - 2 tower retrieval model

### Query tower

In [23]:
class UserModel(tf.keras.Model):

    def __init__(self, use_timestamp):
        super().__init__()
        
        self.use_timestamp =  use_timestamp
        
        # converting user ids to integers and then to embeddings using keras preprocessing layers
        self.user_embedding = tf.keras.Sequential(
            [
            tf.keras.layers.StringLookup( # convert the string user ids to integer indices
                vocabulary = unique_user_ids, mask_token=None
            ),
            tf.keras.layers.Embedding( # convert the indices to vector embeddings
                len(unique_user_ids) + 1, 32
            )
            ]
        )

        #  incorporating timestamps to model user preferences at a point in time.
        
        # depending on the timestamp value it switches on and off this feature influence in our matrix computation 
        # dual operations: Firstly: obtain timestamp embeddings
        if self.use_timestamp:
            self.timestamp_embeddings = tf.keras.Sequential([
                tf.keras.layers.Discretization(
                    timestamp_bucket.tolist()
                ),
                tf.keras.layers.Embedding(
                    len(timestamp_bucket) + 1, 32
                )
            ])
            # Secondly normalize timestamp
            self.normalized_timestamp =  tf.keras.layers.Normalization(
                axis=None
            )
            self.normalized_timestamp.adapt(timestamp)

    def call(self, inputs):
        if not self.use_timestamp:
            return self.user_embedding(inputs['userID'])

        return tf.concat(
            [
            self.user_embedding(inputs['userID']),
            self.timestamp_embeddings(inputs['review date in unix']),
            tf.reshape(self.normalized_timestamp(inputs['review date in unix']), (-1, 1))
        ], axis=1)
        


In [24]:
# To capture more complex relationships, such as user preferences evolving over time, 
# we may need a deeper model with multiple stacked dense layers - Deep query retrieval model

# Full Query model
class QueryModel(tf.keras.Model):
    """ model for encoding user features """
    
    def __init__(self, layer_sizes, use_timestamp):

        """ initialize the user model embedding layer and the dense layer
        Args:
        * layer_sizes: a list of inttegers to create the dense layer depth

        * use_timestamp -  a boolean variable - that helps introduce some additional impact on the relationship between timing and user's preference
        
        """
        super().__init__()

        # pass the user model
        self.query_embedding_model = UserModel(use_timestamp)

        # add the dense layer
        self.dense_layers = tf.keras.Sequential()

        # using the relu activation for all the layers except the last. This helps introduce non-linearity for
        # studying complex relationships
        for layer in layer_sizes[:-1]:
            self.dense_layers.add(tf.keras.layers.Dense(layer, activation='relu'))

        # for the last layer
        for layer in layer_sizes[-1:]:
            self.dense_layers.add(tf.keras.layers.Dense(layer))

    def call(self, inputs):
        feature_embeddings = self.query_embedding_model(inputs)
        return self.dense_layers(feature_embeddings)
            
            
        
    

## Candidate tower

In [25]:
class MovieModel(tf.keras.Model):
    def __init__(self):
        super().__init__()
        max_token = 10000 # maximum number of tokens to be generated in the vocabulary

        self.movie_embeddings = tf.keras.Sequential(
            [
            tf.keras.layers.StringLookup(
                vocabulary = unique_movie_titles, mask_token =None
            ),
            tf.keras.layers.Embedding(
                len(unique_movie_titles) + 1, 32
            )
            ]
        )

        self.genre_embeddings = tf.keras.Sequential([
            tf.keras.layers.TextVectorization(
                max_tokens=max_token,
                vocabulary= unique_genres
            ),
            tf.keras.layers.Embedding(
                max_token, 32, mask_zero=True
            ),
            tf.keras.layers.GlobalAveragePooling1D(),
        ])

    def call(self, inputs):
        
        return tf.concat([
            self.movie_embeddings(inputs['originalTitle']),
            self.genre_embeddings(inputs['genres'])
        ], axis=1)


In [26]:
# Full candidate deep model

class CandidateModel(tf.keras.Model):

    """ model for encoding candidate features """

    def __init__(self, layer_sizes):

        """ initialize the movie model embedding layer and the dense layer"""

        super().__init__()

        # pass the movie model
        self.candidate_embedding_model = MovieModel()

        # add the dense layers
        self.dense_layers =  tf.keras.Sequential()

        for layer in layer_sizes[:-1]:
            self.dense_layers.add(tf.keras.layers.Dense(layer, activation='relu'))

        # capturing the last dense layer
        for layer in layer_sizes[-1:]:
            self.dense_layers.add(tf.keras.layers.Dense(layer))
                                  
    def call(self, inputs):

        feature_embeddings = self.candidate_embedding_model(inputs)
        return self.dense_layers(feature_embeddings)

        

In [27]:
list(genres)[:10] # just inspecting the genres tensor dataset

[<tf.Tensor: shape=(), dtype=string, numpy=b'Horror,Mystery,Sci-Fi'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'Comedy,Drama,Romance'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'Action,Adventure,Comedy'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'Crime,Drama,Mystery'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'Horror,Mystery,Sci-Fi'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'Action,Adventure,Sci-Fi'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'Action,Sci-Fi,Thriller'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'Action,Adventure,Sci-Fi'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'Action,Comedy,Crime'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'Crime,Drama,Mystery'>]

## Full Model Architecture

In [28]:
# using the tfrs.Model class to wrap our two-tower model architecture and define metrics and loss functions

class FinalModel(tfrs.models.Model):
    def __init__(self, layer_sizes, use_timestamp):
        super().__init__()
    
        self.query_model: tf.keras.Model = QueryModel(layer_sizes, use_timestamp)
        self.candidate_model: tf.keras.Model = CandidateModel(layer_sizes)
        self.tasks =  tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates= movietitle_genres.batch(128).map(self.candidate_model)
            )
        )

    
    def compute_loss(self, features, training = False) -> tf.Tensor:

        # pass the user id feature
        user_embeddings = self.query_model(features)

        # pass the movie title feature
        positive_movie_embeddings = self.candidate_model(features)

        metrics_and_loss = self.tasks(user_embeddings, positive_movie_embeddings)

        return metrics_and_loss


In [29]:
# batching the datasets for better performance, speed and memory efficiency
tf.random.set_seed(42)
cached_train_df  = train_df.batch(1000).cache()
cached_test_df  = test_df.batch(1000).cache()
cached_val_df  = val_df.batch(1000).cache()

## Compiling and Fitting Model

In [30]:

print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))


Num GPUs Available:  1


In [33]:
# using tensorboard for observability
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# Defining callback objects
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='loss', patience=7),
    tf.keras.callbacks.ModelCheckpoint(filepath='logs/models', save_weights_only=True, save_best_only=True, save_freq="epoch",),
    tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1),
]

model = FinalModel([32],use_timestamp=True)
optimize = tf.keras.optimizers.legacy.Adam(learning_rate=0.01)
model.compile(optimizer=optimize)

with tf.device('/GPU:0'): # setting tensorflow to run the fit operation on GPU
    model.fit(cached_train_df, epochs= 10, validation_data=cached_val_df, callbacks= callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### NB: From the above result we can see that the model doesn't generalize well on our validation set. factors such as the depth and width of the model, activation function, learning rate, and optimizer can radically change the performance of the model.

## Predicting or recommending for new user

In [35]:
# user BruteForce from factorized top-k

# creating the model that takes in raw query features
index = tfrs.layers.factorized_top_k.BruteForce(model.query_model)

# recommends movie out of the entire dataset
index.index_from_dataset(
    tf.data.Dataset.zip(
        (movie_titles.batch(100), movietitle_genres.batch(100).map(model.candidate_model))
)
)

# getting recommendations
# Get recommendations.
_, titles = index({'userID' : tf.constant(['ur0090329']), 'review date in unix': tf.constant([902185200.0]) })
print(f"Recommendations for user ur0090329: {titles[0, :5]}")

Recommendations for user ur0090329: [b"Buffalo '66" b"Buffalo '66" b"Buffalo '66" b"Buffalo '66"
 b"Buffalo '66"]


# Since BruteForce is slow for serving a model with many possible candidates we explore approximate retrieval index

In [37]:
# Using FAISS for approximate retrieval operation
embedding_dimension = 32

# creating a distance based indices
faiss_index = faiss.IndexFlatL2(embedding_dimension)


# list to store movied ids. This will be used to retrieve the movie names later
movie_ids_list = []

# a function to index movie embeddings in FAISS
def index_movie_in_faiss(movies, movie_ids):
    # retrieve movie embeddings
    movie_embeddings = model.candidate_model(movies)

    # Check the shape of the embeddings
    print(f'shape of the movie_embeddings: {movie_embeddings.shape}')

    # convert to a numpy array
    movie_embeddings_np = movie_embeddings.numpy()

    # add the movie embeddings to the faiss index
    faiss_index.add(movie_embeddings_np)

    # Update the movie ID list (ensure the order is consistent)
    movie_ids_list.extend(movie_ids)  # Add the movie IDs of the current batch

    print(f'length of the variable - movie_ids_list: {len(movie_ids_list)}') # for observability

In [38]:
# function to perform similarity search
def search_top_k(user_id, k):
    # get the user embeddings and convert it to numpy array
    user_embeddings = model.query_model(user_id)
    user_embeddings_np = user_embeddings.numpy()

    # performing search in faiss index
    distances, indices = faiss_index.search(user_embeddings_np, k)

    # Print indices for debugging
    print(f"indices: {indices}")

    # Convert indices to movie IDs using the mapping
    recommended_movie_ids = []
    for index in indices:
        # print(index)
        movie_id_for_each_index = [movie_ids_list[i] for i in index] # Retrieve movie IDs for each index
        recommended_movie_ids.append(movie_id_for_each_index)

    return distances, recommended_movie_ids
    

In [41]:
movie_titles_with_ids =  df_tensor.map(lambda x: 
    {
    'movieID': x['movieID'],
    'originalTitle': x['originalTitle'],
    'genres': x['genres']
    
    }
)


for movie in movie_titles_with_ids.batch(100).as_numpy_iterator():
    # Assume movie_batch contains a list of movie features and movie_ids_batch is a list of IDs
    movie_ids_batch = [mov for mov in movie['movieID']]  # return the movie ids for the batch
    # print(f'length of the movie_ids for a 100 batch: {len(movie_ids_batch)}')
    index_movie_in_faiss(movie, movie_ids_batch)

# searching for the top k most similar movies for a user
user = {'userID' : tf.constant(['ur0090329']), 'review date in unix': tf.constant([902185200.0])}
distances, recommended_movie_ids = search_top_k(user, k=10)

# Print the recommended movie IDs for the user
print(f"Recommended Movie IDs: {recommended_movie_ids}")

shape of the movie_embeddings: (100, 32)
length of the variable - movie_ids_list: 100100
shape of the movie_embeddings: (100, 32)
length of the variable - movie_ids_list: 100200
shape of the movie_embeddings: (100, 32)
length of the variable - movie_ids_list: 100300
shape of the movie_embeddings: (100, 32)
length of the variable - movie_ids_list: 100400
shape of the movie_embeddings: (100, 32)
length of the variable - movie_ids_list: 100500
shape of the movie_embeddings: (100, 32)
length of the variable - movie_ids_list: 100600
shape of the movie_embeddings: (100, 32)
length of the variable - movie_ids_list: 100700
shape of the movie_embeddings: (100, 32)
length of the variable - movie_ids_list: 100800
shape of the movie_embeddings: (100, 32)
length of the variable - movie_ids_list: 100900
shape of the movie_embeddings: (100, 32)
length of the variable - movie_ids_list: 101000
shape of the movie_embeddings: (100, 32)
length of the variable - movie_ids_list: 101100
shape of the movie_em

In [42]:
for movie in movie_titles_with_ids.take(20).batch(10).as_numpy_iterator():
    print(movie['movieID'])

[b'tt0134619' b'tt0119822' b'tt0120746' b'tt0119668' b'tt0134619'
 b'tt0120591' b'tt0120685' b'tt0120591' b'tt0120609' b'tt0114369']
[b'tt0117887' b'tt0111309' b'tt0118789' b'tt0088040' b'tt0114694'
 b'tt0119314' b'tt0107843' b'tt0116209' b'tt0017739' b'tt0120184']


In [43]:
# removing duplicates from result
final_movie_recommended_ids = list(dict.fromkeys(recommended_movie_ids[0]))
final_movie_recommended_ids

[b'tt0594078', b'tt0127045', b'tt0368185', b'tt0086650']

In [44]:
# converting byte strings to string
recommended_movie_str =[val.decode(encoding='utf-8') for val in final_movie_recommended_ids]
recommended_movie_str

['tt0594078', 'tt0127045', 'tt0368185', 'tt0086650']

In [77]:
# FAISS Retrival class
class Faiss_retrieval_index():
    """ Using FAISS for approximate retrieval operation 
    
    * args:
    embedding_dimension: The dimension size of the vectors

    * model: This is the already compiled and trained_model
    """

    # list to store movie ids. This will be used to retrieve the movie names later
    movie_ids_list = []

    def __init__(self, embedding_dimension, model):
        self.embedding_dimension = embedding_dimension
        self.movie_model = model.candidate_model
        self.query_model = model.query_model
        # creating a distance based indices
        self.faiss_index = faiss.IndexFlatL2(self.embedding_dimension)


    # a function to index movie embeddings in FAISS
    def index_movie_in_faiss(self, movies, movie_ids):
        """ function that index movie embeddings

        args:
        * movies -  a batch of movie title list derived from a tensor dataset

        * movie_ids - the ids of the batch of movie title list
        
        """
        # retrieve movie embeddings
        movie_embeddings = self.movie_model(movies)

        # convert to a numpy array
        movie_embeddings_np = movie_embeddings.numpy()

        # add the movie embeddings to the faiss index
        self.faiss_index.add(movie_embeddings_np)

        # Update the movie ID list (ensure the order is consistent)
        movie_ids_list.extend(movie_ids)  # Add the movie IDs of the current batch

    
    # function to perform similarity search
    def search_top_k(self, user_id, k):
        """ perform similarity search of the query with the existing embeddings
        
        args:
        * user_id: the id which represents the query we hope to find a result for

        * k: the number of possible result to be returned
        """

        # get the user embeddings and convert it to numpy array
        user_embeddings = self.query_model(user_id)
        user_embeddings_np = user_embeddings.numpy()

        # performing search in faiss index
        distances, indices = self.faiss_index.search(user_embeddings_np, k)

        # Convert indices to movie IDs using the mapping
        recommended_movie_ids = []
        for index in indices:
            # print(index)
            movie_id_for_each_index = [movie_ids_list[i] for i in index] # Retrieve movie IDs for each index
            recommended_movie_ids.append(movie_id_for_each_index)

        return distances, recommended_movie_ids
    

In [83]:

faiss_retrieval =  Faiss_retrieval_index(embedding_dimension=32, model=model)
movie_titles_with_ids =  df_tensor.map(lambda x: 
    {
    'movieID': x['movieID'],
    'originalTitle': x['originalTitle'],
    'genres': x['genres']
    }
)


for movie in movie_titles_with_ids.batch(100).as_numpy_iterator():
    movie_ids_batch = [mov for mov in movie['movieID']]  # return the movie ids for the batch
    faiss_retrieval.index_movie_in_faiss(movie, movie_ids_batch)

# searching for the top k most similar movies for a user
user = {'userID' : tf.constant(['ur0090329']), 'review date in unix': tf.constant([902185200.0])}
distances, recommended_movie_ids = faiss_retrieval.search_top_k(user, k=50)

# removing duplicates from result
final_movie_recommended_ids = list(dict.fromkeys(recommended_movie_ids[0]))

# converting byte strings to string
recommended_movie_str =[val.decode(encoding='utf-8') for val in final_movie_recommended_ids]

recommended_movies = [df[df['movieID']== id]['originalTitle'].values[0] for id in recommended_movie_str]

# Print the recommended movies for the user
print(f"Recommended Movies: {recommended_movies}")

Recommended Movies: ['Dirt', 'Neon Maniacs', 'Puni Puni Poemi', '√Ä nos amours', 'Meet the Fockers', 'All Through the Night', 'Buddy', 'Buddy', 'Night of the Wolf', 'Broadway Melody of 1936', 'Scaredy Cat', 'Mixed Nuts', 'Mixed Nuts', "Von Ryan's Express", 'The River', 'The River', 'The River', 'How to Make a Monster', 'How to Make a Monster']
