In [1]:
# load some libraries
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
import tensorflow_recommenders as tfrs
import datetime
import faiss
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

# Clear any logs from previous runs
! rm -rf ./logs/

In [3]:
from typing import Dict, Text

In [4]:
# load the data
data = pd.read_csv('sliced_user_id_and_movie_title_data.csv')
data.head()

Unnamed: 0,userID,movieID,originalTitle,rating,review date
0,ur4592644,tt0120884,When the Light Comes,10,16 January 2005
1,ur3174947,tt0118688,Batman & Robin,3,16 January 2005
2,ur3780035,tt0387887,Bottom Live 2003: Weapons Grade Y-Fronts Tour,8,16 January 2005
3,ur4592628,tt0346491,Alexander,1,16 January 2005
4,ur3174947,tt0094721,Beetlejuice,8,16 January 2005


## Step 1: Basic data preprocessing

In [5]:
# create a new movie title list that enforces type uniformity in its values
# might come in handy when creaying the movie title vocalbulary
updated_movie_titles = []
for i in data['originalTitle']:
    if type(i) != 'str':
        a = str(i)
        updated_movie_titles.append(a)
    else:
        updated_movie_titles.append(i)

In [6]:
data['originalTitle'] = updated_movie_titles
data['originalTitle']

0                                 When the Light Comes
1                                       Batman & Robin
2        Bottom Live 2003: Weapons Grade Y-Fronts Tour
3                                            Alexander
4                                          Beetlejuice
                             ...                      
49995               The Five People You Meet in Heaven
49996                                         Sin City
49997                                         Earthsea
49998                                           L√©olo
49999                                           Sahara
Name: originalTitle, Length: 50000, dtype: object

In [7]:
data.to_dict('list')

{'userID': ['ur4592644',
  'ur3174947',
  'ur3780035',
  'ur4592628',
  'ur3174947',
  'ur1162550',
  'ur4371033',
  'ur4584306',
  'ur3174947',
  'ur2694867',
  'ur2035667',
  'ur4513109',
  'ur4318504',
  'ur1355507',
  'ur4545306',
  'ur3168872',
  'ur1000879',
  'ur2183556',
  'ur3174947',
  'ur4593110',
  'ur4593029',
  'ur4593101',
  'ur1935667',
  'ur4593048',
  'ur4452141',
  'ur2983202',
  'ur3174947',
  'ur3143319',
  'ur4593124',
  'ur2694867',
  'ur2488512',
  'ur0470968',
  'ur4593070',
  'ur4111911',
  'ur3780035',
  'ur0816433',
  'ur2904081',
  'ur3174947',
  'ur4593205',
  'ur2882544',
  'ur0470968',
  'ur1518804',
  'ur4584029',
  'ur3174947',
  'ur3446219',
  'ur2074560',
  'ur4587088',
  'ur3174947',
  'ur1529007',
  'ur1609501',
  'ur4593421',
  'ur3446219',
  'ur3446219',
  'ur4531500',
  'ur4013711',
  'ur0521881',
  'ur3174947',
  'ur2074560',
  'ur0157498',
  'ur3066329',
  'ur3446219',
  'ur2567136',
  'ur0157498',
  'ur2074560',
  'ur3976881',
  'ur3066329',


In [8]:
# create a copy of the data
df = data.copy()

### Step1.1: convert the string typed review date to datetime format

In [9]:
# convert the review date to datetime
df['review date'] = pd.to_datetime(df['review date'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   userID         50000 non-null  object        
 1   movieID        50000 non-null  object        
 2   originalTitle  50000 non-null  object        
 3   rating         50000 non-null  int64         
 4   review date    50000 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 1.9+ MB


In [10]:
# sort the data by review date so that we can split the data into training, testing and validation sets
df.sort_values(by='review date', inplace=True)
df.reset_index(drop=True, inplace=True)
df.head(20)

Unnamed: 0,userID,movieID,originalTitle,rating,review date
0,ur4515719,tt0286112,Siu Lam juk kau,10,2005-01-13
1,ur1418745,tt0410006,Gefangen,1,2005-01-13
2,ur1495662,tt0036384,Stage Door Canteen,9,2005-01-13
3,ur4266247,tt0027661,Gay Love,10,2005-01-13
4,ur3757197,tt0056172,Lawrence of Arabia,10,2005-01-13
5,ur2776875,tt0097259,Easy Kill,1,2005-01-13
6,ur0562732,tt0080487,Caddyshack,5,2005-01-13
7,ur4519345,tt0156342,Barjatri,10,2005-01-13
8,ur2766295,tt0438140,L.A. Dream,10,2005-01-13
9,ur4131581,tt0067089,A fekete v√°ros,10,2005-01-13


### Step1.2: split the data(into train, test and val) based on date so as you use past records for training 

In [11]:
# split the data into training, testing and validation sets 
train = df[:int(0.7*len(df))]
test = df[int(0.7*len(df)):int(0.85*len(df))]
val = df[int(0.85*len(df)):]
print('train:', train.shape)
print('test:', test.shape)
print('val:', val.shape)    

train: (35000, 5)
test: (7500, 5)
val: (7500, 5)


In [12]:
# drop the review date column
train = train.drop('review date', axis=1)
test = test.drop('review date', axis=1)
val = val.drop('review date', axis=1)

### Step1.3: Convert the pandas dataset to tensor dataset

In [13]:
# convert the train, test and val into a tensorflow dataset
train_df = tf.data.Dataset.from_tensor_slices(train.to_dict('list'))
test_df = tf.data.Dataset.from_tensor_slices(test.to_dict('list'))
val_df = tf.data.Dataset.from_tensor_slices(val.to_dict('list'))


### Step 1.4: Extract the needed features and create a json object for use in our training

In [14]:
# selecting just the user id and movie title
train_df = train_df.map(lambda x : {
    'userID': x['userID'],
    'originalTitle': x['originalTitle']
})
test_df = test_df.map(lambda x : {
    'userID': x['userID'],
    'originalTitle': x['originalTitle']
})
val_df = val_df.map(lambda x : {
    'userID': x['userID'],
    'originalTitle': x['originalTitle']
})

### Step 1.5: Obtain a movie title and user id tensor items

In [15]:
# obtaining a movie title list

# first convert df to a tensforflow dataset
df = df.drop('review date', axis= 1)

df_tensor = tf.data.Dataset.from_tensor_slices(df.to_dict('list'))

# select just the movie titles
movie_titles =  df_tensor.map(lambda x: x['originalTitle'])


# selecting just the user id
user_ids = df_tensor.map(lambda x: x['userID'])

# view the movie titles
movie_titles

<_MapDataset element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [16]:
for val in movie_titles.take(5).as_numpy_iterator():
    print(val)

b'Siu Lam juk kau'
b'Gefangen'
b'Stage Door Canteen'
b'Gay Love'
b'Lawrence of Arabia'


In [17]:
type(user_ids)

tensorflow.python.data.ops.map_op._MapDataset

### Step 1.6: Batch and cache the dataset and create a movie and user vocabulary

In [18]:
# batching the datasets for better performance, speed and memory efficiency
movie_titles_batched = movie_titles.batch(1000)
user_ids_batched = user_ids.batch(1000)


# obtain the unique movie titles and user ids
unique_movie_titles = np.unique(np.concatenate(list(movie_titles_batched)))
unique_user_ids = np.unique(np.concatenate(list(user_ids_batched)))

In [19]:
unique_movie_titles.shape

(16870,)

## Step 2: Model development - 2 tower retrieval model

### Query tower

In [20]:
# define the embedding size or dimension
embedding_dimension = 32

# converting user ids to integers and then to embeddings using keras prprocessing layers
query_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup( # convert the string user ids to integer indices
        vocabulary = unique_user_ids, mask_token=None
    ),
    tf.keras.layers.Embedding( # convert the indices to vector embeddings
        len(unique_user_ids) + 1, embedding_dimension
    )
])


## Candidate tower

In [21]:
movie_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(
        vocabulary = unique_movie_titles, mask_token =None
    ),
    tf.keras.layers.Embedding(
        len(unique_movie_titles) + 1, embedding_dimension
    )
])


## Define Metrics to be observed

In [22]:
metrics = tfrs.metrics.FactorizedTopK(
    candidates= movie_titles.batch(128).map(movie_model)
)

## Define Loss 

In [23]:
tasks  =  tfrs.tasks.Retrieval(
    metrics =  metrics
)

## Full Model Architecture

In [24]:
# using the tfrs.Model class

class Movie_model(tfrs.Model):
    def __init__(self, movie_model, user_model):
        super().__init__()
        self.movie_model: tf.keras.Model = movie_model
        self.user_model: tf.keras.Model = user_model
        self.tasks: tf.keras.layers.Layer =  tasks

    
    def compute_loss(self, features, training = False) -> tf.Tensor:

        # pass the user id feature
        user_embeddings = self.user_model(features['userID'])

        # pass the movie title feature
        positive_movie_embeddings = self.movie_model(features['originalTitle'])

        metrics_and_loss = self.tasks(user_embeddings, positive_movie_embeddings)

        return metrics_and_loss


In [25]:
tf.random.set_seed(42)
cached_train_df  = train_df.batch(1000).cache()
cached_test_df  = test_df.batch(1000).cache()
cached_val_df  = val_df.batch(1000).cache()

## Compiling and Fitting Model

In [26]:
model = Movie_model(movie_model, query_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

# using tensorboard for observability
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

model.fit(cached_train_df, epochs= 10, validation_data=cached_val_df, callbacks=[tensorboard_callback])



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x16b993b80>

## Predicting or recommending for new user

In [27]:
# user BruteForce from factorized top-k

# creating the model that takes in raw query features
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)

# recommends movie out of the entire dataset
index.index_from_dataset(
    tf.data.Dataset.zip(
        (movie_titles.batch(100), movie_titles.batch(100).map(model.movie_model))
)
)

# getting recommendations
# Get recommendations.
_, titles = index(tf.constant(['ur3174947']))
print(f"Recommendations for user ur3174947: {titles[0, :3]}")

Recommendations for user ur3174947: [b'Old School' b'Old School' b'Old School']


# Since BruteForce is slow for serving a model with many possible candidates we explore approximate retrieval index

In [28]:
# Using FAISS for approximate retrieval operation

# creating a distance based indices
faiss_index = faiss.IndexFlatL2(embedding_dimension)


# list to store movied ids. This will be used to retrieve the movie names later
movie_ids_list = []

# a function to index movie embeddings in FAISS
def index_movie_in_faiss(movies, movie_ids):
    # retrieve movie embeddings
    movie_embeddings = model.movie_model(movies)

    # Check the shape of the embeddings
    print(f'shape of the movie_embeddings: {movie_embeddings.shape}')

    # convert to a numpy array
    movie_embeddings_np = movie_embeddings.numpy()

    # add the movie embeddings to the faiss index
    faiss_index.add(movie_embeddings_np)

    # Update the movie ID list (ensure the order is consistent)
    movie_ids_list.extend(movie_ids)  # Add the movie IDs of the current batch

    print(f'length of the variable - movie_ids_list: {len(movie_ids_list)}') # for observability

In [29]:
# function to perform similarity search
def search_top_k(user_id, k):
    # get the user embeddings and convert it to numpy array
    user_embeddings = model.user_model(user_id)
    user_embeddings_np = user_embeddings.numpy()

    # performing search in faiss index
    distances, indices = faiss_index.search(user_embeddings_np, k)

    # Print indices for debugging
    print(f"indices: {indices}")

    # Convert indices to movie IDs using the mapping
    recommended_movie_ids = []
    for index in indices:
        # print(index)
        movie_id_for_each_index = [movie_ids_list[i] for i in index] # Retrieve movie IDs for each index
        recommended_movie_ids.append(movie_id_for_each_index)

    return distances, recommended_movie_ids
    

In [None]:
movie_titles_with_ids =  df_tensor.map(lambda x: 
    {
    'movieID': x['movieID'],
    'originalTitle': x['originalTitle']
    }
)


for movie in movie_titles_with_ids.batch(100).as_numpy_iterator():
    # Assume movie_batch contains a list of movie features and movie_ids_batch is a list of IDs
    movie_ids_batch = [mov for mov in movie['movieID']]  # return the movie ids for the batch
    # print(f'length of the movie_ids for a 100 batch: {len(movie_ids_batch)}')
    index_movie_in_faiss(movie['originalTitle'], movie_ids_batch)

# searching for the top k most similar movies for a user
user = tf.constant(['ur3174947'])
distances, recommended_movie_ids = search_top_k(user, k=10)

# Print the recommended movie IDs for the user
print(f"Recommended Movie IDs: {recommended_movie_ids}")

shape of the movie_embeddings: (100, 32)
length of the variable - movie_ids_list: 100
shape of the movie_embeddings: (100, 32)
length of the variable - movie_ids_list: 200
shape of the movie_embeddings: (100, 32)
length of the variable - movie_ids_list: 300
shape of the movie_embeddings: (100, 32)
length of the variable - movie_ids_list: 400
shape of the movie_embeddings: (100, 32)
length of the variable - movie_ids_list: 500
shape of the movie_embeddings: (100, 32)
length of the variable - movie_ids_list: 600
shape of the movie_embeddings: (100, 32)
length of the variable - movie_ids_list: 700
shape of the movie_embeddings: (100, 32)
length of the variable - movie_ids_list: 800
shape of the movie_embeddings: (100, 32)
length of the variable - movie_ids_list: 900
shape of the movie_embeddings: (100, 32)
length of the variable - movie_ids_list: 1000
shape of the movie_embeddings: (100, 32)
length of the variable - movie_ids_list: 1100
shape of the movie_embeddings: (100, 32)
length of t