# Vector Databases in Movie Recommenders

This notebook offers a hands-on exploration of collaborative filtering techniques and the use of vector databases in movie recommenders. It guides you through steps illustrating the concepts discussed in our blog, 'Vector Databases in Movie Recommenders.' For complete background information, refer to our post here.

# Importing Libraries
Let's begin by installing and importing all the necessary libraries. I have consolidated them in one place for clarity.



In [None]:
!pip install transformers
!pip install requests
!pip install pandas

# Mount Google Drive at /content/drive; the dataset paths used later in this
# notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

import ast
import json
import os
import re

import numpy as np
import pandas as pd
import requests
from tqdm import tqdm

# Loading the model and dataset

Next, let's load our model and dataset into our environment.

The dataset contains the following files:

`movie_embeddings.csv`: Contains the movies along with their embeddings and metadata.

`movie2movie_encoded.json`: Contains the mappings of the original movie IDs to sequential IDs that were used for the model training.

`user2user_encoded.json`: Contains the mappings of the original user IDs to sequential IDs that were used for the model training.

`ratings.csv`, `movies.csv` and `links.csv`: These are the files from the original Movielens dataset.

In [None]:
# Fetch the pretrained collaborative-filtering Keras model from Hugging Face
from huggingface_hub import HfFolder, from_pretrained_keras

model = from_pretrained_keras("emno/movie-recommender-collaborative-filtering")

# Show the model's layer overview
model.summary()

In [7]:
# Load the dataset from the git repository and save it in Google Drive so we can load the files here.
# Every path below is derived from this one folder, so change it here if you saved the dataset elsewhere.
dataset_dir = '/content/drive/My Drive/emno-movie-recommender-cf-dataset'

# Files prepared for this tutorial: movie embeddings plus the ID mappings
embeddings_file_path = f'{dataset_dir}/movie_embeddings.csv'
path_to_movie2movie_encoded = f'{dataset_dir}/movie2movie_encoded.json'
path_to_user2user_encoded = f'{dataset_dir}/user2user_encoded.json'

# Paths to the 'ratings.csv' and 'movies.csv' from the Movielens dataset files
ratings_file_path = f'{dataset_dir}/ml-latest-small/ratings.csv'
movies_file_path = f'{dataset_dir}/ml-latest-small/movies.csv'

# Load the mappings. The encoding is stated explicitly so reading the JSON does
# not depend on the platform's default text encoding.
with open(path_to_movie2movie_encoded, 'r', encoding='utf-8') as mapping_file:
    movie2movie_encoded = json.load(mapping_file)

with open(path_to_user2user_encoded, 'r', encoding='utf-8') as mapping_file:
    user2user_encoded = json.load(mapping_file)



# Preparing the vector database for recommendations

If you haven't already, sign up for a free [emno account](https://emno.io/).

Also, generate an API Key from the dashboard and copy it. We need it to work with the emno APIs. Replace the token in the scripts below with your API Key.

## Create a collection

In [None]:
# Base endpoint for all emno collection operations
base_url = "https://apis.emno.io/collections"

# Prefer the EMNO_API_KEY environment variable so the key does not have to be
# saved inside the notebook; otherwise fall back to the placeholder below.
token = os.environ.get("EMNO_API_KEY", "t_L***********") # Replace with your API Key

def check_or_create_collection(collection_name, token, dim, model, timeout=30):
    """Return the emno collection named ``collection_name``, creating it if the lookup fails.

    Args:
        collection_name: Name of the collection to look up or create.
        token: emno API key, sent in the ``Token`` request header.
        dim: Embedding dimension written into the new collection's config.
        model: Model identifier written into the new collection's config.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        The decoded JSON body of the lookup response (HTTP 200) or of the
        creation response (HTTP 201).

    Raises:
        RuntimeError: If the creation request does not return HTTP 201.
    """
    headers = {"Token": token}

    # Check for existing collection
    lookup_response = requests.get(
        f"{base_url}/{collection_name}", headers=headers, timeout=timeout
    )
    if lookup_response.status_code == 200:
        # Collection exists
        return lookup_response.json()

    # Any non-200 lookup falls through to creation
    create_payload = {
        "name": collection_name,
        "config": {"dim": dim, "model": model}
    }
    create_response = requests.post(
        base_url, json=create_payload, headers=headers, timeout=timeout
    )
    if create_response.status_code == 201:
        return create_response.json()

    # Report both status codes: a failed lookup (e.g. a rejected token) would
    # otherwise be hidden behind the creation error.
    raise RuntimeError(
        f"Error creating collection (lookup returned HTTP {lookup_response.status_code}, "
        f"create returned HTTP {create_response.status_code}): {create_response.content}"
    )


collection_name = "recommender-model"  # Replace with your collection name
dim = 32  # Dimension of the embeddings

# Deliberately NOT named `model`: that name holds the Keras model loaded from
# Hugging Face, and the embedding lookups later in the notebook call
# model.get_layer(...) on it. Rebinding it to a string would break them.
collection_model = "CUSTOM"

collection = check_or_create_collection(collection_name, token, dim, collection_model)
print(f"Collection ID: {collection['id']}")

## Insert embeddings

In [None]:
# Settings for uploading the embeddings
collection_id = collection['id']  # ID of the collection returned by check_or_create_collection
batch_size = 100  # Adjust as needed

# Request headers: JSON body, authenticated with the API key
headers = {
    "Content-Type": "application/json",
    "Token": token,
}

# Endpoint that accepts new vectors for this collection
upload_url = f"{base_url}/{collection_id}/vectors/create"

# Split a CSV file into consecutive row batches
def process_data(file_path, chunk_size):
    """Read the CSV at ``file_path`` and return its rows as a list of DataFrame slices.

    Each slice holds up to ``chunk_size`` consecutive rows; the last slice may
    be shorter.
    """
    frame = pd.read_csv(file_path)
    row_count = len(frame)

    batches = []
    for start in range(0, row_count, chunk_size):
        stop = start + chunk_size
        batches.append(frame[start:stop])
    return batches

# Process data into batches
data_batches = process_data(embeddings_file_path, batch_size)

# The endpoint is the same for every batch, so build it once outside the loop
upload_url = f"{base_url}/{collection_id}/vectors/create"

# Count failures so the final message reflects what actually happened
failed_batches = 0

# Upload data to the collection
for batch in tqdm(data_batches, desc="Uploading batches"):
    # Prepare payload for batch upload; the 'values' column holds each
    # embedding as a JSON-encoded string, so decode it before sending
    payload = [{
        "content": str(item['movieId']),
        "values": json.loads(item['values']),
        "metadata": item['metadata']
    } for item in batch.to_dict(orient='records')]

    # Upload batch; a network error on one batch should not abort the rest
    try:
        upload_response = requests.post(upload_url, json=payload, headers=headers, timeout=60)
    except requests.RequestException as exc:
        failed_batches += 1
        print(f"Error uploading batch: {exc}")
        continue

    if upload_response.status_code != 200:
        failed_batches += 1
        print(f"Error uploading batch: {upload_response.content}")

if failed_batches:
    print(f"\nUpload finished with errors: {failed_batches} of {len(data_batches)} batches failed.")
else:
    print("\nUpload complete.")

## Defining a method for semantic search

In [14]:
def query_emno(embedding, collection_id=None, limit=10, timeout=30):
    """Run a nearest-neighbour query against an emno collection.

    Args:
        embedding: Query vector; a NumPy array or any sequence of numbers.
        collection_id: ID of the collection to query. Defaults to the
            collection created earlier in this notebook (``collection['id']``).
        limit: Maximum number of matches to request.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON response, or ``None`` when the API answers with a
        non-200 status or the body is not valid JSON (details are printed).
    """
    if collection_id is None:
        # Query the collection this notebook created and filled, not a fixed ID
        collection_id = collection['id']
    url = f"{base_url}/{collection_id}/query"

    # Accept NumPy arrays as well as plain Python sequences
    vector = embedding.tolist() if hasattr(embedding, "tolist") else list(embedding)
    payload = {
        "vectors": [vector],
        "limit": limit,
    }
    headers = {
        "Content-Type": "application/json",
        "Token": token
    }

    response = requests.post(url, json=payload, headers=headers, timeout=timeout)

    # Check the status code of the response
    if response.status_code != 200:
        print("Error: Received status code", response.status_code)
        print("Response content:", response.content)
        return None

    try:
        return response.json()
    except ValueError as e:
        # ValueError is the common base of the JSON decode errors that
        # response.json() can raise, whichever JSON backend requests uses
        print("JSON decoding failed:", e)
        print("Response content:", response.content)
        return None

# Item-based Collaborative Filtering

## Finding results similar to a given movie

Define the utility methods and load the movies data.

In [38]:
# Helper that pulls the first capture group out of a string, tolerating a miss
def safe_extract(regex, string):
    """Return capture group 1 of the first ``regex`` match in ``string``, or None if there is no match."""
    found = re.search(regex, string)
    if not found:
        return None
    return found.group(1)

# Pattern for a single-quoted title value following a 'title': key.
# The capture group is non-greedy, so it stops at the first closing single quote.
title_regex = r"'title': '(.*?)'"  # Non-greedy match to get the title

def convert_to_json_string(metadata_str):
    """Convert a metadata string that uses Python-style quoting into a JSON string.

    The quote-replacement conversion is tried first and returned whenever it
    yields valid JSON. It breaks on values that contain an apostrophe (for
    example a title such as "Schindler's List"), so in that case the input is
    parsed as a Python literal and re-serialised as JSON instead. If neither
    approach works, the quote-replaced text is returned unchanged so that the
    caller's JSON error handling still applies.

    Args:
        metadata_str: Metadata text, e.g. ``"{'movie_id': '1', 'title': '...'}"``.

    Returns:
        A string intended to be passed to ``json.loads``.
    """
    replaced = metadata_str.replace("\'", "\"").replace('\"{', '{').replace('}\"', '}').replace('"{', '\'').replace('}"', '\'')

    # Keep the quote-replaced result whenever it already parses
    try:
        json.loads(replaced)
        return replaced
    except ValueError:
        pass

    # Fallback for inputs the blanket quote replacement corrupts
    try:
        return json.dumps(ast.literal_eval(metadata_str), ensure_ascii=False)
    except (ValueError, SyntaxError, TypeError):
        return replaced


# Load the movies data from the Movielens 'movies.csv'; later cells use it to
# look up details for a movie ID and to merge titles and genres onto ratings.
movies_df = pd.read_csv(movies_file_path)

In [None]:
# Choose the movie to search from and translate its ID to the sequential ID used in training
actual_movie_id = 112852 # Sample movie ID. You can experiment with other IDs.
movie_id_encoded = movie2movie_encoded.get(str(actual_movie_id))

# Pull this movie's row from the movies table
is_queried_movie = movies_df['movieId'] == actual_movie_id
queried_movie_details = movies_df[is_queried_movie]

# Show what we are searching for, or say so if the ID is unknown
if queried_movie_details.empty:
    print("Movie ID {} not found in the dataset.".format(actual_movie_id))
else:
    print("Finding movies similar to:")
    print(queried_movie_details.to_string(index=False))
    print("\n")

## Get Movie Embedding from the model

In [22]:
# Define a method to get embeddings for a specific movie from the model
def get_movie_embedding(model, sequential_id, layer_name='embedding_2'):
    """Return the embedding vector the model holds for one movie.

    Args:
        model: Model object exposing ``get_layer(name)``.
        sequential_id: Sequential (training) ID of the movie.
        layer_name: Name of the layer to read movie embeddings from.

    Returns:
        The first row of the layer's output for ``sequential_id``, as
        returned by ``.numpy()``.

    Raises:
        ValueError: If ``sequential_id`` is None, which is what a failed
            lookup in the ID mapping produces.
    """
    if sequential_id is None:
        # Fail early with a clear message instead of an obscure error inside the layer
        raise ValueError("sequential_id is None: the movie ID was not found in the ID mapping.")

    # The layer expects a batch, so wrap the single ID in a one-element array
    sequential_id_array = np.array([sequential_id])
    movie_embedding_layer = model.get_layer(layer_name)
    movie_embedding = movie_embedding_layer(sequential_id_array)
    return movie_embedding.numpy()[0]

# Get the movie embedding for the sequential ID looked up above; this vector
# is what gets sent to emno as the query.
movie_embedding = get_movie_embedding(model, movie_id_encoded)

## Query emno to get similar movies

In [None]:
# Query emno
movie_matching_embedding_results = query_emno(movie_embedding)


# Parse the results
parsed_movie_results = []
unparsed_movie_results = 0  # results whose metadata could not be decoded

# query_emno returns None when the request fails; treat that as "no results"
# rather than crashing on iteration.
for sublist in movie_matching_embedding_results or []:
    for result in sublist:
        metadata_str_json = convert_to_json_string(result['metadata'])
        try:
            metadata = json.loads(metadata_str_json)
        except json.JSONDecodeError:
            # Count the failure so it can be reported once after the loop
            unparsed_movie_results += 1
            continue

        movie_id = metadata.get('movie_id')
        title = metadata.get('title')
        genres = metadata.get('genres')
        score = result['score']

        # Keep only results that carry all three metadata fields
        if movie_id and title and genres:
            parsed_movie_results.append({
                'movie_id': movie_id,
                'movies': title,
                'genres': genres,
                'scores': score
            })

if unparsed_movie_results:
    print(f"Skipped {unparsed_movie_results} result(s) whose metadata could not be parsed.")

recommendations_for_movie_df = pd.DataFrame(parsed_movie_results)


# Display the top recommendations
print("Top Recommendations for similar movies:")
print(recommendations_for_movie_df.head(10).to_string(index=False))

# Embedding-Based Personalized Movie Recommendations

## Get User Embedding from the Model

In [26]:
def get_user_embedding(model, user_id_encoded, layer_name='embedding'):
    """Return the embedding vector the model holds for one user.

    Args:
        model: Model object exposing ``get_layer(name)``.
        user_id_encoded: Sequential (training) ID of the user.
        layer_name: Name of the layer to read user embeddings from.

    Returns:
        The first row of the layer's output for ``user_id_encoded``, as
        returned by ``.numpy()``.

    Raises:
        ValueError: If ``user_id_encoded`` is None, which is what a failed
            lookup in the ID mapping produces.
    """
    if user_id_encoded is None:
        # Fail early with a clear message instead of an obscure error inside the layer
        raise ValueError("user_id_encoded is None: the user ID was not found in the ID mapping.")

    # The layer expects a batch, so wrap the single ID in a one-element array
    user_id_array = np.array([user_id_encoded])
    user_embedding_layer = model.get_layer(layer_name)
    user_embedding = user_embedding_layer(user_id_array)
    return user_embedding.numpy()[0]

# Choose the user to recommend for. The mapping is looked up with the ID as a
# string; .get() yields None when the ID is absent.
actual_user_id = 537 # Sample user ID. You can experiment with other IDs.
user_id_encoded = user2user_encoded.get(str(actual_user_id))

# Get the user embedding for the sequential ID looked up above
user_embedding = get_user_embedding(model, user_id_encoded)

## Here are the top 10 movies based on the interaction history

In [None]:
# Read every rating from the Movielens ratings file
ratings_df = pd.read_csv(ratings_file_path)

# Keep only the chosen user's ratings, highest rating first
user_ratings_df = (
    ratings_df[ratings_df['userId'] == actual_user_id]
    .sort_values(by='rating', ascending=False)
)

# Attach title and genres from the movies table, then keep the first ten rows
user_ratings_with_details_df = user_ratings_df.merge(movies_df, on='movieId', how='left')
top_10_columns = ['movieId', 'title', 'genres', 'rating']
top_10_movies = user_ratings_with_details_df.head(10)[top_10_columns]

# Show the user's ten highest-rated movies
print("Top 10 movies rated by the user:")
print(top_10_movies.to_string(index=False))

## Query emno to get similar movies

In [None]:
# Query emno
user_matching_embedding_results = query_emno(user_embedding)
movies_watched_by_user = ratings_df[ratings_df['userId'] == actual_user_id]['movieId'].unique().tolist()
# Set copy of the watched IDs for constant-time membership checks in the loop
watched_movie_ids = set(movies_watched_by_user)


# Parse the results and track filtered movies
parsed_user_results = []
filtered_movies = []  # To keep track of filtered movies
unparsed_user_results = 0  # results whose metadata could not be decoded

# query_emno returns None when the request fails; treat that as "no results"
# rather than crashing on iteration.
for sublist in user_matching_embedding_results or []:
    for result in sublist:
        metadata_str_json = convert_to_json_string(result['metadata'])
        try:
            metadata = json.loads(metadata_str_json)
        except json.JSONDecodeError:
            # Count the failure so it can be reported once after the loop
            unparsed_user_results += 1
            continue

        movie_id = metadata.get('movie_id')
        title = metadata.get('title')
        genres = metadata.get('genres')
        score = result['score']

        # Skip results that lack any of the three metadata fields
        if not (movie_id and title and genres):
            continue

        entry = {
            'movie_id': movie_id,
            'movies': title,
            'genres': genres,
            'scores': score
        }
        # Movies the user has already rated go to the filtered list
        if int(movie_id) in watched_movie_ids:
            filtered_movies.append(entry)
        else:
            parsed_user_results.append(entry)

if unparsed_user_results:
    print(f"Skipped {unparsed_user_results} result(s) whose metadata could not be parsed.")

recommendations_for_user_df = pd.DataFrame(parsed_user_results)
filtered_movies_df = pd.DataFrame(filtered_movies)

# Display the top 10 recommendations, excluding the movies that the user has already watched
print("Top Recommendations for the user:")
print(recommendations_for_user_df.head(10).to_string(index=False))

# Display the filtered movies
print("\nMovies filtered out (already watched by the user):")
print(filtered_movies_df.to_string(index=False))
