# Importing the dependencies, Loading the Data and API keys

In [6]:
import ast
import os
import sqlite3

import chromadb
import pandas as pd
from chromadb.config import Settings
from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings

In [7]:
# Populate the process environment from a local .env file, then read the
# OpenAI key from it so the secret never has to be written into the notebook.
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [8]:
# Path to the SQLite file that holds the pre-computed movie table.
database = "../SQL_Database/Movies.db"

# Open the connection explicitly and close it in `finally`: a connection
# created inline inside read_sql_query is never closed, and sqlite3's own
# `with` block only commits/rolls back, it does not close the handle.
connection = sqlite3.connect(database)
try:
    database_key_based = pd.read_sql_query(
        "SELECT m.* FROM Movies_Key_Based AS m", connection
    )
finally:
    connection.close()

# Preview the first rows (id, title, keywords, review_summary, tags, embeddings).
database_key_based.head()

Unnamed: 0,id,title,keywords,review_summary,tags,embeddings
0,283995,Guardians of the Galaxy Vol. 2,"['demi god', 'alien creature', 'sarcasm', 'cra...",Guardians of the Galaxy Vol. 2 elicited a gene...,adventure action sci-fi chris pratt zoe saldañ...,"[-0.010018928121777062, -0.042597577593544884,..."
1,480530,Creed II,"['baby', 'training montage', 'sequel', 'boxing...",Creed II elicits a mixed response from audienc...,drama michael b. jordan sylvester stallone tes...,"[-0.012480250747134571, -0.02905849380429892, ..."
2,299536,Avengers: Infinity War,"['superhero', 'ensemble cast', 'marvel cinemat...",Avengers: Infinity War elicits a generally pos...,adventure action sci-fi robert downey jr. chri...,"[-0.01140899767743463, -0.027857139652446076, ..."
3,299534,Avengers: Endgame,"['time travel', 'superhero', 'super villain', ...",Avengers: Endgame elicited a largely positive ...,adventure sci-fi action robert downey jr. chri...,"[-0.00412223552630982, -0.031019326010432745, ..."
4,337167,Fifty Shades Freed,"['sex scene', 'wedding ceremony', 'bondage', '...",The audience reaction to Fifty Shades Freed is...,drama romance dakota johnson jamie dornan eric...,"[-0.0075680177194708525, -0.024964091224630152..."


# Creating Embeddings using OPENAI Embedding Model
We will be using the OpenAI Embedding Model to create embeddings for the text data. The OpenAI Embedding Model is a transformer-based model that is trained on a large corpus of text data. The model is capable of generating high-quality embeddings for text data, which can be used for various natural language processing tasks.

In [17]:
# Shared embeddings client; get_embeddings() below calls embed_query on it.
# NOTE(review): built with no arguments, so the API key is presumably picked up
# from the environment populated by load_dotenv() — confirm.
embedding = OpenAIEmbeddings()

In [23]:
def get_embeddings(text):
    """Embed ``text`` with the notebook-level ``embedding`` client.

    Args:
        text: The string to embed.

    Returns:
        Whatever ``embedding.embed_query`` returns for ``text``.
    """
    return embedding.embed_query(text)

# ChromaDB Client Setup and Storage

In [18]:
# Directory (relative to this notebook) handed to Chroma as its storage path.
storage_folder = "../Chroma_DB/"
# Client bound to that path; the collection cells below are created through it.
Client = chromadb.PersistentClient(path=storage_folder)

In [19]:
# Use get_or_create_collection rather than create_collection so this cell can
# be re-run: the client persists to disk, so after the first run the 'Movies'
# collection already exists and create_collection would raise.
movie_collection = Client.get_or_create_collection(name='Movies')

In [23]:
# Sanity check: the `embeddings` column stores each vector as a string, so
# confirm the first row parses back into a Python list.
# ast.literal_eval only accepts literals, unlike eval, which would execute
# whatever code happened to be stored in the database column.
for _, row in database_key_based.head(1).iterrows():
    print(ast.literal_eval(row["embeddings"]))

[-0.010018928121777062, -0.042597577593544884, -0.01710481780239764, -0.03653961845575618, 0.004430394683950002, 0.02689074763907861, -0.00781914975657099, -0.03377104972101667, -0.017666755500327685, 0.006979670476800209, 0.04443415318790003, 0.0179545770208961, 0.025629815085860295, 0.01362354886916307, 0.035799507570847544, -0.01182809116707346, 0.02080537967752151, -0.0079630605168552, 0.011040008786973306, -0.03083801314250536, 0.011177066875654122, 0.00940216811969728, -0.006575349813683747, -0.00447836478215764, 0.008045295370063689, -0.022943482135651862, 0.006780936481043675, -0.02275160174282131, 0.002542422888416166, 0.003032404740542813, 0.00029767249793277245, -0.004063765111636088, 0.011451182121693161, -0.02856285352701678, -0.016296176476164714, -0.001049349099713478, -0.006253264003775775, -0.016967760645039413, 0.030783189907033034, 0.004653113495979708, -0.004046632501305016, -0.0019993312015486892, -0.01537788961030973, -0.021106906541296714, -0.019037334524490652, 

In [15]:
# Load every movie into the Chroma collection, keyed by its (stringified) id.
# - ast.literal_eval parses the stored embedding string without executing it;
#   eval would run arbitrary code found in the database column.
# - upsert (instead of add) keeps the cell idempotent: re-running it overwrites
#   existing ids rather than attempting to insert duplicates.
for _, row in database_key_based.iterrows():
    movie_collection.upsert(
        ids=[str(row['id'])],
        embeddings=[ast.literal_eval(row["embeddings"])],
        metadatas=[{'title': row['title'], 'tags': row['tags']}],
    )

Let us now create the function that will query the Chroma database for the titles most similar to a given title.

In [50]:
# Find similar movies based on the movie title from the Chroma database
def find_similar_movies(movie_title, data, movie_collection, top_k=6):
    """Return the titles of the movies nearest to ``movie_title``.

    Args:
        movie_title: Exact title to look up in ``data['title']``.
        data: DataFrame with a ``title`` column and an ``embeddings`` column
            whose values are the string form of a list of floats.
        movie_collection: Object exposing
            ``query(query_embeddings=..., n_results=...)`` and returning a
            mapping whose ``'metadatas'[0]`` is a list of dicts with a
            ``'title'`` key (a Chroma collection in this notebook).
        top_k: Number of neighbours requested from the collection. The first
            hit is dropped, so at most ``top_k - 1`` titles are returned.

    Returns:
        list[str]: Neighbour titles with the first hit removed.
        str: A "not found" message when ``movie_title`` is absent from ``data``.
        An empty list if any error occurs while querying (the error is printed).
    """
    try:
        movie_row = data[data['title'] == movie_title]

        if movie_row.empty:
            return f"Movie title '{movie_title}' not found in the database."

        # Embeddings are stored as text; literal_eval parses the list without
        # executing arbitrary code the way eval would. If several rows share
        # the title, the first one is used.
        query_embedding = ast.literal_eval(movie_row.iloc[0]['embeddings'])

        results = movie_collection.query(
            query_embeddings=[query_embedding],
            n_results=top_k
        )

        # Must be a list: it was previously a dict, so the .append call raised
        # AttributeError, which the except below swallowed, and the function
        # always returned [].
        final_movies = [movie['title'] for movie in results['metadatas'][0]]

        # Skip the first hit — assumed to be the queried movie itself, since
        # its own embedding is the query vector.
        return final_movies[1:]

    except Exception as e:
        # Best-effort: report the problem and hand back an empty result so the
        # calling cell can keep running.
        print(f"Error occurred during request: {e}")
        return []

# Prediction

In [51]:
movie = "Black Panther"
# The movies DataFrame in this notebook is `database_key_based`; no variable
# named `data` is defined, so passing it would fail on a fresh kernel.
pred1 = find_similar_movies(movie, database_key_based, movie_collection)

# A string result is the "title not found" message; otherwise it is a list of titles.
if isinstance(pred1, str):
    print("Movie not found")
else:
    for title in pred1:
        print(title)


Guardians of the Galaxy Vol. 2
Spider-Man: Into the Spider-Verse
Captain Marvel
Justice League
Deadpool 2


In [37]:
movie = "Creed II"
# The movies DataFrame in this notebook is `database_key_based`; no variable
# named `data` is defined, so passing it would fail on a fresh kernel.
pred1 = find_similar_movies(movie, database_key_based, movie_collection)

# A string result is the "title not found" message; otherwise it is a list of titles.
if isinstance(pred1, str):
    print("Movie not found")
else:
    for title in pred1:
        print(title)


Just Mercy
Amateur
Escape Plan: The Extractors
American Wrestler: The Wizard
Molly's Game


In [38]:
movie = "The Lion King"
# The movies DataFrame in this notebook is `database_key_based`; no variable
# named `data` is defined, so passing it would fail on a fresh kernel.
pred1 = find_similar_movies(movie, database_key_based, movie_collection)

# A string result is the "title not found" message; otherwise it is a list of titles.
if isinstance(pred1, str):
    print("Movie not found")
else:
    for title in pred1:
        print(title)


Christopher Robin
Aladdin
Maleficent: Mistress of Evil
The Boy Who Harnessed the Wind
Dumbo


In [39]:
movie = "Joker"
# The movies DataFrame in this notebook is `database_key_based`; no variable
# named `data` is defined, so passing it would fail on a fresh kernel.
pred1 = find_similar_movies(movie, database_key_based, movie_collection)

# A string result is the "title not found" message; otherwise it is a list of titles.
if isinstance(pred1, str):
    print("Movie not found")
else:
    for title in pred1:
        print(title)


The Informer
Sleepless
Death Wish
Widows
Terminal


In [40]:
movie = "Hotel Mumbai"
# The movies DataFrame in this notebook is `database_key_based`; no variable
# named `data` is defined, so passing it would fail on a fresh kernel.
pred1 = find_similar_movies(movie, database_key_based, movie_collection)

# A string result is the "title not found" message; otherwise it is a list of titles.
if isinstance(pred1, str):
    print("Movie not found")
else:
    for title in pred1:
        print(title)


The 15:17 to Paris
7 Days in Entebbe
All the Money in the World
6 Days
12 Strong


With this, we have a recommendation system ready: it takes a movie title as input and returns the top 5 most similar movies, based on the cosine similarity between their embeddings. The Chroma database is used to retrieve the titles most similar to a given title.