# Importing the dependencies, Loading the Data and API keys

In [13]:
import ast
import os
import sqlite3

import chromadb
import pandas as pd
from chromadb.config import Settings
from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings

In [2]:
# Load variables from a local .env file into the process environment, then
# read the OpenAI key back out (None when the variable is not set).
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [3]:
# Read the movie table (id, title and tag text per movie) and preview it.
movies_csv_path = "../Data/Movies_Key_Based.csv"
data = pd.read_csv(movies_csv_path)
data.head()

Unnamed: 0,id,title,tags
0,283995,Guardians of the Galaxy Vol. 2,adventure action sci-fi chris pratt zoe saldañ...
1,480530,Creed II,drama michael b. jordan sylvester stallone tes...
2,299536,Avengers: Infinity War,adventure action sci-fi robert downey jr. chri...
3,299534,Avengers: Endgame,adventure sci-fi action robert downey jr. chri...
4,337167,Fifty Shades Freed,drama romance dakota johnson jamie dornan eric...


# Creating Embeddings using the OpenAI Embedding Model

In [17]:
# Embedding client used by get_embeddings() to turn text into vectors.
# NOTE(review): built with library defaults; presumably picks up
# OPENAI_API_KEY from the environment loaded earlier — confirm.
embedding = OpenAIEmbeddings()

In [23]:
def get_embeddings(text):
    """Embed one piece of text with the module-level ``embedding`` client.

    Args:
        text: The text to embed; passed unchanged to ``embed_query``.

    Returns:
        Whatever ``embedding.embed_query`` returns for ``text``.
    """
    return embedding.embed_query(text)

In [26]:
# Embed each movie's tag text (one get_embeddings call per row) and store the
# resulting vectors next to the source columns, then preview the frame.
tag_vectors = data['tags'].apply(get_embeddings)
data['embeddings'] = tag_vectors
data.head()

Unnamed: 0,id,title,tags,embeddings
0,283995,Guardians of the Galaxy Vol. 2,adventure action sci-fi chris pratt zoe saldañ...,"[-0.010018928121777062, -0.042597577593544884,..."
1,480530,Creed II,drama michael b. jordan sylvester stallone tes...,"[-0.012480250747134571, -0.02905849380429892, ..."
2,299536,Avengers: Infinity War,adventure action sci-fi robert downey jr. chri...,"[-0.01140899767743463, -0.027857139652446076, ..."
3,299534,Avengers: Endgame,adventure sci-fi action robert downey jr. chri...,"[-0.00412223552630982, -0.031019326010432745, ..."
4,337167,Fifty Shades Freed,drama romance dakota johnson jamie dornan eric...,"[-0.0075680177194708525, -0.024964091224630152..."


In [30]:
# Reload the movie table from the SQLite database.
# NOTE(review): find_similar_movies() parses data['embeddings'] from text, so
# this table presumably stores the embeddings as strings — confirm.
database = "../SQL_Database/Movies.db"

# Keep a handle on the connection so it can be closed: a connection created
# inline is never closed explicitly, and sqlite3's `with` block only manages
# transactions, not the connection's lifetime.
connection = sqlite3.connect(database)
try:
    data = pd.read_sql_query("SELECT m.* FROM Movies_Key_Based AS m", connection)
finally:
    connection.close()

# ChromaDB Client Setup and Storage

In [25]:
# Directory handed to Chroma as its on-disk storage location.
storage_folder = "../Chroma_Database"
# Persistent client bound to that directory.
client = chromadb.PersistentClient(path=storage_folder)

In [10]:
# The client is persistent, so 'Movies' already exists on every run after the
# first and create_collection() would raise. Get-or-create keeps this cell
# safe under Restart & Run All.
movie_collection = client.get_or_create_collection(name='Movies')

In [27]:
# Look up the 'Movies' collection by name on the client and rebind the handle
# (lets a later session reuse the stored collection without recreating it).
movie_collection = client.get_collection(name='Movies')

In [15]:
# Index every movie in Chroma: the movie id (as a string) is the record id,
# the embedding is the vector, and title/tags are stored as metadata.
for index, row in data.iterrows():
    movie_embedding = row['embeddings']
    # When `data` comes back from storage the embedding may be text such as
    # "[0.1, 0.2, ...]"; Chroma needs numbers, so parse it. literal_eval only
    # accepts Python literals, so stored text can never execute code.
    if isinstance(movie_embedding, str):
        movie_embedding = ast.literal_eval(movie_embedding)

    movie_collection.add(
        ids = str(row['id']),
        embeddings = movie_embedding,
        metadatas = {'title': row['title'], 'tags': row['tags']}
    )

Let us now create the function that will query the Chroma database for the titles most similar to a given title.

In [50]:
# Find similar movies based on the movie title from the Chroma database
def find_similar_movies(movie_title, data, movie_collection, top_k=6):
    """Return the titles of the movies most similar to ``movie_title``.

    The movie's stored embedding is looked up in ``data`` and used to query
    ``movie_collection`` for its ``top_k`` nearest neighbours. The query movie
    itself is removed from the hits, so at most ``top_k - 1`` titles come back.

    Args:
        movie_title: Exact title to look up in ``data['title']``.
        data: DataFrame with ``title`` and ``embeddings`` columns. An ``id``
            column is used, when present, to recognise the query movie among
            the hits. Embeddings may be sequences of floats or their text form.
        movie_collection: Object exposing ``query(query_embeddings=...,
            n_results=...)`` that returns ``ids`` and ``metadatas`` lists.
        top_k: Number of neighbours requested from the collection.

    Returns:
        A list of titles on success; a message string when the title is not
        in ``data``; an empty list when the lookup raises (the error is
        printed rather than propagated).
    """
    try:
        movie_row = data[data['title'] == movie_title]

        if movie_row.empty:
            return f"Movie title '{movie_title}' not found in the database."

        query_movie = movie_row.iloc[0]

        # Embeddings loaded from storage may be text; literal_eval parses
        # only Python literals, unlike eval which would run arbitrary code.
        query_embedding = query_movie['embeddings']
        if isinstance(query_embedding, str):
            query_embedding = ast.literal_eval(query_embedding)

        results = movie_collection.query(
            query_embeddings=[query_embedding],
            n_results=top_k
        )

        hits = list(zip(results['ids'][0], results['metadatas'][0]))

        query_id = query_movie.get('id')
        if query_id is None:
            # No id to match on: assume the best hit is the query movie.
            neighbours = hits[1:]
        else:
            # Drop the query movie wherever it ranks, instead of blindly
            # discarding the first hit; ids were stored as str(id).
            query_id = str(query_id)
            neighbours = [hit for hit in hits if hit[0] != query_id][:top_k - 1]

        return [metadata['title'] for _, metadata in neighbours]

    except Exception as e:
        # Best-effort contract kept for callers: report the error, return [].
        print(f"Error occurred during request: {e}")
        return []

# Prediction

In [51]:
# Ask the recommender for titles similar to one movie and list the result.
movie = "Black Panther"
pred1 = find_similar_movies(movie, data, movie_collection)

# A string result is the "title not found" message; otherwise it is a list.
if isinstance(pred1, str):
    print("Movie not found")
else:
    for recommended_title in pred1:
        print(recommended_title)


Guardians of the Galaxy Vol. 2
Spider-Man: Into the Spider-Verse
Captain Marvel
Justice League
Deadpool 2


In [37]:
# Ask the recommender for titles similar to one movie and list the result.
movie = "Creed II"
pred1 = find_similar_movies(movie, data, movie_collection)

# A string result is the "title not found" message; otherwise it is a list.
if isinstance(pred1, str):
    print("Movie not found")
else:
    for recommended_title in pred1:
        print(recommended_title)


Just Mercy
Amateur
Escape Plan: The Extractors
American Wrestler: The Wizard
Molly's Game


In [38]:
# Ask the recommender for titles similar to one movie and list the result.
movie = "The Lion King"
pred1 = find_similar_movies(movie, data, movie_collection)

# A string result is the "title not found" message; otherwise it is a list.
if isinstance(pred1, str):
    print("Movie not found")
else:
    for recommended_title in pred1:
        print(recommended_title)


Christopher Robin
Aladdin
Maleficent: Mistress of Evil
The Boy Who Harnessed the Wind
Dumbo


In [39]:
# Ask the recommender for titles similar to one movie and list the result.
movie = "Joker"
pred1 = find_similar_movies(movie, data, movie_collection)

# A string result is the "title not found" message; otherwise it is a list.
if isinstance(pred1, str):
    print("Movie not found")
else:
    for recommended_title in pred1:
        print(recommended_title)


The Informer
Sleepless
Death Wish
Widows
Terminal


In [40]:
# Ask the recommender for titles similar to one movie and list the result.
movie = "Hotel Mumbai"
pred1 = find_similar_movies(movie, data, movie_collection)

# A string result is the "title not found" message; otherwise it is a list.
if isinstance(pred1, str):
    print("Movie not found")
else:
    for recommended_title in pred1:
        print(recommended_title)


The 15:17 to Paris
7 Days in Entebbe
All the Money in the World
6 Days
12 Strong


With this, we have a recommendation system ready: it takes a movie title as input and returns the top 5 most similar movies based on the cosine similarity between their embeddings. We can use the Chroma database to get the titles most similar to a given title.