# Importing the dependencies, Loading the Data and API keys

In [6]:
import ast
import os
import sqlite3

import chromadb
import pandas as pd
from chromadb.config import Settings
from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings

In [7]:
# Load variables from a .env file, then read the OpenAI key from the
# process environment (None when the variable is not set).
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [66]:
# Path to the SQLite file holding the pre-computed movie table.
database = "../SQL_Database/Movies.db"

# Read the whole Movies_Key_Based table into a DataFrame. The connection is
# closed explicitly once the read finishes (or fails) so the notebook does not
# keep a handle on the database file open for the life of the kernel.
connection = sqlite3.connect(database)
try:
    database_key_based = pd.read_sql_query(
        "SELECT m.* FROM Movies_Key_Based AS m", connection
    )
finally:
    connection.close()

database_key_based.head()

Unnamed: 0,id,title,keywords,review_summary,tags,embeddings
0,283995,Guardians of the Galaxy Vol. 2,"['demi god', 'alien creature', 'sarcasm', 'cra...",Guardians of the Galaxy Vol. 2 elicited a gene...,adventure action sci-fi chris pratt zoe saldañ...,"[-0.010018928121777062, -0.042597577593544884,..."
1,480530,Creed II,"['baby', 'training montage', 'sequel', 'boxing...",Creed II elicits a mixed response from audienc...,drama michael b. jordan sylvester stallone tes...,"[-0.012480250747134571, -0.02905849380429892, ..."
2,299536,Avengers: Infinity War,"['superhero', 'ensemble cast', 'marvel cinemat...",Avengers: Infinity War elicits a generally pos...,adventure action sci-fi robert downey jr. chri...,"[-0.01140899767743463, -0.027857139652446076, ..."
3,299534,Avengers: Endgame,"['time travel', 'superhero', 'super villain', ...",Avengers: Endgame elicited a largely positive ...,adventure sci-fi action robert downey jr. chri...,"[-0.00412223552630982, -0.031019326010432745, ..."
4,337167,Fifty Shades Freed,"['sex scene', 'wedding ceremony', 'bondage', '...",The audience reaction to Fifty Shades Freed is...,drama romance dakota johnson jamie dornan eric...,"[-0.0075680177194708525, -0.024964091224630152..."


# Creating Embeddings using OPENAI Embedding Model
We will be using the OpenAI Embedding Model to create embeddings for the text data. The OpenAI Embedding Model is a transformer-based model that is trained on a large corpus of text data. The model is capable of generating high-quality embeddings for text data, which can be used for various natural language processing tasks.

In [17]:
# Embedding client built with the library's default settings (no arguments passed).
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment — confirm.
embedding = OpenAIEmbeddings()

In [23]:
def get_embeddings(text):
    """Return the embedding of ``text`` as produced by ``embedding.embed_query``."""
    return embedding.embed_query(text)

# ChromaDB Client Setup and Storage

In [68]:
# Folder (relative to the notebook's working directory) backing the Chroma store.
storage_folder = "../CHROMA_DATABASE"
# Persistent Chroma client rooted at that folder.
Client = chromadb.PersistentClient(path=storage_folder)

In [70]:
# The client is persistent, so the 'MOVIES' collection survives kernel restarts.
# get_or_create_collection reuses it when it already exists, which keeps this
# cell re-runnable (create_collection raises if the name is already taken).
movie_collection = Client.get_or_create_collection(name='MOVIES')

In [71]:
# Bare expression: echoes the collection object's repr as the cell output.
movie_collection

<chromadb.api.models.Collection.Collection at 0x1219b3950>

In [72]:
# Store one record per movie in the Chroma collection, keyed by the movie id.
# upsert (rather than add) makes the cell safe to re-run against the
# persistent store: existing ids are updated instead of duplicated/rejected.
for _index, row in database_key_based.iterrows():
    movie_collection.upsert(
        ids=str(row['id']),
        # The embeddings column stores the vector as its text representation.
        # ast.literal_eval parses Python literals only, so unlike eval it can
        # never execute code that ended up in the database.
        embeddings=ast.literal_eval(row["embeddings"]),
        metadatas={
            'title': row['title'],
            'keywords': row['keywords'],
            'review_summary': row['review_summary'],
        },
    )

Let us now create the function that will query the Chroma database for the titles most similar to a given title.

In [50]:
# Find similar movies based on the movie title from the Chroma database
def find_similar_movies(movie_title, data, movie_collection, top_k=6):
    """Return metadata of the movies most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Exact title to look up in ``data['title']``. If several rows match,
        the first one is used.
    data : pandas.DataFrame
        Must contain ``title`` and ``embeddings`` columns; ``embeddings`` holds
        each vector as the text of a Python list literal. An ``id`` column, when
        present, is used to exclude the query movie from the results.
    movie_collection
        Object exposing ``query(query_embeddings=..., n_results=...)`` and
        returning a mapping with ``'ids'`` and ``'metadatas'`` lists.
    top_k : int, default 6
        Number of neighbours requested from the collection. Because the query
        movie itself is excluded, at most ``top_k - 1`` entries are returned.

    Returns
    -------
    list[dict] | str
        Metadata dicts of the similar movies, nearest first. A message string
        when the title is not in ``data``. An empty list when any error occurs
        (the error is printed).
    """
    try:
        movie_row = data[data['title'] == movie_title]

        if movie_row.empty:
            return f"Movie title '{movie_title}' not found in the database."

        query_row = movie_row.iloc[0]
        # literal_eval only parses literals, so stored text can never run code.
        query_embedding = ast.literal_eval(query_row['embeddings'])

        results = movie_collection.query(
            query_embeddings=[query_embedding],
            n_results=top_k
        )
        metadatas = results['metadatas'][0]

        if 'id' not in query_row.index:
            # Cannot identify the query movie; assume it is the nearest hit.
            return list(metadatas[1:])

        # Drop the query movie by id instead of blindly discarding the first
        # hit: with ties or duplicate vectors it is not guaranteed to be first.
        query_id = str(query_row['id'])
        similar = [
            metadata
            for result_id, metadata in zip(results['ids'][0], metadatas)
            if result_id != query_id
        ]
        return similar[:max(top_k - 1, 0)]

    except Exception as e:
        # Best-effort by design: report the problem and return no matches.
        print(f"Error occurred during request: {e}")
        return []

# Prediction

In [51]:
movie = "Black Panther"
pred1 = find_similar_movies(movie, database_key_based, movie_collection)

# find_similar_movies returns a message string (instead of a list) when the
# title is missing from the dataframe.
if isinstance(pred1, str):
    print("Movie not found")
else:
    for recommendation in pred1:
        print(recommendation)


{'keywords': "['demi god', 'alien creature', 'sarcasm', 'crash landing', 'blue skin', 'green skin', 'good versus evil', 'sister sister relationship', 'father son reunion', 'father son fight', 'raccoon', 'marvel cinematic universe', 'mixtape', 'bipedal alien', 'patricide', 'superhero team', 'anti hero', 'estranged sister', 'final showdown', 'megalomaniac', 'evil god', 'mechanical hand', 'space fleet', 'tree like humanoid', 'celestial']", 'reveiw_summary': "Guardians of the Galaxy Vol. 2 elicited a generally positive response from audiences, with many praising its humor, action, and character development. While viewers found the film entertaining and visually impressive, some felt it lacked the freshness and originality of its predecessor. The movie's soundtrack was widely applauded, with reviewers highlighting its perfect blend of classic hits and emotional weight.  Several critiques focused on the villain's lack of depth and the movie's tendency to rely on over-the-top comedy,  suggest

In [53]:
movie = "Creed II"
pred1 = find_similar_movies(movie, database_key_based, movie_collection)

# find_similar_movies returns a message string (instead of a list) when the
# title is missing from the dataframe.
if isinstance(pred1, str):
    print("Movie not found")
else:
    for recommendation in pred1:
        print(recommendation)


{'keywords': "['death penalty', 'injustice', 'humiliation', 'intimidation', 'concealing the truth', 'racial bias', 'wrongful conviction of murder', 'based on true story', 'prison', 'execution', 'electric chair', 'race relations', 'husband wife relationship', 'mother son relationship', 'father son relationship', 'mother daughter relationship', 'father daughter relationship', 'telephone call', 'telephone', 'pay phone', 'mobile phone', 'typewriter', 'alabama', 'monroeville alabama', 'reference to harper lee']", 'reveiw_summary': '"Just Mercy" elicits a strong emotional response from viewers, with most reviewers expressing a profound sense of sadness, anger, and disgust at the systemic racism and injustice depicted in the film. The true story of Walter McMillian and Bryan Stevenson\'s fight for justice deeply moved many, while some also found the performances, particularly those of Michael B. Jordan and Jamie Foxx, to be exceptional. However, a few critics felt the film lacked originality 

In [54]:
movie = "The Lion King"
pred1 = find_similar_movies(movie, database_key_based, movie_collection)

# find_similar_movies returns a message string (instead of a list) when the
# title is missing from the dataframe.
if isinstance(pred1, str):
    print("Movie not found")
else:
    for recommendation in pred1:
        print(recommendation)


{'keywords': "['father daughter relationship', 'live action and animation', 'bear', 'pig', 'live action cgi hybrid', 'balloon', 'honey', 'piglet', 'storybook in opening shot', 'loyality', 'childhood', 'promise', 'childhood memory', 'rabbit', 'tiger', 'donkey', 'winnie the pooh', 'magical realism', 'title spoken by character', 'kangaroo', 'owl', 'memory', 'dog', 'two word title', 'younger version of character']", 'reveiw_summary': '"Christopher Robin" elicits a mixed response from viewers, with a clear divide between those who find it a heartwarming and nostalgic return to the Hundred Acre Wood and those who find it overly sentimental or lacking in substance. The film\'s most praised elements include its stunning visuals, particularly the CGI rendering of Winnie the Pooh and his friends, which effectively blends the familiar cartoon characters with a more realistic aesthetic. The performances, particularly Ewan McGregor\'s nuanced portrayal of a jaded Christopher Robin and Jim Cummings\

In [55]:
movie = "Joker"
pred1 = find_similar_movies(movie, database_key_based, movie_collection)

# find_similar_movies returns a message string (instead of a list) when the
# title is missing from the dataframe.
if isinstance(pred1, str):
    print("Movie not found")
else:
    for recommendation in pred1:
        print(recommendation)


{'keywords': "['tied up while barefoot', 'ex convict', 'undercover', 'maximum security prison', 'bound and gagged', 'police detective', 'shot in the arm', 'security guard', 'prison warden', 'prison gang', 'murder', 'kidnapping', 'held at gunpoint', 'betrayal', 'blood', 'mob boss', 'photograph', 'faked death', 'reverse footage', 'fake identity', 'surprise ending', 'tied feet', 'prison', 'revenge', 'shot in the leg']", 'reveiw_summary': "The Informer garners a mixed reaction from viewers. While many praise the film's tense atmosphere, gritty realism, and strong performances, particularly from Joel Kinnaman, several reviewers find the plot predictable and the pacing uneven. The film's intense and violent scenes, as well as the portrayal of prison life, are frequently cited as highlights, while some criticize the underdeveloped characters and the reliance on clichés. Overall, the audience finds The Informer to be an entertaining and engaging thriller, but one that falls short of achieving 

In [56]:
movie = "Hotel Mumbai"
pred1 = find_similar_movies(movie, database_key_based, movie_collection)

# find_similar_movies returns a message string (instead of a list) when the
# title is missing from the dataframe.
if isinstance(pred1, str):
    print("Movie not found")
else:
    for recommendation in pred1:
        print(recommendation)


{'keywords': "['train', 'terrorism', 'paris france', 'american', 'soldier', 'airman', 'vacation', 'france', '2000s', 'school', 'mother son relationship', 'sacramento california', 'friendship', 'san antonio texas', 'portugal', 'terrorist', 'jammed gun', 'armed assailant', 'french legion of honor', 'based on true story', 'time in title', 'hour in title', 'flashback', 'nonlinear timeline', 'school sport']", 'reveiw_summary': "The 15:17 to Paris elicited a mixed reaction from viewers, with opinions sharply divided on the film's merits. Many praised the film's authenticity, appreciating Clint Eastwood's decision to cast the real-life heroes in their own story. However, widespread criticism targeted the film's pacing, deemed slow and lacking dramatic tension, and the acting, which was considered uneven, particularly from the non-professional actors.  Several reviewers felt the film lacked a compelling narrative, with too much time devoted to backstory and unnecessary scenes, ultimately under

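With this, we have a recommendation system ready: it takes a movie title as input and returns the top 5 most similar movies, ranked by the similarity between their embeddings. Note that the collection above is created with Chroma's default distance (squared L2) rather than an explicit cosine setting; for unit-length embeddings, such as those returned by OpenAI, this produces the same ranking as cosine similarity. We can use the Chroma database to get the most similar titles to a given title.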