In [1]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_chroma import Chroma 



In [2]:
import pandas as pd

movies = pd.read_csv("movies_cleaned.csv")

In [3]:
movies.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,No_of_Votes,...,Genre_Music,Genre_Romance,Genre_Animation,Genre_Film-Noir,Genre_Biography,Genre_Western,Genre_Mystery,Genre_Drama,Genre_Horror,Genre_Family
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994.0,142,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,2343110,...,0,0,0,0,0,0,0,1,0,0
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972.0,175,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,1620367,...,0,0,0,0,0,0,0,1,0,0
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008.0,152,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,2303232,...,0,0,0,0,0,0,0,1,0,0
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974.0,202,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,1129952,...,0,0,0,0,0,0,0,1,0,0
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957.0,96,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,689845,...,0,0,0,0,0,0,0,1,0,0


In [4]:
movies['id'] = movies.index.astype(str)

In [5]:
movies['tag_desc'] = movies['id'] + " " + movies['Series_Title'] + " " + movies['Overview']
movies['tag_desc']

0      0 The Shawshank Redemption Two imprisoned men ...
1      1 The Godfather An organized crime dynasty's a...
2      2 The Dark Knight When the menace known as the...
3      3 The Godfather: Part II The early life and ca...
4      4 12 Angry Men A jury holdout attempts to prev...
                             ...                        
744    744 Giù la testa A low-life bandit and an I.R....
745    745 Kelly's Heroes A group of U.S. soldiers sn...
746    746 The Jungle Book Bagheera the Panther and B...
747    747 A Hard Day's Night Over two "typical" days...
748    748 From Here to Eternity In Hawaii in 1941, a...
Name: tag_desc, Length: 749, dtype: object

In [6]:
movies['tag_desc'].to_csv('tag_desc.txt', sep='\n',index=False, header=False)

In [7]:
raw_docs = TextLoader("tag_desc.txt").load()
text_splitter = CharacterTextSplitter(separator='\n', chunk_size=0, chunk_overlap=0)
docs = text_splitter.split_documents(raw_docs)

Created a chunk of size 145, which is longer than the specified 0
Created a chunk of size 126, which is longer than the specified 0
Created a chunk of size 207, which is longer than the specified 0
Created a chunk of size 189, which is longer than the specified 0
Created a chunk of size 128, which is longer than the specified 0
Created a chunk of size 194, which is longer than the specified 0
Created a chunk of size 158, which is longer than the specified 0
Created a chunk of size 197, which is longer than the specified 0
Created a chunk of size 164, which is longer than the specified 0
Created a chunk of size 146, which is longer than the specified 0
Created a chunk of size 205, which is longer than the specified 0
Created a chunk of size 251, which is longer than the specified 0
Created a chunk of size 173, which is longer than the specified 0
Created a chunk of size 219, which is longer than the specified 0
Created a chunk of size 202, which is longer than the specified 0
Created a 

In [8]:
docs[0]

Document(metadata={'source': 'tag_desc.txt'}, page_content='0 The Shawshank Redemption Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.')

In [9]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

embedding_model = HuggingFaceBgeEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

db_movies = Chroma.from_documents(
    docs,
    embedding=embedding_model,
)

  embedding_model = HuggingFaceBgeEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


In [10]:
query = 'A movie about superheros'
sample = db_movies.similarity_search(query, k = 5)
test = [x.page_content for x in sample]
print(test) 

['257 The Incredibles A family of undercover superheroes, while trying to live the quiet suburban life, are forced into action to save the world.', '688 Watchmen In 1985 where former superheroes exist, the murder of a colleague sends active vigilante Rorschach into his own sprawling investigation, uncovering something that could completely change the course of history as we know it.', '2 The Dark Knight When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accept one of the greatest psychological and physical tests of his ability to fight injustice.', '231 Deadpool A wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man who ruined his looks.', '50 Spider-Man: Into the Spider-Verse Teen Miles Morales becomes the Spider-Man of his universe, and must join with five spider-powered individuals from other dimensions to stop a threat for all realities.']


In [11]:
movies[movies['id']==(sample[0].page_content.split()[0].strip())]

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,No_of_Votes,...,Genre_Animation,Genre_Film-Noir,Genre_Biography,Genre_Western,Genre_Mystery,Genre_Drama,Genre_Horror,Genre_Family,id,tag_desc
257,https://m.media-amazon.com/images/M/MV5BMTY5OT...,The Incredibles,2004.0,115,"Animation, Action, Adventure",8.0,"A family of undercover superheroes, while tryi...",90.0,Brad Bird,657047,...,1,0,0,0,0,0,0,0,257,257 The Incredibles A family of undercover sup...


In [12]:
def retrieve_movies(query: str, top_k:int = 10) -> pd.DataFrame:
    """Retrieve movies based on a query

    Args:
        query (str): The query to search for movies.
        top_k (int, optional): The number of top recommendations to return. Defaults to 10.

    Returns:
        pd.DataFrame: A DataFrame containing the recommended movies.
    """
    recs = db_movies.similarity_search_with_score(query, k=10)
    recs = sorted(recs, key=lambda x: x[1], reverse=True)
    movies_list = []
    for i in range(0, len(recs)):
        movies_list += [recs[i][0].page_content.strip('""').split()[0]]
    return movies[(movies['id']).isin(movies_list)].head(top_k)

In [13]:
retrieve_movies("A movie about superheroes", top_k=5)

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,No_of_Votes,...,Genre_Animation,Genre_Film-Noir,Genre_Biography,Genre_Western,Genre_Mystery,Genre_Drama,Genre_Horror,Genre_Family,id,tag_desc
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008.0,152,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,2303232,...,0,0,0,0,0,1,0,0,2,2 The Dark Knight When the menace known as the...
50,https://m.media-amazon.com/images/M/MV5BMjMwND...,Spider-Man: Into the Spider-Verse,2018.0,117,"Animation, Action, Adventure",8.4,Teen Miles Morales becomes the Spider-Man of h...,87.0,Bob Persichetti,375110,...,1,0,0,0,0,0,0,0,50,50 Spider-Man: Into the Spider-Verse Teen Mile...
231,https://m.media-amazon.com/images/M/MV5BYzE5Mj...,Deadpool,2016.0,108,"Action, Adventure, Comedy",8.0,A wisecracking mercenary gets experimented on ...,65.0,Tim Miller,902669,...,0,0,0,0,0,0,0,0,231,231 Deadpool A wisecracking mercenary gets exp...
237,https://m.media-amazon.com/images/M/MV5BNDYxNj...,The Avengers,2012.0,143,"Action, Adventure, Sci-Fi",8.0,Earth's mightiest heroes must come together an...,69.0,Joss Whedon,1260806,...,0,0,0,0,0,0,0,0,237,237 The Avengers Earth's mightiest heroes must...
257,https://m.media-amazon.com/images/M/MV5BMTY5OT...,The Incredibles,2004.0,115,"Animation, Action, Adventure",8.0,"A family of undercover superheroes, while tryi...",90.0,Brad Bird,657047,...,1,0,0,0,0,0,0,0,257,257 The Incredibles A family of undercover sup...


In [15]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 749 entries, 0 to 748
Data columns (total 34 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Poster_Link      749 non-null    object 
 1   Series_Title     749 non-null    object 
 2   Released_Year    749 non-null    float64
 3   Runtime          749 non-null    int64  
 4   Genre            749 non-null    object 
 5   IMDB_Rating      749 non-null    float64
 6   Overview         749 non-null    object 
 7   Meta_score       749 non-null    float64
 8   Director         749 non-null    object 
 9   No_of_Votes      749 non-null    int64  
 10  Genre_list       749 non-null    object 
 11  Genre_Sport      749 non-null    int64  
 12  Genre_Action     749 non-null    int64  
 13  Genre_War        749 non-null    int64  
 14  Genre_Fantasy    749 non-null    int64  
 15  Genre_Sci-Fi     749 non-null    int64  
 16  Genre_Comedy     749 non-null    int64  
 17  Genre_History   

In [17]:
movies.to_csv("movies.csv", index=False)