In [1]:
import os

import pandas as pd
import torch
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm import tqdm

In [2]:
def compare_and_replace(row):
    """Return ``row`` with ``text`` set to the longer of ``text`` and ``anime_Synopsis``.

    Intended to be used with ``DataFrame.apply(..., axis=1)``.

    Args:
        row: A mapping-like record (pandas Series or dict) that has the keys
            ``'anime_Synopsis'`` and ``'text'`` and supports ``.copy()``.

    Returns:
        The original ``row`` when nothing needs to change, otherwise a copy of
        it whose ``'text'`` entry holds the synopsis.
    """
    synopsis = row['anime_Synopsis']
    text = row['text']

    # A missing / non-string synopsis (None, NaN, ...) can never win the
    # comparison, and calling len() on it would raise TypeError.
    if not isinstance(synopsis, str):
        return row

    # A non-string ``text`` is treated as empty so a usable synopsis replaces it.
    if not isinstance(text, str) or len(synopsis) > len(text):
        # Work on a copy: pandas does not support UDFs that mutate the object
        # handed to them by ``apply``.
        row = row.copy()
        row['text'] = synopsis
    return row

In [3]:
# Embedding model configuration shared by every vector-store cell below.
# `normalize_embeddings` is forwarded to the encoder through `encode_kwargs`.
encode_kwargs = {"normalize_embeddings": True}

# Use the GPU when PyTorch can see one and fall back to the CPU otherwise, so
# the notebook still runs on machines without CUDA instead of failing here.
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"

embedding_function = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs={"device": embedding_device},
    encode_kwargs=encode_kwargs,
)

In [4]:
# Location of the anime dataset (JSON). Set the ANIME_DATA_PATH environment
# variable to point at your own copy; when it is unset or empty the original
# author's local path is used so existing setups keep working unchanged.
ANIME_DATA_PATH = os.environ.get("ANIME_DATA_PATH") or (
    r'C:\Users\jvhua\OneDrive\Desktop\CSE-6242-Group-Project\vector_database_creation\fin_anime_dfv1.json'
)
df = pd.read_json(ANIME_DATA_PATH)

In [5]:
# Run compare_and_replace once per row ("columns" is the named form of axis=1),
# so each row's `text` ends up holding the longer of `text` / `anime_Synopsis`.
df = df.apply(func=compare_and_replace, axis="columns")

In [6]:
# Collapse every token list into a single space-separated string. Values that
# are already strings are passed through untouched, which keeps this cell safe
# to re-run: joining an already-joined string would put a space between every
# character.
df['tokens'] = df['tokens'].apply(
    lambda tokens: tokens if isinstance(tokens, str) else " ".join(tokens)
)

# The placeholder string "UNKNOWN" is mapped to 0 so the whole column can be
# converted to float.
# NOTE(review): after this step an unknown score is indistinguishable from a
# real score of 0 - confirm that is acceptable for score-based filtering.
df['anime_Score'] = df['anime_Score'].replace("UNKNOWN", 0).apply(float)

In [7]:
# Metadata key stored on each Document -> DataFrame column it is read from.
# Keys are kept exactly as they were (including the capitalised 'Duration')
# because they are what metadata filters must refer to later.
ANIME_METADATA_COLUMNS = {
    "anime_id": 'anime_anime_id',
    "cast": 'imdb_name_basics_primaryName',
    'episodes': 'Episodes',
    'genre': 'Genres',
    'source': 'Source',
    'Duration': 'Duration',
    "name": 'Name',
    'tokens': 'tokens',
    'score': 'anime_Score',
    'producer': 'Producers',
    'studio': 'Studios',
    'licensors': 'Licensors',
}

# One Document per DataFrame row: the lower-cased `text` is the content, the
# columns listed above become its metadata.
# `total` is passed explicitly because iterrows() is a generator with no
# length; without it tqdm can only show a running count, not a bar or ETA.
anime_docs = [
    Document(
        page_content=row['text'].lower(),
        metadata={key: row[column] for key, column in ANIME_METADATA_COLUMNS.items()},
    )
    for _, row in tqdm(df.iterrows(), total=len(df))
]

11565it [00:01, 7032.36it/s]


In [8]:
# Chunking parameters handed to the splitter (presumably measured in
# characters - confirm against the splitter's length function).
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

# Break every anime Document into overlapping chunks before embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)
split_anime_docs = text_splitter.split_documents(anime_docs)

In [9]:
# Sanity check: number of entries in split_anime_docs.
len(split_anime_docs)

38400

In [10]:
# Preview only the first few chunks; echoing the whole list would write every
# chunk's text and metadata into the notebook output.
split_anime_docs[:3]

[Document(page_content="initial d (japanese: 頭文字イニシャル d, hepburn: inisharu dī) is a japanese street racing manga series written and illustrated by shuichi shigeno. it was serialized in kodansha's seinen manga magazine weekly young magazine from 1995 to 2013, with the chapters collected into 48 tankōbon volumes. the story focuses on the world of illegal japanese street racing, where all the action is concentrated in the mountain passes and rarely in cities or urban areas, and with the drifting racing style emphasized in particular. professional race car driver and pioneer of drifting keiichi tsuchiya helped with editorial supervision. the story is centered on the prefecture of gunma, more specifically on several mountains in the kantō region and in their surrounding cities and towns. although some of the names of the locations the characters race in have been fictionalized, all of the locations in the series are based on actual locations in japan.", metadata={'anime_id': 18, 'cast': 'UN

In [41]:
# Embed every chunk and persist the collection under ./chroma_db.
#
# Each chunk gets a deterministic id based on its position in
# split_anime_docs. Without explicit ids, every run of this cell stores the
# chunks under freshly generated ids, so re-running it against an existing
# ./chroma_db appends a duplicate copy of the whole dataset.
# NOTE(review): relies on the LangChain Chroma wrapper overwriting entries
# whose id already exists - confirm for the installed version. A ./chroma_db
# that was built earlier without ids must be deleted once, and chunks left
# over from a previously larger dataset are not removed.
chunk_ids = [f"anime-chunk-{position}" for position in range(len(split_anime_docs))]
db = Chroma.from_documents(
    split_anime_docs,
    embedding_function,
    ids=chunk_ids,
    persist_directory="./chroma_db",
)

In [12]:
# Maximal-marginal-relevance retriever returning 5 chunks per query.
# lambda_mult=0 is the "maximum diversity" end of the MMR trade-off
# (1 would be minimum diversity).
# The previous "score_threshold": 0.5 entry was dropped: that option is only
# consulted by search_type="similarity_score_threshold" and has no effect on
# an MMR search, so keeping it suggested a relevance cut-off that never ran.
retriever = db.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 5, "lambda_mult": 0},
)

In [13]:
# Smoke-test the retriever with a free-text query and display what comes back.
query = "I want to be like monkey d luffy"
docs = retriever.invoke(query)
docs

[Document(page_content='monkey d. luffy is the series\' main protagonist, a young pirate who wishes to succeed gold roger, the deceased king of the pirates, by finding his treasure, the "one piece". throughout the series, luffy gathers himself a diverse crew named the straw hat pirates, including: the three-sword-wielding combatant roronoa zoro (sometimes referred to as roronoa zolo in the english manga); the thief and navigator nami; the cowardly marksman and inventor usopp; the amorous cook and martial artist sanji; the anthropomorphic reindeer and doctor tony tony chopper; the archaeologist nico robin; the cyborg shipwright franky; the living skeleton musician brook; and the fish-man helmsman jimbei. together they sail the seas in pursuit of their dreams, encountering other pirates, bounty hunters, criminal organizations, revolutionaries, secret agents and soldiers of the corrupt world government, and various other friends and foes.', metadata={'Duration': '24 min per ep', 'anime_id

In [None]:
# Comparison operators accepted in Chroma metadata filters (the `filter=` argument):
# $eq - equal to (string, int, float)
# $ne - not equal to (string, int, float)
# $gt - greater than (int, float)
# $gte - greater than or equal to (int, float)
# $lt - less than (int, float)
# $lte - less than or equal to (int, float)

In [39]:
# Similarity search restricted by metadata: a chunk qualifies when its
# `tokens` metadata satisfies $in ["lupin"] OR its `score` is at least 9.0.
# NOTE(review): `tokens` is stored as one space-joined string, and $in looks
# like an exact-value membership test rather than a substring match, so the
# first clause may never match anything - confirm against the Chroma docs.
metadata_filter = {
    "$or": [
        {"tokens": {'$in': ["lupin"]}},
        {"score": {"$gte": 9.0}},
    ]
}
results = db.similarity_search(
    "I want to be like monkey d luffy",
    k=10,
    filter=metadata_filter,
)
results

[Document(page_content='the series focuses on an eccentric samurai, gintoki sakata who works as an odd-jobs freelancer. he helps a teenager named shinpachi shimura save his sister tae from an alien group that wants to send her to a brothel. impressed by gintoki, shinpachi becomes his freelance apprentice to pay the bills and learn more about the enigmatic samurai. when the pair rescues a teenage alien girl with super-strength, kagura, from a yakuza group, they accept her into their odd-jobs freelancing business, and the three become known as "yorozuya" (万事屋, \'store of 10,000 business\' or \'we do everything\').\nwhile working, they regularly encounter the shinsengumi police force, who often ally with gintoki when work involves dangerous criminals. the trio also meets gintoki\'s former comrades from the amanto invasion, including the revolutionary kotaro katsura who is friendly toward them despite his terrorist activities against the alien-controlled government.', metadata={'Duration':

In [None]:
# Open the collection persisted under ./chroma_db, passing the same
# embedding function that the rest of the notebook uses.
new_db = Chroma(
    embedding_function=embedding_function,
    persist_directory="./chroma_db",
)
