In [81]:
import os

import pandas as pd
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm import tqdm

from utils.textpreprocessing import TextPreprocessor

In [9]:
# Embedding model shared by index construction and index loading below.
# Runs on CPU and asks the encoder to normalize the embeddings it returns.
encode_kwargs = dict(normalize_embeddings=True)
embedding_function = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=dict(device="cpu"),
    model_name='sentence-transformers/all-MiniLM-L6-v2',
)

In [16]:
# Location of the anime dataset (JSON). Set the ANIME_DATA_PATH environment
# variable to point at your own copy; the original author's path is kept as the
# fallback so existing setups keep working unchanged.
ANIME_DATA_PATH = os.environ.get(
    "ANIME_DATA_PATH",
    '/Users/justinvhuang/Desktop/CSE-6242-Group-Project/fin_anime_dfv1.json',
)
df = pd.read_json(ANIME_DATA_PATH)

In [17]:
def compare_and_replace(row):
    """Prefer the synopsis as the row's text when the synopsis is longer.

    Args:
        row: A mapping-like row (e.g. a pandas Series or a dict) exposing the
            'anime_Synopsis' and 'text' keys.

    Returns:
        The same ``row`` object. Its 'text' entry is overwritten with
        'anime_Synopsis' when the synopsis is strictly longer; otherwise the
        row is returned untouched.

    Values that have no length (for example None or a float NaN standing in
    for a missing entry) are treated as length 0 instead of raising
    ``TypeError``, so a missing synopsis never replaces an existing text and a
    missing text is filled from a non-empty synopsis.
    """
    def _safe_len(value):
        # Missing entries are not sized objects; count them as empty.
        try:
            return len(value)
        except TypeError:
            return 0

    if _safe_len(row['anime_Synopsis']) > _safe_len(row['text']):
        row['text'] = row['anime_Synopsis']
    return row

In [18]:
# Row-wise pass: each row is handed to compare_and_replace and the returned
# rows are reassembled into the frame.
df = df.apply(compare_and_replace, axis="columns")

In [19]:
def _dedupe_tokens(tokens):
    """Join the unique tokens of one row into a single space-separated string.

    Duplicates are dropped while keeping first-seen order, so the result is
    the same on every kernel run (joining a ``set`` of strings yields an
    order that can change between interpreter sessions).

    A value that is already a string is returned unchanged, which makes this
    cell safe to re-run: otherwise the joined string would be split into
    individual characters on the second execution.
    """
    if isinstance(tokens, str):
        return tokens
    return " ".join(dict.fromkeys(tokens))


df['tokens'] = df['tokens'].apply(_dedupe_tokens)

In [21]:
# Swap the "UNKNOWN" placeholder for 0 so the score column can be cast to a number.
df['anime_Score'] = df['anime_Score'].replace(to_replace="UNKNOWN", value=0)

In [22]:
# Convert every score to a float, element by element.
df['anime_Score'] = df['anime_Score'].apply(float)

In [32]:
def _row_to_document(row):
    """Turn one DataFrame row into a Document.

    The lower-cased 'text' column becomes the page content; the remaining
    descriptive columns are copied into the metadata under the keys below.
    """
    return Document(
        page_content=row['text'].lower(),
        metadata={
            "anime_id": row['anime_anime_id'],
            "cast": row['imdb_name_basics_primaryName'],
            'episodes': row['Episodes'],
            'genre': row['Genres'],
            'source': row['Source'],
            'Duration': row['Duration'],
            "name": row['Name'],
            'tokens': row['tokens'],
            'score': row['anime_Score'],
            'producer': row['Producers'],
            'studio': row['Studios'],
            'licensors': row['Licensors'],
        },
    )


# iterrows() is a generator with no length, so tqdm is told the total
# explicitly; without it only a running counter is shown (no bar, no ETA).
anime_docs = [
    _row_to_document(row)
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Building documents")
]

11565it [00:00, 11829.73it/s]


In [33]:
# Splitter settings, named so they can be tuned in one place.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

# Break the documents into chunks using the settings above.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)
split_anime_docs = text_splitter.split_documents(anime_docs)

In [34]:
# Preview only the first few chunks; echoing the whole list floods the
# notebook output and bloats the saved file.
split_anime_docs[:3]

[Document(page_content="initial d (japanese: 頭文字イニシャル d, hepburn: inisharu dī) is a japanese street racing manga series written and illustrated by shuichi shigeno. it was serialized in kodansha's seinen manga magazine weekly young magazine from 1995 to 2013, with the chapters collected into 48 tankōbon volumes. the story focuses on the world of illegal japanese street racing, where all the action is concentrated in the mountain passes and rarely in cities or urban areas, and with the drifting racing style emphasized in particular. professional race car driver and pioneer of drifting keiichi tsuchiya helped with editorial supervision. the story is centered on the prefecture of gunma, more specifically on several mountains in the kantō region and in their surrounding cities and towns. although some of the names of the locations the characters race in have been fictionalized, all of the locations in the series are based on actual locations in japan.", metadata={'anime_id': 18, 'cast': 'UN

In [35]:
# Embed every chunk with embedding_function and build the FAISS vector store.
db = FAISS.from_documents(documents=split_anime_docs, embedding=embedding_function)

In [37]:
# Retriever using Maximal Marginal Relevance (MMR), returning 10 documents.
# lambda_mult=0 requests maximum diversity among the returned documents
# (per the LangChain docs: 0 = most diverse, 1 = least diverse).
#
# The former "score_threshold": 0.5 entry was removed: that option belongs to
# the "similarity_score_threshold" search type and has no effect on an MMR
# search, so keeping it only suggested a filter that was never applied.
retriever = db.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 10, "lambda_mult": 0},
)

In [60]:
# Smoke-test the in-memory index with a free-text preference.
query = "I like comedic isekai"
docs = retriever.invoke(query)

docs

[Document(page_content="my isekai life, short for my isekai life: i gained a second character class and became the strongest sage in the world! (転生賢者の異世界ライフ 〜第二の職業を得て、世界最強になりました〜, tensei kenja no isekai raifu ~daini no shokugyō o ete, sekai saikyō ni narimashita~), is a japanese light novel series written by shinkoshoto and illustrated by huuka kazabana. it began serialization online in october 2017 on the user-generated novel publishing website shōsetsuka ni narō. it was later acquired by sb creative, who has released the series since may 2018 under their ga novel label.\na manga adaptation with art by ponjea has been serialized online since july 2018 via square enix's online manga magazine manga up!. the manga is licensed in north america by square enix. an anime television series adaptation by revoroot aired from july to september 2022.", metadata={'anime_id': 47163, 'cast': 'John Gremillion, Cyrus Rodas, Shinkô Shotô, Christina Marie Kelly, Blake Shepard, Jack Stansbury, John Swase

In [44]:
# Persist the index to a local folder so it can be reloaded without re-embedding.
db.save_local(folder_path="faiss_anime_index_v2")

In [45]:
# Reload the index saved above, using the same embedding function.
# NOTE(review): allow_dangerous_deserialization=True opts in to unpickling the
# stored docstore (per the LangChain FAISS docs). Only load index folders that
# this notebook produced; never point this at an untrusted directory.
new_db = FAISS.load_local(
    folder_path="faiss_anime_index_v2",
    embeddings=embedding_function,
    allow_dangerous_deserialization=True,
)

In [48]:
# MMR retriever over the reloaded index, returning 20 documents with
# lambda_mult=0 (maximum diversity per the LangChain docs).
#
# The former "score_threshold": 0.5 entry was removed: that option belongs to
# the "similarity_score_threshold" search type and has no effect on an MMR
# search, so keeping it only suggested a filter that was never applied.
#
# NOTE(review): no fetch_k is given, so the library default candidate pool is
# used (20 according to the LangChain docs - confirm for the installed
# version). With k equal to that pool size MMR can only reorder candidates,
# not choose among them; consider passing a larger fetch_k if diversity matters.
retriever2 = new_db.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 20, "lambda_mult": 0},
)

In [49]:
# Query the reloaded index to confirm the save/load round trip works.
query = "I am a space cowboy"
docs = retriever2.invoke(query)

docs

[Document(page_content="welcome to the space show (japanese: 宇宙ショーへようこそ, hepburn: uchū shō e yōkoso) is a 2010 japanese animated science fiction film produced by a-1 pictures and distributed by aniplex. it was directed by koji masunari from a script by hideyuki kurata, and stars the voices of tomoyo kurosawa, honoka ikezuki, shōtarō uzawa, tamaki matsumoto, takuto yoshinaga, and keiji fujiwara. in the film, five children rescue an alien dog, and are rewarded a trip to the moon in response. after events cause the group to become stranded in space, they must find a way to return to earth before the kids' parents arrive.", metadata={'anime_id': 8115, 'cast': 'Takuto Yoshinaga, Atsuhiro Iwakami, Tomoyo Kurosawa, Hideyuki Kurata, Tomonori Ochikoshi, Masaaki Yuasa, Honoka Ikezuki, Tamaki Matsumoto, Stephanie Sheh, Kôji Masunari', 'episodes': '1.0', 'genre': 'Adventure, Fantasy', 'source': 'Original', 'Duration': '2 hr 16 min', 'name': 'Uchuu Show e Youkoso', 'tokens': 'shock kichi crop beam 