In [9]:
import pandas as pd
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm import tqdm

In [3]:
# Embedding model shared by index construction and querying.
# `normalize_embeddings=True` is forwarded to the encoder so that it returns
# normalised vectors; the model itself is run on the CPU.
encode_kwargs = dict(normalize_embeddings=True)
embedding_function = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=dict(device="cpu"),
    model_name='tomaarsen/mpnet-base-nli-matryoshka',
)

In [5]:
# Read the anime dataset (JSON) into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; it has to be edited
# before the notebook can be re-run on any other machine.
anime_json_path = '/Users/justinvhuang/Desktop/CSE-6242-Group-Project/fin_anime_dfv1.json'
df = pd.read_json(anime_json_path)

In [21]:
# Wrap every DataFrame row in a LangChain `Document`: the lower-cased `text`
# column becomes the searchable page content, and a selection of the other
# columns is attached as metadata so it is available on every search hit.
#
# `df.iterrows()` is a generator with no length, so `total` is passed
# explicitly; without it tqdm can only print a running count (e.g. "11565it")
# instead of a progress bar with percentage and ETA.
anime_docs = [
    Document(
        page_content=row['text'].lower(),
        metadata={
            "anime_id": row['anime_anime_id'],
            "cast": row['imdb_name_basics_primaryName'],
            "synopsis": row['anime_Synopsis'],
            'img_url': row['Image URL'],
            'episodes': row['Episodes'],
            'genre': row['Genres'],
            'source': row['Source'],
            'Duration': row['Duration'],
            "name": row['Name'],
            'pop_recs': row['popular_recs'],
            'cf_recs': row['cf_recs'],
            'tokens': row['tokens'],
            'score': row['anime_Score'],
            'producer': row['Producers'],
            'studio': row['Studios'],
            'licensors': row['Licensors'],
        },
    )
    for _, row in tqdm(df.iterrows(), total=len(df))
]


[A
[A
[A
[A
[A
[A
[A
11565it [00:00, 14752.05it/s]


In [23]:
# Cut each document into overlapping windows before embedding:
# chunk_size=500 with chunk_overlap=200 between neighbouring chunks
# (sizes are measured by the splitter's length function; assumed to be the
# default character count, since none is passed here).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=500,
)
split_anime_docs = text_splitter.split_documents(anime_docs)

0it [02:07, ?it/s]


In [24]:
# Embed every chunk with `embedding_function` and build a FAISS vector store
# over the resulting vectors.
db = FAISS.from_documents(
    documents=split_anime_docs,
    embedding=embedding_function,
)

In [None]:
# Retriever using maximal-marginal-relevance ("mmr") search that returns
# k=5 documents per query, with lambda_mult=0.
# NOTE(review): `score_threshold` is normally paired with the
# "similarity_score_threshold" search type; it may have no effect under
# "mmr" — confirm against the installed LangChain version.
retriever = db.as_retriever(
    search_type="mmr",
    search_kwargs=dict(k=5, score_threshold=0.5, lambda_mult=0),
)

In [None]:
# Smoke-test the retriever with a free-text query and display the hits.
# NOTE(review): the query is about food while the index holds anime
# documents — looks like a leftover from a tutorial; confirm intent.
query = "What are some good sichuan foods"
docs = retriever.invoke(query)
docs

In [None]:
def filter_fn(metadata):
    """Metadata predicate for ``similarity_search(filter=...)``.

    Keeps a document when its ``"ingredients"`` text mentions "scallions"
    (case-insensitive) or when its ``"peanuts"`` field equals ``"Yes"``.

    Missing keys and non-string ``"ingredients"`` values are treated as
    "no match" instead of raising, so a single document without these
    fields cannot abort the whole search.

    NOTE(review): the metadata attached to the anime documents in this
    notebook has no ``"ingredients"`` or ``"peanuts"`` key, so against that
    index this predicate rejects every document. Adapt the keys to the anime
    schema (e.g. ``"genre"``, ``"source"``) if filtering is actually wanted.

    Args:
        metadata: The ``Document.metadata`` dict of a candidate hit.

    Returns:
        bool: ``True`` to keep the document, ``False`` to drop it.
    """
    ingredients = metadata.get("ingredients")
    mentions_scallions = isinstance(ingredients, str) and "scallions" in ingredients.lower()
    return mentions_scallions or metadata.get("peanuts") == "Yes"


# Similarity search restricted to documents accepted by `filter_fn`,
# returning up to k=5 hits; the bare name on the last line displays them.
results = db.similarity_search(
    "What are some good sichuan foods",
    k=5,
    filter=filter_fn,
)
results

In [None]:
# Persist the vector store to the "faiss_anime_index" folder (relative to the
# current working directory) so it can be reloaded without re-embedding.
db.save_local(folder_path="faiss_anime_index")

In [None]:
# Reload the persisted index, using the same embedding function that built it
# so query vectors live in the same space as the stored ones.
#
# SECURITY: the FAISS docstore is stored as a pickle, and unpickling can
# execute arbitrary code. Recent langchain_community releases therefore refuse
# to load it unless `allow_dangerous_deserialization=True` is passed. Only do
# this for an index you created yourself, such as the "faiss_anime_index"
# folder written by `db.save_local(...)` in this notebook; never for a file
# from an untrusted source.
new_db = FAISS.load_local(
    "faiss_anime_index",
    embedding_function,
    allow_dangerous_deserialization=True,
)