In [1]:
import sys

# Extend the module search path with the project checkout so that project-local
# imports made later in the notebook (presumably `utils.textpreprocessing`) resolve.
sys.path += ['/Users/justinvhuang/Desktop/CSE-6242-Group-Project']

In [84]:
import guidance

from guidance import gen, models
from tqdm import tqdm

from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores import Chroma
from tqdm import tqdm
from utils.textpreprocessing import TextPreprocessor

In [3]:
# Project-local text preprocessor; its `preprocess_text` method is used further down
# to turn each query string into the `query_token` value read by the metadata filter.
textprepo = TextPreprocessor()

In [4]:
# Sentence-transformer embeddings used by both vector stores below.
# The model runs on CPU and is asked for normalized embedding vectors.
encode_kwargs = dict(normalize_embeddings=True)
embedding_function = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs=dict(device="cpu"),
    encode_kwargs=encode_kwargs,
)

In [97]:
# Both persisted vector stores live under the same project directory.
VECTOR_DB_DIR = "/Users/justinvhuang/Desktop/CSE-6242-Group-Project/vector_database_creation"

# The Chroma path used to start with "./Users/...", i.e. it was resolved relative to
# the current working directory instead of pointing at the same absolute location as
# the FAISS index, so the persisted collection would not be found there.
db_chroma = Chroma(
    persist_directory=f"{VECTOR_DB_DIR}/chroma_db",
    embedding_function=embedding_function,
)
db_faiss = FAISS.load_local(
    f"{VECTOR_DB_DIR}/faiss_anime_index_v3",
    embeddings=embedding_function,
)


In [98]:
# Example query for the filtered similarity search below. `query_token` is a
# module-level name that `filter_tokens` reads at call time, so it has to be
# recomputed whenever `query` changes.
query = "i like space adventures"
query_token = textprepo.preprocess_text(query)

def filter_tokens(metadata):
    """Decide whether a document passes the hybrid-search metadata filter.

    A document is kept when at least one token of the current query (the
    module-level ``query_token``) is one of the document's stored tokens, or
    when the document's ``score`` is greater than 5.0.

    Args:
        metadata: Metadata dict of a candidate document. ``tokens`` may be a
            real sequence or the string repr of a list (for example
            ``"['space', 'ghost']"``), which is how the indexed documents shown
            in this notebook store it. ``score`` may be missing or non-numeric.

    Returns:
        bool: True if the document should be kept, False otherwise.
    """
    import ast  # function-local so the cell does not depend on another import cell

    stored_tokens = metadata.get("tokens") or []
    if isinstance(stored_tokens, str):
        # Parse the stringified list so that matching happens on whole tokens.
        # A plain `in` on the raw string is a substring test ("art" would match
        # a document whose only token is "party").
        try:
            parsed = ast.literal_eval(stored_tokens)
        except (ValueError, SyntaxError):
            parsed = None
        if isinstance(parsed, (list, tuple, set, frozenset)):
            stored_tokens = parsed
        else:
            # Not a list literal: fall back to whitespace-separated tokens.
            stored_tokens = stored_tokens.split()

    if any(token in stored_tokens for token in query_token):
        return True

    # A missing or non-numeric score simply fails the score criterion instead of
    # raising in the middle of a vector-store search.
    try:
        return float(metadata.get("score")) > 5.0
    except (TypeError, ValueError):
        return False


# Fetch up to 50 nearest neighbours from the FAISS index, keeping only documents
# accepted by the `filter_tokens` metadata callback, and display them.
results = db_faiss.similarity_search(
    query,
    k=50,
    filter=filter_tokens,
)
results

[Document(page_content='space ghost is a fictional superhero created by hanna-barbera productions in the 1960s for tv network cbs. he was designed by alex toth.in his original incarnation, he was a superhero who, with his teen sidekicks, jan and jace, and blip the monkey, fought supervillains in outer space. in the 1990s, space ghost was brought back as a host for his own fictional late-night talk show, space ghost coast to coast, on cartoon network, adult swim and gametap. in the 2000s, he was revamped as a serious superhero once again in a mini-series by dc comics.\n\n\n== television series ==\n\n\n', metadata={'anime_id': 40159, 'cast': 'UNKNOWN', 'episodes': 1, 'genre': 'Avant Garde', 'source': 'Original', 'Duration': '1 min', 'name': 'The Ghost of Cartoon', 'tokens': "['space', 'ghost', 'fictional', 'superhero', 'create', 'hanna', 'barbera', 'production', '1960', 'tv', 'network', 'cbs', 'design', 'alex', 'toth.in', 'original', 'incarnation', 'superhero', 'teen', 'sidekick', 'jan',

In [14]:
import yaml

# The API key is kept in the app's YAML config file rather than in the notebook.
with open(
    "/Users/justinvhuang/Desktop/CSE-6242-Group-Project/app/config.yaml",
    "r",
) as file:
    config = yaml.safe_load(file)

# Fails with KeyError if the config has no "api_key" entry.
api_key = config["api_key"]

In [15]:
import google.generativeai as genai

# Route the key loaded from the config through a small `userdata` mapping and
# register it with the Google Generative AI client.
userdata = dict(GOOGLE_API_KEY=api_key)
GOOGLE_API_KEY = userdata.get("GOOGLE_API_KEY")
genai.configure(api_key=GOOGLE_API_KEY)

In [79]:
# Build the retriever on the FAISS store explicitly. The bare name `db` is not defined
# anywhere in this notebook (only `db_chroma` and `db_faiss` are), so the old line only
# worked with leftover kernel state. FAISS is the store that is already searched with the
# `filter_tokens` callable earlier in the notebook.
retriever = db_faiss.as_retriever(search_kwargs={"k": 50, "filter": filter_tokens})

In [19]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import ChatGoogleGenerativeAI

In [18]:
# Prompt: pull the "rlm/rag-prompt" template from the LangChain hub. The RAG chain
# defined later feeds it a `context` and a `question` value.
prompt = hub.pull("rlm/rag-prompt")

In [20]:
# Gemini chat model used for answer generation; temperature is set to 0.
llm = ChatGoogleGenerativeAI(
    model="gemini-pro",
    temperature=0,
    google_api_key=GOOGLE_API_KEY,
)

In [68]:
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever

In [81]:
# Metadata schema handed to the self-query retriever. Each entry names one metadata
# field, describes it for the LLM and states its type.
# Changes from the previous version: the duplicated "producer" entry is declared once,
# type labels are spelled consistently ("integer" everywhere instead of a mix of "int"
# and "integer"), and the "year" description no longer talks about a movie.
metadata_field_info = [
    AttributeInfo(
        name="anime_id",
        description="The anime identifier",
        type="integer",
    ),
    AttributeInfo(
        name="cast",
        description="The actors that are in the anime",
        type="string",
    ),
    AttributeInfo(
        name="episodes",
        description="The number of episodes in the anime",
        type="integer",
    ),
    AttributeInfo(
        name="source",
        description="Where the anime came from for example a manga or light novel",
        type="string",
    ),
    AttributeInfo(
        name="Duration",
        description="How long the anime is in minutes",
        type="string",
    ),
    AttributeInfo(
        name="name",
        description="The name of the anime",
        type="string",
    ),
    AttributeInfo(
        name="tokens",
        description="The token of the embedded text to do hybrid search",
        type="string",
    ),
    AttributeInfo(
        name="year",
        description="The year the anime was released",
        type="integer",
    ),
    AttributeInfo(
        name="score",
        description="The score of the rating of the anime",
        type="float",
    ),
    AttributeInfo(
        name="producer",
        description="Producer of the anime",
        type="string",
    ),
    AttributeInfo(
        name="studio",
        description="Which studio produced the anime",
        type="string",
    ),
    AttributeInfo(
        name="licensors",
        description="Which tv licensor is running the anime online for streaming",
        type="string",
    ),
    AttributeInfo(
        name="cf_recs",
        description="A list of strings of anime that is recommended based on collaborative filtering",
        type="string",
    ),
    AttributeInfo(
        name="pop_recs",
        description="Popular recommended animes from this dataset",
        type="string",
    ),
    AttributeInfo(
        name="rating",
        description="The television rating for the anime that rates it for age groups",
        type="string",
    ),
    AttributeInfo(
        name="air_date",
        description="When the anime first was on tv/aired and when did it end",
        type="string",
    ),
]

# Short description of what the documents' page content contains.
document_content_description = "Description of animes"

In [92]:
# Self-querying retriever: the LLM turns a natural-language question into a semantic
# query plus a structured metadata filter, using the field schema declared above.
# The bare name `db` is not defined anywhere in this notebook, so a concrete store is
# named here instead.
# NOTE(review): `db_chroma` is chosen on the assumption that Chroma is the store the
# self-query retriever has a built-in filter translator for — confirm against the
# installed LangChain version before relying on it.
self_retriever = SelfQueryRetriever.from_llm(
    llm,
    db_chroma,
    document_content_description,
    metadata_field_info,
)

In [75]:
def format_docs(docs):
    """Join the page content of retrieved documents into one context string.

    Defined here because the chain below references `format_docs` and no definition
    exists elsewhere in the visible notebook.

    Args:
        docs: Iterable of retrieved documents exposing a ``page_content`` attribute.

    Returns:
        str: The documents' page contents separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Chain: retrieve documents for the question, flatten them into a context string,
# fill the RAG prompt, call the LLM and return its reply as plain text.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [77]:
# Ask the RAG chain a question. `query_token` is refreshed first because the
# retriever's `filter_tokens` callback reads that global when deciding which
# documents to keep.
query = "What are some good treasure seeking anime?"
query_token = textprepo.preprocess_text(query)
rag_chain.invoke(query)

"I'm sorry, but I cannot answer your question based on the provided context. The provided text does not mention any treasure seeking anime."

In [78]:
# Inspect the raw retrieval result for a short query.
# NOTE(review): `query_token` is not recomputed for "hi" here, so `filter_tokens`
# still matches against the tokens of whichever query was preprocessed last —
# confirm this is intended.
retriever.get_relevant_documents("hi")

[Document(page_content='hiyokoi (japanese: ひよ恋) is a shōjo manga series written and illustrated by moe yukimaru. it is serialized in shueisha\'s monthly shōjo manga magazine ribon, and had been published into fourteen tankōbon volumes. the name of the series is a pun between "hiyoko" (ひよこ, which means "chick") and "koi" (恋, which means "love").\na twenty-two-minute-long movie based on the manga and produced by production i.g. was released in the ribon summer festival on july 30, 2010. the movie was later released on a dvd along with the yumeiro patissiere ova.\n\n\n\nhiyori nishiyama is a 15-year-old girl who is extremely short. after being in an accident that caused her bones to stop growing, she starts going to high school as a freshman, along with her best friend ritsuka. in her class there is a boy whose name is yuushin hirose and becomes hiyori\'s friend (despite the fact that hiyori is not very happy with his height or his constant happy attitude). after an accident where her pan