In [7]:
import os
import sys

# Put the project checkout on sys.path (presumably so local packages such as
# `utils` can be imported -- confirm against the repository layout).
# The default below is machine-specific; set CSE6242_PROJECT_ROOT to point at
# a different checkout without editing this cell.
PROJECT_ROOT = os.environ.get(
    'CSE6242_PROJECT_ROOT',
    '/Users/justinvhuang/Desktop/CSE-6242-Group-Project',
)

# Re-running the cell must not pile up duplicate sys.path entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [47]:
import pickle

import pandas as pd
from langchain_community.vectorstores import FAISS

try:
    # Preferred location: the same package the FAISS wrapper is imported from.
    from langchain_community.embeddings import HuggingFaceEmbeddings
except ImportError:
    # Fallback for installs that only expose the legacy import path.
    from langchain.embeddings import HuggingFaceEmbeddings

from utils.textpreprocessing import TextPreprocessor

# Single preprocessor instance reused by later cells to tokenize user queries.
textprepo = TextPreprocessor()

In [38]:
def create_retriever(faiss_db: str, query_tokens=None, k: int = 50, min_score: float = 5.0):
    """
    Creates a retriever using a FAISS index.

    Args:
        faiss_db (str): Path to the saved FAISS index, passed to
            ``FAISS.load_local``.
        query_tokens (iterable, optional): Tokens the metadata filter looks
            for. When omitted, the filter reads the module-level variable
            ``query_token`` each time it is invoked, which is how the
            notebook originally worked (the variable may be assigned after
            the retriever is created).
        k (int): Value of the ``k`` search kwarg handed to the retriever.
            Defaults to 50.
        min_score (float): A document passes the filter when its ``score``
            metadata is strictly greater than this value. Defaults to 5.0.

    Returns:
        retriever: The object returned by ``as_retriever`` on the loaded
        FAISS store, configured with ``k`` and the metadata filter.

    Raises:
        NameError: Raised by the filter (at retrieval time) when
            ``query_tokens`` was not given and no module-level
            ``query_token`` exists.
    """
    # Metadata fields that are searched for query tokens.
    token_fields = ("tokens", "studio", "producer", "licensors", "genre")

    # Snapshot explicit tokens once so a one-shot iterable is not exhausted
    # by the first filter call.
    explicit_tokens = None if query_tokens is None else tuple(query_tokens)

    # Initialize Hugging Face embeddings (CPU, normalized vectors)
    embedding_function = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
    )

    # Load FAISS index
    # NOTE(review): newer langchain_community releases may require
    # allow_dangerous_deserialization=True here -- confirm against the
    # installed version before adding it.
    db_faiss = FAISS.load_local(faiss_db, embeddings=embedding_function)

    def _active_tokens():
        """Return the tokens to match: explicit ones, else the global ``query_token``."""
        if explicit_tokens is not None:
            return explicit_tokens
        try:
            return globals()["query_token"]
        except KeyError:
            raise NameError(
                "name 'query_token' is not defined; pass query_tokens=... to "
                "create_retriever or assign query_token before retrieving"
            ) from None

    def _score_passes(raw_score) -> bool:
        """True when ``raw_score`` is numeric and strictly above ``min_score``."""
        try:
            return float(raw_score) > min_score
        except (TypeError, ValueError):
            # Missing or non-numeric scores never pass on score alone.
            return False

    def filter_tokens(metadata: dict) -> bool:
        """
        Filter function to apply on retrieved documents based on metadata.

        Args:
            metadata (dict): Metadata of the document.

        Returns:
            bool: True if any query token occurs in one of the token fields
            or the document's score exceeds ``min_score``; False otherwise.
        """
        tokens = _active_tokens()

        if _score_passes(metadata.get("score", 0.0)):
            return True

        for field in token_fields:
            # A field that is present but None is treated like a missing one.
            field_value = metadata.get(field) or ()
            if any(token in field_value for token in tokens):
                return True
        return False

    # Create retriever object
    retriever = db_faiss.as_retriever(search_kwargs={"k": k, "filter": filter_tokens})
    return retriever

In [39]:
# Build the retriever from the v3 anime FAISS index stored in the app directory.
faiss_index_path = "/Users/justinvhuang/Desktop/CSE-6242-Group-Project/app/faiss_anime_index_v3"
retriever = create_retriever(faiss_index_path)

In [40]:
# Free-text user question used for the vector search.
query = "what are some good anime that is hand drawn"

# Preprocessed form of the question; the retriever's metadata filter reads
# this module-level name when it runs.
query_token = textprepo.preprocess_text(query)

In [41]:
# Run the search. `invoke` is the Runnable entry point that supersedes the
# deprecated `get_relevant_documents`; it returns the matching documents.
results = retriever.invoke(query)

In [43]:
# Anime catalogue (JSON export) that later cells filter by `anime_id`.
anime_catalogue_path = "/Users/justinvhuang/Desktop/CSE-6242-Group-Project/app/fin_anime_dfv2.json"
df = pd.read_json(anime_catalogue_path)

In [76]:
def _load_pickle(path):
    """Open ``path`` in binary mode and return the unpickled object."""
    # SECURITY: pickle.load can execute arbitrary code -- only use it on
    # files produced by this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Per-anime recommendation lookup (presumably item-kNN collaborative
# filtering output, judging by the file name -- confirm with the producer).
cf_recs = _load_pickle('/Users/justinvhuang/Desktop/CSE-6242-Group-Project/app/anime_recommendations_item_knn_CF_10k_num_fin.pkl')

# Popular-title records; later cells read each record's 'anime_id'.
pop_recs = _load_pickle('/Users/justinvhuang/Desktop/CSE-6242-Group-Project/app/popular_dict_10.pkl')

In [63]:
# Lookup helper: anime id -> collaborative-filtering recommendations
def map_anime_ids(anime_id):
    """Return the ``cf_recs`` entry for ``anime_id``, or a new empty list if absent."""
    no_recommendations = []
    return cf_recs.get(anime_id, no_recommendations)

# Attach each row's recommendation list as a new 'anime_values' column by
# running the lookup helper element-wise over the ids.
df['anime_values'] = df['anime_id'].map(map_anime_ids)

In [78]:
# Collect the 'anime_id' of every popular-title record, in order.
popular_list = []
for popular_entry in pop_recs:
    popular_list.append(popular_entry['anime_id'])

In [83]:
# Show full cell contents instead of truncating long text columns.
pd.options.display.max_colwidth = None

In [88]:
# 'text' of the first five catalogue rows whose id is in the popular list.
is_popular = df['anime_id'].isin(popular_list)
popular_anime_descriptions = df.loc[is_popular, 'text'].head(5).tolist()

In [65]:
# Map each retrieved anime_id to its position in `results`. Keys keep
# first-seen order; a repeated id ends up with its last position.
indexes = {}
for position, document in enumerate(results):
    indexes[document.metadata['anime_id']] = position

In [67]:
# 'anime_values' entries for every catalogue row that the search returned.
retrieved_ids = list(indexes)
retrieved_values = df.loc[df['anime_id'].isin(retrieved_ids), 'anime_values']
cf_list = retrieved_values.tolist()

In [69]:
# Flatten the per-anime recommendation lists into one list, skipping missing
# sub-lists and missing items. `joined_list` is always bound (empty when
# there is nothing to flatten) so later cells never see a stale or undefined
# value.
joined_list = [
    item
    for sublist in (cf_list if cf_list is not None else [])
    if sublist is not None
    for item in sublist
    if item is not None
]

In [101]:
# First three distinct ids in retrieval order.
top3_ids = list(indexes)[:3]
rank_by_id = {anime_id: rank for rank, anime_id in enumerate(top3_ids)}

# Select the matching catalogue rows once, then order them by retrieval rank
# (a plain boolean mask would return them in catalogue order instead).
top3_rows = df[df['anime_id'].isin(top3_ids)]
rank_order = top3_rows['anime_id'].map(rank_by_id).to_numpy().argsort(kind='stable')
top3_rows = top3_rows.iloc[rank_order]

# Parallel lists: poster image and title for each top hit.
top3_posters = top3_rows['image_y'].tolist()
top3_name = top3_rows['Name'].tolist()

In [108]:
# Ids returned by the vector search. Dict keys are already unique, so no
# set() round-trip is needed, and retrieval order is preserved.
vd_recs = list(indexes)

In [109]:

# Candidate pool: collaborative-filtering ids + popular ids + vector-search ids.
# `popular_list` (the extracted ids) is used rather than `pop_recs`, whose
# elements are records indexed by 'anime_id' and cannot be matched by isin.
recs = df[df['anime_id'].isin(joined_list + popular_list + vd_recs)]

# Same pool without the popular titles.
recs2 = df[df['anime_id'].isin(joined_list + vd_recs)]

# Synopsis text of every row in the full candidate pool.
descriptions = recs['anime_Synopsis'].tolist()