In [7]:
import sys 
# Add the project directory to the module search path so later cells can import from it.
# NOTE(review): this is a machine-specific absolute path; the cell will not work on another
# checkout without editing it — confirm before sharing the notebook.
sys.path.append('/Users/justinvhuang/Desktop/CSE-6242-Group-Project')

In [47]:
import pandas as pd
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import pickle 

In [38]:
def create_retriever(faiss_db: str, query_tokens=None):
    """
    Creates a retriever using a FAISS index.

    Args:
        faiss_db (str): Location of the saved FAISS index, passed straight to
            ``FAISS.load_local``.
        query_tokens (list, optional): Tokens the metadata filter matches
            against. When left as None the filter falls back to the
            notebook-level ``query_token`` variable, looked up each time the
            filter runs, so that variable must be defined before the
            retriever is queried.

    Returns:
        retriever: A retriever over the loaded index, configured with k=50
            and the ``filter_tokens`` metadata filter.
    """
    encode_kwargs = {"normalize_embeddings": True}

    # Initialize Hugging Face embeddings
    embedding_function = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
        model_kwargs={"device": "cpu"},
        encode_kwargs=encode_kwargs,
    )

    # Load FAISS index
    # NOTE(review): newer langchain_community releases reportedly require
    # allow_dangerous_deserialization=True for load_local — confirm against
    # the installed version before adding it.
    db_faiss = FAISS.load_local(faiss_db, embeddings=embedding_function)

    def filter_tokens(metadata: dict) -> bool:
        """
        Filter function to apply on retrieved documents based on metadata.

        A document passes when any query token appears in its "tokens",
        "studio", "producer", "licensors" or "genre" metadata, or when its
        "score" metadata is above 5.0.

        Args:
            metadata (dict): Metadata of the document.

        Returns:
            bool: True if the document passes the filter, False otherwise.
        """
        # Resolve the tokens at call time: an explicit argument wins, otherwise
        # use whatever the notebook-level `query_token` currently holds.
        tokens = query_tokens if query_tokens is not None else query_token

        def mentions(field: str) -> bool:
            # True when at least one query token occurs in this metadata field.
            values = metadata.get(field, [])
            return any(token in values for token in tokens)

        # Evaluation order is kept: tokens first, then score, then the rest.
        return (
            mentions("tokens")
            or metadata.get("score", 0.0) > 5.0
            or mentions("studio")
            or mentions("producer")
            or mentions("licensors")
            or mentions("genre")
        )

    # Create retriever object
    # NOTE(review): when a filter is supplied, the FAISS store may only examine
    # its default number of candidates (fetch_k) before filtering, which would
    # cap results below k=50 — confirm and consider passing "fetch_k" as well.
    retriever = db_faiss.as_retriever(search_kwargs={"k": 50, "filter": filter_tokens})
    return retriever

In [39]:
# Build the retriever from the saved FAISS index.
retriever = create_retriever(
    faiss_db="/Users/justinvhuang/Desktop/CSE-6242-Group-Project/app/faiss_anime_index_v3",
)

In [40]:
# Free-text question; used for the tokenisation below and for the retriever call in the next cell.
query = 'what are some good anime that is hand drawn'
# NOTE(review): `textprepo` is not imported in any cell visible here — confirm it is
# imported before this cell runs on a fresh kernel.
# `query_token` is read as a global by the filter defined inside create_retriever.
query_token = textprepo.preprocess_text(query)

In [41]:
# Run the retriever, then record each hit's position under its anime_id.
# If an anime_id occurs more than once, the later position overwrites the earlier one.
results = retriever.get_relevant_documents(query)
indexes = {
    doc.metadata['anime_id']: position
    for position, doc in enumerate(results)
}

In [111]:
def load_data(json_file_path, cf_pickle_path, pop_pickle_path):
    """
    Reads the JSON table and the two pickled recommendation objects.

    Args:
        json_file_path: Path of the JSON file read with ``pd.read_json``.
        cf_pickle_path: Path of the pickled collaborative-filtering recommendations.
        pop_pickle_path: Path of the pickled popular recommendations.

    Returns:
        tuple: ``(df, cf_recs, pop_recs)`` in that order.
    """
    def _unpickle(path):
        # NOTE(review): pickle.load runs arbitrary code from the file; only use trusted files.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    anime_frame = pd.read_json(json_file_path)
    cf_payload = _unpickle(cf_pickle_path)
    pop_payload = _unpickle(pop_pickle_path)

    return anime_frame, cf_payload, pop_payload

In [112]:
# Locations of the JSON table and the two pickled recommendation files.
json_file_path = "/Users/justinvhuang/Desktop/CSE-6242-Group-Project/app/fin_anime_dfv2.json"
cf_pickle_path = "/Users/justinvhuang/Desktop/CSE-6242-Group-Project/app/anime_recommendations_item_knn_CF_10k_num_fin.pkl"
pop_pickle_path = "/Users/justinvhuang/Desktop/CSE-6242-Group-Project/app/popular_dict_10.pkl"

# Load everything in one call; arguments are named so their order cannot be mixed up.
df, cf_recs, pop_recs = load_data(
    json_file_path=json_file_path,
    cf_pickle_path=cf_pickle_path,
    pop_pickle_path=pop_pickle_path,
)

In [122]:
def process_recommendations(pop_recs, df, indexes, cf_lookup=None):
    """
    Builds the popular descriptions, the collaborative-filtering ids and the
    vector-search ids used by the later cells.

    Args:
        pop_recs: Iterable of records, each indexable by 'anime_id'.
        df (pd.DataFrame): Frame with 'anime_id' and 'text' columns. An
            'anime_values' column is added (or overwritten) in place.
        indexes (dict): Mapping whose keys are the anime ids of the retrieved
            documents.
        cf_lookup (dict, optional): Mapping from anime id to its list of
            recommended ids. When None, the notebook-level ``cf_recs``
            variable is used, which must then already be defined.

    Returns:
        tuple: ``(popular_anime_descriptions, joined_list, vd_recs)`` where
            popular_anime_descriptions holds up to five 'text' values of
            popular titles, joined_list is the flattened list of non-None
            recommended ids for the retrieved titles, and vd_recs is the list
            of retrieved anime ids in the key order of ``indexes``.
    """
    if cf_lookup is None:
        # Backward-compatible fallback to the notebook-level global.
        cf_lookup = cf_recs

    # Attach each title's recommended ids (empty list when it has none).
    df['anime_values'] = df['anime_id'].apply(lambda anime_id: cf_lookup.get(anime_id, []))

    popular_ids = [record['anime_id'] for record in pop_recs]
    popular_anime_descriptions = (
        df.loc[df['anime_id'].isin(popular_ids), 'text'].head(5).tolist()
    )

    # Dict keys are already unique; list() keeps their order, unlike a set round-trip.
    vd_recs = list(indexes)

    cf_lists = df.loc[df['anime_id'].isin(vd_recs), 'anime_values']
    joined_list = [
        item
        for sublist in cf_lists
        if sublist is not None
        for item in sublist
        if item is not None
    ]

    return popular_anime_descriptions, joined_list, vd_recs

In [123]:
# Derive the popular descriptions, collaborative-filtering ids and vector-search ids.
popular_anime_descriptions, joined_list, vd_recs = process_recommendations(
    pop_recs=pop_recs,
    df=df,
    indexes=indexes,
)

In [118]:
def get_top3_posters_and_names(df, indexes):
    """
    Returns the posters and names of the first three anime ids in ``indexes``.

    Args:
        df (pd.DataFrame): Frame with 'anime_id', 'image_y' and 'Name' columns.
        indexes (dict): Mapping whose keys are anime ids; the first three keys
            are treated as the top three.

    Returns:
        tuple: ``(top3_posters, top3_names)``, two parallel lists ordered like
            the first three keys of ``indexes`` (ids missing from ``df`` are
            skipped).
    """
    top3_anime_ids = list(indexes)[:3]
    rank_of = {anime_id: rank for rank, anime_id in enumerate(top3_anime_ids)}

    # Filter once so posters and names always come from the same rows.
    top3_rows = df[df['anime_id'].isin(top3_anime_ids)]

    # isin keeps DataFrame order; reorder the rows by their rank in `indexes`.
    ordering = top3_rows['anime_id'].map(rank_of).to_numpy().argsort(kind='stable')
    top3_rows = top3_rows.iloc[ordering]

    top3_posters = top3_rows['image_y'].tolist()
    top3_names = top3_rows['Name'].tolist()
    return top3_posters, top3_names

In [119]:
# Posters and names for the first three retrieved titles.
top3_posters, top3_names = get_top3_posters_and_names(df=df, indexes=indexes)

In [120]:
def get_recommendations_descriptions(df, joined_list, pop_recs, vd_recs):
    """
    Selects the recommended rows of ``df`` and their synopses.

    Args:
        df (pd.DataFrame): Frame with 'anime_id' and 'anime_Synopsis' columns.
        joined_list: Anime ids from the collaborative-filtering step.
        pop_recs: Popular recommendations, either plain anime ids or dict
            records carrying an 'anime_id' key; records are reduced to their
            id so they can be matched against the 'anime_id' column.
        vd_recs: Anime ids from the vector search.

    Returns:
        tuple: ``(recs, recs2, descriptions)`` where recs holds the rows
            matching any of the three sources, recs2 the rows matching only
            joined_list and vd_recs, and descriptions the 'anime_Synopsis'
            values of recs.
    """
    # Normalise popular entries to bare ids; plain ids pass through unchanged.
    popular_ids = [
        entry['anime_id'] if isinstance(entry, dict) else entry
        for entry in pop_recs
    ]
    personalised_ids = list(joined_list) + list(vd_recs)

    recs = df[df['anime_id'].isin(personalised_ids + popular_ids)]
    recs2 = df[df['anime_id'].isin(personalised_ids)]
    descriptions = recs['anime_Synopsis'].tolist()
    return recs, recs2, descriptions

In [121]:
# Collect the recommended rows (with and without popular titles) and their synopses.
recs, recs2, descriptions = get_recommendations_descriptions(
    df=df,
    joined_list=joined_list,
    pop_recs=pop_recs,
    vd_recs=vd_recs,
)
