In [94]:
import pandas as pd
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm import tqdm
import numpy as np
import re
import pickle
#from utils.textpreprocessing import TextPreprocessor

In [3]:
encode_kwargs = {"normalize_embeddings": True}
embedding_function = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs={"device": "cpu"},
    encode_kwargs=encode_kwargs,
)

In [152]:
encode_kwargs = {"normalize_embeddings": True}
embedding_function2 = HuggingFaceEmbeddings(
    model_name='colbert-ir/colbertv2.0',
    model_kwargs={"device": "cpu"},
    encode_kwargs=encode_kwargs,
)

No sentence-transformers model found with name colbert-ir/colbertv2.0. Creating a new one with MEAN pooling.


config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/405 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [82]:
df = pd.read_json('/Users/justinvhuang/Desktop/CSE-6242-Group-Project/fin_anime_dfv2.json')

In [83]:
def compare_and_replace(row):
    if len(row['anime_Synopsis']) > len(row['text']):
        row['text'] = row['anime_Synopsis']
    return row

In [84]:
df = df.apply(compare_and_replace, axis=1)

In [85]:
# Convert "UNKNOWN" to NaN
df['anime_Score'] = df['anime_Score'].replace("UNKNOWN", np.nan)

# Convert column to numeric
df['anime_Score'] = pd.to_numeric(df['anime_Score'], errors='coerce')

# Replace NaN with mean
mean_score = df['anime_Score'].mean()
df['anime_Score'] = df['anime_Score'].fillna(mean_score)

In [86]:
df['anime_Rating'] = df['anime_Rating'].astype(str)
df['Members'] = df['Members'].astype(int)

In [87]:
# Convert "UNKNOWN" to NaN
df['Episodes'] = df['Episodes'].replace("UNKNOWN", np.nan)

# Fill NaN with mode
mode_value = df['Episodes'].mode()[0]  # Get the first mode value
df['Episodes'].fillna(mode_value, inplace=True)

# Convert to integer
df['Episodes'] = df['Episodes'].apply(lambda x: int(float(x)))

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Episodes'].fillna(mode_value, inplace=True)


In [92]:
# Define the regex pattern
pattern = r'[^\w\s\d]'
# Apply regex pattern to the 'Text' column
df['text'] = df['text'].str.lower().str.replace(pattern, '')

In [132]:
# Load data from pickle file
with open('/Users/justinvhuang/Desktop/CSE-6242-Group-Project/app/popular_dict_10.pkl', 'rb') as f:
    pop_recs = pickle.load(f)

In [105]:
# Load data from pickle file
with open('/Users/justinvhuang/Desktop/CSE-6242-Group-Project/app/anime_recommendations_item_knn_CF_10k_fin.pkl', 'rb') as f:
    cf_recs = pickle.load(f)

In [124]:
# Map dictionary values to the DataFrame based on 'Key' column
df['cf_recs'] = df['anime_id'].map(cf_recs)

In [125]:
# Function to join lists to strings
def join_lists(x):
    if isinstance(x, list):
        return ", ".join(x)
    else:
        return x

# Apply the function to the 'cf_recs' column
df['cf_recs'] = df['cf_recs'].apply(join_lists)

In [137]:
pop_recs = pop_df['Name'].tolist()

In [140]:
df['pop_recs'] = [pop_recs] * len(df)

In [144]:
df['pop_recs'] = df['pop_recs'].apply(lambda x: ", ".join(x))

In [146]:
anime_docs = [Document(page_content=row['text'].lower(), 
                       metadata={
                           "anime_id": row['anime_id'],
                           "cast": row['imdb_name_basics_primaryName'],
                           'episodes': row['Episodes'],
                           'genre': row['Genres'],
                           'source': row['Source'],
                           'Duration': row['Duration'],
                           "name": row['Name'],
                           'tokens': row['tokens'],
                           'score': row['anime_Score'],
                           'producer': row['Producers'],
                           'studio': row['Studios'],
                           'licensors': row['Licensors'],
                           'cf_recs': row['cf_recs'],
                           'pop_recs': row['pop_recs'],
                           'rating': row['anime_Rating'],
                           'air_date': row['Aired']
                       },
                       
                       )
                       for _, row in tqdm(df.iterrows())
                    ]

8495it [00:00, 14696.26it/s]


In [147]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=200)
split_anime_docs = text_splitter.split_documents(anime_docs)

In [150]:
db1 = FAISS.from_documents(anime_docs,
                          embedding_function)

In [151]:
db2 = FAISS.from_documents(split_anime_docs,
                          embedding_function)

In [154]:
# db3 = FAISS.from_documents(anime_docs,
#                           embedding_function2)

In [156]:
retriever = db1.as_retriever(
    search_type="mmr", search_kwargs={"k": 10, "score_threshold": 0.5, "lambda_mult": 0}
)

In [157]:
query = "I like comedic isekai"
docs = retriever.invoke(
    query,
)

docs

[Document(page_content="an adventure in another world with cute girls by your side and video game-like powers—sounds like an anime fan's dream, right? not so for melancholic author osamu dazai, who would quite literally prefer to drop dead. video games haven't even been invented yet when he gets yanked into another world in 1948. really, all the fantastical adventure he keeps running into is just getting in the way of his poetic dream of finding the perfect place to die. but no matter how much he risks his hide, everything seems to keep turning out okay. follow a miserable hero like no other in this cheerfully bleak isekai comedy!\n\n(source: seven seas entertainment)", metadata={'anime_id': 52367, 'cast': 'UNKNOWN', 'episodes': 1, 'genre': 'Adventure, Comedy, Fantasy', 'source': 'Web manga', 'Duration': 'Unknown', 'name': 'Isekai Shikkaku', 'tokens': "['adventure', 'world', 'cute', 'girl', 'video', 'game', 'like', 'power', 'sound', 'like', 'anime', 'fan', 'dream', 'right', 'melancholi

In [155]:
db1.save_local("faiss_anime_index_v3")
db2.save_local("faiss_anime_index_v4")
#db3.save_local("faiss_anime_index_colbert")

In [45]:
new_db = FAISS.load_local("faiss_anime_index_v2", embedding_function,allow_dangerous_deserialization=True)

In [48]:
retriever2 = new_db.as_retriever(
    search_type="mmr", search_kwargs={"k": 20, "score_threshold": 0.5, "lambda_mult": 0}
)

In [49]:
query = "I am a space cowboy"

docs = retriever2.invoke(
    query,
)

docs

[Document(page_content="welcome to the space show (japanese: 宇宙ショーへようこそ, hepburn: uchū shō e yōkoso) is a 2010 japanese animated science fiction film produced by a-1 pictures and distributed by aniplex. it was directed by koji masunari from a script by hideyuki kurata, and stars the voices of tomoyo kurosawa, honoka ikezuki, shōtarō uzawa, tamaki matsumoto, takuto yoshinaga, and keiji fujiwara. in the film, five children rescue an alien dog, and are rewarded a trip to the moon in response. after events cause the group to become stranded in space, they must find a way to return to earth before the kids' parents arrive.", metadata={'anime_id': 8115, 'cast': 'Takuto Yoshinaga, Atsuhiro Iwakami, Tomoyo Kurosawa, Hideyuki Kurata, Tomonori Ochikoshi, Masaaki Yuasa, Honoka Ikezuki, Tamaki Matsumoto, Stephanie Sheh, Kôji Masunari', 'episodes': '1.0', 'genre': 'Adventure, Fantasy', 'source': 'Original', 'Duration': '2 hr 16 min', 'name': 'Uchuu Show e Youkoso', 'tokens': 'shock kichi crop beam 