In [1]:
import sys 
sys.path.append('/Users/justinvhuang/Desktop/CSE-6242-Group-Project')

In [2]:
import pandas as pd
import plotly.express as px
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from utils.textpreprocessing import TextPreprocessor
from umap import UMAP
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
import plotly.graph_objects as go

In [3]:
textprepo = TextPreprocessor()
encode_kwargs = {"normalize_embeddings": True}
embedding_function = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs={"device": "cpu"},
    encode_kwargs=encode_kwargs,
)

In [4]:
new_db = FAISS.load_local("/Users/justinvhuang/Desktop/CSE-6242-Group-Project/vector_database_creation/faiss_anime_index_v2", embedding_function)
df = pd.read_json("/Users/justinvhuang/Desktop/CSE-6242-Group-Project/fin_anime_dfv1.json")

In [5]:
query = "I like pirate adventures"

In [30]:

query_token = textprepo.preprocess_text(query)

def filter_tokens(metadata):
    metadata_tokens = metadata.get("tokens", [])
    return any(token in metadata_tokens for token in query_token)


#results = new_db.similarity_search(query, filter=filter_fn, k=10)
results = new_db.similarity_search(query, filter= filter_tokens, k = 20)

In [31]:
results

[Document(page_content='several characters have been stated to be based on actual pirates and sailors such as: eustass kid (eustace the monk and william kidd), x. drake (sir francis drake), basil hawkins (basil ringrose and john hawkins), capone bege (al capone and william le sauvage), jewelry bonney (anne bonny), urouge (aruj and oruç reis), alvida (awilda), bartolomeo (bartholomew roberts), bellamy (samuel bellamy), blackbeard (edward teach), cavendish (thomas cavendish), big mom (charlotte badger), gold roger (olivier levasseur), lafitte (jean lafitte), roronoa zoro (françois l\'olonnais), silvers rayleigh (sir walter raleigh), thatch (edward thatch), yorki (calico jack), zeff and sanji ("red legs" greaves), trafalgar law (edward low), barbarossa (hayreddin barbarossa), and scratchmen apoo (chui a-poo).[ch. 508] the cross dresser emporio ivankov is based on dr. frank n. furter and norio imamura. norio had asked oda to draw more okama (homosexual) characters and became ivankov\'s fir

In [28]:
results[0].metadata

{'anime_id': 50385,
 'cast': 'UNKNOWN',
 'episodes': '9.0',
 'genre': 'Action, Adventure, Comedy, Drama',
 'source': 'Unknown',
 'Duration': '24 min per ep',
 'name': 'One Piece Characters Log',
 'tokens': "treasure awilda chop character roronoa fruit majority man = hayreddin chui helmeppo bege bonney story extensive dwarf zoro furter morgan technique usopp corrupt aruj series kid 508 piece settle revolutionary eat bellamy possess kidd apoo frank monkey big dressrosa gather thomas ringrose manga sea william ask sir hawkins final oruc actor.[ch storyline wield n. japanese editor use pursuit okama cyborg conception actual encounter thatch greave voice 570]oda originally wrestler \n succeed musician tell government diverse bonny log creation oda badger group \n\n\n naval arc mink cross chopper jimbei thief eustass l'olonnais monk olivi Shui agent anthropomorphic secret leg mermaid trafalgar ability inventor live nico giant drake state protagonist poo).[ch roger mom reis samuel franky myth

In [7]:
indexes = {x.metadata['anime_id']: index for index, x in enumerate(results)}
cf_list = list(df[df['anime_id'].isin(list(indexes.keys()))]['cf_recs'])
if cf_list is not None:
    joined_list = [item for sublist in cf_list if sublist is not None for item in sublist if item is not None]

pop_recs = list(df.head(1)['popular_recs'])[0]
vd_recs = list(indexes.keys())

In [16]:
recs = df[df['anime_id'].isin(joined_list + pop_recs + vd_recs)]
recs2 = df[df['anime_id'].isin(joined_list + vd_recs)]

top_anime_rating = recs2[recs2['anime_Score']!='UNKNOWN'].sort_values(by='anime_Score', ascending=False).head(10)
top_studios = recs2.sort_values(by='Favorites', ascending=False).head(5)
top_anime_rating['anime_Score'] = top_anime_rating['anime_Score'].astype(float)

In [17]:
# Reset bar plot to default state
fig_bar = px.bar(top_anime_rating, x='anime_Score', y='Name', color='Name',
                 title="Ratings of Popular Anime", orientation='h')
blue_palette = ['#aec7e8', '#7b9fcf', '#1f77b4', '#03539e', '#003f5c']
# Set colors for bars
fig_bar.update_traces(marker_color=blue_palette)
# Show the plot
fig_bar.show()

In [147]:
# Create vertical box plot for the filtered data
fig_box = px.box(df[df['Studios'].isin(set(list(top_studios['Studios'])))], 
                 x='Studios', 
                 y='Favorites', 
                 color='Studios', 
                 title="Favorites to Studios Distribution", 
                 orientation='v',
                 custom_data=['Name'])

# Get the indices of maximum values
max_indices = df.groupby('Studios')['Favorites'].idxmax()

# Add text labels for maximum values
fig_box.update_traces(
    hovertemplate="<b>Name:</b> %{customdata[0]}<br><b>Favorites:</b> %{y}",
    selector=dict(type='box')
)

# Show the plot
fig_box.show()

In [19]:
recs_umap = recs[['Studios', 'anime_Synopsis', 'Name', 'anime_id']]
# Initialize 'rec_label' column with empty strings
recs_umap['rec_label'] = ''

# Iterate through the lists and update the 'rec_label' column
for rec_type, lst in [('collab_filter', joined_list), ('vector_rec', vd_recs), ('pop_rec', pop_recs)]:
    recs_umap.loc[recs_umap['anime_id'].isin(lst), 'rec_label'] = rec_type
new_row = {'Studios': 'user query', 'anime_Synopsis': query, 'Name': 'user query', 'anime_id': 'none', 'rec_label': 'none'}
recs_umap = pd.concat([pd.DataFrame(new_row, index=[0]), recs_umap], ignore_index=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [25]:
# Load a pre-trained Sentence Transformer model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Encode the anime synopsis using the Sentence Transformer model
embeddings = model.encode(recs_umap['anime_Synopsis'].tolist())

# Apply UMAP for dimensionality reduction
umap_model = UMAP(n_components=2, n_neighbors=5, min_dist=0.05)
umap_result = umap_model.fit_transform(embeddings)

# Convert UMAP result to DataFrame
umap_df = pd.DataFrame(umap_result, columns=['UMAP_1', 'UMAP_2'])

# Add 'Studios' and 'Name' columns to the UMAP DataFrame
umap_df['Studios'] = recs_umap['Studios'].tolist()
umap_df['Name'] = recs_umap['Name'].tolist()
umap_df['rec_label'] = recs_umap['rec_label'].tolist()
umap_df['anime_id'] = recs_umap['anime_id'].tolist()

# Plot the UMAP with color by 'rec_label'
fig_umap = px.scatter(umap_df, x='UMAP_1', y='UMAP_2', color='rec_label', 
                      hover_data={'Studios': True, 'Name': True},
                      title='UMAP of Anime Recommendations from Collab Filter, Vector Database and Popular Recommendations')

# Modify the marker symbol for points labeled 'pop_rec' to be a star with yellow color and bigger size
fig_umap.for_each_trace(lambda t: t.update(marker=dict(symbol='star', size=12, color='yellow') if t.name == 'pop_rec' else {}))

# Add annotation for a specific point
x_coord = umap_df.loc[0, 'UMAP_1']
y_coord = umap_df.loc[0, 'UMAP_2']
fig_umap.add_annotation(x=x_coord, y=y_coord, text="X", showarrow=True, font=dict(color="purple", size=20))

# Find the three closest points to the marked point
nn_model = NearestNeighbors(n_neighbors=4, metric='euclidean')
nn_model.fit(umap_result)
distances, indices = nn_model.kneighbors([umap_result[0]])

# Collect anime IDs of the three closest points
closest_anime_ids = umap_df.loc[indices[0][1:], 'Name'].tolist()



# Update hover template to include only 'Name' and 'Studios'
fig_umap.update_traces(customdata=umap_df[['Studios', 'Name']],
                        hovertemplate="<b>%{customdata[1]}</b><br>" +
                                      "Studios: %{customdata[0]}<br>" +
                                      "<extra></extra>")
# Plot red X symbol on the closest points (excluding the marked point)
for i, idx in enumerate(indices[0][1:], start=1):
    target_x = umap_df.loc[idx, 'UMAP_1']
    target_y = umap_df.loc[idx, 'UMAP_2']
    fig_umap.add_trace(go.Scatter(x=[target_x], y=[target_y], mode='markers', showlegend=True,
                                  marker=dict(symbol='x', size=10, color='red'), name=f'rec {i}'))
# Show the plot
fig_umap.show()

In [246]:
closest_anime_ids

[50385, 21, 23393]