In [3]:
import sys 
sys.path.append('/Users/justinvhuang/Desktop/CSE-6242-Group-Project')

In [4]:
import pandas as pd
import plotly.express as px
from app.utils.textpreprocessing import TextPreprocessor
from app.utils.data_manipulation import (create_retriever, 
                                         get_documents,
                                         load_data, 
                                         process_recommendations, 
                                         get_top3_posters_and_names, 
                                         get_recommendations_descriptions)
from umap import UMAP
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
import plotly.graph_objects as go
textprepo = TextPreprocessor()

In [5]:
sbert = "sentence-transformers/all-MiniLM-L6-v2"
vdb = "/Users/justinvhuang/Desktop/CSE-6242-Group-Project/app/faiss_anime_index_v3"
db_faiss = create_retriever(vdb, sbert)

In [6]:
def filter_tokens(metadata: dict) -> bool:
    """
    Filter function to apply on retrieved documents based on metadata.

    Args:
        metadata (dict): Metadata of the document.
        query_token (list): List of tokens to filter.

    Returns:
        bool: True if the document passes the filter, False otherwise.
    """
    metadata_tokens = metadata.get("tokens", [])
    metadata_studio = metadata.get("studio", [])
    metadata_producer = metadata.get("producer", [])
    metadata_licensors = metadata.get("licensors", [])
    metadata_genre = metadata.get("genre", [])

    return (
        any(token in metadata_tokens for token in query_token)
        or metadata.get("score", 0.0) > 5.0
        or any(token in metadata_studio for token in query_token)
        or any(token in metadata_producer for token in query_token)
        or any(token in metadata_licensors for token in query_token)
        or any(token in metadata_genre for token in query_token)
    )

In [8]:
retriever = db_faiss.as_retriever(search_kwargs={"k": 50, "filter": filter_tokens})

In [6]:
from functools import partial
filter_tokens_with_query_token = partial(filter_tokens, query_token=query_token)

In [9]:
query = 'I like pirate adventures'
query_token = textprepo.preprocess_text(query)
results = get_documents(retriever, query, query_token)

TypeError: filter_tokens() got an unexpected keyword argument 'query_token'

NameError: name 'query_token' is not defined

In [7]:

results = retriever.get_relevant_documents(query)
indexes = {x.metadata['anime_id']: index for index, x in enumerate(results)}

NameError: name 'query_token' is not defined

In [33]:


top_anime_rating = recs2[recs2['anime_Score']!='UNKNOWN'].sort_values(by='anime_Score', ascending=False).head(10)
top_studios = recs2.sort_values(by='Favorites', ascending=False).head(5)
top_anime_rating['anime_Score'] = top_anime_rating['anime_Score'].astype(float)

In [34]:
# Reset bar plot to default state
fig_bar = px.bar(top_anime_rating, x='anime_Score', y='Name', color='Name',
                 title="Ratings of Popular Anime", orientation='h')
blue_palette = ['#aec7e8', '#7b9fcf', '#1f77b4', '#03539e', '#003f5c']
# Set colors for bars
fig_bar.update_traces(marker_color=blue_palette)
# Show the plot
fig_bar.show()

In [35]:
# Create vertical box plot for the filtered data
fig_box = px.box(df[df['Studios'].isin(set(list(top_studios['Studios'])))], 
                 x='Studios', 
                 y='Favorites', 
                 color='Studios', 
                 title="Favorites to Studios Distribution", 
                 orientation='v',
                 custom_data=['Name'])

# Get the indices of maximum values
max_indices = df.groupby('Studios')['Favorites'].idxmax()

# Add text labels for maximum values
fig_box.update_traces(
    hovertemplate="<b>Name:</b> %{customdata[0]}<br><b>Favorites:</b> %{y}",
    selector=dict(type='box')
)

# Show the plot
fig_box.show()

In [36]:
recs_umap = recs[['Studios', 'anime_Synopsis', 'Name', 'anime_id']]
# Initialize 'rec_label' column with empty strings
recs_umap['rec_label'] = ''

# Iterate through the lists and update the 'rec_label' column
for rec_type, lst in [('collab_filter', joined_list), ('vector_rec', vd_recs), ('pop_rec', pop_recs)]:
    recs_umap.loc[recs_umap['anime_id'].isin(lst), 'rec_label'] = rec_type
new_row = {'Studios': 'user query', 'anime_Synopsis': query, 'Name': 'user query', 'anime_id': 'none', 'rec_label': 'none'}
recs_umap = pd.concat([pd.DataFrame(new_row, index=[0]), recs_umap], ignore_index=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [38]:
# Load a pre-trained Sentence Transformer model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Encode the anime synopsis using the Sentence Transformer model
embeddings = model.encode(recs_umap['anime_Synopsis'].tolist())

# Apply UMAP for dimensionality reduction
umap_model = UMAP(n_components=2, n_neighbors=5, min_dist=0.05)
umap_result = umap_model.fit_transform(embeddings)

# Convert UMAP result to DataFrame
umap_df = pd.DataFrame(umap_result, columns=['UMAP_1', 'UMAP_2'])

# Add 'Studios' and 'Name' columns to the UMAP DataFrame
umap_df['Studios'] = recs_umap['Studios'].tolist()
umap_df['Name'] = recs_umap['Name'].tolist()
umap_df['rec_label'] = recs_umap['rec_label'].tolist()
umap_df['anime_id'] = recs_umap['anime_id'].tolist()

# Plot the UMAP with color by 'rec_label'
fig_umap = px.scatter(umap_df, x='UMAP_1', y='UMAP_2', color='rec_label', 
                      hover_data={'Studios': True, 'Name': True},
                      title='UMAP of Anime Recommendations from Collab Filter, Vector Database and Popular Recommendations')

# Modify the marker symbol for points labeled 'pop_rec' to be a star with yellow color and bigger size
fig_umap.for_each_trace(lambda t: t.update(marker=dict(symbol='star', size=12, color='yellow') if t.name == 'pop_rec' else {}))

# Add annotation for a specific point
x_coord = umap_df.loc[0, 'UMAP_1']
y_coord = umap_df.loc[0, 'UMAP_2']
fig_umap.add_annotation(x=x_coord, y=y_coord, text="X", showarrow=True, font=dict(color="purple", size=20))

# Find the three closest points to the marked point
nn_model = NearestNeighbors(n_neighbors=4, metric='euclidean')
nn_model.fit(umap_result)
distances, indices = nn_model.kneighbors([umap_result[0]])

# Collect anime IDs of the three closest points
closest_anime_ids = umap_df.loc[indices[0][1:], 'Name'].tolist()

# Plot red X symbol on the closest points (excluding the marked point)
for i, idx in enumerate(indices[0][1:], start=1):
    target_x = umap_df.loc[idx, 'UMAP_1']
    target_y = umap_df.loc[idx, 'UMAP_2']
    fig_umap.add_trace(go.Scatter(x=[target_x], y=[target_y], mode='markers', showlegend=True,
                                  marker=dict(symbol='x', size=10, color='red'), name=f'rec {i}'))


# Update hover template to include only 'Name' and 'Studios'
fig_umap.update_traces(customdata=umap_df[['Studios', 'Name']],
                        hovertemplate="<b>%{customdata[1]}</b><br>" +
                                      "Studios: %{customdata[0]}<br>" +
                                      "<extra></extra>")

# Show the plot
fig_umap.show()

In [246]:
closest_anime_ids

[50385, 21, 23393]