In [1]:
import sys 
sys.path.append('/Users/justinvhuang/Desktop/CSE-6242-Group-Project')

In [2]:
import pandas as pd
import plotly.express as px
from app.utils.textpreprocessing import TextPreprocessor
textprepo = TextPreprocessor()
from app.utils.data_manipulation import (create_retriever, 
                                         load_data, 
                                         process_recommendations, 
                                         get_top3_posters_and_names, 
                                         get_recommendations_descriptions)
from umap import UMAP
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
import plotly.graph_objects as go
import numpy as np
from scipy.spatial.distance import cdist

In [3]:
sbert = "sentence-transformers/all-MiniLM-L6-v2"
vdb = "/Users/justinvhuang/Desktop/CSE-6242-Group-Project/app/faiss_anime_index_v3"
db_faiss = create_retriever(vdb, sbert)

In [4]:
def filter_tokens(metadata: dict) -> bool:
    """
    Filter function to apply on retrieved documents based on metadata.

    Args:
        metadata (dict): Metadata of the document.
        query_token (list): List of tokens to filter.

    Returns:
        bool: True if the document passes the filter, False otherwise.
    """
    metadata_tokens = metadata.get("tokens", [])
    metadata_studio = metadata.get("studio", [])
    metadata_producer = metadata.get("producer", [])
    metadata_licensors = metadata.get("licensors", [])
    metadata_genre = metadata.get("genre", [])

    return (
        any(token in metadata_tokens for token in query_token)
        or metadata.get("score", 0.0) > 5.0
        or any(token in metadata_studio for token in query_token)
        or any(token in metadata_producer for token in query_token)
        or any(token in metadata_licensors for token in query_token)
        or any(token in metadata_genre for token in query_token)
    )

In [5]:
retriever = db_faiss.as_retriever(search_kwargs={"k": 50, "filter": filter_tokens})

In [6]:
query = 'i like space cowboy adventures'
query_token = textprepo.preprocess_text(query)
results = retriever.get_relevant_documents(query)

In [7]:

results = retriever.get_relevant_documents(query)
indexes = {x.metadata['anime_id']: index for index, x in enumerate(results)}

In [48]:
json_file_path = "/Users/justinvhuang/Desktop/CSE-6242-Group-Project/app/fin_anime_dfv3.json"
cf_pickle_path = "/Users/justinvhuang/Desktop/CSE-6242-Group-Project/app/anime_recommendations_item_knn_CF_10k_num_fin.pkl"
pop_pickle_path = "/Users/justinvhuang/Desktop/CSE-6242-Group-Project/app/popular_dict_10.pkl"

df, cf_recs, pop_recs = load_data(json_file_path, cf_pickle_path, pop_pickle_path)

In [50]:
pop_recs, popular_anime_descriptions, joined_list, vd_recs = process_recommendations(pop_recs, df, indexes, cf_recs)

In [52]:
pop_recs
cf_list = list(df[df['anime_id'].isin(pop_recs)]['anime_values'])

In [57]:
joined_list = [item for sublist in cf_list for item in sublist]

In [58]:
joined_list

[5,
 227,
 205,
 30,
 121,
 6,
 1535,
 889,
 1575,
 43,
 182,
 5,
 1,
 317,
 1361,
 54,
 617,
 43,
 48,
 164,
 136,
 121,
 1482,
 45,
 71,
 60,
 857,
 61,
 129,
 72,
 1698,
 2034,
 1689,
 5630,
 4477,
 877,
 240,
 6045,
 1222,
 853,
 245,
 136,
 1559,
 15,
 33,
 11061,
 627,
 5114,
 1535,
 1575,
 645,
 136,
 263,
 139,
 138,
 137,
 121,
 245,
 15,
 71,
 1,
 227,
 43,
 205,
 6,
 30,
 889,
 467,
 164,
 121,
 395,
 226,
 1818,
 1535,
 889,
 33,
 245,
 121,
 790,
 2025,
 433,
 1689,
 1004,
 2236,
 578,
 387,
 431,
 227,
 43,
 164,
 137,
 139,
 263,
 121,
 20,
 1535,
 269,
 1482,
 245,
 1575]

In [38]:
top3_posters, top3_names = get_top3_posters_and_names(df, indexes)

In [39]:
recs, recs2, descriptions = get_recommendations_descriptions(df, joined_list, pop_recs, vd_recs)

In [40]:
top_anime_rating = recs2[recs2['anime_Score']!='UNKNOWN'].sort_values(by='anime_Score', ascending=False).head(10)
top_studios = recs2.sort_values(by='Favorites', ascending=False).head(5)
top_anime_rating['anime_Score'] = top_anime_rating['anime_Score'].astype(float)

In [41]:
# Reset bar plot to default state
fig_bar = px.bar(top_anime_rating, x='anime_Score', y='Name', color='Name',
                 title="Ratings of Popular Anime", orientation='h')
blue_palette = ['#aec7e8', '#7b9fcf', '#1f77b4', '#03539e', '#003f5c']
# Set colors for bars
fig_bar.update_traces(marker_color=blue_palette)
# Show the plot
fig_bar.show()

In [42]:
# Create vertical box plot for the filtered data
fig_box = px.box(df[df['Studios'].isin(set(list(top_studios['Studios'])))], 
                 x='Studios', 
                 y='Favorites', 
                 color='Studios', 
                 title="Favorites to Studios Distribution", 
                 orientation='v',
                 custom_data=['Name'])

# Get the indices of maximum values
max_indices = df.groupby('Studios')['Favorites'].idxmax()

# Add text labels for maximum values
fig_box.update_traces(
    hovertemplate="<b>Name:</b> %{customdata[0]}<br><b>Favorites:</b> %{y}",
    selector=dict(type='box')
)

# Show the plot
fig_box.show()

In [47]:
df.columns

Index(['anime_Rating', 'anime_Score', 'anime_Synopsis', 'plot', 'Producers',
       'Licensors', 'Studios', 'Image URL', 'Episodes', 'Genres', 'Source',
       'Favorites', 'Aired', 'Members', 'Duration', 'text', 'anime_id',
       'tokens', 'Name', 'image_y', 'imdb_name_basics_primaryName', 'cf_recs',
       'pop_recs', 'anime_values'],
      dtype='object')

In [43]:
recs_umap = recs[['Studios', 'anime_Synopsis', 'Name', 'anime_id','Image URL', 'Producers', 'anime_Score', 'Source', 'Favorites', 'Members', 'Aired', 'imdb_name_basics_primaryName']]
# Initialize 'rec_label' column with empty strings
recs_umap['rec_label'] = ''

# Iterate through the lists and update the 'rec_label' column
for rec_type, lst in [('collab_filter', joined_list), ('vector_rec', vd_recs), ('pop_rec', pop_recs)]:
    recs_umap.loc[recs_umap['anime_id'].isin(lst), 'rec_label'] = rec_type
new_row = {'Studios': 'user query', 'anime_Synopsis': query, 'Name': 'user query', 'anime_id': 'none', 'rec_label': 'none', 'Image URL': 'none', 'Producers': 'none', 'anime_Score': 8.0, 'Source': 'none', 'Favorites':'none', 'Members':'none', ' Aired':'none', 'imdb_name_basics_primaryName': 'none'}
recs_umap = pd.concat([pd.DataFrame(new_row, index=[0]), recs_umap], ignore_index=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [27]:
# Load a pre-trained Sentence Transformer model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [45]:

# Encode the anime synopsis using the Sentence Transformer model
embeddings_synopsis = model.encode(recs_umap['anime_Synopsis'].tolist())
embedding_producers = model.encode(recs_umap['Producers'].tolist())
embedding_aired = model.encode(recs_umap['Aired'].tolist())
embedding_studios = model.encode(recs_umap['Studios'].tolist())
embedding_actors = model.encode(recs_umap['imdb_name_basics_primaryName'].tolist())
ratings_encoded = np.array(recs_umap['anime_Score']).reshape(-1, 1)
combined_features = np.concatenate([embeddings_synopsis,embedding_producers,embedding_aired,embedding_studios,embedding_actors,ratings_encoded], axis = 1)

In [46]:
# Apply UMAP for dimensionality reduction
umap_model = UMAP(n_components=2, n_neighbors=5, min_dist=0.05,  metric= 'euclidean',random_state=0)
umap_result = umap_model.fit_transform(combined_features)

# Convert UMAP result to DataFrame
umap_df = pd.DataFrame(umap_result, columns=['UMAP_1', 'UMAP_2'])

# Add 'Studios' and 'Name' columns to the UMAP DataFrame
umap_df['Studios'] = recs_umap['Studios'].tolist()
umap_df['Name'] = recs_umap['Name'].tolist()
umap_df['rec_label'] = recs_umap['rec_label'].tolist()
umap_df['anime_id'] = recs_umap['anime_id'].tolist()

# Plot the UMAP with color by 'rec_label'
fig_umap = px.scatter(umap_df, x='UMAP_1', y='UMAP_2', color='rec_label', 
                        hover_data={'Studios': True, 'Name': True},
                        title='UMAP of Anime Recommendations from Collab Filter, Vector Database and Popular Recommendations')

# Modify the marker symbol for points labeled 'pop_rec' to be a star with yellow color and bigger size
fig_umap.for_each_trace(lambda t: t.update(marker=dict(symbol='star', size=12, color='blue')) if t.name == 'pop_rec' else None)

# Add annotation for a specific point
x_coord = umap_df.loc[0, 'UMAP_1']
y_coord = umap_df.loc[0, 'UMAP_2']
fig_umap.add_annotation(x=x_coord, y=y_coord, text="X", showarrow=True, font=dict(color="purple", size=20))
# Calculate pairwise distances between row 0 and all other rows
distances = cdist(umap_df[['UMAP_1', 'UMAP_2']].iloc[[0]], umap_df[['UMAP_1', 'UMAP_2']], metric='euclidean')[0]
# Sort distances and get the indices of the three closest rows (excluding row 0 itself)
closest_indices = np.argsort(distances)[1:6]

# Extract the closest rows based on the indices
closest_rows = umap_df.iloc[closest_indices]

for i, (index, row) in enumerate(closest_rows.iterrows(), start=1):
    x_coord = row['UMAP_1']
    y_coord = row['UMAP_2']
    fig_umap.add_trace(go.Scatter(x=[x_coord], y=[y_coord], mode='markers', marker=dict(symbol='x', size=10, color='red'), name=f'rec {i}'))


# Remove x-axis and y-axis labels
fig_umap.update_layout(xaxis=dict(title_text=''), yaxis=dict(title_text=''))
# Remove x-axis and y-axis labels, ticks, and gridlines
fig_umap.update_layout(xaxis=dict(showticklabels=False, showgrid=False, zeroline=False),
                        yaxis=dict(showticklabels=False, showgrid=False, zeroline=False))
# Set light gray background with higher opacity
fig_umap.update_layout(plot_bgcolor='rgba(220, 220, 220, 0.1)')

# Show the plot
fig_umap.show()


n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [192]:
closest_rows

Unnamed: 0,UMAP_1,UMAP_2,Studios,Name,rec_label,anime_id
31,10.952653,7.537581,UNKNOWN,Robot King,collab_filter,16620
59,10.839399,7.498396,UNKNOWN,Space Gundam V,collab_filter,35137
33,10.826811,7.462918,UNKNOWN,Super Titans 15,collab_filter,16806
9,11.612801,7.984448,ACiD FiLM,Platonic Chain: Web,collab_filter,5492
85,10.596025,7.565387,Triangle Staff,Junkers Come Here Pilot Film,collab_filter,42228


In [84]:
closest_rows

Unnamed: 0,UMAP_1,UMAP_2,Studios,Name,rec_label,anime_id
16,17.835577,14.291047,UNKNOWN,The Ghost of Cartoon,vector_rec,40159
4,17.297911,14.594871,UNKNOWN,Mormorando,collab_filter,31834
1,17.195335,14.068709,ACiD FiLM,Platonic Chain: Web,collab_filter,5492


In [85]:

# # Find the three closest points to the marked point
# nn_model = NearestNeighbors(n_neighbors=4, metric='euclidean')
# nn_model.fit(umap_result)
# distances, indices = nn_model.kneighbors([umap_result[0]])

# # Collect anime Name of the three closest points
# closest_anime_names = umap_df.loc[indices[0][1:], 'Name'].tolist()

#     # Collect anime IDs of the three closest points
# closest_anime_ids = umap_df.loc[indices[0][1:], 'anime_id'].tolist()

# # Update hover template to include only 'Name' and 'Studios'
# fig_umap.update_traces(customdata=umap_df[['Studios', 'Name']],
#                         hovertemplate="<b>%{customdata[1]}</b><br>" +
#                                         "Studios: %{customdata[0]}<br>" +
#                                         "<extra></extra>")

# Plot red X symbol on the closest points (excluding the marked point)


IndexError: invalid index to scalar variable.