In [1]:
import os

# Run from the project root so the relative paths used below ('conf/app.yml',
# 'data/...') resolve. Set BAHA_ANIME_ROOT to point at the checkout on another
# machine; the default keeps the original location.
PROJECT_ROOT = os.environ.get(
    'BAHA_ANIME_ROOT',
    '/Users/edwardchiu/Desktop/projects/baha-anime-analysis',
)
os.chdir(PROJECT_ROOT)
os.getcwd()

'/Users/edwardchiu/Desktop/projects/baha-anime-analysis'

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import MDS
import matplotlib.pyplot as plt

In [None]:
import yaml
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

In [None]:
from transformers import BertTokenizer, BertModel
import torch

# The tokenizer and the encoder are loaded from the same pretrained checkpoint.
PRETRAINED_CHECKPOINT = "hfl/chinese-roberta-wwm-ext"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = BertModel.from_pretrained(PRETRAINED_CHECKPOINT)

In [None]:
# Parse the YAML application settings into `app_config`.
with open('conf/app.yml') as config_file:
    app_config = yaml.safe_load(config_file)

In [None]:
# The User-Agent string comes from the app config and is sent as a request header.
# (Disabled alternative: a random one via `UserAgent().random`.)
user_agent = app_config['website']['user_agent']
headers = {'User-Agent': user_agent}

# Page to fetch.
url = 'https://ani.gamer.com.tw/animeVideo.php?sn=40525'

In [None]:
# Fetch the page. `requests` applies no timeout by default, so without one a
# stalled connection would block this cell indefinitely.
req = requests.get(url, headers=headers, timeout=30)
req

In [None]:
soup = BeautifulSoup(req.text, 'html.parser')

In [None]:
soup.select_one('.ACG-box1listB > li > a')

In [None]:
intro = soup.select_one('.data-intro > p').text

In [None]:
intro.split('\r＜')[0].strip()

In [None]:
# Tokenize and encode the input text (truncated to at most 512 tokens)
inputs = tokenizer(intro, return_tensors="pt", truncation=True, max_length=512)

# Inference only: disable autograd so no computation graph is built or retained.
with torch.no_grad():
    outputs = model(**inputs)

In [None]:
outputs

In [None]:
# Get the hidden states
hidden_states = outputs.last_hidden_state

In [None]:
# Take the embedding of the [CLS] token (first token)
cls_embedding = hidden_states[:, 0, :]  # Shape: (batch_size, hidden_dim)

In [None]:
cls_embedding.squeeze().detach().tolist()

In [None]:
def intro_feature_extraction(intro):
    """Embed an intro text as the first-token ([CLS]) vector of the encoder.

    Relies on the module-level ``tokenizer`` and ``model``. The text is
    tokenized with truncation at 512 tokens, run through the model without
    gradient tracking, and the first position of ``last_hidden_state`` is
    returned.

    Args:
        intro: Text to embed.

    Returns:
        The first-token hidden state converted with ``squeeze().tolist()``,
        or ``None`` when tokenization or the forward pass raises an
        ``Exception`` (a warning describing the failure is emitted).

    Raises:
        KeyboardInterrupt, SystemExit: deliberately not swallowed, so a long
        ``Series.apply`` over many intros can still be interrupted.
    """
    import warnings

    try:
        # Tokenize and encode the input text
        inputs = tokenizer(intro, return_tensors="pt", truncation=True, max_length=512)

        # Inference only: no autograd graph is needed for feature extraction.
        with torch.no_grad():
            outputs = model(**inputs)

        # Take the embedding of the [CLS] token (first token)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_dim)
        return cls_embedding.squeeze().detach().tolist()
    except Exception as exc:
        # Best effort per row: report the failure instead of hiding it, and let
        # the caller drop the resulting None.
        warnings.warn(
            f"intro_feature_extraction failed ({type(exc).__name__}: {exc}); returning None",
            RuntimeWarning,
        )
        return None

In [None]:
df_all_anime = pd.read_csv('data/all_anime.csv')
df_all_anime.head()

In [None]:
# Rows without an intro fall back to the anime name, then every intro is embedded.
intro_or_name = df_all_anime['intro'].fillna(df_all_anime['name'])
df_all_anime['intro'] = intro_or_name
df_all_anime['intro_feature'] = df_all_anime['intro'].apply(intro_feature_extraction)

In [None]:
df_all_anime.head()

In [None]:
df_intro = df_all_anime[['name', 'intro', 'intro_feature']].dropna(ignore_index=True)
df_intro.head()

In [None]:
features = np.vstack(df_intro['intro_feature'])
features

In [None]:
# Pairwise cosine similarity between all intro embeddings
cosine_sim = cosine_similarity(features)

# Min-max rescale the whole matrix into [0, 1].
# (Alternative kept for reference: (cosine_sim + 1) / 2.)
sim_min = cosine_sim.min()
sim_max = cosine_sim.max()
cosine_sim = (cosine_sim - sim_min) / (sim_max - sim_min)
cosine_sim

In [None]:
plt.hist(cosine_sim[0, :])
plt.show()

In [None]:
np.fill_diagonal(cosine_sim, -np.inf)

In [None]:
anime_names = df_intro['name']
cosine_sim_df = pd.DataFrame(cosine_sim, index=anime_names, columns=anime_names)
cosine_sim_df

In [None]:
df_all_anime['intro'].fillna(df_all_anime['name'])[df_all_anime['intro'].isna()]

In [None]:
# Work on the first 100 intros as an independent copy: new columns are assigned
# to df_sub further down, and doing that on a plain slice of df_intro triggers
# pandas' SettingWithCopyWarning.
df_sub = df_intro.iloc[:100].copy()

# Extract feature vectors as a 2D array (one row per anime)
features = np.vstack(df_sub['intro_feature'])
features

In [None]:
# Pairwise cosine similarity between the sampled embeddings
cosine_sim = cosine_similarity(features)

# Turn similarity into a dissimilarity (1 - similarity) for MDS
distance_matrix = np.subtract(1, cosine_sim)
distance_matrix

In [None]:
# Project the precomputed distance matrix to 2D with MDS (fixed seed)
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=42)
mds_coords = mds.fit_transform(distance_matrix)

# Store each MDS dimension as its own column on df_sub
for axis_position, axis_name in enumerate(('x', 'y')):
    df_sub[axis_name] = mds_coords[:, axis_position]

In [None]:
# Plot the MDS results
plt.figure(figsize=(10, 6))
plt.scatter(df_sub['x'], df_sub['y'], s=50, c='blue', alpha=0.7)

# Label each point with its row position in df_sub (not the anime name), so a
# label can be used directly with `cosine_sim[i, j]` and `df_sub.iloc[...]`.
# Iterating over the values keeps this positional regardless of df_sub's index.
for position, (x_coord, y_coord) in enumerate(zip(df_sub['x'], df_sub['y'])):
    plt.text(x_coord, y_coord, str(position), fontsize=9)

plt.title("MDS Visualization of Anime Intros")
plt.xlabel("Dimension 1")
plt.ylabel("Dimension 2")
plt.grid(True)
plt.show()

In [None]:
cosine_sim[29, 90]

In [None]:
df_sub.iloc[[29, 90]]

In [None]:
df_sub.loc[27, 'intro']

In [None]:
df_sub.loc[77, 'intro']

In [None]:
df_sub['name']

In [None]:
from modules.recommend import AnimeRecommend

In [None]:
ar = AnimeRecommend()

In [None]:
ar.compute_similarity_score()

In [None]:
df_similar = pd.read_csv('data/anime_type_similarity.csv')

In [None]:
df_similar = df_similar.set_index('name')

In [None]:
df_similar.iloc[0, 0]

In [None]:
anime_intro_similarity = pd.read_csv('data/anime_intro_similarity.csv')
anime_intro_similarity = anime_intro_similarity.set_index('name')
anime_intro_similarity.head()

In [None]:
anime_type_similarity = pd.read_csv('data/anime_type_similarity.csv')
anime_type_similarity = anime_type_similarity.set_index('name')
anime_type_similarity.head()

In [None]:
# Both similarity tables are read from CSV and indexed by the 'name' column.
anime_type_similarity = pd.read_csv('data/anime_type_similarity.csv').set_index('name')

anime_intro_similarity = pd.read_csv('data/anime_intro_similarity.csv').set_index('name')

In [None]:
# Row-wise mean of the type-based and intro-based similarity columns for one title
target_name = '2.5 次元的誘惑'
type_scores = anime_type_similarity.loc[:, [target_name]]
intro_scores = anime_intro_similarity.loc[:, [target_name]]
target_similarities = pd.concat([type_scores, intro_scores], axis=1).mean(axis=1)
target_similarities

In [None]:
# Move the index into a regular column, then give the value column (labelled 0) a name.
similarity_frame = target_similarities.reset_index()
target_similarities = similarity_frame.rename(columns={0: 'similarity_score'})

In [None]:
# NOTE(review): `df` is only assigned further down (`df = ar.df_anime`), so this
# cell presumably relies on earlier kernel state, and 'a' looks like a
# placeholder column name — confirm the intent before relying on this cell.
df[['咒術迴戰'] + ['a']].mean(axis=1)

In [None]:
anime_intro_similarity['2.5 次元的誘惑']

In [2]:
from modules.recommend import AnimeRecommend

In [7]:
ar = AnimeRecommend()

Finish reading and transform anime data!!!


In [6]:
os.path.exists('data/anime_type_similarity.csv')

True

In [None]:
df = ar.df_anime

In [None]:
# Keep 'name' as a regular column (not the index) so it can be used as a merge key.
scaled_metric_columns = ['name', 'scaled_launch', 'scaled_view', 'scaled_score', 'link']
scaled_metrics = df.loc[:, scaled_metric_columns]

In [None]:
target_similarities = target_similarities.merge(scaled_metrics, on='name', how='inner')

In [None]:
target_similarities['scaled_launch']

In [None]:
parameters = ['scaled_view']
target_similarities.loc[:, ['similarity_score'] + parameters].mean(axis=1)

In [None]:
# Attach the scaled metrics by anime name. Assigning the whole `scaled_metrics`
# frame fails here: it has five columns ('name' plus the four metrics) for four
# target keys, and a DataFrame on the right-hand side is matched by row index,
# not by name.
metric_columns = ['scaled_launch', 'scaled_view', 'scaled_score', 'link']
metrics_by_name = scaled_metrics.drop_duplicates(subset='name').set_index('name')
aligned_metrics = metrics_by_name.loc[:, metric_columns].reindex(target_similarities['name'])
# Re-label the rows with the target frame's index so the assignment lines up 1:1.
aligned_metrics.index = target_similarities.index
target_similarities[metric_columns] = aligned_metrics

In [None]:
# Look up scaled_launch by anime name. A direct Series assignment pairs rows by
# index label, which is only correct if both frames happen to share the same
# row order.
launch_by_name = scaled_metrics.drop_duplicates(subset='name').set_index('name')['scaled_launch']
target_similarities['scaled_launch'] = target_similarities['name'].map(launch_by_name)

In [None]:
target_similarities