In [51]:
import os

# Work from the project root so the relative paths used below ('conf/app.yml',
# 'data/*.csv') resolve. The location is machine-specific, so it can be
# overridden with the BAHA_ANIME_ROOT environment variable; the default keeps
# the original author's checkout path.
PROJECT_ROOT = os.environ.get(
    'BAHA_ANIME_ROOT',
    '/Users/edwardchiu/Desktop/projects/baha-anime-analysis',
)
os.chdir(PROJECT_ROOT)
os.getcwd()

'/Users/edwardchiu/Desktop/projects/baha-anime-analysis'

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import MDS
import matplotlib.pyplot as plt

In [52]:
import yaml
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

In [None]:
from transformers import BertTokenizer, BertModel
import torch

# Tokenizer and encoder are both loaded from the same pretrained checkpoint name.
MODEL_NAME = "hfl/chinese-roberta-wwm-ext"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)

In [None]:
# Load the application settings (relative to the working directory set above).
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# locale default.
with open('conf/app.yml', encoding='utf-8') as f:
    app_config = yaml.safe_load(f)

In [None]:
# Page whose intro paragraph is scraped in the cells below.
url = 'https://ani.gamer.com.tw/animeVideo.php?sn=40525'

# The User-Agent string comes from the YAML config held in `app_config`.
# (A randomised agent via `UserAgent().random` was tried here and left disabled.)
user_agent = app_config['website']['user_agent']
headers = dict([('User-Agent', user_agent)])

In [None]:
# Fetch the page. `requests` has no default timeout, so an unresponsive server
# would block the kernel indefinitely; bound the wait explicitly.
req = requests.get(url, headers=headers, timeout=30)
# Display the response object so the HTTP status is visible in the output.
req

In [None]:
# Parse the response body into a navigable tree using the 'html.parser' backend.
soup = BeautifulSoup(req.text, 'html.parser')

In [None]:
# Display the first <a> that is a direct child of an <li> directly under an
# element with class 'ACG-box1listB' (select_one yields None when nothing matches).
soup.select_one('.ACG-box1listB > li > a')

In [None]:
# Locate the paragraph directly under the '.data-intro' element. select_one
# returns None when the selector has no match, so fail with a readable message
# instead of an AttributeError on `.text`.
intro_node = soup.select_one('.data-intro > p')
if intro_node is None:
    raise ValueError("No '.data-intro > p' element found in the fetched page")
intro = intro_node.text

In [None]:
# Keep only the text before the first '\r＜' marker and trim surrounding whitespace.
intro.partition('\r＜')[0].strip()

In [None]:
# Tokenize and encode the input text as PyTorch tensors (at most 512 tokens)
inputs = tokenizer(intro, return_tensors="pt", truncation=True, max_length=512)

# Inference only: run the forward pass without recording an autograd graph,
# which saves memory and time and leaves the output values unchanged.
with torch.no_grad():
    outputs = model(**inputs)

In [None]:
# Display the raw object returned by the model's forward pass.
outputs

In [None]:
# Get the hidden states from the model output.
# Assumes shape (batch_size, seq_len, hidden_dim), as implied by the
# three-axis indexing in the next cell — TODO confirm.
hidden_states = outputs.last_hidden_state

In [None]:
# Use the vector at sequence position 0 (the [CLS] token) as the text embedding.
cls_embedding = hidden_states[:, 0]  # Shape: (batch_size, hidden_dim)

In [None]:
# Detach from autograd, drop size-1 axes and show the embedding as a plain Python list.
cls_embedding.detach().squeeze().tolist()

In [None]:
def intro_feature_extraction(intro):
    """Embed an intro text as the [CLS] vector of the notebook's BERT model.

    Uses the module-level ``tokenizer`` and ``model`` objects.

    Args:
        intro: Text to embed; it is truncated to at most 512 tokens.

    Returns:
        For a single string, the [CLS] embedding as a flat list of floats.
        ``None`` when the text could not be encoded (for example a non-string
        value); a ``RuntimeWarning`` describing the failure is emitted then.
    """
    import warnings

    try:
        # Tokenize and encode the input text
        inputs = tokenizer(intro, return_tensors="pt", truncation=True, max_length=512)

        # Inference only: skip autograd bookkeeping for the forward pass.
        with torch.no_grad():
            outputs = model(**inputs)

        # Get the hidden states
        hidden_states = outputs.last_hidden_state

        # Take the embedding of the [CLS] token (first token)
        cls_embedding = hidden_states[:, 0, :]  # Shape: (batch_size, hidden_dim)
        return cls_embedding.squeeze().detach().tolist()
    except Exception as exc:
        # Best effort is kept (callers drop None rows), but the failure is now
        # reported, and KeyboardInterrupt/SystemExit are no longer swallowed
        # as they were by the previous bare `except:`.
        warnings.warn(
            "intro_feature_extraction failed for "
            + repr(intro)[:80] + ": " + repr(exc),
            RuntimeWarning,
            stacklevel=2,
        )
        return None

In [None]:
# Load the anime table; later cells read its 'name' and 'intro' columns.
df_all_anime = pd.read_csv('data/all_anime.csv')
# Preview the first rows.
df_all_anime.head()

In [None]:
# Rows without an intro fall back to the title, so every row has text to embed.
df_all_anime['intro'] = df_all_anime['intro'].fillna(df_all_anime['name'])
# One call of intro_feature_extraction (one model forward pass) per row; rows
# where extraction fails get None.
df_all_anime['intro_feature'] = df_all_anime['intro'].apply(intro_feature_extraction)

In [None]:
# Preview the frame again, now including the 'intro_feature' column.
df_all_anime.head()

In [None]:
# Keep the three columns needed for the intro analysis and drop rows with any
# missing value (e.g. intro_feature is None when extraction failed);
# ignore_index=True relabels the remaining rows 0..n-1.
df_intro = df_all_anime[['name', 'intro', 'intro_feature']].dropna(ignore_index=True)
df_intro.head()

In [None]:
# Stack the per-row embedding lists into one 2-D array (one row per anime).
embedding_rows = df_intro['intro_feature'].tolist()
features = np.vstack(embedding_rows)
features

In [None]:
# Pairwise cosine similarity between all intro embeddings
cosine_sim = cosine_similarity(features)

# Min-max rescale using the matrix's own smallest and largest entries.
# (Alternative left disabled: the fixed affine map (cosine_sim + 1) / 2.)
sim_low = cosine_sim.min()
sim_high = cosine_sim.max()
cosine_sim = (cosine_sim - sim_low) / (sim_high - sim_low)
cosine_sim

In [None]:
# Histogram of the first row of the rescaled similarity matrix.
plt.hist(cosine_sim[0])
plt.show()

In [None]:
# Overwrite the diagonal (each row's similarity with itself) with -inf, in place.
# Presumably so an anime never ranks as its own best match — confirm.
np.fill_diagonal(cosine_sim, -np.inf)

In [None]:
# Label both axes of the similarity matrix with the anime titles so scores can
# be looked up by name.
anime_names = df_intro['name']
cosine_sim_df = pd.DataFrame(cosine_sim, index=anime_names, columns=anime_names)
cosine_sim_df

In [None]:
# Show the fallback text for rows whose 'intro' is missing.
# NOTE(review): an earlier cell already writes the filled values back into
# 'intro', so in notebook order this mask only matches rows whose 'name' is
# missing too — presumably written before that cell; confirm.
df_all_anime['intro'].fillna(df_all_anime['name'])[df_all_anime['intro'].isna()]

In [None]:
# Work on the first 100 rows only and stack their embeddings into a 2-D array
df_sub = df_intro.iloc[:100]
subset_rows = df_sub['intro_feature'].tolist()
features = np.vstack(subset_rows)
features

In [None]:
# Calculate cosine similarity
# (rebinds `cosine_sim` to the unscaled matrix for the current `features`)
cosine_sim = cosine_similarity(features)

# Convert to distance matrix
# (cosine distance = 1 - cosine similarity)
distance_matrix = 1 - cosine_sim
distance_matrix

In [None]:
# Apply MDS: embed the precomputed distance matrix in 2-D (seeded via random_state).
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=42)
mds_coords = mds.fit_transform(distance_matrix)

# Add the 2D coordinates on a new frame. `df_sub` is a slice of `df_intro`, and
# assigning columns into a slice triggers pandas' SettingWithCopyWarning;
# `assign` returns a fresh DataFrame instead.
df_sub = df_sub.assign(x=mds_coords[:, 0], y=mds_coords[:, 1])

In [None]:
# Plot the MDS results
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df_sub['x'], df_sub['y'], s=50, c='blue', alpha=0.7)

# Label each point with its row position (not the title), so a point can be
# looked up by number afterwards. Coordinates are paired positionally, so this
# does not rely on the frame having a 0..n-1 index.
for position, (x_coord, y_coord) in enumerate(zip(df_sub['x'], df_sub['y'])):
    ax.text(x_coord, y_coord, str(position), fontsize=9)

ax.set_title("MDS Visualization of Anime Intros")
ax.set_xlabel("Dimension 1")
ax.set_ylabel("Dimension 2")
ax.grid(True)
plt.show()

In [None]:
# Similarity between rows 29 and 90 — presumably two points picked from the
# plot above by their numeric labels.
cosine_sim[29, 90]

In [None]:
# Show the rows at positions 29 and 90 (the same pair as the similarity lookup above).
df_sub.iloc[[29, 90]]

In [None]:
# Read the full intro text of the row labelled 27.
df_sub.loc[27, 'intro']

In [None]:
# Read the full intro text of the row labelled 77.
df_sub.loc[77, 'intro']

In [None]:
# List the titles in the subset, to map the numeric plot labels back to names.
df_sub['name']

In [None]:
from modules.recommend import AnimeRecommend

In [None]:
# Instantiate the project's recommender (defined in modules.recommend).
ar = AnimeRecommend()

In [None]:
# NOTE(review): implemented in modules.recommend, not visible here — presumably
# it produces the similarity CSVs under data/ that the next cells read; confirm.
ar.compute_similarity_score()

In [None]:
# Load the similarity table from data/anime_type_similarity.csv.
df_similar = pd.read_csv('data/anime_type_similarity.csv')

In [None]:
# Index the table by its 'name' column.
# NOTE(review): not idempotent — once 'name' is the index it is no longer a
# column, so re-running this cell without re-running the read raises KeyError.
df_similar = df_similar.set_index('name')

In [None]:
# Spot-check the value in the first row and first column.
df_similar.iloc[0, 0]

In [None]:
# Similarity table from data/anime_intro_similarity.csv, indexed by its 'name' column.
anime_intro_similarity = pd.read_csv('data/anime_intro_similarity.csv').set_index('name')
anime_intro_similarity.head()

In [None]:
# Similarity table from data/anime_type_similarity.csv, indexed by its 'name' column.
anime_type_similarity = pd.read_csv('data/anime_type_similarity.csv').set_index('name')
anime_type_similarity.head()

In [53]:
# Load both similarity tables, each indexed by its 'name' column.
anime_type_similarity = pd.read_csv('data/anime_type_similarity.csv').set_index('name')

anime_intro_similarity = pd.read_csv('data/anime_intro_similarity.csv').set_index('name')

In [105]:
# For one target title, take its column from each similarity table and average
# the two scores row by row.
target_title = '2.5 次元的誘惑'
type_scores = anime_type_similarity.loc[:, [target_title]]
intro_scores = anime_intro_similarity.loc[:, [target_title]]
target_similarities = pd.concat([type_scores, intro_scores], axis=1).mean(axis=1)
target_similarities

name
青之壬生浪                      0.393495
香蕉喵遊世界                     0.235218
平凡職業造就世界最強 第三季             0.318980
香格里拉・開拓異境～糞作獵手挑戰神作～ 第二季    0.295723
魔王 2099                    0.364502
                             ...   
城市獵人 2                     0.406265
妙手小廚師                      0.270022
妖獸都市 1987 劇場版              0.298172
城市獵人                       0.406265
魯邦三世 卡里奧斯特羅城               0.251601
Length: 1722, dtype: float64

In [106]:
# Turn the Series into a frame: the title index becomes a 'name' column and the
# unnamed values column (labelled 0 by reset_index) is renamed.
# NOTE(review): not idempotent — this only works while `target_similarities`
# is still the Series produced by the previous cell.
target_similarities = target_similarities.reset_index().rename(columns={0: 'similarity_score'})

In [66]:
# NOTE(review): executed (count 66) before `df = ar.df_anime` (count 99), so
# `df` was bound to something else at the time — presumably a similarity table
# with a column literally named 'a'. No visible cell creates that frame;
# confirm whether this scratch cell can be dropped.
df[['咒術迴戰'] + ['a']].mean(axis=1)

name
青之壬生浪                      0.318931
香蕉喵遊世界                     0.235330
平凡職業造就世界最強 第三季             0.369369
香格里拉・開拓異境～糞作獵手挑戰神作～ 第二季    0.393674
魔王 2099                    0.379912
                             ...   
城市獵人 2                     0.343682
妙手小廚師                      0.284120
妖獸都市 1987 劇場版              0.421368
城市獵人                       0.343682
魯邦三世 卡里奧斯特羅城               0.322411
Length: 1722, dtype: float64

In [68]:
# Show the intro-similarity column for the target title on its own.
anime_intro_similarity['2.5 次元的誘惑']

name
青之壬生浪                      0.644133
香蕉喵遊世界                     0.470436
平凡職業造就世界最強 第三季             0.637961
香格里拉・開拓異境～糞作獵手挑戰神作～ 第二季    0.591447
魔王 2099                    0.617893
                             ...   
城市獵人 2                     0.645864
妙手小廚師                      0.540043
妖獸都市 1987 劇場版              0.596344
城市獵人                       0.645864
魯邦三世 卡里奧斯特羅城               0.503202
Name: 2.5 次元的誘惑, Length: 1722, dtype: float64

In [74]:
from modules.recommend import AnimeRecommend

In [75]:
# Instantiate the project's recommender; per the recorded output it prints a
# message once it has read and transformed the anime data.
ar = AnimeRecommend()

Finish reading and transform anime data!!!


In [99]:
# Alias the recommender's anime frame. Note that this rebinds the generic name
# `df`, which an earlier-executed cell used for a different table.
df = ar.df_anime

In [107]:
# Keep the title, the three scaled metric columns and the page link.
scaled_metrics = df.loc[:, ['name', 'scaled_launch', 'scaled_view', 'scaled_score', 'link']]
# Disabled alternative: index the metrics by title instead of keeping 'name' as a column.
# scaled_metrics = scaled_metrics.set_index('name')

In [115]:
# Attach the scaled metrics to each similarity row by title; the inner join
# keeps only titles present in both frames.
# NOTE(review): the result overwrites `target_similarities`, so running this
# cell twice merges the metric columns a second time. The recorded outputs show
# 1722 rows before and 1724 after the merge, which suggests some titles occur
# more than once in one of the frames — confirm.
target_similarities = target_similarities.merge(scaled_metrics, on='name', how='inner')

In [116]:
# Inspect the merged 'scaled_launch' column.
target_similarities['scaled_launch']

0       1.000000
1       0.998013
2       0.998013
3       0.997616
4       0.997616
          ...   
1719    0.003491
1720    0.003152
1721    0.002856
1722    0.002823
1723    0.000000
Name: scaled_launch, Length: 1724, dtype: float64

In [119]:
# Average the similarity score with each selected metric column, row by row
# (equal weight per column).
parameters = ['scaled_view']
target_similarities[['similarity_score', *parameters]].mean(axis=1)

0       0.356837
1       0.250748
2       0.452811
3       0.465926
4       0.464221
          ...   
1719    0.460521
1720    0.374452
1721    0.245126
1722    0.481972
1723    0.163697
Length: 1724, dtype: float64

In [86]:
# NOTE(review): execution count 86 predates the reset_index (106) and merge
# (115) cells above, and the commented-out `set_index('name')` suggests
# `scaled_metrics` was indexed by title (4 columns) when this ran. As the cells
# are written now `scaled_metrics` has 5 columns, so a top-to-bottom run would
# presumably fail on this 4-key assignment — confirm and retire this cell in
# favour of the merge.
target_similarities[['scaled_launch', 'scaled_view', 'scaled_score', 'link']] = scaled_metrics

In [84]:
# NOTE(review): execution count 84 predates the merge cell (115). Column
# assignment aligns on the index rather than on the title, so after the
# reset_index/merge steps this would pair rows by position label — presumably
# superseded by the merge; confirm and remove.
target_similarities['scaled_launch'] = scaled_metrics['scaled_launch']

In [88]:
# Inspect the combined table.
# NOTE(review): the recorded output (title index, two '2.5 次元的誘惑' columns)
# does not match the frame built by the reset_index/merge cells — presumably it
# comes from an earlier version of the pipeline; re-run to refresh.
target_similarities

Unnamed: 0_level_0,2.5 次元的誘惑,2.5 次元的誘惑,scaled_launch,scaled_view,scaled_score,link
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
青之壬生浪,0.142857,0.644133,1.000000,0.320179,0.701801,https://ani.gamer.com.tw/animeRef.php?sn=113677
香蕉喵遊世界,0.000000,0.470436,0.998013,0.266277,0.587879,https://ani.gamer.com.tw/animeRef.php?sn=113633
平凡職業造就世界最強 第三季,0.000000,0.637961,0.998013,0.586642,0.837755,https://ani.gamer.com.tw/animeRef.php?sn=113635
香格里拉・開拓異境～糞作獵手挑戰神作～ 第二季,0.000000,0.591447,0.997616,0.636128,1.000000,https://ani.gamer.com.tw/animeRef.php?sn=113665
魔王 2099,0.111111,0.617893,0.997616,0.563939,0.837755,https://ani.gamer.com.tw/animeRef.php?sn=113662
...,...,...,...,...,...,...
城市獵人 2,0.166667,0.645864,0.003491,0.514778,0.701801,https://ani.gamer.com.tw/animeRef.php?sn=30499
妙手小廚師,0.000000,0.540043,0.003152,0.478882,0.587879,https://ani.gamer.com.tw/animeRef.php?sn=112806
妖獸都市 1987 劇場版,0.000000,0.596344,0.002856,0.192081,0.412426,https://ani.gamer.com.tw/animeRef.php?sn=113058
城市獵人,0.166667,0.645864,0.002823,0.557679,0.701801,https://ani.gamer.com.tw/animeRef.php?sn=19171
