In [1]:
import os

# Make the project root the working directory so the relative paths used in
# later cells (./word-cloud/..., data/...) resolve against it.
# NOTE(review): this is a machine-specific absolute path; os.chdir raises on
# any machine where the directory does not exist, so the notebook cannot be
# re-run elsewhere until this is made configurable.
os.chdir('/Users/edwardchiu/Desktop/projects/baha-anime-analysis')
# Echo the working directory to confirm the change took effect.
os.getcwd()

'/Users/edwardchiu/Desktop/projects/baha-anime-analysis'

In [None]:
import jieba
import jieba.analyse
from wordcloud import WordCloud
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from collections import Counter

In [None]:
def episode_comments(link):
    """Scrape the danmu (bullet comments) and viewer comments of one episode page.

    A headless Chrome session opens ``link``, scrolls the danmu panel until
    its reported height stops changing, keeps clicking the "load more"
    button until it can no longer be clicked, and then parses both lists out
    of the rendered HTML with BeautifulSoup.

    Parameters
    ----------
    link : str
        URL of the episode page to crawl.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(comments, danmus)``: the stripped text of every comment paragraph
        and of every danmu span, in document order.

    Raises
    ------
    Exception
        Selenium errors raised while loading the page or locating the danmu
        panel / comment list propagate to the caller. The browser session is
        closed in every case.
    """

    def _pause():
        # Randomised 0.5-1 s wait so freshly requested content has time to load.
        time.sleep(np.random.uniform(0.5, 1))

    # Configure Chrome to run without a visible window.
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(link)
        _pause()

        danmu_panel = driver.find_element(By.CLASS_NAME, 'danmu-scroll')

        # Scroll the panel to its bottom repeatedly; stop once the reported
        # scrollHeight is unchanged between two consecutive passes.
        # The initial 0 never equals the attribute value returned by
        # get_attribute, so at least two passes are always made.
        previous_height = 0
        while True:
            driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", danmu_panel)
            _pause()

            new_height = danmu_panel.get_attribute('scrollHeight')
            if new_height == previous_height:
                break
            previous_height = new_height

        # Parse the fully loaded danmu panel.
        danmu_soup = BeautifulSoup(danmu_panel.get_attribute('innerHTML'), 'html.parser')
        danmu_nodes = danmu_soup.select('.sub-list-li > div > .sub_content > span')
        danmus = [node.text.strip() for node in danmu_nodes]

        # Expand hidden comments: click "load more" until the button is gone
        # or can no longer be clicked.
        while True:
            try:
                load_more_button = driver.find_element(By.CLASS_NAME, 'c-more-msg')
                ActionChains(driver).move_to_element(load_more_button).click(load_more_button).perform()
            except Exception:
                # Best-effort by design: any failure to find or click the
                # button ends the expansion. Unlike a bare ``except``, this
                # lets KeyboardInterrupt / SystemExit through so the crawl
                # can be interrupted.
                break
            _pause()

        # Parse the expanded comment list.
        comment_panel = driver.find_element(By.CLASS_NAME, 'webview_commendlist')
        comment_soup = BeautifulSoup(comment_panel.get_attribute('innerHTML'), 'html.parser')
        comment_nodes = comment_soup.select('.reply-content > .reply-content__cont > p')
        comments = [node.text.strip() for node in comment_nodes]
    finally:
        # Always release the browser process, even when a step above raised.
        driver.quit()

    return comments, danmus

In [None]:
# Episode page to crawl.
# NOTE(review): `url` is rebound to the local API base URL in a later cell.
url = 'https://ani.gamer.com.tw/animeVideo.php?sn=37868'
# Launches a browser session; returns the viewer comments and the danmu lines.
comments, danmus = episode_comments(url)

In [None]:
# Resource files for segmentation and rendering. The paths are relative, so
# they depend on the working directory set in the first cell.
dict_file = './word-cloud/dict.txt'
stopwords_file = './word-cloud/stopwords.txt'
font_path = './word-cloud/CactusClassicalSerif-Regular.ttf'
mask_image_path = './word-cloud/comment.png'

# Point jieba at the custom dictionary, and register the stop-word list with
# jieba.analyse (the module whose extract_tags is called in the next cell).
jieba.set_dictionary(dict_file)
jieba.analyse.set_stop_words(stopwords_file)

In [None]:
# Merge every danmu line into one space-separated document.
text = ' '.join(danmus)
# Ask jieba for at most 25 keywords from that document.
tags = jieba.analyse.extract_tags(text, topK=25)
tags

In [None]:
# Load the mask image as a NumPy array; it is handed to WordCloud(mask=...)
# when the cloud is rendered.
mask = np.array(Image.open(mask_image_path))

In [None]:
# Segment the joined danmu text (cut_all=False) and tally every token.
seg_list = jieba.lcut(text, cut_all=False)
dictionary = Counter(seg_list)

# Keep only the tokens that jieba also returned as keywords in `tags`,
# preserving the order in which they first appear in the text.
freq = {token: count for token, count in dictionary.items() if token in tags}
print(freq)  # occurrence count of each keyword

# Configure the cloud, then fill it from the keyword frequencies.
wordcloud = WordCloud(
    font_path=font_path,
    mask=mask,
    background_color="white",
    contour_color='steelblue',
    contour_width=3,
)
wordcloud.generate_from_frequencies(freq)

# Show the rendered cloud without axes.
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
import pandas as pd
import gspread
from google.oauth2.service_account import Credentials
from gspread_dataframe import set_with_dataframe
import modules.settings as settings

In [None]:
# Authenticate with Google: build service-account credentials from the key
# file and scopes configured in modules.settings, then authorise a gspread
# client with them.
creds = Credentials.from_service_account_file(settings.service_account_file, scopes=settings.scope)
client = gspread.authorize(creds)

In [None]:
# One row per keyword: its text and how often it occurs in the danmu text.
df = pd.DataFrame(list(freq.items()), columns=['tag', 'count'])
# Open the spreadsheet whose name is configured in modules.settings.
sheetname = settings.sheetname
spreadsheet = client.open(sheetname)

In [None]:
# Select the 'Word Cloud' tab and write the tag/count frame into it.
worksheet = spreadsheet.worksheet('Word Cloud')
set_with_dataframe(worksheet, df)

In [None]:
def convert_to_utf8(val):
    """Return ``val`` as a string that is safe to encode as UTF-8.

    Strings are round-tripped through UTF-8 with ``errors='ignore'``, which
    silently drops any character that cannot be encoded (for example a lone
    surrogate). Every other value is converted with ``str()``.
    """
    if not isinstance(val, str):
        return str(val)
    utf8_bytes = val.encode('utf-8', errors='ignore')
    return utf8_bytes.decode('utf-8')

In [None]:
# Apply convert_to_utf8 to every cell. Because that helper returns str(val)
# for non-strings, the integer 'count' column becomes strings here.
# NOTE(review): this overwrites `df`, so the original frame is no longer
# available under that name after this cell runs.
df = df.map(convert_to_utf8)

In [None]:
# Split the frame into the row layout used for the sheet upload:
# `header` is a single-row list holding the column names, `values` is one
# list per data row.
header = [df.columns.tolist()]
values = df.values.tolist()
values

In [None]:
# Preview the full payload (header row followed by the data rows).
header + values

In [None]:
# Write the header and data rows into columns A:B of the worksheet.
# NOTE(review): raw=False presumably lets Sheets parse the values as if
# typed by a user (so the stringified counts become numbers again) - confirm
# against the installed gspread version.
worksheet.update(range_name='A:B', values=header + values, raw=False)

In [None]:
# Clear columns A:B of the worksheet.
# NOTE(review): in top-to-bottom execution this runs after the cell that
# writes to A:B, so a full re-run of the notebook leaves those columns empty.
worksheet.batch_clear(['A:B'])

In [None]:
# Save the rendered word cloud as a PNG next to the other word-cloud
# resources (path is relative to the working directory).
output_image_path = './word-cloud/danmu_wordcloud.png'  # destination file for the image
wordcloud.to_file(output_image_path)
print(f"Word cloud image saved to {output_image_path}")

In [None]:
from fastapi.responses import FileResponse

In [None]:
# Build a FastAPI FileResponse for the saved PNG. Nothing is served from the
# notebook; the object is only constructed and displayed.
# NOTE(review): presumably a dry run of what an API endpoint returns - confirm.
FileResponse(output_image_path, media_type='image/png', filename='wordcloud.png')

In [None]:
import requests

In [None]:
# Base URL of the locally running API.
# NOTE(review): this rebinds `url`, which an earlier cell used for the
# episode page address.
url = 'http://127.0.0.1:8000'
# Identify the episode by anime title and episode label.
payload = {
    'anime_name': '咒術迴戰',
    'episode_name': '[14]'
}
# POST the payload as JSON to the /word_freq endpoint.
# NOTE(review): no timeout is passed, so this call waits indefinitely if the
# server never answers.
req = requests.post(url + '/word_freq', json=payload)
# req = requests.get(url)

In [None]:
# Decode and display the JSON body of the /word_freq response.
req.json()

In [None]:
# Load the episode table from the project's data folder (relative path).
# NOTE(review): this rebinds `df`, which earlier held the tag/count frame.
df = pd.read_csv('data/all_episode.csv')

In [None]:
# Look up the episode_link of the first row matching this anime title and
# episode label; .iloc[0] raises IndexError when no row matches.
df.loc[(df['anime_name'] == '咒術迴戰') & (df['episode_name'] == '[14]'), 'episode_link'].iloc[0]

In [None]:
# Display the JSON body of the earlier /word_freq response again.
req.json()

In [None]:
# Preview the address formed by appending comment_cloud.jpg to the API base
# URL; the string is only displayed, no request is sent.
url + '/' + 'comment_cloud.jpg'

In [None]:
# Take the comment image path from the response and drop its first two
# characters.
# NOTE(review): the [2:] slice presumably strips a leading './' - confirm
# against the API's response format; any other prefix would be cut wrongly.
file_path = req.json()['comment']['path'][2:]
# Request that file through the API's /files/ route.
req2 = requests.get(url + f'/files/{file_path}')

In [None]:
# Display the trimmed path that was requested from /files/.
file_path

In [2]:
from modules.review_analysis import ReviewAnalysis

In [3]:
# Instantiate the project's ReviewAnalysis class (from modules.review_analysis)
# with its default arguments.
ra = ReviewAnalysis()

In [12]:
# Run ReviewAnalysis.dynamic_web_page on this episode URL; the recorded
# output shows it announcing a web crawl for the link.
# NOTE(review): the return value is discarded, so the crawl result is
# presumably kept on `ra` for the next cell - confirm in modules.review_analysis.
ra.dynamic_web_page(link='https://ani.gamer.com.tw/animeVideo.php?sn=33047')

Web crawl for https://ani.gamer.com.tw/animeVideo.php?sn=33047...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
# Collect the episode's comments through ReviewAnalysis.comment_crawler; the
# recorded output below shows a list of comment strings.
# NOTE(review): this rebinds `comments`, which an earlier cell set from
# episode_comments().
comments = ra.comment_crawler()

In [14]:
# Display the crawled comments.
# NOTE(review): this prints the whole list; consider showing a slice instead.
comments

['凌晨就跟著日本首播虐過自己十次了 再來一百次好了我的里韓⋯⋯⋯⋯⋯⋯封面選的真好(;´༎ຶД༎ຶ`)💚💜難過到死掉 受不了（ ; ; ）巴哈辛苦了 好早就上！',
 '漢吉，我最愛的團長，心臟撒薩給油',
 '「誰會想在凌晨四點看進擊的巨人？」我：「好耶～四點囉～～」',
 '熬夜看巨人的我',
 '這集沒有讓我花錢去電影院看我真的是良心不安了',
 '絕對的神作讓人等了這麼久，但看到作品後覺得一切值得每個人都有他的正義每個人的想法都可以去理解不論最後結局是什麼，我相信巨人是我一輩子會和他人推薦的作品',
 '我針真的好想哭.....艾倫不過也才幾歲.....為什麼一個人承受這種痛苦.....😭😭😭😭',
 '如果Part 1的品質是80分，Part 2的品質是90分，這集的品質我敢保證絕對有95分以上！太神啦！',
 '畫到睡著起來繼續畫',
 '雖然神作這句話很氾濫 也許這句話會讓人反感也許巨人不是神作但是在我心目中 他就是神作這集太棒了 感謝MAPPA超愛巨人',
 '聲優很應景的留言',
 '其實除了賈碧和法爾科，弗洛克也是第二部的重點角色吧，塑造的挺成功的，從一個對朋友的死感到不甘的菜兵，到為了國家能奉獻出一切的狂人，成長很多阿',
 '不上劇院真的太可惜 只能說體貼觀眾怕我們在戲院哭成狗心揪成一團離不了場 等待是值得的謝謝MAPPA也謝謝木棉花代理 更感謝所有辛苦的製作人員 有生能見到巨人的完結真的三生有幸一切都值了 希望秋季的後篇各位要撐住我不行了漢吉……😭😭😭😭',
 '這次原畫師滿多台灣人的!!',
 '用這一小時把第33卷播完，有些部分節奏有點趕，但精彩的作畫真的五體投地的神。要說巨人有什麼缺點，那也只有太神太好看…艾連，我愛你🧣創哥，我愛你先這樣…我要先去擦乾眼淚準備再虐我自己了',
 '這部真的根本是電影品質了',
 '巨人 我愛你MAPPA 我愛你巴哈 我愛你木棉花 我愛你心臓を捧げよ！',
 '故事性好完整，大家一路走來看各個角色變成熟，以及主角立場轉變1.車力巨人穿上了調查兵團的自由之翼2.大家整備+吃飯，這種生活感好棒...還有曾經被視為叛徒的萊納和亞妮也同陣線了3.韓吉的最後身影，有夠帥，獨自面對她最感興趣的巨人4.萊納超帥，衝鋒擒抱--第一集有1小時有夠爽各個角色心境轉變、立場的轉變真是令人值得思考的地方看到萊納和亞妮回同陣線

In [7]:
from transformers import pipeline
import torch

# Build a Hugging Face summarization pipeline backed by the
# fnlp/bart-large-chinese checkpoint. The recorded output shows the model
# weights (1.63 GB) being downloaded when this cell ran.
summarizer = pipeline("summarization", model="fnlp/bart-large-chinese")

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/259k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

In [21]:
# Concatenate all crawled comments, separated by single spaces, into the one
# string that is passed to the summarizer.
input_text = " ".join(comments)

In [24]:
# Summarise the joined comments with sampling disabled, asking for a summary
# of 25 to 60 tokens.
# NOTE(review): no truncation option is passed; confirm that the joined text
# always fits the model's input limit.
summary = summarizer(input_text, max_length=60, min_length=25, do_sample=False)

In [25]:
# Display the generated text of the first (only) result.
summary[0]['summary_text']

'這 集 是 一 二 三 季 動 畫 中 我 最 喜 歡 的 一 集 ， 對 我 來 說 這 部 作 品 處 理 友 情 的 手 法 比 愛 情 更 好 ， 也 可 能 只 是 我 自 己 比 起 愛 情 還 是 更 喜 歡 友'