# ライブラリ読み込み

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import gc
import pickle
from tqdm import tqdm
%matplotlib inline

import glob
# Use a larger default font size for every figure drawn in this notebook.
plt.rcParams.update({"font.size": 18})

In [2]:
import itertools
import requests
from bs4 import BeautifulSoup
import os
import datetime, pytz
from selenium import webdriver
import lxml.html
from lxml import html
import copy
from sklearn.preprocessing import MultiLabelBinarizer

In [3]:
# Let pandas render up to 200 rows and 200 columns before truncating output.
pd.options.display.max_rows = 200
pd.options.display.max_columns = 200

In [4]:
def set_ylim_zero_start(ax):
    """Extend the y-axis of ``ax`` so that it starts at zero.

    On a linear axis the lower limit becomes ``min(current_lower, 0)``, so
    data that already goes below zero stays visible. On a log axis zero is
    not representable, so the lower limit is set to ``10 ** -1`` instead.
    The upper limit is left at the current maximum.

    Parameters
    ----------
    ax : object
        Anything exposing ``get_ylim()``, ``get_yscale()`` and
        ``set_ylim()`` (e.g. a matplotlib ``Axes``).
    """
    ylim = ax.get_ylim()
    upper = float(np.max(ylim))
    if ax.get_yscale() == 'log':
        lower = 10 ** -1
    else:
        # The previous ``np.min(ylim, 0)`` passed 0 as the *axis* argument,
        # which just returned the current lower limit; compare the value
        # against 0 explicitly instead.
        lower = min(float(np.min(ylim)), 0.0)

    ax.set_ylim([lower, upper])

In [5]:
def set_sep3(ax, which):
    """Format tick labels of ``ax`` with thousands separators (e.g. 1,234).

    Parameters
    ----------
    ax : matplotlib Axes
        Axes whose major tick formatter(s) are replaced.
    which : str
        ``'x'``, ``'y'`` or ``'both'``. Any other value leaves the axes
        untouched.

    Tick values are truncated to ``int`` before formatting, so fractional
    ticks lose their decimal part.
    """
    def _with_commas(value, _pos):
        return "{:,}".format(int(value))

    if which in ('x', 'both'):
        ax.xaxis.set_major_formatter(plt.FuncFormatter(_with_commas))
    if which in ('y', 'both'):
        ax.yaxis.set_major_formatter(plt.FuncFormatter(_with_commas))

In [6]:
# Render text with the Meiryo font so Japanese labels display correctly.
mpl.rcParams.update({
    "font.family": "sans-serif",
    "font.sans-serif": ["Meiryo"],
})

In [7]:
# Use seaborn's "bright" colour cycle for all subsequent plots.
sns.set_palette(palette='bright')

In [8]:
import MeCab
import re

# 読込

In [9]:
# List every entry under ./youtube_chats.
# NOTE(review): glob returns paths in arbitrary, OS-dependent order, so the
# row order of anything built from this list can differ between machines.
li_files = glob.glob('./youtube_chats/*')

In [10]:
# Derive the video id (file name without the ".pkl" suffix) for every chat
# pickle found. os.path is used instead of a regex so that this works with
# either path separator, does not truncate ids that happen to contain "pkl",
# and skips non-pickle files instead of producing NaN entries.
li_chat_exists = [
    os.path.splitext(os.path.basename(path))[0]
    for path in li_files
    if path.endswith('.pkl')
]

In [11]:
# Load the holostats table and keep an independent copy of the rows whose
# stats_id is 'koyori'.
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
df_holostats = pd.read_pickle('./アーカイブ同接推移データ/df_holostats.pkl')
df_koyori = df_holostats.loc[df_holostats['stats_id'].eq('koyori')].copy()

In [12]:
# Read one pickled chat DataFrame per video id and stack them into a single
# frame with a fresh 0..n-1 index.
df_chats = pd.concat(
    [pd.read_pickle(f'./youtube_chats/{video_id}.pkl') for video_id in li_chat_exists],
    ignore_index=True,
)

# 前処理

In [13]:
# Load the pickled df_for_agg table from the KOYORI directory.
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
df_for_agg = pd.read_pickle('./KOYORI/df_for_agg.pkl')

# 集計

In [14]:
# Show the first row to inspect the columns of df_for_agg.
df_for_agg.head(1)

Unnamed: 0,video_id,scheduled_start_datetime,start_datetime,end_datetime,duration,title,live_category,live_category_sub,year,month,day,year_month,view_count,like_count,favorite_count,comment_count
0,-bEB2Utn3P4,2022-04-28 22:00:00,2022-04-28 22:00:39,2022-04-29 00:08:50,7691.0,【Surgeon Simulator2】#KoLuC で力を合わせて手術だ！Part2～！【...,Surgeon Simulator2,鷹嶺ルイ・博衣こより・沙花叉クロヱ/ホロライブ,2022,4,28,2022-04,76667.0,7123.0,0.0,109.0


In [15]:
# Show the first row to inspect the columns of the raw chat table.
df_chats.head(1)

Unnamed: 0,video_id,datetime,elapsedTime,timestamp,id,amountString,amountValue,author_name,message,messageEx,author_isChatSponsor,author_isChatOwner,author_isVerified,author_badgeUrl,author_channelId,author_channelUrl,author_imageUrl,author_isChatModerator,author_type,bgColor,currency,type
0,-bEB2Utn3P4,2022-04-28 21:59:31,-62.0,1883330309,ChwKGkNMTGctUGJudHZjQ0ZVUER3Z1FkVjJnQUVn,¥540,540.0,すかーど🧪,手術成功祈願代です……,[手術成功祈願代です……],False,False,False,,UChVtdysrseX4Ubyc18Q5iCQ,http://www.youtube.com/channel/UChVtdysrseX4Ub...,https://yt4.ggpht.com/dELQfXqZwc8WXmx3-xVt_DIb...,False,,4280150454,¥,superChat


## 形態素解析

In [16]:
import mojimoji

In [17]:
# Pattern for a run of digits with an optional thousands comma (e.g. "1,234").
# Written as a raw string: '\d' in a normal string literal is an invalid
# escape sequence and triggers a SyntaxWarning on recent Python versions.
num_regex = re.compile(r'\d+,?\d*')


def text_process(text):
    """Normalise character width and case of ``text``.

    Steps, in order:
    1. full-width -> half-width for everything except kana,
    2. half-width kana -> full-width kana (ASCII left untouched),
    3. lower-case the result.

    Parameters
    ----------
    text : str
        Text to normalise.

    Returns
    -------
    str
        The normalised text.
    """
    # Convert full-width characters to half-width (kana excluded).
    result = mojimoji.zen_to_han(text, kana=False)
    # Convert half-width kana to full-width kana.
    result = mojimoji.han_to_zen(result, ascii=False)
    # Lower-case everything.
    result = result.lower()
    # Replacing every number with 0 is currently disabled:
    # result = num_regex.sub('0', result)
    return result

In [18]:
import emoji

In [19]:
# Sanity check: emoji.is_emoji should return True for a standard Unicode emoji.
emoji.is_emoji('🥀')

True

In [20]:
# Keep only the columns needed for text processing; copy so later edits do
# not touch the originally loaded frame.
df_chats = df_chats.loc[:, ['video_id', 'message', 'messageEx']].copy()

In [38]:
# Preview the first five rows of the reduced chat table.
df_chats.head(5)

Unnamed: 0,video_id,message,messageEx
0,-bEB2Utn3P4,手術成功祈願代です……,[手術成功祈願代です……]
1,-bEB2Utn3P4,:_くるよー::_くるよー::_くるよー:,"[{'id': 'ACLUYeXxLfeI_9EP8Ou_0A8', 'txt': ':_く..."
2,-bEB2Utn3P4,:_くるよー::_くるよー::_くるよー:,"[{'id': 'ACLUYeXxLfeI_9EP8Ou_0A8', 'txt': ':_く..."
3,-bEB2Utn3P4,お邪魔しますm(_ _)m,[お邪魔しますm(_ _)m]
4,-bEB2Utn3P4,:_くるよー::_くるよー::_くるよー:,"[{'id': 'ACLUYeXxLfeI_9EP8Ou_0A8', 'txt': ':_く..."


In [22]:
def _flatten_chat_parts(li_msgs):
    """Turn one ``messageEx`` entry into a list of plain strings.

    Plain string fragments are kept as they are. Dict fragments contribute
    their ``'id'`` when ``emoji.is_emoji`` accepts it, otherwise their
    ``'txt'`` (presumably the ``:_name:`` style text of a custom emoji, going
    by the sample rows; confirm against the data).
    """
    return [
        part if isinstance(part, str)
        else part['id'] if emoji.is_emoji(part['id'])
        else part['txt']
        for part in li_msgs
    ]


def _is_ja_word(word):
    """Return True for fragments that are neither ``:...:`` wrapped nor pure ASCII."""
    # startswith/endswith are safe on an empty string, unlike word[0] / word[-1].
    is_colon_wrapped = word.startswith(':') and word.endswith(':')
    return (not is_colon_wrapped) and (not word.isascii())


li_fullchats = []
li_ja_fullchats = []

# Iterate over the Series directly so tqdm knows the total and can show a
# real progress bar; wrapping enumerate() hid the length from tqdm.
for li_msgs in tqdm(df_chats['messageEx']):
    li_emoji_words = _flatten_chat_parts(li_msgs)
    li_ja_words = [word for word in li_emoji_words if _is_ja_word(word)]

    # One joined string per chat message, in the same order as df_chats.
    li_fullchats.append(''.join(li_emoji_words))
    li_ja_fullchats.append(''.join(li_ja_words))
    

8787412it [00:08, 1036741.48it/s]


In [23]:
# De-duplicate while keeping first-occurrence order. list(set(...)) yields a
# different order on every kernel start (string hashes are randomised), which
# made the saved unique-chat files non-reproducible.
li_unique_chats = list(dict.fromkeys(li_fullchats))

In [25]:
# De-duplicate while keeping first-occurrence order. list(set(...)) yields a
# different order on every kernel start (string hashes are randomised), which
# made the saved unique-chat files non-reproducible.
li_unique_ja_chats = list(dict.fromkeys(li_ja_fullchats))

In [27]:
# Force a garbage-collection pass; the displayed value is the number of
# unreachable objects found.
gc.collect()

9

In [28]:
# Total number of joined chat strings.
len(li_fullchats)

8787412

In [29]:
# Number of distinct chat strings after de-duplication.
len(li_unique_chats)

3353305

In [30]:
# Persist the full list of chat strings as a pickle.
with open('./KOYORI/li_koyori_fullchats.pkl', mode='wb') as fout:
    pickle.dump(li_fullchats, fout)

In [31]:
# Persist the de-duplicated chat strings as a pickle.
# NOTE(review): "uinque" in the file name looks like a typo for "unique"; it
# is kept unchanged here in case other code reads this exact path.
with open('./KOYORI/li_koyori_uinque_chats.pkl', mode='wb') as fout:
    pickle.dump(li_unique_chats, fout)

In [32]:
# Write every chat string to a UTF-8 text file, separated by newlines.
with open('./KOYORI/koyori_fullchats.txt', mode='w', encoding='utf-8') as fout:
    fout.write('\n'.join(li_fullchats))

In [33]:
# Write the de-duplicated chat strings to a UTF-8 text file, separated by newlines.
with open('./KOYORI/koyori_unique_chats.txt', mode='w', encoding='utf-8') as fout:
    fout.write('\n'.join(li_unique_chats))

In [34]:
# Persist the Japanese-only chat strings as a pickle.
with open('./KOYORI/li_koyori_ja_fullchats.pkl', mode='wb') as fout:
    pickle.dump(li_ja_fullchats, fout)

In [35]:
# Persist the de-duplicated Japanese-only chat strings as a pickle.
# NOTE(review): "uinque" in the file name looks like a typo for "unique"; it
# is kept unchanged here in case other code reads this exact path.
with open('./KOYORI/li_koyori_ja_uinque_chats.pkl', mode='wb') as fout:
    pickle.dump(li_unique_ja_chats, fout)

In [36]:
# Write the Japanese-only chat strings to a UTF-8 text file, separated by newlines.
with open('./KOYORI/koyori_ja_fullchats.txt', mode='w', encoding='utf-8') as fout:
    fout.write('\n'.join(li_ja_fullchats))

In [37]:
# Write the de-duplicated Japanese-only chat strings to a UTF-8 text file,
# separated by newlines.
with open('./KOYORI/koyori_ja_unique_chats.txt', mode='w', encoding='utf-8') as fout:
    fout.write('\n'.join(li_unique_ja_chats))