<a href="https://colab.research.google.com/github/fffw2/colaboratory/blob/main/word_cloud_of_tweet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!apt-get -y install fonts-ipafont-gothic
!pip install janome
!pip install wordcloud
import datetime as dt
import io
import json
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Upload tweet.js (plus tweet-part1.js, if the archive has one).
# The result is kept in `uploaded`, which later cells index by file name
# (e.g. uploaded['tweet.js']) and decode as UTF-8.
from google.colab import files
uploaded = files.upload()

In [None]:
# Convert tweet.js into a DataFrame.
# The file is JavaScript rather than JSON: the assignment text
# "window.YTD.tweet.part0 = " is removed so the remainder parses as JSON.
tweet_js_source = uploaded['tweet.js'].decode('utf-8')
tweet_json_source = tweet_js_source.replace("window.YTD.tweet.part0 = ", "")
tweet_data_part0 = pd.read_json(io.StringIO(tweet_json_source))

# Flatten the nested records into dotted columns; later cells read
# 'tweet.created_at' and 'tweet.full_text'.
tweet_records_part0 = tweet_data_part0.to_dict('records')
df = pd.json_normalize(tweet_records_part0)

In [None]:
# (Run this cell only when tweet-part1.js was uploaded as well.)
# Uncomment the lines below to convert tweet-part1.js into a DataFrame
# and append its rows to df.
# tweet_data_part1 = pd.read_json(io.StringIO(uploaded['tweet-part1.js'].decode('utf-8').replace("window.YTD.tweet.part1 = ","")))
# df1 = pd.json_normalize(tweet_data_part1.to_dict('records'))
# df = pd.concat([df, df1])

In [None]:
# Count tweets per calendar day and plot the daily totals.
created_at = pd.to_datetime(df['tweet.created_at'])
df['tweet.created_at'] = created_at
# NOTE(review): the date is taken from the parsed timestamp as-is, with no
# time-zone conversion — confirm this is the intended day boundary.
df['date'] = created_at.dt.date
tweets_cnt = df.groupby('date').size()
tweets_cnt.plot()

In [None]:
# Define the text preprocessing applied to each tweet before tokenization
import re

# Substrings deleted from every tweet: filler words, relative-time words and
# the account name. They are removed one after another in exactly this order,
# because deleting one entry can join characters that a later entry matches.
_REMOVED_SUBSTRINGS = (
    'よう', 'これ', 'こと', 'さん', 'もの', 'とき', 'そう', 'ため', 'はず', 'ほう',
    'ところ', 'みたい', 'あたり', 'くん', 'たち', 'ぶり', 'ちゃん', 'あと', 'うち', 'ここ',
    'それ', 'わけ', 'あれ', 'もん', 'たん', 'まま', 'なん', 'せい', 'がち', 'うろ',
    '今日', '昨日', '明日', '時間', '今年', '去年', '昨年', '来年',
    'fffw',
)


def normalize_text(text):
    """Strip noise from one tweet so that only content words remain.

    Steps, in order:
      1. Delete everything from the first '#' to the end of each line.
      2. Delete the HTML entities ``&amp;``, ``&lt;`` and ``&gt;``
         (with or without the trailing ';').
      3. Delete newline characters, joining the lines without a separator.
      4. Delete http/https URLs.
      5. Delete every entry of ``_REMOVED_SUBSTRINGS`` wherever it occurs.
      6. Return an empty result if the text starts with '@' or with 'RT ';
         otherwise delete any remaining 'RT'.
      7. Strip leading and trailing whitespace.

    Args:
        text: The tweet text as a ``str``.

    Returns:
        The cleaned text; may be an empty string.
    """
    # NOTE(review): this removes the hashtag AND all text after it on the same
    # line, not just the tag itself — confirm that is intended.
    text = re.sub(r'#.*', "", text)

    # Remove whole entities. Previously only the literal '&gt' was removed,
    # which left a stray ';' behind and did not touch '&lt;' / '&amp;'
    # (presumably escaped the same way in the export — verify against data).
    text = re.sub(r'&(?:amp|lt|gt);?', "", text)

    text = text.replace('\n', "")
    text = re.sub(r'https?://[\w/:%#\$&\?\(\)~\.=\+\-…]+', "", text)

    # Plain substring removal; none of the entries contains regex syntax.
    for fragment in _REMOVED_SUBSTRINGS:
        text = text.replace(fragment, "")

    # Newlines are already gone, so '.*' runs to the end of the text and these
    # two rules blank the whole tweet when it starts with '@' or 'RT '.
    text = re.sub(r'^@.*', "", text)
    text = re.sub(r'^RT .*', "", text)
    text = text.replace('RT', "")

    return text.strip()

日付範囲を変更したらここから下を再処理

In [None]:
# Choose the date range (both ends inclusive) and collect the tweet texts in it.
start_date = dt.date(2020, 1, 1)
end_date = dt.date(2020, 12, 31)

on_or_after_start = start_date <= df['date']
on_or_before_end = df['date'] <= end_date
tweets_in_range = df[on_or_after_start & on_or_before_end]
text_list = tweets_in_range['tweet.full_text'].values.tolist()

In [None]:
# Run the preprocessing function over every selected tweet text.
normalized_text_list = list(map(normalize_text, text_list))

In [None]:
# Tokenize each cleaned text and collect the base form of every noun.
from janome.tokenizer import Tokenizer
t = Tokenizer()
word_list = []
for text in normalized_text_list:
    word_list.extend(
        token.base_form
        for token in t.tokenize(text)
        # The first comma-separated field of part_of_speech is the top-level
        # category; keep nouns only.
        if token.part_of_speech.split(',')[0] == '名詞'
    )

In [None]:
# Show the words ordered from most to least frequent.
from collections import Counter

# Single-character words are dropped before counting.
filtered_word_list = list(filter(lambda word: len(word) > 1, word_list))
word_counts = Counter(filtered_word_list)
common_word_list = word_counts.most_common()
print(common_word_list)

In [None]:
# Build the word cloud image and display it.
from wordcloud import WordCloud

# Font file passed to WordCloud via font_path.
fpath = '/usr/share/fonts/opentype/ipafont-gothic/ipagp.ttf'

# WordCloud.generate takes a single string, so join the words with spaces.
words = ' '.join(word_list)

wordcloud = WordCloud(
    background_color="white",
    font_path=fpath,
    width=900,
    height=500,
    collocations=False,
)
wordcloud.generate(words)

plt.figure(figsize=(15, 12))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

In [None]:
# References
# https://karaage.hatenadiary.jp/entry/2018/03/21/073000
# https://www.hitowaft.work/entry/2020/05/11/203550
# https://qiita.com/kbs/items/33b3dd6dae15f7b20b9e
# https://limited-exp-bug.hatenablog.com/entry/2020/05/23/205957