### 7-4．トピック抽出のためのデータ準備
#### 全記事の形態素解析

In [1]:
# Janomeのインストール
# !pip install janome

In [2]:
import os
import re
from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.tokenfilter import POSKeepFilter

# 記事フォルダの指定
dirs = ['it-life-hack', 'movie-enter']

# 単語とラベルを格納するリストを生成
docterm = [] # 説明変数
label = [] # 目的変数
tmp1 = []
tmp2 = ''

# 形態素解析の準備
# オブジェクト生成
t = Tokenizer()
# 名詞の単語のみ抽出するフィルターを生成
token_filters = [POSKeepFilter(['名詞'])]
# フィルターを搭載した解析フレームワークの生成
a = Analyzer([], t, token_filters)

# 各フォルダのファイルを1つずつ読み込んで表示
for i, d in enumerate(dirs):
    # ファイルの取得
    files = os.listdir('./data/' + d)
    
    for file in files:
        # ファイルオープンと読み込み
        f = open('./data/' + d + '/' + file, 'r', encoding='utf-8')
        text = f.read()
        
        # 正規表現で不要な文字列を除去して表示
        reg_text = re.sub(r'[0-9a-zA-Z]+', '', text)
        reg_text = re.sub(r'[:;/+\.-]', '', reg_text)
        reg_text = re.sub(r'[\s\n]', '', reg_text)
        reg_text = reg_text.replace('\n','')        
        
        # フィルタ付き形態素解析
        for token in a.analyze(reg_text):
            tmp1.append(token.surface)
            tmp2 = ' '.join(tmp1)
        # 記事ごとに単語を格納
        docterm.append(tmp2)
        tmp1 = []
        
        # 記事ごとにラベルの格納
        label.append(i)
        
        # ファイルクローズ
        f.close()

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd

# 単語のTF-IDFを計算
tv = TfidfVectorizer(min_df=0.05, max_df=0.5)
docterm_tv = tv.fit_transform(np.array(docterm))
docterm_tfidf = docterm_tv.toarray()

# データフレーム形式で先頭5行まで表示
docterm_tfidf = pd.DataFrame(docterm_tfidf)
docterm_tfidf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,497,498,499,500,501,502,503,504,505,506
0,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.140271,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.041122,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.365883,0.000000,0.000000,0.034061,0.000000,0.000000,0.000000,0.034309,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.056238,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.124622,0.000000,0.000000,0.000000,0.000000,0.097902
5,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.172101
6,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,0.000000,0.075153,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.118826,0.000000,...,0.000000,0.000000,0.000000,0.073862,0.068859,0.000000,0.000000,0.000000,0.000000,0.000000
8,0.000000,0.000000,0.000000,0.063105,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.062387,0.000000,0.000000,0.000000,0.000000,0.000000,0.058639,0.052650,0.000000
9,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [12]:
# データセットにラベル列を結合
label = pd.DataFrame(label)
label = label.rename(columns={0:'label'})

docterm_df = pd.concat([docterm_tfidf, label], axis=1)
docterm_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,498,499,500,501,502,503,504,505,506,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.140271,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.041122,0.0,0.0,0.0,0.0,0.0,0.0,...,0.365883,0.0,0.0,0.034061,0.0,0.0,0.0,0.034309,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.056238,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.124622,0.0,0.0,0.0,0.0,0.097902,0


#### 類似度の計算

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

# labelが0の単語文書行列のみ選択
docterm_0 = docterm_df[docterm_df['label'] == 0]
docterm_0 = docterm_0.drop('label', axis=1)

# 単語ペアのコサイン類似度を計算
sim0 = cosine_similarity(docterm_0.T)
sim0_df = pd.DataFrame(sim0)

# 類似度を行列形式で確認
sim0_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,497,498,499,500,501,502,503,504,505,506
0,1.000000,0.000000,0.000000,0.155156,0.000000,0.055320,0.000000,0.043896,0.113562,0.000000,...,0.000000,0.000000,0.111072,0.000000,0.000000,0.146878,0.067592,0.015477,0.018005,0.078362
1,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.184860,0.250185,0.042842,...,0.000000,0.000000,0.000000,0.334841,0.151063,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.100779,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.124585,0.000000,0.126987,0.253078,0.000000,0.000000
3,0.155156,0.000000,0.000000,1.000000,0.000000,0.027540,0.185999,0.019417,0.103317,0.000000,...,0.000000,0.219708,0.118544,0.065468,0.010728,0.051270,0.068213,0.117281,0.079754,0.000000
4,0.000000,0.000000,0.000000,0.000000,1.000000,0.056933,0.041681,0.000000,0.036957,0.025481,...,0.009494,0.000000,0.010423,0.030202,0.041761,0.041977,0.000000,0.016574,0.020493,0.070330
5,0.055320,0.000000,0.000000,0.027540,0.056933,1.000000,0.000000,0.124353,0.041928,0.016330,...,0.000000,0.108990,0.016673,0.041267,0.356336,0.000000,0.024808,0.022356,0.133563,0.141060
6,0.000000,0.000000,0.000000,0.185999,0.041681,0.000000,1.000000,0.134900,0.141514,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.164301,0.022761,0.000000,0.109039,0.036468,0.081728
7,0.043896,0.184860,0.100779,0.019417,0.000000,0.124353,0.134900,1.000000,0.000000,0.000000,...,0.118622,0.072733,0.019598,0.005640,0.012556,0.000000,0.244796,0.025505,0.004717,0.012495
8,0.113562,0.250185,0.000000,0.103317,0.036957,0.041928,0.141514,0.000000,1.000000,0.116336,...,0.009298,0.143923,0.139648,0.364165,0.147936,0.000000,0.018818,0.159488,0.130804,0.105691
9,0.000000,0.042842,0.000000,0.000000,0.025481,0.016330,0.000000,0.000000,0.116336,1.000000,...,0.000000,0.000000,0.039761,0.039225,0.000000,0.000000,0.021077,0.000000,0.095659,0.008270


#### 共起語リストの作成

In [22]:
# 類似度行列をリスト形式へ変換
sim0_stack = sim0_df.stack()
# print(sim0_stack)
# 単語インデックスと類似度を分割
index = pd.Series(sim0_stack.index.values)
value = pd.Series(sim0_stack.values)

print(index.head())
print(value.head())

0    (0, 0)
1    (0, 1)
2    (0, 2)
3    (0, 3)
4    (0, 4)
dtype: object
0    1.000000
1    0.000000
2    0.000000
3    0.155156
4    0.000000
dtype: float64


In [17]:
tmp3 = []
tmp4 = []
for i in range(len(index)):
    # 類似度が0.5以上0.9以下の単語ペアを抽出
    if value[i] >=0.5 and value[i] <= 0.9:
        # 単語ペアを抽出
        tmp1 = str(index[i][0]) + ' ' + str(index[i][1])
        print(tmp1)
        tmp2 = [int(s) for s in tmp1.split()]
        tmp3.append(tmp2)
        # 類似度を抽出
        tmp4 = np.append(tmp4, value[i])

# 共起語リストを作成
tmp3 = pd.DataFrame(tmp3)
tmp3 = tmp3.rename(columns={0:'node1', 1:'node2'})
tmp4 = pd.DataFrame(tmp4)
tmp4 = tmp4.rename(columns={0:'weight'})
sim0_list = pd.concat([tmp3, tmp4], axis=1)

# 作成したリストを確認
sim0_list.head()

1 148
2 76
2 101
2 310
4 61
7 24
7 45
7 88
7 98
7 167
7 185
7 239
7 267
7 313
7 421
9 29
9 204
9 411
12 390
18 40
18 438
21 36
21 181
21 193
21 202
21 359
22 268
23 393
23 410
24 7
24 88
24 98
24 167
24 185
24 239
24 368
24 421
28 371
28 410
29 9
29 204
29 411
35 125
36 21
36 181
36 193
36 202
36 359
39 121
39 279
40 18
40 116
40 317
42 396
44 112
44 164
44 192
44 252
44 283
44 396
44 437
44 441
44 466
44 493
44 497
45 7
45 88
45 98
45 167
45 185
45 239
45 368
45 421
46 60
46 186
46 219
46 267
46 272
46 386
46 392
46 486
47 203
48 414
49 422
50 128
50 233
52 104
52 195
52 346
52 447
54 90
54 158
54 310
56 315
57 299
57 323
57 409
60 46
60 186
60 486
61 4
61 137
63 110
63 124
63 132
63 133
63 370
63 441
63 470
63 494
64 164
64 403
64 446
64 484
65 68
65 197
65 246
65 367
65 408
65 416
66 121
66 279
67 167
67 267
67 368
68 65
68 117
68 173
68 197
68 246
68 260
68 264
68 277
68 306
68 362
68 367
68 408
68 416
68 441
68 464
69 101
73 109
73 455
74 321
74 505
75 77
75 99
75 150
75 184
75 21

Unnamed: 0,node1,node2,weight
0,1,148,0.539952
1,2,76,0.559733
2,2,101,0.564668
3,2,310,0.522363
4,4,61,0.521283


In [8]:
# 練習問題5
# labelが1の単語文書行列のみ選択
docterm_1 = docterm_df[docterm_df['label'] == 1]
docterm_1 = docterm_1.drop('label', axis=1)

# 単語ペアのコサイン類似度を計算
sim1 = cosine_similarity(docterm_1.T)
sim1_df = pd.DataFrame(sim1)

# 類似度を行列形式で確認
sim1_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,497,498,499,500,501,502,503,504,505,506
0,1.000000,0.020351,0.157256,0.103388,0.000000,0.000000,0.061962,0.062154,0.073794,0.064370,...,0.138581,0.0,0.0,0.049541,0.023284,0.000000,0.105989,0.000000,0.142215,0.240337
1,0.020351,1.000000,0.109777,0.052887,0.000000,0.050628,0.063129,0.063864,0.155105,0.076726,...,0.025167,0.0,0.0,0.097312,0.092807,0.015529,0.004187,0.000000,0.057902,0.123801
2,0.157256,0.109777,1.000000,0.117946,0.052325,0.050844,0.055315,0.063515,0.100650,0.217393,...,0.502812,0.0,0.0,0.249436,0.003845,0.000000,0.003127,0.000000,0.086483,0.237121
3,0.103388,0.052887,0.117946,1.000000,0.000000,0.000000,0.050548,0.000000,0.081133,0.000000,...,0.088532,0.0,0.0,0.000000,0.148354,0.094445,0.098515,0.000000,0.000000,0.068841
4,0.000000,0.000000,0.052325,0.000000,1.000000,0.052980,0.120871,0.008906,0.075461,0.089360,...,0.000000,0.0,0.0,0.000000,0.013621,0.000000,0.041333,0.000000,0.000000,0.002236
5,0.000000,0.050628,0.050844,0.000000,0.052980,1.000000,0.041396,0.007816,0.003862,0.008115,...,0.000000,0.0,0.0,0.077805,0.007683,0.000000,0.000000,0.000000,0.000000,0.024043
6,0.061962,0.063129,0.055315,0.050548,0.120871,0.041396,1.000000,0.245917,0.208667,0.149250,...,0.017183,0.0,0.0,0.059807,0.320213,0.000000,0.160390,0.000000,0.000000,0.179289
7,0.062154,0.063864,0.063515,0.000000,0.008906,0.007816,0.245917,1.000000,0.017629,0.336784,...,0.118864,0.0,0.0,0.000000,0.109797,0.095293,0.073319,0.122075,0.000000,0.114754
8,0.073794,0.155105,0.100650,0.081133,0.075461,0.003862,0.208667,0.017629,1.000000,0.061067,...,0.016908,0.0,0.0,0.038367,0.037966,0.062103,0.003466,0.000000,0.000000,0.092717
9,0.064370,0.076726,0.217393,0.000000,0.089360,0.008115,0.149250,0.336784,0.061067,1.000000,...,0.256489,0.0,0.0,0.234124,0.102917,0.064364,0.025822,0.126737,0.000000,0.126287


In [9]:
# 練習問題6
# 類似度行列をリスト形式へ変換
sim1_stack = sim1_df.stack()

# 単語インデックスと類似度を分割
index = pd.Series(sim1_stack.index.values)
value = pd.Series(sim1_stack.values)

print(index.head())
print(value.head())

0    (0, 0)
1    (0, 1)
2    (0, 2)
3    (0, 3)
4    (0, 4)
dtype: object
0    1.000000
1    0.020351
2    0.157256
3    0.103388
4    0.000000
dtype: float64


In [10]:
# 練習問題6
tmp3 = []
tmp4 = []
for i in range(len(index)):
    # 類似度が0.5以上0.9以下の単語ペアを抽出
    if value[i] >=0.5 and value[i] <= 0.9:
        # 単語ペアを抽出
        tmp1 = str(index[i][0]) + ' ' + str(index[i][1])
        tmp2 = [int(s) for s in tmp1.split()]
        tmp3.append(tmp2)
        # 類似度を抽出
        tmp4 = np.append(tmp4, value[i])

# 共起語リストを作成
tmp3 = pd.DataFrame(tmp3)
tmp3 = tmp3.rename(columns={0:'node1', 1:'node2'})
tmp4 = pd.DataFrame(tmp4)
tmp4 = tmp4.rename(columns={0:'weight'})
sim1_list = pd.concat([tmp3, tmp4], axis=1)

# 作成したリストを確認
sim1_list.head()

Unnamed: 0,node1,node2,weight
0,2,497,0.502812
1,3,89,0.541855
2,4,383,0.662268
3,4,446,0.615523
4,5,155,0.56313
