In [1]:
import re
from ast import literal_eval

import pandas as pd
pd.options.display.max_rows = 999


from gensim.models import KeyedVectors
from sklearn.cluster import KMeans

from data.my_configs import positive_words, negative_words



## setup

In [2]:
%%time
# Pre-trained word2vec model published by a third party; per its author it was
# trained on the full Chinese Wikipedia text.
# https://github.com/lintseju/word_embedding
# NOTE: loading the text-format vectors took about 3 minutes of wall time in the recorded run.
wiki_word2vec_model = KeyedVectors.load_word2vec_format("./src/wiki/zh_wiki_word2vec_300.txt")

CPU times: user 3min 5s, sys: 2.5 s, total: 3min 8s
Wall time: 3min 13s


# 載入斷詞結果

In [3]:
# Read the pre-segmented posts and the per-author comments.
posts = pd.read_csv("./data/refined/post_with_seg.csv")
comments_by_author = pd.read_csv("./data/refined/comments_by_author_with_seg.csv")

# The CSV stores each 'word_list' cell as the text of a Python literal;
# literal_eval turns it back into the original object (presumably a list of words).
posts['word_list'] = posts['word_list'].map(literal_eval)
comments_by_author['word_list'] = comments_by_author['word_list'].map(literal_eval)

# 貼文分群

In [4]:
# A word must occur MORE than this many times across all posts to be analysed.
post_word_threshold = 0
# Flatten to one entry per (post, word) occurrence.
temp = posts.set_index(['post_url'])['word_list'].rename('word_count').explode()
# Name the counts column explicitly: since pandas 2.0 the result of value_counts()
# is called "count", so pd.DataFrame(temp.value_counts()) no longer produces a
# 'word_count' column. rename_axis(None) keeps the word index unnamed on every version.
val_count = temp.value_counts().rename('word_count').rename_axis(None).to_frame()
val_count['word'] = val_count.index.values
# Keep words longer than one character whose frequency exceeds the threshold.
val_count = val_count[(val_count['word'].apply(len) > 1) & (val_count['word_count'] > post_word_threshold)]

In [5]:
# Look up the wiki word2vec vector of every retained word; words that are not in
# the model's vocabulary become None and are dropped.
post_embedding = (
    val_count['word']
    .apply(lambda word: wiki_word2vec_model[word] if word in wiki_word2vec_model else None)
    .dropna()
)
# One row per word, one column per embedding dimension.
post_embedding_df = pd.DataFrame.from_dict(
    {word: vector for word, vector in zip(post_embedding.index, post_embedding.values)}
).transpose()
# The printed shape is (number of words found in the model, embedding size).
print(post_embedding_df.shape)


(1945, 300)


In [6]:
%%time
# within = []

num_cluster = 5
post_kmeans_model = KMeans(n_clusters=num_cluster)
post_embedding_df['cluster'] = post_kmeans_model.fit_predict(post_embedding_df[[i for i in range(300)]])
# within.append(kmeans_model.inertia_)

CPU times: user 1.7 s, sys: 374 ms, total: 2.08 s
Wall time: 362 ms


In [7]:
# Collect the words of each cluster into a set (single-character words are skipped).
# reset_index() exposes the word index as a column named 'index'.
post_cluster_ser = (
    post_embedding_df['cluster']
    .reset_index()
    .groupby('cluster')['index']
    .agg(list)
    .apply(lambda words: {word for word in words if len(word) > 1})
)
post_cluster_df = post_cluster_ser.to_frame().rename(columns={'index': 'cluster_words'})


In [8]:
# Words listed in exactly one lexicon; a word present in both lists counts for neither side.
# Built once here instead of re-testing both lexicons for every word of every cluster.
positive_only = set(positive_words) - set(negative_words)
negative_only = set(negative_words) - set(positive_words)

# 'cluster_words' holds sets, so the size of the intersection is the number of matching words.
post_cluster_df['sentiment_pos'] = post_cluster_df['cluster_words'].apply(lambda words: len(words & positive_only))
post_cluster_df['sentiment_neg'] = post_cluster_df['cluster_words'].apply(lambda words: len(words & negative_only))
# Share of positive words among all sentiment-bearing words in the cluster.
# A cluster without any lexicon word yields 0/0, i.e. NaN.
post_cluster_df['sentiment_score'] = post_cluster_df['sentiment_pos'] / (
    post_cluster_df['sentiment_pos'] + post_cluster_df['sentiment_neg']
)


In [9]:
# Show, per post-word cluster, its word set, the positive/negative word counts and the score.
post_cluster_df

Unnamed: 0_level_0,cluster_words,sentiment_pos,sentiment_neg,sentiment_score
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,"{新聞, 弱勢, 官員, 防範, 破損, 立場, 避開, 不當, 動態, 點火, 回應, 缺...",53,68,0.438017
1,"{鄭文燦, 好市, 北門, 基隆, 淡水, 海線, 嘉義, 秘書長, 市長, 立委, 中藝,...",0,0,
2,"{藍色, 破綻, 大門, 絞碎, 沒用, 痛罵, 發錢, 齊發, 走走, 小鎮, 蛋糕, 褲...",42,61,0.407767
3,"{人次, 市集, 付錢, 偽鈔, 日租, 綁定, 搶客, 手續費, 攤位, 舉債, 事務機,...",5,8,0.384615
4,"{回填, 理由, 出去, 成分, 以下, 最近, 控制, 收到, 做好, 效果, 正面, n...",58,48,0.54717


# 留言分群

In [10]:
# A word must occur MORE than this many times across all comments to be analysed.
comment_word_threshold = 20
# Flatten to one entry per (post, author, word) occurrence.
temp = comments_by_author.set_index(['post_url','author'])['word_list'].rename('word_count').explode()
# Name the counts column explicitly: since pandas 2.0 the result of value_counts()
# is called "count", so pd.DataFrame(temp.value_counts()) no longer produces a
# 'word_count' column. rename_axis(None) keeps the word index unnamed on every version.
val_count = temp.value_counts().rename('word_count').rename_axis(None).to_frame()
val_count['word'] = val_count.index.values
# Keep words longer than one character whose frequency exceeds the threshold.
val_count = val_count[(val_count['word'].apply(len) > 1) & (val_count['word_count'] > comment_word_threshold)]

In [11]:
# Look up the wiki word2vec vector of every retained word; words that are not in
# the model's vocabulary become None and are dropped.
comment_embedding = (
    val_count['word']
    .apply(lambda word: wiki_word2vec_model[word] if word in wiki_word2vec_model else None)
    .dropna()
)
# One row per word, one column per embedding dimension.
comment_embedding_df = pd.DataFrame.from_dict(
    {word: vector for word, vector in zip(comment_embedding.index, comment_embedding.values)}
).transpose()
# The printed shape is (number of words found in the model, embedding size).
print(comment_embedding_df.shape)


(3893, 300)


In [12]:
%%time
# within = []

num_cluster = 5
comment_kmeans_model = KMeans(n_clusters=num_cluster)
comment_embedding_df['cluster'] = comment_kmeans_model.fit_predict(comment_embedding_df[[i for i in range(300)]])
# within.append(kmeans_model.inertia_)

CPU times: user 3.43 s, sys: 869 ms, total: 4.3 s
Wall time: 582 ms


In [13]:
# Collect the words of each cluster into a set (single-character words are skipped).
# reset_index() exposes the word index as a column named 'index'.
comment_cluster_ser = (
    comment_embedding_df['cluster']
    .reset_index()
    .groupby('cluster')['index']
    .agg(list)
    .apply(lambda words: {word for word in words if len(word) > 1})
)
comment_cluster_df = comment_cluster_ser.to_frame().rename(columns={'index': 'cluster_words'})

In [14]:
# Words listed in exactly one lexicon; a word present in both lists counts for neither side.
# Built once here instead of re-testing both lexicons for every word of every cluster.
positive_only = set(positive_words) - set(negative_words)
negative_only = set(negative_words) - set(positive_words)

# 'cluster_words' holds sets, so the size of the intersection is the number of matching words.
comment_cluster_df['sentiment_pos'] = comment_cluster_df['cluster_words'].apply(lambda words: len(words & positive_only))
comment_cluster_df['sentiment_neg'] = comment_cluster_df['cluster_words'].apply(lambda words: len(words & negative_only))
# Share of positive words among all sentiment-bearing words in the cluster.
# A cluster without any lexicon word yields 0/0, i.e. NaN.
comment_cluster_df['sentiment_score'] = comment_cluster_df['sentiment_pos'] / (
    comment_cluster_df['sentiment_pos'] + comment_cluster_df['sentiment_neg']
)


In [15]:
# Show, per comment-word cluster, its word set, the positive/negative word counts and the score.
comment_cluster_df

Unnamed: 0_level_0,cluster_words,sentiment_pos,sentiment_neg,sentiment_score
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,"{新聞, 弱勢, 官員, 總機, 權利, 連署, 立場, 立委, 不當, 回應, 貪官, 查...",16,75,0.175824
1,"{所有人, 百科全书, 理由, 收回, 普遍, 字眼, 智慧, 最近, 以下, 控制, 本土...",105,108,0.492958
2,"{緊縮, 收據, 不動產, pay, 電話費, 消費券, 貸款, 年金, 帳單, 勞力, 掏...",9,20,0.310345
3,"{藍色, 基隆, 順序, 避開, 教會, 泰國, 歐美, 缺點, 部門, 障礙, 台南, 選...",89,63,0.585526
4,"{驚訝, 出去, 感動, 打炮, 走走, 漂亮, 蛋糕, 被當, 排骨, 外面, 同學, 見...",107,320,0.250585
