# 斷詞
以下使用中研院ckip斷詞套件將貼文與留言內容斷詞

In [1]:
# !pip install tensorflow ckiptagger gdown pandas

In [2]:
import re
import multiprocessing as mp

import numpy as np
import pandas as pd

from ckiptagger import data_utils, construct_dictionary, WS, POS

In [3]:
# 載入ckip斷詞模型
ws = WS("./src/ckip_data/")
pos = POS("./src/ckip_data/")



In [4]:
# 載入停用字與我定義的字典
from data.my_configs import remove_patterns, predefined_words, positive_words, negative_words, selected_pos_types
# 停用字這些是要在斷詞前先拿掉的
print(f'斷詞前要拿掉的字詞 共{len(remove_patterns)} 個 e.g. {remove_patterns[:5]}')
# 這是提供給斷詞套件參考的詞典
print(f'我預先定義的字詞 共{len(predefined_words)}個 e.g. {predefined_words[:5]}')
# 這是預先下載好了正向與負向情緒用字
print(f'正向用字: 共{len(positive_words)}個 e.g. {positive_words[:5]}')
print(f'負向用字: 共{len(negative_words)}個 e.g. {negative_words[:5]}')

斷詞前要拿掉的字詞 共130 個 e.g. ['\n', '新聞雲', '快下載新聞雲App掌握政治大小事', '熱門話題一手掌握', '下載']
我預先定義的字詞 共55個 e.g. ['振興券', '消費券', '三倍券', '三倍', '經濟部']
正向用字: 共2810個 e.g. ['一帆風順', '一帆風順的', '一流', '一致', '一致的']
負向用字: 共8274個 e.g. ['一下子爆發', '一下子爆發的一連串', '一巴掌', '一再', '一再叮囑']


In [5]:
# 依照詞典的內容，建構給ckip斷詞套件的辭典
word_to_weight = {word : 1 for word in predefined_words+positive_words+negative_words}
dictionary = construct_dictionary(word_to_weight)

# 貼文斷詞

In [6]:
#讀入貼文
posts = pd.read_csv("./data/refined/posts_v1.csv")

#把小編發文內容與連結中的新聞標題組在一起
posts['content_and_link_title'] = posts['post_content'].fillna('') + posts['link_title'].fillna('')

#把停用字詞拿掉
for remove_pattern in remove_patterns:
    posts['content_and_link_title'] = posts['content_and_link_title'].str.replace(remove_pattern, "")

  posts['content_and_link_title'] = posts['content_and_link_title'].str.replace(remove_pattern, "")
  posts['content_and_link_title'] = posts['content_and_link_title'].str.replace(remove_pattern, "")


In [7]:
%%time
# 斷詞
## ckip cut
post_seg_list = ws(
    posts['content_and_link_title'],
    sentence_segmentation = True, # To consider delimiters
    coerce_dictionary = dictionary, # words in this dictionary are forced
)

post_seg_list = [[ seg.replace("\n","") for seg in seg_list] for seg_list in post_seg_list]
post_pos_list = pos(post_seg_list)

CPU times: user 1min 6s, sys: 10.4 s, total: 1min 17s
Wall time: 14.8 s


In [8]:
# 把貼文的斷詞結果和詞性存成檔案
with open("./data/refined/post_seg.txt",'w') as f:
    for line in post_seg_list:
        f.write(" ".join(line))
        f.write("\n")
with open("./data/refined/post_pos.txt",'w') as f:
    for line in post_pos_list:
        f.write(" ".join(line))
        f.write("\n")

# 留言斷詞

In [9]:
%%time
comments = pd.read_csv("./data/refined/comments.csv")
# 去除空的留言，空的留言可能是只有留貼圖或其他無法辨識的內容
comments = comments[comments['comment'].notnull()].reset_index(drop=True) 

# 去除不要的字詞
for remove_pattern in remove_patterns:
    comments['comment'] = comments['comment'].str.replace(remove_pattern, "")
    
# 將每則貼文底下，同一個人的所有留言歸到同一則內
comments_by_author = comments.groupby(['post_url','author'])['comment'].apply(lambda x:"".join(x)).reset_index()

# 去除留言中tag人名，有些人的留言純粹只是想請朋友來看，我們暫時先把這類型排除在分析之外
name_pattern_ser = [pat.replace(" ","") for pat in comments_by_author['author'].unique()]
name_pattern = "|".join(name_pattern_ser)

pool = mp.Pool(processes=max(mp.cpu_count() - 1 ,1) )
pool_res = pool.starmap( re.sub,[[name_pattern, "", row] for row in comments_by_author['comment']])
pool.close()
pool.join()
comments_by_author['comment_no_ppl_tag'] = [i for i in pool_res]



CPU times: user 25.4 s, sys: 794 ms, total: 26.2 s
Wall time: 4min 39s


In [10]:
comments_by_author.head()

Unnamed: 0,post_url,author,comment,comment_no_ppl_tag
0,https://www.facebook.com/ETtoday/posts/3150078...,Angel Lee,現在不促進觀光時候全民自主管理最安全先把那個錢省下來防疫工作不知道何時才會停止還有隔離醫療先...,現在不促進觀光時候全民自主管理最安全先把那個錢省下來防疫工作不知道何時才會停止還有隔離醫療先...
1,https://www.facebook.com/ETtoday/posts/3150078...,Barry Shin,對於疫情處理好壞看看周圍亞洲各國你應該慶幸在台灣這篇留言串不知道什麼吊出很多群情激憤網民看起...,對於疫情處理好壞看看周圍亞洲各國你應該慶幸在台灣這篇留言串不知道什麼吊出很多群情激憤網民看起...
2,https://www.facebook.com/ETtoday/posts/3150078...,Bau Bear,好事把消費留在國內,好事把消費留在國內
3,https://www.facebook.com/ETtoday/posts/3150078...,Benny Hsieh,賴宥蓁不然呢哪個建設和政策不花人民錢很正常,不然呢哪個建設和政策不花人民錢很正常
4,https://www.facebook.com/ETtoday/posts/3150078...,Bling Bling Lin,花菲菲撐過215國內觀光就不怕,花菲菲撐過215國內觀光就不怕


In [11]:
%%time
#ckip cut
comment_seg_list = ws(
    comments_by_author['comment_no_ppl_tag'],
    sentence_segmentation = True, # To consider delimiters
    coerce_dictionary = dictionary, # words in this dictionary are forced
)
comment_seg_list = [[ seg.replace("\n","") for seg in seg_list] for seg_list in comment_seg_list]
comment_pos_list = pos(comment_seg_list)



CPU times: user 1h 46min 14s, sys: 15min 24s, total: 2h 1min 38s
Wall time: 22min 25s


In [12]:
# 把留言的斷詞結果和詞性存成檔案
with open("./data/refined/comment_seg.txt",'w') as f:
    for line in comment_seg_list:
        f.write(" ".join(line))
        f.write("\n")
with open("./data/refined/comment_pos.txt",'w') as f:
    for line in comment_pos_list:
        f.write(" ".join(line))
        f.write("\n")

In [13]:
# 
exclude_pos = True
if exclude_pos:
    print(f'選擇詞性{selected_pos_types}')
    posts['word_list'] = [[seg for seg, pos in zip(seg_list, pos_list) if pos in selected_pos_types] for seg_list, pos_list in zip(post_seg_list, post_pos_list)]
    comments_by_author['word_list'] = [[seg for seg, pos in zip(seg_list, pos_list) if pos in selected_pos_types] for seg_list, pos_list in zip(comment_seg_list, comment_pos_list)]
else:
    posts['word_list'] = post_seg_list
    comments_by_author['word_list'] = comment_seg_list

選擇詞性['A', 'Na', 'Nb', 'Nc', 'Ncd', 'Nd', 'Nv', 'VA', 'VAC', 'VB', 'VC', 'VCL', 'VD', 'VE', 'VF', 'VG', 'VH', 'VHC', 'VI', 'VJ', 'VK', 'V_2', 'FW']


In [14]:
%%time
posts['sentiment_pos'] = posts['word_list'].apply(lambda x:sum([i in x for i in positive_words]))
posts['sentiment_neg'] = posts['word_list'].apply(lambda x:sum([i in x for i in negative_words]))
posts['sentiment_score'] = (posts['sentiment_pos']/(posts['sentiment_pos']+posts['sentiment_neg']))#.fillna(0.5).mean()

posts["sentiment_score_filled"] = posts['sentiment_score'] 
posts["sentiment_score_filled"] = posts.groupby("fanpage")['sentiment_score'].transform(lambda x: x.fillna(x.mean()))

comments_by_author['sentiment_pos'] = comments_by_author['comment_no_ppl_tag'].apply(lambda x:sum([i in x for i in positive_words]))
comments_by_author['sentiment_neg'] = comments_by_author['comment_no_ppl_tag'].apply(lambda x:sum([i in x for i in negative_words]))
comments_by_author['sentiment_score'] = (comments_by_author['sentiment_pos']/(comments_by_author['sentiment_pos']+comments_by_author['sentiment_neg']))#.fillna(0.5).mean()
comments_by_author["sentiment_score_filled"] = comments_by_author['sentiment_score'].fillna(0.5)


CPU times: user 1min 59s, sys: 495 ms, total: 2min
Wall time: 2min


In [15]:
posts.to_csv("./data/refined/post_with_seg.csv",index=None)
comments_by_author.to_csv("./data/refined/comments_by_author_with_seg.csv",index=None)