In [2]:
import pandas as pd
import numpy
from collections import Counter
from snownlp import SnowNLP
from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker

In [5]:
%%time

# Get data
# The raw news table is pipe-delimited.
df = pd.read_csv('category_news_665.csv', sep='|')

# Hand the CKIP drivers a plain list of strings rather than a pandas Series.
texts = df['content'].tolist()

# Tokenize content
ws = CkipWordSegmenter(model="albert-tiny")
pos = CkipPosTagger(model="albert-tiny")
ner = CkipNerChunker(model="albert-tiny")

# One token list per article, then one POS-tag list per token list.
# NOTE(review): the recorded run of this cell took several hours, most of it
# in one inference pass — consider caching the result instead of re-running.
tokens = ws(texts)
tokens_pos = pos(tokens)
# Pair every token with its tag: one list of (word, pos) tuples per article.
word_pos_pair = [list(zip(w, p)) for w, p in zip(tokens, tokens_pos)]

entity_list = ner(texts)

# POS tags kept in `tokens_v2`. This list is deliberately given its own name:
# the keyword-frequency step further down binds `allowPOS` to a different,
# narrower tag list, and sharing one name for both filters made the result of
# re-running either part depend on which assignment ran last.
TOKEN_ALLOW_POS = ['Na', 'Nb', 'Nc', 'VA', 'VAC', 'VB', 'VC']

# Keep only words of at least two characters whose tag is in the list above.
tokens_v2 = [
    [w for w, p in wp if len(w) >= 2 and p in TOKEN_ALLOW_POS]
    for wp in word_pos_pair
]

df['tokens'] = tokens
df['tokens_v2'] = tokens_v2
df['entities'] = entity_list
df['token_pos'] = word_pos_pair

# Count keyword frequencies
# Default POS tags counted by word_frequency().
allowPOS = ['Na', 'Nb', 'Nc', 'VC']
def word_frequency(wp_pair, allowed_pos=None, min_len=2, top_n=100):
    """Return the most frequent words of one document, filtered by POS tag.

    Parameters
    ----------
    wp_pair : iterable of (word, pos) pairs
        The tagged tokens of a single document.
    allowed_pos : container of str, optional
        POS tags whose words are counted. When omitted, the module-level
        ``allowPOS`` list is used (looked up at call time).
    min_len : int, default 2
        Words shorter than this many characters are ignored.
    top_n : int or None, default 100
        Maximum number of entries returned; ``None`` returns every word.

    Returns
    -------
    list of (word, count) tuples
        Sorted by descending count, as produced by ``Counter.most_common``.
        Empty when no word passes the filter.
    """
    if allowed_pos is None:
        # Resolved here, not in the signature, so rebinding allowPOS later
        # still takes effect for callers that rely on the default.
        allowed_pos = allowPOS
    # `tag` (not `pos`) keeps the loop variable from shadowing the POS tagger
    # object that the notebook stores under the name `pos`.
    counter = Counter(
        word
        for word, tag in wp_pair
        if tag in allowed_pos and len(word) >= min_len
    )
    return counter.most_common(top_n)

# Top (word, count) keywords for every article, in the same order as the rows.
keyfreqs = [word_frequency(pairs) for pairs in word_pos_pair]

df['top_key_freq'] = keyfreqs

# Get summary and sentiment
# One SnowNLP document per article; both values are derived from it.
# NOTE(review): the preview of the saved file shows sentiment 0.0 for every
# displayed row — confirm that SnowNLP's sentiment score is meaningful for
# this corpus before relying on the column.
snow_docs = [SnowNLP(text) for text in df.content]
summary = [doc.summary() for doc in snow_docs]
sentiment = [round(doc.sentiments, 2) for doc in snow_docs]

df['summary'] = summary
df['sentiment'] = sentiment

# Rearrange dataframe
# Keep only the columns listed below, in this order.
df = df.loc[:, [
    'news_id', 'date', 'category', 'title', 'content', 'sentiment', 'summary',
    'top_key_freq', 'tokens', 'tokens_v2', 'entities', 'token_pos', 'link',
]]

# Save data
# Pipe-delimited, without the index column. CSV stores text only, so the
# list-valued columns are written out as their string representation.
df.to_csv('category_news_preprocessed_665.csv', sep='|', index=False)

print("Tokenize OK!")

Tokenization: 100%|██████████| 665/665 [00:02<00:00, 236.36it/s]
Inference: 100%|██████████| 9/9 [09:55<00:00, 66.17s/it]
Tokenization: 100%|██████████| 665/665 [00:01<00:00, 391.99it/s]
Inference: 100%|██████████| 223/223 [5:25:04<00:00, 87.47s/it]  
Tokenization: 100%|██████████| 665/665 [00:03<00:00, 200.10it/s]
Inference: 100%|██████████| 9/9 [08:47<00:00, 58.64s/it]


Tokenize OK!
CPU times: total: 21h 3min 21s
Wall time: 5h 51min 45s


## Read data

In [7]:
# Reload the preprocessed table written by the cell above (pipe-delimited).
# No converters are given, so the list-like columns come back as plain strings.
df2 = pd.read_csv(filepath_or_buffer='category_news_preprocessed_665.csv', sep='|')
# Preview the first six rows.
df2.head(n=6)

Unnamed: 0,news_id,date,category,title,content,sentiment,summary,top_key_freq,tokens,tokens_v2,entities,token_pos,link
0,bnext_20230612_1,2023-06-12,5G通訊,3GPP在台登場，談6G規範！聯發科、中華電信都來了，台廠搶得到6G通訊先機？,全球行動通訊標準組織3GPP（3rdGenerationPartnershipProject...,0.0,"['與國際夥伴共同推動下一個世代、也就是6G通訊發展的技術規範', '共同推動未來通訊技術發...","[('通訊', 20), ('技術', 18), ('網路', 14), ('台灣', 11...","['全球', '行動', '通訊', '標準', '組織', '3GPP', '（', '3...","['全球', '行動', '通訊', '標準', '組織', '舉辦', '會員', '大會...","[NerToken(word='3GPP', ner='ORG', idx=(10, 14)...","[('全球', 'Nc'), ('行動', 'Na'), ('通訊', 'Na'), ('標...",https://www.bnext.com.tw/article/75636/3gpp-th...
1,bnext_20230605_2,2023-06-05,5G通訊,5G專頻專網來了，首2年打4折！申請資格、限制是什麼？辦法重點一次看,「5G專頻專網」自6月5日起開放企業申請，台灣將正式邁入5G專頻專網時代！根據數位發展部公布...,0.0,"['數位部簡化5G專頻專網申請程序', '不得為陸資事業5G專網申請資格主要分作「一般申請者...","[('專網', 19), ('網路', 17), ('企業', 11), ('審驗', 9)...","['「', '5G', '專頻', '專網', '」', '自', '6月', '5日', ...","['專網', '開放', '企業', '台灣', '專網', '時代', '發展部', '專...","[NerToken(word='6月5日', ner='DATE', idx=(9, 13)...","[('「', 'PARENTHESISCATEGORY'), ('5G', 'FW'), (...",https://www.bnext.com.tw/article/75519/moda-5g...
2,bnext_20230526_3,2023-05-26,5G通訊,郭水義與股東攻防12小時！中華電信加薪僅千元、將來銀行虧損問題，他怎麼接招？,中華電信26日召開股東會，新任董事長郭水義首次主持。一如既往，員工股東踴躍發言提建議、爭取自...,0.0,"['中華電信工會「動員」許多員工股東來到現場提出加薪案', '員工股東聚焦在「加薪」方針',...","[('員工', 12), ('股東', 10), ('中華', 9), ('電信', 9),...","['中華', '電信', '26日', '召開', '股東會', '，', '新任', '董...","['中華', '電信', '召開', '股東會', '董事長', '郭水義', '主持', ...","[NerToken(word='中華電信', ner='ORG', idx=(0, 4)),...","[('中華', 'Nc'), ('電信', 'Na'), ('26日', 'Nd'), ('...",https://www.bnext.com.tw/article/75410/chunghw...
3,bnext_20230522_4,2023-05-22,5G通訊,台灣大哥大營收創新高！林之晨自曝2大關鍵原因，還想做「科技電信航空母艦」,台灣大哥大總經理林之晨就任滿4周年，他在臉書上分享，就整體營運來看，台灣大哥大繼逆轉6年長期...,0.0,"['將為台灣大電信本業帶來新的營收與獲利', '台灣大哥大運用電信與個資保護技術等天賦', ...","[('台灣', 33), ('大哥大', 27), ('電信', 20), ('用戶', 1...","['台灣', '大哥大', '總經理', '林之晨', '就', '任滿', '4', '周...","['台灣', '大哥大', '總經理', '林之晨', '臉書', '整體', '營運', ...","[NerToken(word='台灣', ner='GPE', idx=(0, 2)), N...","[('台灣', 'Nc'), ('大哥大', 'Na'), ('總經理', 'Na'), (...",https://www.bnext.com.tw/article/75335/jamie-l...
4,bnext_20230519_5,2023-05-19,5G通訊,遠傳併亞太，公平會受理了！井琪：最快Q4完成，如何在新三雄競爭中勝出？,針對電信業最受矚目的遠傳、亞太合併案，遠傳總經理井琪10日出席活動時發表最新進度，「今年第四...,0.0,"['」在遠傳合併亞太、台灣大哥大合併台灣之星後', '當遠傳合併亞太、台灣大哥大合併台灣之星...","[('遠傳', 24), ('電信', 15), ('台灣', 12), ('合併', 10...","['針對', '電信業', '最', '受', '矚目', '的', '遠傳', '、', ...","['電信業', '矚目', '遠傳', '合併案', '遠傳', '總經理', '井琪', ...","[NerToken(word='遠傳', ner='ORG', idx=(10, 12)),...","[('針對', 'P'), ('電信業', 'Na'), ('最', 'Dfa'), ('受...",https://www.bnext.com.tw/article/74761/fet-fac...
5,bnext_20230505_6,2023-05-05,5G通訊,中華電信新任董座交接！郭水義接任董事長喊「市值站穩兆元」，聚焦三面向,中華電信今（5）日召開臨時董事會通過董事長人事案，現任董事長謝繼茂卸任，由總經理郭水義接任董...,0.0,"['新任中華電信董事長郭水義曾任中華電信財務長、總經理等要職', '「事業三艦」意指個人家庭...","[('中華', 17), ('電信', 16), ('郭水義', 15), ('公司', 1...","['中華', '電信', '今', '（5）', '日', '召開', '臨時', '董事會...","['中華', '電信', '召開', '董事會', '通過', '董事長', '人事案', ...","[NerToken(word='中華電信', ner='ORG', idx=(0, 4)),...","[('中華', 'Nc'), ('電信', 'Na'), ('今', 'Nd'), ('（5...",https://www.bnext.com.tw/article/75144/gun-cn-...
