In [1]:
import pandas as pd
import numpy
from collections import Counter
from snownlp import SnowNLP
from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker

In [2]:
%%time

# Get data
df = pd.read_csv('3c_news_190.csv', sep='|') 

# Tokenize content
ws = CkipWordSegmenter(model="albert-tiny")
pos = CkipPosTagger(model="albert-tiny")
ner = CkipNerChunker(model="albert-tiny")

tokens = ws(df.content)
tokens_pos = pos(tokens)
word_pos_pair = [list(zip(w, p)) for w, p in zip(tokens, tokens_pos)]

entity_list = ner(df.content)

allowPOS = ['Na', 'Nb', 'Nc', 'VA', 'VAC', 'VB', 'VC']

tokens_v2 = []
for wp in word_pos_pair:
    tokens_v2.append([w for w, p in wp if (len(w) >= 2) and p in allowPOS])

df['tokens'] = tokens
df['tokens_v2'] = tokens_v2
df['entities'] = entity_list
df['token_pos'] = word_pos_pair

# Count keyword frequencies
allowPOS = ['Na', 'Nb', 'Nc', 'VC']
def word_frequency(wp_pair):
    filtered_words = []
    for word, pos in wp_pair:
        if (pos in allowPOS) & (len(word) >= 2):
            filtered_words.append(word)
    counter = Counter(filtered_words)
    return counter.most_common(100)

keyfreqs = []
for wp in word_pos_pair:
    topwords = word_frequency(wp)
    keyfreqs.append(topwords)

df['top_key_freq'] = keyfreqs

# Get summary and sentiment
summary = []
sentiment = []
for text in df.content:
    
    sn = SnowNLP(text)
    summary.append(sn.summary())
    sentiment.append(round(sn.sentiments, 2))

df['summary'] = summary
df['sentiment'] = sentiment

# Rearrange dataframe
df = df[[
    'news_id', 'date','category', 'title', 'content', 'sentiment', 'summary',
    'top_key_freq', 'tokens', 'tokens_v2', 'entities', 'token_pos', 'link'
]]

# Save data
df.to_csv('3c_news_preprocessed_190.csv', sep='|', index=False)

print("Tokenize OK!")

Tokenization: 100%|██████████| 190/190 [00:01<00:00, 140.08it/s]
Inference: 100%|██████████| 3/3 [04:30<00:00, 90.33s/it] 
Tokenization: 100%|██████████| 190/190 [00:00<00:00, 200.32it/s]
Inference: 100%|██████████| 54/54 [56:08<00:00, 62.38s/it]
Tokenization: 100%|██████████| 190/190 [00:01<00:00, 96.22it/s] 
Inference: 100%|██████████| 3/3 [04:16<00:00, 85.47s/it] 


Tokenize OK!
CPU times: total: 3h 13min 54s
Wall time: 1h 9min 16s


## Read data

In [3]:
df2 = pd.read_csv('3c_news_preprocessed_190.csv', sep='|')
df2.head(6)

Unnamed: 0,news_id,date,category,title,content,sentiment,summary,top_key_freq,tokens,tokens_v2,entities,token_pos,link
0,3c_20230613_1,2023-06-13,3C生活,三星MicroLED電視首度登台，要價逼近450萬！為何有信心站穩台灣高階市場？,三星電子舉辦2023年首場電視發表會，宣布全球市場蟬聯第17年銷量第一，台灣市場銷量在去年也...,0.0,"['搶下更多高階電視市場分額', '三星高階電視最熱賣的機種NeoQLED量子MiniLED...","[('電視', 18), ('市場', 13), ('尺寸', 8), ('三星', 7),...","['三星', '電子', '舉辦', '2023年', '首', '場', '電視', '發...","['三星', '電子', '舉辦', '電視', '發表會', '全球', '市場', '台...","[NerToken(word='三星電子', ner='ORG', idx=(0, 4)),...","[('三星', 'Nb'), ('電子', 'Na'), ('舉辦', 'VC'), ('2...",https://www.bnext.com.tw/article/75654/samsung...
1,3c_20230613_2,2023-06-13,3C生活,VisionPro給蘋果強心針！股價創一年多新高，重回市值3兆美元只差一步,蘋果公司(AAPL-US)周一(12日)收盤股價創新高，凸顯大型科技股重拾領頭羊的氣勢，該公...,0.0,"['投資人也對蘋果擴大零售鏈的計畫樂觀以對', '蘋果是不受市況左右而能一直獲得投資人青睞的...","[('蘋果', 14), ('公司', 3), ('投資人', 3), ('計畫', 3),...","['蘋果', '公司', '(', 'AAPL-US)', '周一', '(', '12日)...","['蘋果', '公司', '股價', '大型', '科技股', '重拾', '領頭羊', '...","[NerToken(word='周一', ner='DATE', idx=(13, 15))...","[('蘋果', 'Na'), ('公司', 'Nc'), ('(', 'PARENTHESI...",https://www.bnext.com.tw/article/75644/apple-3...
2,3c_20230612_3,2023-06-12,3C生活,「紅魔8Pro」螢幕顯示中國台灣，一查竟揪米客邦違法進口！NCC要罰了,3C代理商米客邦(伊瑪格科技有限公司)自3月於網路募資平台flyingV募資預售「紅魔8Pr...,0.0,"['販賣未經審驗合格之智慧型手機', '應先確認該款手機是否經NCC審驗合格', '未經NC...","[('手機', 10), ('審驗', 5), ('消費者', 4), ('網路', 3),...","['3C', '代理商', '米客邦', '(', '伊瑪格', '科技', '有限公司',...","['代理商', '米客邦', '伊瑪格', '科技', '網路', '平台', '募資', ...","[NerToken(word='米客邦(伊瑪格科技有限公司', ner='ORG', idx...","[('3C', 'FW'), ('代理商', 'Na'), ('米客邦', 'Nb'), (...",https://www.bnext.com.tw/article/75632/redmagi...
3,3c_20230609_4,2023-06-09,3C生活,VisionPro實測｜殺手級應用在哪？近視能戴嗎？沉浸感如何？一次解答,這是第一次，我感覺到文字的匱乏與無力。WWDC23第二天，我代表愛范兒，成為了這個世界上最早...,0.0,"['成為了這個世界上最早體驗到VisionPro的一批人', '這種極佳的體驗還來自於視覺和...","[('蘋果', 16), ('空間', 12), ('體驗', 11), ('佩戴', 9)...","['這', '是', '第一', '次', '，', '我', '感覺到', '文字', '...","['文字', '世界', '現場', '錄影錄', '面對', '高科技', '設備', '...","[NerToken(word='第一', ner='ORDINAL', idx=(2, 4)...","[('這', 'Nep'), ('是', 'SHI'), ('第一', 'Neu'), ('...",https://www.bnext.com.tw/article/75588/vision-...
4,3c_20230609_5,2023-06-09,3C生活,華為WatchBuds內藏超迷你耳機、小米手環打Switch拳擊，新款智慧錶亮點一次看,"「預計2028年，全球的智慧手表市場規模會達到582.1億美元（約新台幣1兆7,800萬元）...",0.0,"['可聽歌3小時、通話2.5小時', '可支援聽歌4小時、通話2.5小時', '最創新的是「...","[('小米', 6), ('功能', 6), ('手表', 5), ('小時', 5), (...","['「', '預計', '2028年', '，', '全球', '的', '智慧', '手表...","['全球', '智慧', '手表', '市場', '規模', '新台幣', '研調', '機...","[NerToken(word='2028年', ner='DATE', idx=(3, 8)...","[('「', 'PARENTHESISCATEGORY'), ('預計', 'VE'), (...",https://www.bnext.com.tw/article/75557/smart-w...
5,3c_20230609_6,2023-06-09,3C生活,華碩Zenfone10現身，「相機」藏最大亮點？搭載高通最新處理器、AI運算也提升,華碩最新5G旗艦手機Zenfone10現身了！華碩宣布，6月29日晚間9點將線上發表最新5G...,0.0,"['華碩Zenfone10線上發表會將於6月29日晚間9點舉行', '6月29日晚間9點將線...","[('華碩', 5), ('旗艦', 3), ('手機', 2), ('發表', 2), (...","['華碩', '最', '新5G', '旗艦', '手機', 'Zenfone10', '現...","['華碩', '旗艦', '手機', '現身', '華碩', '宣布', '發表', '旗艦...","[NerToken(word='華碩', ner='ORG', idx=(0, 2)), N...","[('華碩', 'Nb'), ('最', 'Dfa'), ('新5G', 'VH'), ('...",https://www.bnext.com.tw/article/75605/asus-ze...
