In [1]:
import pandas as pd
import numpy
from collections import Counter
from snownlp import SnowNLP
from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker

In [3]:
%%time

# Get data
df = pd.read_csv('horoscopes.csv', sep='|')

# The CKIP drivers are documented to take a list of strings, so hand them a
# plain Python list instead of the pandas Series itself.
texts = df['content'].tolist()

# Tokenize content
ws = CkipWordSegmenter(model="albert-tiny")
pos = CkipPosTagger(model="albert-tiny")
ner = CkipNerChunker(model="albert-tiny")

tokens = ws(texts)
tokens_pos = pos(tokens)
# One list of (word, POS tag) tuples per article.
word_pos_pair = [list(zip(words, tags)) for words, tags in zip(tokens, tokens_pos)]

entity_list = ner(texts)

# POS tags kept when building `tokens_v2`.
# NOTE: `allowPOS` is reassigned later in this cell with a different tag set
# for the keyword-frequency step; this value only applies to `tokens_v2`.
allowPOS = ['Na', 'Nb', 'Nc', 'VA', 'VAC', 'VB', 'VC']

# Keep only words of at least two characters whose tag is in `allowPOS`.
tokens_v2 = [
    [word for word, tag in pairs if len(word) >= 2 and tag in allowPOS]
    for pairs in word_pos_pair
]

df['tokens'] = tokens
df['tokens_v2'] = tokens_v2
df['entities'] = entity_list
df['token_pos'] = word_pos_pair

# Count keyword frequencies
allowPOS = ['Na', 'Nb', 'Nc', 'VC']
def word_frequency(wp_pair, allowed_pos=None, min_length=2, top_n=200):
    """Return the most frequent keywords among (word, POS tag) pairs.

    Parameters
    ----------
    wp_pair : iterable of (str, str)
        (word, POS tag) tuples for one document.
    allowed_pos : iterable of str, optional
        POS tags to keep. When omitted, the module-level ``allowPOS`` is
        looked up at call time.
    min_length : int, default 2
        Minimum number of characters a word needs to be counted.
    top_n : int or None, default 200
        Maximum number of entries returned; ``None`` returns all of them.

    Returns
    -------
    list of (str, int)
        (word, count) tuples ordered from most to least frequent, as produced
        by ``Counter.most_common``.
    """
    if allowed_pos is None:
        allowed_pos = allowPOS
    allowed = frozenset(allowed_pos)
    # Logical `and` (not bitwise `&`); the loop variable is called `tag` so it
    # cannot be confused with the module-level `pos` tagger object.
    counter = Counter(
        word
        for word, tag in wp_pair
        if tag in allowed and len(word) >= min_length
    )
    return counter.most_common(top_n)

# Top keywords for every article, in the same order as the rows of `df`.
keyfreqs = list(map(word_frequency, word_pos_pair))

df['top_key_freq'] = keyfreqs

# Get summary and sentiment
# NOTE(review): SnowNLP's bundled models are presumably trained on Simplified
# Chinese, while this corpus looks like Traditional Chinese -- confirm whether
# the text should be converted before scoring.
summary = []
sentiment = []
for doc in map(SnowNLP, df.content):
    summary.append(doc.summary())
    # Sentiment score rounded to two decimals.
    sentiment.append(round(doc.sentiments, 2))

df['summary'], df['sentiment'] = summary, sentiment

# Rearrange dataframe
column_order = [
    'horoscope', 'article_year', 'article_month', 'content', 'sentiment', 'summary',
    'top_key_freq', 'tokens', 'tokens_v2', 'entities', 'token_pos',
]
df = df[column_order]

# Save data (pipe-separated, without the index column)
df.to_csv('articles_preprocessed.csv', sep='|', index=False)

print("Tokenize OK!")

Tokenization: 100%|██████████| 288/288 [00:00<00:00, 401.41it/s]
Inference: 100%|██████████| 3/3 [02:40<00:00, 53.49s/it]
Tokenization: 100%|██████████| 288/288 [00:00<00:00, 536.54it/s]
Inference: 100%|██████████| 86/86 [05:51<00:00,  4.08s/it]
Tokenization: 100%|██████████| 288/288 [00:00<00:00, 385.84it/s]
Inference: 100%|██████████| 3/3 [02:45<00:00, 55.32s/it]


Tokenize OK!
CPU times: total: 45min 43s
Wall time: 14min 12s


## Read data

In [5]:
# Reload the preprocessed articles written by the cell above.
# The list-valued columns (tokens, tokens_v2, top_key_freq, ...) come back
# from the CSV as plain strings, as the preview below shows.
df2 = pd.read_csv(
    'articles_preprocessed.csv',
    sep='|',
)
df2.head(n=6)

Unnamed: 0,horoscope,article_year,article_month,content,sentiment,summary,top_key_freq,tokens,tokens_v2,entities,token_pos
0,雙魚座,2022,12,家庭仍舊是你的生活重心，在2023年3月25日之前，家族中的某人依然需要你多加關注。這主要是...,0.03,"['12月7日滿月將落入雙子座16°', '金星是雙魚座代表他人錢財的8宮的守護星', '這...","[('滿月', 3), ('家庭', 2), ('生活', 2), ('某人', 2), (...","['家庭', '仍舊', '是', '你', '的', '生活', '重心', '，', '...","['家庭', '生活', '重心', '家族', '某人', '火星', '行運', '雙子...","[NerToken(word='2023年3月25日之前', ner='DATE', idx...","[('家庭', 'Na'), ('仍舊', 'D'), ('是', 'SHI'), ('你'..."
1,巨蟹座,2022,12,說到巨蟹座的工作啊，你一直選擇默默無聞地在幕後付出，作為掌控全局的角色。巨蟹座領導會毫不吝嗇...,0.0,"['12月23日的摩羯座新月將標誌著巨蟹和伴侶迎來新的紀元', '巨蟹也將更加關注自己的個人...","[('巨蟹座', 10), ('巨蟹', 7), ('工作', 5), ('木星', 4),...","['說到', '巨蟹座', '的', '工作', '啊', '，', '你', '一直', ...","['巨蟹座', '工作', '選擇', '幕後', '付出', '掌控', '角色', '巨...","[NerToken(word='巨蟹', ner='PRODUCT', idx=(84, 8...","[('說到', 'VE'), ('巨蟹座', 'Nb'), ('的', 'DE'), ('工..."
2,天蠍座,2022,12,天蠍座有兩個守護星，這兩顆行星與其他行星的互動通常對你的運勢非常重要。第一個守護星火星正位於...,0.0,"['也在考慮是做些投資還是保持現狀', '這說明你將擁有家宅穩定帶來的安全感和經濟收益', ...","[('火星', 4), ('財務', 4), ('天蠍座', 2), ('守護星', 2),...","['天蠍座', '有', '兩', '個', '守護星', '，', '這', '兩', '...","['天蠍座', '守護星', '行星', '行星', '運勢', '守護星', '火星', ...","[NerToken(word='天蠍座', ner='FAC', idx=(0, 3)), ...","[('天蠍座', 'Nb'), ('有', 'V_2'), ('兩', 'Neu'), ('..."
3,雙子座,2022,12,火星，這顆高能量的行星，已經在你的星座停留了異常長的時間，在8月20日首次進入到雙子座，並一...,0.0,"['12月23日的新月將與財務機會有關', '而且在你的一生中不會再與海王星一起落在雙魚座'...","[('雙子座', 4), ('木星', 4), ('雙魚座', 4), ('海王星', 4)...","['火星', '，', '這', '顆', '高能量', '的', '行星', '，', '...","['火星', '行星', '星座', '停留', '時間', '雙子座', '停留', '精...","[NerToken(word='8月20日', ner='DATE', idx=(30, 3...","[('火星', 'Nb'), ('，', 'COMMACATEGORY'), ('這', '..."
4,天秤座,2022,12,你關注的焦點並不局限於本土，而是放眼全球的，因為火星從8月20日起就在你的第九宮國際關係與企...,0.0,"['所以這兩顆行星在我們這輩子都不會再在雙魚座重逢了', '不過木星將在12月20日離開雙魚...","[('木星', 9), ('海王星', 6), ('雙魚座', 6), ('項目', 4),...","['你', '關注', '的', '焦點', '並', '不', '局限於', '本土', ...","['焦點', '本土', '全球', '火星', '國際', '關係', '企業宮', '營...","[NerToken(word='8月20日', ner='DATE', idx=(27, 3...","[('你', 'Nh'), ('關注', 'VJ'), ('的', 'DE'), ('焦點'..."
5,水瓶座,2022,12,一進入12月，就會感受到節日的熱鬧。你最喜歡的會是12月7日的那個雙子座滿月（影響期：12月...,0.0,"['你最喜歡的會是12月7日的那個雙子座滿月（影響期：12月5日-12月11日）', '這種...","[('影響', 3), ('節日', 2), ('雙子座', 2), ('主星', 2), ...","['一', '進入', '12月', '，', '就', '會', '感受到', '節日',...","['節日', '雙子座', '影響期', '點亮', '真愛宮', '土星', '發射', ...","[NerToken(word='12月', ner='DATE', idx=(3, 6)),...","[('一', 'D'), ('進入', 'VCL'), ('12月', 'Neu'), ('..."
