In [1]:
import csv
from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger

# Initialize drivers
ws_driver  = CkipWordSegmenter(model="bert-base", device=0)
pos_driver = CkipPosTagger(model="bert-base", device=0)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# open Gossiping-QA-Dataset.csv
with open('Gossiping-QA-Dataset-2_0.csv', newline='') as csvfile:
    # read Gossiping-QA-Dataset.csv
    rows = csv.reader(csvfile)
    # create a list to store the data
    data = []
    # skip the first row
    next(rows)
    # append the data to the list
    for row in rows:
        data.append(row)

sentences = []
for i in range(len(data)):
    sentences.append(data[i][0])
    sentences.append(data[i][1])


In [3]:
# replace the space ''
sentences = [s.replace(' ', '') for s in sentences]

In [4]:
len(sentences)

1548228

In [10]:
sea_sentences = [s for s in sentences if '海' in s]
print(len(sea_sentences))

11226


In [16]:
sea_sentences_seg = ws_driver(sea_sentences)
sea_sentences_pos = pos_driver(sea_sentences_seg)

Tokenization: 100%|██████████| 11226/11226 [00:00<00:00, 74724.86it/s]
Inference: 100%|██████████| 44/44 [00:01<00:00, 23.99it/s]
Tokenization: 100%|██████████| 11226/11226 [00:00<00:00, 98425.24it/s]
Inference: 100%|██████████| 53/53 [00:02<00:00, 23.73it/s]


In [21]:
# zip
sea_sentences = list(zip(sea_sentences_seg, sea_sentences_pos))

In [30]:
noun_tags = ['Na', 'Nb', 'Nc', 'Nd']  # Add or remove noun types as needed

filtered_sentences = []
for segs, tags in sea_sentences:
    filtered_segs = [seg for seg, tag in zip(segs, tags) if tag in noun_tags]
    filtered_sentences.append(filtered_segs)

print(filtered_sentences)
# frequency count of the words
from collections import Counter
word_counts = Counter()
for sentence in filtered_sentences:
    word_counts.update(sentence)
# print top 20
print(word_counts.most_common(20))

# convert the word_counts into a csv and save
with open('ptt_noun_sea_counts.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['word', 'count'])
    for word, count in word_counts.most_common():
        writer.writerow([word, count])

[['海陸', '娘', '炮兵'], ['海陸', '基地', '常態'], ['海鮮', '生魚片', '海鮮', '飲料'], ['海洛英', '古柯鹼'], ['昨天', '海', '首映會'], ['情侶', '麥當勞', '海賊王'], ['大魚海棠', '八卦'], ['海馬', '杏子', '海馬', '大樓'], ['吳昇桓', '海堂', '高中', '眉村'], ['海倫清桃'], ['路克', '彌海砂', '彌海砂'], ['海洋', '首選'], ['人', '海倫清桃', '前公', '戴', '先生'], ['肥宅', '北海道'], ['立偉', '麵食', '福州丸', '李海', '控肉飯'], ['上海', '東京'], ['遼寧號', '台海'], ['船', '公海', '八卦'], ['龍五', '龍九', '船', '海珊', '人', '船', '領海'], ['海悅', '國民黨', '關係'], ['飛機', '海關', '人權', '嫌疑'], ['海島型', '氣候', '台灣', '阿'], ['海關', '人'], ['行李', '海關'], ['俄羅斯', '帝國', '海外', '殖民地'], ['瀏海', '額頭', '運氣'], ['瀏海'], ['人', '現在', '海賊王', '單行本'], ['時候'], ['空氣', '瀏海', '警政署'], ['海賊王', '完結篇'], ['海線', '八卦', '海', '保育類'], ['海關', '機器', '功能'], ['美國', '中國', '大使館', '領事館', '海軍', '陸戰隊', '佩槍'], ['海大', '商船', '邊緣人'], ['海棠', '小姐', '頭髮'], ['琉球群島', '海域', '海', '遺跡'], ['師', '國共', '上海', '戰爭'], ['生物', '海豚'], ['台灣', '企業', '鴻海', '牛'], ['海內外', '大聯盟', '八卦'], ['海獅'], ['中共', '實彈', '台灣海峽'], ['黃海', '航母'], ['國中生', '腦子', '刀', '山', '火海'], ['家', '長輩', '阿茲海默', '前兆'], ['海外', '美國'],

In [31]:
adj_tags = ['A', 'VH', 'VHC', 'VI', 'VJ', 'VK', 'VL']
adj_filtered_sentences = []
for segs, tags in sea_sentences:
    filtered_segs = [seg for seg, tag in zip(segs, tags) if tag in adj_tags]
    adj_filtered_sentences.append(filtered_segs)

# frequency count of the words
from collections import Counter
adj_word_counts = Counter()
for sentence in adj_filtered_sentences:
    adj_word_counts.update(sentence)
# print top 20
print(adj_word_counts.most_common(20))

# convert the word_counts into a csv and save
with open('ptt_adj_sea_counts.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['word', 'count'])
    for word, count in adj_word_counts.most_common():
        writer.writerow([word, count])

[('好', 462), ('大', 212), ('知道', 193), ('沒', 179), ('小', 143), ('強', 112), ('怎麼辦', 105), ('沒有', 96), ('死', 94), ('怎樣', 88), ('覺得', 87), ('讓', 87), ('一樣', 85), ('愛', 83), ('喜歡', 77), ('高', 73), ('好吃', 64), ('開始', 62), ('爽', 60), ('多', 58)]
