In [1]:
from ckiptagger import WS
from collections import Counter
import json
import re

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### data input

In [2]:
def get_data(path):
    """Read the UTF-8 encoded JSON file at ``path`` and return the parsed object."""
    with open(path, 'r', encoding='utf-8') as source:
        return json.loads(source.read())

In [3]:
def write_counter(counter, filename):
    """Dump a Counter to ``filename`` as tab-separated lines.

    Each line holds the item, its count, and its share of the total count
    as a percentage, ordered from most to least common.
    """
    with open(filename, 'w+', encoding='utf-8') as out:
        total = sum(counter.values())
        out.writelines(
            '{}\t{}\t{:7.4f}%\n'.format(token, freq, freq/total*100)
            for token, freq in counter.most_common()
        )

In [4]:
# Load the 'result' list from each of the three query dumps.
data1, data2, data3 = (
    get_data(json_path)['result']
    for json_path in ('肺炎.json', '武漢肺炎.json', '新冠肺炎.json')
)

In [5]:
# Show the first two records of each dataset, one dataset per line.
print(data1[:2], data2[:2], data3[:2], sep='\n')

[{'title': '武漢肺炎蔓延中該如何預防？一次看懂新型、SARS、MERS的差別', 'content': '近來引發關注的中國大陸武漢不明原因肺炎，根據中國大陸提供的資訊，病原體初步\n判定為新型冠狀病毒，它與過往的SARS、MERS又有什麼不同？我們該如何預防？'}, {'title': '衛生福利部疾病管制署: 首頁', 'content': '中國大陸武漢發生肺炎疫情 ... 500 1000 1500 2000 12/31 通報日 通報數 擴大監測\n送驗 居家檢疫送驗 法定傳染病通報 COVID-19(武漢肺炎) 監測趨勢圖-依通報來源.'}]
[{'title': '嚴重特殊傳染性肺炎- 衛生福利部疾病管制署', 'content': '2020年1月9日 ... 2019年12月以來，湖北省武漢市展開呼吸道疾病及相關疾病監測，發現不明原因\n病毒性肺炎病例。個案臨床表現主要為發熱，少數病人呼吸困難，\xa0...'}, {'title': '武漢肺炎即時疫情最新整理不斷更新懶人包', 'content': '武漢肺炎延燒，目前最新確診人數、死亡人數有多少？想得到最新最快最正確的新冠\n肺炎數據，請看Yahoo整理的新冠肺炎即時疫情數據整理。'}]
[{'title': '新冠肺炎特別報導- 中時電子報', 'content': '武漢肺炎/新冠肺炎covid-19最新新聞,疫情統計追蹤,特別報導-中時電子報.'}, {'title': '武漢肺炎即時疫情最新整理不斷更新懶人包', 'content': '武漢肺炎延燒，目前最新確診人數、死亡人數有多少？想得到最新最快最正確的新冠\n肺炎數據，請看Yahoo整理的新冠肺炎即時疫情數據整理。'}]


In [6]:
# Split every dataset into parallel lists of titles and content snippets.
titles1, titles2, titles3 = (
    [entry['title'] for entry in results]
    for results in (data1, data2, data3)
)
content1, content2, content3 = (
    [entry['content'] for entry in results]
    for results in (data1, data2, data3)
)

### brute-force get grams

In [7]:
def ngram(sentences, n):
    """Return every length-``n`` contiguous slice of each item in ``sentences``.

    Slices are collected in input order into one flat list. Items shorter
    than ``n`` contribute nothing.
    """
    return [
        sent[start:start + n]
        for sent in sentences
        for start in range(len(sent) - n + 1)
    ]

In [8]:
# For every title and content string, keep only its runs of word characters
# (this drops punctuation and whitespace and splits the text at those points).
prune_content1, prune_content2, prune_content3 = (
    [re.findall(r'\w+', text) for text in titles + contents]
    for titles, contents in (
        (titles1, content1),
        (titles2, content2),
        (titles3, content3),
    )
)

In [9]:
# Display the first three regex-split entries of the first dataset as a sanity check.
prune_content1[:3]

[['武漢肺炎蔓延中該如何預防', '一次看懂新型', 'SARS', 'MERS的差別'],
 ['衛生福利部疾病管制署', '首頁'],
 ['針對武漢肺炎研發及製作快篩試劑及疫苗之作為', '科技部']]

In [10]:
def get_all_gram(sentences, n):
    """Collect all grams of length 2 through ``n`` from ``sentences``.

    Each element of ``sentences`` is itself passed to ``ngram`` as a list of
    text pieces. Results are ordered by gram length first, then by input order.
    """
    collected = []
    for size in range(2, n + 1):
        for sent in sentences:
            collected.extend(ngram(sent, size))
    return collected

In [11]:
# Build the 2- to 6-length grams for each dataset.
allgrams1, allgrams2, allgrams3 = (
    get_all_gram(pieces, 6)
    for pieces in (prune_content1, prune_content2, prune_content3)
)

In [13]:
# Display the first ten grams generated for the first dataset.
allgrams1[:10]

['武漢', '漢肺', '肺炎', '炎蔓', '蔓延', '延中', '中該', '該如', '如何', '何預']

In [14]:
# Tally how often each brute-force gram occurs in each dataset.
c1, c2, c3 = map(Counter, (allgrams1, allgrams2, allgrams3))

In [15]:
# Twenty most frequent brute-force grams in the first dataset.
c1.most_common(20)

[('肺炎', 236),
 ('傳染', 83),
 ('武漢', 79),
 ('專區', 79),
 ('嚴重', 79),
 ('20', 79),
 ('染性', 77),
 ('性肺', 77),
 ('傳染性', 77),
 ('染性肺', 77),
 ('性肺炎', 77),
 ('傳染性肺', 77),
 ('染性肺炎', 77),
 ('傳染性肺炎', 77),
 ('重特', 76),
 ('特殊', 76),
 ('嚴重特', 76),
 ('重特殊', 76),
 ('嚴重特殊', 76),
 ('漢肺', 74)]

In [16]:
# Twenty most frequent brute-force grams in the second dataset.
c2.most_common(20)

[('肺炎', 247),
 ('武漢', 186),
 ('漢肺', 176),
 ('武漢肺', 176),
 ('漢肺炎', 175),
 ('武漢肺炎', 175),
 ('20', 104),
 ('疫情', 86),
 ('19', 74),
 ('防疫', 67),
 ('CO', 55),
 ('OV', 55),
 ('VI', 55),
 ('ID', 55),
 ('COV', 55),
 ('OVI', 55),
 ('VID', 55),
 ('COVI', 55),
 ('OVID', 55),
 ('COVID', 55)]

In [17]:
# Twenty most frequent brute-force grams in the third dataset.
c3.most_common(20)

[('肺炎', 229),
 ('新冠', 194),
 ('冠肺', 183),
 ('新冠肺', 183),
 ('冠肺炎', 183),
 ('新冠肺炎', 183),
 ('20', 121),
 ('疫情', 70),
 ('專區', 58),
 ('02', 55),
 ('防疫', 51),
 ('202', 51),
 ('020', 51),
 ('2020', 51),
 ('19', 49),
 ('0年', 40),
 ('20年', 39),
 ('CO', 38),
 ('020年', 38),
 ('2020年', 38)]

Write the brute-force gram counts to output files.

In [19]:
# Persist the brute-force gram frequencies, one output file per dataset.
write_counter(counter=c1, filename='brute-force-1.out')
write_counter(counter=c2, filename='brute-force-2.out')
write_counter(counter=c3, filename='brute-force-3.out')

### ckiptagger

In [20]:
# Instantiate the ckiptagger word segmenter.
# NOTE(review): './data' is presumably the directory holding the downloaded
# ckiptagger model files — confirm it exists before running this cell.
ws = WS('./data')


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



In [None]:
# (not used now)
# punctuations = ['！', '？', '｡', '＂', '＃', '＄', '％', '＆', '＇', '（', '）', '＊', '＋', '，', '－', '／', '：', '；', '＜', '＝', '＞', '＠', '［', '＼', '］', '＾', '＿', '｀', '｛', '｜', '｝', '～', '｟', '｠', '｢', '｣', '､', '、', '〃', '》', '「', '」', '『', '』', '【', '】', '〔', '〕', '〖', '〗', '〘', '〙', '〚', '〛', '〜', '〝', '〞', '〟', '〰', '〾', '〿', '–', '—', '‘', '\'', '‛', '“', '”', '„', '‟', '…', '‧', '﹏', '.']

In [23]:
def filter_punc(seg):
    """Return the tokens of ``seg`` that contain at least one word character.

    Tokens made up solely of non-word characters (punctuation, whitespace,
    or the empty string) are dropped; surviving tokens are kept unchanged.
    """
    kept = []
    for token in seg:
        # Strip every non-word character; anything left means the token has content.
        remainder = re.sub(r'[^\w]', '', token)
        if remainder:
            kept.append(token)
    return kept

In [24]:
# Segment every title and content string with ckiptagger, then drop
# punctuation-only tokens from each segmented sentence.
seg1, seg2, seg3 = (
    [filter_punc(tokens) for tokens in ws(titles + contents)]
    for titles, contents in (
        (titles1, content1),
        (titles2, content2),
        (titles3, content3),
    )
)

In [25]:
# Display the first three segmented, punctuation-filtered entries of the first dataset.
seg1[:3]

[['武漢',
  '肺炎',
  '蔓延',
  '中',
  '該',
  '如何',
  '預防',
  '一',
  '次',
  '看懂',
  '新型',
  'SARS',
  'MERS',
  '的',
  '差別'],
 ['衛生', '福利部', '疾病', '管制署', '首', '頁'],
 ['針對', '武漢', '肺炎', '研發', '及', '製作', '快', '篩試劑', '及', '疫苗', '之', '作為', '科技部']]

In [26]:
def get_all_gram(sentences, n):
    """Return every word plus every joined word-gram of length 2 through ``n``.

    ``sentences`` is a list of token lists. The result starts with all single
    tokens in input order, followed by the concatenated 2-grams, 3-grams, and
    so on up to ``n``.

    NOTE: this redefines the ``get_all_gram`` declared earlier in the notebook
    for the brute-force section; cells run after this one use this version.
    """
    single_words = [word for sent in sentences for word in sent]
    joined_grams = [
        ''.join(window)
        for size in range(2, n + 1)
        for window in ngram(sentences, size)
    ]
    return single_words + joined_grams

In [27]:
# Build single words plus 2- and 3-word grams for each segmented dataset.
allgram1, allgram2, allgram3 = (
    get_all_gram(segmented, 3)
    for segmented in (seg1, seg2, seg3)
)

In [28]:
# Display the first ten items of the word-level gram list for the first dataset.
allgram1[:10]

['武漢', '肺炎', '蔓延', '中', '該', '如何', '預防', '一', '次', '看懂']

In [29]:
# Tally the word-level grams; this rebinds c1..c3, replacing the brute-force counters.
c1, c2, c3 = map(Counter, (allgram1, allgram2, allgram3))

In [30]:
# Ten most frequent word-level grams in the first dataset.
c1.most_common(10)

[('肺炎', 208),
 ('專區', 79),
 ('嚴重', 79),
 ('武漢', 77),
 ('傳染性', 77),
 ('特殊', 76),
 ('嚴重特殊', 76),
 ('傳染性肺炎', 76),
 ('特殊傳染性', 75),
 ('嚴重特殊傳染性', 75)]

In [31]:
# Ten most frequent word-level grams in the second dataset.
c2.most_common(10)

[('肺炎', 235),
 ('武漢', 176),
 ('武漢肺炎', 172),
 ('疫情', 86),
 ('防疫', 67),
 ('因應', 53),
 ('專區', 51),
 ('的', 47),
 ('肺炎疫情', 47),
 ('及', 46)]

In [32]:
# Ten most frequent word-level grams in the third dataset.
c3.most_common(10)

[('新冠肺炎', 181),
 ('肺炎', 123),
 ('新冠', 90),
 ('疫情', 70),
 ('的', 60),
 ('專區', 58),
 ('新', 50),
 ('防疫', 50),
 ('2020年', 38),
 ('最', 36)]

Write the ckiptagger gram counts to output files.

In [33]:
# Persist the ckiptagger-based gram frequencies, one output file per dataset.
write_counter(counter=c1, filename='ckiptagger-1.out')
write_counter(counter=c2, filename='ckiptagger-2.out')
write_counter(counter=c3, filename='ckiptagger-3.out')