# Cofacts articles trending keywords extraction

### In this notebook, we demonstrate trending keyword extraction using [jieba](https://github.com/fxsjy/jieba). Specifically, given a date of interest, it outputs the top keywords of the articles created within the 1, 3, 7, and 30 days preceding that date.


In [1]:
# coding=utf-8

# import packages
import os
import sys
import numpy as np
import pandas as pd

%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt

import jieba
import jieba.posseg as pseg

from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer


In [2]:
# Collect the names of the JSON article files found in each data directory.
# `repo_files[i]` holds the file names listed from `repos[i]`.
repos = ['./raw_data']

repo_files = []
for repo in repos:
    # Match on the extension: a plain substring test ('json' in name) would
    # also pick up entries such as 'json_notes.txt' or 'a.json.bak'.
    files = [name for name in os.listdir(repo) if name.endswith('.json')]

    repo_files.append(files)

    print('There are '+ str(len(files)) +' files in '+repo )

There are 14908 files in ./raw_data


In [3]:
# Load every article file into a list of {'id', 'text', 'date'} records.
import json

article_list = []

# Pair each directory with its own file list so that every file is opened from
# the directory it was listed in, rather than always from repos[0].
for repo, files in zip(repos, repo_files):

    for file in files:

        # The articles contain Chinese text; decode explicitly as UTF-8 instead
        # of relying on the platform's default encoding.
        with open(os.path.join(repo, file), 'r', encoding='utf-8') as f:
            data = json.load(f)

        article = {
            'id': file,                 # the file name is used as the article id
            'text': data['text'],
            'date': data['createdAt'],  # creation timestamp as stored in the file
        }

        article_list.append(article)

# Show one record as a sanity check of the loaded structure.
print(article_list[0])
    


{'id': '30552.json', 'text': '痛心！臉書為何離開臺灣？ 蘇煥智透露秘辛 賴清德該負責\n\n前台南縣長蘇煥智今天在臉書發文指出，臉書原本想在台南科學園區旁建立雲端數據中心，但被當時台南市長賴清德嫌太耗電，且賴也不願意協助其取得百分之百綠電。臉書因電力不足放棄台南後曾想移到彰化，但賴又升到中央當行政院長，臉書至此完全絕望，將亞洲數據中心轉往新加坡。這麼好的發展智慧軟體的機會，卻讓臉書離開台灣，令人痛心。\n\nhttps://www.chinatimes.com/realtimenews/20190430003907-260407', 'date': '2019-05-02T09:20:19.000Z'}


In [4]:
# Build the article table, parse the timestamps and order it chronologically.
df = pd.DataFrame(article_list).assign(
    date=lambda frame: pd.to_datetime(frame.date)
)
df_sort = df.sort_values('date')
df_sort.head()

Unnamed: 0,id,text,date
8389,30604.json,"葡萄酒屬於強鹼性食品, 這個一定要看。\n癌症, 就這樣慢慢消失, 公德就這樣悄悄累積。 \...",2016-12-10 03:15:00+00:00
7321,20277.json,老人家冬天洗澡!! 《非常感謝這郵件的“始發者”和“轉發者”，對我太有益，很可能幫我“逃過一...,2016-12-16 12:00:00+00:00
14210,24800.json,轉貼\n\n重新認識九層塔\n\n 九層塔(香花草)證實會導致肝癌~\n\n 這一下，不得不...,2016-12-17 02:32:00+00:00
6613,24131.json,超強 ！用醋泡出來的8種養生偏方（治百病）...http://ezp9.com/p78327...,2016-12-17 02:34:00+00:00
10872,27162.json,天僅吃一勺一個月打通血管\n真後悔現在才知道！ 是真是假?!專家告訴你！...\nhttp:...,2016-12-17 02:35:00+00:00


In [5]:
# Select the articles that fall inside each look-back window before a date.
given_date = '2017-01-01' + 'T00:00:00.000Z'
time_windows = [1, 3, 7, 30]  # window lengths, in days

# Whole days elapsed from each article to the given date; the value is
# negative for articles created after the given date.
df_sort['given_date'] = pd.to_datetime(given_date)
df_sort['difference'] = (df_sort['given_date'] - df_sort['date']).dt.days

# Build one DataFrame per window, in the same order as `time_windows`.
# Explicit comparisons replace Series.between(..., inclusive=True): the boolean
# form of `inclusive` is rejected by pandas >= 2.0. Both bounds stay inclusive,
# so the selected rows are unchanged.
# NOTE(review): `difference` is truncated to whole days, so a window of N days
# also admits articles that are almost N + 1 days old -- confirm this is the
# intended meaning of "within N days".
documents = []
for time_window in time_windows:
    in_window = (df_sort['difference'] >= 0) & (df_sort['difference'] <= time_window)
    documents.append(df_sort[in_window])

print(documents[0])

               id                                               text  \
13343  22937.json                                            這個是真的嗎？   
4226   28369.json  女生ㄧ定要看：甲狀腺癌 \n  \n上週三，奧茲醫生在一個婦女節目中演講現今增長最快的女性癌...   
11934  25586.json  了慶祝Line破億下載率，請將此訊息傳給50個人，系統將自動統計，若以成功達成任務，可獲得[...   
5147   27706.json  大廚不外傳的22個小訣竅：\nhttp://myytaoli.blogspot.tw/201...   
7034   19788.json  Click to Watch > 【平馬樁_ 靜坐這一檔子事2 – 導引功法】 in HD ...   

                           date                given_date  difference  
13343 2016-12-30 04:04:00+00:00 2017-01-01 00:00:00+00:00         1.0  
4226  2016-12-30 04:04:00+00:00 2017-01-01 00:00:00+00:00         1.0  
11934 2016-12-30 16:34:00+00:00 2017-01-01 00:00:00+00:00         1.0  
5147  2016-12-31 07:21:00+00:00 2017-01-01 00:00:00+00:00         0.0  
7034  2016-12-31 14:46:00+00:00 2017-01-01 00:00:00+00:00         0.0  


In [8]:
# Extract the top keywords of each time window with jieba's TF-IDF extractor.
import jieba.analyse
topK = 100          # number of keywords kept per time window
withWeight = True   # also return each keyword's weight

# NOTE(review): the article text is Traditional Chinese. jieba's README points
# to a larger dictionary (dict.txt.big, loaded with jieba.set_dictionary) for
# better Traditional Chinese segmentation -- consider it if the file is available.

keywords_list = []
for document in documents: # loop over different time windows

    # Join the raw article texts into one string. str(list(...)) would instead
    # hand jieba the Python repr of the list: brackets, quotes and escaped
    # newline sequences, all of which pollute the segmentation.
    doc_texts = '\n'.join(document['text'].dropna().astype(str))

    tags = jieba.analyse.extract_tags(doc_texts, topK=topK, withWeight=withWeight)

    keywords_list.append(tags)

    # With weights, `tags` holds (keyword, weight) pairs; otherwise plain strings.
    if withWeight:
        for tag in tags:
            print("tag: %s\t\t weight: %.3f" % (tag[0],tag[1]))
    else:
        print(",".join(tags))

    print('==========')


tag: X光		 weight: 0.451
tag: 檢查		 weight: 0.282
tag: 奧茲		 weight: 0.226
tag: 醫生		 weight: 0.226
tag: 乳房		 weight: 0.192
tag: 這個		 weight: 0.169
tag: 甲狀		 weight: 0.169
tag: 一個		 weight: 0.169
tag: 節目		 weight: 0.169
tag: 甲狀腺護		 weight: 0.169
tag: 我們		 weight: 0.169
tag: 腺癌		 weight: 0.154
tag: 癌病		 weight: 0.131
tag: 增長		 weight: 0.113
tag: 牙齒		 weight: 0.113
tag: 甲狀腺		 weight: 0.113
tag: 隨機		 weight: 0.113
tag: 22		 weight: 0.113
tag: 病人		 weight: 0.096
tag: 脖子		 weight: 0.065
tag: 使用		 weight: 0.064
tag: 女性		 weight: 0.063
tag: 中演		 weight: 0.059
tag: 上週		 weight: 0.056
tag: 婦女		 weight: 0.056
tag: 講現		 weight: 0.056
tag: 這是		 weight: 0.056
tag: 有關		 weight: 0.056
tag: 當牙		 weight: 0.056
tag: 醫幫		 weight: 0.056
tag: 牙齒照		 weight: 0.056
tag: 片時		 weight: 0.056
tag: 那個		 weight: 0.056
tag: 機設備		 weight: 0.056
tag: 活動		 weight: 0.056
tag: 舉起		 weight: 0.056
tag: 保護甲		 weight: 0.056
tag: 狀腺		 weight: 0.056
tag: 許多牙		 weight: 0.056
tag: 醫懶		 weight: 0.056
tag: 還有		 weight: 0.056
tag: 一種		

In [11]:
# Write the keywords of every time window to a single JSON file.
output_repo = './output_keywords_result'
keywords_result_dict = {}

# exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
os.makedirs(output_repo, exist_ok=True)

keywords_result_dict['date'] = given_date

for time_window, keywords in zip(time_windows, keywords_list):

    # `keywords` must be (keyword, weight) pairs; weights are rounded to three
    # decimals to reduce the size of the output file.
    keywords_dict = {k: round(v, 3) for k, v in keywords}

    # Result keys look like '1d', '3d', '7d', '30d'.
    keywords_result_dict[str(time_window)+'d'] = keywords_dict

# ensure_ascii=False writes the Chinese keywords verbatim, so the file has to
# be opened as UTF-8 rather than with the platform's default encoding.
with open(os.path.join(output_repo, 'keywords.json'), 'w', encoding='utf-8') as f:
    json.dump(keywords_result_dict, f, ensure_ascii=False, indent=4)

print(keywords_result_dict)


{'date': '2017-01-01T00:00:00.000Z', '1d': {'X光': 0.451, '檢查': 0.282, '奧茲': 0.226, '醫生': 0.226, '乳房': 0.192, '這個': 0.169, '甲狀': 0.169, '一個': 0.169, '節目': 0.169, '甲狀腺護': 0.169, '我們': 0.169, '腺癌': 0.154, '癌病': 0.131, '增長': 0.113, '牙齒': 0.113, '甲狀腺': 0.113, '隨機': 0.113, '22': 0.113, '病人': 0.096, '脖子': 0.065, '使用': 0.064, '女性': 0.063, '中演': 0.059, '上週': 0.056, '婦女': 0.056, '講現': 0.056, '這是': 0.056, '有關': 0.056, '當牙': 0.056, '醫幫': 0.056, '牙齒照': 0.056, '片時': 0.056, '那個': 0.056, '機設備': 0.056, '活動': 0.056, '舉起': 0.056, '保護甲': 0.056, '狀腺': 0.056, '許多牙': 0.056, '醫懶': 0.056, '還有': 0.056, '一種': 0.056, '時用': 0.056, '醫院': 0.056, '問照': 0.056, '技師': 0.056, '技術員': 0.056, '抽屜裡': 0.056, '不經常': 0.056, '問為': 0.056, '什麼': 0.056, '技術員答': 0.056, '問才': 0.056, '會用': 0.056, '看過': 0.056, '怎麼': 0.056, '去問': 0.056, '雖然照': 0.056, '幾秒': 0.056, '幾分鐘': 0.056, '輻射': 0.056, '影響': 0.056, '以後記': 0.056, '每當': 0.056, '要照': 0.056, '記得': 0.056, '傳遞給': 0.056, '女兒': 0.056, '母親': 0.056, '告訴': 0.056, '腸的': 0.056, '轉發': 0.056, '了給'