In [46]:
import jieba
import re
import copy
from datetime import datetime
from pymongo import MongoClient
import pprint

In [47]:
### Stopword list: one entry per line of stopwords.txt.
# Read through a context manager so the file handle is closed deterministically,
# and decode explicitly as UTF-8 (the encoding main() writes with) rather than
# relying on the platform default.
with open("stopwords.txt", "r", encoding="utf-8") as f:
    stopwords_set = {line.strip() for line in f}

### Synonym dictionary: maps each variant to the first column of its line.
# Every line of syn.txt is tab-separated: canonical<TAB>variant<TAB>variant...
syn_dict = {}
with open("syn.txt", "r", encoding="utf-8") as f:
    for line in f:
        # Split once per line instead of once per variant.
        canonical, *variants = line.strip("\n").split("\t")
        for word in variants:
            syn_dict[word] = canonical

### Chinese word-segmentation dictionary
jieba.load_userdict("dict.txt")

### News stored in MongoDB
client = MongoClient("mongodb://127.0.0.1:27017")
db = client["<db name>"]
collection = db["<collection name>"]                 # <<<=== enter the name of your own news collection
# cleaned_news() reads "title" and "datetime" as well as "content", so all three
# fields have to be part of the projection.
news_list_temp = collection.find({}, {"_id": 0, "title": 1, "datetime": 1, "content": 1})
news_list = list(news_list_temp)

In [48]:
def stopwords(w):
    """Filter a token against the stopword list.

    Returns ``w`` unchanged when it is not contained in ``stopwords_set``
    and ``None`` when it is.
    """
    return None if w in stopwords_set else w

def syn(w):
    """Replace a token by its entry in the synonym dictionary.

    Returns ``syn_dict[w]`` when ``w`` is a key of ``syn_dict``; otherwise
    returns ``w`` itself.
    """
    return syn_dict.get(w, w)

def cut(news):
    """Segment ``news`` with jieba.

    Calls ``jieba.cut`` with ``cut_all=False`` and ``HMM=True`` and hands
    back whatever that call returns.
    """
    return jieba.cut(news, cut_all=False, HMM=True)

def regular(w):
    """Extract the runs of Chinese characters contained in ``w``.

    Returns the list of all matches of one or more consecutive characters in
    the range U+4E00-U+9FA5, or ``None`` when ``w`` contains no such run.
    """
    chinese_runs = re.findall('[\u4e00-\u9fa5]+', w)
    return chinese_runs or None

In [49]:
def text_cleaning(paragraph):
    """Turn raw text into a space-separated string of cleaned tokens.

    The paragraph is segmented with ``cut``. A token is kept only if it passes
    every step below, in this order:

    1. it is longer than one character;
    2. it does not start with one of the characters 三 四 五 六 七 十;
    3. ``regular`` finds at least one run of Chinese characters in it
       (only the first run is used);
    4. ``stopwords`` does not reject that run.

    Each surviving run is passed through ``syn`` before being collected.

    Args:
        paragraph: the text to clean.

    Returns:
        The cleaned tokens joined by single spaces; an empty string when no
        token survives.
    """
    me_words = []
    for token in cut(paragraph):
        if len(token) <= 1:
            continue
        # Inside a character class "|" is a literal character, not an
        # alternation, so the characters are listed directly. With the pipes
        # in the class, tokens starting with "|" were discarded as well.
        if re.match(r"^[三四五六七十]", token):
            continue
        runs = regular(token)
        if runs is None:
            continue
        kept = stopwords(runs[0])
        if kept is None:
            continue
        me_words.append(syn(kept))
    return " ".join(me_words)

In [50]:
def cleaned_news(news_list):
    """Return cleaned copies of the given news documents.

    The work is done on a deep copy, so ``news_list`` and the dicts inside it
    are not modified. For every document:

    * "title"    - zero-width spaces (U+200B) and ideographic spaces (U+3000)
                   are removed;
    * "datetime" - the stripped string is parsed with the format "%Y-%m-%d"
                   into a ``datetime`` object;
    * "content"  - the text is replaced by the result of ``text_cleaning``.

    "title" and "datetime" are only normalised when they are present as
    strings. A document that lacks them (for example because it was fetched
    with a projection that only selects "content") or that already carries a
    non-string value is passed through for that field instead of raising.

    Args:
        news_list: list of news dicts; each must contain "content".

    Returns:
        A new list with one cleaned dict per input document, in input order.

    Raises:
        KeyError: if a document has no "content" field.
        ValueError: if a "datetime" string does not match "%Y-%m-%d".
    """
    cleaned_news_list = []
    for news in copy.deepcopy(news_list):
        if isinstance(news.get("title"), str):
            news["title"] = news["title"].replace("\u200b", "").replace("\u3000", "")
        if isinstance(news.get("datetime"), str):
            news["datetime"] = datetime.strptime(news["datetime"].strip(), "%Y-%m-%d")
        news["content"] = text_cleaning(news["content"])
        cleaned_news_list.append(news)
    return cleaned_news_list

In [51]:
def main():
    """Clean the loaded news, store it in MongoDB and dump it to All_news.txt.

    Reads the module-level ``client`` and ``news_list``. The cleaned documents
    are inserted into the target collection, and the "content" strings of all
    documents are written, separated by single spaces, to ``All_news.txt``
    (UTF-8) in the current working directory.
    """
    # NOTE(review): this placeholder is spelled "db name" while the source
    # database at the top of the file uses "<db name>" - fill in both.
    db_cleaned = client["db name"]
    collection = db_cleaned["<collection name>"]  # <<<=== enter the name of the target collection

    # Clean once and reuse the result; running cleaned_news() a second time
    # would repeat the whole segmentation pass for identical output.
    cleaned = cleaned_news(news_list)

    # insert_many() rejects an empty document list, so skip the write when
    # there is nothing to store.
    if cleaned:
        collection.insert_many(cleaned)

    ## All news data
    # Join the strings directly. Going through str(list) and stripping
    # brackets, quotes and commas would also delete those characters if they
    # occurred inside a content string.
    All_news = " ".join(n["content"] for n in cleaned)
    with open("All_news.txt", "w", encoding="utf-8") as f:
        f.write(All_news)

In [52]:
# Entry point: run main() when this file is executed directly rather than imported.
if __name__ == "__main__":
    main()  