In [1]:
import jieba
import re
import copy
from datetime import datetime
from pymongo import MongoClient
import pprint
import jieba.posseg as pseg

In [2]:
from pymongo import MongoClient
client = MongoClient("mongodb://127.0.0.1:27017")
db = client["<db name>"]
collection = db["<collection name>"]
contents = [news["content"] for news in collection.find({},{"_id":0}).limit(100)]


In [3]:
### 載入停用字檔案，並做成一個list
stopwords_list = [line.strip() for line in open('stopwords.txt',"r").readlines()]

### 載入同義字檔案，並做成一個字典
syn_dict = {}
with open("syn.txt","r") as f :
    for line in f:
        for word in line.strip("\n").split("\t")[1:]:
            syn_dict[word] = line.strip("\n").split("\t")[0]

### 載入中文斷詞字典
jieba.set_dictionary("dict.txt")

# ### 載入MongoDB裡面的新聞
# client = MongoClient("mongodb://10.120.37.108:27017")
# db = client["fb"]
# collection = db["iw_posts"]   
# news_list_temp = collection.find({},{"_id":0})              
# news_list = list(news_list_temp)

In [4]:
def stopwords(w):
    if w not in stopwords_list:
        return w

def syn(w):
    if w in syn_dict.keys():
        w=syn_dict[w]
        return w
    else:
        return w

def cut(news):
    w = jieba.cut(news, cut_all=False ,HMM=True)
    return w

def regular(w):
    line = re.findall('[\u4e00-\u9fa5]+', w)
    if len(line) > 0:
        return line

In [5]:
def text_cleaning(paragraph):
    me_words = []
    words = cut(paragraph) 
    for w in words:
        if w.startswith("一"or"三"or"四"or"五"or"六"or"七"or"八"or"九"or"十") == False :
            if len(w)>1:
                w = regular(w)
                if w is not None:
                    w_stopwords = stopwords(w[0])
                    if w_stopwords is not None:
                        w_syn = syn(w_stopwords)
                        me_words.append(w_syn)
    return " ".join(me_words)

In [6]:
def cleaned_news(news_list):
    cleaned_news_list = []
    for news in copy.deepcopy(news_list):
        if 'message' in news:
            news["message"] = text_cleaning(news["message"])
            cleaned_news_list.append(news)
    return cleaned_news_list

In [8]:
def noun_cleaned(content):   
    words = [(word, flag) for word, flag in pseg.cut(content)]
    noun_list = [w[0] for w in words if w[1]=="n"]
    me_words=[]
    for w in noun_list:
        if w.startswith("一"or"三"or"四"or"五"or"六"or"七"or"八"or"九"or"十") == False :
            if len(w)>1:
                w = regular(w)
                if w is not None:
                    w_stopwords = stopwords(w[0])
                    if w_stopwords is not None:
                        w_syn = syn(w_stopwords)
                        me_words.append(w_syn)
    return " ".join(me_words)

In [9]:
news_important_noun=[]
for content in contents:
    words = [(word, flag) for word, flag in pseg.cut(content)]
    noun_list = [w[0] for w in words if w[1]=="n"]
    me_words=[]
    for w in noun_list:
        if w.startswith("一"or"三"or"四"or"五"or"六"or"七"or"八"or"九"or"十") == False :
            if len(w)>1:
                w = regular(w)
                if w is not None:
                    w_stopwords = stopwords(w[0])
                    if w_stopwords is not None:
                        w_syn = syn(w_stopwords)
                        me_words.append(w_syn)
    news_important_noun.append(" ".join(me_words))        

Building prefix dict from C:\Users\Java\python無聊爬蟲\語意處理\gensim\dict.txt ...
Loading model from cache C:\Users\Java\AppData\Local\Temp\jieba.ua375d656abaf5e911c78a901a527f3ba.cache
Loading model cost 0.626 seconds.
Prefix dict has been built succesfully.


In [11]:
from  sklearn.feature_extraction.text  import  CountVectorizer  

vectorizer = CountVectorizer()  
X = vectorizer.fit_transform(news_important_noun)  
words = vectorizer.get_feature_names()  

from  sklearn.feature_extraction.text  import  TfidfTransformer  
  
transformer = TfidfTransformer()  
tfidf = transformer.fit_transform(X)  
print(tfidf.toarray())

[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [12]:
important_word_list = []

for important_word in tfidf.toarray():
    important_word_list.append(sorted(list(zip(words, important_word)), key = lambda x : x[1], reverse=True)[:10])
important_word_list

[[('薪水', 0.57384233431353004),
  ('程序', 0.48301643288878132),
  ('依法', 0.39700969409825332),
  ('勞局', 0.26627951694357582),
  ('法定', 0.2443437920324594),
  ('限期', 0.22878014752404294),
  ('理由', 0.1792086775478024),
  ('行政', 0.15378140868497714),
  ('公司', 0.14740932828227724),
  ('部分', 0.13906975271847735)],
 [('月薪', 0.46118205536426876),
  ('退休金', 0.46118205536426876),
  ('老化', 0.42319053871840712),
  ('人力', 0.39623512868260158),
  ('教育部', 0.23059102768213438),
  ('隔代', 0.21159526935920356),
  ('作法', 0.18766344415359162),
  ('基金', 0.17912180601836994),
  ('可能性', 0.17189995387682161),
  ('薪水', 0.16564410100046717)],
 [('政委', 0.4610448997113048),
  ('市府', 0.42231771538122614),
  ('先生', 0.40694855771178184),
  ('交通', 0.28325393288909667),
  ('小孩', 0.28325393288909667),
  ('警官', 0.28325393288909667),
  ('政府', 0.21609030138488197),
  ('情形', 0.21115885769061307),
  ('原本', 0.20347427885589092),
  ('公司', 0.15680617292595822)],
 [('事情', 0.44684569078450237),
  ('保留地', 0.34181047465450726),
  ('