## Using Kmeans to make a news cluster.

In [24]:
import pandas as pd
import numpy as np
import re
import jieba
import json
from sklearn.feature_extraction.text import TfidfVectorizer


def preprocess(contents):
    new_contents=[]
    for content in contents:
        content = ''.join(re.findall(r'[\d|\w]+', content))
        content = ' '.join(jieba.cut(content))
        new_contents.append(content)
    return new_contents

def convert(data):
    global type_dict
    global type_count
    type = json.loads(data['feature'])['type']
    if not type in type_dict:
        type_dict[type] = type_count
        type_count += 1
    return type_dict[type]


fname = '../lesson05/sqlResult_1558435.csv'
database = pd.read_csv(fname, encoding='gb18030', usecols=['feature', 'content'])
database = database.fillna('')

type_dict = dict()
type_count = 0

database['target'] = database.apply(convert, axis=1)

contents = database['content'].tolist()
new_contents = preprocess(contents)

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(new_contents)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\DINGLI~1\AppData\Local\Temp\jieba.cache
Loading model cost 1.050 seconds.
Prefix dict has been built succesfully.


In [38]:
database.shape

(89611, 3)

In [26]:
print(type_dict)

{'科技': 0, '国际新闻': 1, '新闻': 2, '市场': 3, '观点滚动': 4, '软件': 5, '军事': 6, '体育': 7, '国内新闻': 8, 'IT业界': 9, '游戏': 10, '国际财经': 11, '中国财经': 12, '金融市场': 13, '公司': 14, '商业?公司': 15, '娱乐': 16, '健康': 17, '财经': 18, '时尚': 19, '社会': 20, '旅游': 21, '趣闻': 22, '岛内扫描': 23, '期市': 24, '债市': 25, '个股评级': 26, '即时播报': 27, '数据资金': 28, '时事要闻': 29, '社会观察': 30, '港澳传真': 31, '神马新闻卡': 32, '社会滚动': 33, '社会滚动供wifi': 34, '科学': 35, '国际': 36, '其它': 37, '欧美新闻': 38, '影视': 39, '音乐': 40, '汽车': 41, '中国军情': 42, '基金': 43, '其他地区': 44, '环球博览': 45, '亚太新闻': 46, '海外': 47, '大趋势': 48, '中国观点': 49, '时事漫谈': 50, '时政': 51, '博览': 52, '参考快评': 53, '国际观点': 54, '探索发现': 55, '健康生活': 56, '评论': 57, '武器装备': 58, '周边动态': 59, '国际军情': 60, '海峡两岸': 61, '台湾要闻': 62, '新三板': 63, '台海': 64, '房地产': 65, '海外看中国': 66, '行业': 67, '中国外交': 68, '环境保护': 69, '宏观': 70, '中山新闻': 71, '区街事': 72, '深圳新闻': 73, '时尚荟': 74, '名利场': 75, '广告': 76, '车生活': 77, '追热点': 78, '国内': 79, '荟生活': 80, '佛山新闻': 81, '第1纸·今日天下': 82, '镇能量': 83, '生活圈': 84, '东莞新闻': 85, '悦生活·Chic方式': 86, '悦生活': 87, '揾食广东·健康有约': 

In [27]:
from sklearn.cluster import KMeans
from time import time

km = KMeans(n_clusters=5)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

print(km.labels_)

done in 9333.511s

[2 2 2 ... 2 2 2]


In [28]:
len(km.labels_)

89611

In [36]:
km.labels_[82000:83000]

array([1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 3, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 3, 3, 2, 3,
       3, 0, 3, 3, 4, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 0, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4,
       2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,

In [32]:
km.cluster_centers_

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [6.31898221e-05, 2.77865244e-06, 6.11516537e-07, ...,
        6.76575857e-07, 1.17949364e-06, 1.12937259e-05],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 2.14803636e-05],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 9.49730782e-07]])

In [37]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

Cluster 0: 选手 网球 晋级 比赛 公开赛 战胜 2017 乒乓球 新华社 决赛
Cluster 1: 足球 球员 联赛 赛季 20162017 比赛 外代 新华社 当日 主场
Cluster 2: 新华社 中国 记者 2017 北京 照片 国际 发展 美国 企业
Cluster 3: nn 一带 一路 播发 国际 合作 稿件 中国 重要 高峰论坛
