In [3]:
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cluster import KMeans


class KmeansClustering():
    """Cluster a one-document-per-line text corpus with TF-IDF and KMeans.

    Every line of the corpus is segmented with jieba, stop words are
    removed, the segmented text is vectorized into a TF-IDF matrix and the
    rows of that matrix are clustered with scikit-learn's KMeans.
    """

    def __init__(self, stopwords_path=None):
        """
        Set up the stop-word list and the (unfitted) vectorizers.

        :param stopwords_path: optional path to a stop-word file (one word
            per line); when falsy, no stop words are filtered
        """
        self.stopwords = self.load_stopwords(stopwords_path)
        self.vectorizer = CountVectorizer()
        self.transformer = TfidfTransformer()

    def load_stopwords(self, stopwords=None):
        """
        Load stop words from a file.

        :param stopwords: path to a UTF-8 text file holding one stop word
            per line, or a falsy value to disable stop-word filtering
        :return: list of stop words with surrounding whitespace stripped;
            an empty list when no path is given
        """
        if not stopwords:
            return []
        # 'utf-8-sig' reads plain UTF-8 unchanged but also swallows a leading
        # BOM, which would otherwise be glued onto the first stop word
        # (str.strip() does not remove U+FEFF) and make it never match.
        with open(stopwords, 'r', encoding='utf-8-sig') as f:
            return [line.strip() for line in f]

    def preprocess_data(self, corpus_path):
        """
        Segment the corpus and drop stop words; one document per line.

        :param corpus_path: path to a UTF-8 text file, one document per line
        :return: list with one string per input line, made of the remaining
            jieba tokens joined by single spaces
        """
        # Build the lookup set once: membership tests against the list would
        # cost O(len(stopwords)) for every single token.
        stopwords = set(self.stopwords)

        corpus = []
        # 'utf-8-sig' keeps a BOM out of the first token of the first line.
        with open(corpus_path, 'r', encoding='utf-8-sig') as f:
            for line in f:
                words = jieba.lcut(line.strip())
                corpus.append(' '.join(w for w in words if w not in stopwords))
        return corpus

    def get_text_tfidf_matrix(self, corpus):
        """
        Compute the TF-IDF matrix of a segmented corpus.

        Fits ``self.vectorizer`` and ``self.transformer`` on ``corpus`` as a
        side effect; after the call the vectorizer exposes the learned
        vocabulary.

        :param corpus: list of whitespace-separated, pre-segmented documents
        :return: dense array of TF-IDF weights, one row per document
        """
        # NOTE(review): CountVectorizer's default token pattern keeps only
        # tokens of two or more word characters, so single-character words
        # are dropped here -- confirm this is intended for Chinese text.
        counts = self.vectorizer.fit_transform(corpus)
        tfidf = self.transformer.fit_transform(counts)

        # The matrix is densified because callers receive a plain array;
        # for large corpora this is the memory-hungry step.
        weights = tfidf.toarray()
        return weights

    def kmeans(self, corpus_path, n_clusters=5, random_state=None):
        """
        Cluster the documents of a corpus file with KMeans.

        :param corpus_path: path to the corpus (one document per line);
            document ids are the 0-based line indices
        :param n_clusters: number of clusters to form
        :param random_state: optional seed forwarded to KMeans so that runs
            are reproducible; ``None`` keeps the library's random behaviour
        :return: dict mapping each cluster id (plain ``int``) to the list of
            document ids assigned to it, e.g. ``{cluster_id: [id1, id2]}``
        """
        corpus = self.preprocess_data(corpus_path)
        weights = self.get_text_tfidf_matrix(corpus)

        clf = KMeans(n_clusters=n_clusters, random_state=random_state)

        # After fitting, clf.cluster_centers_ holds the centroids and
        # clf.inertia_ the within-cluster sum of squared distances; the
        # latter can be compared across n_clusters values (elbow method) to
        # judge whether the chosen number of clusters is appropriate.
        labels = clf.fit_predict(weights)

        # Group document indices by cluster. Labels are converted to plain
        # ints so the keys print cleanly and are JSON-serializable; they
        # still hash/compare equal to the NumPy integer labels.
        result = {}
        for text_idx, label_idx in enumerate(labels):
            result.setdefault(int(label_idx), []).append(text_idx)
        return result


if __name__ == '__main__':
    # Example run: cluster the concept corpus into 50 groups and print the
    # raw {cluster_id: [document_ids]} mapping.
    stopwords_file = '/home/mengyuan/workDir/SeedTaxonomy/data/stop_words.txt'
    corpus_file = '/home/mengyuan/workDir/ThirdKG/DacilinIsConcept.txt'

    Kmeans = KmeansClustering(stopwords_path=stopwords_file)
    result = Kmeans.kmeans(corpus_file, n_clusters=50)
    print(result)


{2: [0, 19, 56, 248, 292, 310, 364, 434, 600, 611, 746, 776, 819, 836, 839, 860, 898, 917, 972, 1035, 1050, 1109, 1118, 1130, 1176, 1190, 1197, 1307, 1314, 1331, 1350, 1394, 1410, 1490, 1587, 1627, 1639, 1662, 1737, 1776, 1811, 1891, 1915, 1931, 1996, 2004, 2133, 2249, 2255, 2293, 2305, 2443, 2477, 2530, 2601, 2658, 2677, 2687, 2783, 2811, 2929, 3052, 3068, 3087, 3110, 3240, 3330, 3393, 3405, 3452, 3502, 3521, 3535, 3610, 3645, 3658, 3823, 3828, 3844, 3873, 3878, 3897, 3968, 3992, 4022, 4087, 4206, 4268, 4282, 4344, 4368, 4397, 4451, 4577, 4589, 4600, 4667, 4738, 4803, 4839, 4895, 4914, 5056, 5105, 5135, 5141, 5161, 5191, 5197, 5348, 5426, 5497, 5522, 5565, 5621, 5669, 5694, 5717, 5734, 5775, 5816, 5871, 5900, 5991, 6022, 6108, 6136, 6197, 6308, 6336, 6412, 6425, 6453, 6488, 6713, 6746, 6824, 6836, 6964, 7003, 7091, 7113, 7180, 7228, 7249, 7256, 7380, 7406, 7472, 7509, 7587, 7688, 7714, 7838, 7867, 7911, 8089, 8146, 8226, 8265, 8313, 8338, 8372, 8383, 8513, 8543, 8642, 8794, 8836, 8957