# 文本主题可视化

In [None]:
# !pip install pyldavis jieba -i  https://mirrors.163.com/pypi/simple/

In [6]:
import sys
import os
import jieba # pip install jieba
import urllib.request as ur
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import pyLDAvis
import pyLDAvis.sklearn


In [18]:
data_file_url = 'https://dikers-data.s3.cn-northwest-1.amazonaws.com.cn/dataset/cnews_data.zip'
base_dir = './datasets/cnews/'

# input files
val_file = base_dir + 'cnews.val.txt'


## 下载训练数据

In [8]:
%%time
!pwd
if not os.path.exists(val_file):
    !mkdir ./datasets
    !mkdir ./datasets/cnews
    print('{}不存在, 开始下载文件'.format(val_file))
    ur.urlretrieve(data_file_url, "cnews_data.zip")
    !unzip ./cnews_data.zip
    !rm ./cnews_data.zip
    !mkdir ./datasets/cnews 
    !mv cnews.train.txt ./datasets/cnews/
    !mv cnews.test.txt ./datasets/cnews/
    !mv cnews.val.txt ./datasets/cnews/
    !rm -fr __MACOSX
else:
    print('文件已经存在')

/home/ec2-user/examples/machine-learning/stepbystep
./dataset/cnews/cnews.val.txt不存在, 开始下载文件
Archive:  ./cnews_data.zip
  inflating: cnews.test.txt          
   creating: __MACOSX/
  inflating: __MACOSX/._cnews.test.txt  
  inflating: cnews.val.txt           
  inflating: __MACOSX/._cnews.val.txt  
  inflating: cnews.train.txt         
  inflating: __MACOSX/._cnews.train.txt  
mkdir: cannot create directory ‘./datasets/cnews’: File exists
CPU times: user 251 ms, sys: 273 ms, total: 523 ms
Wall time: 3.55 s


In [10]:

def stopwordslist(stopwords_file):
    return [line.strip() for line in open(stopwords_file ,encoding='UTF-8').readlines()]

In [13]:
def create_train_data(stopwords, val_file, out_file):
    train_data = []


    with open(val_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        index = 0
        for index, line in enumerate(lines):
            label, content = line.strip('\r\n').split('\t')
            word_iter = jieba.cut(content)
            new_word_iter = []
            for w in word_iter:
                if w not in stopwords and w != ' ':
                    new_word_iter.append(w)
            train_data.append(' '.join(new_word_iter))


    with open(out_file, 'w') as f:
        for i in train_data:
            f.write(i.strip()+'\n')
    return train_data

In [14]:
def load_train_data(train_data_file):
    train_data = []
    with open(train_data_file, 'r', encoding='utf-8') as f:

        lines = f.readlines()
        for line in lines:
            train_data.append(line.replace('\n', ''))

    return train_data

In [11]:
stopwords_file = './sample_data/stopwords.txt'
stopwords = stopwordslist(stopwords_file)

In [None]:
out_file = './datasets/train_data.txt'
train_data = create_train_data(stopwords, val_file, out_file)

## 加载数据

In [20]:
train_data = load_train_data('./datasets/train_data.txt')

In [21]:
tfidf_vectorizer = TfidfVectorizer(min_df=5)
tfidf_mat = tfidf_vectorizer.fit_transform(train_data)

print('字典长度：', len(tfidf_vectorizer.vocabulary_))

n_topics = 5      # 自定义主题个数
lda_model = LatentDirichletAllocation(n_components=n_topics, batch_size=8, random_state=0)
# 使用TF-IDF矩阵拟合LDA模型
lda_model.fit(tfidf_mat)


# 主题词打印函数
def print_top_words(model, feature_names, n_top_words):

    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:"%(topic_idx+1))
        print(" ".join([feature_names[i] for i in topic.argsort()[-n_top_words-1:-1]]))



字典长度： 20340
Topic #1:
设计 开发商 社区 论坛 位于 信息 号楼 地图搜索 相册 样板间 均价 平方米 开盘 项目 户型
Topic #2:
造型 最佳 奥斯卡 上映 电影节 明星 黑色 娱乐 性感 导演 时尚 导语 影片 组图 搭配
Topic #3:
球员 房价 行业 股票 中国 经济 湖人 球队 分红 热火 房地产 公司 投资 比赛 市场
Topic #4:
机身 光学 发展 企业 支持 消费者 佳能 采用 中国 拍摄 产品 像素 功能 活动 游戏
Topic #5:
题目 作文 学生 单词 文章 大学 听力 答案 阅读 六级 信息 四级 英语 考生 四六级


##  打印每个主题前15个关键字

In [None]:
n_top_words = 15
tf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(lda_model, tf_feature_names, n_top_words)

##   数据可视化

In [None]:
data = pyLDAvis.sklearn.prepare(lda_model, tfidf_mat, tfidf_vectorizer)
pyLDAvis.show(data)