In [1]:
# Use LDA for content topic mining
import tarfile  
import os  
import jieba.posseg as pseg  
from gensim import corpora, models  
from bs4 import BeautifulSoup  

In [2]:
# Chinese word segmentation
def jieba_cut(text):
    """Segment ``text`` with jieba and keep only tokens with whitelisted flags.

    Args:
        text: The string handed to ``pseg.cut``.

    Returns:
        A list holding the ``word`` of every token whose ``flag`` is in the
        whitelist, in the order the segmenter yields them.
    """
    # Part-of-speech flags that are retained; tokens with any other flag are dropped.
    keep_flags = ['z', 'vn', 'v', 't', 'nz', 'nr', 'ns', 'n', 'l', 'i', 'j', 'an',
                  'a']
    return [token.word for token in pseg.cut(text) if token.flag in keep_flags]

In [3]:
# Quick sanity check of the segmenter on a short sentence.
jieba_cut("我是一只猪")

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.932 seconds.
Prefix dict has been built succesfully.


['是', '猪']

In [5]:
# text process
def text_pro(words_list, tfidf_object=None, training=True, dictionary=None):
    """Turn tokenised documents into a TF-IDF weighted corpus.

    Args:
        words_list: List of documents, each a list of token strings.
        tfidf_object: Fitted TF-IDF model; required when ``training`` is false.
        training: When true, fit a new TF-IDF model on ``words_list``;
            otherwise only transform ``words_list`` with ``tfidf_object``.
        dictionary: Optional existing ``corpora.Dictionary``. Pass the
            dictionary returned by the training call when transforming new
            documents so token ids agree with the fitted models. When
            omitted, a dictionary is built from ``words_list`` (the original
            behaviour).

    Returns:
        ``(dic, corpus_tfidf, tfidf)`` when ``training`` is true, otherwise
        the TF-IDF transformed corpus.

    Raises:
        ValueError: If ``training`` is false and ``tfidf_object`` is None.
    """
    if not training and tfidf_object is None:
        raise ValueError('tfidf_object is required when training is False')

    # "is not None" rather than truthiness: an empty dictionary is still a dictionary.
    dic = dictionary if dictionary is not None else corpora.Dictionary(words_list)

    print('{:*^60}'.format('token & word mapping review:'))
    # items() is a view/iterator on Python 3 and cannot be sliced directly.
    for i, w in list(dic.items())[:5]:
        print('token:%s -- word:%s' % (i, w))

    corpus = [dic.doc2bow(words) for words in words_list]
    print('{:*^60}'.format('bag of words review:'))
    if corpus:  # nothing to preview for an empty document list
        print(corpus[0])

    if training:
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        print('{:*^60}'.format('TF-IDF model review:'))
        for doc in corpus_tfidf:  # preview the first document only
            print(doc)
            break
        return dic, corpus_tfidf, tfidf
    return tfidf_object[corpus]


In [18]:
# Convert full-width characters to half-width
def str_convert(content):
    """Map full-width characters in ``content`` to their half-width forms.

    The ideographic space (U+3000) becomes an ASCII space, and code points
    U+FF01..U+FF5E are shifted down by 0xFEE0 onto ASCII ``!``..``~``.
    Every other character is left unchanged.

    Args:
        content: Text to convert.

    Returns:
        The converted string (empty input yields an empty string).
    """
    converted = []
    for each_char in content:
        code_num = ord(each_char)
        if code_num == 12288:  # U+3000 ideographic space
            code_num = 32
        elif 65281 <= code_num <= 65374:  # U+FF01..U+FF5E
            code_num -= 65248  # 0xFEE0 offset to the ASCII counterpart
        # chr() replaces the Python 2-only unichr(), which is undefined on Python 3.
        converted.append(chr(code_num))
    return ''.join(converted)


# parse text
def data_parse(data):
    """Collect the text of every non-empty ``content`` tag found in ``data``.

    Args:
        data: Markup handed to BeautifulSoup and parsed with the "lxml" parser.

    Returns:
        A list with the ``.text`` of each ``content`` tag whose length is
        greater than zero, in the order ``find_all`` returns them.
    """
    soup = BeautifulSoup(data, "lxml")
    # Full-width to half-width conversion (str_convert) is currently disabled;
    # the tag text is returned unchanged.
    return [node.text for node in soup.find_all('content') if len(node) > 0]

In [19]:
# Unpack the news archive once; skipped when the target directory already exists.
if not os.path.exists('../../data/LDA/news_data'):
    print ('extract data from news_data.tar.gz...')
    # The context manager closes the archive even if extraction fails part-way.
    # NOTE(review): members are extracted without path sanitising; only use a trusted archive.
    with tarfile.open('../../data/LDA/news_data.tar.gz') as tar:
        tar.extractall(path='../../data/LDA/')

In [20]:
# Read every file under the extracted news directory and gather the parsed contents.
print('walk files and get content...')
all_content = []
for folder, _subdirs, file_names in os.walk('../../data/LDA/news_data'):
    for base_name in file_names:
        # Files are opened with the platform default encoding, as before.
        with open(os.path.join(folder, base_name)) as handle:
            data = handle.read()
        all_content.extend(data_parse(data))

walk files and get content...


In [21]:
# Segment every document; jieba_cut already returns a list of the kept words.
print ('get word list...')
words_list = [jieba_cut(each_content) for each_content in all_content]

# Topic model
print ('train topic model...')
dic, corpus_tfidf, tfidf = text_pro(words_list, tfidf_object=None, training=True)  # preprocess the training set
num_topics = 3  # number of topics to fit
# A fixed seed makes repeated runs produce the same topics.
lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=num_topics, random_state=42)
print ('{:*^60}'.format('topic model review:'))
for i in range(num_topics):  # print each fitted topic
    print (lda.print_topic(i))

# Topic prediction for a new document
print ('topic forecast...')
with open('article.txt') as f:
    text_new = f.read()
# Parse the new article itself; the original parsed the stale `data` variable
# left over from the corpus-loading loop.
text_content = data_parse(text_new)
# NOTE(review): segmentation runs on the raw text, as before; `text_content`
# is not fed to the segmenter.
words_list_new = jieba_cut(text_new)
# Encode with the TRAINING dictionary so token ids match the fitted TF-IDF and
# LDA models; building a fresh dictionary from the new text would reassign ids.
bow_new = dic.doc2bow(words_list_new)
corpus_tfidf_new = tfidf[[bow_new]]
corpus_lda_new = lda[corpus_tfidf_new]  # topic probability distribution of the new document
print ('{:*^60}'.format('topic forecast:'))
print (list(corpus_lda_new))

['唯一编号\u3000８１８７４２２７３８\ue40c中介公司内部编号\u3000ＴＸＦ－朴\ue40c城区与地址\u3000朝阳区\u3000望京慧谷阳光慧谷根园华鼎世家ＣＬＡＳＳ果岭里大西洋新城香\ue40c所在小区\u3000长岛澜桥！！别墅\ue40c交通状况\u3000便利\ue40c售价\u3000５２２万元／套\ue40c年代、类别与装修\u3000２００４年\u3000豪华\ue40c户型与面积\u3000４室２厅３卫２６１㎡〔建筑面积〕\ue40c楼层\u3000第１层\u3000／楼高４层\u3000［朝向］南北\ue40c暖气\u3000市政供暖\ue40c说明\u3000花园３０多平米，送二个车位，小区环境特别好，看房请提前与我联系，另外本公司还有其他公寓别墅小区房源出售，欢迎您来电咨询\ue40c有效期\u3000９０天\ue40c时间\ue40c联系信息\ue40c联系人\u3000田小姐\ue40c联系电话\u3000８４７２５３５３\ue40c手机\u3000１３３９１８６８９９１\ue40c［个人免费区］\u3000\ue40c４在线帮助：梦源',
 '唯一编号\u3000８１５８２６８５２３\ue40c中介公司内部编号\u3000０６２７\ue40c城区与地址\u3000朝阳区\u3000赛特日坛路秀水街建外大街永安东里甲\u3000３号\ue40c交通状况\u3000永安里、建国门地铁\u3000很多长安街公交都到\u3000直达机场班车\ue40c售价\u3000４４０万元／套\ue40c年代、类别与装修\u3000＜未知年份＞\u3000豪华\ue40c户型与面积\u3000３室２厅２卫２０２㎡〔建筑面积〕\ue40c楼层\u3000第２层\u3000／楼高３０层\u3000［朝向］南北\ue40c暖气\u3000市政供暖\ue40c说明\u3000２．２万／㎡\u3000特价通用时代环境很好，在国贸附近，周边设施全．室内环境很好，格局装修都不错．房子南北通透，楼层理想。我公司另有大量ＣＢＤ租赁、买卖房源未刊登，你可随时拨打２４小时热线电话：１３９１１３０６６７３\u3000。我们会用最专业的知识、热情的为您服务。感谢您对我公司的关注。\u3000\ue40c有效期\u3000９０天\ue40c时间',
 '［点击图片进入