In [1]:
import gensim
import pandas as pd
import jieba
import re
import stanfordcorenlp
import math
import numpy as np
import random

# 1. Data preprocessing

In [4]:
data_source = "./sqlResult_1558435.csv"
data = pd.read_csv(data_source,encoding='gb18030')
data = data.fillna('')  #缺失数据填充
content = data['content'].tolist()
def cut(string): return ' '.join(jieba.cut(string))
def token(string): return re.findall(r'[\d|\w]+',string)

In [5]:
news_content = [token(n) for n in content]
news_content = [''.join(n) for n in news_content]
news_content = [cut(n) for n in news_content]

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\wy\AppData\Local\Temp\jieba.cache
Loading model cost 1.462 seconds.
Prefix dict has been built successfully.


# 2. TF-IDF

Term frequency: $tf_{t,d} = log_{10} count(t,d)+1$  
Inverse document frequency: $idf_{t} = log_{10} (N/df_{t})$  
TF-idfweighted value: $w_{t,d} = tf_{t,d} x idf_{t}$   

N - 文档总数  
df_t - 出现t这个词的文档数目

In [11]:
def document_frequency(word):
    return sum(1 for n in news_content if word in n)

def idf(word):
    return math.log10(len(news_content)/document_frequency(word))+1

def tf(word,document):
    words = document.split()
    return sum(1 for w in words if w==word)

def tf_idf(word,document):
    return tf(word,document)*idf(word)

def get_keywords_of_a_document(document):
    words = set(document.split())
    tfidf = [
        (w,tf_idf(w,document)) for w in words
    ]
    tfidf = sorted(tfidf,key=lambda x:x[1],reverse=True)
    return tfidf

In [12]:
get_keywords_of_a_document(news_content[1])[:5]

[('骁龙', 18.28860032832347),
 ('Windows10', 15.14781401040159),
 ('桌面', 11.674990450316585),
 ('的', 8.84117289241158),
 ('高通', 8.392972936239964)]

# 3. TFIDF Vectorizer

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine

In [19]:
vectorizer = TfidfVectorizer(max_features=10000)
sample_num = 50000 
sub_samples = news_content[:sample_num]
X = vectorizer.fit_transform(sub_samples)  #X为50000x10000的稀疏矩阵,10000表示word的个数

In [20]:
document_1,document_2 = random.randint(0,1000),random.randint(0,1000)
vector_of_document_1 = X[document_1].toarray()[0]
vector_of_document_2 = X[document_2].toarray()[0]
def distance(v1,v2): return cosine(v1,v2)
distance(vector_of_document_1,vector_of_document_2)

0.997881303720474

# 4. LDA

In [21]:
from gensim import corpora, models
import jieba.posseg as jp,jieba  #jp用于词性标注

In [26]:
flags = ('n', 'nr', 'ns', 'nt', 'eng', 'v', 'd')
data = pd.read_csv(data_source,encoding='gb18030')
news = data["content"][:100]  #取前100个

#获取停用词表
stop_words = []
with open('./baidu_stopwords.txt',encoding='utf-8') as f:
    for word in f.readlines():
        stop_words.append(word.strip())

#获取词语列表        
words_ls = []
for text in news:
    words = [w.word for w in jp.cut(text) if w.flag in flags and w.word not in stop_words]
    words_ls.append(words)

#为语料库的词分配编号
dictionary = corpora.Dictionary(words_ls)
corpus = [dictionary.doc2bow(words) for words in words_ls]

#lda
lda = models.ldamodel.LdaModel(corpus=corpus,id2word=dictionary,num_topics=5)

In [31]:
lda.get_document_topics(corpus)

<gensim.interfaces.TransformedCorpus at 0x239cf110390>

In [32]:
lda.get_topic_terms(0,topn=10)

[(445, 0.007042604),
 (401, 0.004634682),
 (154, 0.004621647),
 (80, 0.0041359016),
 (372, 0.0040862425),
 (140, 0.0040819966),
 (151, 0.0037752644),
 (131, 0.0034634636),
 (885, 0.0029189899),
 (212, 0.00283667)]

In [33]:
lda.get_term_topics(0,minimum_probability=1e-4)  #0为词id

[(2, 0.0003043753), (3, 0.00012307003)]

In [34]:
for topic in lda.print_topics(num_words=10):
    print(topic)

(0, '0.007*"市场" + 0.005*"中国" + 0.005*"会" + 0.004*"还" + 0.004*"都" + 0.004*"不" + 0.004*"人" + 0.003*"记者" + 0.003*"电影" + 0.003*"女子"')
(1, '0.005*"内容" + 0.005*"不" + 0.005*"会" + 0.005*"都" + 0.004*"市场" + 0.004*"人" + 0.004*"记者" + 0.004*"还" + 0.003*"乐视" + 0.003*"中国"')
(2, '0.006*"企业" + 0.006*"都" + 0.005*"手机" + 0.005*"会" + 0.004*"跨境" + 0.004*"乐视" + 0.004*"不" + 0.004*"人" + 0.004*"市场" + 0.004*"还"')
(3, '0.005*"都" + 0.005*"不" + 0.005*"中国" + 0.004*"市场" + 0.004*"会" + 0.004*"人" + 0.004*"还" + 0.004*"冰架" + 0.003*"摄像头" + 0.003*"乐视"')
(4, '0.006*"市场" + 0.004*"中国" + 0.004*"企业" + 0.004*"都" + 0.004*"配送" + 0.004*"冰架" + 0.004*"人" + 0.004*"跨境" + 0.004*"还" + 0.003*"相关"')
