In [1]:
# 导入数据处理和机器学习库
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from scipy.special import gammaln
import scipy
from scipy import spatial

# 导入NLP处理库
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from textblob import TextBlob

# 导入其他辅助库
import gensim
import pickle
import jieba
import re
import random
import copy
import operator
import os
import string

#导入自建库
from mrflda3 import LatentDirichletAllocationWithCooccurrence
from utils import (
    preprocess,
    processReviews,
    jieba_cut,
    coherence_score,
    get_hscore,
    kl_score
)

# Seed Python's `random` module and NumPy's global RNG with the same fixed value
random.seed(a=123)
np.random.seed(seed=123)

In [None]:
# Read the spreadsheet (only the first 100 rows; the full read is kept below, disabled)
excel_file = 'data.xlsx'
df = pd.read_excel(excel_file, nrows=100)
#df = pd.read_excel(excel_file)
# Run the project's jieba_cut helper on every entry of the 'Text' column
# (original note: jieba segmentation plus stop-word removal)
df['content_clean'] = df['Text'].apply(jieba_cut)

# Split each cleaned string on single spaces to get a per-document token list
df['words'] = df['content_clean'].apply(lambda text: text.split(" "))
# Build the matrices and vocabulary via the project's processReviews helper
count_matrix, tfidf_matrix, vocabulary, words = processReviews(
    df['content_clean'].values
)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.474 seconds.
Prefix dict has been built successfully.


In [3]:
def convert_numbers(k):
    """Blank out integer-like tokens in a token list.

    Every element of ``k`` that ``int()`` accepts is replaced, in place, by a
    single space ``" "``; all other elements are left untouched.

    The previous version called ``num2words`` (never imported in this
    notebook) purely as a validity check and discarded its result; the
    resulting ``NameError`` was swallowed by a bare ``except``, so no token
    was ever replaced. ``int()`` alone provides the same check.

    Args:
        k: Mutable sequence of tokens (normally strings).

    Returns:
        The same sequence object ``k``, after in-place modification.
    """
    for position, token in enumerate(k):
        try:
            int(token)
        except (ValueError, TypeError):
            # Not an integer literal (or not a string/number at all): keep it.
            continue
        k[position] = " "
    return k

def get_cosine(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    The value is one minus the cosine distance computed by
    ``scipy.spatial.distance.cosine``.
    """
    cosine_distance = spatial.distance.cosine(a, b)
    return 1 - cosine_distance

In [4]:
# Load the pretrained word vectors from a word2vec text-format file
# (binary=False) decoded as UTF-8.
embeddings_index = gensim.models.KeyedVectors.load_word2vec_format(
    'sgns.target.word-ngram.1-2.dynwin5.thr10.neg5.dim300.iter5.bz2',
    binary=False,
    encoding='utf-8',
)

In [4]:
# Cosine-similarity cutoff: a word pair becomes an edge only when its
# embedding similarity is strictly greater than this value.
edges_threshold = 0.8

In [6]:
# For every document, collect the ordered word pairs whose embedding cosine
# similarity exceeds edges_threshold.
#   docs_edges - one list of (vocabulary[i], vocabulary[j]) tuples per document
#   taken      - words from failed pairs that DO have an embedding
#   ignored    - words from failed pairs that have NO embedding
#   count      - initialised to 0 and never updated here; kept so the 4-tuple
#                used by the (commented-out) pickle cells keeps its shape
# Repeated words in a document yield repeated edges; that multiplicity is
# preserved on purpose because downstream code receives the raw lists.
docs_edges, ignored, taken, count = [], [], [], 0
for doc in df['words'].values:
    edges = []
    for i in doc:
        for j in doc:
            if i == j:
                continue
            try:
                a = embeddings_index[i]
                b = embeddings_index[j]
                if get_cosine(a, b) > edges_threshold:
                    edges.append((vocabulary[i], vocabulary[j]))
            except KeyError:
                # A word is missing from the embeddings or from the
                # vocabulary (assumes both raise KeyError on a missing key).
                # Classify BOTH words with the same rule; previously `j` was
                # added to `taken` only when it had no embedding and was
                # never recorded when it had one.
                for word in (i, j):
                    if word in embeddings_index:
                        taken.append(word)
                    else:
                        ignored.append(word)
    docs_edges.append(edges)

In [7]:
# Save the variables to a file with pickle
#with open('data_30.pkl', 'wb') as f:
#    pickle.dump((docs_edges, ignored, taken, count), f)

#with open('data_30.pkl', 'rb') as f:
    #docs_edges, ignored, taken, count = pickle.load(f)

In [8]:
#with open('data_all.pkl', 'wb') as f:
#    pickle.dump((docs_edges, ignored, taken, count), f)

In [9]:
# Merge the per-document edge lists into one adjacency mapping:
# first element of each edge -> list of every second element paired with it.
# Duplicates are kept, exactly as they appear in docs_edges.
edge_dict = {}
for doc_edges in docs_edges:
    for source, target in doc_edges:
        # setdefault replaces the former bare `except:` that was used only to
        # detect a missing key and would also have trapped unrelated errors.
        edge_dict.setdefault(source, []).append(target)

In [10]:
#edge_dict

{1836: [2101,
  1662,
  1756,
  1662,
  1662,
  1973,
  2101,
  2101,
  2101,
  1662,
  2101,
  1756,
  1662,
  1756,
  1756,
  1756,
  1973,
  2101,
  2101,
  1662,
  1662,
  1662,
  1756,
  1756,
  1756,
  1662,
  1662,
  1662,
  1756,
  1756,
  1662,
  1756,
  1973,
  1662,
  1756,
  1662,
  1756,
  1756,
  1662,
  1756,
  1662,
  1756,
  1973,
  1973,
  1973,
  1973,
  1973,
  1662,
  1756,
  2101,
  2101,
  1756,
  1756,
  1662,
  1756,
  1973,
  1756,
  2101,
  2101,
  1756,
  1662,
  1973,
  1756,
  1756,
  1756,
  1756,
  1973,
  2101,
  1662,
  1756,
  1662,
  2101,
  1756,
  1756,
  2101,
  1756,
  1756,
  2101,
  1662,
  1662,
  1973,
  1756,
  1973,
  1756,
  1973,
  1756,
  1662,
  1662,
  1756,
  1756,
  1756,
  1756,
  1973,
  1662,
  1662,
  1756,
  2101,
  1973,
  1756,
  2101,
  1756,
  2101,
  1662,
  2101,
  1756,
  1756,
  1756,
  1756,
  1662,
  1756,
  1973,
  1756,
  1662,
  1662,
  1662,
  1662,
  1662,
  1756,
  1662,
  1756,
  1756,
  1662,
  1756,
  1756,
  

In [4]:
# On-disk cache for edge_dict so the slow embedding pass can be skipped.
EDGE_DICT_PATH = 'edge_dict.pkl'
if os.path.exists(EDGE_DICT_PATH):
    # NOTE(security): pickle.load can execute arbitrary code; only load a
    # file that this notebook itself produced.
    # NOTE(review): a cached file replaces any edge_dict built earlier in the
    # session. Delete it after changing the data, vocabulary or threshold,
    # otherwise the indices may no longer match `vocabulary`.
    with open(EDGE_DICT_PATH, 'rb') as f:
        edge_dict = pickle.load(f)
else:
    # No cache yet: persist the edge_dict already in memory instead of
    # failing with FileNotFoundError on a fresh run.
    with open(EDGE_DICT_PATH, 'wb') as f:
        pickle.dump(edge_dict, f)

In [5]:
# Instantiate the LDA-with-cooccurrence model
lda = LatentDirichletAllocationWithCooccurrence(
    lambda_param=1.0,
    n_components=3,
    random_state=0,
    max_iter=20,
)
# Fit it on the count matrix, passing the edge mapping and the vocabulary
lda.fit(count_matrix, edge_dict=edge_dict, vocabulary=vocabulary)

# Document-topic distribution for the same count matrix
doc_topic_dist = lda.transform(count_matrix)
# Result of getTopKWords with K=10 (original note: word-topic distribution)
topic_word_dist = lda.getTopKWords(K=10, vocab=vocabulary)

# Coherence score
coherence = coherence_score(count_matrix, topic_word_dist, vocabulary)
print(f"Coherence Score: {coherence}")

# H score
h_score = get_hscore(doc_topic_dist, count_matrix, lda.n_components)
print(f"H Score: {h_score}")

Starting fit with shape: 100 915
Iteration: 0
Lambda value at iteration 0: 1.6135776203056236
Iteration: 1
Lambda value at iteration 1: 2.4230432221821947
Iteration: 2
Lambda value at iteration 2: 2.9977627328288046
Iteration: 3


  self.exp_dirichlet_component_ = np.exp(self._dirichlet_expectation(self.components_))


Lambda value at iteration 3: 3.4437005804128864
Iteration: 4
Lambda value at iteration 4: 3.808134961120146
Iteration: 5
Lambda value at iteration 5: 4.116306147253691
Iteration: 6
Lambda value at iteration 6: 4.383287574390014
Iteration: 7
Lambda value at iteration 7: 4.618804987283928
Iteration: 8
Lambda value at iteration 8: 4.829500530267995
Iteration: 9
Lambda value at iteration 9: 5.0201125430858715
Iteration: 10
Lambda value at iteration 10: 5.194140066253211
Iteration: 11
Lambda value at iteration 11: 5.354240557986852
Iteration: 12
Lambda value at iteration 12: 5.502479854253057
Iteration: 13
Lambda value at iteration 13: 5.640495710796583
Iteration: 14
Lambda value at iteration 14: 5.769608471517424
Iteration: 15
Lambda value at iteration 15: 5.890898135751164
Iteration: 16
Lambda value at iteration 16: 6.005259368789174
Iteration: 17
Lambda value at iteration 17: 6.113441622298915
Iteration: 18
Lambda value at iteration 18: 6.21607895367391
Iteration: 19
Lambda value at iter

In [6]:
# 查看主题-词分布
topic_word_distribution = lda.components_
#print("Topic-Word Distribution:")
#print(topic_word_distribution)

# 查看特定文档的主题分布
document_topic_distribution = lda.transform(count_matrix)
#print("Document-Topic Distribution:")
#print(document_topic_distribution)

# 查看特定主题的词分布
#topic_number = 0  # 选择主题编号
#words = lda.vocabulary_.keys()  # 获取词汇表中的词
#print(f"Words distribution in topic {topic_number}:")
#print({word: topic_word_distribution[topic_number, idx] for idx, word in enumerate(words)})

# 想要特定文档的主题分布
#document_number = 0  # 选择文档编号
#print(f"Topic distribution in document {document_number}:")
#print(document_topic_distribution[document_number])

In [8]:
# Display the array returned by lda.transform(count_matrix) as the cell output
document_topic_distribution

array([[9.43174830e-01, 5.56664696e-02, 1.15870050e-03],
       [9.74452425e-02, 1.37146310e-01, 7.65408447e-01],
       [3.97490729e-04, 3.32176664e-04, 9.99270333e-01],
       [4.17104911e-01, 4.37737510e-01, 1.45157579e-01],
       [3.36802827e-02, 2.15827272e-02, 9.44736990e-01],
       [6.74153292e-03, 5.71595156e-03, 9.87542516e-01],
       [7.17244309e-04, 1.59053732e-04, 9.99123702e-01],
       [9.95469928e-01, 4.33869845e-06, 4.52573358e-03],
       [1.31717110e-03, 7.47024526e-01, 2.51658303e-01],
       [9.99601637e-01, 4.59154745e-05, 3.52448021e-04],
       [1.43359459e-04, 9.99708142e-01, 1.48498482e-04],
       [9.98885691e-01, 7.29468983e-04, 3.84840424e-04],
       [1.81320268e-02, 9.46751916e-01, 3.51160571e-02],
       [2.98342366e-01, 7.41011065e-05, 7.01583533e-01],
       [6.17980340e-02, 3.13531451e-02, 9.06848821e-01],
       [1.32940682e-03, 2.66390003e-04, 9.98404203e-01],
       [3.90991014e-03, 4.07612598e-04, 9.95682477e-01],
       [4.79538099e-01, 3.99361

In [19]:
import pickle

# Persist both distributions, each to its own pickle file:
# topic_word_distribution first, then document_topic_distribution
for pickle_path, pickle_payload in (
    ('topic_word_distribution.pkl', topic_word_distribution),
    ('document_topic_distribution.pkl', document_topic_distribution),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(pickle_payload, f)

In [21]:
import csv

# Write the evaluation scores to a CSV file: one header row, one value row
score_rows = [
    ['Coherence Score', 'H Score'],
    [coherence, h_score],
]
with open('model_scores.csv', 'w', newline='') as file:
    csv.writer(file).writerows(score_rows)

In [None]:
#import pickle

# Load topic_word_distribution
#with open('topic_word_distribution.pkl', 'rb') as f:
#    topic_word_distribution_loaded = pickle.load(f)

# Load document_topic_distribution
#with open('document_topic_distribution.pkl', 'rb') as f:
#    document_topic_distribution_loaded = pickle.load(f)