In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import gensim
from gensim import corpora
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis


In [3]:
# 下载NLTK的资源
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# 读取文本文件
with open('/data1/dxw_data/llm/mkt_llm/starbuck/starbuck_comments_1.txt', 'r', encoding='utf-8') as file:
    long_text = file.read()

# 文本预处理
def preprocess(text):
    # 分词
    tokens = word_tokenize(text.lower())
    # 去除停用词和非字母的词
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    # 词形还原
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens

[nltk_data] Downloading package punkt to /home/dxw/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/dxw/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/dxw/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# 对文本进行预处理
processed_text = preprocess(long_text)

# 创建词典和语料库
dictionary = corpora.Dictionary([processed_text])
corpus = [dictionary.doc2bow(processed_text)]

# 构建LDA模型
num_topics = 5  # 设定主题数
lda_model = gensim.models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)

In [5]:
# 输出主题
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic: {idx}\nWords: {topic}\n")

# 修正后的LDA可视化部分
def prepare_lda_vis_data(model, corpus, dictionary):
    # 提取LDA模型数据
    topic_term_dists = np.array(model.get_topics())
    doc_topic_dists = np.array([[topic_prob for _, topic_prob in model.get_document_topics(bow, minimum_probability=0)] for bow in corpus])
    doc_lengths = [sum(count for _, count in doc) for doc in corpus]
    vocab = list(dictionary.token2id.keys())
    term_frequency = np.array(list(dictionary.dfs.values()))
    
    return pyLDAvis.prepare(
        topic_term_dists=topic_term_dists,
        doc_topic_dists=doc_topic_dists,
        doc_lengths=doc_lengths,
        vocab=vocab,
        term_frequency=term_frequency,
        sort_topics=False
    )


Topic: 0
Words: 0.050*"口味" + 0.050*"环境" + 0.027*"以前上学时每次买面包" + 0.027*"性价比" + 0.027*"办了会员卡" + 0.027*"还记得以前名字叫金蝶轩" + 0.027*"从小学就开始吃的面包店啦" + 0.027*"料挺扎实的" + 0.027*"每周都会买几次当早餐" + 0.027*"价格实惠"

Topic: 1
Words: 0.029*"口味" + 0.029*"环境" + 0.029*"真材实料" + 0.029*"店家地址在宝安区" + 0.029*"老婆饼yyds" + 0.029*"肉松小贝味道不错" + 0.029*"买给孩子吃放心" + 0.029*"黑糖核桃吐司也很好吃" + 0.029*"很多面包没有了" + 0.029*"核桃香甜可口"

Topic: 2
Words: 0.029*"口味" + 0.029*"以前好吃" + 0.029*"砸招牌" + 0.029*"也没以前好吃了" + 0.029*"每周都会买几次当早餐" + 0.029*"现在感觉变贵好多了" + 0.029*"老婆饼好好吃" + 0.029*"店家地址在宝安区" + 0.029*"经常点御泥坊的面包" + 0.029*"御蝶坊的老顾客啦"

Topic: 3
Words: 0.029*"环境" + 0.029*"口味" + 0.029*"买给孩子吃放心" + 0.029*"老婆饼都好多人排队" + 0.029*"老婆饼yyds" + 0.029*"也没以前好吃了" + 0.029*"以前好吃" + 0.029*"现在感觉变贵好多了" + 0.029*"老婆饼好好吃" + 0.029*"算是老字号了吧"

Topic: 4
Words: 0.029*"有时需要排半个小时左右呢" + 0.029*"酥酥脆脆的" + 0.029*"口味" + 0.029*"环境" + 0.029*"也没以前好吃了" + 0.029*"老婆饼yyds" + 0.029*"一直买御蝶坊的面包" + 0.029*"算是老字号了吧" + 0.029*"办了会员卡" + 0.029*"核桃香甜可口"

