In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import corpora
from gensim.models import LdaModel
import pandas as pd
import datetime

## 主题分析

In [None]:
# Load the pre-tokenized corpus. Each row of the '分词后内容' column is treated
# as one document whose tokens are separated by whitespace.
df = pd.read_excel('data分词后.xlsx', header=0)

# A blank spreadsheet cell is read back as float NaN, which has no .split().
# Normalise such cells to empty strings so the row is kept (as an empty
# document) and both lists below stay aligned with the rows of `df`.
tokenized_column = df['分词后内容'].fillna('').astype(str)

# Raw document strings, one per row.
all_texts = tokenized_column.tolist()

# Token lists, one per row. split() without an argument collapses runs of
# whitespace, so doubled/leading/trailing spaces no longer produce
# empty-string tokens that would end up in the vocabulary and in the
# co-occurrence counts.
all_words = [text.split() for text in all_texts]

In [None]:
print('开始时间：' + str(datetime.datetime.now()))
# Build the vocabulary and convert every tokenized document to bag-of-words.
dictionary = corpora.Dictionary(all_words)
corpus = [dictionary.doc2bow(comment) for comment in all_words]

# LDA model parameters.
num_topics = 5  # number of topics to extract
random_seed = 42  # fixed seed so a fresh kernel reproduces the same topics
lda_model = LdaModel(
    corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=15,
    random_state=random_seed,
)

# Assign every document to its most probable topic.
topic_ids = []
for text_bow in corpus:
    # Ask for the full distribution: with the default probability cutoff a
    # document could get an empty list back when num_topics is large, and
    # max() would then raise ValueError.
    topic_distribution = lda_model.get_document_topics(text_bow, minimum_probability=0)
    topic_ids.append(max(topic_distribution, key=lambda x: x[1])[0])

# Human-readable, 1-based topic labels.
topic_list = [f"主题{topic_id + 1}" for topic_id in topic_ids]

# Write the topic assignment to Excel.
# NOTE: this rebinds `df` to the topic-assignment table, as the original cell did.
data = {'主题名': topic_list, '分词后内容': all_texts}
df = (
    pd.DataFrame(data)
    # Sort on the numeric topic id rather than the label string, so that
    # "主题10" does not land before "主题2" if num_topics is raised; the
    # stable sort keeps documents in their original order within a topic.
    .assign(_topic_id=topic_ids)
    .sort_values(by='_topic_id', kind='stable')
    .drop(columns='_topic_id')
)
df.to_excel('data主题分类后.xlsx', index=False)

# Print the keywords of every topic.
for idx, topic in lda_model.print_topics(-1):
    print(f"主题 {idx + 1}: {topic}")
print('\n结束时间：' + str(datetime.datetime.now()))

## 词共现分析

In [None]:
from collections import defaultdict, Counter
print('开始时间：' + str(datetime.datetime.now()))
# Co-occurrence table: co_occurrence_matrix[w][v] counts how many times v
# appears within `window_size` positions of w inside the same sentence.
co_occurrence_matrix = defaultdict(Counter)
window_size = 2  # how many neighbours to look at on each side
top_n = 10  # how many co-occurring words to report per target word

# Walk over every tokenized sentence.
for sentence in all_words:
    # For each word, count the neighbours that fall inside the window.
    for i, word in enumerate(sentence):
        for j in range(1, window_size + 1):
            # Neighbour j positions to the left, if it exists.
            if i - j >= 0:
                left_word = sentence[i - j]
                co_occurrence_matrix[word][left_word] += 1
            # Neighbour j positions to the right, if it exists.
            if i + j < len(sentence):
                right_word = sentence[i + j]
                co_occurrence_matrix[word][right_word] += 1

# Words to look up (the four words that occur most often across all three
# songs according to the word-frequency statistics).
target_words = ['喜欢', '幸福', '永远', '真的']
# Print the top `top_n` co-occurring words and their counts for each target.
for target_word in target_words:
    # Membership test does not create an entry in the defaultdict.
    if target_word in co_occurrence_matrix:
        co_occurrences = co_occurrence_matrix[target_word]
        # most_common(n) is equivalent to sorting by count in descending
        # order and taking the first n items.
        top_co_occurrences = co_occurrences.most_common(top_n)
        # The header is built from top_n so it always matches the number of
        # entries actually requested (it previously claimed 20 while 10
        # were printed).
        print(f'{target_word} 的前{top_n}名共现词及其出现次数:')
        for co_word, count in top_co_occurrences:
            print(f'  {co_word}: {count}',end='')
        print()
    else:
        print(f'{target_word} 没有共现词。')

print('\n结束时间：' + str(datetime.datetime.now()))