In [9]:
import jieba
from collections import Counter

# Punctuation and high-frequency function words that carry no topical
# meaning.  Built once at import time instead of on every call.
_STOP_WORDS = frozenset([
    '，', '。', '的', '是', '在', '了', '和', '有', '就', '也', '都', '上', '中', '下',
])


def preprocess_text(text):
    """Segment Chinese text into words and drop uninformative tokens.

    Args:
        text: The raw comment text to tokenize.

    Returns:
        A list of tokens produced by ``jieba.cut``, in their original order,
        excluding stop words and whitespace-only tokens.
    """
    # jieba.cut passes whitespace runs through as tokens of their own;
    # without the strip() check they would be counted as "words".
    return [
        word
        for word in jieba.cut(text)
        if word.strip() and word not in _STOP_WORDS
    ]

def analyze_comments(comments, *, tokenizer=None, top_n=100, categories=None):
    """Count word frequencies across comments and total them per topic group.

    Each comment is expected to look like ``"<id> <text>"``: the first
    whitespace-delimited field is treated as an identifier and discarded,
    and the remainder is tokenized.  A comment containing no whitespace is
    treated as text without an identifier, and blank comments are skipped,
    so a single malformed entry no longer aborts the whole run.

    Args:
        comments: Iterable of comment strings.
        tokenizer: Callable mapping a text string to an iterable of words.
            Defaults to ``preprocess_text``.
        top_n: Number of most frequent words to keep.  Defaults to 100.
        categories: Mapping of group name to a list of keywords.  Defaults
            to the built-in service / books / price groups.

    Returns:
        A ``(top_words, category_freq)`` tuple.  ``top_words`` is a list of
        ``(word, count)`` pairs ordered from most to least frequent.
        ``category_freq`` maps each group name to the summed counts of the
        top words that contain at least one of the group's keywords.
    """
    if tokenizer is None:
        tokenizer = preprocess_text
    if categories is None:
        categories = {
            '服务': ['服务', '态度', '员工'],
            '图书': ['图书', '书籍', '品种'],
            '价格': ['价格', '优惠', '折扣'],
        }

    # Accumulate word counts over every comment.
    word_freq = Counter()
    for comment in comments:
        # split(None, 1) tolerates tabs, full-width spaces and repeated
        # separators, and never raises when the id field is missing.
        fields = comment.strip().split(None, 1)
        if not fields:
            continue
        # The last field is the text whether or not an id precedes it.
        word_freq.update(tokenizer(fields[-1]))

    top_words = word_freq.most_common(top_n)

    # Only the retained top words contribute to the per-group totals.  A
    # word is counted once per group even if it contains several keywords.
    category_freq = dict.fromkeys(categories, 0)
    for word, freq in top_words:
        for category, keywords in categories.items():
            if any(keyword in word for keyword in keywords):
                category_freq[category] += freq

    return top_words, category_freq

if __name__ == "__main__":
    # Sample reviews in "<id> <text>" form; extend this list with the full
    # set of 100 comments.
    comments = [
        "1 服务态度很好，员工热情。",
        "2 图书种类丰富，书籍很多。",
        "3 价格比较实惠，有很多优惠活动。",
        # ... (add the remaining comments)
    ]

    top_words, category_freq = analyze_comments(comments)

    # Report the ranked words first, one "word: count" pair per line.
    print("Top 100 Words:")
    for token, count in top_words:
        print(f"{token}: {count}")

    # Then the aggregated totals for each topic group.
    print("\nCategory Frequency:")
    for group in category_freq:
        print(f"{group}: {category_freq[group]}")




Top 100 Words:
很多: 2
服务态度: 1
很: 1
好: 1
员工: 1
热情: 1
图书: 1
种类: 1
丰富: 1
书籍: 1
价格: 1
比较: 1
实惠: 1
优惠活动: 1

Category Frequency:
服务: 2
图书: 2
价格: 2
