In [25]:
import re
from collections import Counter
import jieba


In [29]:
def load_liwc_dict(dic_path):
    """Parse a LIWC-style ``.dic`` file into ``{category_name: [word, ...]}``.

    The file is read as UTF-8, one entry per line, fields separated by
    whitespace. Parsing starts in the *category* section; every line that
    begins with ``%`` switches between the category section and the word
    section.

    * Category line: ``<category_id> <category_name>`` (only the first two
      fields are used).
    * Word line: ``<word> <category_id> [<category_id> ...]``. The word is
      appended to the list of every category it references. A trailing ``*``
      on the word is kept verbatim.

    Blank and whitespace-only lines are ignored.

    NOTE(review): because parsing starts in the category section and flips on
    every ``%`` line, a file that opens with a ``%`` delimiter *before* its
    category list would have the category lines read as word lines. Confirm
    the layout of the dictionary file in use.

    Args:
        dic_path: Path of the dictionary file.

    Returns:
        dict mapping category name to the list of words assigned to it, in
        file order. Categories that no word references are absent.

    Raises:
        OSError: if the file cannot be opened.
        IndexError: if a category line has fewer than two fields.
        KeyError: if a word references a category id that was not declared.
    """
    categories = {}
    liwc_dict = {}

    category_section = True
    # Iterate the handle directly so the whole file is never held in memory.
    with open(dic_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                # An empty line has no fields; indexing parts[0] would fail.
                continue
            if line.startswith('%'):
                category_section = not category_section
                continue

            parts = line.split()
            if category_section:
                categories[parts[0]] = parts[1]
            else:
                word = parts[0]
                for category_id in parts[1:]:
                    category_name = categories[category_id]
                    liwc_dict.setdefault(category_name, []).append(word)

    return liwc_dict

def tokenize(text):
    """Segment ``text`` with jieba and return whatever ``jieba.lcut`` yields."""
    return jieba.lcut(text)

def match_word(word, pattern):
    """Tell whether ``word`` satisfies a dictionary ``pattern``.

    A pattern ending in ``*`` is a prefix pattern: the word matches when it
    starts with everything before the ``*``. Any other pattern must equal the
    word exactly.
    """
    if not pattern.endswith('*'):
        return word == pattern

    stem = pattern[:-1]
    return word.startswith(stem)

def _compile_theme_patterns(liwc_dict):
    """Split each theme's keywords into exact-match counts and wildcard prefixes."""
    compiled = []
    for theme, keywords in liwc_dict.items():
        exact = Counter()   # keyword -> number of times it is listed for this theme
        prefixes = []       # text before the trailing '*', duplicates kept
        for keyword in keywords:
            if keyword.endswith('*'):
                prefixes.append(keyword[:-1])
            else:
                exact[keyword] += 1
        compiled.append((theme, exact, prefixes))
    return compiled


def calculate_theme_distribution(texts, liwc_dict):
    """Compute, for every text, the share of dictionary hits that fall in each theme.

    Each text is split with ``tokenize``. A hit is one (token, keyword) pair
    that matches; keywords ending in ``*`` are prefix patterns and all others
    must equal the token exactly, which is the same rule ``match_word``
    applies. A keyword listed twice for a theme, or a token matched by several
    keywords of a theme, therefore counts once per matching pair.

    The keyword lists are classified once up front rather than once per token
    of every text, and exact keywords are resolved by hash lookup instead of a
    linear scan. The resulting counts are the same as comparing every token
    against every keyword.

    Args:
        texts: Iterable of raw text strings.
        liwc_dict: Mapping of theme name to a list of keyword patterns.

    Returns:
        A list with one dict per text, in input order. When the text has at
        least one hit, the dict maps each theme that was hit (in ``liwc_dict``
        order) to ``hits_in_theme / total_hits``. When the text has no hits,
        the dict maps *every* theme in ``liwc_dict`` to ``0``.
    """
    compiled = _compile_theme_patterns(liwc_dict)

    theme_distributions = []
    for text in texts:
        words = tokenize(text)

        theme_counts = Counter()
        for theme, exact, prefixes in compiled:
            # Counter lookups return 0 for unknown tokens without inserting them.
            hits = sum(exact[word] for word in words)
            hits += sum(1 for prefix in prefixes for word in words if word.startswith(prefix))
            if hits:
                # Only themes that were actually hit get a key.
                theme_counts[theme] = hits

        total_count = sum(theme_counts.values())
        if total_count > 0:
            theme_distribution = {theme: count / total_count for theme, count in theme_counts.items()}
        else:
            theme_distribution = {theme: 0 for theme in liwc_dict.keys()}
        theme_distributions.append(theme_distribution)

    return theme_distributions

In [30]:

# Ten theme texts assumed to be already known, as summarized by BERT.
# NOTE(review): bert_themes is not referenced again anywhere in this cell.
bert_themes = [
    "用户非常喜欢吃蛋挞和老婆饼，形容它们新鲜出炉，又香又脆，适合大人小孩都享用。",
    "用户表示经常光顾这家店，在这里办理过很多次卡，对店家的信誉感到满意。",
    "用户提到购买的是最近日期的产品，但口味有点发酸，怀疑是之前没卖完的重新装袋打日期卖的。这表明用户对店家的产品质量有一定的担忧。",
    "用户表示经常在他们家订生日蛋糕，强调店家的生日蛋糕款式多样且性价比高。",
    "用户注意到店家的蛋糕使用的是动物奶油，而非植物奶油，对此表示肯定。",
    "其他用户的评价。",
    "产品包装很好，送货及时。",
    "店铺环境干净整洁，服务人员友好。",
    "价格实惠，性价比高。",
    "种类繁多，选择多样。"
]

# Load the LIWC dictionary.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where that file exists.
liwc_dict = load_liwc_dict('/data1/dxw_data/llm/ML/LIWC/datasets/sc_liwc.dic')

# Sample raw review data.
texts = [
    "蛋挞和老婆饼新鲜出炉，又香又脆，适合大人小孩都享用。",
    "经常光顾这家店，办理过很多次卡，对店家的信誉感到满意。",
    "购买的是最近日期的产品，但口味有点发酸，怀疑是之前没卖完的重新装袋打日期卖的。对此有担忧。",
    "经常在他们家订生日蛋糕，款式多样且性价比高。",
    "店家的蛋糕使用的是动物奶油，而非植物奶油，对此表示肯定。"
]

# Compute the theme distribution of each text.
theme_distributions = calculate_theme_distribution(texts, liwc_dict)

# Print the results: a 1-based header per text, one "theme: proportion" line
# (two decimals) per theme in the distribution, then a blank separator line.
for i, distribution in enumerate(theme_distributions):
    print(f"文本 {i+1} 的主题分布：")
    for theme, proportion in distribution.items():
        print(f"{theme}: {proportion:.2f}")
    print()

文本 1 的主题分布：
cogmech: 0.31
incl: 0.25
social: 0.12
humans: 0.12
affect: 0.06
percept: 0.06
posemo: 0.06

文本 2 的主题分布：
cogmech: 0.09
relativ: 0.09
space: 0.05
funct: 0.05
time: 0.05
TenseM: 0.05
insight: 0.05
assent: 0.05
discrep: 0.05
affect: 0.14
Personal: 0.09
posemo: 0.09
PastM: 0.05
money: 0.05
work: 0.05
leisure: 0.05

文本 3 的主题分布：
cogmech: 0.05
relativ: 0.14
space: 0.03
social: 0.03
funct: 0.05
quant: 0.03
tentat: 0.05
time: 0.14
TenseM: 0.03
insight: 0.03
assent: 0.05
affect: 0.08
negemo: 0.08
percept: 0.03
anx: 0.05
Personal: 0.05
PastM: 0.03
money: 0.05

文本 4 的主题分布：
cogmech: 0.09
relativ: 0.27
space: 0.18
social: 0.09
funct: 0.09
quant: 0.09
time: 0.09
discrep: 0.09

文本 5 的主题分布：
cogmech: 0.27
funct: 0.09
certain: 0.09
insight: 0.09
assent: 0.09
negate: 0.09
affect: 0.09
posemo: 0.09
cause: 0.09

