In [14]:
from collections import Counter
import jieba
import re

# 预定义的词典
LIWC_dict = {
    'positive_emotion': ['好吃', '喜欢', '美味'],
    'negative_emotion': ['贵', '太贵了', '昂贵'],
    # 可以添加更多类别和单词
}

# 和liwc软件的主要区别在于同义词的覆盖率和分类可能不足


In [15]:

def tokenize(text):
    # 使用 jieba 分词将文本分割为单词
    words = jieba.lcut(text)
    return words

def match_word(word, pattern):
    # 支持简单的词根匹配
    if pattern.endswith('*'):
        return word.startswith(pattern[:-1])
    return word == pattern

def analyze_text(file_path, LIWC_dict):
    # 读取文本文件
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # 将文本拆分为单词
    words = tokenize(text)
    total_words = len(words)
    unique_words = len(set(words))
    sentences = re.split(r'[。！？]', text)  # 更复杂的分句方法
    total_sentences = len(sentences)
    avg_sentence_length = total_words / total_sentences if total_sentences > 0 else 0

    # 计算每个类别的单词频率
    category_freq = {category: 0 for category in LIWC_dict}
    word_freq = Counter(words)

    for category, category_patterns in LIWC_dict.items():
        for pattern in category_patterns:
            for word in words:
                if match_word(word, pattern):
                    category_freq[category] += 1
    
    # 计算百分比
    category_percentage = {category: (freq / total_words) * 100 if total_words > 0 else 0 for category, freq in category_freq.items()}

    # 输出结果
    print("Category frequencies:")
    for category, freq in category_freq.items():
        print(f"{category}: {freq} ({category_percentage[category]:.2f}%)")

    print(f"Total words: {total_words}")
    print(f"Unique words: {unique_words}")
    print(f"Total sentences: {total_sentences}")
    print(f"Average sentence length: {avg_sentence_length:.2f}")


In [17]:
# 指定文本文件路径
file_path = '/data1/dxw_data/llm/mkt_llm/starbuck/starbuck_comments_all.txt'

# 分析文本文件并输出各类别单词的频率
analyze_text(file_path, LIWC_dict)

Category frequencies:
positive_emotion: 192 (1.75%)
negative_emotion: 15 (0.14%)
Total words: 10966
Unique words: 2045
Total sentences: 316
Average sentence length: 34.70
