In [None]:
import pandas as pd
import jieba
import emoji
import re
from tqdm import tqdm

# Load the raw posts table from CSV.
file_path = '/data1/dxw_data/llm/redbook_final/script_next/rawdata_20%.csv'
df = pd.read_csv(file_path)

# Step 1: Combine post_title, post_content, and post_tag into combind_text.
# Missing cells become '' and every value is coerced to str before joining:
# str.join raises TypeError on non-string items, which would happen if pandas
# parsed any of these columns as numeric.
text_columns = ['post_title', 'post_content', 'post_tag']
df['combind_text'] = df[text_columns].fillna('').astype(str).agg(' '.join, axis=1)

# Load stop words: one entry per line, surrounding whitespace removed.
stopwords_path = '/data1/dxw_data/llm/Multimodal-MKT/label/text-cluster/stopwords_cn.txt'
with open(stopwords_path, 'r', encoding='utf-8') as f:
    stopwords = {line.strip() for line in f}
# Remove stop words, punctuation, numbers, English letters, and convert emojis
def preprocess_text(text):
    """Reduce a raw post string to space-separated Chinese tokens.

    The text is passed through ``emoji.demojize``, then every character that
    is neither whitespace nor in the range U+4E00-U+9FA5 is deleted. The
    remainder is segmented with ``jieba.cut``; whitespace-only tokens and
    tokens present in the module-level ``stopwords`` set are dropped.

    Args:
        text: The string to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # NOTE(review): demojize is called with its default settings; its English
    # ":name:" output appears to be removed entirely by the filter below, so
    # emojis are effectively deleted rather than converted. Confirm whether a
    # Chinese rendering (language='zh') was intended.
    demojized = emoji.demojize(text)

    # Keep only CJK characters in U+4E00-U+9FA5 and whitespace.
    chinese_only = re.sub(r'[^\u4e00-\u9fa5\s]', '', demojized)

    kept_tokens = []
    for token in jieba.cut(chinese_only):
        if not token.strip():
            # jieba yields whitespace runs as tokens; skip them.
            continue
        if token in stopwords:
            continue
        kept_tokens.append(token)

    return ' '.join(kept_tokens)

# Clean every combined post with preprocess_text (progress bar shown) and
# overwrite combind_text with the cleaned version.
cleaned_texts = []
for raw_text in tqdm(df['combind_text'], desc='Preprocessing Text'):
    cleaned_texts.append(preprocess_text(raw_text))
df['combind_text'] = cleaned_texts

# Step 2: Segment the cleaned text using jieba and store it space-joined.
# NOTE(review): combind_text is already the space-joined output of jieba.cut
# from preprocess_text, so this pass runs jieba a second time - confirm that
# the extra pass is needed.
segmented_texts = []
for cleaned_text in tqdm(df['combind_text'], desc='Segmenting Text'):
    segmented_texts.append(' '.join(jieba.cut(cleaned_text)))
df['segmented_text'] = segmented_texts

# Prefixes that disqualify a token in find_matching_words: Chinese numeral
# characters plus the string '小红书' (the first entry is not a numeral,
# despite the variable name).
chinese_numbers = ['小红书', '一', '二', '两', '三', '四', '五', '六', '七', '八', '九', '十', '百', '千', '万', '亿']

# Step 3: Suffix keywords. find_matching_words keeps a token when it ends
# with any of these strings.
# Several entries are repeated (e.g. "款", "服", "搭配"); repeats do not
# change the result of the any()-based suffix check.
# NOTE(review): "白t" contains a Latin letter, but preprocess_text deletes
# every non-Chinese, non-whitespace character, so this entry looks
# unreachable - confirm.
# NOTE(review): "村衫" may be a typo for "衬衫" (which is also listed) - confirm.
key_words = [
    "裙", "裙子", "项链", "配饰", "裤", "吊带", "风格", "饰品", "单品", "衬衫", "身材", "耳环", "主义", "混搭", 
    "手链", "元素", "绒", "肩", "鞋子", "瘦", "套装", "款", "毛", "吊坠", "造型", "型", "饰", "袜", 
    "马甲", "系", "夹克", "裳", "推荐", "服", "衣服", "靴", "款", "白t", "搭配", "恤", "大衣", "头", "风", 
    "毛衣", "服", "内搭", "靴子", "链", "套装", "头发", "背心", "毛衣", "外套", "帽", "发型", "包", "衣", 
    "戒指", "鞋", "衫", "袍", "手镯", "单品", "装", "镜", "帽子", "袖", "风", "感", "系", "型", "搭", "装", 
    "式", "派", "调", "潮", "范", "领", "色", "款", "裤", "裙", "穿", "搭", "夏", "春", "秋", "冬", "鞋", 
    "白", "季", "白", "红", "黑", "蓝", "绿", "黄", "紫", "灰", "衣", "服", "套", "包", "潮流", "时尚", 
    "复古", "简约", "休闲", "通勤", "街头", "个性", "优雅", "气质", "名媛", "甜美", "清新", "叠穿", 
    "搭配", "混搭", "色彩", "质感", "配饰", "外套", "毛衣", "村衫", "牛仔", "婚礼", "度假", "派对", 
    "职场", "约会", "旅行"
]

# Function to check if any word matches a key word and does not start with chinese_numbers
def find_matching_words(segmented_text, key_words, chinese_numbers):
    """Collect tokens that end with a keyword and do not start with an excluded prefix.

    Args:
        segmented_text: String of whitespace-separated tokens.
        key_words: Suffix strings; a token qualifies when it ends with any of them.
        chinese_numbers: Prefix strings; a token starting with any of them is skipped.

    Returns:
        The qualifying tokens as a list, in order of appearance, duplicates kept.
    """
    # str.startswith / str.endswith accept a tuple of candidates.
    excluded_prefixes = tuple(chinese_numbers)
    wanted_suffixes = tuple(key_words)

    return [
        token
        for token in segmented_text.split()
        if not token.startswith(excluded_prefixes) and token.endswith(wanted_suffixes)
    ]

# Collect the distinct matching tokens across all posts and write them,
# sorted, one per line, to output_file.
output_file = '/data1/dxw_data/llm/RA/cuhk_xinyu/matching_words_combined_unique3.txt'
unique_terms = set()

# Iterate the column directly: only segmented_text is read, and
# DataFrame.iterrows() would build a full Series object for every row.
for segmented_text in tqdm(df['segmented_text'], total=len(df), desc='Finding Matching Words'):
    unique_terms.update(find_matching_words(segmented_text, key_words, chinese_numbers))

# The set has already removed duplicates; sort for easier reading.
with open(output_file, 'w', encoding='utf-8') as f:
    f.writelines(f"{term}\n" for term in sorted(unique_terms))

print("Processing complete. Unique matching words saved to:", output_file)


In [2]:
import csv

# Input CSV files; each row is read through its 'word' and 'output_label' columns.
input_files = [
    '/data1/dxw_data/llm/RA/cuhk_xinyu/dataset/filtered_notstyle_dataset-output.csv',
    '/data1/dxw_data/llm/RA/cuhk_xinyu/dataset/filtered_style_dataset-output.csv'  # second input file
]

# Output text files (relative paths, one word per line).
output_label_1_file = 'combined_output_label_1.txt'
output_label_0_file = 'combined_output_label_0.txt'

# Open both output files once so words from every input accumulate in them.
with open(output_label_1_file, mode='w', encoding='utf-8') as label_1_file, \
     open(output_label_0_file, mode='w', encoding='utf-8') as label_0_file:

    # Process the input files one by one.
    for input_file in input_files:
        # newline='' leaves line-ending handling to the csv module, as its
        # documentation requires; otherwise quoted fields containing line
        # breaks may be parsed incorrectly.
        with open(input_file, mode='r', encoding='utf-8', newline='') as infile:
            reader = csv.DictReader(infile)

            # Route each word by its label; rows whose label is neither
            # '1' nor '0' are skipped.
            for row in reader:
                word = row['word']
                output_label = row['output_label']

                if output_label == '1':
                    label_1_file.write(f"{word}\n")
                elif output_label == '0':
                    label_0_file.write(f"{word}\n")

print("处理完成，文件已保存。")


处理完成，文件已保存。
