In [None]:
import pandas as pd
from tqdm.auto import tqdm
import sys

# Register tqdm for pandas progress_apply
tqdm.pandas()
# Load your final 3-class dataset
final_output_file = '../data/final_training_data_3class.csv'

df = pd.read_csv(final_output_file)

# Filter out only the neutral reviews (where sentiment == 1)
neutral_df = df[df['sentiment'] == 1].copy()

print(f"找到了 {len(neutral_df)} 条中性评论用于数据增强。")
neutral_df.head()

找到了 5006 条中性评论用于数据增强。


Unnamed: 0,sentiment,final_text
4,1,我第一次是在名古屋的一家商店里听到这首歌的，立刻爱上了凯莎（Ke$ha）的《Tik Tok》...
22,1,“这是一次芝加哥乐队的回顾专辑，这次是进口版。问题和以往一样。哥伦比亚唱片公司的录音远远胜过...
23,1,一般来说，我通常很享受翻唱专辑。但这张专辑例外。它既没有让汤姆·佩蒂也没有让卢·威廉姆斯得到...
69,1,这张录音是1979年11月在布鲁塞尔为比利时的Philippe Defalle唱片公司录制的...
127,1,“我对这张CD非常失望，因为里面没有歌词。<br />这些只是录音室的伴奏带。<br />虽...


In [9]:
import dashscope
from src import config # We need this to get the API key and model name
import time

# --- NEW Back-translation function using Qwen API ---

# First, make sure the API key is set up
# This function is from our previous train.py, we can reuse it here
def setup_api_key():
    if not config.QWEN_API_KEY:
        raise ValueError("QWEN_API_KEY environment variable not set.")
    dashscope.api_key = config.QWEN_API_KEY

# Call it once to set the key
setup_api_key()

def back_translate_qwen(text, lang='fr'):
    """
    Back-translates using the stable Qwen API.
    """
    try:
        # --- Step 1: Chinese to Foreign Language ---
        prompt1 = f"Please translate the following Chinese text to {lang}. Only return the translated text. Chinese text: '{text}'"
        response1 = dashscope.Generation.call(
            model=config.QWEN_MODEL_NAME, # We can use the fast 'qwen-turbo'
            prompt=prompt1
        )
        if response1.status_code != 200:
            print(f"Error in first translation step: {response1.message}")
            return None
        translated_text = response1.output.strip()

        # Add a very small delay, as official APIs are more robust
        time.sleep(0.2)

        # --- Step 2: Foreign Language back to Chinese ---
        prompt2 = f"Please translate the following {lang} text to Chinese. Only return the translated text. {lang.capitalize()} text: '{translated_text}'"
        response2 = dashscope.Generation.call(
            model=config.QWEN_MODEL_NAME,
            prompt=prompt2
        )
        if response2.status_code != 200:
            print(f"Error in second translation step: {response2.message}")
            return None
            
        back_translated_text = response2.output.strip()
        
        return back_translated_text
        
    except Exception as e:
        print(f"An unexpected error occurred during Qwen back-translation: {e}")
        return None

#### 将经过dataaugumentation代码数据增强之后的含英文数据进行最后的清洗：

In [None]:
import pandas as pd
import re

# --- 这是确保数据集100%纯净的最后一步 ---

# 1. 加载你最新的、经过数据增强的数据集
source_file = '../data/final_training_data_augmented.csv'
# 这将是我们用于训练的、最最最终版的文件
final_clean_file = '../data/TRAINING_DATASET_FINAL_DATAAUG.csv'

print(f"正在加载增强后的数据，来源: {source_file}")
df = pd.read_csv(source_file)
print(f"加载完成，共有 {len(df)} 行数据。")

# 2. 定义一个函数，用来检查文本中是否包含至少一个中文字符
def contains_chinese(text):
    """
    Returns True if the text contains at least one Chinese character.
    """
    # [\u4e00-\u9fa5] 是所有中文字符的Unicode范围
    return bool(re.search(r'[\u4e00-\u9fa5]', str(text)))

# 3. 筛选出所有包含中文的行
print("正在筛选，只保留包含中文的评论...")
# .apply() 方法会将函数应用到'final_text'列的每一行
is_chinese_mask = df['final_text'].apply(contains_chinese)

# 使用这个布尔值的"mask"来过滤DataFrame，只保留为True的行
df_clean = df[is_chinese_mask].copy()

print(f"筛选完成！包含中文的行数: {len(df_clean)}")
print(f"被剔除的英文行数: {len(df) - len(df_clean)}")

# 4. 保存最终的、纯净的中文数据集
df_clean.to_csv(final_clean_file, index=False, encoding='utf-8-sig')

print(f"\n--- 最终清洗完成！ ---")
print(f"一个全新的、只包含中文的数据集已保存到: {final_clean_file}")
print("这个文件才是我们应该用于最终训练的文件。")
print("\n最终纯净数据预览:")
print(df_clean.head())

正在加载增强后的数据，来源: ../data/final_training_data_augmented.csv
加载完成，共有 111644 行数据。
正在筛选，只保留包含中文的评论...
筛选完成！包含中文的行数: 29633
被剔除的英文行数: 82011

--- 最终清洗完成！ ---
一个全新的、只包含中文的数据集已保存到: ../data/TRAINING_DATASET_FINAL.csv
这个文件才是我们应该用于最终训练的文件。

最终纯净数据预览:
   sentiment                                         final_text
0          1  柴可夫斯基并没有轻率地创作他的第四交响曲；他表达了一个宏大的主题，总结了他许多内心的斗争。这...
4          1  这张CD并没有我预期的那么好。这张专辑最好的歌曲是《Home Sweet Home》的翻唱版...
6          2  这是一支引人入胜的复古金属乐队。我之前对他们的作品并不十分印象深刻，但在我看来，这张专辑是他...
7          1  这张专辑上有一些有趣的混音，其中一半对我来说非常出色，一个很有趣，还有一个我认为不好。让听众...
9          2                                    “我只希望它能有更多歌曲……”


### 清洗3分类数据


In [None]:
import pandas as pd
import re

# --- Configuration ---
# Input file: The 3-class dataset without augmentation 
source_file = '../data/final_training_data_3class.csv'
# Output file: A new, clean version of the above
output_file = '../data/final_training_data_3class_cleaned.csv'

print(f"--- Starting cleaning process for {source_file} ---")

# --- 1. Load the dataset ---
try:
    df = pd.read_csv(source_file)
    print(f"Successfully loaded {len(df)} rows.")
except FileNotFoundError:
    print(f"ERROR: File not found at {source_file}. Please make sure the file exists.")
    # In a notebook, this will stop the execution of this cell
    raise

# --- 2. Define a comprehensive text cleaning function ---
def clean_and_normalize(text):
    """
    This function cleans and normalizes a single piece of text.
    """
    if not isinstance(text, str):
        return ""
    # Remove HTML tags (e.g., <br>)
    text = re.sub(r'<.*?>', ' ', text)
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    # Replace newline characters and tabs with a space
    text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
    # Optional: Remove non-essential special characters, 
    # keeping Chinese, English, numbers, and basic punctuation.
    # You can customize the characters you want to keep inside the brackets.
    text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9\s.,!?\'"]', '', text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text).strip()
    return text

print("Normalizing text (removing HTML, extra spaces, etc.)...")
# Apply the function to create a new clean column, let's call it 'cleaned_text' for now
df['cleaned_text'] = df['final_text'].apply(clean_and_normalize)

# --- 3. Define a function to check for Chinese characters ---
def contains_chinese(text):
    """
    Returns True if the text contains at least one Chinese character.
    """
    return bool(re.search(r'[\u4e00-\u9fa5]', str(text)))

# --- 4. Filter out rows that do not contain Chinese text ---
print("Filtering out rows that are still in English...")
original_rows = len(df)
# Create a boolean mask: True for rows with Chinese, False otherwise
is_chinese_mask = df['cleaned_text'].apply(contains_chinese)
# Keep only the rows where the mask is True
df_clean = df[is_chinese_mask].copy()
removed_rows = original_rows - len(df_clean)
print(f"{removed_rows} English-only rows were removed.")

# --- 5. Select final columns and save the file ---
# We only need the sentiment and the cleaned text
final_df = df_clean[['sentiment', 'cleaned_text']]
# Rename 'cleaned_text' back to 'final_text' for consistency with the training script
final_df = final_df.rename(columns={'cleaned_text': 'final_text'})

final_df.to_csv(output_file, index=False, encoding='utf-8-sig')

print(f"\n--- Cleaning complete! ---")
print(f"Final dataset has {len(final_df)} rows.")
print(f"Clean file saved to: {output_file}")
print("\nPreview of the final clean data:")
print(final_df.head())

--- Starting cleaning process for ../data/final_training_data_3class_cleaned.csv ---
Successfully loaded 17958 rows.
Normalizing text (removing HTML, extra spaces, etc.)...
Filtering out rows that are still in English...
0 English-only rows were removed.

--- Cleaning complete! ---
Final dataset has 17958 rows.
Clean file saved to: ../data/final_training_data_3class_cleaned.csv

Preview of the final clean data:
   sentiment                                         final_text
0          2  如果我每听一次这张CD就有一美元每回让艾丽卡Alexa播放一次就有一美元的话我现在就会很有钱...
1          2                    出色的音效迫不及待想亲眼看到他们了当他们在城里时我总是想念他们
2          2                  这是一张很棒的CD音乐很好播放也很顺畅卖家回复非常快三天内就收到了
3          0                  这些不是真正的德国歌手他们有口音这和他们宣传的完全不一样音乐太差了
4          1  我第一次是在名古屋的一家商店里听到这首歌的立刻爱上了凯莎Keha的Tik Tok混音版在日本...
