In [32]:
import os
import time
import requests
import pandas as pd
import sqlite3
from datetime import datetime
import random
import json
import csv
import re
from IPython.display import display, HTML
from tqdm.notebook import tqdm

# OpenAI APIConfiguration
OPENAI_API_KEY = "your_api_config_file.json"  # Please replace with your actual API key
MODEL = "gpt-4.1-mini-2025-04-14"
API_URL = "https://api.openai.com/v1/chat/completions"

# DatabaseConfiguration
DB_NAME = "propaganda_techniques_analysis.db"

# Analysis type constants
ANALYSIS_TYPE_MISSED = "missed"  # True labels存inbutPredictionnotcontains
ANALYSIS_TYPE_FALSE_POSITIVE = "false_positive"  # True labels不存inbutPredictioncontains
ANALYSIS_TYPE_CORRECT = "correct"  # Correctly identifiedof案例
ANALYSIS_TYPE_CONFUSED = "confused"  # byConfusion为Other labelsof案例

# ReadCSVFile
def read_csv_file(file_path):
    try:
        # 首先尝试use制Table符作为分隔符
        df = pd.read_csv(file_path, sep='\t', encoding='utf-8')
        
        # Check是否Correct解析 - 应该have多个列
        if len(df.columns) == 1 and ',' in df.columns[0]:
            print("警告：FilenotCorrect解析，尝试use逗号分隔符...")
            # 尝试use逗号分隔符
            df = pd.read_csv(file_path, sep=',', encoding='utf-8')
            
        # Check again
        if 'true_labels' not in df.columns:
            print(f"当前列名: {df.columns.tolist()}")
            print("尝试其他解析方法...")
            # 尝试不同ofRead方式
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
                # Check内容中of分隔符
                if '\t' in content:
                    print("File内容contains制Table符，butpandasnotCorrect解析")
                else:
                    print("File内容不contains制Table符，Checkactual分隔符")
                
                # 尝试手动解析
                lines = content.strip().split('\n')
                headers = lines[0].strip().split(',')  # Assume header is comma-separated
                print(f"手动解析of列名: {headers}")
                
                # Create自定义DataFrame
                data = []
                for line in lines[1:]:
                    values = line.strip().split(',')
                    row_data = {headers[i]: values[i] for i in range(min(len(headers), len(values)))}
                    data.append(row_data)
                
                df = pd.DataFrame(data)
        
        print(f"最终解析Result - 列名: {df.columns.tolist()}, 形状: {df.shape}")
        return df
        
    except Exception as e:
        print(f"Read file出错: {str(e)}")
        return pd.DataFrame()

# 显示CSVinformation
def display_csv_info(df):
    print(f"CSVFilecontains {len(df)} rowsData")
    print(f"列名: {', '.join(df.columns)}")
    
    # 显示前几rowsData
    display(df.head())
    
    # Analyze label distribution
    true_labels_count = {}
    pred_labels_count = {}
    
    for index, row in df.iterrows():
        if isinstance(row['true_labels'], str):
            labels = row['true_labels'].split(',')
            for label in labels:
                true_labels_count[label] = true_labels_count.get(label, 0) + 1
        
        if isinstance(row['pred_labels'], str):
            labels = row['pred_labels'].split(',')
            for label in labels:
                pred_labels_count[label] = pred_labels_count.get(label, 0) + 1
    
    # convertCountConvert为DataFrame并显示
    true_df = pd.DataFrame(list(true_labels_count.items()), columns=['Label', 'True labels出现次数'])
    true_df = true_df.sort_values('True labels出现次数', ascending=False)
    
    pred_df = pd.DataFrame(list(pred_labels_count.items()), columns=['Label', 'Prediction labels出现次数'])
    pred_df = pred_df.sort_values('Prediction labels出现次数', ascending=False)
    
    # 合并两个DataFrame
    merged_df = pd.merge(true_df, pred_df, on='Label', how='outer').fillna(0)
    merged_df['差异'] = merged_df['True labels出现次数'] - merged_df['Prediction labels出现次数']
    merged_df = merged_df.sort_values('差异', ascending=False)
    
    print("\nLabel distribution情况:")
    display(merged_df)
    
    return merged_df


# 构建GPTPrompt
def build_prompt(row, target_label, analysis_type, confused_label=None):
    if analysis_type == ANALYSIS_TYPE_MISSED:
        prompt = f"""请Analysis以下Text中出现of宣传technique，特别关注"{target_label}":

Text: {row['text']}

True labels中contains"{target_label}"，but prediction labels do not contain。请用英语Analysis以下问题，并convert回答合并为一个连贯of段落，Total长度不超过150English words:

1. This articleText中contains{target_label}of具体原因是什么？
2. Text中支持This一判断of关键词或句子是什么？
3. {target_label}This种techniqueof典型特征如何inText中体现？
4. Model应该关注Text中of哪些额外特征来更准确地识别This种technique？

请注意：所have回答必须合并成单个段落，Do not use bullet points or numbering，确保内容连贯流畅且不超过150words。"""
    
    elif analysis_type == ANALYSIS_TYPE_FALSE_POSITIVE:
        prompt = f"""请Analysis以下Text中关于宣传technique"{target_label}"ofPredictionError:

Text: {row['text']}

Prediction labels中contains"{target_label}"，but true labels do not contain。请用英语Analysis以下问题，并convert回答合并为一个连贯of段落，Total长度不超过150English words:

1. This articleText中不contains{target_label}of具体原因是什么？
2. Text中支持This一判断of关键词或句子是什么？
3. {target_label}This种techniqueof典型特征如何inText中体现？
4. Modelin识别{target_label}时可能存in哪些误解或Error模式？

请注意：所have回答必须合并成单个段落，Do not use bullet points or numbering，确保内容连贯流畅且不超过150words。"""
    
    elif analysis_type == ANALYSIS_TYPE_CORRECT:
        prompt = f"""请Analysis以下Text中Correctly identifiedof宣传technique"{target_label}":

Text: {row['text']}

True labelsandPrediction labels都contains"{target_label}"。请用英语Analysis以下问题，并convert回答合并为一个连贯of段落，Total长度不超过150English words:

1. This articleText中contains{target_label}of具体原因是什么？
2. Text中支持This一判断of关键词或句子是什么？
3. {target_label}This种techniqueof典型特征如何inText中体现？
4. ModelCorrectly identifiedThis种techniqueof关键是什么？

请注意：所have回答必须合并成单个段落，Do not use bullet points or numbering，确保内容连贯流畅且不超过150words。"""
    
    elif analysis_type == ANALYSIS_TYPE_CONFUSED:
        if not confused_label:
            confused_label = "Other labels"
        
        prompt = f"""请Analysis以下Text中宣传techniqueLabelConfusionof情况:

Text: {row['text']}

True labels是"{target_label}"，but predicted as"{confused_label}"。请用英语Analysis以下问题，并convert回答合并为一个连贯of段落，Total长度不超过150English words:

1. Why this text is more consistent with{target_label}而不是{confused_label}？
2. This两种techniquehave什么关键区别？
3. Model可能ConfusionThis两种techniqueof原因是什么？

请注意：所have回答必须合并成单个段落，Do not use bullet points or numbering，确保内容连贯流畅且不超过150words。"""
    
    return prompt

# CallGPT API
def call_gpt_api(prompt):
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {OPENAI_API_KEY}"
    }
    
    data = {
        "model": MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.3,
        "max_tokens": 200  # 限制GeneratedtokenQuantity，约对应150English words
    }
    
    try:
        response = requests.post(API_URL, headers=headers, json=data)
        response.raise_for_status()
        content = response.json()["choices"][0]["message"]["content"]
        
        # Check内容长度，如果超过150words，进rows截断
        words = content.split()
        if len(words) > 150:
            content = ' '.join(words[:150]) + "..."
            print("回答by截断至150words")
        
        # 确保Output是单个段落，移除多余of换rowsand标点
        content = content.replace("\n", " ").replace("\r", " ")
        while "  " in content:
            content = content.replace("  ", " ")
            
        # 移除可能of项目符号或Number
        content = re.sub(r'^\s*[\d\.\-\*]+\s*', '', content)
        content = re.sub(r'\n\s*[\d\.\-\*]+\s*', ' ', content)
        
        return content
    except requests.exceptions.RequestException as e:
        print(f"APIRequestError: {e}")
        # 添加随机延迟避免Frequency限制
        time.sleep(random.uniform(1, 3))
        return None

# Initialize database并Create table（如果不存in）
# Initialize database并Create table（如果不存in）
def initialize_database():
    conn = sqlite3.connect(DB_NAME)
    cursor = conn.cursor()
    
    # Create table
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS label_analysis (
        id TEXT,
        text TEXT,
        true_labels TEXT,
        pred_labels TEXT,
        language TEXT,
        analysis TEXT,
        target_label TEXT,
        analysis_type TEXT,
        confused_label TEXT,
        created_at TIMESTAMP,
        PRIMARY KEY (id, target_label, analysis_type, confused_label)
    )
    ''')
    
    conn.commit()
    return conn, cursor

# convertAnalysisResultStore to database
# convertAnalysisResultStore to database
def save_to_db(conn, cursor, row, analysis, target_label, analysis_type, confused_label=None):
    # CheckRecord是否Already exists
    if confused_label:
        cursor.execute(
            "SELECT id FROM label_analysis WHERE id = ? AND target_label = ? AND analysis_type = ? AND confused_label = ?", 
            (row['id'], target_label, analysis_type, confused_label)
        )
    else:
        cursor.execute(
            "SELECT id FROM label_analysis WHERE id = ? AND target_label = ? AND analysis_type = ?", 
            (row['id'], target_label, analysis_type)
        )
    exists = cursor.fetchone()
    
    if exists:
        # Update现haveRecord
        if confused_label:
            cursor.execute('''
            UPDATE label_analysis 
            SET analysis = ?, created_at = ?
            WHERE id = ? AND target_label = ? AND analysis_type = ? AND confused_label = ?
            ''', (analysis, datetime.now(), row['id'], target_label, analysis_type, confused_label))
        else:
            cursor.execute('''
            UPDATE label_analysis 
            SET analysis = ?, created_at = ?
            WHERE id = ? AND target_label = ? AND analysis_type = ?
            ''', (analysis, datetime.now(), row['id'], target_label, analysis_type))
    else:
        # Insert新Record
        cursor.execute('''
        INSERT INTO label_analysis 
        (id, text, true_labels, pred_labels, language, analysis, target_label, analysis_type, confused_label, created_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ''', (
            row['id'], 
            row['text'], 
            row['true_labels'], 
            row['pred_labels'], 
            row['language'], 
            analysis,
            target_label,
            analysis_type,
            confused_label,
            datetime.now()
        ))
    
    conn.commit()

# Get analyzed records from database
def get_analyzed_records(target_label=None, analysis_type=None):
    conn = sqlite3.connect(DB_NAME)
    conn.row_factory = sqlite3.Row
    cursor = conn.cursor()
    
    query = "SELECT * FROM label_analysis WHERE 1=1"
    params = []
    
    if target_label:
        query += " AND target_label = ?"
        params.append(target_label)
    
    if analysis_type:
        query += " AND analysis_type = ?"
        params.append(analysis_type)
    
    cursor.execute(query, params)
    rows = cursor.fetchall()
    
    # Convert为pandas DataFrame
    if rows:
        df = pd.DataFrame([dict(row) for row in rows])
    else:
        df = pd.DataFrame(columns=['id', 'text', 'true_labels', 'pred_labels', 
                                 'language', 'analysis', 'target_label', 
                                 'analysis_type', 'created_at'])
    
    conn.close()
    return df

# Main function - Analyze unrecognized labels（Missed）
def analyze_missed_label(csv_file_path, target_label, max_samples=None, display_progress=True):
    return analyze_label(csv_file_path, target_label, ANALYSIS_TYPE_MISSED, max_samples, display_progress)

# Main function - AnalysisErrorPredictionlabels（误报）
def analyze_false_positive_label(csv_file_path, target_label, max_samples=None, display_progress=True):
    return analyze_label(csv_file_path, target_label, ANALYSIS_TYPE_FALSE_POSITIVE, max_samples, display_progress)

# Main function - Analyze correctly identified labels
def analyze_correct_label(csv_file_path, target_label, max_samples=None, display_progress=True):
    return analyze_label(csv_file_path, target_label, ANALYSIS_TYPE_CORRECT, max_samples, display_progress)

# Main function - Analyze confused labels
def analyze_confused_label(csv_file_path, target_label, confused_label, max_samples=None, display_progress=True):
    return analyze_label(csv_file_path, target_label, ANALYSIS_TYPE_CONFUSED, max_samples, display_progress, confused_label)



def save_as_tsv(df, file_path):
    """
    usecsv模块直接convertDataFrameSave asTSVFile
    This样可以完全控制Output格式，避免pandasofAutomaticProcess逻辑
    """
    with open(file_path, 'w', newline='', encoding='utf-8') as tsv_file:
        # CreateTSVwriter，明确指定分隔符为制Table符
        writer = csv.writer(tsv_file, delimiter='\t')
        
        # Write column names
        writer.writerow(df.columns)
        
        # Write each row of data
        for _, row in df.iterrows():
            writer.writerow(row.values)
            
    print(f"DataSuccessfully saved为TSVFile: {file_path}")
    return file_path
# 通用Analysis函数
def analyze_label(csv_file_path, target_label, analysis_type, max_samples=None, display_progress=True, confused_label=None):
    analysis_type_names = {
        ANALYSIS_TYPE_MISSED: "Missed",
        ANALYSIS_TYPE_FALSE_POSITIVE: "误报",
        ANALYSIS_TYPE_CORRECT: "Correctly identified",
        ANALYSIS_TYPE_CONFUSED: "Confused labels"
    }
    analysis_type_name = analysis_type_names.get(analysis_type, "not知")
    
    confused_info = f"(Confusion为{confused_label})" if analysis_type == ANALYSIS_TYPE_CONFUSED and confused_label else ""
    print(f"StartAnalyze labels: {target_label} {confused_info}({analysis_type_name})")
    
    # Initialize database
    conn, cursor = initialize_database()
    
    # ReadCSV
    df = read_csv_file(csv_file_path)
    # 调试information
    print(f"ReadofDataFrame列名: {df.columns.tolist()}")
    print(f"DataFrame形状: {df.shape}")
    print("前几rowsData:")
    print(df.head())

    # Prepare new dataframe for analysis results
    result_df = df.copy()
    result_df['analysis'] = ""
    # 确保Label列为字符串Type
    df['true_labels'] = df['true_labels'].astype(str)
    df['pred_labels'] = df['pred_labels'].astype(str)
    
    # 根据Analysis type筛选Record
    if analysis_type == ANALYSIS_TYPE_MISSED:
        # 筛选True labels中存inbutPrediction labels中不存inrecords
        filtered_df = df[
            (df['true_labels'].str.contains(target_label, na=False)) & 
            (~df['pred_labels'].str.contains(target_label, na=False))
        ]
        print(f"Found{len(filtered_df)}records含have{target_label}but records where predictions do not contain this label（Missed）")
    
    elif analysis_type == ANALYSIS_TYPE_FALSE_POSITIVE:
        # 筛选Prediction labels中存inbutTrue labels中不存inrecords
        filtered_df = df[
            (~df['true_labels'].str.contains(target_label, na=False)) & 
            (df['pred_labels'].str.contains(target_label, na=False))
        ]
        print(f"Found{len(filtered_df)}recordsPredicted as{target_label}but records where true labels do not contain this label（误报）")
    
    elif analysis_type == ANALYSIS_TYPE_CORRECT:
        # 筛选True labelsandPrediction labels中都存inrecords
        filtered_df = df[
            (df['true_labels'].str.contains(target_label, na=False)) & 
            (df['pred_labels'].str.contains(target_label, na=False))
        ]
        print(f"Found{len(filtered_df)}recordsTrue labelsandPrediction labels都contains{target_label}records（Correctly identified）")
    
    elif analysis_type == ANALYSIS_TYPE_CONFUSED:
        # 筛选True labels是target_labelbut prediction labels containconfused_labelrecords
        if not confused_label:
            print("Error：ConfusionLabel analysisNeed提供confused_label参数")
            return pd.DataFrame(), ""
        
        filtered_df = df[
            (df['true_labels'].str.contains(target_label, na=False)) & 
            (~df['pred_labels'].str.contains(target_label, na=False)) &
            (df['pred_labels'].str.contains(confused_label, na=False))
        ]
        print(f"Found{len(filtered_df)}recordsTrue labels为{target_label}but predicted as{confused_label}records（Confused labels）")
    
    # 限制ProcessNumber of samples
    if max_samples and max_samples < len(filtered_df):
        filtered_df = filtered_df.sample(max_samples, random_state=42)
        print(f"已随机选择{max_samples}records进rowsAnalysis")
    
    # CreateResultOutput file（CSV）
    if analysis_type == ANALYSIS_TYPE_CONFUSED:
        output_file_name = f"{target_label.lower()}_confused_as_{confused_label.lower()}_analysis_results.csv"
    else:
        output_file_name = f"{target_label.lower()}_{analysis_type}_analysis_results.csv"
    
    # Process每一records
    for index, row in tqdm(filtered_df.iterrows(), total=len(filtered_df), desc=f"Analysis{target_label}({analysis_type_name})") if display_progress else filtered_df.iterrows():
        # CheckDatabase中是否已have此Record
        if analysis_type == ANALYSIS_TYPE_CONFUSED:
            cursor.execute(
                "SELECT analysis FROM label_analysis WHERE id = ? AND target_label = ? AND analysis_type = ? AND confused_label = ?", 
                (row['id'], target_label, analysis_type, confused_label)
            )
        else:
            cursor.execute(
                "SELECT analysis FROM label_analysis WHERE id = ? AND target_label = ? AND analysis_type = ?", 
                (row['id'], target_label, analysis_type)
            )
        existing = cursor.fetchone()
        
        if existing:
            # Use existing analysis results
            analysis = existing[0]
            print(f"Record {row['id']} Already exists于Database中，跳过APICall")
        else:
            # 构建Prompt并CallAPI
            prompt = build_prompt(row, target_label, analysis_type, confused_label)
            analysis = call_gpt_api(prompt)
            
            if analysis:
                # Save to database，统一use一个函数
                save_to_db(conn, cursor, row, analysis, target_label, analysis_type, confused_label)
            else:
                print(f"Record {row['id']} Process失败")
                continue
        
        # inResultDataframe中Update analysis results
        result_df.loc[result_df['id'] == row['id'], 'analysis'] = analysis
    
    # Save results toCSVFile
    filtered_indices = result_df['id'].isin(filtered_df['id'])
    filtered_result_df = result_df[filtered_indices]
    
    # convert缺失ofAnalysis设为Empty字符串
    filtered_result_df['analysis'] = filtered_result_df['analysis'].fillna("")
    
    # 确保CSV格式与原始格式相同，并in最后添加Analysis列
    output_file_name = output_file_name.replace('.csv', '.tsv')
    # Fixed code
    save_as_tsv(filtered_result_df, output_file_name)
    
    conn.close()
    print(f"All processing completed！ResultSaved到{output_file_name}andDatabase {DB_NAME}")
    
    # 显示Result预览
    display(filtered_result_df)
    
    return filtered_result_df, output_file_name


# 多LabelProcess函数
def process_multiple_labels(csv_file_path, labels, analysis_types=None, max_samples=None, confused_pairs=None):
    if analysis_types is None:
        # 默认同时AnalysisMissedand误报
        analysis_types = [ANALYSIS_TYPE_MISSED, ANALYSIS_TYPE_FALSE_POSITIVE]
    
    results = {}
    summary_data = []
    
    # ReadCSVto get original format
    df = read_csv_file(csv_file_path)
    
    # Prepare final merged results dataframe
    final_result_df = df.copy()
    final_result_df['analysis'] = ""
    
    # Process常规Analysis type（非ConfusionType）
    for label in labels:
        for analysis_type in [at for at in analysis_types if at != ANALYSIS_TYPE_CONFUSED]:
            analysis_type_name = "Missed" if analysis_type == ANALYSIS_TYPE_MISSED else "误报" if analysis_type == ANALYSIS_TYPE_FALSE_POSITIVE else "Correctly identified"
            print(f"\nStart processingLabel: {label} ({analysis_type_name})")
            
            # Analyze labels
            results_df, output_file = analyze_label(
                csv_file_path, 
                label, 
                analysis_type,
                max_samples=max_samples
            )
            
            # Update最终ResultDataframe
            for idx, row in results_df.iterrows():
                if row['analysis']:  # 只UpdatehaveAnalysisResultofrows
                    # in现haveResult上添加LabelandAnalysis typeinformation
                    analysis_with_tag = f"[{label}|{analysis_type_name}] {row['analysis']}"
                    
                    # 如果该rows已have其他Analysis，则追加；否则直接赋值
                    current_analysis = final_result_df.loc[idx, 'analysis']
                    if current_analysis:
                        final_result_df.loc[idx, 'analysis'] = f"{current_analysis} | {analysis_with_tag}"
                    else:
                        final_result_df.loc[idx, 'analysis'] = analysis_with_tag
            
            # 添加到Result
            key = f"{label}_{analysis_type}"
            results[key] = {
                "results_df": results_df,
                "output_file": output_file
            }
            
            # 添加到摘要
            summary_data.append({
                "Label": label,
                "Analysis type": analysis_type_name,
                "Number of analysis records": len(results_df[results_df['analysis'] != ""]),
                "Output file": output_file
            })
    
    # ProcessConfusionLabel analysis
    if ANALYSIS_TYPE_CONFUSED in analysis_types and confused_pairs:
        for true_label, pred_label in confused_pairs:
            print(f"\nStart processingConfused labels: True={true_label}, Prediction={pred_label}")
            
            # Analyze confused labels
            results_df, output_file = analyze_label(
                csv_file_path, 
                true_label,
                ANALYSIS_TYPE_CONFUSED,
                max_samples=max_samples,
                confused_label=pred_label
            )
            
            # Update最终ResultDataframe
            for idx, row in results_df.iterrows():
                if row['analysis']:  # 只UpdatehaveAnalysisResultofrows
                    # in现haveResult上添加LabelandAnalysis typeinformation
                    analysis_with_tag = f"[{true_label}Confusion为{pred_label}] {row['analysis']}"
                    
                    # 如果该rows已have其他Analysis，则追加；否则直接赋值
                    current_analysis = final_result_df.loc[idx, 'analysis']
                    if current_analysis:
                        final_result_df.loc[idx, 'analysis'] = f"{current_analysis} | {analysis_with_tag}"
                    else:
                        final_result_df.loc[idx, 'analysis'] = analysis_with_tag
            
            # 添加到Result
            key = f"{true_label}_confused_as_{pred_label}"
            results[key] = {
                "results_df": results_df,
                "output_file": output_file
            }
            
            # 添加到摘要
            summary_data.append({
                "Label": true_label,
                "Analysis type": f"Confusion为{pred_label}",
                "Number of analysis records": len(results_df[results_df['analysis'] != ""]),
                "Output file": output_file
            })
    
    # Save final merged results
    final_output_file = "all_labels_analysis_results.tsv"
    # Fixed code
    save_as_tsv(final_result_df, final_output_file)
    
    # 显示摘要
    summary_df = pd.DataFrame(summary_data)
    print("\n所haveLabelProcessing completed！")
    display(HTML("<h3>Process摘要</h3>"))
    display(summary_df)
    
    print(f"所haveAnalysisResult已合并保存到: {final_output_file}")
    
    return results, summary_df, final_output_file, final_result_df


# Exampleuse方法
def usage_example():
    print("=== Usage example ===")
    print("\n1. LoadCSV并Analyze label distribution:")
    print("df = read_csv_file('true_in_Wha.csv')")
    print("label_distribution = display_csv_info(df)")
    
    print("\n2. Analyze missed labels:")
    print("# True labels中havebutPrediction中No")
    print("results_df, output_file = analyze_missed_label('true_in_Wha.csv', 'Whataboutism', max_samples=3)")
    
    print("\n3. Analysis误报Label:")
    print("# Prediction labels中havebutTrue labels中No")
    print("results_df, output_file = analyze_false_positive_label('true_in_Wha.csv', 'Whataboutism', max_samples=3)")
    
    print("\n4. Analyze multiple labels:")
    print("labels = ['Whataboutism', 'Loaded_Language']")
    print("# 默认同时AnalysisMissedand误报")
    print("results, summary, final_file, final_df = process_multiple_labels('true_in_Wha.csv', labels, max_samples=2)")
    print("# Only analyzeMissed")
    print("results, summary, final_file, final_df = process_multiple_labels('true_in_Wha.csv', labels, analysis_types=['missed'], max_samples=2)")
    
    print("\n5. Analyze correctly identified labels:")
    print("results_df, output_file = analyze_correct_label('true_in_Wha.csv', 'Whataboutism', max_samples=3)")
    
    print("\n6. Analyze confused labels:")
    print("results_df, output_file = analyze_confused_label('true_in_Wha.csv', 'Loaded_Language', 'Appeal_to_Fear-Prejudice', max_samples=3)")
    
    print("\n7. Analyze multiple labels containing confused labels:")
    print("labels = ['Whataboutism', 'Loaded_Language']")
    print("confused_pairs = [('Loaded_Language', 'Appeal_to_Fear-Prejudice'), ('Whataboutism', 'Red_Herring')]")
    print("results, summary, final_file, final_df = process_multiple_labels('true_in_Wha.csv', labels, analysis_types=['missed', 'confused'], max_samples=2, confused_pairs=confused_pairs)")

# 展示use方法
usage_example()

=== Usage example ===

1. LoadCSV并Analyze label distribution:
df = read_csv_file('true_in_Wha.csv')
label_distribution = display_csv_info(df)

2. Analyze missed labels:
# True labels中havebutPrediction中No
results_df, output_file = analyze_missed_label('true_in_Wha.csv', 'Whataboutism', max_samples=3)

3. Analysis误报Label:
# Prediction labels中havebutTrue labels中No
results_df, output_file = analyze_false_positive_label('true_in_Wha.csv', 'Whataboutism', max_samples=3)

4. Analyze multiple labels:
labels = ['Whataboutism', 'Loaded_Language']
# 默认同时AnalysisMissedand误报
results, summary, final_file, final_df = process_multiple_labels('true_in_Wha.csv', labels, max_samples=2)
# Only analyzeMissed
results, summary, final_file, final_df = process_multiple_labels('true_in_Wha.csv', labels, analysis_types=['missed'], max_samples=2)

5. Analyze correctly identified labels:
results_df, output_file = analyze_correct_label('true_in_Wha.csv', 'Whataboutism', max_samples=3)

6. Analyze confused labels:
r

In [53]:
#results_df, output_file = analyze_missed_label('error/true_in_NA_CA_LA.csv', 'Name_Calling-Labeling') #Missed真have假No

#results_df, output_file = analyze_correct_label('confusion/conf_true_QU_TH_RE.csv', 'Questioning_the_Reputation')

#results_df, output_file = analyze_correct_label('confusion/conf_true_LO_LA.csv', 'Loaded_Language')
#results_df, output_file = analyze_correct_label('confusion/conf_true_EX_MI.csv', 'Exaggeration-Minimisation')
#results_df, output_file = analyze_correct_label('confusion/conf_true_DO.csv', 'Doubt')
#results_df, output_file = analyze_correct_label('confusion/conf_true_CO_KI.csv', 'Conversation_Killer')
#results_df, output_file = analyze_correct_label('confusion/conf_true_AP_TO_FE_PR.csv', 'Appeal_to_Fear-Prejudice')

#results_df, output_file = analyze_false_positive_label('error/pres_in_NA_CA_LA.csv', 'Name_Calling-Labeling') #误报真No就假have

results_df, output_file = analyze_confused_label('confusion/conf_AP_TO_FE_PR_to_LO_LA.csv', 'Appeal_to_Fear-Prejudice', 'Loaded_Language')#Confusion

StartAnalyze labels: Appeal_to_Fear-Prejudice (Confusion为Loaded_Language)(Confused labels)
警告：FilenotCorrect解析，尝试use逗号分隔符...
最终解析Result - 列名: ['id', 'line', 'text', 'true_labels', 'pred_labels', 'match', 'language'], 形状: (21, 7)
ReadofDataFrame列名: ['id', 'line', 'text', 'true_labels', 'pred_labels', 'match', 'language']
DataFrame形状: (21, 7)
前几rowsData:
               id  line                                               text  \
0  po_article2551     9  Nie będę tu rozstrzygał, czy ukraińscy polityc...   
1   po_article224     3  W środę Parlament Europejski przyjął projekty ...   
2  po_article2289    13  Przy przejściu do „kościoła klimatycznego” koś...   
3  cr_article2490    17  Za pravoslavne ljude ovaj Četvrti domovinski r...   
4  ru_article2227     5  A look at recent headlines also grimly reminds...   

                                         true_labels  \
0  Appeal_to_Fear-Prejudice,Appeal_to_Values,Doub...   
1  Appeal_to_Fear-Prejudice,Doubt,Name_Calling-La...   
2  Appea

AnalysisAppeal_to_Fear-Prejudice(Confused labels):   0%|          | 0/21 [00:00<?, ?it/s]

  cursor.execute('''
  cursor.execute('''
  cursor.execute('''
  cursor.execute('''
  cursor.execute('''
  cursor.execute('''
  cursor.execute('''
  cursor.execute('''
  cursor.execute('''
  cursor.execute('''
  cursor.execute('''
  cursor.execute('''
  cursor.execute('''
  cursor.execute('''
  cursor.execute('''
  cursor.execute('''
  cursor.execute('''
  cursor.execute('''
  cursor.execute('''
  cursor.execute('''
  cursor.execute('''


DataSuccessfully saved为TSVFile: appeal_to_fear-prejudice_confused_as_loaded_language_analysis_results.tsv
All processing completed！ResultSaved到appeal_to_fear-prejudice_confused_as_loaded_language_analysis_results.tsvandDatabase propaganda_techniques_analysis.db


Unnamed: 0,id,line,text,true_labels,pred_labels,match,language,analysis
0,po_article2551,9,"Nie będę tu rozstrzygał, czy ukraińscy polityc...","Appeal_to_Fear-Prejudice,Appeal_to_Values,Doub...","Appeal_to_Values,Doubt,Flag_Waving,Loaded_Lang...",False,po,"The text primarily fits ""Appeal_to_Fear-Prejud..."
1,po_article224,3,W środę Parlament Europejski przyjął projekty ...,"Appeal_to_Fear-Prejudice,Doubt,Name_Calling-La...","Loaded_Language,Name_Calling-Labeling",False,po,The text aligns more with Appeal_to_Fear-Preju...
2,po_article2289,13,Przy przejściu do „kościoła klimatycznego” koś...,"Appeal_to_Fear-Prejudice,Doubt,Exaggeration-Mi...","Appeal_to_Hypocrisy,Doubt,Guilt_by_Association...",False,po,The text aligns more with Appeal_to_Fear-Preju...
3,cr_article2490,17,Za pravoslavne ljude ovaj Četvrti domovinski r...,"Appeal_to_Fear-Prejudice,Appeal_to_Values,Flag...","Appeal_to_Values,Flag_Waving,Loaded_Language,N...",False,cr,The text aligns more with Appeal_to_Fear-Preju...
4,ru_article2227,5,A look at recent headlines also grimly reminds...,"Appeal_to_Fear-Prejudice,Appeal_to_Hypocrisy,N...","Appeal_to_Hypocrisy,Appeal_to_Popularity,Doubt...",False,ru,The text aligns more with Appeal_to_Fear-Preju...
5,en_article699478811,5,With the lifting of the nuclear-related sancti...,Appeal_to_Fear-Prejudice,Loaded_Language,False,en,The text aligns more with Appeal_to_Fear-Preju...
6,po_article22181,5,"„Co jeszcze musi się wydarzyć, aby czarno-ziel...","Appeal_to_Fear-Prejudice,Appeal_to_Values,Doub...","Doubt,Loaded_Language,Name_Calling-Labeling,Qu...",False,po,The text aligns more with Appeal_to_Fear-Preju...
7,po_article2317,9,Obok tego i tych wszystkich pozornych dobrych ...,"Appeal_to_Fear-Prejudice,Doubt","Doubt,Loaded_Language",False,po,The text aligns more with Appeal_to_Fear-Preju...
8,bg_article2525,5,Французите вече имат проблеми дори с виното. П...,Appeal_to_Fear-Prejudice,Loaded_Language,False,bg,The text aligns more with Appeal_to_Fear-Preju...
9,bg_article2551,9,Няма да решавам тук дали украинските политици ...,"Appeal_to_Fear-Prejudice,Appeal_to_Values,Doub...","Doubt,Flag_Waving,Loaded_Language,Name_Calling...",False,bg,The text aligns more with Appeal_to_Fear-Preju...


In [54]:
def get_database_stats():
    """
    Get data库中recordsStatistical information
    """
    import sqlite3
    import pandas as pd
    
    # Connection到Database
    DB_NAME = "propaganda_techniques_analysis.db"
    conn = sqlite3.connect(DB_NAME)
    cursor = conn.cursor()
    
    # QueryTable中Total number of records
    cursor.execute("SELECT COUNT(*) FROM label_analysis")
    total_records = cursor.fetchone()[0]
    print(f"Database中Total共have {total_records} records")
    
    # 按Analysis typeStatistics
    cursor.execute("""
    SELECT analysis_type, COUNT(*) as count 
    FROM label_analysis 
    GROUP BY analysis_type
    ORDER BY count DESC
    """)
    analysis_type_stats = cursor.fetchall()
    print("\n按Analysis typeStatistics:")
    for analysis_type, count in analysis_type_stats:
        analysis_type_name = ""
        if analysis_type == "missed":
            analysis_type_name = "Missed"
        elif analysis_type == "false_positive":
            analysis_type_name = "误报"
        elif analysis_type == "correct":
            analysis_type_name = "Correctly identified"
        elif analysis_type == "confused":
            analysis_type_name = "Confused labels"
            
        print(f"  {analysis_type}({analysis_type_name}): {count} records")
    
    # 按TargetLabel statistics
    cursor.execute("""
    SELECT target_label, COUNT(*) as count 
    FROM label_analysis 
    GROUP BY target_label
    ORDER BY count DESC
    """)
    target_label_stats = cursor.fetchall()
    print("\n按TargetLabel statistics:")
    for target_label, count in target_label_stats:
        print(f"  {target_label}: {count} records")
    
    # ConfusionLabel statistics
    cursor.execute("""
    SELECT target_label, confused_label, COUNT(*) as count 
    FROM label_analysis 
    WHERE analysis_type = 'confused' AND confused_label IS NOT NULL
    GROUP BY target_label, confused_label
    ORDER BY count DESC
    """)
    confused_stats = cursor.fetchall()
    if confused_stats:
        print("\nConfusionLabel statistics:")
        for target_label, confused_label, count in confused_stats:
            print(f"  {target_label} byConfusion为 {confused_label}: {count} records")
    
    # Get最近ofAnalysisRecord
    cursor.execute("""
    SELECT id, target_label, analysis_type, confused_label, created_at 
    FROM label_analysis 
    ORDER BY created_at DESC
    LIMIT 5
    """)
    recent_records = cursor.fetchall()
    print("\n最近of5recordsAnalysisRecord:")
    for id, target_label, analysis_type, confused_label, created_at in recent_records:
        confused_info = f"(Confusion为{confused_label})" if confused_label else ""
        print(f"  ID: {id}, Label: {target_label} {confused_info}, Type: {analysis_type}, Time: {created_at}")
    
    # Close connection
    conn.close()
    
    return {
        "total_records": total_records,
        "analysis_type_stats": analysis_type_stats,
        "target_label_stats": target_label_stats,
        "confused_stats": confused_stats,
        "recent_records": recent_records
    }

# Call函数Get data库Statistical information
stats = get_database_stats()

Database中Total共have 1531 records

按Analysis typeStatistics:
  correct(Correctly identified): 467 records
  false_positive(误报): 464 records
  missed(Missed): 445 records
  confused(Confused labels): 155 records

按TargetLabel statistics:
  Loaded_Language: 179 records
  Causal_Oversimplification: 166 records
  Doubt: 134 records
  Red_Herring: 124 records
  Obfuscation-Vagueness-Confusion: 107 records
  Name_Calling-Labeling: 96 records
  Exaggeration-Minimisation: 92 records
  Whataboutism: 89 records
  Straw_Man: 83 records
  Appeal_to_Fear-Prejudice: 80 records
  Conversation_Killer: 78 records
  Questioning_the_Reputation: 72 records
  False_Dilemma-No_Choice: 69 records
  Appeal_to_Authority: 60 records
  Flag_Waving: 59 records
  Appeal_to_Pity: 43 records

ConfusionLabel statistics:
  Exaggeration-Minimisation byConfusion为 Loaded_Language: 33 records
  Questioning_the_Reputation byConfusion为 Loaded_Language: 28 records
  Doubt byConfusion为 Loaded_Language: 24 records
  Appeal_to_F

In [55]:
def export_database_to_tsv():
    """
    convertDatabase中of所haveRecordExport asTSVFile
    """
    import sqlite3
    import pandas as pd
    import csv
    from datetime import datetime
    
    # Connection到Database
    DB_NAME = "propaganda_techniques_analysis.db"
    conn = sqlite3.connect(DB_NAME)
    
    # Query所haveData
    query = "SELECT * FROM label_analysis"
    df = pd.read_sql_query(query, conn)
    
    # GenerateOutput file名（containsTimestamp）
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = f"propaganda_analysis_export_{timestamp}.tsv"
    
    # usecsv模块Save asTSVFile，确保CorrectProcess特殊字符and分隔符
    with open(output_file, 'w', newline='', encoding='utf-8') as tsv_file:
        # CreateTSVwriter，明确指定分隔符为制Table符
        writer = csv.writer(tsv_file, delimiter='\t')
        
        # Write column names
        writer.writerow(df.columns)
        
        # Write each row of data
        for _, row in df.iterrows():
            writer.writerow(row.values)
    
    # Close database connection
    conn.close()
    
    print(f"Database中of所haveRecordSuccessfully exported到: {output_file}")
    print(f"Total exported {len(df)} records，contains {len(df.columns)} fields")
    
    # 显示Data分布information
    print("\nData分布概况:")
    print(f"Analysis type distribution:\n{df['analysis_type'].value_counts()}")
    print(f"\nTargetLabel distribution:\n{df['target_label'].value_counts().head(10)} ...")
    
    return output_file, df

# Call函数Export data
output_file, exported_df = export_database_to_tsv()

Database中of所haveRecordSuccessfully exported到: propaganda_analysis_export_20250427_154826.tsv
Total exported 1531 records，contains 10 fields

Data分布概况:
Analysis type distribution:
analysis_type
correct           467
false_positive    464
missed            445
confused          155
Name: count, dtype: int64

TargetLabel distribution:
target_label
Loaded_Language                    179
Causal_Oversimplification          166
Doubt                              134
Red_Herring                        124
Obfuscation-Vagueness-Confusion    107
Name_Calling-Labeling               96
Exaggeration-Minimisation           92
Whataboutism                        89
Straw_Man                           83
Appeal_to_Fear-Prejudice            80
Name: count, dtype: int64 ...
