In [6]:
import pandas as pd
import os

"""
Clean Submission File Generators for All Subtasks
This script takes the prediction files with extra columns and creates clean submission files
"""

# ==================== CONFIGURATION ====================
# Language code that appears in the prediction/submission file names.
# Intended as the first argument of create_all_submissions(LANGUAGE, MODEL_NAME).
LANGUAGE = 'eng'  # Change to 'swa' or 'amh' as needed

# Short model name (as used during training) that appears in the file names.
# Intended as the second argument of create_all_submissions(LANGUAGE, MODEL_NAME).
# Examples: 'twitter-roberta-base-hate-latest', 'deberta-v3-base', 'xlm-roberta-base', 'afro-xlmr-base'
MODEL_NAME = 'deberta-v3-base'

# ==================== SUBTASK 1: BINARY CLASSIFICATION ====================
def create_subtask1_submission(language, model_name):
    """
    Create clean submission for Subtask 1 (Binary: polarization)
    Expected format: id, polarization

    Reads ./subtask1/predictions_{language}_{model_name}.csv and writes
    ./submission/submission_subtask1_{language}_{model_name}.csv, creating
    the output directory when it does not exist yet.

    Args:
        language: Language code used in the file names (e.g. 'eng').
        model_name: Short model name used in the file names.

    Returns:
        True if the submission file was written; False if the input file or
        a required column was missing (a message is printed in that case).
    """
    input_file = f'./subtask1/predictions_{language}_{model_name}.csv'
    output_file = f'./submission/submission_subtask1_{language}_{model_name}.csv'

    if not os.path.exists(input_file):
        print(f"Error: {input_file} not found!")
        return False

    # Read predictions
    df = pd.read_csv(input_file)

    if 'id' not in df.columns:
        print("Warning: No 'id' column found!")
        return False

    # Prefer 'polarization_tuned' (presumably the tuned-threshold predictions)
    # and fall back to the plain 'polarization' column.
    if 'polarization_tuned' in df.columns:
        source_column = 'polarization_tuned'
    elif 'polarization' in df.columns:
        source_column = 'polarization'
    else:
        print("Error: No polarization column found!")
        return False

    # Keep only the required columns; cast to int so float/bool predictions
    # are written as 0/1, matching the other subtasks.
    submission = pd.DataFrame({
        'id': df['id'],
        'polarization': df[source_column].astype(int),
    })

    # Save (the output directory may not exist on a fresh checkout)
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    submission.to_csv(output_file, index=False)
    print(f"Subtask 1: Created {output_file}")
    print(f"  Columns: {submission.columns.tolist()}")
    print(f"  Rows: {len(submission)}")
    print(f"  Sample:\n{submission.head()}\n")
    return True

# ==================== SUBTASK 2: MULTI-LABEL POLARIZATION ====================
def create_subtask2_submission(language, model_name):
    """
    Create clean submission for Subtask 2 (Multi-label: 5 polarization types)
    Expected format: id, gender/sexual, political, religious, racial/ethnic, other

    Reads ./subtask2/predictions_{language}_{model_name}.csv and writes
    ./submission/submission_subtask2_{language}_{model_name}.csv, creating
    the output directory when it does not exist yet.

    Args:
        language: Language code used in the file names (e.g. 'eng').
        model_name: Short model name used in the file names.

    Returns:
        True if the submission file was written; False if the input file or
        a required column was missing (a message is printed in that case).
    """
    input_file = f'./subtask2/predictions_{language}_{model_name}.csv'
    output_file = f'./submission/submission_subtask2_{language}_{model_name}.csv'

    if not os.path.exists(input_file):
        print(f"Error: {input_file} not found!")
        return False

    # Read predictions
    df = pd.read_csv(input_file)

    # Define label columns
    label_columns = ['gender/sexual', 'political', 'religious', 'racial/ethnic', 'other']

    if 'id' not in df.columns:
        print("Warning: No 'id' column found!")
        return False

    # Collect the output columns in submission order, starting with the id.
    columns = {'id': df['id']}
    for label in label_columns:
        if label in df.columns:
            # Column named exactly after the label (the recommended
            # per-label threshold prediction)
            source_column = label
        elif f'{label}_default' in df.columns:
            # Fallback to default if per-label not available
            source_column = f'{label}_default'
        else:
            print(f"Error: Column {label} not found!")
            return False
        columns[label] = df[source_column].astype(int)

    submission = pd.DataFrame(columns)

    # Save (the output directory may not exist on a fresh checkout)
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    submission.to_csv(output_file, index=False)
    print(f"Subtask 2: Created {output_file}")
    print(f"  Columns: {submission.columns.tolist()}")
    print(f"  Rows: {len(submission)}")
    print(f"  Sample:\n{submission.head()}\n")
    return True

# ==================== SUBTASK 3: MULTI-LABEL MANIFESTATION ====================
def create_subtask3_submission(language, model_name):
    """
    Create clean submission for Subtask 3 (Multi-label: 6 manifestation types)
    Expected format: id, stereotype, vilification, dehumanization, extreme_language, lack_of_empathy, invalidation

    Reads ./subtask3/predictions_{language}_{model_name}.csv and writes
    ./submission/submission_subtask3_{language}_{model_name}.csv, creating
    the output directory when it does not exist yet.

    Args:
        language: Language code used in the file names (e.g. 'eng').
        model_name: Short model name used in the file names.

    Returns:
        True if the submission file was written; False if the input file or
        a required column was missing (a message is printed in that case).
    """
    input_file = f'./subtask3/predictions_{language}_{model_name}.csv'
    output_file = f'./submission/submission_subtask3_{language}_{model_name}.csv'

    if not os.path.exists(input_file):
        print(f"Error: {input_file} not found!")
        return False

    # Read predictions
    df = pd.read_csv(input_file)

    # Define label columns
    label_columns = ['stereotype', 'vilification', 'dehumanization',
                     'extreme_language', 'lack_of_empathy', 'invalidation']

    if 'id' not in df.columns:
        print("Warning: No 'id' column found!")
        return False

    # Collect the output columns in submission order, starting with the id.
    columns = {'id': df['id']}
    for label in label_columns:
        if label in df.columns:
            # Column named exactly after the label (the recommended
            # per-label threshold prediction)
            source_column = label
        elif f'{label}_default' in df.columns:
            # Fallback to default if per-label not available
            source_column = f'{label}_default'
        else:
            print(f"Error: Column {label} not found!")
            return False
        columns[label] = df[source_column].astype(int)

    submission = pd.DataFrame(columns)

    # Save (the output directory may not exist on a fresh checkout)
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    submission.to_csv(output_file, index=False)
    print(f"Subtask 3: Created {output_file}")
    print(f"  Columns: {submission.columns.tolist()}")
    print(f"  Rows: {len(submission)}")
    print(f"  Sample:\n{submission.head()}\n")
    return True

# ==================== BATCH PROCESSING ====================
def create_all_submissions(language, model_name):
    """Create clean submissions for all three subtasks.

    Runs the subtask 1, 2 and 3 generators in order for the given language
    and model, then prints a summary. A generator that returns False is
    reported as failed; any other return value (including None) is treated
    as success, so the success message is only as reliable as the status
    the generators report.

    Args:
        language: Language code used in the file names (e.g. 'eng').
        model_name: Short model name used in the file names.
    """
    print(f"Creating clean submission files for {language.upper()} using {model_name}")
    print("="*80)

    generators = (
        ('Subtask 1', create_subtask1_submission),
        ('Subtask 2', create_subtask2_submission),
        ('Subtask 3', create_subtask3_submission),
    )
    # Run every generator even if an earlier one fails, remembering which
    # ones explicitly reported failure.
    failed = [
        name
        for name, generate in generators
        if generate(language, model_name) is False
    ]

    print("="*80)
    if failed:
        # Do not claim success when a generator reported a problem.
        print(f"Finished with errors: no submission file created for {', '.join(failed)}.")
    else:
        print("All submission files created successfully!")
    print("\nIMPORTANT: These files use the TUNED THRESHOLDS which typically perform better.")
    print("If you want to use default threshold (0.5), modify the code to use '_default' columns.")



In [7]:
# ==================== MAIN ====================
if __name__ == "__main__":
    # Batch mode: build submissions for every trained model of every language.
    # For a single configured run, call
    # create_all_submissions(LANGUAGE, MODEL_NAME) instead.

    print("\n\nCreating submissions for ALL models...")
    models_eng = ['twitter-roberta-base-hate-latest', 'deberta-v3-base', 'xlm-roberta-base']
    models_african = ['twitter-roberta-base-hate-latest', 'afro-xlmr-base']

    # (language, model) pairs in processing order: all English models first,
    # then Swahili followed by Amharic for each African-language model.
    jobs = [('eng', name) for name in models_eng]
    jobs += [(lang, name) for name in models_african for lang in ('swa', 'amh')]

    for lang, model in jobs:
        print(f"\n{'='*80}")
        create_all_submissions(lang, model)



Creating submissions for ALL models...

Creating clean submission files for ENG using twitter-roberta-base-hate-latest
Subtask 1: Created ./submission/submission_subtask1_eng_twitter-roberta-base-hate-latest.csv
  Columns: ['id', 'polarization']
  Rows: 160
  Sample:
                                     id  polarization
0  eng_f66ca14d60851371f9720aaf4ccd9b58             0
1  eng_3a489aa7fed9726aa8d3d4fe74c57efb             0
2  eng_95770ff547ea5e48b0be00f385986483             0
3  eng_2048ae6f9aa261c48e6d777bcc5b38bf             1
4  eng_07781aa88e61e7c0a996abd1e5ea3a20             0

Subtask 2: Created ./submission/submission_subtask2_eng_twitter-roberta-base-hate-latest.csv
  Columns: ['id', 'gender/sexual', 'political', 'religious', 'racial/ethnic', 'other']
  Rows: 160
  Sample:
                                     id  gender/sexual  political  religious  \
0  eng_f66ca14d60851371f9720aaf4ccd9b58              0          0          0   
1  eng_3a489aa7fed9726aa8d3d4fe74c57efb    