In [31]:
import pandas as pd

# Path of the predictions CSV to convert (relative to the working directory).
csv_file_path = 'PT_validation_predictions.csv'
# Load the whole file; a later cell reads its 'predicted_labels' and
# 'article_id' columns and adds 'narrative' / 'subnarrative' columns to it.
data = pd.read_csv(csv_file_path)

In [32]:
# Helper function to process labels
def process_labels(label):
    """Turn one raw ``predicted_labels`` cell into two ';'-joined strings.

    The cell is a comma-separated list of entries. Each entry is either
    ``Narrative__Subnarrative`` (underscores standing in for spaces), a bare
    narrative, or the literal ``Other`` / ``Other__Other``.

    Args:
        label: The raw cell value. Anything that is not a string (for example
            the NaN pandas uses for an empty CSV cell) is treated as "no labels".

    Returns:
        A ``(narratives, subnarratives)`` tuple of strings. Both contain one
        item per entry, in input order, joined with ';'. A narrative whose
        subnarrative is missing or is "Other" gets the subnarrative
        ``"<narrative>: Other"``. Both strings are empty when there are no
        entries.
    """
    # Nothing to split for missing / non-text cells; mirror the result the
    # function already gives for an empty string instead of raising.
    if not isinstance(label, str):
        return '', ''

    narratives = []
    subnarratives = []

    # Split by commas but ensure commas within labels are preserved
    labels = []
    current_label = ""
    for part in label.split(','):
        if current_label and part.lstrip().startswith('_'):
            # A fragment starting with '_' continues the previous label: the
            # comma belonged to the label text, it was not a separator.
            current_label += ',' + part
        else:
            if current_label:
                labels.append(current_label)
            current_label = part
    if current_label:
        labels.append(current_label)

    # Process each label
    for raw_entry in labels:
        # Separators may be written as ", "; without stripping, " Other" would
        # miss every special case below and leak a leading space into the output.
        entry = raw_entry.strip()
        if not entry:
            # Whitespace-only fragment (e.g. "a, ,b") carries no label.
            continue

        if entry == "Other__Other":
            # Special case: "Other__Other"
            narratives.append('Other')
            subnarratives.append('Other')
        elif '__' in entry:
            # Split into narrative and subnarrative at the first '__' only
            narrative, subnarrative = entry.split('__', 1)
            narrative = narrative.replace('_', ' ')
            subnarrative = subnarrative.replace('_', ' ')
            narratives.append(narrative)
            if subnarrative.lower() == 'other':
                # If subnarrative is "Other", format it as [Narrative]: Other
                subnarratives.append(f"{narrative}: Other")
            else:
                subnarratives.append(subnarrative)
        elif entry.lower() == 'other':
            # Special case: 'Other'
            narratives.append('Other')
            subnarratives.append('Other')
        else:
            # Narrative exists but no specific subnarrative
            narrative = entry.replace('_', ' ')
            narratives.append(narrative)
            subnarratives.append(f"{narrative}: Other")

    return ';'.join(narratives), ';'.join(subnarratives)

# Apply the processing function to the 'predicted_labels' column
# Build both columns from a single pass over the predictions. Unlike unpacking
# zip(*...), this also works when the frame has no rows.
label_pairs = [process_labels(label) for label in data['predicted_labels']]
data['narrative'] = [narrative for narrative, _ in label_pairs]
data['subnarrative'] = [subnarrative for _, subnarrative in label_pairs]

# Create the final output DataFrame
# rename() returns a new frame, so the output headers are applied without
# assigning to the columns of a selection taken from `data`.
output_data = data[['article_id', 'narrative', 'subnarrative']].rename(
    columns={
        'narrative': 'narrative_1;...;narrative_N',
        'subnarrative': 'subnarrative_1;...;subnarrative_N',
    }
)

# Save as TSV
output_tsv_path = 'PT_validation_predictions_transformed.tsv'
output_data.to_csv(output_tsv_path, sep='\t', index=False)

print(f"TSV file saved to {output_tsv_path}")



TSV file saved to PT_validation_predictions_transformed.tsv


In [5]:
import pandas as pd
import ast


# Path of the model-output CSV to convert (relative to the working directory).
csv_file_path = 'gpt4o-mini_preds_dev.csv'
# Rebinds `data`, replacing any frame loaded by an earlier cell; a later cell
# reads its 'p_system', 'language' and 'article_id' columns.
data = pd.read_csv(csv_file_path)

In [6]:

# Helper function to process narratives and subnarratives
def _as_label_list(value):
    """Normalise one parsed field to a list of strings that is safe to join."""
    if value is None:
        return []
    if isinstance(value, str):
        # A lone string is one label; joining it directly would split it
        # into characters ("Other" -> "O;t;h;e;r").
        return [value]
    return [str(item) for item in value]


def process_narratives_and_subnarratives(p_system):
    """Extract the ';'-joined narratives and subnarratives from one ``p_system`` cell.

    Args:
        p_system: Text of a Python dict literal with optional 'narrative' and
            'subnarrative' keys, each holding a list of labels (a single
            string or ``None`` is also accepted).

    Returns:
        A ``(narratives, subnarratives)`` tuple of ';'-joined strings. A field
        that is absent, ``None`` or could not be read contributes an empty
        string. Failures are reported with ``print`` instead of being raised,
        so one malformed row does not abort the whole conversion.
    """
    narratives = []
    subnarratives = []

    # Parse the input as a dictionary
    try:
        # literal_eval only evaluates literals, so it is safe on untrusted text.
        parsed_data = ast.literal_eval(p_system)
        # Normalising inside the try keeps malformed values (None, scalars,
        # non-string items) on the reported path instead of crashing in join().
        narratives = _as_label_list(parsed_data.get('narrative', []))
        subnarratives = _as_label_list(parsed_data.get('subnarrative', []))
    except Exception as e:
        print(f"Error parsing p_system: {p_system}, error: {e}")

    return ';'.join(narratives), ';'.join(subnarratives)


# Process the p_system column
# Build both columns from a single pass over the rows. Unlike unpacking
# zip(*...), this also works when the frame has no rows.
parsed_pairs = [process_narratives_and_subnarratives(cell) for cell in data['p_system']]
data['narrative'] = [narrative for narrative, _ in parsed_pairs]
data['subnarrative'] = [subnarrative for _, subnarrative in parsed_pairs]

# Rows without a language cannot be routed to any file. Report them instead of
# letting NaN show up as a "language" and produce an empty 'nan_...' TSV.
missing_language_count = int(data['language'].isna().sum())
if missing_language_count:
    print(f"Skipping {missing_language_count} row(s) with no language")

# Group by the 'language' column and save separate TSV files
# dropna() removes NaN; unique() keeps the languages in order of first appearance.
languages = data['language'].dropna().unique()
for lang in languages:
    lang_data = data[data['language'] == lang]

    # Create the final output DataFrame
    # rename() returns a new frame, so the output headers are applied without
    # assigning to the columns of a filtered slice of `data`.
    output_data = lang_data[['article_id', 'narrative', 'subnarrative']].rename(
        columns={
            'narrative': 'narrative_1;...;narrative_N',
            'subnarrative': 'subnarrative_1;...;subnarrative_N',
        }
    )

    # Save as TSV
    output_tsv_path = f'{lang}_validation_predictions_transformed_2.tsv'
    output_data.to_csv(output_tsv_path, sep='\t', index=False)
    print(f"TSV file saved to {output_tsv_path}")


TSV file saved to BG_validation_predictions_transformed_2.tsv
TSV file saved to EN_validation_predictions_transformed_2.tsv
TSV file saved to HI_validation_predictions_transformed_2.tsv
TSV file saved to PT_validation_predictions_transformed_2.tsv
