In [None]:
# Mount Google Drive (for Colab)
# `google.colab` is imported unconditionally, so this cell needs that package.
# Later cells read and write under /content/drive/MyDrive/HIN_SIN.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install -q pandas numpy scipy scikit-learn openpyxl

In [None]:
import random
from itertools import product
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from sklearn.metrics import cohen_kappa_score

# Set random seed for reproducibility
# Seeds NumPy's global generator and Python's `random` module. The global
# NumPy state is what the np.random.random / np.random.choice calls further
# down draw from; the DataFrame.sample calls pass random_state=42 themselves.
np.random.seed(42)
random.seed(42)

In [None]:
# Load quality filtered dataset
# Default location is inside the mounted Google Drive.
DATASET_PATH = '/content/drive/MyDrive/HIN_SIN/dataset/quality_filtered.csv'
# Or for local: DATASET_PATH = '../dataset/quality_filtered.csv'

# The frame must contain a 'Label' column (used right below and throughout);
# 'quality_flag' is optional and is checked for before sampling.
df = pd.read_csv(DATASET_PATH, encoding='utf-8')
# Sanity check: overall size and class balance before any sampling happens.
print(f"Dataset shape: {df.shape}")
print(f"\nLabel distribution:\n{df['Label'].value_counts()}")

In [None]:
# Sampling strategy for human validation
# Target: 1,000-1,500 samples with stratified sampling

# Requested size of the validation sample; the sampling code in this cell
# takes it as its target.
VALIDATION_SAMPLE_SIZE = 1000  # Adjust as needed (1000-1500)

# Stratified sampling by label and quality flag
def stratified_sample(df, n_samples, stratify_cols=['Label', 'quality_flag']):
    """
    Draw a proportionally stratified sample from ``df``.

    Every non-empty combination of values across ``stratify_cols`` is one
    stratum. A stratum contributes ``int(n_samples * share)`` rows, where
    ``share`` is its fraction of ``len(df)``; the count is raised to at least
    one and capped at the stratum's size. Because of the flooring and the
    one-row minimum, the result has roughly, not exactly, ``n_samples`` rows.

    Rows holding a missing value in any stratification column match no
    stratum and are never drawn, but they still count towards ``len(df)``.

    Args:
        df: DataFrame containing every column named in ``stratify_cols``.
        n_samples: Approximate total number of rows to draw.
        stratify_cols: Columns whose value combinations define the strata.

    Returns:
        A shuffled DataFrame with a fresh 0..n-1 index. If no stratum has any
        rows (e.g. ``df`` is empty), an empty frame with ``df``'s columns.

    Raises:
        KeyError: if a name in ``stratify_cols`` is not a column of ``df``.
    """
    cols = list(stratify_cols)  # work on a copy; the default list is shared
    total = len(df)
    # Distinct values per column, in order of first appearance, so that the
    # default two-column case visits strata in label-major order.
    levels = [df[col].unique() for col in cols]
    sampled_dfs = []

    for combo in product(*levels):
        mask = np.ones(total, dtype=bool)
        for col, value in zip(cols, combo):
            # Comparisons against missing values count as "no match".
            mask &= (df[col] == value).fillna(False).to_numpy(dtype=bool)
        subset = df[mask]
        if subset.empty:
            continue

        proportion = len(subset) / total
        n_subset = max(1, int(n_samples * proportion))
        sampled_dfs.append(
            subset.sample(n=min(n_subset, len(subset)), random_state=42)
        )

    if not sampled_dfs:
        # pd.concat([]) raises; an empty frame is the natural answer here.
        return df.iloc[0:0].copy()

    result = pd.concat(sampled_dfs, ignore_index=True)

    # Shuffle so strata are interleaved rather than stacked block by block.
    return result.sample(frac=1, random_state=42).reset_index(drop=True)

# Check if quality_flag exists
if 'quality_flag' not in df.columns:
    # No quality flag: split the target evenly across the labels present.
    # Groups are sampled one by one instead of via groupby.apply, which is
    # deprecated for functions that touch the grouping column and would leave
    # 'Label' out of the result once grouping columns are excluded.
    n_labels = max(1, df['Label'].nunique())
    per_label = VALIDATION_SAMPLE_SIZE // n_labels
    label_samples = [
        group.sample(n=min(per_label, len(group)), random_state=42)
        for _, group in df.groupby('Label')
    ]
    # Fall back to an empty slice so concat never receives an empty list.
    validation_sample = pd.concat(label_samples or [df.iloc[0:0]], ignore_index=True)
else:
    validation_sample = stratified_sample(df, VALIDATION_SAMPLE_SIZE)

print(f"Validation sample size: {len(validation_sample)}")
print(f"\nLabel distribution in sample:\n{validation_sample['Label'].value_counts()}")

In [None]:
# Build the sheet the annotators will fill in.
# Blank fields and their allowed values:
#   Label_Annotator1 / Label_Annotator2 : 0/1
#   Intent_Preserved                    : Yes/No/Unclear
#   CodeMix_Natural                     : Yes/No/Unclear
#   Bullying_Strength                   : Same/Stronger/Weaker
#   Comments                            : free text
blank_fields = [
    'Label_Annotator1',
    'Label_Annotator2',
    'Intent_Preserved',
    'CodeMix_Natural',
    'Bullying_Strength',
    'Comments',
]

# Start from the sampled rows and append every blank field as an empty string.
annotation_template = validation_sample.assign(**dict.fromkeys(blank_fields, ''))

# Preferred left-to-right layout: identifiers and texts first, then the
# ground-truth 'Label' (hidden during annotation), then the blank fields.
column_order = ['ID', 'Text_HindiEnglish', 'Text_SinhalaEnglish', 'Label'] + blank_fields

# Keep only the columns that are actually present, in the preferred order.
available_cols = [name for name in column_order if name in annotation_template.columns]
annotation_template = annotation_template.loc[:, available_cols]

print("Annotation template columns:")
print(annotation_template.columns.tolist())
annotation_template.head()

In [None]:
# Save annotation template
CSV_PATH = '/content/drive/MyDrive/HIN_SIN/annotations/annotation_template.csv'    # for Google Sheets import
EXCEL_PATH = '/content/drive/MyDrive/HIN_SIN/annotations/annotation_template.xlsx'  # for easier annotation

# Neither to_csv nor to_excel creates missing folders, so make sure the
# target directories exist before writing (a no-op when they already do).
for target in (CSV_PATH, EXCEL_PATH):
    Path(target).parent.mkdir(parents=True, exist_ok=True)

# CSV for Google Sheets import
annotation_template.to_csv(CSV_PATH, index=False, encoding='utf-8')
print(f"Saved CSV template: {CSV_PATH}")

# Excel for easier annotation
annotation_template.to_excel(EXCEL_PATH, index=False, engine='openpyxl')
print(f"Saved Excel template: {EXCEL_PATH}")

In [None]:
# Create annotation guidelines
# Plain-text instructions for the annotators. The text below is written to
# disk verbatim and also echoed into the notebook output.
guidelines = """
================================================================================
ANNOTATION GUIDELINES: Sinhala-English Cyberbullying Dataset Validation
================================================================================

TASK: Validate translated samples from Hindi-English to Sinhala-English

COLUMNS TO FILL:

1. Label_Annotator1 / Label_Annotator2:
   - 0 = Non-bullying / Positive / Neutral
   - 1 = Bullying / Negative / Toxic
   
2. Intent_Preserved:
   - Yes = The translated text conveys the same meaning/intent as original
   - No = The meaning was lost or significantly changed
   - Unclear = Cannot determine
   
3. CodeMix_Natural:
   - Yes = The Sinhala-English mix sounds natural (as Sri Lankans would speak)
   - No = Awkward mixing, over-translated, or unnatural
   - Unclear = Cannot determine
   
4. Bullying_Strength (compared to original):
   - Same = Similar intensity of bullying/positivity
   - Stronger = Translation is more aggressive/offensive
   - Weaker = Translation is milder/less offensive
   
5. Comments:
   - Any observations, issues, or suggestions

EXAMPLES:

Good translation (Label: 1, Intent: Yes, CodeMix: Yes, Strength: Same):
  Original: "Tum jaise logon se baat nahi karte."
  Translated: "ඔයා වගේ මිනිස්සු එක්ක talk කරන්නේ නෑ."
  
Bad translation (Intent: No, CodeMix: No):
  Original: "You're such a loser yaar."
  Translated: "ඔබ පරාජිතයෙක්."  (Lost 'loser' and 'yaar', too formal)

IMPORTANT NOTES:
- Do NOT look at the 'Label' column while annotating (it's the ground truth)
- If you're unsure, mark as 'Unclear' and add a comment
- Pay special attention to slang and swear words preservation
- Natural code-mixing means English words should be kept where Sri Lankans would use them

================================================================================
"""

# Save guidelines
# Written as UTF-8 because the examples contain Sinhala script.
GUIDELINES_PATH = '/content/drive/MyDrive/HIN_SIN/annotations/annotation_guidelines.txt'
with open(GUIDELINES_PATH, 'w', encoding='utf-8') as f:
    f.write(guidelines)
print(f"Saved guidelines: {GUIDELINES_PATH}")
print(guidelines)

---
## After Annotation: Calculate Inter-Annotator Agreement

Run the cells below after both annotators have completed their annotations.

In [None]:
# Load completed annotations
# ANNOTATED_PATH = '/content/drive/MyDrive/HIN_SIN/annotations/annotation_completed.csv'
# annotated_df = pd.read_csv(ANNOTATED_PATH, encoding='utf-8')

# For demonstration, create dummy annotations
# REMOVE THIS BLOCK and use actual data
annotated_df = annotation_template.copy()
# Simulate annotations (REMOVE for real data)
# Each simulated annotator keeps the existing 'Label' when a fresh uniform
# draw exceeds the threshold (0.1 / 0.15) and otherwise records 1 - Label,
# which is a flip assuming labels are 0/1. The draws come from NumPy's global
# generator, so results depend on its state and on the order of these lines.
annotated_df['Label_Annotator1'] = annotated_df['Label'].apply(lambda x: x if np.random.random() > 0.1 else 1-x)
annotated_df['Label_Annotator2'] = annotated_df['Label'].apply(lambda x: x if np.random.random() > 0.15 else 1-x)
# Categorical answers are drawn independently per row with the given weights.
# 'Bullying_Strength' and 'Comments' are not simulated here.
annotated_df['Intent_Preserved'] = np.random.choice(['Yes', 'No', 'Unclear'], size=len(annotated_df), p=[0.7, 0.2, 0.1])
annotated_df['CodeMix_Natural'] = np.random.choice(['Yes', 'No', 'Unclear'], size=len(annotated_df), p=[0.65, 0.25, 0.1])

In [None]:
def calculate_inter_annotator_agreement(df):
    """
    Calculate inter-annotator agreement metrics.

    Rows are used only when both 'Label_Annotator1' and 'Label_Annotator2'
    are present, non-empty and parse as numbers.

    Args:
        df: DataFrame with 'Label_Annotator1' and 'Label_Annotator2' columns
            and, optionally, a 'Label' column used as ground truth.

    Returns:
        A ``(results, valid_df)`` pair. ``results`` is a dict with
        'raw_agreement', 'cohens_kappa' and 'n_samples', plus
        'annotator1_vs_gt' / 'annotator2_vs_gt' when 'Label' exists.
        ``valid_df`` is a copy of the usable rows with both annotator columns
        converted to numeric. When no row is usable a notice is printed and
        ``(None, None)`` is returned, so callers can always unpack two values.
    """
    results = {}

    # Filter out empty/invalid annotations
    valid_df = df[
        (df['Label_Annotator1'].notna()) &
        (df['Label_Annotator2'].notna()) &
        (df['Label_Annotator1'] != '') &
        (df['Label_Annotator2'] != '')
    ].copy()

    # Convert to numeric; anything unparseable becomes NaN and is dropped.
    valid_df['Label_Annotator1'] = pd.to_numeric(valid_df['Label_Annotator1'], errors='coerce')
    valid_df['Label_Annotator2'] = pd.to_numeric(valid_df['Label_Annotator2'], errors='coerce')
    valid_df = valid_df.dropna(subset=['Label_Annotator1', 'Label_Annotator2'])

    if len(valid_df) == 0:
        print("No valid annotations found!")
        # Same two-slot shape as the success path; a bare None cannot be
        # unpacked into (results, frame) by the caller.
        return None, None

    # 1. Raw Agreement (% same labels)
    agreement = (valid_df['Label_Annotator1'] == valid_df['Label_Annotator2']).mean()
    results['raw_agreement'] = agreement

    # 2. Cohen's Kappa
    kappa = cohen_kappa_score(
        valid_df['Label_Annotator1'].astype(int),
        valid_df['Label_Annotator2'].astype(int)
    )
    results['cohens_kappa'] = kappa

    # 3. Agreement with ground truth
    if 'Label' in valid_df.columns:
        # Compare numerically, as for the annotator columns, so that labels
        # stored as text ('0'/'1') are not counted as mismatches. The 'Label'
        # column of valid_df itself is left untouched.
        ground_truth = pd.to_numeric(valid_df['Label'], errors='coerce')
        results['annotator1_vs_gt'] = (valid_df['Label_Annotator1'] == ground_truth).mean()
        results['annotator2_vs_gt'] = (valid_df['Label_Annotator2'] == ground_truth).mean()

    # 4. Sample size
    results['n_samples'] = len(valid_df)

    return results, valid_df

# Calculate agreement
# A run with nothing usable may come back as a bare None rather than a
# (results, frame) pair; normalise so both names always exist for later cells.
outcome = calculate_inter_annotator_agreement(annotated_df)
if outcome is None:
    agreement_results, valid_annotations = None, None
else:
    agreement_results, valid_annotations = outcome

if agreement_results:
    print("=" * 50)
    print("INTER-ANNOTATOR AGREEMENT RESULTS")
    print("=" * 50)
    print(f"\nSamples analyzed: {agreement_results['n_samples']}")
    print(f"\nRaw Agreement: {agreement_results['raw_agreement']:.2%}")
    print(f"Cohen's Kappa: {agreement_results['cohens_kappa']:.3f}")

    # Ground-truth scores are optional. Only apply the percent format to real
    # numbers; formatting the 'N/A' placeholder with '.2%' raises ValueError.
    for prefix, key in (
        ("\nAnnotator 1", 'annotator1_vs_gt'),
        ("Annotator 2", 'annotator2_vs_gt'),
    ):
        score = agreement_results.get(key)
        shown = "N/A" if score is None else f"{score:.2%}"
        print(f"{prefix} vs Ground Truth: {shown}")

    # Interpret Kappa
    kappa = agreement_results['cohens_kappa']
    if np.isnan(kappa):
        # NaN fails every '<' comparison below and would otherwise be
        # reported as the best band.
        interpretation = "Undefined (kappa is NaN)"
    elif kappa < 0:
        interpretation = "Poor (worse than chance)"
    elif kappa < 0.2:
        interpretation = "Slight agreement"
    elif kappa < 0.4:
        interpretation = "Fair agreement"
    elif kappa < 0.6:
        interpretation = "Moderate agreement"
    elif kappa < 0.8:
        interpretation = "Substantial agreement"
    else:
        interpretation = "Almost perfect agreement"

    print(f"\nKappa Interpretation: {interpretation}")

In [None]:
# Summarise the categorical judgements: for each qualitative column that is
# present, print a heading followed by the share of every answer (3 decimals).
for column, heading in (
    ('Intent_Preserved', "\n=== Intent Preservation ==="),
    ('CodeMix_Natural', "\n=== Code-Mixing Naturalness ==="),
    ('Bullying_Strength', "\n=== Bullying Strength Change ==="),
):
    if column not in annotated_df.columns:
        continue
    print(heading)
    print(annotated_df[column].value_counts(normalize=True).round(3))

In [None]:
# Identify disagreements for resolution
if valid_annotations is not None:
    # Rows where the two annotators assigned different labels.
    disagreements = valid_annotations[
        valid_annotations['Label_Annotator1'] != valid_annotations['Label_Annotator2']
    ]

    print(f"\n=== Disagreements: {len(disagreements)} samples ===")

    if len(disagreements) > 0:
        print("\nSample disagreements:")
        for idx, row in disagreements.head(5).iterrows():
            # 'ID' and 'Label' are optional in the template, so fall back to
            # the row index / a placeholder instead of raising KeyError.
            print(f"\nID: {row.get('ID', idx)}")
            if 'Text_SinhalaEnglish' in row:
                print(f"Text: {row['Text_SinhalaEnglish']}")
            elif 'Translated_Text' in row:
                print(f"Text: {row['Translated_Text']}")
            print(f"A1: {int(row['Label_Annotator1'])}, A2: {int(row['Label_Annotator2'])}, GT: {row.get('Label', 'N/A')}")

        # Save disagreements for resolution
        disagreements.to_csv(
            '/content/drive/MyDrive/HIN_SIN/annotations/disagreements_for_resolution.csv',
            index=False, encoding='utf-8'
        )
        print(f"\nSaved disagreements for resolution")

In [None]:
# Create final validated sample
# Resolve disagreements by majority vote or discussion

def resolve_labels(row):
    """
    Pick the final label for one annotated row.

    If both annotators recorded the same value, that value wins. If they
    differ, the row's existing 'Label' (the ground truth) decides.

    Args:
        row: Mapping-like record with 'Label_Annotator1', 'Label_Annotator2'
            and 'Label' entries.

    Returns:
        The chosen label converted with ``int``.
    """
    first_vote, second_vote, reference = (
        row['Label_Annotator1'],
        row['Label_Annotator2'],
        row['Label'],
    )
    # Agreement keeps the shared vote; otherwise the reference label breaks the tie.
    winner = first_vote if first_vote == second_vote else reference
    return int(winner)

# Skipped entirely when there were no usable annotations.
if valid_annotations is not None:
    # Adds a 'Final_Label' column to valid_annotations in place, computed by
    # calling resolve_labels once per row.
    valid_annotations['Final_Label'] = valid_annotations.apply(resolve_labels, axis=1)
    
    # Save validated sample
    # All columns, including the new 'Final_Label', are written.
    valid_annotations.to_csv(
        '/content/drive/MyDrive/HIN_SIN/annotations/human_validated_sample.csv',
        index=False, encoding='utf-8'
    )
    print(f"Saved validated sample: {len(valid_annotations)} samples")
    print(f"\nNext step: Run 05_finalization.ipynb")