In [None]:
# Mount Google Drive (for Colab)
# Every input and output path used later in this notebook lives under
# /content/drive/MyDrive, so this mount must succeed before any other cell runs.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install -q pandas numpy

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import json

In [None]:
# Load quality filtered dataset (required input: a missing file is a hard error)
FILTERED_PATH = '/content/drive/MyDrive/HIN_SIN/dataset/quality_filtered.csv'
filtered_df = pd.read_csv(FILTERED_PATH, encoding='utf-8')
print(f"Quality filtered samples: {len(filtered_df)}")

# Load human validated sample (optional input)
VALIDATED_PATH = '/content/drive/MyDrive/HIN_SIN/annotations/human_validated_sample.csv'
try:
    validated_df = pd.read_csv(VALIDATED_PATH, encoding='utf-8')
except FileNotFoundError:
    # Only an absent file means "no validation was done". Parse errors, bad
    # encodings or permission problems must surface instead of being silently
    # treated as "no validation data".
    validated_df = None  # avoid reusing a stale frame from an earlier run
    has_validation = False
    print("No human validation data found. Proceeding without it.")
else:
    has_validation = True
    print(f"Human validated samples: {len(validated_df)}")

In [None]:
# If human validation exists, identify samples to remove.
# `samples_to_remove` stays an empty set when there is no validation data, so
# the filtering step below becomes a no-op.
samples_to_remove = set()

if has_validation:
    # A sample is removed when ANY of the following holds:
    # 1. Intent was NOT preserved
    # 2. Code-mixing was NOT natural
    # 3. Both annotators disagreed with ground truth
    flagged = pd.Series(False, index=validated_df.index)

    # Check intent preservation (column is optional)
    if 'Intent_Preserved' in validated_df.columns:
        flagged |= validated_df['Intent_Preserved'] == 'No'

    # Check code-mixing naturalness (column is optional)
    if 'CodeMix_Natural' in validated_df.columns:
        flagged |= validated_df['CodeMix_Natural'] == 'No'

    # Check if both annotators disagreed with ground truth
    if 'Label_Annotator1' in validated_df.columns and 'Label_Annotator2' in validated_df.columns:
        annotator1 = validated_df['Label_Annotator1']
        annotator2 = validated_df['Label_Annotator2']
        ground_truth = validated_df['Label']
        # A missing annotation (NaN) compares unequal to everything, which
        # would wrongly count as a disagreement; require both labels to exist.
        both_annotated = annotator1.notna() & annotator2.notna()
        flagged |= both_annotated & (annotator1 != ground_truth) & (annotator2 != ground_truth)

    samples_to_remove = set(validated_df.loc[flagged, 'ID'].tolist())

    print(f"Samples flagged for removal from validation: {len(samples_to_remove)}")

In [None]:
# Create final dataset: drop every row whose ID was flagged during validation
final_df = filtered_df[~filtered_df['ID'].isin(samples_to_remove)].copy()

# Report the number of rows actually dropped. This can differ from the number
# of flagged IDs when a flagged ID is not present in the filtered dataset, and
# using it keeps the three numbers below consistent with each other.
removed_count = len(filtered_df) - len(final_df)

print("\n=== Dataset Size Progression ===")
print(f"After quality filtering: {len(filtered_df)}")
print(f"Removed by validation: {removed_count}")
print(f"Final dataset size: {len(final_df)}")

In [None]:
# Prepare final dataset columns
# Standard format for the Sinhala-English cyberbullying dataset

# Source column -> published column name (insertion order defines output order)
final_columns = {
    'ID': 'id',
    'Text_SinhalaEnglish': 'text',
    'Label': 'label'
}

# Pick the text column: preferred names first, then any column containing "text"
if 'Text_SinhalaEnglish' in final_df.columns:
    text_col = 'Text_SinhalaEnglish'
elif 'Translated_Text' in final_df.columns:
    text_col = 'Translated_Text'
else:
    text_candidates = final_df.columns[
        final_df.columns.str.contains('Text', case=False, na=False)
    ]
    if len(text_candidates) == 0:
        # Fail with an actionable message instead of an opaque IndexError
        raise KeyError(
            f"No text column found in final_df; available columns: {list(final_df.columns)}"
        )
    text_col = text_candidates[0]

# Rename and select columns
output_df = final_df[['ID', text_col, 'Label']].copy()
output_df.columns = list(final_columns.values())

# Reset index and renumber ids 1..N so they are contiguous after filtering
output_df = output_df.reset_index(drop=True)
output_df['id'] = range(1, len(output_df) + 1)

print(f"Final dataset shape: {output_df.shape}")
output_df.head(10)

In [None]:
# Dataset statistics
print("=" * 60)
print("FINAL DATASET STATISTICS")
print("=" * 60)

total_samples = len(output_df)
print(f"\nTotal samples: {total_samples}")
print("\nLabel distribution:")
# Reindex to both expected classes so that a class with zero samples yields a
# count of 0 instead of a KeyError here and in the later cells that read
# label_counts[0] / label_counts[1].
label_counts = output_df['label'].value_counts().reindex([0, 1], fill_value=0)
print(f"  Non-bullying (0): {label_counts[0]} ({label_counts[0]/total_samples:.1%})")
print(f"  Bullying (1): {label_counts[1]} ({label_counts[1]/total_samples:.1%})")

# Surface rows that belong to neither class (unexpected or missing labels)
unlabelled = total_samples - int(label_counts.sum())
if unlabelled:
    print(f"  WARNING: {unlabelled} samples have a label other than 0 or 1")

# Text length statistics (temporary columns; dropped again before saving)
output_df['text_length'] = output_df['text'].str.len()
output_df['word_count'] = output_df['text'].str.split().str.len()

for heading, column in (("\nText length (characters):", 'text_length'),
                        ("\nWord count:", 'word_count')):
    values = output_df[column]
    print(heading)
    print(f"  Mean: {values.mean():.1f}")
    print(f"  Min: {values.min()}")
    print(f"  Max: {values.max()}")

In [None]:
# Check for Sinhala content
import re

def has_sinhala(text):
    """Return True when the string form of `text` has a character in U+0D80-U+0DFF.

    The argument is converted with ``str()`` first, so non-string values are
    accepted and judged by their textual representation.
    """
    match = re.search(r'[\u0D80-\u0DFF]', str(text))
    return match is not None

def has_english(text):
    """Check if text contains at least one ASCII Latin letter (a-z, A-Z).

    Missing values (None, NaN, pd.NA) return False. Without this guard
    ``str()`` turns them into 'None' / 'nan' / '<NA>', which contain letters
    and would be miscounted as English content.
    Other non-string values are judged by their ``str()`` representation.
    """
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return False
    return bool(re.search(r'[a-zA-Z]', str(text)))

# Per-row script flags; a row is code-mixed when it contains both scripts
sinhala_flags = output_df['text'].apply(has_sinhala)
english_flags = output_df['text'].apply(has_english)
mixed_flags = sinhala_flags & english_flags

# Store the flags as temporary analysis columns on the frame
output_df['has_sinhala'] = sinhala_flags
output_df['has_english'] = english_flags
output_df['is_code_mixed'] = mixed_flags

# Report absolute counts and the share of the dataset for each flag
print("\nCode-mixing analysis:")
print(f"  Has Sinhala: {sinhala_flags.sum()} ({sinhala_flags.mean():.1%})")
print(f"  Has English: {english_flags.sum()} ({english_flags.mean():.1%})")
print(f"  Code-mixed (both): {mixed_flags.sum()} ({mixed_flags.mean():.1%})")

In [None]:
# Keep only the published columns, discarding the temporary analysis columns
output_df = output_df[['id', 'text', 'label']]

# Show the first three entries of each class
print("\n=== Sample Entries ===")
for heading, label_value in (("\n--- Non-bullying samples ---", 0),
                             ("\n--- Bullying samples ---", 1)):
    print(heading)
    preview = output_df[output_df['label'] == label_value].head(3)
    for _, entry in preview.iterrows():
        print(f"  [{entry['id']}] {entry['text']}")

In [None]:
# Save final dataset
# output_df was reduced to the id/text/label columns in the previous cell, so
# the CSV written here contains exactly those three columns.
OUTPUT_PATH = '/content/drive/MyDrive/HIN_SIN/dataset/SinhalaEnglish_final.csv'
# Or for local: OUTPUT_PATH = '../dataset/SinhalaEnglish_final.csv'

# index=False keeps the pandas row index out of the file; `id` is the identifier column
output_df.to_csv(OUTPUT_PATH, index=False, encoding='utf-8')
print(f"Saved final dataset to: {OUTPUT_PATH}")

In [None]:
# Create dataset documentation (datacard)
# Counts are read with .get so a class with no samples is reported as 0 rather
# than raising KeyError; shares are guarded against an empty dataset.
total_samples = len(output_df)
non_bullying = int(label_counts.get(0, 0))
bullying = int(label_counts.get(1, 0))
non_bullying_share = non_bullying / total_samples if total_samples else 0.0
bullying_share = bullying / total_samples if total_samples else 0.0

# The arrow in the pipeline section was stored as mis-decoded bytes; it is
# restored to the intended "→" so README.md renders correctly.
datacard = f"""
# Sinhala-English Code-Mixed Cyberbullying Dataset

## Overview
This dataset contains Sinhala-English code-mixed text samples labeled for cyberbullying detection.
It was created by translating a Hindi-English code-mixed cyberbullying dataset.

## Dataset Statistics
- **Total samples**: {total_samples}
- **Non-bullying (0)**: {non_bullying} ({non_bullying_share:.1%})
- **Bullying (1)**: {bullying} ({bullying_share:.1%})

## Creation Pipeline
1. **Source**: Hindi-English code-mixed cyberbullying dataset
2. **Translation**: Hindi → Sinhala using IndicTrans2
3. **Code-mixing preservation**: English tokens preserved using language detection
4. **Quality filtering**: XLM-RoBERTa zero-shot classification for label consistency
5. **Human validation**: Partial manual validation with inter-annotator agreement

## File Format
- **Format**: CSV (UTF-8 encoded)
- **Columns**:
  - `id`: Unique identifier
  - `text`: Sinhala-English code-mixed text
  - `label`: Binary label (0 = non-bullying, 1 = bullying)

## Labels
- **0 (Non-bullying)**: Positive, neutral, or supportive content
- **1 (Bullying)**: Toxic, offensive, or cyberbullying content

## Usage
```python
import pandas as pd
df = pd.read_csv('SinhalaEnglish_final.csv', encoding='utf-8')
```

## Citation
[Add your citation here]

## License
[Add license information]

## Created
{datetime.now().strftime('%Y-%m-%d')}
"""

# Save datacard
DATACARD_PATH = '/content/drive/MyDrive/HIN_SIN/dataset/README.md'
with open(DATACARD_PATH, 'w', encoding='utf-8') as f:
    f.write(datacard)
print(f"Saved dataset documentation to: {DATACARD_PATH}")
print(datacard)

In [None]:
# Create metadata JSON
# Convert counts to plain Python numbers up front: .get tolerates a class with
# no samples, and built-in int/float values serialise predictably with json.
total_samples = int(len(output_df))
non_bullying_count = int(label_counts.get(0, 0))
bullying_count = int(label_counts.get(1, 0))
# Share of bullying samples; 0.0 for an empty dataset instead of dividing by zero
class_balance = round(bullying_count / total_samples, 3) if total_samples else 0.0

metadata = {
    "dataset_name": "Sinhala-English Code-Mixed Cyberbullying Dataset",
    "version": "1.0",
    "created_date": datetime.now().isoformat(),
    "source_dataset": "Hindi-English Code-Mixed Cyberbullying Dataset",
    "statistics": {
        "total_samples": total_samples,
        "non_bullying_count": non_bullying_count,
        "bullying_count": bullying_count,
        "class_balance": class_balance
    },
    "pipeline": [
        # Arrow restored from mis-decoded bytes to the intended character
        "IndicTrans2 translation (Hindi → Sinhala)",
        "Code-mixing preservation",
        "XLM-RoBERTa quality filtering",
        "Human validation"
    ],
    "languages": ["Sinhala", "English"],
    "task": "Binary classification (cyberbullying detection)"
}

METADATA_PATH = '/content/drive/MyDrive/HIN_SIN/dataset/metadata.json'
with open(METADATA_PATH, 'w', encoding='utf-8') as f:
    # ensure_ascii=False writes the arrow as a literal UTF-8 character
    json.dump(metadata, f, indent=2, ensure_ascii=False)
print(f"Saved metadata to: {METADATA_PATH}")

In [None]:
# Final summary
# The emoji and arrows in these strings were stored as mis-decoded byte
# sequences; they are restored to the intended characters.
print("\n" + "=" * 60)
print("🎉 DATASET CREATION COMPLETE!")
print("=" * 60)

print(f"""
📊 Final Dataset:
   - File: SinhalaEnglish_final.csv
   - Samples: {len(output_df)}
   - Labels: 0 (non-bullying), 1 (bullying)

📁 Output Files:
   - dataset/SinhalaEnglish_final.csv (main dataset)
   - dataset/README.md (documentation)
   - dataset/metadata.json (metadata)

✅ Pipeline completed:
   1. Translation (Hindi → Sinhala)
   2. Code-mixing preservation
   3. Quality filtering
   4. Human validation
   5. Finalization

🔬 Ready for:
   - Model training
   - Research publication
   - Benchmarking
""")