In [1]:
# Mount Google Drive when running on Colab.
# The import is guarded so that Restart & Run All also works outside Colab,
# where the `google.colab` package does not exist; in that case switch
# DATASET_PATH / OUTPUT_PATH to their local alternatives.
try:
    from google.colab import drive
except ImportError:
    print("google.colab not available - skipping Drive mount (local run)")
else:
    drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install required packages
!pip install -q transformers torch pandas numpy tqdm scikit-learn

In [3]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
warnings.filterwarnings('ignore')

# Select the pipeline device index: 0 (first CUDA GPU) when CUDA is available, else -1 (CPU).
if torch.cuda.is_available():
    device = 0
else:
    device = -1
print("Using device: " + ('cuda' if device == 0 else 'cpu'))

Using device: cuda


In [4]:
# Load code-mixed preserved dataset
DATASET_PATH = '/content/drive/Othercomputers/My Laptop/HIN_SIN/dataset/code_mixed_preserved.csv'
# Or for local: DATASET_PATH = '../dataset/code_mixed_preserved.csv'

df = pd.read_csv(DATASET_PATH, encoding='utf-8')

# Fail fast, before the long-running classification cells: later cells read
# these columns and join the predictions back onto `df` by 'ID'.
REQUIRED_COLUMNS = ['ID', 'Original_Text', 'Translated_Text', 'Label']
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
assert not missing_columns, f"Dataset is missing required columns: {missing_columns}"
assert df['ID'].is_unique, "'ID' must be unique: predictions are merged back on it"

print(f"Dataset shape: {df.shape}")
print(f"\nLabel distribution:\n{df['Label'].value_counts()}")
df.head()

Dataset shape: (25000, 7)

Label distribution:
Label
1    15000
0    10000
Name: count, dtype: int64


Unnamed: 0,ID,Original_Text,Translated_Text,Label,quality_score,english_preserved,sinhala_ratio
0,1,You're awesome!,You're awesome!,0,0.2,0.5,0.0
1,2,Tum jaise logon se baat nahi karte.,Tum jaise logon se baat nahi karte.,1,0.4,1.0,0.0
2,3,Thanks for your support yaar.,Thanks for your support yaar.,0,0.4,1.0,0.0
3,4,Teri aukaat kya hai samjhta hai?,Teri aukaat kya hai samjhta hai?,1,0.4,1.0,0.0
4,5,"Nice effort, proud of you.","Nice effort, proud of you.",0,0.4,1.0,0.0


In [5]:
# Initialize the zero-shot classifier.
# NOTE(review): the model loaded here is facebook/bart-large-mnli, not
# XLM-RoBERTa. Per its model card it is BART fine-tuned on the (English) MNLI
# corpus, so treat its predictions on code-mixed Hindi/Sinhala text with
# caution. The model is deliberately left unchanged so results stay comparable
# with earlier runs; only the name reported to the reader is corrected.
ZERO_SHOT_MODEL = "facebook/bart-large-mnli"
# Multilingual alternative (XLM-RoBERTa fine-tuned on XNLI):
# ZERO_SHOT_MODEL = "joeddav/xlm-roberta-large-xnli"

print(f"Loading zero-shot classifier ({ZERO_SHOT_MODEL})...")
classifier = pipeline(
    "zero-shot-classification",
    model=ZERO_SHOT_MODEL,
    device=device
)

print("Classifier loaded!")

Loading XLM-RoBERTa zero-shot classifier...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


Classifier loaded!


In [6]:
# Define classification labels
CANDIDATE_LABELS = ["bullying", "not bullying", "positive", "negative", "toxic", "friendly"]

# Mapping for bullying detection
BULLYING_LABELS = {"bullying", "negative", "toxic"}
NON_BULLYING_LABELS = {"not bullying", "positive", "friendly"}

# Every candidate label must belong to exactly one binary class; otherwise the
# mapping in classify_text would silently treat an unmapped label as non-bullying.
assert BULLYING_LABELS.isdisjoint(NON_BULLYING_LABELS), "label sets overlap"
assert BULLYING_LABELS | NON_BULLYING_LABELS == set(CANDIDATE_LABELS), "unmapped candidate label"

def classify_text(text, classifier):
    """
    Classify one text with a zero-shot classifier and map it to a binary label.

    Parameters:
    - text: the string to classify.
    - classifier: callable invoked as classifier(text, CANDIDATE_LABELS, multi_label=False)
      that returns a mapping with ranked 'labels' and matching 'scores'.

    Returns a 3-tuple (predicted_label, score, info):
    - success: (1 or 0, top score, top label) where 1 means the top label is
      in BULLYING_LABELS and 0 means it is not.
    - failure: (None, 0.0, message). This covers non-string or blank input
      (the classifier is not called) and any exception raised while
      classifying, so a single bad row never aborts a long batch run.
    """
    # Missing CSV cells arrive as float NaN and blank strings carry no signal:
    # report them explicitly instead of relying on the model call to fail.
    if not isinstance(text, str) or not text.strip():
        return None, 0.0, "invalid input: expected non-empty text"

    try:
        result = classifier(text, CANDIDATE_LABELS, multi_label=False)

        # Labels come back ranked, so position 0 is the winning label.
        top_label = result['labels'][0]
        top_score = result['scores'][0]
    except Exception as e:
        # Deliberate best-effort: surface the error text in the third slot.
        return None, 0.0, str(e)

    predicted_label = 1 if top_label in BULLYING_LABELS else 0
    return predicted_label, top_score, top_label

# Quick sanity check on a few hand-written examples before the full run
test_texts = ["You're awesome!", "You're such a loser!", "Thanks for your support bro"]

print("Testing classifier:")
for text in test_texts:
    outcome = classify_text(text, classifier)
    label, score, top = outcome
    print(f"  '{text}' -> {label} ({top}: {score:.3f})")

Testing classifier:
  'You're awesome!' -> 0 (positive: 0.940)
  'You're such a loser!' -> 1 (negative: 0.844)
  'Thanks for your support bro' -> 0 (friendly: 0.543)


In [7]:
# Classify original texts (Hindi-English), one prediction record per dataset row
print("Classifying original texts...")

original_predictions = []
id_text_pairs = zip(df['ID'], df['Original_Text'])

for sample_id, sample_text in tqdm(id_text_pairs, total=len(df), desc="Original texts"):
    pred_label, confidence, top_label = classify_text(sample_text, classifier)
    record = {
        'ID': sample_id,
        'original_pred_label': pred_label,
        'original_confidence': confidence,
        'original_top_label': top_label,
    }
    original_predictions.append(record)

original_pred_df = pd.DataFrame(original_predictions)
print("\nOriginal text predictions completed!")

Classifying original texts...


Original texts:   0%|          | 7/25000 [00:01<1:09:39,  5.98it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Original texts: 100%|██████████| 25000/25000 [49:14<00:00,  8.46it/s]



Original text predictions completed!


In [8]:
# Classify translated texts (Sinhala-English), one prediction record per dataset row
print("Classifying translated texts...")

translated_predictions = []
id_text_pairs = zip(df['ID'], df['Translated_Text'])

for sample_id, sample_text in tqdm(id_text_pairs, total=len(df), desc="Translated texts"):
    pred_label, confidence, top_label = classify_text(sample_text, classifier)
    record = {
        'ID': sample_id,
        'translated_pred_label': pred_label,
        'translated_confidence': confidence,
        'translated_top_label': top_label,
    }
    translated_predictions.append(record)

translated_pred_df = pd.DataFrame(translated_predictions)
print("\nTranslated text predictions completed!")

Classifying translated texts...


Translated texts: 100%|██████████| 25000/25000 [50:05<00:00,  8.32it/s]


Translated text predictions completed!





In [9]:
# Merge predictions with original dataframe.
# Prediction columns left over from a previous execution of this cell are
# dropped first: merging a second time would otherwise produce *_x / *_y
# duplicates and the column lookups below would raise KeyError.
PREDICTION_COLUMNS = [
    'original_pred_label', 'original_confidence', 'original_top_label',
    'translated_pred_label', 'translated_confidence', 'translated_top_label',
]
df = df.drop(columns=PREDICTION_COLUMNS, errors='ignore')

# validate='one_to_one' makes pandas raise instead of silently multiplying
# rows if an ID is duplicated on either side.
df = df.merge(original_pred_df, on='ID', validate='one_to_one')
df = df.merge(translated_pred_df, on='ID', validate='one_to_one')

# Calculate label consistency
df['label_match_original'] = (df['original_pred_label'] == df['Label']).astype(int)
df['label_match_translated'] = (df['translated_pred_label'] == df['Label']).astype(int)
df['label_consistent'] = (df['original_pred_label'] == df['translated_pred_label']).astype(int)

# Failed classifications have no predicted label and are counted as
# mismatches above; surface how many there were.
n_failed_original = int(df['original_pred_label'].isna().sum())
n_failed_translated = int(df['translated_pred_label'].isna().sum())

# If the translation equals its source text, "consistency" is trivially true.
identical_share = (df['Original_Text'] == df['Translated_Text']).mean()

print("=== Label Consistency Analysis ===")
print(f"Original matches ground truth: {df['label_match_original'].mean():.2%}")
print(f"Translated matches ground truth: {df['label_match_translated'].mean():.2%}")
print(f"Original = Translated (consistency): {df['label_consistent'].mean():.2%}")
print(f"Translated text identical to original text: {identical_share:.2%}")
if n_failed_original or n_failed_translated:
    print(f"Failed classifications (counted as mismatches): "
          f"original={n_failed_original}, translated={n_failed_translated}")

=== Label Consistency Analysis ===
Original matches ground truth: 86.38%
Translated matches ground truth: 86.38%
Original = Translated (consistency): 100.00%


In [10]:
# Break the accuracy / consistency figures down by ground-truth class
print("\n=== Analysis by Ground Truth Label ===")

label_names = {0: "Non-bullying", 1: "Bullying"}
for label, label_name in label_names.items():
    subset = df.loc[df['Label'] == label]
    print(f"\n{label_name} samples (n={len(subset)}):")
    print(f"  Original correct: {subset['label_match_original'].mean():.2%}")
    print(f"  Translated correct: {subset['label_match_translated'].mean():.2%}")
    print(f"  Consistent: {subset['label_consistent'].mean():.2%}")


=== Analysis by Ground Truth Label ===

Non-bullying samples (n=10000):
  Original correct: 80.48%
  Translated correct: 80.48%
  Consistent: 100.00%

Bullying samples (n=15000):
  Original correct: 90.31%
  Translated correct: 90.31%
  Consistent: 100.00%


In [11]:
# Define quality filtering criteria
def compute_quality_flag(row):
    """
    Grade one sample as 'high', 'medium' or 'low' from an additive point score.

    Points awarded:
    - 3 when 'label_match_translated' is truthy (weighted highest)
    - 2 when 'label_consistent' is truthy
    - 1 when 'translated_confidence' > 0.7, or 0.5 when it is only > 0.5
    - 1 when a 'quality_score' field exists and is >= 0.6

    Returns:
    - 'high': total >= 5, keep with high confidence
    - 'medium': total >= 3, keep but flag for potential review
    - 'low': anything below 3, remove from dataset
    """
    # Fields are read in a fixed order: label match, consistency, confidence, quality.
    match_points = 3 if row['label_match_translated'] else 0
    consistency_points = 2 if row['label_consistent'] else 0

    translated_conf = row['translated_confidence']
    if translated_conf > 0.7:
        confidence_points = 1
    elif translated_conf > 0.5:
        confidence_points = 0.5
    else:
        confidence_points = 0

    # The code-mixing score is optional; its absence simply earns no point.
    has_good_mix = 'quality_score' in row and row['quality_score'] >= 0.6
    mix_points = 1 if has_good_mix else 0

    score = match_points + consistency_points + confidence_points + mix_points

    if score < 3:
        return 'low'
    return 'high' if score >= 5 else 'medium'

# Grade every row, then show how many samples landed in each quality bucket
df['quality_flag'] = df.apply(compute_quality_flag, axis='columns')
flag_counts = df['quality_flag'].value_counts()

print("=== Quality Distribution ===")
print(flag_counts)

=== Quality Distribution ===
quality_flag
high      21595
low        2442
medium      963
Name: count, dtype: int64


In [12]:
# Preview the first five rows of the best and worst quality buckets
flags = df['quality_flag']

print("\n=== HIGH QUALITY SAMPLES ===")
high_samples = df.loc[flags.eq('high')].head(5)
for _, row in high_samples.iterrows():
    print(f"\n[Label: {row['Label']}]")
    print(f"  Original: {row['Original_Text']}")
    print(f"  Translated: {row['Translated_Text']}")

print("\n=== LOW QUALITY SAMPLES ===")
low_samples = df.loc[flags.eq('low')].head(5)
for _, row in low_samples.iterrows():
    print(f"\n[Label: {row['Label']}]")
    print(f"  Original: {row['Original_Text']}")
    print(f"  Translated: {row['Translated_Text']}")
    # For rejected rows also show what the classifier predicted on each version
    print(f"  Pred (orig/trans): {row['original_pred_label']}/{row['translated_pred_label']}")


=== HIGH QUALITY SAMPLES ===

[Label: 0]
  Original: You're awesome!
  Translated: You're awesome!

[Label: 1]
  Original: Tum jaise logon se baat nahi karte.
  Translated: Tum jaise logon se baat nahi karte.

[Label: 0]
  Original: Thanks for your support yaar.
  Translated: Thanks for your support yaar.

[Label: 1]
  Original: Teri aukaat kya hai samjhta hai?
  Translated: Teri aukaat kya hai samjhta hai?

[Label: 0]
  Original: Nice effort, proud of you.
  Translated: Nice effort, proud of you.

=== LOW QUALITY SAMPLES ===

[Label: 0]
  Original: Tum bahut accha kaam karte ho.
  Translated: Tum bahut accha kaam karte ho.
  Pred (orig/trans): 1/1

[Label: 0]
  Original: Tum bahut accha kaam karte ho.
  Translated: Tum bahut accha kaam karte ho.
  Pred (orig/trans): 1/1

[Label: 0]
  Original: Tum bahut accha kaam karte ho.
  Translated: Tum bahut accha kaam karte ho.
  Pred (orig/trans): 1/1

[Label: 0]
  Original: Tum bahut accha kaam karte ho.
  Translated: Tum bahut accha kaam ka

In [13]:
# Keep the 'high' and 'medium' buckets; 'low' rows are dropped
keep_mask = df['quality_flag'].isin(['high', 'medium'])
filtered_df = df.loc[keep_mask].copy()

n_total = len(df)
n_kept = len(filtered_df)
n_removed = n_total - n_kept

print("=== FILTERING SUMMARY ===")
print(f"Original samples: {n_total}")
print(f"After filtering: {n_kept}")
print(f"Removed: {n_removed} ({n_removed/n_total:.1%})")

print("\nFiltered label distribution:")
print(filtered_df['Label'].value_counts())

=== FILTERING SUMMARY ===
Original samples: 25000
After filtering: 22558
Removed: 2442 (9.8%)

Filtered label distribution:
Label
1    13547
0     9011
Name: count, dtype: int64


In [14]:
# Compare the class proportions before and after filtering
original_balance = df['Label'].value_counts(normalize=True)
filtered_balance = filtered_df['Label'].value_counts(normalize=True)

print("\n=== Class Balance ===")
for stage, balance in (("Original", original_balance), ("Filtered", filtered_balance)):
    # Label 0 is non-bullying, label 1 is bullying
    print(f"{stage} - Non-bullying: {balance[0]:.1%}, Bullying: {balance[1]:.1%}")


=== Class Balance ===
Original - Non-bullying: 40.0%, Bullying: 60.0%
Filtered - Non-bullying: 39.9%, Bullying: 60.1%


In [15]:
# Assemble the export frame: core columns plus the optional code-mixing score
base_columns = ['ID', 'Original_Text', 'Translated_Text', 'Label',
                'quality_flag', 'translated_confidence']
optional_columns = [col for col in ('quality_score',) if col in filtered_df.columns]
output_columns = base_columns + optional_columns

# Language-specific column names make the exported file self-explanatory
column_renames = {
    'Original_Text': 'Text_HindiEnglish',
    'Translated_Text': 'Text_SinhalaEnglish',
}
final_filtered = filtered_df.loc[:, output_columns].rename(columns=column_renames)

print("Final filtered dataset:")
final_filtered.head()

Final filtered dataset:


Unnamed: 0,ID,Text_HindiEnglish,Text_SinhalaEnglish,Label,quality_flag,translated_confidence,quality_score
0,1,You're awesome!,You're awesome!,0,high,0.93986,0.2
1,2,Tum jaise logon se baat nahi karte.,Tum jaise logon se baat nahi karte.,1,high,0.493205,0.4
2,3,Thanks for your support yaar.,Thanks for your support yaar.,0,high,0.572508,0.4
3,4,Teri aukaat kya hai samjhta hai?,Teri aukaat kya hai samjhta hai?,1,high,0.520245,0.4
4,5,"Nice effort, proud of you.","Nice effort, proud of you.",0,high,0.898297,0.4


In [16]:
# Save filtered dataset
OUTPUT_PATH = '/content/drive/Othercomputers/My Laptop/HIN_SIN/dataset/quality_filtered.csv'
# Or for local: OUTPUT_PATH = '../dataset/quality_filtered.csv'
REMOVED_PATH = '/content/drive/Othercomputers/My Laptop/HIN_SIN/outputs/removed_samples.csv'
# Or for local: REMOVED_PATH = '../outputs/removed_samples.csv'

# to_csv does not create missing directories, so make sure both targets exist
# before writing.
for target_path in (OUTPUT_PATH, REMOVED_PATH):
    Path(target_path).parent.mkdir(parents=True, exist_ok=True)

final_filtered.to_csv(OUTPUT_PATH, index=False, encoding='utf-8')
print(f"Saved filtered dataset to: {OUTPUT_PATH}")

# Save removed samples for analysis
removed_df = df[df['quality_flag'] == 'low']
removed_df.to_csv(REMOVED_PATH, index=False, encoding='utf-8')
print(f"Saved {len(removed_df)} removed samples for analysis")

Saved filtered dataset to: /content/drive/Othercomputers/My Laptop/HIN_SIN/dataset/quality_filtered.csv
Saved 2442 removed samples for analysis


In [17]:
# Closing report: sample counts in and out, plus the per-class totals that were kept
banner = "=" * 60
n_input = len(df)
n_output = len(final_filtered)
n_dropped = n_input - n_output
n_non_bullying = int((final_filtered['Label'] == 0).sum())
n_bullying = int((final_filtered['Label'] == 1).sum())

print("\n" + banner)
print("QUALITY FILTERING COMPLETE")
print(banner)
print(f"\nInput: {n_input} samples")
print(f"Output: {n_output} samples")
print(f"Removed: {n_dropped} samples ({n_dropped/n_input:.1%})")
print("\nLabel distribution:")
print(f"  Non-bullying (0): {n_non_bullying}")
print(f"  Bullying (1): {n_bullying}")
print("\nNext step: Run 04_validation_prep.ipynb to prepare samples for human validation")


QUALITY FILTERING COMPLETE

Input: 25000 samples
Output: 22558 samples
Removed: 2442 samples (9.8%)

Label distribution:
  Non-bullying (0): 9011
  Bullying (1): 13547

Next step: Run 04_validation_prep.ipynb to prepare samples for human validation
