In [None]:
# Mount Google Drive (for Colab)
# google.colab only exists inside a Colab runtime. Guard the import so the
# notebook still runs top-to-bottom on a local kernel, where the local
# DATASET_PATH / OUTPUT_PATH alternatives noted further down apply.
try:
    from google.colab import drive
except ImportError:
    print("google.colab not available - skipping Google Drive mount (local run).")
else:
    drive.mount('/content/drive')

In [None]:
# Install required packages
!pip install -q pandas numpy langdetect regex tqdm

In [None]:
import re
import unicodedata

import numpy as np
import pandas as pd
from langdetect import detect, DetectorFactory
from tqdm import tqdm

# Set seed for reproducibility: pin langdetect's DetectorFactory seed so that
# language detection is intended to give the same answer on every run.
# NOTE(review): `detect` is imported but not called in the cells visible here.
DetectorFactory.seed = 0

In [None]:
# Load translated dataset
DATASET_PATH = '/content/drive/MyDrive/HIN_SIN/dataset/translated_raw.csv'
# Or for local: DATASET_PATH = '../dataset/translated_raw.csv'

df = pd.read_csv(DATASET_PATH, encoding='utf-8')

# Fail fast, with a readable message, if a column that later cells index by
# name is absent (otherwise the error surfaces deep inside a row loop).
REQUIRED_COLUMNS = ['ID', 'Original_Text', 'Translated_Text', 'Label']
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise KeyError(f"{DATASET_PATH} is missing required column(s): {missing_columns}")

# Blank CSV cells are parsed as NaN (a float). The text helpers below call
# str methods such as .split(), .lower() and .strip(), which would raise on a
# float, so normalise both text columns to plain strings up front.
TEXT_COLUMNS = ['Original_Text', 'Translated_Text']
blank_text_cells = int(df[TEXT_COLUMNS].isna().sum().sum())
df[TEXT_COLUMNS] = df[TEXT_COLUMNS].fillna('').astype(str)
if blank_text_cells:
    print(f"Replaced {blank_text_cells} blank text cell(s) with empty strings")

print(f"Dataset shape: {df.shape}")
df.head()

In [None]:
# Comprehensive English preservation vocabulary, grouped by theme.
# Each group is one whitespace-separated string; the groups are split and
# merged into a single set, so a word that appears in more than one group
# (e.g. 'her', 'no', 'love', 'done', 'know') is stored only once.
_PRESERVE_GROUPS = {
    'pronouns': (
        "i you he she it we they me him her us them "
        "my your his her its our their "
        "mine yours hers ours theirs"
    ),
    'common_verbs': (
        "is are am was were be been being "
        "have has had do does did done "
        "will would could should can may might must "
        "know think want need like love hate"
    ),
    'articles_determiners': (
        "the a an this that these those some any no"
    ),
    'conjunctions_prepositions': (
        "and or but so because if when where what who how "
        "in on at to for with from of by"
    ),
    # Internet slang is CRITICAL - it must be preserved.
    'internet_slang': (
        "bro dude man buddy yaar sis "
        "lol lmao rofl omg wtf af ngl tbh idk idc "
        "bruh yo sup wassup hey hi hello bye"
    ),
    # Insults & slang are preserved for cyberbullying context.
    'insults_slang': (
        "loser idiot stupid dumb fool moron jerk creep "
        "fake trash garbage pathetic lame weak coward "
        "ugly fat skinny weird crazy psycho retard"
    ),
    'positive_words': (
        "awesome cool nice great good best amazing wonderful "
        "beautiful lovely sweet kind kindest smart intelligent "
        "proud happy positive love support"
    ),
    'common_expressions': (
        "thanks thank sorry please welcome okay ok yes no not "
        "job done well effort keep up always never "
        "souls one know"
    ),
    # Contractions (common forms)
    'contractions': (
        "you're i'm it's that's what's don't won't can't "
        "isn't aren't wasn't weren't haven't hasn't hadn't "
        "wouldn't couldn't shouldn't"
    ),
}

ENGLISH_PRESERVE_LIST = {
    word
    for group_words in _PRESERVE_GROUPS.values()
    for word in group_words.split()
}

print(f"Total English words to preserve: {len(ENGLISH_PRESERVE_LIST)}")

In [None]:
def is_sinhala_char(char):
    """
    Tell whether `char` lies in the Sinhala Unicode block.

    The block spans U+0D80 through U+0DFF inclusive; the comparison is a
    plain string comparison against those two end points.
    """
    block_start, block_end = '\u0D80', '\u0DFF'
    if char < block_start:
        return False
    return char <= block_end

def is_sinhala_token(token):
    """
    Tell whether at least one character of `token` is Sinhala script.

    Returns False for an empty token.
    """
    for ch in token:
        if is_sinhala_char(ch):
            return True
    return False

def is_english_token(token):
    """
    Check if a token is purely English (ASCII letters).

    A token counts as English when it contains at least one ASCII letter and
    no letter or combining mark from any other script. Digits, punctuation,
    symbols and emoji are ignored, so "bro!" and "don't" are English, while
    "123", "ඔයා" and mixed-script tokens such as "ඔයාlage" are not.

    Previously any token containing a single ASCII letter was accepted, which
    let mixed-script tokens count as English and as Sinhala at the same time.

    Args:
        token: A single whitespace-delimited token.

    Returns:
        bool: True if the token's alphabetic content is entirely ASCII.
    """
    has_ascii_letter = False
    for ch in token:
        if ch.isascii():
            has_ascii_letter = has_ascii_letter or ch.isalpha()
        elif unicodedata.category(ch)[0] in 'LM':
            # A letter (L*) or combining mark (M*) outside ASCII means the
            # token carries another script, so it is not purely English.
            return False
    return has_ascii_letter

def get_english_ratio(text):
    """
    Share of whitespace-separated tokens in `text` that is_english_token accepts.

    Returns 0 when the text has no tokens at all.
    """
    tokens = text.split()
    total = len(tokens)
    if total == 0:
        return 0
    english_tokens = [tok for tok in tokens if is_english_token(tok)]
    return len(english_tokens) / total

def get_sinhala_ratio(text):
    """
    Share of whitespace-separated tokens in `text` that is_sinhala_token accepts.

    Returns 0 when the text has no tokens at all.
    """
    tokens = text.split()
    total = len(tokens)
    if total == 0:
        return 0
    sinhala_tokens = [tok for tok in tokens if is_sinhala_token(tok)]
    return len(sinhala_tokens) / total

# Smoke test: one Sinhala token followed by two English tokens
test_text = "ඔයා awesome bro!"
english_share = get_english_ratio(test_text)
sinhala_share = get_sinhala_ratio(test_text)
print(f"Text: {test_text}")
print(f"English ratio: {english_share:.2f}")
print(f"Sinhala ratio: {sinhala_share:.2f}")

In [None]:
def extract_english_from_original(original_text):
    """
    Collect the tokens of the original text that should survive translation.

    Each whitespace-separated token is reduced to its ASCII letters and
    apostrophes; if the lower-cased result is in ENGLISH_PRESERVE_LIST the
    token is kept exactly as written (punctuation included). Tokens are
    returned in their original order, duplicates included.
    """
    # Pair every token with its letters-and-apostrophes form.
    candidates = (
        (token, re.sub(r'[^a-zA-Z\']', '', token))
        for token in original_text.split()
    )
    # The bare form can only hold ASCII characters, so a non-empty check is
    # all that is needed before the vocabulary lookup.
    return [
        token
        for token, bare in candidates
        if bare and bare.lower() in ENGLISH_PRESERVE_LIST
    ]

def restore_english_tokens(original_text, translated_text):
    """
    Placeholder for restoring English tokens that the translation dropped.

    The function works out which preserve-list tokens of the original are
    absent from the translation (case-insensitive substring check), but no
    reconstruction is implemented yet: the translation is returned unchanged
    on every path.
    """
    preserved = extract_english_from_original(original_text)

    if preserved:
        haystack = translated_text.lower()
        # Tokens whose letters-and-apostrophes form does not occur in the translation.
        missing_english = [
            token
            for token in preserved
            if re.sub(r'[^a-zA-Z\']', '', token).lower() not in haystack
        ]
        # Strategy (not implemented): analyze the original structure and
        # rebuild the sentence around `missing_english`. May need refinement.

    return translated_text

# Smoke test: which tokens of an all-English sentence are on the preserve list?
test_original, test_translated = "You're awesome bro!", "ඔයා නියමයි!"
preserved_tokens = extract_english_from_original(test_original)
print(f"Original: {test_original}")
print(f"English to preserve: {preserved_tokens}")

In [None]:
def analyze_code_mixing(original_text, translated_text):
    """
    Analyze the code-mixing quality of a translation.

    Args:
        original_text: Source sentence (str).
        translated_text: Its translation (str).

    Returns a dict with:
    - original_english_ratio: English token ratio in the original
    - translated_english_ratio: English token ratio in the translation
    - translated_sinhala_ratio: Sinhala token ratio in the translation
    - english_preserved_ratio: Fraction of the original's preserve-list
      tokens that appear as whole words in the translation (1.0 when the
      original has none)
    - quality_score: Overall quality score (0-1)
    """
    original_english = extract_english_from_original(original_text)
    orig_eng_ratio = get_english_ratio(original_text)
    trans_eng_ratio = get_english_ratio(translated_text)
    trans_sin_ratio = get_sinhala_ratio(translated_text)

    def _normalize(word):
        # Letters and apostrophes only, lower-cased. Apostrophes are kept so
        # contractions stay intact; surrounding quote marks are dropped.
        word = word.replace('\u2019', "'")
        return re.sub(r'[^a-zA-Z\']', '', word).lower().strip("'")

    # Check if English words are preserved.
    # Compare whole words rather than substrings: a substring test reports
    # 'a' as preserved whenever the translation contains e.g. "awesome", and
    # stripping the apostrophe from only one side ("youre" vs "you're") made
    # preserved contractions look lost.
    ascii_runs = re.sub(r'[^a-zA-Z\']+', ' ', translated_text.replace('\u2019', "'"))
    translated_words = {run.lower().strip("'") for run in ascii_runs.split()}
    translated_words.discard('')

    preserved_count = sum(1 for w in original_english
                          if _normalize(w) in translated_words)
    preservation_ratio = preserved_count / len(original_english) if original_english else 1.0

    # Quality score calculation
    # Good translation: has Sinhala + preserved English
    quality_score = 0.0

    # Penalize if no Sinhala (means no translation happened)
    if trans_sin_ratio > 0:
        quality_score += 0.4

    # Reward English preservation
    quality_score += 0.4 * preservation_ratio

    # Bonus for natural code-mixing (not pure Sinhala or pure English)
    if 0.1 < trans_eng_ratio < 0.9 and trans_sin_ratio > 0.1:
        quality_score += 0.2

    return {
        'original_english_ratio': orig_eng_ratio,
        'translated_english_ratio': trans_eng_ratio,
        'translated_sinhala_ratio': trans_sin_ratio,
        'english_preserved_ratio': preservation_ratio,
        'quality_score': quality_score
    }

In [None]:
# Score every (original, translation) pair and keep its ID and label alongside
print("Analyzing code-mixing quality...")

analysis_results = []
row_fields = zip(df['ID'], df['Original_Text'], df['Translated_Text'], df['Label'])

for row_id, original, translated, label in tqdm(row_fields, total=len(df)):
    metrics = analyze_code_mixing(original, translated)
    metrics.update({'ID': row_id, 'Label': label})
    analysis_results.append(metrics)

analysis_df = pd.DataFrame(analysis_results)

mean_quality = analysis_df['quality_score'].mean()
mean_preserved = analysis_df['english_preserved_ratio'].mean()
mean_sinhala = analysis_df['translated_sinhala_ratio'].mean()

print(f"\n=== Code-Mixing Quality Analysis ===")
print(f"Average quality score: {mean_quality:.3f}")
print(f"Avg English preservation: {mean_preserved:.3f}")
print(f"Avg Sinhala ratio in translations: {mean_sinhala:.3f}")

In [None]:
# Bucket the analysed rows into problem categories
# 1. Pure English (no translation): no Sinhala token in the translation
no_sinhala = analysis_df['translated_sinhala_ratio'].eq(0)
pure_english = analysis_df.loc[no_sinhala]
print(f"Pure English (no translation): {len(pure_english)}")

# 2. Pure Sinhala (over-translated): no English token in the translation
no_english = analysis_df['translated_english_ratio'].eq(0)
pure_sinhala = analysis_df.loc[no_english]
print(f"Pure Sinhala (over-translated): {len(pure_sinhala)}")

# 3. Low English preservation: under half of the preserve-list tokens kept
under_half_kept = analysis_df['english_preserved_ratio'].lt(0.5)
low_preservation = analysis_df.loc[under_half_kept]
print(f"Low English preservation (<50%): {len(low_preservation)}")

# 4. Good code-mixed samples: quality score of at least 0.6
scored_well = analysis_df['quality_score'].ge(0.6)
good_samples = analysis_df.loc[scored_well]
print(f"Good quality (score >= 0.6): {len(good_samples)}")

In [None]:
def fix_code_mixing(original_text, translated_text):
    """
    Fix code-mixing issues in translated text.

    Current behaviour:
    1. If more than 90% of the original's tokens count as English
       (get_english_ratio), the original text is returned and the
       translation is discarded.
    2. If the translation equals the original once surrounding whitespace is
       stripped, the translation is returned untouched.
    3. Otherwise the translation is returned with every run of whitespace
       collapsed to a single space.

    Restoring English words that the translation dropped is NOT implemented.
    The earlier version computed the missing words and then discarded them;
    that dead code has been removed and the returned value is unchanged.

    NOTE(review): get_english_ratio only looks at ASCII letters, so an
    original written in romanized (Latin-script) Hindi would also hit case 1
    and lose its translation - confirm that is intended for this dataset.

    Args:
        original_text: Source sentence (str).
        translated_text: Its translation (str).

    Returns:
        str: The text to store as the translation.
    """
    # Case 1: Original was pure English - keep as is
    if get_english_ratio(original_text) > 0.9:
        return original_text

    # Case 2: Translation is same as original (no translation happened)
    if original_text.strip() == translated_text.strip():
        return translated_text

    # Case 3: keep the translation, normalising whitespace only.
    # TODO: re-insert preserve-list words missing from the translation, or
    # flag such rows for manual review.
    return ' '.join(translated_text.split())

# Run fix_code_mixing over every row and collect the results in a new frame
print("Applying code-mixing fixes...")

fixed_translations = []
source_rows = zip(df['ID'], df['Original_Text'], df['Translated_Text'], df['Label'])

for row_id, original, translated, label in tqdm(source_rows, total=len(df)):
    fixed_translations.append({
        'ID': row_id,
        'Original_Text': original,
        'Translated_Text': fix_code_mixing(original, translated),
        'Label': label
    })

fixed_df = pd.DataFrame(fixed_translations)
print("Fixes applied!")

In [None]:
# Score the fixed translations with the same analysis as before
print("Re-analyzing after fixes...")

fixed_analysis = [
    analyze_code_mixing(original, translated)
    for original, translated in zip(fixed_df['Original_Text'], fixed_df['Translated_Text'])
]

fixed_analysis_df = pd.DataFrame(fixed_analysis)

fixed_mean_quality = fixed_analysis_df['quality_score'].mean()
fixed_mean_preserved = fixed_analysis_df['english_preserved_ratio'].mean()

print(f"\n=== After Fixes ===")
print(f"Average quality score: {fixed_mean_quality:.3f}")
print(f"Avg English preservation: {fixed_mean_preserved:.3f}")

In [None]:
# Copy the post-fix metrics onto the fixed dataframe (target column <- analysis column)
metric_columns = (
    ('quality_score', 'quality_score'),
    ('english_preserved', 'english_preserved_ratio'),
    ('sinhala_ratio', 'translated_sinhala_ratio'),
)
for target_col, analysis_col in metric_columns:
    fixed_df[target_col] = fixed_analysis_df[analysis_col]

# Preview the first rows (at most 10)
print("\n=== Sample Translations ===")
for _, row in fixed_df.head(10).iterrows():
    print(f"\n[{row['Label']}] Original: {row['Original_Text']}")
    print(f"    Translated: {row['Translated_Text']}")
    print(f"    Quality: {row['quality_score']:.2f}")

In [None]:
# Write the fixed dataset (with its quality metrics) to disk
OUTPUT_PATH = '/content/drive/MyDrive/HIN_SIN/dataset/code_mixed_preserved.csv'
# Or for local: OUTPUT_PATH = '../dataset/code_mixed_preserved.csv'

fixed_df.to_csv(OUTPUT_PATH, index=False, encoding='utf-8')
print(f"Saved code-mixed dataset to: {OUTPUT_PATH}")

# Count rows per quality band: high >= 0.6, medium in [0.3, 0.6), low < 0.3
scores = fixed_df['quality_score']
high_count = int((scores >= 0.6).sum())
medium_count = int(((scores >= 0.3) & (scores < 0.6)).sum())
low_count = int((scores < 0.3).sum())

print(f"\n=== Summary ===")
print(f"Total samples: {len(fixed_df)}")
print(f"High quality (score >= 0.6): {high_count}")
print(f"Medium quality (0.3-0.6): {medium_count}")
print(f"Low quality (< 0.3): {low_count}")
print(f"\nNext step: Run 03_quality_filtering.ipynb")