In [None]:
# Mount Google Drive when running on Colab. On a local kernel the
# google.colab package is not installed, so skip the mount instead of
# failing on the very first cell (local dataset paths are supported below).
try:
    from google.colab import drive
except ImportError:
    drive = None
    print("google.colab not available - skipping Google Drive mount")
else:
    drive.mount('/content/drive')

In [None]:
# Install required packages
!pip install -q transformers torch sentencepiece pandas tqdm langdetect
!pip install -q git+https://github.com/AI4Bharat/IndicTrans2.git

In [None]:
import os
import re
import warnings

import numpy as np
import pandas as pd
import torch
from langdetect import detect, DetectorFactory
from tqdm import tqdm

# Pin langdetect's seed so repeated runs behave the same
DetectorFactory.seed = 0

# Choose the compute device: GPU when CUDA is usable, CPU otherwise
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

In [None]:
# Load the Hindi-English dataset
# Update path based on your setup
DATASET_PATH = '/content/drive/MyDrive/HIN_SIN/dataset/HindiEnglish.csv'
# Or for local: DATASET_PATH = '../dataset/HindiEnglish.csv'

df = pd.read_csv(DATASET_PATH, encoding='utf-8')

# Fail fast: the translation loop reads these columns from every row, so a
# wrong CSV should be reported here rather than midway through processing.
REQUIRED_COLUMNS = ('ID', 'Text', 'Label')
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise KeyError(f"{DATASET_PATH} is missing required columns: {missing_columns}")

print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nLabel distribution:\n{df['Label'].value_counts()}")
df.head(10)

In [None]:
# Initialize IndicTrans2 for Hindi to Sinhala translation
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Hub checkpoint used for Indic-to-Indic translation
MODEL_NAME = "ai4bharat/indictrans2-indic-indic-1B"

# Shared loading options. trust_remote_code=True allows transformers to run
# Python code shipped with the checkpoint, so only use it with a trusted repo.
hub_kwargs = dict(trust_remote_code=True)

print("Loading IndicTrans2 model... (this may take a few minutes)")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, **hub_kwargs)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, **hub_kwargs)
model = model.to(device)  # move the weights to the selected device
print("Model loaded successfully!")

In [None]:
# Language detection helper functions
def detect_language(text):
    """
    Detect the language of a text with langdetect.

    Args:
        text: The text to classify.

    Returns:
        The language code reported by ``detect`` (e.g. 'en' or 'hi'), or
        'unknown' when the text is blank, is not a string, or detection fails.
    """
    # Blank or non-string input has nothing to detect; skip the detector call.
    if not isinstance(text, str) or not text.strip():
        return 'unknown'
    try:
        return detect(text)
    except Exception:
        # Best-effort: any detector failure means "unknown", while
        # KeyboardInterrupt / SystemExit are allowed to propagate.
        return 'unknown'

def is_english_token(token):
    """
    Check whether a token looks like an English word, judged by characters only.

    A token qualifies when it consists of ASCII letters, optionally with
    internal apostrophes (contractions such as "you're" or "don't"), and
    optionally followed by trailing punctuation from the set ! ' , . ?

    This is a character-level test: any word written in ASCII letters matches,
    including Romanized Hindi such as "kya".

    Args:
        token: A single whitespace-delimited token.

    Returns:
        True if the token matches the pattern, otherwise False.
    """
    # Contractions must match as well: PRESERVE_ENGLISH lists entries such as
    # "you're" and "don't", and they can only be preserved if this returns True.
    return bool(re.match(r"^[a-zA-Z]+(?:'[a-zA-Z]+)*[!',.?]*$", token))

def is_hindi_token(token):
    """
    Report whether at least one character of the token is Devanagari.
    """
    # The Devanagari block spans U+0900 through U+097F (inclusive)
    return any('\u0900' <= ch <= '\u097F' for ch in token)

# Lower-cased words treated as Romanized Hindi (Hinglish); built once at
# definition time instead of on every call.
# NOTE(review): a few entries (e.g. 'face', 'idiot', 'irritating',
# 'presentation') look English rather than Hindi - confirm they belong here.
_ROMANIZED_HINDI_WORDS = frozenset((
    'tum', 'kya', 'hai', 'nahi', 'karte', 'ho', 'kahi', 'ka', 'ki', 'ke',
    'tera', 'teri', 'mera', 'meri', 'tumhara', 'tumhari', 'uska', 'uski',
    'bohot', 'bahut', 'accha', 'acha', 'badiya', 'kaam', 'baat', 'log',
    'logon', 'jaise', 'kaise', 'pagal', 'bakwas', 'band', 'kar', 'samjhta',
    'samajh', 'aukaat', 'sharam', 'aati', 'dekhne', 'layak', 'face',
    'nautanki', 'dimag', 'kharab', 'irritating', 'sakta', 'itna', 'idiot',
    'presentation', 'tha', 'bhai', 'kamaal', 'diya', 'tujhe', 'gaya',
))


def is_romanized_hindi(token):
    """
    Heuristic check for Romanized Hindi (Hinglish).

    Returns True when the lower-cased token is one of the words in the
    fixed lookup list, False otherwise.
    """
    return token.lower() in _ROMANIZED_HINDI_WORDS

# Quick look at how the token classifiers label a few sample tokens
test_tokens = ['You', 'awesome', 'Tum', 'kya', 'hai', 'bhai', '!', 'yaar']
for sample in test_tokens:
    looks_english = is_english_token(sample)
    looks_hindi = is_romanized_hindi(sample)
    print(f"{sample}: English={looks_english}, Hindi={looks_hindi}")

In [None]:
# English words/slang that are kept verbatim (NOT translated).
# Written as whitespace-separated word lists and split into a single set.
PRESERVE_ENGLISH = set((
    # Common English words in code-mixed text
    "you your i me my we us they them he she it "
    "is are am was were be been being "
    "have has had do does did will would could should "
    "the a an this that these those "
    "and or but so because if when where what who how "
    "not no yes ok okay "
    # Internet slang & expressions (MUST preserve)
    "bro dude man buddy yaar lol omg wtf lmao rofl "
    "fake loser idiot stupid dumb fool jerk moron "
    "awesome cool nice great good bad worst best "
    "thanks sorry please welcome "
    "love hate like proud happy sad "
    # Common expressions
    "you're i'm it's that's what's don't won't can't "
    "job done well effort support positive kindest souls know "
    "keep up always"
).split())

print(f"Total English words to preserve: {len(PRESERVE_ENGLISH)}")

In [None]:
def translate_hindi_to_sinhala(text, tokenizer, model, device,
                               src_lang="hin_Deva", tgt_lang="sin_Sinh"):
    """
    Translate Hindi text to Sinhala using IndicTrans2.

    Args:
        text: Source text to translate.
        tokenizer: Tokenizer loaded for the IndicTrans2 checkpoint.
        model: Seq2seq model exposing ``generate``.
        device: Device the input tensors are moved to (e.g. 'cuda' or 'cpu').
        src_lang: Source language tag placed first in the model input.
        tgt_lang: Target language tag placed second in the model input.

    Returns:
        The decoded translation, or ``text`` unchanged when it is blank.
    """
    # Nothing to translate: skip the expensive generate call.
    if not text.strip():
        return text

    # IndicTrans2 takes "<src_tag> <tgt_tag> <sentence>" with FLORES-style
    # tags; a "<2si>" prefix is not a tag format of this model family.
    # NOTE(review): the upstream usage recipe additionally wraps the model
    # with IndicProcessor preprocess_batch / postprocess_batch - confirm
    # whether that normalization is required for correct Sinhala output.
    input_text = f"{src_lang} {tgt_lang} {text}"

    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=256)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=256,
            num_beams=5,
            early_stopping=True
        )

    # A single sentence went in, so the first sequence is the translation
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Smoke-test the translator on a single Devanagari sentence
test_hindi = "तुम बहुत अच्छा काम करते हो"
print(f"Hindi: {test_hindi}")
test_sinhala = translate_hindi_to_sinhala(test_hindi, tokenizer, model, device)
print(f"Sinhala: {test_sinhala}")

In [None]:
# Punctuation stripped from both ends of a token before it is classified.
_TOKEN_PUNCTUATION = '.,!?\'"'


def _counts_as_english(clean_token):
    """Return True for an ASCII word that is a preserve-list entry or not known Romanized Hindi."""
    if not is_english_token(clean_token):
        return False
    return (clean_token.lower() in PRESERVE_ENGLISH
            or not is_romanized_hindi(clean_token))


def _flush_hindi_buffer(hindi_buffer, result_tokens, tokenizer, model, device):
    """Translate the buffered tokens as one phrase, append the result, and empty the buffer."""
    if not hindi_buffer:
        return
    hindi_text = ' '.join(hindi_buffer)
    try:
        result_tokens.append(
            translate_hindi_to_sinhala(hindi_text, tokenizer, model, device)
        )
    except Exception as exc:
        # Best-effort fallback: keep the source phrase, but make the failure
        # visible instead of silently returning untranslated text.
        warnings.warn(
            f"Hindi->Sinhala translation failed ({type(exc).__name__}: {exc}); "
            "keeping source text",
            RuntimeWarning,
        )
        result_tokens.append(hindi_text)
    hindi_buffer.clear()


def translate_code_mixed_sentence(sentence, tokenizer, model, device,
                                  english_threshold=0.7):
    """
    Translate a Hindi-English code-mixed sentence to Sinhala-English.
    Preserves English tokens and translates only Hindi portions.

    Strategy:
    1. Split the sentence on whitespace
    2. Return it unchanged when it is ASCII-only and mostly English
    3. Otherwise keep tokens found in PRESERVE_ENGLISH and buffer the rest
    4. Translate each run of buffered tokens as one phrase
    5. Join preserved tokens and translated phrases in order

    Args:
        sentence: The code-mixed input sentence.
        tokenizer: Tokenizer passed through to translate_hindi_to_sinhala.
        model: Model passed through to translate_hindi_to_sinhala.
        device: Device passed through to translate_hindi_to_sinhala.
        english_threshold: An ASCII-only sentence is returned unchanged when
            the share of English tokens is strictly above this value.

    Returns:
        The reconstructed sentence; the input itself when it is blank or
        judged to be mostly English.
    """
    tokens = sentence.split()
    # Blank input: nothing to translate, and the ratio below would be 0/0.
    if not tokens:
        return sentence

    # Handle (mostly) English sentences. Romanized Hindi is ASCII as well, so
    # known Hinglish words must not be counted as English here; otherwise a
    # fully Romanized Hindi sentence would be returned untranslated.
    if sentence.isascii():
        english_count = sum(
            1 for token in tokens
            if _counts_as_english(token.strip(_TOKEN_PUNCTUATION))
        )
        if english_count / len(tokens) > english_threshold:
            return sentence  # Keep as is

    result_tokens = []
    hindi_buffer = []

    for token in tokens:
        clean_token = token.strip(_TOKEN_PUNCTUATION)

        if (is_english_token(clean_token)
                and clean_token.lower() in PRESERVE_ENGLISH):
            # Translate whatever was collected before this English token,
            # then keep the token (with its punctuation) untouched.
            _flush_hindi_buffer(hindi_buffer, result_tokens, tokenizer, model, device)
            result_tokens.append(token)
        else:
            # Devanagari, Romanized Hindi and unknown tokens are all buffered
            # so consecutive ones are translated together as one phrase.
            hindi_buffer.append(token)

    # Translate any trailing run of buffered tokens
    _flush_hindi_buffer(hindi_buffer, result_tokens, tokenizer, model, device)

    return ' '.join(result_tokens)

In [None]:
# Run the code-mixed pipeline on a handful of dataset-style sentences
test_sentences = [
    "You're awesome!",
    "Tum jaise logon se baat nahi karte.",
    "Thanks for your support yaar.",
    "Teri aukaat kya hai samjhta hai?",
    "Idiot kahi ka!",
    "Good job, well done!",
    "Bakwas band kar.",
]

banner = "=" * 60
print(banner)
print("TRANSLATION TEST")
print(banner)

for sample_sentence in test_sentences:
    sample_output = translate_code_mixed_sentence(
        sample_sentence, tokenizer, model, device
    )
    print(f"\nOriginal: {sample_sentence}")
    print(f"Translated: {sample_output}")

In [None]:
# Process the entire dataset
CHECKPOINT_PATH = '/content/drive/MyDrive/HIN_SIN/outputs/translation_checkpoint.csv'
CHECKPOINT_EVERY = 100  # rows between checkpoint writes

# Create the checkpoint folder up front so the first checkpoint write cannot
# abort the run because the directory is missing.
os.makedirs(os.path.dirname(CHECKPOINT_PATH) or '.', exist_ok=True)

print(f"Processing {len(df)} sentences...")

translated_texts = []
errors = []

# Rows are counted by position: the DataFrame index labels are not
# guaranteed to be the integers 0..n-1.
progress = tqdm(df.iterrows(), total=len(df), desc="Translating")
for row_number, (_, row) in enumerate(progress, start=1):
    original_text = row['Text']
    try:
        translated_text = translate_code_mixed_sentence(
            original_text, tokenizer, model, device
        )
    except Exception as e:
        errors.append({'ID': row['ID'], 'Text': original_text, 'Error': str(e)})
        translated_text = original_text  # Keep original on error

    translated_texts.append({
        'ID': row['ID'],
        'Original_Text': original_text,
        'Translated_Text': translated_text,
        'Label': row['Label']
    })

    # Save checkpoint every CHECKPOINT_EVERY samples
    if row_number % CHECKPOINT_EVERY == 0:
        checkpoint_df = pd.DataFrame(translated_texts)
        checkpoint_df.to_csv(CHECKPOINT_PATH, index=False, encoding='utf-8')

print(f"\nCompleted! Errors: {len(errors)}")

In [None]:
# Build the output table from the collected per-row records
translated_df = pd.DataFrame(data=translated_texts)
translated_shape = translated_df.shape

print(f"\nTranslated Dataset Shape: {translated_shape}")
print("\nSample translations:")
translated_df.head(10)

In [None]:
# Save the translated dataset
OUTPUT_PATH = '/content/drive/MyDrive/HIN_SIN/dataset/translated_raw.csv'
# Or for local: OUTPUT_PATH = '../dataset/translated_raw.csv'
ERRORS_PATH = '/content/drive/MyDrive/HIN_SIN/outputs/translation_errors.csv'

# Make sure the target folder exists so the save cannot fail on a missing
# directory after the long translation run.
os.makedirs(os.path.dirname(OUTPUT_PATH) or '.', exist_ok=True)
translated_df.to_csv(OUTPUT_PATH, index=False, encoding='utf-8')
print(f"Saved translated dataset to: {OUTPUT_PATH}")

# Save errors log if any
if errors:
    os.makedirs(os.path.dirname(ERRORS_PATH) or '.', exist_ok=True)
    errors_df = pd.DataFrame(errors)
    errors_df.to_csv(ERRORS_PATH, index=False, encoding='utf-8')
    print(f"Saved {len(errors)} errors to translation_errors.csv")

In [None]:
# Summary statistics for the run
total_samples = len(translated_df)
error_count = len(errors)  # rows whose translation call raised
rule = "=" * 60

print(rule)
print("TRANSLATION SUMMARY")
print(rule)
print(f"Total samples: {total_samples}")
print(f"Successful translations: {total_samples - error_count}")
print(f"Errors: {error_count}")
print(f"\nLabel distribution:")
print(translated_df['Label'].value_counts())
print(f"\nNext step: Run 02_code_mixing.ipynb to refine code-mixing preservation")