<a href="https://colab.research.google.com/github/chaitanya-maddala-236/Cross-Language-Sentimenal-Analysis/blob/main/Cross_Language.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

!pip install transformers datasets scikit-learn pandas numpy matplotlib seaborn torch -q

# Importing Libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from datasets import load_dataset
import os
from tqdm import tqdm
import json
from scipy import stats
pip install -U transformers accelerate datasets



In [None]:
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's built-in ``random`` module, NumPy's global generator and
    PyTorch (CPU, plus all CUDA devices when CUDA is available).

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.

    Returns:
        None. The function only mutates global RNG state.
    """
    # Local import: the notebook's import cell does not bring in `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # Covers every visible GPU, not just the current device.
        torch.cuda.manual_seed_all(seed)

# Fix all RNG state up front so the sampling done in later cells is repeatable.
set_seed(42)

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")


In [None]:

print("="*80)
print("STEP 1: LOADING DATASETS")
print("="*80)

# 1.1 Load English Training Data (IMDb)
print("\nЁЯУе Loading English IMDb dataset...")
try:
    imdb_dataset = load_dataset("imdb")

    # Sample for faster training (remove sampling for full dataset).
    # Shuffling with a fixed seed before selecting keeps the subsets reproducible.
    # Note: the validation subset is drawn from IMDb's 'test' split.
    train_data = imdb_dataset['train'].shuffle(seed=42).select(range(5000))
    val_data = imdb_dataset['test'].shuffle(seed=42).select(range(1000))

    print(f"тЬЕ English Training samples: {len(train_data)}")
    print(f"тЬЕ English Validation samples: {len(val_data)}")
    print(f"Sample: {train_data[0]['text'][:100]}...")
    print(f"Label: {train_data[0]['label']} (0=Negative, 1=Positive)")
except Exception as e:
    print(f"тЭМ Error loading IMDb: {e}")
    print("Note: Run this on Kaggle with internet enabled")
    # train_data / val_data are required by the preprocessing cell; stop here with
    # the real error instead of failing later with an unrelated NameError.
    raise

# 1.2 Load Indic Language Test Data
print("\nЁЯУе Loading Indic language datasets...")


In [None]:
def create_sample_indic_data():
    """Create comprehensive sample Indic data for demonstration"""

    # HINDI SAMPLES (99 examples: 50 positive, 49 negative)
    hindi_positive = [
        'рдпрд╣ рдлрд┐рд▓реНрдо рдмрд╣реБрдд рдЕрдЪреНрдЫреА рд╣реИ',  # This movie is very good
        'рд╢рд╛рдирджрд╛рд░ рдЕрднрд┐рдирдп рдФрд░ рдХрд╣рд╛рдиреА',  # Great acting and story
        'рдмрд╣реБрдд рд░реЛрдорд╛рдВрдЪрдХ рдФрд░ рдордиреЛрд░рдВрдЬрдХ',  # Very exciting and entertaining
        'рдореБрдЭреЗ рдпрд╣ рдлрд┐рд▓реНрдо рдмрд╣реБрдд рдкрд╕рдВрдж рдЖрдИ',  # I really liked this movie
        'рдХрдорд╛рд▓ рдХреА рдлрд┐рд▓реНрдо рд╣реИ',  # Amazing movie
        'рдмреЗрд╣рддрд░реАрди рдирд┐рд░реНрджреЗрд╢рди рдФрд░ рд╕рдВрдЧреАрдд',  # Excellent direction and music
        'рдкрд░рд┐рд╡рд╛рд░ рдХреЗ рд╕рд╛рде рджреЗрдЦрдиреЗ рд▓рд╛рдпрдХ',  # Worth watching with family
        'рд╕рднреА рдЕрднрд┐рдиреЗрддрд╛рдУрдВ рдиреЗ рд╢рд╛рдирджрд╛рд░ рдХрд╛рдо рдХрд┐рдпрд╛',  # All actors did great work
        'рдпрд╣ рд╕рд╛рд▓ рдХреА рд╕рдмрд╕реЗ рдЕрдЪреНрдЫреА рдлрд┐рд▓реНрдо рд╣реИ',  # This is the best movie of the year
        'рдордиреЛрд░рдВрдЬрдХ рдФрд░ рдкреНрд░реЗрд░рдгрд╛рджрд╛рдпрдХ',  # Entertaining and inspiring
        'рджрд┐рд▓ рдХреЛ рдЫреВ рдЬрд╛рдиреЗ рд╡рд╛рд▓реА рдХрд╣рд╛рдиреА',  # Heart-touching story
        'рдмрдЪреНрдЪреЛрдВ рдХреЗ рд▓рд┐рдП рдПрдХрджрдо рд╕рд╣реА',  # Perfect for children
        'рд╣рд░ рдХрд┐рд╕реА рдХреЛ рджреЗрдЦрдиреА рдЪрд╛рд╣рд┐рдП',  # Everyone should watch
        'рдкреИрд╕рд╛ рд╡рд╕реВрд▓ рдлрд┐рд▓реНрдо',  # Value for money movie
        'рдзрдорд╛рдХреЗрджрд╛рд░ рдПрдХреНрд╢рди рд╕реАрди',  # Explosive action scenes
        'рдЧрд╛рдиреЗ рдмрд╣реБрдд рд╕реБрдВрджрд░ рд╣реИрдВ',  # Songs are very beautiful
        'рдХрд╣рд╛рдиреА рдореЗрдВ рдирдпрд╛ рдЯреНрд╡рд┐рд╕реНрдЯ',  # New twist in the story
        'рд╡рд┐рдЬреБрдЕрд▓ рдЗрдлреЗрдХреНрдЯреНрд╕ рдХрдорд╛рд▓ рдХреЗ',  # Amazing visual effects
        'рднрд╛рд╡рдирд╛рддреНрдордХ рдФрд░ рдорд╛рд░реНрдорд┐рдХ',  # Emotional and poignant
        'рд╣рд╛рд╕реНрдп рд╕реЗ рднрд░рдкреВрд░',  # Full of humor
        'рд░реЛрдорд╛рдВрд╕ рдмрд╣реБрдд рдЕрдЪреНрдЫрд╛ рд╣реИ',  # Romance is very good
        'рд╕рдВрджреЗрд╢ рджреЗрдиреЗ рд╡рд╛рд▓реА рдлрд┐рд▓реНрдо',  # Message-oriented movie
        'рдпрд╛рджрдЧрд╛рд░ рдлрд┐рд▓реНрдо рдмрди рдЧрдИ',  # Became a memorable movie
        'рд╕рд┐рдиреЗрдорд╛рдШрд░реЛрдВ рдореЗрдВ рдЬрд░реВрд░ рджреЗрдЦреЗрдВ',  # Must watch in theaters
        'рдкреВрд░реЗ рдкрд░рд┐рд╡рд╛рд░ рдХрд╛ рдордиреЛрд░рдВрдЬрди',  # Entertainment for whole family
        'рдмреЗрд╣рддрд░реАрди рдкрдЯрдХрдерд╛',  # Excellent screenplay
        'рдЕрднрд┐рдирдп рдХреА рдмреЗрд╣рддрд░реАрди рдорд┐рд╕рд╛рд▓',  # Great example of acting
        'рд╕рд┐рдиреЗрдореЗрдЯреЛрдЧреНрд░рд╛рдлреА рд╢рд╛рдирджрд╛рд░ рд╣реИ',  # Cinematography is wonderful
        'рд╕рдВрд╡рд╛рдж рдмрд╣реБрдд рдкреНрд░рднрд╛рд╡реА рд╣реИрдВ',  # Dialogues are very effective
        'рдХреНрд▓рд╛рдЗрдореЗрдХреНрд╕ рдзрдорд╛рдХреЗрджрд╛рд░',  # Climax is explosive
        'рд╕рд╕реНрдкреЗрдВрд╕ рдмрдирд╛ рд░рд╣рддрд╛ рд╣реИ',  # Suspense is maintained
        'рд░реЛрдиреЗ рдкрд░ рдордЬрдмреВрд░ рдХрд░ рджреЗрдЧреА',  # Will make you cry
        'рд╣рдВрд╕рд╛рдиреЗ рдореЗрдВ рдХрд╛рдордпрд╛рдм',  # Successful in making laugh
        'рдкреНрд░реЗрдо рдХрд╣рд╛рдиреА рджрд┐рд▓ рдХреЛ рдЫреВ рдЧрдИ',  # Love story touched the heart
        'рджреЗрдЦрддреЗ рд╣реА рд░рд╣реЛрдЧреЗ',  # You'll keep watching
        'рд╕рдордп рдмреАрддрдиреЗ рдХрд╛ рдкрддрд╛ рдирд╣реАрдВ рдЪрд▓рд╛',  # Didn't realize time passing
        'рд╕рднреА рдЙрдореНрд░ рдХреЗ рд▓реЛрдЧреЛрдВ рдХреЗ рд▓рд┐рдП',  # For all age groups
        'рдмреЙрд▓реАрд╡реБрдб рдХреА рд╢рд╛рди',  # Pride of Bollywood
        'рдмреНрд▓реЙрдХрдмрд╕реНрдЯрд░ рдлрд┐рд▓реНрдо',  # Blockbuster movie
        'рд╣рд┐рдЯ рд╣реЛрдиреЗ рд╡рд╛рд▓реА рдлрд┐рд▓реНрдо',  # Movie that will be a hit
        'рджреЛрдмрд╛рд░рд╛ рджреЗрдЦрдиреЗ рд▓рд╛рдпрдХ',  # Worth watching again
        'рдкреВрд░реА рддрд░рд╣ рдордиреЛрд░рдВрдЬрдХ',  # Completely entertaining
        'рдЧрдЬрдм рдХреА рдлрд┐рд▓реНрдо рд╣реИ',  # Awesome movie
        'рд▓рд╛рдЬрд╡рд╛рдм рдкреНрд░рд╕реНрддреБрддрд┐',  # Marvelous presentation
        'рджрдорджрд╛рд░ рдЕрднрд┐рдирдп',  # Powerful acting
        'рд░реЛрдорд╛рдВрдЪ рд╕реЗ рднрд░рд╛ рдЕрдиреБрднрд╡',  # Thrilling experience
        'рдпрдерд╛рд░реНрдерд╡рд╛рджреА рдЪрд┐рддреНрд░рдг',  # Realistic portrayal
        'рд╕рд╛рдорд╛рдЬрд┐рдХ рд╕рдВрджреЗрд╢ рдХреЗ рд╕рд╛рде',  # With social message
        'рдорд╛рд╕реНрдЯрд░рдкреАрд╕ рдлрд┐рд▓реНрдо',  # Masterpiece movie
        'рд╣рд░ рджреГрд╢реНрдп рд╢рд╛рдирджрд╛рд░',  # Every scene is wonderful
    ]

    hindi_negative = [
        'рдореБрдЭреЗ рдпрд╣ рдлрд┐рд▓реНрдо рдкрд╕рдВрдж рдирд╣реАрдВ рдЖрдИ',  # I didn't like this movie
        'рдмреЗрдХрд╛рд░ рдлрд┐рд▓реНрдо рд╕рдордп рдХреА рдмрд░реНрдмрд╛рджреА',  # Bad movie waste of time
        'рдХрд╣рд╛рдиреА рдмрд╣реБрдд рдХрдордЬреЛрд░ рд╣реИ',  # Story is very weak
        'рдЙрдмрд╛рдК рдФрд░ рд▓рдВрдмреА рдлрд┐рд▓реНрдо',  # Boring and long movie
        'рдкреИрд╕реЗ рдХрд╛ рдмрд░реНрдмрд╛рдж',  # Waste of money
        'рдЕрднрд┐рдирдп рдмрд┐рд▓реНрдХреБрд▓ рдирд╣реАрдВ рдЬрдорд╛',  # Acting didn't work at all
        'рдирд┐рд░реНрджреЗрд╢рди рдореЗрдВ рдХрдореА',  # Lack in direction
        'рд╕рдВрдЧреАрдд рднреА рдЕрдЪреНрдЫрд╛ рдирд╣реАрдВ',  # Music is also not good
        'рджреЗрдЦрдиреЗ рд▓рд╛рдпрдХ рдирд╣реАрдВ',  # Not worth watching
        'рд╕рдордп рдЦрд░рд╛рдм рд╣реЛ рдЧрдпрд╛',  # Time got wasted
        'рдХреЛрдИ рдирдпрд╛рдкрди рдирд╣реАрдВ',  # Nothing new
        'рдкреБрд░рд╛рдиреА рдХрд╣рд╛рдиреА рдлрд┐рд░ рд╕реЗ',  # Old story again
        'рдмрдЪреНрдЪреЛрдВ рдХреЛ рдордд рджрд┐рдЦрд╛рдирд╛',  # Don't show to children
        'рдмрд╣реБрдд рдирд┐рд░рд╛рд╢рд╛рдЬрдирдХ',  # Very disappointing
        'рдПрдХреНрд╢рди рднреА рдмреЗрдХрд╛рд░',  # Action also useless
        'рдЧрд╛рдиреЗ рдХрд╛рдиреЛрдВ рдХреЛ рддрдХрд▓реАрдл',  # Songs hurt the ears
        'рдЯреНрд╡рд┐рд╕реНрдЯ рдХрд╛ рдЕрднрд╛рд╡',  # Lack of twist
        'рд╡рд┐рдЬреБрдЕрд▓ рдЗрдлреЗрдХреНрдЯреНрд╕ рдШрдЯрд┐рдпрд╛',  # Poor visual effects
        'рднрд╛рд╡рдирд╛рддреНрдордХ рд░реВрдк рд╕реЗ рдХрдордЬреЛрд░',  # Emotionally weak
        'рд╣рд╛рд╕реНрдп рдлреАрдХрд╛ рд╣реИ',  # Comedy is bland
        'рд░реЛрдорд╛рдВрд╕ рдЬрдмрд░рджрд╕реНрддреА рдХрд╛',  # Forced romance
        'рдХреЛрдИ рд╕рдВрджреЗрд╢ рдирд╣реАрдВ',  # No message
        'рднреВрд▓рдиреЗ рдпреЛрдЧреНрдп рдлрд┐рд▓реНрдо',  # Forgettable movie
        'рдШрд░ рдкрд░ рднреА рдордд рджреЗрдЦреЗрдВ',  # Don't watch even at home
        'рдкрд░рд┐рд╡рд╛рд░ рдХреЗ рд╕рд╛рде рд╢рд░реНрдорд┐рдВрджрдЧреА',  # Embarrassment with family
        'рдкрдЯрдХрдерд╛ рдореЗрдВ рдЫреЗрдж',  # Holes in screenplay
        'рдЕрднрд┐рдирдп рдмрдирд╛рд╡рдЯреА рд▓рдЧрд╛',  # Acting seemed artificial
        'рд╕рд┐рдиреЗрдореЗрдЯреЛрдЧреНрд░рд╛рдлреА рдФрд╕рдд',  # Average cinematography
        'рд╕рдВрд╡рд╛рдж рдмрдЪрдХрд╛рдиреЗ рд╣реИрдВ',  # Dialogues are childish
        'рдХреНрд▓рд╛рдЗрдореЗрдХреНрд╕ рдирд┐рд░рд╛рд╢ рдХрд░рддрд╛ рд╣реИ',  # Climax disappoints
        'рд╕рд╕реНрдкреЗрдВрд╕ рдмрд┐рд▓реНрдХреБрд▓ рдирд╣реАрдВ',  # No suspense at all
        'рд░реБрд▓рд╛рдиреЗ рдореЗрдВ рдирд╛рдХрд╛рдо',  # Failed to make cry
        'рд╣рдВрд╕рд╛рдиреЗ рдХреА рдХреЛрд╢рд┐рд╢ рдлреЗрд▓',  # Failed attempt to make laugh
        'рдкреНрд░реЗрдо рдХрд╣рд╛рдиреА рдШрд┐рд╕реА рдкрд┐рдЯреА',  # Love story is cliched
        'рджреЗрдЦрддреЗ рд╣реБрдП рдиреАрдВрдж рдЖрдИ',  # Felt sleepy while watching
        'рд╕рдордп рдмрд░реНрдмрд╛рдж рд╣реЛ рдЧрдпрд╛',  # Time got wasted
        'рдмрдЪреНрдЪреЛрдВ рдХреЛ рдмреЛрд░ рдХрд░реЗрдЧреА',  # Will bore children
        'рдмреЙрд▓реАрд╡реБрдб рдХреА рд╕рдмрд╕реЗ рдЦрд░рд╛рдм',  # Worst of Bollywood
        'рдлреНрд▓реЙрдк рд╣реЛрдиреЗ рд╡рд╛рд▓реА рдлрд┐рд▓реНрдо',  # Movie that will flop
        'рдПрдХ рдмрд╛рд░ рднреА рдордд рджреЗрдЦреЛ',  # Don't watch even once
        'рдкреВрд░реА рддрд░рд╣ рдмреЗрдХрд╛рд░',  # Completely useless
        'рдШрдЯрд┐рдпрд╛ рдлрд┐рд▓реНрдо рд╣реИ',  # Poor quality movie
        'рдмреБрд░реА рдкреНрд░рд╕реНрддреБрддрд┐',  # Bad presentation
        'рдХрдордЬреЛрд░ рдЕрднрд┐рдирдп',  # Weak acting
        'рд░реЛрдорд╛рдВрдЪ рдХрд╛ рдЕрднрд╛рд╡',  # Lack of thrill
        'рдЕрд╡рд╛рд╕реНрддрд╡рд┐рдХ рдЪрд┐рддреНрд░рдг',  # Unrealistic portrayal
        'рд╡реНрдпрд░реНрде рдХрд╛ рд╕рд╛рдорд╛рдЬрд┐рдХ рд╕рдВрджреЗрд╢',  # Useless social message
        'рдЕрд╕рдлрд▓ рдлрд┐рд▓реНрдо',  # Failed movie
        'рд╣рд░ рджреГрд╢реНрдп рдЙрдмрд╛рдК',  # Every scene is boring
    ]

    hindi_data = pd.DataFrame({
        'text': hindi_positive + hindi_negative,
        'label': [1] * len(hindi_positive) + [0] * len(hindi_negative)
    })

    # TELUGU SAMPLES (99 examples: 50 positive, 49 negative; two phrases appear twice)
    telugu_positive = [
        'р░И р░╕р░┐р░ир░┐р░ор░╛ р░Ър░╛р░▓р░╛ р░мр░╛р░Чр▒Бр░Вр░жр░┐',  # This movie is very good
        'р░Ер░жр▒Нр░нр▒Бр░др░ор▒Ир░и р░ир░Яр░и р░ор░░р░┐р░пр▒Б р░Хр░е',  # Wonderful acting and story
        'р░Ър░╛р░▓р░╛ р░Йр░др▒Нр░др▒Зр░Ьр░Хр░░р░ор▒Ир░и',  # Very exciting
        'р░ир░╛р░Хр▒Б р░И р░╕р░┐р░ир░┐р░ор░╛ р░Ър░╛р░▓р░╛ р░ир░Ър▒Нр░Ър░┐р░Вр░жр░┐',  # I really liked this movie
        'р░Ер░жр▒Нр░нр▒Бр░др░ор▒Ир░и р░╕р░┐р░ир░┐р░ор░╛',  # Amazing movie
        'р░Чр▒Кр░кр▒Нр░к р░жр░░р▒Нр░╢р░Хр░др▒Нр░╡р░В р░ор░░р░┐р░пр▒Б р░╕р░Вр░Чр▒Ар░др░В',  # Great direction and music
        'р░Хр▒Бр░Яр▒Бр░Вр░мр░Вр░др▒Л р░Ър▒Вр░бр░жр░Чр░┐р░ир░жр░┐',  # Worth watching with family
        'р░Ер░Вр░жр░░р▒Б р░ир░Яр▒Ар░ир░Яр▒Бр░▓р▒Б р░Ер░жр▒Нр░нр▒Бр░др░Вр░Чр░╛ р░ир░Яр░┐р░Вр░Ър░╛р░░р▒Б',  # All actors acted wonderfully
        'р░И р░╕р░Вр░╡р░др▒Нр░╕р░░р░кр▒Б р░Йр░др▒Нр░др░о р░Ър░┐р░др▒Нр░░р░В',  # Best movie of the year
        'р░╡р░┐р░ир▒Лр░жр░нр░░р░┐р░др░В р░ор░░р░┐р░пр▒Б р░╕р▒Нр░лр▒Вр░░р▒Нр░др░┐р░жр░╛р░пр░Хр░В',  # Entertaining and inspiring
        'р░╣р▒Гр░жр░пр░╛р░ир▒Нр░ир░┐ р░др░╛р░Хр▒З р░Хр░е',  # Heart-touching story
        'р░кр░┐р░▓р▒Нр░▓р░▓р░Хр▒Б р░╕р░░р░┐р░кр▒Лр░пр▒Зр░жр░┐',  # Perfect for children
        'р░Ер░Вр░жр░░р▒В р░др░кр▒Нр░кр░Х р░Ър▒Вр░бр░╛р░▓р░┐',  # Everyone must watch
        'р░бр░мр▒Нр░мр▒Б р░╡р░┐р░▓р▒Бр░╡р▒Ир░ир░жр░┐',  # Value for money
        'р░Ер░жр▒Нр░нр▒Бр░др░ор▒Ир░и р░пр░╛р░Хр▒Нр░╖р░ир▒Н р░╕р▒Ар░ир▒Нр░▓р▒Б',  # Amazing action scenes
        'р░кр░╛р░Яр░▓р▒Б р░Ър░╛р░▓р░╛ р░Ер░Вр░жр░Вр░Чр░╛ р░Йр░ир▒Нр░ир░╛р░пр░┐',  # Songs are very beautiful
        'р░Хр░ер░▓р▒Л р░Хр▒Кр░др▒Нр░д р░Яр▒Нр░╡р░┐р░╕р▒Нр░Яр▒Н',  # New twist in story
        'р░╡р░┐р░Ьр▒Бр░╡р░▓р▒Н р░Ор░лр▒Жр░Хр▒Нр░Яр▒Нр░╕р▒Н р░Ер░жр▒Нр░нр▒Бр░др░ор▒Ир░ир░╡р░┐',  # Visual effects are amazing
        'р░нр░╛р░╡р▒Лр░жр▒Нр░╡р▒Зр░Чр░нр░░р░┐р░др░ор▒Ир░ир░жр░┐',  # Emotional
        'р░╣р░╛р░╕р▒Нр░пр░Вр░др▒Л р░ир░┐р░Вр░бр░┐р░ир░жр░┐',  # Full of humor
        'р░кр▒Нр░░р▒Зр░ор░Хр░е р░Ър░╛р░▓р░╛ р░мр░╛р░Чр▒Бр░Вр░жр░┐',  # Love story is very good
        'р░╕р░Вр░жр▒Зр░╢р░╛р░ир▒Нр░ир░┐ р░Зр░Ър▒Нр░Ър▒З р░╕р░┐р░ир░┐р░ор░╛',  # Message-giving movie
        'р░ор░░р░кр▒Бр░░р░╛р░ир░┐ р░╕р░┐р░ир░┐р░ор░╛',  # Unforgettable movie
        'р░ер░┐р░пр▒Зр░Яр░░р▒Нр░▓р░▓р▒Л р░др░кр▒Нр░кр░Х р░Ър▒Вр░бр░╛р░▓р░┐',  # Must watch in theaters
        'р░ор▒Кр░др▒Нр░др░В р░Хр▒Бр░Яр▒Бр░Вр░мр░╛р░ир░┐р░Хр░┐ р░╡р░┐р░ир▒Лр░жр░В',  # Entertainment for whole family
        'р░Ер░жр▒Нр░нр▒Бр░др░ор▒Ир░и р░╕р▒Нр░Хр▒Нр░░р▒Ар░ир▒НтАМр░кр▒Нр░▓р▒З',  # Excellent screenplay
        'р░ир░Яр░и р░пр▒Кр░Хр▒Нр░Х р░Чр▒Кр░кр▒Нр░к р░Йр░жр░╛р░╣р░░р░г',  # Great example of acting
        'р░╕р░┐р░ир░┐р░ор░╛р░Яр▒Лр░Чр▒Нр░░р░лр▒А р░Ер░жр▒Нр░нр▒Бр░др░ор▒Ир░ир░жр░┐',  # Cinematography is wonderful
        'р░бр▒Ир░▓р░╛р░Чр▒Нр░╕р▒Н р░Ър░╛р░▓р░╛ р░кр▒Нр░░р░нр░╛р░╡р░╡р░Вр░др░ор▒Ир░ир░╡р░┐',  # Dialogues are very effective
        'р░Хр▒Нр░▓р▒Ир░ор░╛р░Хр▒Нр░╕р▒Н р░Ер░жр▒Нр░нр▒Бр░др░ор▒Ир░ир░жр░┐',  # Climax is amazing
        'р░╕р░╕р▒Нр░кр▒Жр░ир▒Нр░╕р▒Н р░Хр▒Кр░ир░╕р░╛р░Чр▒Бр░др▒Бр░Вр░жр░┐',  # Suspense continues
        'р░Пр░бр▒Нр░кр░┐р░Вр░Ър▒Зр░▓р░╛ р░Йр░Вр░жр░┐',  # Makes you cry
        'р░ир░╡р▒Нр░╡р░┐р░Вр░Ър░бр░Вр░▓р▒Л р░╡р░┐р░Ьр░пр░╡р░Вр░др░В',  # Successful in making laugh
        'р░кр▒Нр░░р▒Зр░ор░Хр░е р░╣р▒Гр░жр░пр░╛р░ир▒Нр░ир░┐ р░др░╛р░Хр░┐р░Вр░жр░┐',  # Love story touched heart
        'р░Ър▒Вр░╕р▒Нр░др▒Вр░ир▒З р░Йр░Вр░Яр░╛р░░р▒Б',  # You'll keep watching
        'р░╕р░ор░пр░В р░Чр░бр░┐р░Ър░┐р░Вр░жр░ир░┐ р░др▒Жр░▓р░┐р░пр░▓р▒Зр░жр▒Б',  # Didn't realize time passing
        'р░Ер░ир▒Нр░ир░┐ р░╡р░пр░╕р▒Бр░▓ р░╡р░╛р░░р░┐р░Хр░┐',  # For all ages
        'р░др▒Жр░▓р▒Бр░Чр▒Б р░╕р░┐р░ир░┐р░ор░╛ р░Чр░░р▒Нр░╡р░В',  # Pride of Telugu cinema
        'р░мр▒Нр░▓р░╛р░Хр▒НтАМр░мр░╕р▒Нр░Яр░░р▒Н р░╕р░┐р░ир░┐р░ор░╛',  # Blockbuster movie
        'р░╣р░┐р░Яр▒Н р░Ер░пр▒Нр░пр▒З р░╕р░┐р░ир░┐р░ор░╛',  # Movie that will be hit
        'р░ор░│р▒Нр░│р▒А р░Ър▒Вр░бр░жр░Чр░┐р░ир░жр░┐',  # Worth watching again
        'р░кр▒Вр░░р▒Нр░др░┐р░Чр░╛ р░╡р░┐р░ир▒Лр░жр░нр░░р░┐р░др░В',  # Completely entertaining
        'р░Ер░жр▒Нр░нр▒Бр░др░ор▒Ир░и р░╕р░┐р░ир░┐р░ор░╛',  # Awesome movie
        'р░Ер░жр▒Нр░нр▒Бр░др░ор▒Ир░и р░кр▒Нр░░р░жр░░р▒Нр░╢р░и',  # Marvelous presentation
        'р░╢р░Хр▒Нр░др░┐р░╡р░Вр░др░ор▒Ир░и р░ир░Яр░и',  # Powerful acting
        'р░ер▒Нр░░р░┐р░▓р▒Нр░▓р░┐р░Вр░Чр▒Н р░Ер░ир▒Бр░нр░╡р░В',  # Thrilling experience
        'р░╡р░╛р░╕р▒Нр░др░╡р░┐р░Х р░Ър░┐р░др▒Нр░░р░г',  # Realistic portrayal
        'р░╕р░╛р░ор░╛р░Ьр░┐р░Х р░╕р░Вр░жр▒Зр░╢р░Вр░др▒Л',  # With social message
        'р░ор░╛р░╕р▒Нр░Яр░░р▒НтАМр░кр▒Ар░╕р▒Н р░╕р░┐р░ир░┐р░ор░╛',  # Masterpiece movie
        'р░кр▒Нр░░р░др░┐ р░╕р░ир▒Нр░ир░┐р░╡р▒Зр░╢р░В р░Ер░жр▒Нр░нр▒Бр░др░В',  # Every scene is wonderful
    ]

    telugu_negative = [
        'р░ир░╛р░Хр▒Б р░И р░╕р░┐р░ир░┐р░ор░╛ р░ир░Ър▒Нр░Ър░▓р▒Зр░жр▒Б',  # I didn't like this movie
        'р░╡р▒Нр░пр░░р▒Нр░е р░╕р░┐р░ир░┐р░ор░╛',  # Waste movie
        'р░Хр░е р░Ър░╛р░▓р░╛ р░мр░▓р░╣р▒Ар░ир░Вр░Чр░╛ р░Йр░Вр░жр░┐',  # Story is very weak
        'р░мр▒Лр░░р░┐р░Вр░Чр▒Н р░ор░░р░┐р░пр▒Б р░кр▒Кр░бр░╡р▒Ир░и р░╕р░┐р░ир░┐р░ор░╛',  # Boring and long movie
        'р░бр░мр▒Нр░мр▒Б р░╡р▒Гр░ер░╛',  # Waste of money
        'р░ир░Яр░и р░мр░┐р░▓р▒Нр░Хр▒Бр░▓р▒Н р░Ьр░ор░▓р▒Зр░жр▒Б',  # Acting didn't work at all
        'р░жр░░р▒Нр░╢р░Хр░др▒Нр░╡р░Вр░▓р▒Л р░▓р▒Лр░кр░В',  # Lack in direction
        'р░╕р░Вр░Чр▒Ар░др░В р░Хр▒Вр░бр░╛ р░мр░╛р░Чр▒Лр░▓р▒Зр░жр▒Б',  # Music is also not good
        'р░Ър▒Вр░бр░жр░Чр░┐р░ир░жр░┐ р░Хр░╛р░жр▒Б',  # Not worth watching
        'р░╕р░ор░пр░В р░╡р▒Гр░ер░╛ р░Ер░пр░┐р░Вр░жр░┐',  # Time got wasted
        'р░Пр░ор▒А р░Хр▒Кр░др▒Нр░др░жр░┐ р░▓р▒Зр░жр▒Б',  # Nothing new
        'р░кр░╛р░д р░Хр░е р░ор░│р▒Нр░│р▒А',  # Old story again
        'р░кр░┐р░▓р▒Нр░▓р░▓р░Хр▒Б р░Ър▒Вр░кр░┐р░Вр░Ър░╡р░жр▒Нр░жр▒Б',  # Don't show to children
        'р░Ър░╛р░▓р░╛ р░ир░┐р░░р░╛р░╢р░кр░░р░┐р░Ър░┐р░Вр░жр░┐',  # Very disappointing
        'р░пр░╛р░Хр▒Нр░╖р░ир▒Н р░Хр▒Вр░бр░╛ р░╡р▒Нр░пр░░р▒Нр░ер░В',  # Action also useless
        'р░кр░╛р░Яр░▓р▒Б р░Ър▒Жр░╡р▒Бр░▓р░Хр▒Б р░мр░╛р░з',  # Songs hurt the ears
        'р░Яр▒Нр░╡р░┐р░╕р▒Нр░Яр▒Н р░▓р▒Зр░жр▒Б',  # No twist
        'р░╡р░┐р░Ьр▒Бр░╡р░▓р▒Н р░Ор░лр▒Жр░Хр▒Нр░Яр▒Нр░╕р▒Н р░Ър▒Жр░др▒Нр░д',  # Poor visual effects
        'р░нр░╛р░╡р▒Лр░жр▒Нр░╡р▒Зр░Чр░Вр░Чр░╛ р░мр░▓р░╣р▒Ар░ир░В',  # Emotionally weak
        'р░╣р░╛р░╕р▒Нр░пр░В р░Ър░кр▒Нр░кр░Чр░╛ р░Йр░Вр░жр░┐',  # Comedy is bland
        'р░мр░▓р░╡р░Вр░др░кр▒Б р░░р▒Кр░ор░╛р░ир▒Нр░╕р▒Н',  # Forced romance
        'р░Ор░▓р░╛р░Вр░Яр░┐ р░╕р░Вр░жр▒Зр░╢р░В р░▓р▒Зр░жр▒Б',  # No message
        'р░ор░░р▒Нр░Ър░┐р░кр▒Лр░пр▒З р░╕р░┐р░ир░┐р░ор░╛',  # Forgettable movie
        'р░Зр░Вр░Яр▒Нр░▓р▒Л р░Хр▒Вр░бр░╛ р░Ър▒Вр░бр░╡р░жр▒Нр░жр▒Б',  # Don't watch even at home
        'р░Хр▒Бр░Яр▒Бр░Вр░мр░Вр░др▒Л р░Зр░мр▒Нр░мр░Вр░жр░┐',  # Embarrassment with family
        'р░╕р▒Нр░Хр▒Нр░░р▒Ар░ир▒НтАМр░кр▒Нр░▓р▒Зр░▓р▒Л р░░р░Вр░зр▒Нр░░р░╛р░▓р▒Б',  # Holes in screenplay
        'р░ир░Яр░и р░Хр▒Гр░др▒Нр░░р░┐р░ор░Вр░Чр░╛ р░Йр░Вр░жр░┐',  # Acting seemed artificial
        'р░╕р░┐р░ир░┐р░ор░╛р░Яр▒Лр░Чр▒Нр░░р░лр▒А р░╕р░Чр░Яр▒Б',  # Average cinematography
        'р░бр▒Ир░▓р░╛р░Чр▒Нр░╕р▒Н р░кр░┐р░▓р▒Нр░▓р░др░ир░Вр░Чр░╛ р░Йр░ир▒Нр░ир░╛р░пр░┐',  # Dialogues are childish
        'р░Хр▒Нр░▓р▒Ир░ор░╛р░Хр▒Нр░╕р▒Н р░ир░┐р░░р░╛р░╢р░кр░░р▒Бр░╕р▒Нр░др▒Бр░Вр░жр░┐',  # Climax disappoints
        'р░╕р░╕р▒Нр░кр▒Жр░ир▒Нр░╕р▒Н р░мр░┐р░▓р▒Нр░Хр▒Бр░▓р▒Н р░▓р▒Зр░жр▒Б',  # No suspense at all
        'р░Пр░бр▒Нр░кр░┐р░Вр░Ър░бр░Вр░▓р▒Л р░╡р░┐р░лр░▓р░В',  # Failed to make cry
        'р░ир░╡р▒Нр░╡р░┐р░Вр░Ър▒З р░кр▒Нр░░р░пр░др▒Нр░ир░В р░╡р░┐р░лр░▓р░В',  # Failed attempt to make laugh
        'р░кр▒Нр░░р▒Зр░ор░Хр░е р░Хр▒Нр░▓р░┐р░╖р▒Нр░Яр░ор▒Ир░ир░жр░┐',  # Love story is cliched
        'р░Ър▒Вр░╕р▒Нр░др▒В р░ир░┐р░жр▒Нр░░ р░╡р░Ър▒Нр░Ър░┐р░Вр░жр░┐',  # Felt sleepy while watching
        'р░╕р░ор░пр░В р░╡р▒Гр░ер░╛ р░Ер░пр░┐р░Вр░жр░┐',  # Time got wasted
        'р░кр░┐р░▓р▒Нр░▓р░▓р░ир▒Б р░мр▒Лр░░р▒Н р░Ър▒Зр░╕р▒Нр░др▒Бр░Вр░жр░┐',  # Will bore children
        'р░др▒Жр░▓р▒Бр░Чр▒Б р░╕р░┐р░ир░┐р░ор░╛р░▓р▒Л р░Ър▒Жр░др▒Нр░д',  # Worst of Telugu cinema
        'р░лр▒Нр░▓р░╛р░кр▒Н р░Ер░пр▒Нр░пр▒З р░╕р░┐р░ир░┐р░ор░╛',  # Movie that will flop
        'р░Тр░Хр▒Нр░Хр░╕р░╛р░░р░┐ р░Хр▒Вр░бр░╛ р░Ър▒Вр░бр░╡р░жр▒Нр░жр▒Б',  # Don't watch even once
        'р░кр▒Вр░░р▒Нр░др░┐р░Чр░╛ р░╡р▒Нр░пр░░р▒Нр░ер░В',  # Completely useless
        'р░Ър▒Жр░др▒Нр░д р░╕р░┐р░ир░┐р░ор░╛',  # Poor quality movie
        'р░Ър▒Жр░бр▒Нр░б р░кр▒Нр░░р░жр░░р▒Нр░╢р░и',  # Bad presentation
        'р░мр░▓р░╣р▒Ар░ир░ор▒Ир░и р░ир░Яр░и',  # Weak acting
        'р░ер▒Нр░░р░┐р░▓р▒Н р░▓р▒Зр░жр▒Б',  # No thrill
        'р░Ер░╡р░╛р░╕р▒Нр░др░╡р░┐р░Х р░Ър░┐р░др▒Нр░░р░г',  # Unrealistic portrayal
        'р░╡р▒Нр░пр░░р▒Нр░ер░ор▒Ир░и р░╕р░╛р░ор░╛р░Ьр░┐р░Х р░╕р░Вр░жр▒Зр░╢р░В',  # Useless social message
        'р░╡р░┐р░лр░▓р░ор▒Ир░и р░╕р░┐р░ир░┐р░ор░╛',  # Failed movie
        'р░кр▒Нр░░р░др░┐ р░╕р░ир▒Нр░ир░┐р░╡р▒Зр░╢р░В р░мр▒Лр░░р░┐р░Вр░Чр▒Н',  # Every scene is boring
    ]

    telugu_data = pd.DataFrame({
        'text': telugu_positive + telugu_negative,
        'label': [1] * len(telugu_positive) + [0] * len(telugu_negative)
    })

    # TAMIL SAMPLES (99 examples: 50 positive, 49 negative; two phrases appear twice)
    tamil_positive = [
        'роЗроирпНрод родро┐ро░рпИрокрпНрокроЯроорпН рооро┐роХро╡рпБроорпН роиройрпНро▒ро╛роХ роЙро│рпНро│родрпБ',  # This movie is very good
        'роЕро░рпБроорпИропро╛рой роироЯро┐рокрпНрокрпБ рооро▒рпНро▒рпБроорпН роХродрпИ',  # Great acting and story
        'рооро┐роХро╡рпБроорпН роЪрпБро╡ро╛ро░ро╕рпНропрооро╛рой',  # Very interesting
        'роОройроХрпНроХрпБ роЗроирпНрод рокроЯроорпН рооро┐роХро╡рпБроорпН рокро┐роЯро┐родрпНродро┐ро░рпБроирпНродродрпБ',  # I really liked this movie
        'роЕро▒рпНрокрпБродрооро╛рой родро┐ро░рпИрокрпНрокроЯроорпН',  # Amazing movie
        'роЪро┐ро▒роирпНрод роЗропроХрпНроХроорпН рооро▒рпНро▒рпБроорпН роЗроЪрпИ',  # Great direction and music
        'роХрпБроЯрпБроорпНрокродрпНродрпБроЯройрпН рокро╛ро░рпНроХрпНроХ родроХрпБроирпНродродрпБ',  # Worth watching with family
        'роЕройрпИродрпНродрпБ роироЯро┐роХро░рпНроХро│рпБроорпН роЕро░рпБроорпИропро╛роХ роироЯро┐родрпНродрпБро│рпНро│ройро░рпН',  # All actors acted wonderfully
        'роЗроирпНрод роЖрогрпНроЯро┐ройрпН роЪро┐ро▒роирпНрод рокроЯроорпН',  # Best movie of the year
        'рокрпКро┤рпБродрпБрокрпЛроХрпНроХрпБ рооро▒рпНро▒рпБроорпН роКроХрпНроХрооро│ро┐роХрпНроХрпБроорпН',  # Entertaining and inspiring
        'роЗродропродрпНродрпИродрпН родрпКроЯрпБроорпН роХродрпИ',  # Heart-touching story
        'роХрпБро┤роирпНродрпИроХро│рпБроХрпНроХрпБ роПро▒рпНро▒родрпБ',  # Perfect for children
        'роЕройрпИро╡ро░рпБроорпН роХрогрпНроЯро┐рокрпНрокро╛роХ рокро╛ро░рпНроХрпНроХ ро╡рпЗрогрпНроЯрпБроорпН',  # Everyone must watch
        'рокрогродрпНродро┐ро▒рпНроХрпБ роородро┐рокрпНрокрпБро│рпНро│родрпБ',  # Value for money
        'роЕро▒рпНрокрпБродрооро╛рой роЖроХрпНро╖ройрпН роХро╛роЯрпНроЪро┐роХро│рпН',  # Amazing action scenes
        'рокро╛роЯро▓рпНроХро│рпН рооро┐роХро╡рпБроорпН роЕро┤роХро╛роХ роЙро│рпНро│рой',  # Songs are very beautiful
        'роХродрпИропро┐ро▓рпН рокрпБродро┐роп родро┐ро░рпБрокрпНрокроорпН',  # New twist in story
        'ро╡ро┐ро╖рпБро╡ро▓рпН роОроГрокрпЖроХрпНроЯрпНро╕рпН роЕро▒рпНрокрпБродрооро╛ройро╡рпИ',  # Visual effects are amazing
        'роЙрогро░рпНроЪрпНроЪро┐роХро░рооро╛ройродрпБ',  # Emotional
        'роироХрпИроЪрпНроЪрпБро╡рпИропро╛ро▓рпН роиро┐ро░рокрпНрокрокрпНрокроЯрпНроЯродрпБ',  # Full of humor
        'роХро╛родро▓рпН роХродрпИ рооро┐роХро╡рпБроорпН роиройрпНро▒ро╛роХ роЙро│рпНро│родрпБ',  # Love story is very good
        'роЪрпЖропрпНродро┐ роХрпКроЯрпБроХрпНроХрпБроорпН рокроЯроорпН',  # Message-giving movie
        'рооро▒роХрпНроХ роорпБроЯро┐ропро╛род рокроЯроорпН',  # Unforgettable movie
        'родро┐ро░рпИропро░роЩрпНроХрпБроХро│ро┐ро▓рпН роХрогрпНроЯро┐рокрпНрокро╛роХ рокро╛ро░рпНроХрпНроХ ро╡рпЗрогрпНроЯрпБроорпН',  # Must watch in theaters
        'роорпБро┤рпБ роХрпБроЯрпБроорпНрокродрпНродро┐ро▒рпНроХрпБроорпН рокрпКро┤рпБродрпБрокрпЛроХрпНроХрпБ',  # Entertainment for whole family
        'роЕро▒рпНрокрпБродрооро╛рой родро┐ро░рпИроХрпНроХродрпИ',  # Excellent screenplay
        'роироЯро┐рокрпНрокро┐ройрпН роЪро┐ро▒роирпНрод роЙродро╛ро░рогроорпН',  # Great example of acting
        'роТро│ро┐рокрпНрокродро┐ро╡рпБ роЕро▒рпНрокрпБродрооро╛ройродрпБ',  # Cinematography is wonderful
        'ро╡роЪройроЩрпНроХро│рпН рооро┐роХро╡рпБроорпН рокропройрпБро│рпНро│ро╡рпИ',  # Dialogues are very effective
        'роХрпНро│рпИрооро╛роХрпНро╕рпН роЕро▒рпНрокрпБродрооро╛ройродрпБ',  # Climax is amazing
        'роЪро╕рпНрокрпЖройрпНро╕рпН родрпКроЯро░рпНроХро┐ро▒родрпБ',  # Suspense continues
        'роЕро┤ ро╡рпИроХрпНроХрпБроорпН',  # Makes you cry
        'роЪро┐ро░ро┐роХрпНроХ ро╡рпИрокрпНрокродро┐ро▓рпН ро╡рпЖро▒рпНро▒ро┐',  # Successful in making laugh
        'роХро╛родро▓рпН роХродрпИ роЗродропродрпНродрпИродрпН родрпКроЯрпНроЯродрпБ',  # Love story touched heart
        'рокро╛ро░рпНродрпНродрпБроХрпН роХрпКрогрпНроЯрпЗ роЗро░рпБрокрпНрокрпАро░рпНроХро│рпН',  # You'll keep watching
        'роирпЗро░роорпН роХроЯроирпНродродрпЗ родрпЖро░ро┐ропро╡ро┐ро▓рпНро▓рпИ',  # Didn't realize time passing
        'роЕройрпИродрпНродрпБ ро╡ропродро┐ройро░рпБроХрпНроХрпБроорпН',  # For all ages
        'родрооро┐ро┤рпН роЪро┐ройро┐рооро╛ро╡ро┐ройрпН рокрпЖро░рпБроорпИ',  # Pride of Tamil cinema
        'рокро┐ро│ро╛роХрпНрокро╕рпНроЯро░рпН рокроЯроорпН',  # Blockbuster movie
        'ро╣ро┐роЯрпН роЖроХрпБроорпН рокроЯроорпН',  # Movie that will be hit
        'роорпАрогрпНроЯрпБроорпН рокро╛ро░рпНроХрпНроХ родроХрпБроирпНродродрпБ',  # Worth watching again
        'роорпБро▒рпНро▒ро┐ро▓рпБроорпН рокрпКро┤рпБродрпБрокрпЛроХрпНроХрпБ',  # Completely entertaining
        'роЕро▒рпНрокрпБродрооро╛рой рокроЯроорпН',  # Awesome movie
        'роЕро▒рпНрокрпБродрооро╛рой ро╡ро┐ро│роХрпНроХроХрпНроХро╛роЯрпНроЪро┐',  # Marvelous presentation
        'роЪроХрпНродро┐ро╡ро╛ропрпНроирпНрод роироЯро┐рокрпНрокрпБ',  # Powerful acting
        'родрпНро░ро┐ро▓рпНро▓ро┐роЩрпН роЕройрпБрокро╡роорпН',  # Thrilling experience
        'ропродро╛ро░рпНродрпНродрооро╛рой роЪро┐родрпНродро░ро┐рокрпНрокрпБ',  # Realistic portrayal
        'роЪроорпВроХ роЪрпЖропрпНродро┐ропрпБроЯройрпН',  # With social message
        'рооро╛ро╕рпНроЯро░рпНрокрпАро╕рпН рокроЯроорпН',  # Masterpiece movie
        'роТро╡рпНро╡рпКро░рпБ роХро╛роЯрпНроЪро┐ропрпБроорпН роЕро▒рпНрокрпБродроорпН',  # Every scene is wonderful
    ]

    tamil_negative = [
        'роОройроХрпНроХрпБ роЗроирпНрод рокроЯроорпН рокро┐роЯро┐роХрпНроХро╡ро┐ро▓рпНро▓рпИ',  # I didn't like this movie
        'роорпЛроЪрооро╛рой рокроЯроорпН',  # Bad movie
        'роХродрпИ рооро┐роХро╡рпБроорпН рокро▓ро╡рпАройрооро╛роХ роЙро│рпНро│родрпБ',  # Story is very weak
        'роЪро▓ро┐рокрпНрокро╛рой рооро▒рпНро▒рпБроорпН роирпАрогрпНроЯ рокроЯроорпН',  # Boring and long movie
        'рокрогроорпН ро╡рпАрогро╛роХро┐ро╡ро┐роЯрпНроЯродрпБ',  # Money wasted
        'роироЯро┐рокрпНрокрпБ роЪро░ро┐ропро┐ро▓рпНро▓рпИ',  # Acting didn't work
        'роЗропроХрпНроХродрпНродро┐ро▓рпН роХрпБро▒рпИрокро╛роЯрпБ',  # Lack in direction
        'роЗроЪрпИропрпБроорпН роиройрпНро▒ро╛роХ роЗро▓рпНро▓рпИ',  # Music is also not good
        'рокро╛ро░рпНроХрпНроХ родроХрпБродро┐ропро▒рпНро▒родрпБ',  # Not worth watching
        'роирпЗро░роорпН ро╡рпАрогро╛роХро┐ро╡ро┐роЯрпНроЯродрпБ',  # Time got wasted
        'рокрпБродро┐родро╛роХ роОродрпБро╡рпБроорпН роЗро▓рпНро▓рпИ',  # Nothing new
        'рокро┤рпИроп роХродрпИ роорпАрогрпНроЯрпБроорпН',  # Old story again
        'роХрпБро┤роирпНродрпИроХро│рпБроХрпНроХрпБ роХро╛роЯрпНроЯ ро╡рпЗрогрпНроЯро╛роорпН',  # Don't show to children
        'рооро┐роХро╡рпБроорпН роПрооро╛ро▒рпНро▒рооро│ро┐роХрпНроХро┐ро▒родрпБ',  # Very disappointing
        'роЖроХрпНро╖ройрпН роХрпВроЯ ро╡рпАрогро╛ройродрпБ',  # Action also useless
        'рокро╛роЯро▓рпНроХро│рпН роХро╛родрпБроХро│рпБроХрпНроХрпБ ро╡ро▓ро┐',  # Songs hurt the ears
        'родро┐ро░рпБрокрпНрокроорпН роЗро▓рпНро▓рпИ',  # No twist
        'ро╡ро┐ро╖рпБро╡ро▓рпН роОроГрокрпЖроХрпНроЯрпНро╕рпН роорпЛроЪроорпН',  # Poor visual effects
        'роЙрогро░рпНроЪрпНроЪро┐рокрпВро░рпНро╡рооро╛роХ рокро▓ро╡рпАройроорпН',  # Emotionally weak
        'роироХрпИроЪрпНроЪрпБро╡рпИ роЪрпБро╡рпИропро▒рпНро▒родрпБ',  # Comedy is bland
        'роХроЯрпНроЯро╛ропрооро╛рой роХро╛родро▓рпН',  # Forced romance
        'роОроирпНрод роЪрпЖропрпНродро┐ропрпБроорпН роЗро▓рпНро▓рпИ',  # No message
        'рооро▒роирпНродрпБро╡ро┐роЯроХрпНроХрпВроЯро┐роп рокроЯроорпН',  # Forgettable movie
        'ро╡рпАроЯрпНроЯро┐ро▓рпБроорпН рокро╛ро░рпНроХрпНроХ ро╡рпЗрогрпНроЯро╛роорпН',  # Don't watch even at home
        'роХрпБроЯрпБроорпНрокродрпНродрпБроЯройрпН роЪроЩрпНроХроЯроорпН',  # Embarrassment with family
        'родро┐ро░рпИроХрпНроХродрпИропро┐ро▓рпН родрпБро│рпИроХро│рпН',  # Holes in screenplay
        'роироЯро┐рокрпНрокрпБ роЪрпЖропро▒рпНроХрпИропро╛роХ роЗро░рпБроирпНродродрпБ',  # Acting seemed artificial
        'роТро│ро┐рокрпНрокродро┐ро╡рпБ роЪро░ро╛роЪро░ро┐',  # Average cinematography
        'ро╡роЪройроЩрпНроХро│рпН роХрпБро┤роирпНродрпИродрпНродройрооро╛ройро╡рпИ',  # Dialogues are childish
        'роХрпНро│рпИрооро╛роХрпНро╕рпН роПрооро╛ро▒рпНро▒рооро│ро┐роХрпНроХро┐ро▒родрпБ',  # Climax disappoints
        'роЪро╕рпНрокрпЖройрпНро╕рпН роЗро▓рпНро▓ро╡рпЗ роЗро▓рпНро▓рпИ',  # No suspense at all
        'роЕро┤ ро╡рпИрокрпНрокродро┐ро▓рпН родрпЛро▓рпНро╡ро┐',  # Failed to make cry
        'роЪро┐ро░ро┐роХрпНроХ ро╡рпИроХрпНроХрпБроорпН роорпБропро▒рпНроЪро┐ родрпЛро▓рпНро╡ро┐',  # Failed attempt to make laugh
        'роХро╛родро▓рпН роХродрпИ рокро┤рпИропродро╛роХ роЙро│рпНро│родрпБ',  # Love story is cliched
        'рокро╛ро░рпНроХрпНроХрпБроорпНрокрпЛродрпБ родрпВроХрпНроХроорпН ро╡роирпНродродрпБ',  # Felt sleepy while watching
        'роирпЗро░роорпН ро╡рпАрогро╛роХро┐ро╡ро┐роЯрпНроЯродрпБ',  # Time got wasted
        'роХрпБро┤роирпНродрпИроХро│рпИ роЪро▓ро┐рокрпНрокроЯрпИропроЪрпН роЪрпЖропрпНропрпБроорпН',  # Will bore children
        'родрооро┐ро┤рпН роЪро┐ройро┐рооро╛ро╡ро┐ройрпН роорпЛроЪрооро╛ройродрпБ',  # Worst of Tamil cinema
        'родрпЛро▓рпНро╡ро┐ропроЯрпИропрпБроорпН рокроЯроорпН',  # Movie that will flop
        'роТро░рпБ роорпБро▒рпИ роХрпВроЯ рокро╛ро░рпНроХрпНроХ ро╡рпЗрогрпНроЯро╛роорпН',  # Don't watch even once
        'роорпБро▒рпНро▒ро┐ро▓рпБроорпН ро╡рпАрогро╛ройродрпБ',  # Completely useless
        'роорпЛроЪрооро╛рой рокроЯроорпН',  # Poor quality movie
        'роорпЛроЪрооро╛рой ро╡ро┐ро│роХрпНроХроХрпНроХро╛роЯрпНроЪро┐',  # Bad presentation
        'рокро▓ро╡рпАройрооро╛рой роироЯро┐рокрпНрокрпБ',  # Weak acting
        'родрпНро░ро┐ро▓рпН роЗро▓рпНро▓рпИ',  # No thrill
        'роироЯрпИроорпБро▒рпИроХрпНроХрпБ рооро╛ро▒ро╛рой роЪро┐родрпНродро░ро┐рокрпНрокрпБ',  # Unrealistic portrayal
        'рокропройро▒рпНро▒ роЪроорпВроХ роЪрпЖропрпНродро┐',  # Useless social message
        'родрпЛро▓рпНро╡ро┐ропроЯрпИроирпНрод рокроЯроорпН',  # Failed movie
        'роТро╡рпНро╡рпКро░рпБ роХро╛роЯрпНроЪро┐ропрпБроорпН роЪро▓ро┐рокрпНрокрпВроЯрпНроЯрпБроХро┐ро▒родрпБ',  # Every scene is boring
    ]

    tamil_data = pd.DataFrame({
        'text': tamil_positive + tamil_negative,
        'label': [1] * len(tamil_positive) + [0] * len(tamil_negative)
    })

    # Shuffle the data
    hindi_data = hindi_data.sample(frac=1, random_state=42).reset_index(drop=True)
    telugu_data = telugu_data.sample(frac=1, random_state=42).reset_index(drop=True)
    tamil_data = tamil_data.sample(frac=1, random_state=42).reset_index(drop=True)

    # Print statistics
    print("\nЁЯУК Dataset Statistics:")
    print(f"   Hindi: {len(hindi_data)} samples ({hindi_data['label'].sum()} positive, {len(hindi_data) - hindi_data['label'].sum()} negative)")
    print(f"   Telugu: {len(telugu_data)} samples ({telugu_data['label'].sum()} positive, {len(telugu_data) - telugu_data['label'].sum()} negative)")
    print(f"   Tamil: {len(tamil_data)} samples ({tamil_data['label'].sum()} positive, {len(tamil_data) - tamil_data['label'].sum()} negative)")

    return hindi_data, telugu_data, tamil_data

# Build the three shuffled Indic evaluation frames (columns: 'text', 'label').
hindi_test, telugu_test, tamil_test = create_sample_indic_data()

# Report the size of each test set, one line per language.
print(
    f"тЬЕ Hindi test samples: {len(hindi_test)}",
    f"тЬЕ Telugu test samples: {len(telugu_test)}",
    f"тЬЕ Tamil test samples: {len(tamil_test)}",
    sep="\n",
)



In [None]:
# ============================================================================
# STEP 2: DATA PREPROCESSING
# ============================================================================

# Stage banner: a blank line, then the title framed by two 80-character rules.
print("\n" + "="*80, "STEP 2: DATA PREPROCESSING", "="*80, sep="\n")

def preprocess_text(text):
    """Lightly clean a raw text before tokenization.

    The cleaning is intentionally minimal: URLs are removed, HTML tags are
    replaced by a space, and runs of whitespace are collapsed.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string, with single spaces between tokens and no
        leading or trailing whitespace.
    """
    import re
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Replace HTML tags with a space instead of deleting them outright:
    # markup such as "<br />" frequently sits directly between two
    # sentences, and removing it would fuse the neighbouring words
    # ("great.<br />Loved" -> "great.Loved").
    text = re.sub(r'<.*?>', ' ', text)
    # Collapse whitespace runs (including the spaces introduced above)
    text = ' '.join(text.split())
    return text

# Apply preprocessing
print("ЁЯФз Applying minimal preprocessing...")

# Process English data
train_texts = [preprocess_text(x['text']) for x in train_data]
train_labels = [x['label'] for x in train_data]

val_texts = [preprocess_text(x['text']) for x in val_data]
val_labels = [x['label'] for x in val_data]

# Process Indic data
hindi_test['text'] = hindi_test['text'].apply(preprocess_text)
telugu_test['text'] = telugu_test['text'].apply(preprocess_text)
tamil_test['text'] = tamil_test['text'].apply(preprocess_text)

print("тЬЕ Preprocessing complete")

# ============================================================================
# STEP 3: MODEL SELECTION & TOKENIZATION
# ============================================================================

print("\n" + "="*80)
print("STEP 3: MODEL SELECTION & TOKENIZATION")
print("="*80)

# Models to compare
MODELS = {
    'xlm-roberta-base': 'XLM-RoBERTa (Best cross-lingual)',
    'bert-base-multilingual-cased': 'mBERT (Baseline)',
    'ai4bharat/indic-bert': 'IndicBERT (Indic-focused)'
}

# Select model for this run (change to compare different models)
MODEL_NAME = 'xlm-roberta-base'  # Change this to test other models

print(f"\nЁЯдЦ Selected Model: {MODELS[MODEL_NAME]}")
print(f"Model ID: {MODEL_NAME}")

# Load tokenizer with proper handling for IndicBERT
print("ЁЯУж Loading tokenizer...")
if 'indic-bert' in MODEL_NAME:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
else:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenize function
def tokenize_data(texts, labels, max_length=256):
    """Encode `texts` with the module-level `tokenizer`.

    Returns a 2-tuple: the tokenizer output (PyTorch tensors, truncated to
    `max_length` and padded) and `labels`, which is passed through unchanged.
    """
    tokenizer_options = {
        'truncation': True,
        'padding': True,
        'max_length': max_length,
        'return_tensors': 'pt',
    }
    encoded_batch = tokenizer(texts, **tokenizer_options)
    return encoded_batch, labels

# Tokenize datasets
print("ЁЯФд Tokenizing datasets...")
train_encodings, train_labels_tensor = tokenize_data(train_texts, train_labels)
val_encodings, val_labels_tensor = tokenize_data(val_texts, val_labels)

# Create PyTorch datasets
class SentimentDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # One example per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Slice every encoding field at `idx`, then attach the label under
        # the 'labels' key as a tensor.
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = field_values[idx]
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels_tensor)

print(f"тЬЕ Train dataset size: {len(train_dataset)}")
print(f"тЬЕ Validation dataset size: {len(val_dataset)}")

In [None]:

# ============================================================================
# STEP 3: MODEL SELECTION & TOKENIZATION
# ============================================================================

print("\n" + "="*80)
print("STEP 3: MODEL SELECTION & TOKENIZATION")
print("="*80)

# Models to compare
MODELS = {
    'xlm-roberta-base': 'XLM-RoBERTa (Best cross-lingual)',
    'bert-base-multilingual-cased': 'mBERT (Baseline)',
    'ai4bharat/indic-bert': 'IndicBERT (Indic-focused)'
}

# Select model for this run (change to compare different models)
MODEL_NAME = 'xlm-roberta-base'  # Change this to test other models

print(f"\nЁЯдЦ Selected Model: {MODELS[MODEL_NAME]}")
print(f"Model ID: {MODEL_NAME}")

# Load tokenizer with proper handling for IndicBERT
print("ЁЯУж Loading tokenizer...")
if 'indic-bert' in MODEL_NAME:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
else:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenize function
def tokenize_data(texts, labels, max_length=256):
    """Encode `texts` with the module-level `tokenizer`.

    Returns a 2-tuple: the tokenizer output (PyTorch tensors, truncated to
    `max_length` and padded) and `labels`, which is passed through unchanged.
    """
    tokenizer_options = {
        'truncation': True,
        'padding': True,
        'max_length': max_length,
        'return_tensors': 'pt',
    }
    encoded_batch = tokenizer(texts, **tokenizer_options)
    return encoded_batch, labels

# Tokenize datasets
print("ЁЯФд Tokenizing datasets...")
train_encodings, train_labels_tensor = tokenize_data(train_texts, train_labels)
val_encodings, val_labels_tensor = tokenize_data(val_texts, val_labels)

# Create PyTorch datasets
class SentimentDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # One example per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Slice every encoding field at `idx`, then attach the label under
        # the 'labels' key as a tensor.
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = field_values[idx]
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels_tensor)

print(f"тЬЕ Train dataset size: {len(train_dataset)}")
print(f"тЬЕ Validation dataset size: {len(val_dataset)}")


In [None]:
# ============================================================================
# STEP 4: MODEL FINE-TUNING (English Only) тАФ VERSION SAFE
# ============================================================================

print("\n" + "="*80)
print("STEP 4: FINE-TUNING ON ENGLISH DATA")
print("="*80)

# Load model
print(f"ЁЯФз Loading {MODEL_NAME}...")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2
)
model.to(device)

# Training arguments (intended to be compatible with OLD + NEW transformers:
# no evaluation/save *strategy* keyword is passed, only step counts).
# NOTE(review): because no strategy keyword is given, whether eval_steps and
# save_steps have any effect depends on the library's default strategies —
# confirm that periodic evaluation/checkpointing really happens in training.
# NOTE(review): with the 5000-example training subset and batch size 16,
# warmup_steps=500 presumably spans a large share of all optimizer steps
# (assuming one device and no gradient accumulation) — confirm this is intended.
training_args = TrainingArguments(
    output_dir=f'./results_{MODEL_NAME.replace("/", "_")}',  # "/" in the model ID is replaced so the path stays a single directory name
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    eval_steps=500,
    save_steps=500,
    report_to="none",
    seed=42
)

# Metrics computation
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    `eval_pred` unpacks into (logits, labels); the predicted class is the
    arg-max over axis 1 of the logits. Returns a dict with the accuracy
    under "accuracy" and the weighted F1 under "f1".
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(labels, predicted_classes),
        "f1": f1_score(labels, predicted_classes, average="weighted"),
    }

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Train model
print("ЁЯЪА Starting training...")
trainer.train()

# Evaluate on English validation set
print("\nЁЯУК Evaluating on English validation set...")
eval_results = trainer.evaluate()

print(f"English Validation Accuracy: {eval_results['eval_accuracy']:.4f}")
print(f"English Validation F1: {eval_results['eval_f1']:.4f}")


In [None]:
# ============================================================================
# STEP 5: ZERO-SHOT CROSS-LINGUAL EVALUATION
# ============================================================================

print("\n" + "="*80)
print("STEP 5: ZERO-SHOT CROSS-LINGUAL EVALUATION")
print("="*80)

def evaluate_on_indic(model, tokenizer, df, language_name, batch_size=32):
    """Evaluate a sequence-classification model on one labelled DataFrame.

    Texts are tokenized and scored in mini-batches so memory use is bounded
    by `batch_size` rather than by the size of `df`.

    Args:
        model: Model whose forward pass returns an object with `.logits`.
        tokenizer: Tokenizer called on a list of texts; its output is moved
            to the model's device and unpacked into the model.
        df: DataFrame with a 'text' column and a 'label' column (0 or 1).
        language_name: Name used in the printed report and the result dict.
        batch_size: Number of texts per forward pass (must be >= 1).

    Returns:
        dict with keys 'language', 'accuracy', 'f1' (weighted),
        'predictions' (NumPy array), 'true_labels' (list) and
        'confusion_matrix' (always 2x2, ordered Negative, Positive).

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    model.eval()
    # Send inputs to wherever the model's weights currently are.
    model_device = next(model.parameters()).device

    texts = df['text'].tolist()
    true_labels = df['label'].tolist()

    # Tokenize + predict one mini-batch at a time
    batch_predictions = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            encodings = tokenizer(
                texts[start:start + batch_size],
                truncation=True,
                padding=True,
                max_length=256,
                return_tensors='pt'
            ).to(model_device)
            outputs = model(**encodings)
            batch_predictions.append(
                torch.argmax(outputs.logits, dim=1).cpu().numpy()
            )

    if batch_predictions:
        predictions = np.concatenate(batch_predictions)
    else:
        predictions = np.array([], dtype=int)

    # Calculate metrics
    acc = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions, average='weighted')

    print(f"\n{language_name} Results:")
    print(f"  Accuracy: {acc:.4f}")
    print(f"  F1-Score: {f1:.4f}")
    print(f"\nClassification Report:")
    # labels=[0, 1] pins both classes explicitly: without it the report
    # raises when only one class occurs among the true and predicted labels,
    # because target_names would no longer match the inferred label set.
    print(classification_report(true_labels, predictions,
                                labels=[0, 1],
                                target_names=['Negative', 'Positive'],
                                zero_division=0))

    # Confusion Matrix (fixed label order keeps it 2x2 for the heatmaps)
    cm = confusion_matrix(true_labels, predictions, labels=[0, 1])

    return {
        'language': language_name,
        'accuracy': acc,
        'f1': f1,
        'predictions': predictions,
        'true_labels': true_labels,
        'confusion_matrix': cm
    }

# Evaluate on all Indic languages
results = {}

print("\nЁЯФН Zero-Shot Evaluation Results:")
print("-" * 80)

results['hindi'] = evaluate_on_indic(model, tokenizer, hindi_test, "Hindi")
results['telugu'] = evaluate_on_indic(model, tokenizer, telugu_test, "Telugu")
results['tamil'] = evaluate_on_indic(model, tokenizer, tamil_test, "Tamil")

In [None]:
# ============================================================================
# STEP 6: VISUALIZATION
# ============================================================================

print("\n" + "="*80)
print("STEP 6: VISUALIZATION")
print("="*80)

# 6.1 Performance Comparison Across Languages
languages = ['Hindi', 'Telugu', 'Tamil']
language_keys = ['hindi', 'telugu', 'tamil']
accuracies = [results[key]['accuracy'] for key in language_keys]
f1_scores = [results[key]['f1'] for key in language_keys]

bar_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']
# (y-axis label, panel title, values) for the left and right panels.
metric_panels = [
    ('Accuracy', 'Zero-Shot Accuracy by Language', accuracies),
    ('F1-Score', 'Zero-Shot F1-Score by Language', f1_scores),
]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for panel, (y_label, panel_title, scores) in zip(axes, metric_panels):
    panel.bar(languages, scores, color=bar_colors)
    panel.set_ylabel(y_label, fontsize=12)
    panel.set_title(panel_title, fontsize=14, fontweight='bold')
    panel.set_ylim([0, 1])
    # Write each score just above its bar.
    for bar_index, score in enumerate(scores):
        panel.text(bar_index, score + 0.02, f'{score:.3f}', ha='center', fontweight='bold')

plt.tight_layout()
plt.savefig('zero_shot_performance.png', dpi=300, bbox_inches='tight')
plt.show()

# 6.2 Confusion Matrices
class_names = ['Negative', 'Positive']
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for heatmap_ax, lang_key, lang_name in zip(axes, language_keys, languages):
    cm = results[lang_key]['confusion_matrix']
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
        ax=heatmap_ax,
    )
    heatmap_ax.set_title(f'{lang_name} Confusion Matrix', fontweight='bold')
    heatmap_ax.set_ylabel('True Label')
    heatmap_ax.set_xlabel('Predicted Label')

plt.tight_layout()
plt.savefig('confusion_matrices.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
import re
import json
import numpy as np
import matplotlib.pyplot as plt


In [None]:
# ============================================================================
# STEP 7: ENHANCED ERROR ANALYSIS WITH TAXONOMY
# ============================================================================

print("\n" + "="*80)
print("STEP 7: ENHANCED ERROR ANALYSIS WITH TAXONOMY")
print("="*80)

def categorize_error(text, true_label, predicted_label):
    """Assign a text to one heuristic error category.

    The checks run in a fixed priority order and the first match wins:
    negation (only when the labels differ), code-mixing, short text
    (< 5 whitespace-separated tokens), long text (> 20 tokens), intensity
    words, and finally 'Other'.

    Args:
        text: The input text.
        true_label: Gold label.
        predicted_label: Label predicted by the model.

    Returns:
        One of 'Negation', 'Code-mixed', 'Short text', 'Long text',
        'Intensity mismatch' or 'Other'.
    """
    text_lower = text.lower()

    # Negation detection (simple heuristic)
    negation_words_en = ['not', 'never', 'no', 'none', 'neither']
    negation_words_hi = ['рдирд╣реАрдВ', 'рдХрднреА рдирд╣реАрдВ', 'рдмрд┐рд▓реНрдХреБрд▓ рдирд╣реАрдВ']
    negation_words_te = ['р░Хр░╛р░жр▒Б', 'р░▓р▒Зр░жр▒Б']
    negation_words_ta = ['роЗро▓рпНро▓рпИ', 'роЗро▓рпНро▓ро╛род']

    # English negations must match as whole words: a plain substring test
    # would also fire on words that merely contain them ("know", "another").
    en_negation_pattern = (
        r'\b(?:' + '|'.join(re.escape(word) for word in negation_words_en) + r')\b'
    )
    # The non-English markers keep the original substring test.
    other_negations = negation_words_hi + negation_words_te + negation_words_ta

    has_negation = (
        bool(re.search(en_negation_pattern, text_lower))
        or any(neg in text_lower for neg in other_negations)
    )

    # Code-mixing detection (contains both Indic and English)
    has_english = bool(re.search(r'[a-zA-Z]{3,}', text))
    has_indic = bool(re.search(r'[\u0900-\u097F\u0C00-\u0C7F\u0B80-\u0BFF]', text))
    is_code_mixed = has_english and has_indic

    # Intensity words: checked against the raw text (as before) and also
    # against the lowercased text so capitalised forms such as "Very" count.
    intensity_words = ['рдмрд╣реБрдд', 'рооро┐роХро╡рпБроорпН', 'р░Ър░╛р░▓р░╛', 'very', 'really', 'extremely']
    has_intensity = any(
        word in text or word in text_lower for word in intensity_words
    )

    # Length-based
    n_tokens = len(text.split())
    is_short = n_tokens < 5
    is_long = n_tokens > 20

    # Categorize
    if has_negation and true_label != predicted_label:
        return 'Negation'
    elif is_code_mixed:
        return 'Code-mixed'
    elif is_short:
        return 'Short text'
    elif is_long:
        return 'Long text'
    elif has_intensity:
        return 'Intensity mismatch'
    else:
        return 'Other'

def enhanced_error_analysis(df, predictions, language_name):
    """Report and categorise the misclassified rows for one language.

    Prints the overall error rate, the number of errors per category
    (largest first) and one example per category.

    Returns:
        (errors, error_categories): `errors` holds the misclassified rows of
        a copy of `df` (with added 'predicted' and 'correct' columns);
        `error_categories` maps a category name to a list of
        {'text', 'true', 'pred'} records and is empty when nothing was
        misclassified.
    """
    scored = df.copy()
    scored['predicted'] = predictions
    scored['correct'] = scored['label'] == scored['predicted']

    errors = scored[~scored['correct']]

    print(f"\n{'='*80}")
    print(f"{language_name} ENHANCED ERROR ANALYSIS")
    print(f"{'='*80}")
    print(f"Total errors: {len(errors)}/{len(scored)} ({len(errors)/len(scored)*100:.1f}%)")

    # Nothing to categorise when every prediction was right.
    if len(errors) == 0:
        return errors, {}

    def sentiment_name(label):
        # Human-readable name for a 0/1 label.
        return 'Positive' if label==1 else 'Negative'

    # Group the misclassified rows by heuristic category.
    error_categories = {}
    for _, row in errors.iterrows():
        bucket = categorize_error(row['text'], row['label'], row['predicted'])
        error_categories.setdefault(bucket, []).append({
            'text': row['text'],
            'true': row['label'],
            'pred': row['predicted']
        })

    # Error distribution, most frequent category first.
    print(f"\nЁЯУК Error Distribution by Category:")
    print("-" * 80)
    ranked = sorted(error_categories.items(), key=lambda pair: len(pair[1]), reverse=True)
    for category, items in ranked:
        share = len(items) / len(errors) * 100
        print(f"  {category:20s}: {len(items):3d} errors ({share:5.1f}%)")

    # One example per category, in first-seen order.
    print(f"\nЁЯФН Example Errors by Category:")
    print("-" * 80)
    for category, items in error_categories.items():
        if not items:
            continue
        first = items[0]
        print(f"\n  [{category}]")
        print(f"    Text: {first['text'][:80]}...")
        print(f"    True: {sentiment_name(first['true'])} | "
              f"Pred: {sentiment_name(first['pred'])}")

    return errors, error_categories

# Run enhanced error analysis for all languages
print("\n" + "="*80)
print("RUNNING ENHANCED ERROR ANALYSIS")
print("="*80)

hindi_errors, hindi_categories = enhanced_error_analysis(
    hindi_test, results['hindi']['predictions'], "HINDI"
)
telugu_errors, telugu_categories = enhanced_error_analysis(
    telugu_test, results['telugu']['predictions'], "TELUGU"
)
tamil_errors, tamil_categories = enhanced_error_analysis(
    tamil_test, results['tamil']['predictions'], "TAMIL"
)

# Aggregate error categories across languages.
# The union is sorted so the bar order (and therefore the saved figure) is
# the same on every run; iterating a raw set of strings can yield a different
# order from one kernel session to the next.
all_categories = sorted(
    set(hindi_categories) | set(telugu_categories) | set(tamil_categories)
)

# Create error taxonomy visualization
fig, ax = plt.subplots(figsize=(12, 6))

languages = ['Hindi', 'Telugu', 'Tamil']
per_language_categories = [hindi_categories, telugu_categories, tamil_categories]

# category -> [Hindi count, Telugu count, Tamil count]
category_counts = {
    cat: [len(cats.get(cat, [])) for cats in per_language_categories]
    for cat in all_categories
}

x = np.arange(len(all_categories))
width = 0.25

# One group of three side-by-side bars per category.
for i, lang in enumerate(languages):
    counts = [category_counts[cat][i] for cat in all_categories]
    ax.bar(x + i*width, counts, width, label=lang)

ax.set_ylabel('Number of Errors', fontsize=12, fontweight='bold')
ax.set_title('Error Taxonomy: Distribution Across Languages', fontsize=14, fontweight='bold')
# Centre each tick under the middle (second) bar of its group.
ax.set_xticks(x + width)
ax.set_xticklabels(all_categories, rotation=45, ha='right')
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('error_taxonomy.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nтЬЕ Error taxonomy saved to 'error_taxonomy.png'")

# Save error analysis
error_analysis_results = {
    'hindi': {cat: len(items) for cat, items in hindi_categories.items()},
    'telugu': {cat: len(items) for cat, items in telugu_categories.items()},
    'tamil': {cat: len(items) for cat, items in tamil_categories.items()}
}

with open('error_taxonomy.json', 'w') as f:
    json.dump(error_analysis_results, f, indent=2)

print("ЁЯТ╛ Error taxonomy saved to 'error_taxonomy.json'")


In [None]:

# ============================================================================
# STEP 8: SUMMARY & INSIGHTS
# ============================================================================

print("\n" + "="*80)
print("STEP 8: RESEARCH FINDINGS SUMMARY")
print("="*80)

summary = f"""
тХФтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХЧ
тХС           CROSS-LINGUAL SENTIMENT ANALYSIS RESULTS                    тХС
тХЪтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХЭ

ЁЯУК MODEL: {MODELS[MODEL_NAME]}
тФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБ

тЬЕ ZERO-SHOT TRANSFER RESULTS:

   Language      Accuracy    F1-Score
   тФАтФАтФАтФАтФАтФАтФАтФАтФАтФАтФАтФАтФАтФАтФАтФАтФАтФАтФАтФАтФАтФАтФАтФАтФАтФАтФАтФАтФАтФАтФАтФАтФАтФАтФАтФА
   Hindi         {results['hindi']['accuracy']:.4f}      {results['hindi']['f1']:.4f}
   Telugu        {results['telugu']['accuracy']:.4f}      {results['telugu']['f1']:.4f}
   Tamil         {results['tamil']['accuracy']:.4f}      {results['tamil']['f1']:.4f}

ЁЯФН KEY INSIGHTS:

1. Zero-shot transfer {'WORKS' if min(accuracies) > 0.5 else 'NEEDS IMPROVEMENT'}
   тЖТ Model trained only on English can classify Indic languages
   тЖТ {'Strong' if min(accuracies) > 0.7 else 'Moderate' if min(accuracies) > 0.5 else 'Weak'} cross-lingual alignment

2. Language Performance Ranking:
   тЖТ Best: {languages[np.argmax(accuracies)]} ({max(accuracies):.3f})
   тЖТ Worst: {languages[np.argmin(accuracies)]} ({min(accuracies):.3f})

3. Linguistic Insights:
   тЖТ {'Indo-Aryan (Hindi) performs better than Dravidian' if results['hindi']['accuracy'] > max(results['telugu']['accuracy'], results['tamil']['accuracy']) else 'Performance varies across language families'}

тФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБ

ЁЯУМ NEXT STEPS:
   1. Compare with other models (mBERT, IndicBERT)
   2. Add few-shot fine-tuning experiments
   3. Perform layer-wise transfer analysis
   4. Expand error taxonomy with more examples
   5. Test on larger datasets

тХЪтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХРтХЭ
"""

print(summary)

# Save results to JSON
results_summary = {
    'model': MODEL_NAME,
    'english_validation': {
        'accuracy': float(eval_results['eval_accuracy']),
        'f1': float(eval_results['eval_f1'])
    },
    'zero_shot': {
        'hindi': {'accuracy': float(results['hindi']['accuracy']),
                  'f1': float(results['hindi']['f1'])},
        'telugu': {'accuracy': float(results['telugu']['accuracy']),
                   'f1': float(results['telugu']['f1'])},
        'tamil': {'accuracy': float(results['tamil']['accuracy']),
                  'f1': float(results['tamil']['f1'])}
    }
}

with open('results_summary.json', 'w') as f:
    json.dump(results_summary, f, indent=2)

print("\nЁЯТ╛ Results saved to 'results_summary.json'")


In [None]:

# ============================================================================
# STEP 9: FEW-SHOT LEARNING EXPERIMENTS
# ============================================================================

print("\n" + "="*80)
print("STEP 9: FEW-SHOT LEARNING EXPERIMENTS")
print("="*80)

print("""
ЁЯОп EXPERIMENT SETUP:
   We'll train with small amounts of target language data to see how
   quickly the model adapts compared to zero-shot transfer.

   Training sizes: 10, 25, 50, 75 samples per language
""")

# Store zero-shot results for comparison
zero_shot_results = {
    'hindi': results['hindi']['accuracy'],
    'telugu': results['telugu']['accuracy'],
    'tamil': results['tamil']['accuracy']
}

# Few-shot sample sizes to test
FEW_SHOT_SIZES = [10, 25, 50, 75]

# Store results for all experiments
few_shot_results = {
    'hindi': {'sizes': [], 'accuracies': [], 'f1_scores': []},
    'telugu': {'sizes': [], 'accuracies': [], 'f1_scores': []},
    'tamil': {'sizes': [], 'accuracies': [], 'f1_scores': []}
}

def create_few_shot_dataset(indic_df, n_samples, language_name):
    """Draw a class-balanced few-shot training subset of exactly `n_samples` rows.

    Args:
        indic_df: DataFrame with a 'label' column holding 1 (positive) or
            0 (negative).
        n_samples: Total number of rows to draw.
        language_name: Name used in the printed summary.

    Returns:
        A shuffled DataFrame with `n_samples` rows that keeps the original
        index labels of `indic_df`.

    Raises:
        ValueError: If either class has fewer rows than requested from it.
    """
    # Split as evenly as possible. When n_samples is odd the positive class
    # takes the extra row so the subset really has n_samples rows; using
    # n_samples // 2 for both classes returned 24 rows for a request of 25.
    n_negative = n_samples // 2
    n_positive = n_samples - n_negative

    positive_pool = indic_df[indic_df['label'] == 1]
    negative_pool = indic_df[indic_df['label'] == 0]

    if n_positive > len(positive_pool) or n_negative > len(negative_pool):
        raise ValueError(
            f"Cannot draw {n_positive} positive / {n_negative} negative samples "
            f"from {len(positive_pool)} positive / {len(negative_pool)} negative rows"
        )

    positive_samples = positive_pool.sample(n=n_positive, random_state=42)
    negative_samples = negative_pool.sample(n=n_negative, random_state=42)

    # Shuffle so the two classes are interleaved.
    few_shot_df = pd.concat([positive_samples, negative_samples]).sample(frac=1, random_state=42)

    print(f"\nЁЯУж Created {language_name} few-shot dataset:")
    print(f"   Total samples: {len(few_shot_df)}")
    print(f"   Positive: {few_shot_df['label'].sum()}, Negative: {len(few_shot_df) - few_shot_df['label'].sum()}")

    return few_shot_df

def train_few_shot_model(train_texts_en, train_labels_en, indic_texts, indic_labels,
                         model_name, n_epochs=3):
    """Fine-tune a fresh model on English data plus a few Indic samples.

    The English and Indic examples are concatenated (English first),
    tokenized with the module-level `tokenizer`, and used to train a newly
    loaded `model_name` checkpoint with a 2-class head.

    Args:
        train_texts_en: English training texts.
        train_labels_en: Labels for `train_texts_en`.
        indic_texts: Few-shot texts in the target language.
        indic_labels: Labels for `indic_texts`.
        model_name: Checkpoint ID passed to `from_pretrained`.
            NOTE(review): tokenization uses the module-level `tokenizer`,
            which is presumably loaded for the same checkpoint — confirm
            when calling with a different `model_name`.
        n_epochs: Number of training epochs.

    Returns:
        The trained model (moved to `device` before training).
    """

    # Combine English and Indic data
    combined_texts = list(train_texts_en) + list(indic_texts)
    combined_labels = list(train_labels_en) + list(indic_labels)

    print(f"   Combined training size: {len(combined_texts)} samples")
    print(f"   English: {len(train_texts_en)}, Indic: {len(indic_texts)}")

    # Tokenize combined data
    encodings = tokenizer(
        combined_texts,
        truncation=True,
        padding=True,
        max_length=256,
        return_tensors='pt'
    )

    # Create dataset
    combined_dataset = SentimentDataset(encodings, combined_labels)

    # Load fresh model
    few_shot_model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2
    ).to(device)

    # Training arguments (fewer epochs for few-shot).
    # No evaluation-strategy keyword is passed: the keyword's name differs
    # between transformers releases and "no" is already the default, so
    # omitting it keeps this call working on both and changes nothing.
    few_shot_args = TrainingArguments(
        output_dir='./few_shot_results',
        num_train_epochs=n_epochs,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        warmup_steps=100,
        weight_decay=0.01,
        logging_steps=50,
        save_strategy="no",
        report_to="none",
        seed=42
    )

    # Train
    few_shot_trainer = Trainer(
        model=few_shot_model,
        args=few_shot_args,
        train_dataset=combined_dataset,
        compute_metrics=compute_metrics
    )

    few_shot_trainer.train()

    return few_shot_model

# Run few-shot experiments for each language
print("\n" + "="*80)
print("RUNNING FEW-SHOT EXPERIMENTS")
print("="*80)

# We'll focus on Hindi for detailed few-shot analysis
# (You can extend this to Telugu and Tamil)

print("\nЁЯФм HINDI FEW-SHOT EXPERIMENTS")
print("-" * 80)

# For each few-shot size: sample Hindi training rows, train a fresh model on
# English + those rows, evaluate on the remaining Hindi rows, record metrics.
for n_samples in FEW_SHOT_SIZES:
    print(f"\n{'='*80}")
    print(f"Training with {n_samples} Hindi samples")
    print(f"{'='*80}")

    # Create few-shot training data (drawn from hindi_test itself; the rows
    # picked here are excluded from the evaluation set further down)
    hindi_few_shot = create_few_shot_dataset(hindi_test, n_samples, "Hindi")

    # Prepare for training: texts and labels of the sampled rows
    hindi_train_texts = hindi_few_shot['text'].apply(preprocess_text).tolist()
    hindi_train_labels = hindi_few_shot['label'].tolist()

    # Train model
    print("\nЁЯЪА Training few-shot model...")
    few_shot_model_hindi = train_few_shot_model(
        train_texts[:2000],  # Use subset of English data for faster training
        train_labels[:2000],
        hindi_train_texts,
        hindi_train_labels,
        MODEL_NAME,
        n_epochs=2
    )

    # Evaluate on remaining Hindi samples (held-out test set)
    # Create held-out test set (samples not used in training), selected by
    # index label: every row of hindi_test whose index is not in the sample
    hindi_test_indices = set(hindi_test.index) - set(hindi_few_shot.index)
    hindi_test_holdout = hindi_test.loc[list(hindi_test_indices)]

    print(f"\nЁЯУК Evaluating on {len(hindi_test_holdout)} held-out Hindi samples...")

    # Evaluate
    eval_result = evaluate_on_indic(few_shot_model_hindi, tokenizer,
                                   hindi_test_holdout, f"Hindi ({n_samples} shots)")

    # Store results
    few_shot_results['hindi']['sizes'].append(n_samples)
    few_shot_results['hindi']['accuracies'].append(eval_result['accuracy'])
    few_shot_results['hindi']['f1_scores'].append(eval_result['f1'])

    print(f"\nтЬЕ Few-shot ({n_samples} samples) Accuracy: {eval_result['accuracy']:.4f}")
    print(f"тЬЕ Few-shot ({n_samples} samples) F1-Score: {eval_result['f1']:.4f}")
    # NOTE: the zero-shot baseline was measured on the full hindi_test, while
    # this accuracy comes from the smaller held-out subset, so the two numbers
    # are not computed on the same examples.
    print(f"ЁЯУИ Improvement over zero-shot: {(eval_result['accuracy'] - zero_shot_results['hindi'])*100:.2f}%")

    # Clean up: drop the model reference and release cached GPU memory
    # before the next (larger) run
    del few_shot_model_hindi
    torch.cuda.empty_cache()


STEP 9: FEW-SHOT LEARNING EXPERIMENTS

ЁЯОп EXPERIMENT SETUP:
   We'll train with small amounts of target language data to see how
   quickly the model adapts compared to zero-shot transfer.

   Training sizes: 10, 25, 50, 75 samples per language


RUNNING FEW-SHOT EXPERIMENTS

ЁЯФм HINDI FEW-SHOT EXPERIMENTS
--------------------------------------------------------------------------------

Training with 10 Hindi samples

ЁЯУж Created Hindi few-shot dataset:
   Total samples: 10
   Positive: 5, Negative: 5

ЁЯЪА Training few-shot model...
   Combined training size: 2010 samples
   English: 2000, Indic: 10


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.6936
100,0.6501
150,0.6
200,0.4942
250,0.3416



ЁЯУК Evaluating on 89 held-out Hindi samples...

Hindi (10 shots) Results:
  Accuracy: 0.7416
  F1-Score: 0.7299

Classification Report:
              precision    recall  f1-score   support

    Negative       0.67      0.95      0.79        44
    Positive       0.92      0.53      0.68        45

    accuracy                           0.74        89
   macro avg       0.79      0.74      0.73        89
weighted avg       0.80      0.74      0.73        89


тЬЕ Few-shot (10 samples) Accuracy: 0.7416
тЬЕ Few-shot (10 samples) F1-Score: 0.7299
ЁЯУИ Improvement over zero-shot: -10.69%

Training with 25 Hindi samples

ЁЯУж Created Hindi few-shot dataset:
   Total samples: 24
   Positive: 12, Negative: 12

ЁЯЪА Training few-shot model...
   Combined training size: 2024 samples
   English: 2000, Indic: 24


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.6935
100,0.66
150,0.5949
200,0.454
250,0.3676



ЁЯУК Evaluating on 75 held-out Hindi samples...

Hindi (25 shots) Results:
  Accuracy: 0.8000
  F1-Score: 0.7974

Classification Report:
              precision    recall  f1-score   support

    Negative       0.74      0.92      0.82        37
    Positive       0.90      0.68      0.78        38

    accuracy                           0.80        75
   macro avg       0.82      0.80      0.80        75
weighted avg       0.82      0.80      0.80        75


тЬЕ Few-shot (25 samples) Accuracy: 0.8000
тЬЕ Few-shot (25 samples) F1-Score: 0.7974
ЁЯУИ Improvement over zero-shot: -4.85%

Training with 50 Hindi samples

ЁЯУж Created Hindi few-shot dataset:
   Total samples: 50
   Positive: 25, Negative: 25

ЁЯЪА Training few-shot model...
   Combined training size: 2050 samples
   English: 2000, Indic: 50


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.689
100,0.6208
150,0.5086
200,0.4315
250,0.3141



ЁЯУК Evaluating on 49 held-out Hindi samples...

Hindi (50 shots) Results:
  Accuracy: 0.7551
  F1-Score: 0.7520

Classification Report:
              precision    recall  f1-score   support

    Negative       0.70      0.88      0.78        24
    Positive       0.84      0.64      0.73        25

    accuracy                           0.76        49
   macro avg       0.77      0.76      0.75        49
weighted avg       0.77      0.76      0.75        49


тЬЕ Few-shot (50 samples) Accuracy: 0.7551
тЬЕ Few-shot (50 samples) F1-Score: 0.7520
ЁЯУИ Improvement over zero-shot: -9.34%

Training with 75 Hindi samples

ЁЯУж Created Hindi few-shot dataset:
   Total samples: 74
   Positive: 37, Negative: 37

ЁЯЪА Training few-shot model...
   Combined training size: 2074 samples
   English: 2000, Indic: 74


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.6887


In [None]:

# ============================================================================
# STEP 10: FEW-SHOT LEARNING CURVES & ANALYSIS
# ============================================================================

print("\n" + "="*80)
print("STEP 10: FEW-SHOT LEARNING CURVES & ANALYSIS")
print("="*80)

# Hindi few-shot series and zero-shot baselines produced by earlier cells.
fs_sizes = few_shot_results['hindi']['sizes']
fs_accs = few_shot_results['hindi']['accuracies']
fs_f1s = few_shot_results['hindi']['f1_scores']
zero_shot_acc = zero_shot_results['hindi']
zero_shot_f1 = results['hindi']['f1']


def _annotate_points(ax, xs, ys):
    """Write each y value (3 decimals) just above its marker on ``ax``."""
    for x, y in zip(xs, ys):
        ax.annotate(f'{y:.3f}', (x, y), textcoords="offset points",
                    xytext=(0, 10), ha='center', fontsize=9, fontweight='bold')


# Create comprehensive visualization
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 10.1 Learning Curve - Accuracy (zero-shot is plotted as the 0-sample point)
ax1 = axes[0, 0]
sizes_with_zero = [0] + fs_sizes
accs_with_zero = [zero_shot_acc] + fs_accs

ax1.plot(sizes_with_zero, accs_with_zero, marker='o', linewidth=2,
         markersize=10, color='#2E86AB', label='Hindi')
ax1.axhline(y=zero_shot_acc, color='red', linestyle='--',
            alpha=0.5, label='Zero-shot baseline')
ax1.fill_between(sizes_with_zero, zero_shot_acc, accs_with_zero,
                 alpha=0.2, color='#2E86AB')
ax1.set_xlabel('Number of Training Samples', fontsize=12, fontweight='bold')
ax1.set_ylabel('Accuracy', fontsize=12, fontweight='bold')
ax1.set_title('Few-Shot Learning Curve: Hindi', fontsize=14, fontweight='bold')
ax1.grid(True, alpha=0.3)
ax1.legend(fontsize=10)
ax1.set_ylim([0, 1])
_annotate_points(ax1, sizes_with_zero, accs_with_zero)

# 10.2 Learning Curve - F1 Score
ax2 = axes[0, 1]
f1_with_zero = [zero_shot_f1] + fs_f1s

ax2.plot(sizes_with_zero, f1_with_zero, marker='s', linewidth=2,
         markersize=10, color='#A23B72', label='Hindi F1')
ax2.axhline(y=zero_shot_f1, color='red', linestyle='--',
            alpha=0.5, label='Zero-shot F1 baseline')
ax2.fill_between(sizes_with_zero, zero_shot_f1, f1_with_zero,
                 alpha=0.2, color='#A23B72')
ax2.set_xlabel('Number of Training Samples', fontsize=12, fontweight='bold')
ax2.set_ylabel('F1-Score', fontsize=12, fontweight='bold')
ax2.set_title('Few-Shot Learning Curve: F1-Score', fontsize=14, fontweight='bold')
ax2.grid(True, alpha=0.3)
ax2.legend(fontsize=10)
ax2.set_ylim([0, 1])
_annotate_points(ax2, sizes_with_zero, f1_with_zero)

# 10.3 Improvement over Zero-Shot (accuracy difference scaled by 100)
ax3 = axes[1, 0]
improvements = [(acc - zero_shot_acc) * 100 for acc in fs_accs]

# Bars are placed at evenly spaced categorical positions: using the raw sample
# sizes as x coordinates with the default bar width makes the bars hairline thin.
bar_positions = np.arange(len(fs_sizes))
bars = ax3.bar(bar_positions, improvements,
               color=['#06A77D', '#F77F00', '#D62828', '#023047'],
               edgecolor='black', linewidth=1.5)
ax3.set_xticks(bar_positions)
ax3.set_xticklabels([str(n) for n in fs_sizes])
ax3.set_xlabel('Number of Training Samples', fontsize=12, fontweight='bold')
ax3.set_ylabel('Improvement over Zero-Shot (%)', fontsize=12, fontweight='bold')
ax3.set_title('Absolute Improvement in Accuracy', fontsize=14, fontweight='bold')
ax3.grid(True, alpha=0.3, axis='y')
ax3.axhline(y=0, color='black', linestyle='-', linewidth=1)

# Add value labels on bars; negative bars get their label below the bar tip
# so the text does not overlap the bar itself.
for bar, val in zip(bars, improvements):
    ax3.text(bar.get_x() + bar.get_width()/2., bar.get_height(),
             f'+{val:.1f}%' if val > 0 else f'{val:.1f}%',
             ha='center', va='bottom' if val >= 0 else 'top',
             fontsize=10, fontweight='bold')

# 10.4 Sample Efficiency Analysis
ax4 = axes[1, 1]

# For each accuracy threshold, record the first sample size in the series
# (zero-shot included as size 0) whose accuracy reaches it, or None if never reached.
thresholds = [0.70, 0.75, 0.80, 0.85]
samples_needed = [
    next((size for size, acc in zip(sizes_with_zero, accs_with_zero) if acc >= threshold), None)
    for threshold in thresholds
]

# Keep only the thresholds that were actually reached
valid_thresholds = [t for t, s in zip(thresholds, samples_needed) if s is not None]
valid_samples = [s for s in samples_needed if s is not None]

if valid_samples:
    bars2 = ax4.barh([f'{t*100:.0f}%' for t in valid_thresholds], valid_samples,
                     color='#F18F01', edgecolor='black', linewidth=1.5)
    ax4.set_xlabel('Samples Required', fontsize=12, fontweight='bold')
    ax4.set_ylabel('Target Accuracy', fontsize=12, fontweight='bold')
    ax4.set_title('Sample Efficiency: Samples to Reach Accuracy',
                  fontsize=14, fontweight='bold')
    ax4.grid(True, alpha=0.3, axis='x')

    # Add value labels at the end of each bar
    for bar, val in zip(bars2, valid_samples):
        ax4.text(val, bar.get_y() + bar.get_height()/2.,
                 f'  {val} samples',
                 ha='left', va='center', fontsize=10, fontweight='bold')
else:
    ax4.text(0.5, 0.5, 'Accuracy thresholds\nnot reached with\ncurrent sample sizes',
             ha='center', va='center', fontsize=12, transform=ax4.transAxes)
    ax4.set_title('Sample Efficiency Analysis', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('few_shot_learning_curves.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nтЬЕ Learning curves saved to 'few_shot_learning_curves.png'")


In [None]:
# ============================================================================
# STEP 11: COMPREHENSIVE RESULTS TABLE
# ============================================================================

print("\n" + "="*80)
print("STEP 11: COMPREHENSIVE RESULTS TABLE")
print("="*80)

# Accuracy change versus zero-shot, scaled by 100. Computed here rather than
# reused from the plotting cell so this cell does not depend on leftover state.
improvements = [(acc - zero_shot_results['hindi']) * 100
                for acc in few_shot_results['hindi']['accuracies']]

# Create results DataFrame (first row is the zero-shot baseline)
results_df = pd.DataFrame({
    'Training Samples': ['Zero-shot (0)'] + [f'{n} samples' for n in few_shot_results['hindi']['sizes']],
    'Accuracy': [zero_shot_results['hindi']] + few_shot_results['hindi']['accuracies'],
    'F1-Score': [results['hindi']['f1']] + few_shot_results['hindi']['f1_scores'],
    'Improvement (%)': [0.0] + improvements
})

print("\nЁЯУК HINDI FEW-SHOT LEARNING RESULTS:")
print("="*80)
print(results_df.to_string(index=False))
print("="*80)

# Statistical Analysis
print("\nЁЯУИ KEY INSIGHTS:")
print("-" * 80)

max_improvement_idx = int(np.argmax(improvements))
max_improvement = improvements[max_improvement_idx]
samples_for_max = few_shot_results['hindi']['sizes'][max_improvement_idx]

# The `+` format flag emits the correct sign; a hard-coded "+" prefix would
# print "+-9.34%" whenever few-shot is worse than zero-shot.
print(f"1. Zero-shot Accuracy: {zero_shot_results['hindi']:.4f}")
print(f"2. Best Few-shot Accuracy: {max(few_shot_results['hindi']['accuracies']):.4f} "
      f"({samples_for_max} samples)")
print(f"3. Maximum Improvement: {max_improvement:+.2f}% ({samples_for_max} samples)")
print(f"4. Average Improvement: {np.mean(improvements):+.2f}%")

# Calculate efficiency metrics
if len(few_shot_results['hindi']['sizes']) > 1:
    # Marginal gain per additional sample between consecutive sample sizes
    marginal_gains = []
    for i in range(1, len(few_shot_results['hindi']['sizes'])):
        prev_acc = few_shot_results['hindi']['accuracies'][i-1]
        curr_acc = few_shot_results['hindi']['accuracies'][i]
        prev_size = few_shot_results['hindi']['sizes'][i-1]
        curr_size = few_shot_results['hindi']['sizes'][i]

        marginal_gain = (curr_acc - prev_acc) / (curr_size - prev_size)
        marginal_gains.append(marginal_gain)

    print(f"5. Diminishing Returns: {'Yes' if marginal_gains[-1] < marginal_gains[0] else 'No'}")
    print(f"   - Initial marginal gain: {marginal_gains[0]*100:.3f}% per sample")
    print(f"   - Final marginal gain: {marginal_gains[-1]*100:.3f}% per sample")

print("\nЁЯТб RECOMMENDATIONS:")
print("-" * 80)
if max_improvement > 10:
    print("тЬЕ Few-shot learning provides SIGNIFICANT improvement")
    print(f"тЬЕ With just {samples_for_max} samples, accuracy improves by {max_improvement:.1f}%")
elif max_improvement > 0:
    print("тЪая╕П  Few-shot learning provides MODERATE improvement")
    print("тЪая╕П  Consider using more training samples or better data quality")
else:
    # Even the best few-shot run did not beat zero-shot; do not call that an improvement.
    print("тЪая╕П  Few-shot learning did NOT improve over the zero-shot baseline")
    print("тЪая╕П  Consider using more training samples or better data quality")

# Save comprehensive results
comprehensive_results = {
    'model': MODEL_NAME,
    'zero_shot': zero_shot_results,
    'few_shot': {
        'hindi': {
            'sample_sizes': few_shot_results['hindi']['sizes'],
            'accuracies': few_shot_results['hindi']['accuracies'],
            'f1_scores': few_shot_results['hindi']['f1_scores'],
            'improvements': improvements
        }
    }
}

# default=float converts numpy scalar metrics that json cannot encode natively,
# matching how the model-comparison results are written.
with open('comprehensive_results.json', 'w') as f:
    json.dump(comprehensive_results, f, indent=2, default=float)

print("\nЁЯТ╛ Comprehensive results saved to 'comprehensive_results.json'")

In [None]:
pip install -U transformers accelerate datasets


In [None]:
# ============================================================================
# STEP 12: MODEL COMPARISON (Research Question 2)
# ============================================================================

# Section banner: blank line, rule, title, rule.
print("\n" + "=" * 80,
      "STEP 12: MODEL COMPARISON ACROSS ARCHITECTURES",
      "=" * 80,
      sep="\n")

# Short description of the experiment shown to the reader before training starts.
experiment_overview = """
ЁЯОп EXPERIMENT: Compare Zero-Shot Transfer Across Different Models
   Research Question: Which multilingual model transfers best?

   Models to compare:
   1. XLM-RoBERTa (xlm-roberta-base)
   2. mBERT (bert-base-multilingual-cased)
   3. IndicBERT (ai4bharat/indic-bert)
"""
print(experiment_overview)

# Per-model results accumulate here as the comparison runs.
all_models_results = dict()

def train_and_evaluate_model(model_name, train_texts, train_labels, val_texts, val_labels,
                             test_dfs, model_description):
    """Fine-tune one checkpoint on the given training data and score it on each test set.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        train_texts: Training texts handed to the tokenizer.
        train_labels: Labels aligned with ``train_texts``.
        val_texts: Validation texts used for per-epoch evaluation / early stopping.
        val_labels: Labels aligned with ``val_texts``.
        test_dfs: Mapping of language name -> test DataFrame; each frame is passed
            to ``evaluate_on_indic``.
        model_description: Human-readable model label used in logs and in the result.

    Returns:
        dict with keys ``'model'``, ``'description'``, ``'english_val'``
        (``accuracy``/``f1`` from ``trainer.evaluate()``) and ``'indic_results'``
        (per-language ``accuracy``/``f1``).
    """
    # Standard-library helpers, imported locally so the cell is self-contained.
    import inspect
    import math

    print(f"\n{'='*80}")
    print(f"Training {model_description}")
    print(f"{'='*80}")

    # Load tokenizer
    print("ЁЯУж Loading tokenizer...")
    if 'indic-bert' in model_name:
        tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    else:
        tok = AutoTokenizer.from_pretrained(model_name)

    # Tokenize
    print("ЁЯФд Tokenizing data...")
    train_enc = tok(train_texts, truncation=True, padding=True, max_length=256, return_tensors='pt')
    val_enc = tok(val_texts, truncation=True, padding=True, max_length=256, return_tensors='pt')

    train_ds = SentimentDataset(train_enc, train_labels)
    val_ds = SentimentDataset(val_enc, val_labels)

    # Load model
    print(f"ЁЯФз Loading {model_name}...")
    mdl = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

    # A fixed 500-step warmup would span most of a short run (e.g. 3000 samples at
    # batch size 16 for 3 epochs is only 564 steps), so cap it at roughly 10% of the
    # estimated optimizer steps (single-device estimate).
    num_epochs = 3
    train_batch_size = 16
    total_steps = math.ceil(len(train_labels) / train_batch_size) * num_epochs
    warmup_steps = min(500, max(1, total_steps // 10))

    training_kwargs = dict(
        output_dir=f'./results_{model_name.replace("/", "_")}',
        num_train_epochs=num_epochs,
        per_device_train_batch_size=train_batch_size,
        per_device_eval_batch_size=32,
        warmup_steps=warmup_steps,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=100,
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        report_to="none",
        seed=42
    )

    # Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
    # and later dropped the old name; pick whichever the installed version accepts.
    accepted_params = inspect.signature(TrainingArguments.__init__).parameters
    strategy_key = 'eval_strategy' if 'eval_strategy' in accepted_params else 'evaluation_strategy'
    training_kwargs[strategy_key] = "epoch"

    args = TrainingArguments(**training_kwargs)

    trainer = Trainer(
        model=mdl,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    print("ЁЯЪА Training...")
    trainer.train()

    # Evaluate on English
    eval_res = trainer.evaluate()
    print(f"тЬЕ English Val Accuracy: {eval_res['eval_accuracy']:.4f}")

    # Evaluate on Indic languages
    results = {}
    for lang, df in test_dfs.items():
        print(f"\nЁЯФН Evaluating on {lang.capitalize()}...")
        res = evaluate_on_indic(mdl, tok, df, lang.capitalize())
        results[lang] = {
            'accuracy': res['accuracy'],
            'f1': res['f1']
        }

    # Cleanup: drop references and release cached GPU memory before the next model
    del mdl, trainer
    torch.cuda.empty_cache()

    return {
        'model': model_name,
        'description': model_description,
        'english_val': {'accuracy': eval_res['eval_accuracy'], 'f1': eval_res['eval_f1']},
        'indic_results': results
    }

# Prepare test dataframes
test_dataframes = {
    'hindi': hindi_test,
    'telugu': telugu_test,
    'tamil': tamil_test
}

# Run comparison (comment out models you don't want to test)
models_to_compare = [
    ('xlm-roberta-base', 'XLM-RoBERTa'),
    ('bert-base-multilingual-cased', 'mBERT'),
    # ('ai4bharat/indic-bert', 'IndicBERT'),  # Uncomment to include
]

# Report the number of models actually enabled instead of a hard-coded count.
print(f"\nтЪая╕П  NOTE: This will train {len(models_to_compare)} model(s). It may take 30-45 minutes.")
print("To save time, you can comment out models in `models_to_compare`.\n")

for model_name, model_desc in models_to_compare:
    result = train_and_evaluate_model(
        model_name,
        train_texts[:3000],  # Use subset for faster comparison
        train_labels[:3000],
        val_texts[:500],
        val_labels[:500],
        test_dataframes,
        model_desc
    )
    all_models_results[model_name] = result
    print(f"\nтЬЕ Completed {model_desc}")

# Create comparison table
print("\n" + "="*80)
print("MODEL COMPARISON RESULTS")
print("="*80)

# Column names are derived from the evaluated languages (e.g. 'Hindi_Acc', 'Hindi_F1').
languages = list(test_dataframes)
comparison_data = []
for res in all_models_results.values():
    row = {
        'Model': res['description'],
        'Eng_Acc': res['english_val']['accuracy'],
    }
    for lang in languages:
        row[f'{lang.capitalize()}_Acc'] = res['indic_results'][lang]['accuracy']
        row[f'{lang.capitalize()}_F1'] = res['indic_results'][lang]['f1']
    comparison_data.append(row)

comparison_df = pd.DataFrame(comparison_data)
print("\nЁЯУК Zero-Shot Cross-Lingual Transfer Comparison:")
print(comparison_df.to_string(index=False))


def _plot_grouped_bars(ax, rows, metric_suffix, ylabel, title):
    """Draw one bar group per model on ``ax``, with one bar per language for the given metric."""
    lang_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']
    positions = np.arange(len(rows))
    bar_width = 0.75 / len(languages)  # 0.25 for three languages
    for i, lang in enumerate(languages):
        # Centre the group of bars on each model's tick position
        offset = (i - (len(languages) - 1) / 2) * bar_width
        values = [row[f'{lang.capitalize()}_{metric_suffix}'] for row in rows]
        ax.bar(positions + offset, values, bar_width,
               label=lang.capitalize(), color=lang_colors[i % len(lang_colors)])

    ax.set_ylabel(ylabel, fontsize=12, fontweight='bold')
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xticks(positions)
    ax.set_xticklabels([row['Model'] for row in rows], rotation=15, ha='right')
    ax.legend()
    ax.grid(True, alpha=0.3, axis='y')
    ax.set_ylim([0, 1])


# Visualization
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
_plot_grouped_bars(axes[0], comparison_data, 'Acc', 'Accuracy',
                   'Model Comparison: Zero-Shot Accuracy')
_plot_grouped_bars(axes[1], comparison_data, 'F1', 'F1-Score',
                   'Model Comparison: Zero-Shot F1-Score')

plt.tight_layout()
plt.savefig('model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nтЬЕ Model comparison saved to 'model_comparison.png'")

# Key insights: best model per language by accuracy
print("\nЁЯТб KEY INSIGHTS:")
print("-" * 80)
if comparison_data:  # max() would raise if every model was commented out
    for lang in languages:
        acc_col = f'{lang.capitalize()}_Acc'
        best_row = max(comparison_data, key=lambda row: row[acc_col])
        print(f"Best for {lang.capitalize()}: {best_row['Model']} ({best_row[acc_col]:.4f})")

# Save model comparison
with open('model_comparison_results.json', 'w') as f:
    json.dump(all_models_results, f, indent=2, default=float)

print("\nЁЯТ╛ Model comparison saved to 'model_comparison_results.json'")


In [None]:
# ============================================================================
# STEP 13: LAYER-WISE TRANSFER ANALYSIS (Research Question 4)
# ============================================================================

# Section banner: blank line, rule, title, rule.
print("\n" + "=" * 80,
      "STEP 13: LAYER-WISE TRANSFER ANALYSIS",
      "=" * 80,
      sep="\n")

# Experiment description and the pattern the probing run is expected to show.
layer_probe_overview = """
ЁЯОп EXPERIMENT: Which transformer layers encode language-agnostic knowledge?
   Research Question: Which layers encode language-agnostic sentiment knowledge?

   Method: Train linear probes on each layer's CLS embeddings

   Expected Pattern:
   - Lower layers (0-3): Language-specific features
   - Middle layers (4-8): Cross-lingual semantic features
   - Upper layers (9-11): Task-specific features
"""
print(layer_probe_overview)

def extract_layer_embeddings(model, tokenizer, texts, layer_idx, batch_size=32):
    """Return the first-token (CLS) embedding of every text at one hidden layer.

    Texts are processed in mini-batches so that the hidden states of all layers
    for the whole input never have to be held in memory at once.

    Args:
        model: Model whose forward pass accepts ``output_hidden_states=True`` and
            returns an object with a ``hidden_states`` sequence.
        tokenizer: Callable tokenizer returning tensors that support ``.to(device)``.
        texts: Sequence of input strings.
        layer_idx: Index into ``outputs.hidden_states``.
        batch_size: Number of texts per forward pass (default 32).

    Returns:
        numpy array with one row per text, holding that layer's first-token vector.

    Raises:
        ValueError: If ``texts`` is empty or ``batch_size`` is not positive.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    texts = list(texts)
    if not texts:
        raise ValueError("texts must contain at least one item")

    model.eval()

    # Send inputs to wherever the model's weights live instead of relying on a
    # notebook-level global.
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        model_device = torch.device('cpu')

    cls_chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]

            # Tokenize (padding is to the longest text within this batch)
            encodings = tokenizer(batch_texts, truncation=True, padding=True,
                                  max_length=256, return_tensors='pt').to(model_device)

            # hidden_states is a sequence with one (batch, seq_len, hidden_dim) tensor per layer
            outputs = model(**encodings, output_hidden_states=True)
            layer_output = outputs.hidden_states[layer_idx]

            # Extract CLS token (first token) and move it off the accelerator right away
            cls_chunks.append(layer_output[:, 0, :].cpu().numpy())

    return np.concatenate(cls_chunks, axis=0)

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

def probe_layer(model, tokenizer, train_texts, train_labels,
                test_texts, test_labels, layer_idx, layer_name):
    """Fit a logistic-regression probe on one layer's CLS features and return its test accuracy.

    The probe is trained on embeddings of ``train_texts`` and scored on embeddings
    of ``test_texts``; both are standardized with statistics from the training
    embeddings only. ``layer_name`` is accepted but not used in the computation.
    """
    # CLS features for the train split first, then the test split
    features_train, features_test = (
        extract_layer_embeddings(model, tokenizer, split_texts, layer_idx)
        for split_texts in (train_texts, test_texts)
    )

    # Standardize using the training split's statistics
    standardizer = StandardScaler().fit(features_train)
    features_train = standardizer.transform(features_train)
    features_test = standardizer.transform(features_test)

    # Linear probe
    classifier = LogisticRegression(max_iter=1000, random_state=42)
    classifier.fit(features_train, train_labels)

    # Mean accuracy on the held-out texts
    return classifier.score(features_test, test_labels)

print("\nЁЯФм Running layer-wise probing analysis...")
# The fine-tuned weights from the comparison step are deleted after evaluation,
# so what is loaded below is the original pretrained checkpoint; say so explicitly.
print("Note: Probing the pretrained checkpoint of the first compared model "
      "(fine-tuned weights are not reloaded)\n")

# Use the first model from the comparison step
probe_model_name = list(all_models_results.keys())[0]
print(f"Using model: {all_models_results[probe_model_name]['description']}")

# Load the model
probe_model = AutoModelForSequenceClassification.from_pretrained(
    probe_model_name, num_labels=2
).to(device)

# Load tokenizer
if 'indic-bert' in probe_model_name:
    probe_tokenizer = AutoTokenizer.from_pretrained(probe_model_name, trust_remote_code=True)
else:
    probe_tokenizer = AutoTokenizer.from_pretrained(probe_model_name)

# Sample data for probing (use small subset for speed)
probe_train_texts = train_texts[:500]
probe_train_labels = train_labels[:500]

# Test sets to probe, with the marker/colour used for each in the plot
probe_test_sets = {
    'hindi': hindi_test,
    'telugu': telugu_test,
    'tamil': tamil_test
}
plot_styles = {
    'hindi': ('o', '#FF6B6B'),
    'telugu': ('s', '#4ECDC4'),
    'tamil': ('^', '#45B7D1')
}
layer_results = {lang: [] for lang in probe_test_sets}

# hidden_states holds the embedding output plus one entry per transformer layer,
# so read the depth from the model config instead of hard-coding 13.
num_layers = probe_model.config.num_hidden_layers + 1

print("\nProbing layers...")
for layer_idx in tqdm(range(num_layers)):
    layer_name = f"Layer {layer_idx}" if layer_idx > 0 else "Embeddings"

    for lang, test_df in probe_test_sets.items():
        test_subset = test_df.head(50)  # first 50 rows per language, for speed
        lang_acc = probe_layer(
            probe_model, probe_tokenizer,
            probe_train_texts, probe_train_labels,
            test_subset['text'].tolist(), test_subset['label'].tolist(),
            layer_idx, layer_name
        )
        layer_results[lang].append(lang_acc)

# Visualization
fig, ax = plt.subplots(figsize=(12, 6))

layers = list(range(num_layers))
for lang, scores in layer_results.items():
    marker, color = plot_styles[lang]
    ax.plot(layers, scores, marker=marker, linewidth=2, label=lang.capitalize(), color=color)

ax.set_xlabel('Layer Index', fontsize=12, fontweight='bold')
ax.set_ylabel('Probe Accuracy', fontsize=12, fontweight='bold')
ax.set_title('Layer-Wise Transfer Analysis: Which Layers Encode Cross-Lingual Sentiment?',
             fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)
ax.set_xticks(layers)
ax.set_xticklabels(['Emb'] + [str(i) for i in range(1, num_layers)])

# Highlight middle layers, then build the legend so the shaded span's label is included
ax.axvspan(4, 8, alpha=0.1, color='green', label='Middle Layers\n(Cross-lingual)')
ax.legend(fontsize=11)

# Keep the usual 0.4 floor, but lower it when a probe scores below that so no point is clipped
lowest_acc = min(min(scores) for scores in layer_results.values())
ax.set_ylim([max(0.0, min(0.4, lowest_acc - 0.05)), 1.0])

plt.tight_layout()
plt.savefig('layer_wise_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nтЬЕ Layer-wise analysis saved to 'layer_wise_analysis.png'")

# Analysis
print("\nЁЯТб LAYER-WISE INSIGHTS:")
print("-" * 80)
best_layers = {}
for lang, scores in layer_results.items():
    best_layer = int(np.argmax(scores))
    best_layers[lang] = best_layer
    print(f"{lang.capitalize()}:")
    print(f"  Best layer: {best_layer} ({scores[best_layer]:.4f} accuracy)")
    print(f"  Lower layers avg (0-3): {np.mean(scores[:4]):.4f}")
    print(f"  Middle layers avg (4-8): {np.mean(scores[4:9]):.4f}")
    print(f"  Upper layers avg (9-{num_layers - 1}): {np.mean(scores[9:]):.4f}")
    print()

print("\nЁЯФм RESEARCH FINDING:")
if all(4 <= best_layer <= 8 for best_layer in best_layers.values()):
    print("тЬЕ Middle layers (4-8) consistently show best cross-lingual transfer")
    print("   This confirms that semantic knowledge is language-agnostic")
else:
    print("тЪая╕П  Transfer pattern varies across languages")
    print("   Language families may influence optimal layer depth")

# Cleanup
del probe_model
torch.cuda.empty_cache()

# Save layer results (default=float converts numpy scalars json cannot encode natively)
with open('layer_wise_results.json', 'w') as f:
    json.dump(layer_results, f, indent=2, default=float)

print("\nЁЯТ╛ Layer-wise results saved to 'layer_wise_results.json'")


print("\n" + "="*80)
print("тЬЕ NOTEBOOK COMPLETE!")
print("="*80)
