In [None]:
!pip install numpy indic-nlp-library indic-transliteration langchain langchain_community faiss-cpu tqdm pandas torch transformers --quiet
!pip install sentence-transformers langchain_community tiktoken gradio openai-whisper gTTs
!pip install -U bitsandbytes --quiet
!pip install -U peft accelerate --quiet
!pip install -q datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/40.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/155.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m155.6/155.6 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m47.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m60.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m

In [None]:
import numpy as np
import pandas as pd
import os
from indic_transliteration import sanscript
from indic_transliteration.sanscript import SchemeMap, SCHEMES, transliterate
import json
import re
import gc
from tqdm.notebook import tqdm
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import pipeline, AutoModelForCausalLM, T5ForConditionalGeneration, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, Trainer, DataCollatorForLanguageModeling, AutoModelForSequenceClassification
import time
import logging
from pathlib import Path
import sentence_transformers
import random
import gradio as gr
import subprocess
import shutil
import uuid
import whisper
from gtts import gTTS
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training, PeftModel
from datasets import Dataset

In [None]:
# Notebook-wide logging: timestamped, INFO-level messages through a module logger.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- Generation model settings ---
MODEL_NAME = "Cognitive-Lab/LLama3-Gaja-Hindi-8B-v0.1"  # Hugging Face model id loaded by load_model()
USE_4BIT = True  # True -> 4-bit NF4 quantization in load_model(); False -> 8-bit
# Generation length cap, kept short to avoid role confusion.
# NOTE(review): the generation helpers visible in this notebook pass
# max_new_tokens=100 directly - confirm whether this constant is still read anywhere.
MAX_NEW_TOKENS = 150  # Even shorter to avoid role confusion
TEMPERATURE = 0.2  # Lower temperature for more predictable outputs
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # Use the GPU whenever torch can see one

# --- Retrieval (RAG) settings ---
EMBEDDING_MODEL = "ai4bharat/indic-bert"  # Model name handed to HuggingFaceEmbeddings
RAG_DATA_PATH = "processed_hindi_dialogues.json"  # Path to your JSON file
FAISS_INDEX_PATH = "hindi_dialogue_faiss_index"  # Path to save/load FAISS index

# Bilingual (English / Hindi) nudges; get_encouragement_message() picks one at random.
ENCOURAGEMENT_PHRASES = [
    "Try responding in Hindi! / हिंदी में जवाब देने की कोशिश करें!",
    "Practice makes perfect! Try some Hindi! / अभ्यास से सिद्धि! कुछ हिंदी का प्रयास करें!",
    "Even simple Hindi words help you learn! / सरल हिंदी शब्द भी आपको सीखने में मदद करते हैं!",
    "Don't worry about mistakes in Hindi! / हिंदी में गलतियों की चिंता न करें!"
]

# Practice scenarios keyed by scenario id. "role" is the character the model
# plays (the generation helpers look it up as scenarios[...]["role"]);
# "name" and "description" are human-readable labels.
scenarios = {
    "market": {
        "role": "shopkeeper",
        "name": "Market",
        "description": "Practice buying items, haggling prices, and asking about products in a typical Indian market."
    },
    "restaurant": {
        "role": "waiter",
        "name": "Restaurant",
        "description": "Practice ordering food, asking about dishes, and handling restaurant interactions in Hindi."
    }
}

In [None]:
def clear_gpu_memory():
    """Free unreachable Python objects, then release cached CUDA memory.

    Does nothing when DEVICE is not "cuda".
    """
    if DEVICE == "cuda":
        # Collect first: tensors that are only kept alive by reference cycles
        # still occupy their CUDA blocks, and empty_cache() can only release
        # blocks that are no longer occupied.
        gc.collect()
        torch.cuda.empty_cache()

def load_model():
    """Load the quantized base model, wrap it with a fresh LoRA adapter, and load its tokenizer.

    The base checkpoint MODEL_NAME is loaded with bitsandbytes quantization
    (4-bit NF4 with double quantization when USE_4BIT is true, otherwise
    8-bit), prepared for k-bit training, and wrapped with a LoRA adapter on
    the attention projections. The tokenizer's pad token falls back to its
    EOS token when none is defined.

    Returns:
        tuple: (peft-wrapped model, tokenizer)
    """
    print(f"Loading model {MODEL_NAME}...")

    # Pick the bitsandbytes settings up front so the model load below reads linearly.
    bnb_settings = (
        dict(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )
        if USE_4BIT
        else dict(load_in_8bit=True)
    )

    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=BitsAndBytesConfig(**bnb_settings),
        device_map="auto",
        torch_dtype=torch.float16,
    )

    # LoRA on the four attention projection matrices.
    adapter_settings = LoraConfig(
        r=8,
        lora_alpha=32,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
    )
    trainable_model = get_peft_model(
        prepare_model_for_kbit_training(base_model),
        adapter_settings,
    )

    text_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    if text_tokenizer.pad_token is None:
        # Padding needs some token; reuse EOS when the checkpoint defines none.
        text_tokenizer.pad_token = text_tokenizer.eos_token

    return trainable_model, text_tokenizer

def load_and_format_hindi_dialogue_dataset(file_path="processed_hindi_dialogues.json"):
    """Build (input, output) training pairs from consecutive dialogue turns.

    The JSON file is expected to hold a list of scenes, each with a
    ``metadata`` mapping that contains a ``dialogue_turns`` list. Every pair of
    adjacent turns becomes one example: the earlier turn is the input, the
    later turn is the output. Each side is rendered as two lines,
    ``speaker: roman`` followed by ``speaker: devanagari``.

    Scenes without usable metadata or turns are skipped, and so are pairs in
    which either turn lacks ``speaker``, ``text_roman`` or ``text_devanagari``.

    Args:
        file_path: Path to the processed dialogue JSON file.

    Returns:
        Dataset: a dataset with string columns ``input`` and ``output``.
    """
    required_keys = ("speaker", "text_roman", "text_devanagari")

    def _is_complete(turn):
        # A turn is usable only when every field needed for rendering is present.
        return all(key in turn for key in required_keys)

    def _render(turn):
        return (
            f'{turn["speaker"]}: {turn["text_roman"]}\n'
            f'{turn["speaker"]}: {turn["text_devanagari"]}'
        )

    with open(file_path, "r", encoding="utf-8") as f:
        raw_data = json.load(f)

    examples = []
    for scene in raw_data:
        # The previous default for a missing "metadata" was a list, which then
        # failed on .get(); treat missing or malformed metadata as "no turns".
        metadata = scene.get("metadata")
        if not isinstance(metadata, dict):
            continue
        d_turns = metadata.get("dialogue_turns") or []

        for curr, nxt in zip(d_turns, d_turns[1:]):
            if not (_is_complete(curr) and _is_complete(nxt)):
                continue
            examples.append({"input": _render(curr), "output": _render(nxt)})

    return Dataset.from_list(examples)

def tokenize_dialogue_pair(example, tokenizer, max_input_len=512, max_output_len=128):
    """Tokenize one dialogue pair as a single causal-LM training sequence.

    A decoder-only model needs ``input_ids`` and ``labels`` of the same length,
    with the reply present in ``input_ids``. The earlier version tokenized the
    reply separately through the deprecated ``as_target_tokenizer`` context and
    produced labels of a different length than the inputs. This version builds
    ``prompt + "\\n" + reply + EOS`` as one sequence and masks everything except
    the reply tokens out of the loss.

    Args:
        example: Mapping with string fields ``"input"`` and ``"output"``.
        tokenizer: Callable tokenizer returning a mapping with ``"input_ids"``;
            must expose ``pad_token_id`` and ``eos_token_id``.
        max_input_len: Maximum number of prompt tokens kept.
        max_output_len: Maximum number of reply tokens kept, EOS included.

    Returns:
        dict: ``input_ids``, ``attention_mask`` and ``labels``, each a list of
        length ``max_input_len + max_output_len``. Labels are -100 on prompt
        and padding positions.

    Raises:
        ValueError: If the tokenizer defines neither a pad nor an EOS token.

    NOTE(review): a collator that rebuilds labels from ``input_ids`` (for
    example ``DataCollatorForLanguageModeling(mlm=False)``) replaces the labels
    produced here; training then covers the whole prompt+reply sequence
    instead of the reply only. Confirm which behavior is wanted.
    """
    ignore_index = -100  # label value skipped by the cross-entropy loss

    eos_id = tokenizer.eos_token_id
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else eos_id
    if pad_id is None:
        raise ValueError("Tokenizer must define a pad token or an EOS token for padding.")

    # Prompt keeps the tokenizer's own special tokens (e.g. BOS); the newline
    # separates the prompt turn from the reply turn.
    prompt_ids = list(tokenizer(example["input"] + "\n")["input_ids"])[:max_input_len]

    # Reserve one reply slot for EOS so the model learns where to stop.
    reserve = 1 if eos_id is not None else 0
    reply_budget = max(max_output_len - reserve, 0)
    reply_ids = list(
        tokenizer(example["output"], add_special_tokens=False)["input_ids"]
    )[:reply_budget]
    if eos_id is not None and max_output_len > 0:
        reply_ids.append(eos_id)

    sequence = prompt_ids + reply_ids
    n_pad = (max_input_len + max_output_len) - len(sequence)

    return {
        "input_ids": sequence + [pad_id] * n_pad,
        "attention_mask": [1] * len(sequence) + [0] * n_pad,
        # Mask by position, not by token value: when pad == eos, the real EOS
        # at the end of the reply must still be learned.
        "labels": [ignore_index] * len(prompt_ids) + reply_ids + [ignore_index] * n_pad,
    }

def train_lora_adapter(model, tokenizer, dataset_path="/content/processed_hindi_dialogues.json"):
    """Fine-tune the LoRA-wrapped model on the dialogue pairs and save the adapter.

    Steps: load and format the dialogue dataset, tokenize each pair, run the
    Hugging Face Trainer, then save the model and the tokenizer into the
    ``gaja_hindi_lora_adapter`` directory.

    Args:
        model: Model to train (as returned by load_model()).
        tokenizer: Tokenizer matching the model.
        dataset_path: Path to the processed dialogue JSON file.
    """
    adapter_dir = "gaja_hindi_lora_adapter"

    def _encode(record):
        # Per-example tokenization; the dataset is mapped unbatched.
        return tokenize_dialogue_pair(record, tokenizer)

    print("Loading and formatting dataset...")
    dialogue_pairs = load_and_format_hindi_dialogue_dataset(dataset_path)
    encoded_pairs = dialogue_pairs.map(
        _encode,
        batched=False,
        remove_columns=dialogue_pairs.column_names,
    )

    print("Setting up training config...")
    hyperparameters = dict(
        output_dir="gaja-hindi-lora",
        gradient_accumulation_steps=4,
        num_train_epochs=3,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=10,
        save_steps=500,
        save_total_limit=2,
        report_to="none",
        remove_unused_columns=False,
    )
    training_args = TrainingArguments(**hyperparameters)

    print("Initializing Trainer...")
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=encoded_pairs,
        args=training_args,
        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    )

    print("Starting training...")
    trainer.train()

    print("Saving LoRA adapter...")
    # Model first, then tokenizer, both into the same adapter directory.
    for artifact in (model, tokenizer):
        artifact.save_pretrained(adapter_dir)
    print("Training complete.")

In [None]:
# Load the quantized, LoRA-wrapped model and its tokenizer once at notebook
# level; later cells reference these two globals instead of reloading.
lmodel, ltokenizer = load_model()

Loading model Cognitive-Lab/LLama3-Gaja-Hindi-8B-v0.1...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/703 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/169 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/591 [00:00<?, ?B/s]

In [None]:
# Train only when no saved adapter directory exists yet, so re-running the
# notebook skips the expensive fine-tuning step.
if not os.path.exists("gaja_hindi_lora_adapter"):
    train_lora_adapter(lmodel, ltokenizer)

Loading and formatting dataset...


Map:   0%|          | 0/9480 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Setting up training config...
Initializing Trainer...
Starting training...


  return fn(*args, **kwargs)


Step,Training Loss
10,3.3988
20,3.3933
30,3.4172
40,3.0625


In [None]:
class HindiLearningRAG:
    """RAG system for retrieving Hindi dialogues, idioms, and examples.

    Documents are embedded with EMBEDDING_MODEL and stored in a FAISS index
    that is saved to / loaded from FAISS_INDEX_PATH. When anything required for
    retrieval is unavailable the instance switches to "dummy mode": retrieval
    returns no examples and phrase lookups return built-in defaults.
    """

    # Cross-encoder checkpoint used by rerank_results().
    _RERANKER_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"

    # Built-in fallback phrases, used in dummy mode and to top up sparse results.
    _MARKET_PHRASES = (
        {"phrase": "Kitne ka hai?", "meaning": "How much is it?", "devanagari": "कितने का है?"},
        {"phrase": "Thoda kam kar dijiye", "meaning": "Please reduce it a little", "devanagari": "थोड़ा कम कर दीजिए"},
        {"phrase": "Badhiya maal hai", "meaning": "It's good quality", "devanagari": "बढ़िया माल है"},
    )
    _RESTAURANT_PHRASES = (
        {"phrase": "Menu dikha dijiye", "meaning": "Please show me the menu", "devanagari": "मेनू दिखा दीजिए"},
        {"phrase": "Thoda teekha hai", "meaning": "It's a bit spicy", "devanagari": "थोड़ा तीखा है"},
        {"phrase": "Bill le aayiye", "meaning": "Please bring the bill", "devanagari": "बिल ले आइए"},
    )

    def __init__(self, dummy_mode=False):
        """Initialize the RAG system with embeddings model.

        Args:
            dummy_mode: When true, skip loading the embeddings model entirely.
        """
        self.dummy_mode = dummy_mode
        # Define every attribute before the dummy-mode early return so a dummy
        # instance never raises AttributeError on attribute access.
        self.embeddings = None
        self.vector_store = None
        self.document_data = []
        self._reranker = None  # (tokenizer, model), loaded lazily by rerank_results()

        if dummy_mode:
            logger.info("Initializing dummy RAG system (no retrieval capabilities)")
            return

        logger.info(f"Initializing Hindi Learning RAG on {DEVICE}...")
        self.initialize_embeddings()
        logger.info("RAG system initialized.")

    def initialize_embeddings(self):
        """Initialize the embeddings model; fall back to dummy mode on failure."""
        if self.dummy_mode:
            return

        try:
            self.embeddings = HuggingFaceEmbeddings(
                model_name=EMBEDDING_MODEL,
                model_kwargs={},
                encode_kwargs={"normalize_embeddings": True}
            )

            logger.info("Embeddings model initialized.")
        except Exception as e:
            logger.error(f"Error initializing embeddings: {e}")
            logger.warning("Continuing in dummy mode (no retrieval capabilities)")
            self.dummy_mode = True

    def load_documents(self, file_path=RAG_DATA_PATH):
        """Load documents from JSON file.

        Returns:
            bool: True when the file was read; False otherwise (a missing file
            or a read/parse error also switches the instance to dummy mode).
        """
        if self.dummy_mode:
            return False

        if not os.path.exists(file_path):
            logger.warning(f"Data file {file_path} not found. You need to load data first.")
            self.dummy_mode = True
            return False

        try:
            logger.info(f"Loading documents from {file_path}...")
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            self.document_data = data
            logger.info(f"Loaded {len(data)} documents.")
            return True
        except Exception as e:
            logger.error(f"Error loading documents: {e}")
            self.dummy_mode = True
            return False

    def create_vector_store(self):
        """Create a FAISS vector store from loaded documents and save it to disk.

        Each loaded item must provide ``page_content`` and ``metadata``.

        Returns:
            bool: True on success; False otherwise (failures switch to dummy mode).
        """
        if self.dummy_mode or not self.document_data:
            logger.warning("No documents loaded or in dummy mode. Cannot create vector store.")
            return False

        try:
            logger.info("Creating FAISS vector store...")

            documents = [
                Document(page_content=item["page_content"], metadata=item["metadata"])
                for item in self.document_data
            ]

            # Create vector store
            self.vector_store = FAISS.from_documents(documents, self.embeddings)

            logger.info(f"Created vector store with {len(documents)} documents.")

            # Save index
            os.makedirs(FAISS_INDEX_PATH, exist_ok=True)
            self.vector_store.save_local(FAISS_INDEX_PATH)
            logger.info(f"Saved vector store to {FAISS_INDEX_PATH}")

            clear_gpu_memory()
            return True
        except Exception as e:
            logger.error(f"Error creating vector store: {e}")
            self.dummy_mode = True
            return False

    def load_vector_store(self, index_path=FAISS_INDEX_PATH):
        """Load a FAISS vector store from disk.

        A failed load no longer switches the instance to dummy mode: callers
        fall back to rebuilding the index from the documents, and that fallback
        is impossible once dummy mode is set.

        Returns:
            bool: True when the index was loaded, False otherwise.
        """
        if self.dummy_mode:
            return False

        if not os.path.exists(index_path):
            logger.warning(f"Index path {index_path} not found. Create index first.")
            return False

        try:
            logger.info(f"Loading vector store from {index_path}...")
            # SECURITY: FAISS.load_local unpickles the stored docstore. Only
            # load index directories written by this notebook's own
            # create_vector_store(); never point this at untrusted files.
            try:
                self.vector_store = FAISS.load_local(
                    index_path,
                    self.embeddings,
                    allow_dangerous_deserialization=True,
                )
            except TypeError:
                # Library versions that predate the opt-in flag reject the keyword.
                self.vector_store = FAISS.load_local(index_path, self.embeddings)
            logger.info("Vector store loaded successfully.")
            return True
        except Exception as e:
            logger.error(f"Error loading vector store: {e}")
            return False

    def _get_reranker(self):
        """Return the cached (tokenizer, model) cross-encoder pair, loading it on first use."""
        cached = getattr(self, "_reranker", None)
        if cached is None:
            ce_tokenizer = AutoTokenizer.from_pretrained(self._RERANKER_NAME)
            ce_model = AutoModelForSequenceClassification.from_pretrained(self._RERANKER_NAME).to(DEVICE)
            cached = (ce_tokenizer, ce_model)
            self._reranker = cached
        return cached

    def rerank_results(self, query, retrieved_docs, top_k=3, alpha=0.5):
        """Rerank retrieved documents with a cross-encoder blended with FAISS scores.

        Args:
            query: Query text.
            retrieved_docs: List of ``(document, faiss_distance)`` tuples.
            top_k: Number of results to return.
            alpha: Weight of the cross-encoder score; ``1 - alpha`` weights the
                FAISS-derived similarity.

        Returns:
            list: Up to ``top_k`` ``(document, combined_score)`` tuples, best
            first. In dummy mode, on empty input, or when reranking fails, the
            first ``top_k`` input tuples are returned unchanged.
        """
        if self.dummy_mode or not retrieved_docs:
            return retrieved_docs[:top_k]
        try:
            # The cross-encoder is loaded once and reused; reloading it on
            # every call dominated the cost of reranking.
            tokenizer, model = self._get_reranker()

            # Extract documents and original FAISS scores
            docs = []
            faiss_scores = []
            for doc, score in retrieved_docs:
                docs.append(doc)
                # Convert distance to similarity score (closer to 0 is better in FAISS)
                # Normalize by taking 1/(1+score) so higher is better
                faiss_scores.append(1/(1 + score))

            # Create pairs of query and document content
            pairs = [[query, doc.page_content] for doc in docs]

            # Tokenize pairs for the model
            inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors="pt").to(DEVICE)

            # Get scores from model
            with torch.no_grad():
                outputs = model(**inputs).logits
                if outputs.shape[1] > 1:
                    # For binary classification models
                    ce_scores = outputs[:, 1].cpu().numpy()
                else:
                    # For regression models
                    ce_scores = outputs.squeeze(-1).cpu().numpy()

            # Normalize both score arrays to 0-1 range
            if ce_scores.size > 0:
                ce_min, ce_max = ce_scores.min(), ce_scores.max()
                if ce_max > ce_min:
                    ce_scores = (ce_scores - ce_min) / (ce_max - ce_min)

            faiss_scores = np.array(faiss_scores)
            if faiss_scores.size > 0:
                faiss_min, faiss_max = faiss_scores.min(), faiss_scores.max()
                if faiss_max > faiss_min:
                    faiss_scores = (faiss_scores - faiss_min) / (faiss_max - faiss_min)

            # Combine scores with weighted average
            combined_scores = alpha * ce_scores + (1 - alpha) * faiss_scores

            # Pair each document with its combined score, best (highest) first
            reranked_docs = sorted(
                zip(docs, combined_scores),
                key=lambda pair: pair[1],
                reverse=True,
            )

            # Return top_k results
            return reranked_docs[:top_k]

        except Exception as e:
            logger.error(f"Error in reranking: {e}")
            # Return original results if reranking fails
            return retrieved_docs[:top_k]

    @staticmethod
    def _tags_match(metadata, key, wanted_tags):
        """Return True when no tags are requested or the document shares at least one."""
        if not wanted_tags:
            return True
        return bool(set(metadata.get(key, [])).intersection(wanted_tags))

    def retrieve_dialogue_examples(self, query, top_k=3, context_tags=None, emotion_tags=None):
        """Retrieve dialogue examples based on query and optional tags.

        Loads the saved index on first use, or builds it from the documents
        when no index can be loaded. Retrieves ``top_k * 3`` candidates so that
        tag filtering still leaves enough results.

        Args:
            query: Text to search for.
            top_k: Maximum number of examples returned.
            context_tags: Optional tags; a document must share at least one.
            emotion_tags: Optional tags; a document must share at least one.

        Returns:
            list[dict]: Examples built from document metadata.
            ``relevance_score`` is the raw FAISS score (lower is closer).
            Empty in dummy mode or on error.
        """
        if self.dummy_mode:
            return []

        if not self.vector_store:
            if not self.load_vector_store():
                logger.warning("Vector store not available. Loading documents and creating index...")
                if self.load_documents() and self.create_vector_store():
                    logger.info("Vector store created successfully.")
                else:
                    self.dummy_mode = True
                    return []

        try:
            logger.info(f"Retrieving examples for query: {query}")

            # Get base retrieval results
            retrieval_results = self.vector_store.similarity_search_with_score(query, k=top_k*3)

            # Further filter by metadata if tags are provided
            if context_tags or emotion_tags:
                retrieval_results = [
                    (doc, score)
                    for doc, score in retrieval_results
                    if self._tags_match(doc.metadata, "context_tags", context_tags)
                    and self._tags_match(doc.metadata, "emotion_tags", emotion_tags)
                ]

            # Sort by score (ascending distance) and truncate
            retrieval_results = sorted(retrieval_results, key=lambda x: x[1])[:top_k]

            # Extract dialogue turns for each document
            examples = []
            for doc, score in retrieval_results:
                examples.append({
                    "scene_description": doc.metadata.get("scene_description", ""),
                    "roman_dialogue": doc.metadata.get("roman_dialogue", ""),
                    "devanagari_dialogue": doc.metadata.get("devanagari_dialogue", ""),
                    "context_tags": doc.metadata.get("context_tags", []),
                    "emotion_tags": doc.metadata.get("emotion_tags", []),
                    "relevance_score": float(score),
                    "dialogue_turns": doc.metadata.get("dialogue_turns", [])
                })

            logger.info(f"Retrieved {len(examples)} examples.")
            return examples
        except Exception as e:
            logger.error(f"Error retrieving examples: {e}")
            return []

    @classmethod
    def _default_phrases_for(cls, context):
        """Return fresh copies of the built-in phrases matching the context.

        Restaurant phrases are used only when the context mentions
        "restaurant" and not "market"; everything else gets market phrases.
        """
        lowered = context.lower()
        if "market" not in lowered and "restaurant" in lowered:
            source = cls._RESTAURANT_PHRASES
        else:
            source = cls._MARKET_PHRASES
        # Copy so callers can mutate the result without touching the class tables.
        return [dict(phrase) for phrase in source]

    def get_hindi_phrases_for_context(self, context, top_k=3):
        """Get relevant Hindi phrases based on the context.

        In dummy mode the built-in phrases for the context are returned.
        Otherwise phrases are taken from retrieved dialogue turns of 3 to 10
        words, de-duplicated, and topped up with the built-in phrases for the
        same context when fewer than ``top_k`` were found.

        Returns:
            list[dict]: Up to ``top_k`` dicts with ``phrase``, ``devanagari``
            and ``meaning`` (empty for retrieved phrases, which carry no
            translation).
        """
        if self.dummy_mode:
            # Return default phrases for common scenarios
            return self._default_phrases_for(context)[:top_k]

        # If RAG is available, extract phrases from retrieved examples
        examples = self.retrieve_dialogue_examples(context, top_k=top_k)

        phrases = []
        for example in examples:
            # Extract short phrases from dialogue turns
            for turn in example.get("dialogue_turns", []):
                text_roman = turn.get("text_roman", "")
                text_devanagari = turn.get("text_devanagari", "")

                # Keep short phrases only (3-10 words)
                if 3 <= len(text_roman.split()) <= 10:
                    phrases.append({
                        "phrase": text_roman,
                        "devanagari": text_devanagari,
                        "meaning": ""  # We would need translation for this
                    })

        # Return unique phrases, limited to top_k
        unique_phrases = []
        seen_phrases = set()

        for phrase in phrases:
            if phrase["phrase"] not in seen_phrases:
                seen_phrases.add(phrase["phrase"])
                unique_phrases.append(phrase)

                if len(unique_phrases) >= top_k:
                    break

        # If we don't have enough phrases, add defaults that fit the context
        # (the same selection dummy mode uses).
        if len(unique_phrases) < top_k:
            for phrase in self._default_phrases_for(context):
                if len(unique_phrases) >= top_k:
                    break
                if phrase["phrase"] not in seen_phrases:
                    seen_phrases.add(phrase["phrase"])
                    unique_phrases.append(phrase)

        return unique_phrases

In [None]:
#RAG Prompting with Cross Encoder
def create_role_locked_prompt(scenario_type, rag_examples=None):
    """Build the system prompt that pins the model to a single role.

    Args:
        scenario_type: "market" or "restaurant"; any other value is treated
            as "market".
        rag_examples: Optional list of retrieved examples. For each example
            that has a non-empty ``dialogue_turns`` list, up to three turns
            are appended to the prompt as ``speaker: text_roman`` lines.

    Returns:
        str: The role-locked system prompt, followed by a reference-examples
        section whenever ``rag_examples`` is non-empty.
    """
    prompt_template = """You are a Hindi language tutor demonstrating ONLY the {role} role in a {scenario_type} conversation.

CRITICAL ROLE INSTRUCTIONS:
1. You ONLY play the {role} - NEVER respond as the customer.
2. The human user ALWAYS plays the customer.
3. NEVER continue the conversation as the customer.
4. NEVER put "Customer:" or similar labels in your responses.
5. If you notice yourself starting to respond as the customer, STOP IMMEDIATELY.

FORMAT REQUIREMENTS:
1. First line: Short response in Roman Hindi (max 2 sentences)
2. Second line: Same response in Devanagari script
3. NOTHING ELSE.

CONTENT GUIDELINES:
1. Keep responses SHORT and PRACTICAL.
2. Use authentic, everyday Hindi {scenario_type} language.
3. Don't create elaborate stories or explanations.
4. Use REALISTIC Hindi that would be spoken in a real {scenario_type}."""

    # Per-scenario role plus sample replies (the samples are kept for
    # reference; only the role feeds the prompt).
    catalog = {
        "market": {
            "role": "shopkeeper",
            "examples": [
                "Namaste ji, kya chahiye aapko?\nनमस्ते जी, क्या चाहिए आपको?",
                "Haan ji, ye taza tamatar hai. Pachaas rupaye kilo.\nहां जी, ये ताज़ा टमाटर हैं। पचास रुपये किलो।"
            ]
        },
        "restaurant": {
            "role": "waiter",
            "examples": [
                "Namaste ji, kya khaayenge aap?\nनमस्ते जी, क्या खाएंगे आप?",
                "Ji zaroor, paneer butter masala aur do roti. Kuchh aur?\nजी ज़रूर, पनीर बटर मसाला और दो रोटी। कुछ और?"
            ]
        }
    }

    # Unknown scenarios use the market prompt.
    resolved = scenario_type if scenario_type in catalog else "market"
    pieces = [
        prompt_template.format(role=catalog[resolved]["role"], scenario_type=resolved)
    ]

    if rag_examples:
        pieces.append("\n\nREFERENCE EXAMPLES (use these for authentic Hindi expressions):\n")
        for number, example in enumerate(rag_examples, start=1):
            if "dialogue_turns" not in example:
                continue
            turns = example.get("dialogue_turns", [])
            if not turns:
                continue
            pieces.append(f"Example {number}:\n")
            # Limit to 3 turns
            pieces.extend(
                f'{turn.get("speaker", "")}: {turn.get("text_roman", "")}\n'
                for turn in turns[:3]
            )
            pieces.append("\n")

    return "".join(pieces)

def generate_initial_greeting(model, tokenizer, rag_system, scenario_type="market"):
    """Generate just an initial greeting with strict role enforcement and RAG enhancement.

    Args:
        model: Causal LM used for generation.
        tokenizer: Tokenizer providing the chat template.
        rag_system: Retrieval system, or a falsy value / dummy-mode instance
            to skip retrieval.
        scenario_type: Key of the module-level ``scenarios`` mapping. Unknown
            values fall back to "market" (the same fallback the system-prompt
            builder applies) instead of raising KeyError.

    Returns:
        str: The cleaned greeting (Roman line followed by Devanagari line when
        the model produced both).
    """
    # Resolve the scenario once so tags, prompt and role all agree.
    scenario_key = scenario_type if scenario_type in scenarios else "market"
    role = scenarios[scenario_key]["role"]

    # Get relevant dialogue examples for this scenario
    context_tags = {
        "market": ["shopping", "market", "bazaar"],
        "restaurant": ["food", "restaurant"],
    }.get(scenario_key)

    examples = []
    if rag_system and not rag_system.dummy_mode:
        examples = rag_system.retrieve_dialogue_examples(
            query=f"greeting in a {scenario_key}",
            top_k=2,
            context_tags=context_tags
        )

    system_prompt = create_role_locked_prompt(scenario_key, examples)

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Start with a typical greeting a {scenario_key} {role} would use. Keep it short and authentic. ONLY respond as the {role}."}
    ]

    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # When pad and EOS share an id, generate() cannot infer the attention
    # mask, so pass both explicitly. The prompt is a single unpadded sequence,
    # hence an all-ones mask.
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id

    outputs = model.generate(
        input_ids,
        attention_mask=torch.ones_like(input_ids),
        max_new_tokens=100,  # Short to avoid role confusion
        do_sample=True,
        temperature=TEMPERATURE,
        repetition_penalty=1.2,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=pad_id,
    )

    # Decode only the newly generated tokens, not the prompt.
    response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)

    response = clean_response(response)

    clear_gpu_memory()
    return response

def continue_dialogue(model, tokenizer, rag_system, user_input, scenario_type="market", in_hindi=False):
    """Continue the conversation with enforced role boundaries and RAG enhancement.

    Args:
        model: Causal LM used for generation.
        tokenizer: Tokenizer providing the chat template.
        rag_system: Retrieval system, or a falsy value / dummy-mode instance
            to skip retrieval.
        user_input: The customer's latest message.
        scenario_type: Key of the module-level ``scenarios`` mapping. Unknown
            values fall back to "market" (the same fallback the system-prompt
            builder applies) instead of raising KeyError.
        in_hindi: When true, the system prompt asks the model to acknowledge
            the customer's use of Hindi.

    Returns:
        tuple: ``(response, examples)`` - the cleaned reply and the retrieved
        examples that were added to the prompt (empty when retrieval is off).
    """
    # Resolve the scenario once so tags, prompt and role all agree.
    scenario_key = scenario_type if scenario_type in scenarios else "market"
    role = scenarios[scenario_key]["role"]

    # Get relevant dialogue examples for this input
    examples = []
    if rag_system and not rag_system.dummy_mode:
        context_tags = {
            "market": ["shopping", "market", "bazaar"],
            "restaurant": ["food", "restaurant"],
        }.get(scenario_key)

        examples = rag_system.retrieve_dialogue_examples(
            query=user_input,
            top_k=2,
            context_tags=context_tags
        )

    system_prompt = create_role_locked_prompt(scenario_key, examples)

    if in_hindi:
        system_prompt += "\n\nNOTE: The customer is responding in Hindi, which is excellent! Encourage them by acknowledging their Hindi usage in your response."

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"The customer says: \"{user_input}\"\n\nRespond ONLY as the {role} in short, simple Hindi (both Roman and Devanagari). NEVER respond as the customer. Keep your response brief and practical."}
    ]

    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # When pad and EOS share an id, generate() cannot infer the attention
    # mask, so pass both explicitly. The prompt is a single unpadded sequence,
    # hence an all-ones mask.
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id

    outputs = model.generate(
        input_ids,
        attention_mask=torch.ones_like(input_ids),
        max_new_tokens=100,  # Short to avoid role confusion
        do_sample=True,
        temperature=TEMPERATURE,
        repetition_penalty=1.3,  # Higher to avoid repetitive patterns that might cause role confusion
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=pad_id,
    )

    # Decode only the newly generated tokens, not the prompt.
    response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
    response = clean_response(response)

    clear_gpu_memory()
    return response, examples

def clean_response(response):
    """Clean up the response to ensure proper format and remove role confusion.

    Role labels ("Shopkeeper:", "Waiter:", "Customer:", "Assistant:") are
    removed, then the first two non-empty lines are kept: by the prompt's
    format these are the Roman Hindi line and the Devanagari line.

    Args:
        response: Raw decoded model output.

    Returns:
        str: ``"<first line>\\n<second line>"`` when two non-empty lines exist;
        the single stripped line when only one exists (previously the raw
        text was returned with its surrounding whitespace and blank lines);
        an empty string when nothing remains.
    """
    without_labels = re.sub(r'(Shopkeeper|Waiter|Customer|Assistant):\s*', '', response)
    non_empty = [line.strip() for line in without_labels.split('\n') if line.strip()]
    # Anything after the second line is extra chatter and is dropped.
    return "\n".join(non_empty[:2])

def is_hindi(text):
    """Check if the text contains Hindi (either in Devanagari or romanized).

    Romanized Hindi is detected by whole-word match against a small list of
    common words. Matching whole words (not substrings) keeps English text
    such as "email" (contains "mai") or "chair" (contains "chai") from being
    classified as Hindi.

    Args:
        text: Text to inspect.

    Returns:
        bool: True when the text contains a Devanagari character or one of
        the listed romanized Hindi words; False otherwise.
    """
    # Check for Devanagari characters
    if re.search(r'[\u0900-\u097F]', text):
        return True

    hindi_romanized_words = {
        'namaste', 'dhanyavad', 'theek', 'haan', 'nahi', 'kya', 'aap', 'mai', 'tum',
        'kitna', 'rupaye', 'paisa', 'khana', 'pani', 'chai', 'acha', 'bahut', 'thoda'
    }

    # Split on anything that is not a letter so punctuation does not stick to words.
    words = re.findall(r'[a-z]+', text.lower())
    return any(word in hindi_romanized_words for word in words)

def get_encouragement_message():
    """Return one randomly chosen entry of ENCOURAGEMENT_PHRASES."""
    # `random` is imported in the notebook's import cell; no local import needed.
    return random.choice(ENCOURAGEMENT_PHRASES)

In [None]:
class HindiPracticeApp:
    """Conversation state and Gradio event handlers for the Hindi practice UI.

    The chat history is a list of ``(user_text, tutor_text)`` pairs. Tutor
    text is stored already formatted for display (see ``_format_tutor_message``),
    and the opening greeting is stored with an empty user side.
    """

    # Returned by every handler that is invoked before initialize_model() succeeds.
    _MODEL_NOT_LOADED = "Model not loaded. Please initialize the model first."

    def __init__(self):
        self.model = None
        self.tokenizer = None
        self.rag_system = None
        self.history = []
        self.scenario_type = "market"
        self.model_loaded = False
        # Set once the user has sent at least one message detected as Hindi.
        self.user_used_hindi = False
        # Number of non-Hindi turns so far; drives every-other-turn encouragement.
        self.hindi_encouragement_count = 0
        self.last_message = ""
        self.useful_phrases = []

    def initialize_model(self, progress=gr.Progress()):
        """Initialize model and RAG system with progress updates.

        Args:
            progress: Gradio progress tracker used to report each stage.

        Returns:
            A status string for display: success, "Model already loaded", or
            an error description if the model/tokenizer could not be loaded.
        """
        if self.model_loaded:
            return "Model already loaded"

        progress(0, desc="Initializing Hindi Learning System...")

        # Initialize RAG system. Any failure falls back to a dummy-mode RAG so
        # the rest of the app still has an object to call.
        try:
            progress(0.1, desc="Setting up RAG system...")
            self.rag_system = HindiLearningRAG()

            # Try to load existing vector store
            progress(0.2, desc="Loading vector store...")
            if not self.rag_system.load_vector_store():
                if os.path.exists(RAG_DATA_PATH):
                    progress(0.3, desc="Found document data, loading...")
                    if self.rag_system.load_documents(RAG_DATA_PATH):
                        progress(0.4, desc="Creating vector store...")
                        self.rag_system.create_vector_store()
                else:
                    progress(0.3, desc="RAG data not found. Using basic mode.")
                    self.rag_system.dummy_mode = True
        except Exception as e:
            progress(0.3, desc=f"Error initializing RAG: {str(e)}")
            self.rag_system = HindiLearningRAG(dummy_mode=True)

        # Load LLM model: wrap the already-loaded base model (`lmodel`) with the
        # LoRA adapter and load the adapter's tokenizer. Locals are used so that
        # a failure leaves the app state untouched and the user can retry.
        progress(0.5, desc="Loading model for conversation practice...")
        try:
            model = PeftModel.from_pretrained(lmodel, "gaja_hindi_lora_adapter")
            tokenizer = AutoTokenizer.from_pretrained("gaja_hindi_lora_adapter", trust_remote_code=True)
        except Exception as e:
            return f"Error loading model: {str(e)}"

        # Only flag the model as loaded once both pieces are actually in place;
        # otherwise an error return would coexist with model_loaded == True and
        # a second click would misleadingly answer "Model already loaded".
        self.model = model
        self.tokenizer = tokenizer
        self.model_loaded = True
        progress(0.9, desc="Model loaded successfully")

        progress(1.0, desc="Initialization complete!")
        return "Hindi Learning System initialized! Select a scenario to begin."

    def get_latest_tutor_message(self):
        """Return the formatted text of the most recent tutor turn, or "" if none."""
        for _user_text, tutor_text in reversed(self.history):
            if tutor_text:
                return tutor_text
        return ""

    def get_raw_latest_tutor_message(self):
        """Get the raw content of the most recent tutor message without formatting."""
        formatted_message = self.get_latest_tutor_message()

        # Remove the formatting markers (🗣️, 📝, etc.). Only the first two lines
        # are kept, which also drops any encouragement appended after them.
        lines = formatted_message.strip().split("\n")
        if len(lines) >= 2:
            roman = lines[0].replace("🗣️ ", "")
            devanagari = lines[1].replace("📝 ", "")
            return f"{roman}\n{devanagari}"

        return formatted_message

    def _refresh_phrases_text(self):
        """Reload useful phrases for the current scenario and render them as markdown."""
        if self.rag_system is None:
            # initialize_model() has not run yet, so there is nothing to query.
            self.useful_phrases = []
        else:
            self.useful_phrases = self.rag_system.get_hindi_phrases_for_context(
                f"{self.scenario_type} conversation", top_k=5
            )

        phrases_text = "### Useful Hindi Phrases:\n"
        for phrase in self.useful_phrases:
            phrases_text += f"- {phrase['phrase']} - {phrase.get('devanagari', '')}\n"
            if phrase.get('meaning'):
                phrases_text += f"  ({phrase['meaning']})\n"
        return phrases_text

    def _describe_scenario(self, scenario):
        """Build the markdown description shown for ``scenario``."""
        scenario_info = scenarios[scenario]
        scenario_desc = f"### {scenario_info['name']} Scenario\n{scenario_info['description']}\n\n"
        scenario_desc += f"You are practicing with a Hindi tutor who is playing the role of a {scenario_info['role']}."
        return scenario_desc

    def _start_with_greeting(self):
        """Generate a greeting for the current scenario and make it the whole history."""
        greeting = generate_initial_greeting(self.model, self.tokenizer, self.rag_system, self.scenario_type)
        self.history = [("", self._format_tutor_message(greeting))]

    def change_scenario(self, scenario):
        """Change the conversation scenario.

        Returns:
            ``(history, scenario_markdown, phrases_markdown)`` for the chatbot,
            scenario description and useful-phrases components.
        """
        self.scenario_type = scenario
        self.history = []
        self.user_used_hindi = False
        self.hindi_encouragement_count = 0

        # Safe even before initialization: yields just the header in that case.
        phrases_text = self._refresh_phrases_text()

        if not self.model_loaded:
            return [], self._MODEL_NOT_LOADED, phrases_text

        self._start_with_greeting()
        return self.history, self._describe_scenario(scenario), phrases_text

    def _format_tutor_message(self, message):
        """Format the tutor's message for better display."""
        lines = message.strip().split("\n")
        if len(lines) >= 2:
            roman = lines[0]
            devanagari = lines[1]
            return f"🗣️ {roman}\n📝 {devanagari}"
        return message

    def _format_user_message(self, message):
        """Format the user's message."""
        return f"👤 {message}"

    def chat(self, user_input, history):
        """Process user input and continue the dialogue.

        Args:
            user_input: Text the user submitted.
            history: Current chatbot history as supplied by the UI; ``None``
                is treated as an empty history.

        Returns:
            The updated history list.
        """
        history = history or []

        if not self.model_loaded:
            return history + [(self._format_user_message(user_input), self._MODEL_NOT_LOADED)]

        if not user_input:
            return history

        current_input_in_hindi = is_hindi(user_input)
        if current_input_in_hindi:
            self.user_used_hindi = True

        response, examples = continue_dialogue(
            self.model, self.tokenizer, self.rag_system, user_input,
            self.scenario_type, in_hindi=current_input_in_hindi
        )

        formatted_user_input = self._format_user_message(user_input)
        formatted_response = self._format_tutor_message(response)

        # Until the user has tried Hindi, nudge them on every other turn.
        if not self.user_used_hindi and self.hindi_encouragement_count % 2 == 0:
            encouragement = get_encouragement_message()
            formatted_response += f"\n\n💡 {encouragement}"

        if not self.user_used_hindi:
            self.hindi_encouragement_count += 1

        updated_history = history + [(formatted_user_input, formatted_response)]
        self.history = updated_history

        return updated_history

    def reset_conversation(self):
        """Reset the conversation for the current scenario.

        Returns:
            ``(history, scenario_markdown, phrases_markdown)``. All three
            values are returned on every path because the reset button is
            wired to three output components.
        """
        if not self.model_loaded:
            return [], self._MODEL_NOT_LOADED, self._refresh_phrases_text()

        self.user_used_hindi = False
        self.hindi_encouragement_count = 0

        # Start over with a freshly generated greeting for the same scenario.
        self._start_with_greeting()
        scenario_desc = self._describe_scenario(self.scenario_type)
        phrases_text = self._refresh_phrases_text()

        return self.history, scenario_desc, phrases_text

In [None]:
class HindiSpeechRecognizer:
    """Thin wrapper around a Whisper model that transcribes audio as Hindi."""

    def __init__(self, model_size="medium"):
        """Load the Whisper model named by ``model_size``."""
        self.model = whisper.load_model(model_size)

    def transcribe(self, audio_path):
        """Transcribe the audio file at ``audio_path`` with the language fixed to Hindi.

        Args:
            audio_path: Path to the recorded audio file. ``None`` or an empty
                string (nothing was recorded) yields an empty transcript
                instead of being passed on to the model.

        Returns:
            The ``"text"`` field of the model's transcription result, or ``""``
            when there is no audio to transcribe.
        """
        if not audio_path:
            return ""
        result = self.model.transcribe(audio_path, language="hi")
        return result["text"]

    def transcribe_and_fill(self, audio_path):
        """Event-handler alias for ``transcribe``; the result fills the input textbox."""
        return self.transcribe(audio_path)

In [None]:
def create_gradio_interface():
    """Build the Gradio Blocks UI for Hindi conversation practice.

    Creates a ``HindiPracticeApp`` and a ``HindiSpeechRecognizer`` (which loads
    the "medium" Whisper model immediately), lays out the controls, and wires
    the event handlers.

    Returns:
        ``(interface, app)``: the unlaunched ``gr.Blocks`` object and the
        ``HindiPracticeApp`` instance backing it.
    """
    import tempfile

    app = HindiPracticeApp()
    recognizer = HindiSpeechRecognizer(model_size="medium")

    def tutor_speak_tts():
        """Synthesize the latest tutor message to an MP3 and return its path."""
        message = app.get_raw_latest_tutor_message()
        if not message:
            return None
        tts = gTTS(message, lang="hi")
        # Write into the system temp directory rather than a Kaggle-specific
        # path: "/kaggle/working" does not exist on Colab or a local machine.
        # A fresh file per call avoids overwriting audio that is still being
        # served; the files are not cleaned up here.
        with tempfile.NamedTemporaryFile(prefix="tutor_speak_", suffix=".mp3", delete=False) as audio_file:
            audio_path = audio_file.name
        tts.save(audio_path)
        return audio_path

    with gr.Blocks(title="Hindi Conversation Practice") as interface:
        gr.Markdown("# Hindi Conversation Practice")
        gr.Markdown("Practice speaking Hindi in realistic scenarios with this AI tutor.")

        with gr.Row():
            # Left column: setup controls and reference material.
            with gr.Column(scale=1):
                init_button = gr.Button("Initialize System", variant="primary")
                scenario_selector = gr.Radio(
                    choices=list(scenarios.keys()),
                    label="Choose a scenario",
                    value="market"
                )
                reset_button = gr.Button("Reset Conversation")

                scenario_description = gr.Markdown("Select a scenario and initialize the system to begin.")
                useful_phrases = gr.Markdown("Useful phrases will appear here.")

                with gr.Accordion("About This App", open=False):
                    gr.Markdown("""
                    This app helps you practice Hindi conversation in realistic scenarios.

                    How to use:
                    1. Click "Initialize System" to load the AI tutor
                    2. Choose a scenario to practice
                    3. Respond to the AI tutor's messages
                    4. Try using Hindi phrases in your responses!

                    The AI will play the role of a shopkeeper or waiter and provide responses in both Roman and Devanagari script.
                    """)

            # Right column: the conversation itself.
            with gr.Column(scale=2):
                chatbot = gr.Chatbot(
                    height=500,
                    show_label=False,
                    elem_id="hindi_chatbot"
                )
                with gr.Row():
                    user_input = gr.Textbox(
                        placeholder="Type your response here...",
                        show_label=False,
                        scale=10
                    )
                    mic_button = gr.Button("🎤", size="sm", scale=1)

            # Hidden until the mic button is pressed; hidden again after submit.
            with gr.Column(visible=False) as audio_popup:
                gr.Markdown("### Speak now")
                audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Speak")

                submit_audio = gr.Button("Submit")

            tutor_speak = gr.Button("Tutor Speak")
            # NOTE(review): this player is created hidden and never made visible;
            # confirm autoplay still fires for a non-visible component.
            tutor_audio = gr.Audio(label="Tutor Speaking", autoplay=True, visible=False)

        # Set up event handlers
        init_button.click(
            app.initialize_model,
            outputs=[scenario_description]
        )
        scenario_selector.change(
            app.change_scenario,
            inputs=[scenario_selector],
            outputs=[chatbot, scenario_description, useful_phrases]
        )
        reset_button.click(
            app.reset_conversation,
            outputs=[chatbot, scenario_description, useful_phrases]
        )
        user_input.submit(
            app.chat,
            inputs=[user_input, chatbot],
            outputs=[chatbot]
        ).then(
            lambda: "",  # Clear input after sending
            outputs=[user_input]
        )
        mic_button.click(
            lambda: gr.update(visible=True),
            outputs=[audio_popup]
        )
        # The transcript is placed in the textbox; the user still presses Enter
        # to send it.
        submit_audio.click(
            recognizer.transcribe_and_fill,
            inputs=[audio_input],
            outputs=[user_input]
        ).then(
            lambda: gr.update(visible=False),
            outputs=[audio_popup]
        )
        tutor_speak.click(
            tutor_speak_tts,
            outputs=[tutor_audio]
        )

    return interface, app

In [None]:
def main():
    """Build the practice UI and serve it on all interfaces with a public share link."""
    launch_options = {
        "share": True,            # also create a public gradio.live URL
        "server_name": "0.0.0.0",
        "server_port": 7867,
    }

    # The app object is returned alongside the UI but is not needed here.
    ui, _app = create_gradio_interface()
    ui.launch(**launch_options)

In [None]:
# Build the Gradio interface and start serving it (see main() above).
main()

100%|█████████████████████████████████████| 1.42G/1.42G [01:51<00:00, 13.7MiB/s]
  chatbot = gr.Chatbot(


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://97e5ce37cd61f3cf5c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
