<a href="https://colab.research.google.com/github/dope232/GenAI-Project/blob/main/Pranav_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U bitsandbytes --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m57.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:

# No rag based prompting
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import gc
import time
import re


MODEL_NAME = "Cognitive-Lab/LLama3-Gaja-Hindi-8B-v0.1"
USE_4BIT = False
MAX_NEW_TOKENS = 150  # Even shorter to avoid role confusion
TEMPERATURE = 0.2  # Lower temperature for more predictable outputs
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def clear_gpu_memory():
    """Clear GPU memory to prevent OOM errors."""
    if DEVICE == "cuda":
        torch.cuda.empty_cache()
        gc.collect()

def load_model():
    """Load the LLM with quantization."""
    print(f"Loading model {MODEL_NAME}...")

    # Configure quantization
    if USE_4BIT:
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4"
        )
    else:
        quantization_config = BitsAndBytesConfig(
            load_in_8bit=True
        )

    # Load model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=quantization_config,
        device_map="auto",
        torch_dtype=torch.float16
    )

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

    # Set pad token if not set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

def create_role_locked_prompt(scenario_type):
    """Create a prompt that strictly enforces the assistant's role."""
    scenarios = {
        "market": {
            "system": """You are a Hindi language tutor demonstrating ONLY the shopkeeper role in a market conversation.

CRITICAL ROLE INSTRUCTIONS:
1. You ONLY play the shopkeeper - NEVER respond as the customer.
2. The human user ALWAYS plays the customer.
3. NEVER continue the conversation as the customer.
4. NEVER put "Customer:" or similar labels in your responses.
5. If you notice yourself starting to respond as the customer, STOP IMMEDIATELY.

FORMAT REQUIREMENTS:
1. First line: Short response in Roman Hindi (max 2 sentences)
2. Second line: Same response in Devanagari script
3. NOTHING ELSE.

CONTENT GUIDELINES:
1. Keep responses SHORT and PRACTICAL.
2. Use authentic, everyday Hindi marketplace language.
3. Don't create elaborate stories or explanations.

Start with a simple greeting a shopkeeper would use in Hindi.""",
            "examples": [
                "Namaste ji, kya chahiye aapko?\nनमस्ते जी, क्या चाहिए आपको?",
                "Haan ji, ye taza tamatar hai. Pachaas rupaye kilo.\nहां जी, ये ताज़ा टमाटर हैं। पचास रुपये किलो।"
            ]
        },
        "restaurant": {
            "system": """You are a Hindi language tutor demonstrating ONLY the waiter role in a restaurant conversation.

CRITICAL ROLE INSTRUCTIONS:
1. You ONLY play the waiter - NEVER respond as the customer.
2. The human user ALWAYS plays the customer.
3. NEVER continue the conversation as the customer.
4. NEVER put "Customer:" or similar labels in your responses.
5. If you notice yourself starting to respond as the customer, STOP IMMEDIATELY.

FORMAT REQUIREMENTS:
1. First line: Short response in Roman Hindi (max 2 sentences)
2. Second line: Same response in Devanagari script
3. NOTHING ELSE.

CONTENT GUIDELINES:
1. Keep responses SHORT and PRACTICAL.
2. Use authentic, everyday Hindi restaurant language.
3. Don't create elaborate stories or explanations.

Start with a simple greeting a waiter would use in Hindi.""",
            "examples": [
                "Namaste ji, kya khaayenge aap?\nनमस्ते जी, क्या खाएंगे आप?",
                "Ji zaroor, paneer butter masala aur do roti. Kuchh aur?\nजी ज़रूर, पनीर बटर मसाला और दो रोटी। कुछ और?"
            ]
        }
    }

    return scenarios.get(scenario_type, scenarios["market"])

def generate_initial_greeting(model, tokenizer, scenario_type="market"):
    """Generate just an initial greeting with strict role enforcement."""
    scenario_data = create_role_locked_prompt(scenario_type)


    messages = [
        {"role": "system", "content": scenario_data["system"]},
        {"role": "user", "content": "Start with a typical greeting a shopkeeper/waiter would use. Keep it short and authentic. ONLY respond as the shopkeeper/waiter."}
    ]

    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    outputs = model.generate(
        input_ids,
        max_new_tokens=100,  # Short to avoid role confusion
        do_sample=True,
        temperature=TEMPERATURE,
        repetition_penalty=1.2,
        eos_token_id=tokenizer.eos_token_id,
    )

    response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)

    response = clean_response(response)

    clear_gpu_memory()
    return response

def continue_dialogue(model, tokenizer, user_input, scenario_type="market"):
    """Continue the conversation with enforced role boundaries."""
    scenario_data = create_role_locked_prompt(scenario_type)


    system_message = scenario_data["system"] + "\n\nREMEMBER: You are ONLY the shopkeeper/waiter. DO NOT respond as the customer. DO NOT continue the conversation as both roles."


    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": f"The customer says: \"{user_input}\"\n\nRespond ONLY as the shopkeeper/waiter in short, simple Hindi (both Roman and Devanagari). NEVER respond as the customer. Keep your response brief and practical."}
    ]


    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    outputs = model.generate(
        input_ids,
        max_new_tokens=100,  # Short to avoid role confusion
        do_sample=True,
        temperature=TEMPERATURE,
        repetition_penalty=1.3,  # Higher to avoid repetitive patterns that might cause role confusion
        eos_token_id=tokenizer.eos_token_id,
    )

    response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)


    response = clean_response(response)

    clear_gpu_memory()
    return response

def clean_response(response):
    """Clean up the response to ensure proper format and remove role confusion."""

    response = re.sub(r'(Shopkeeper|Waiter|Customer|Assistant):\s*', '', response)

    lines = response.split('\n')
    cleaned_lines = []

    roman_line = ""
    devanagari_line = ""

    # Simple heuristic - first non-empty line is Roman, second is Devanagari
    for line in lines:
        if line.strip():
            if not roman_line:
                roman_line = line.strip()
            elif not devanagari_line:
                devanagari_line = line.strip()
                break

    # If we found both lines, use them
    if roman_line and devanagari_line:
        return f"{roman_line}\n{devanagari_line}"

    # If we didn't find a clear structure, just return the cleaned original
    return response

# Main conversation function
def run_hindi_practice(scenario_type="market"):
    print(f"Loading model for {scenario_type} conversation practice...")
    model, tokenizer = load_model()

    # Generate initial greeting
    greeting = generate_initial_greeting(model, tokenizer, scenario_type)

    # Print setup information
    print(f"\n{'='*50}")
    print(f"HINDI {scenario_type.upper()} PRACTICE")
    print(f"{'='*50}\n")
    print("Tutor (as shopkeeper/waiter):")
    print(greeting)
    print("\n" + "-"*50)

    # Interactive loop
    while True:
        user_input = input("You (as customer): ")

        if user_input.lower() in ['exit', 'quit', 'bye']:
            print("\nConversation ended. Dhanyavaad! (Thank you!)")
            break

        # Generate response with strict role enforcement
        response = continue_dialogue(model, tokenizer, user_input, scenario_type)

        # Print response
        print("\nTutor (as shopkeeper/waiter):")
        print(response)
        print("\n" + "-"*50)

    # Clean up
    del model
    clear_gpu_memory()

# Usage example
if __name__ == "__main__":
    scenario = "market"  # or "restaurant"
    run_hindi_practice(scenario)

Loading model for market conversation practice...
Loading model Cognitive-Lab/LLama3-Gaja-Hindi-8B-v0.1...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



HINDI MARKET PRACTICE

Tutor (as shopkeeper/waiter):
Namaste! नमस्ते!
Please come inside and take a look around at our products. We have everything from fresh fruits to dry goods like rice and spices.user

--------------------------------------------------
You (as customer): i want 2 of these soaps


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



Tutor (as shopkeeper/waiter):
Kya apko yeh soap accha lagta hai?
क्या आप ये सोप अच्छा लगता है?user

--------------------------------------------------
You (as customer): yes these ones are loved by my family, so we only buy these.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



Tutor (as shopkeeper/waiter):
Aapke ghar ki sabziyan acchi lagti hai kya?
आपके घर की सब्ज़ियाँ अच्छ लगती हैं क्या?user

--------------------------------------------------
You (as customer): no, that's all I want. how much will the total be?


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



Tutor (as shopkeeper/waiter):
Aapke liye kya lagta hai? ₹250 ke upar hoga.
आपके लिए क्या लगता है? रुपये दो सौ पचास और ऊपर लगेगा.user

--------------------------------------------------
You (as customer): Can I pay through cash?


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



Tutor (as shopkeeper/waiter):
Haal kya hai? Cash pe accept kar sakte hain.
हाल क्या है? नकद पर स्वीकार कर सकते हैं।user

--------------------------------------------------
You (as customer): here you go.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



Tutor (as shopkeeper/waiter):
Kya aapko kuch aur chahiye?
क्या आप कुछ और चाहते हैं?user

--------------------------------------------------
You (as customer): no thats it, thank you


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



Tutor (as shopkeeper/waiter):
Khushkhabri ho gaya! Aapka bhi khushiya hai?
खुश ख़बरी हो गया! आप के भी खुषियाँ हैं?user

--------------------------------------------------


KeyboardInterrupt: Interrupted by user

In [None]:
!pip install -U sentence-transformers --quiet
!pip install -U langchain_community --quiet
!pip install -U faiss-cpu --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
#Includes RAG

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import json
import os
import re
import gc
import logging
import time
from pathlib import Path
import numpy as np
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document


logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


MODEL_NAME = "Cognitive-Lab/LLama3-Gaja-Hindi-8B-v0.1"
USE_4BIT = False
MAX_NEW_TOKENS = 150
TEMPERATURE = 0.2
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

EMBEDDING_MODEL = "ai4bharat/indic-bert"
RAG_DATA_PATH = "processed_hindi_dialogues.json"  # Path to your JSON file
FAISS_INDEX_PATH = "hindi_dialogue_faiss_index"  # Path to save/load FAISS index

ENCOURAGEMENT_PHRASES = [
    "Try responding in Hindi! / हिंदी में जवाब देने की कोशिश करें!",
    "Practice makes perfect! Try some Hindi! / अभ्यास से सिद्धि! कुछ हिंदी का प्रयास करें!",
    "Even simple Hindi words help you learn! / सरल हिंदी शब्द भी आपको सीखने में मदद करते हैं!",
    "Don't worry about mistakes in Hindi! / हिंदी में गलतियों की चिंता न करें!"
]

def clear_gpu_memory():
    """Clear GPU memory to prevent OOM errors."""
    if DEVICE == "cuda":
        torch.cuda.empty_cache()
        gc.collect()

# RAG System Implementation
class HindiLearningRAG:
    """RAG system for retrieving Hindi dialogues, idioms, and examples."""

    def __init__(self, dummy_mode=False):
        """Initialize the RAG system with embeddings model."""
        self.dummy_mode = dummy_mode
        if dummy_mode:
            logger.info("Initializing dummy RAG system (no retrieval capabilities)")
            return

        logger.info(f"Initializing Hindi Learning RAG on {DEVICE}...")
        self.embeddings = None
        self.vector_store = None
        self.document_data = []
        self.initialize_embeddings()
        logger.info("RAG system initialized.")

    def initialize_embeddings(self):
        """Initialize the embeddings model."""
        if self.dummy_mode:
            return

        try:
            # Install sentence-transformers if needed
            try:
                import sentence_transformers
            except ImportError:
                print("Installing sentence-transformers...")
                import subprocess
                subprocess.check_call(["pip", "install", "-q", "sentence-transformers"])
                import sentence_transformers

            # Set up embeddings with model quantization if on GPU
            model_kwargs = {
                "device": DEVICE
            }


            self.embeddings = HuggingFaceEmbeddings(
                model_name=EMBEDDING_MODEL,
                model_kwargs=model_kwargs,
                encode_kwargs={"normalize_embeddings": True}
            )

            logger.info("Embeddings model initialized.")
        except Exception as e:
            logger.error(f"Error initializing embeddings: {e}")
            logger.warning("Continuing in dummy mode (no retrieval capabilities)")
            self.dummy_mode = True

    def load_documents(self, file_path=RAG_DATA_PATH):
        """Load documents from JSON file."""
        if self.dummy_mode:
            return False

        if not os.path.exists(file_path):
            logger.warning(f"Data file {file_path} not found. You need to load data first.")
            self.dummy_mode = True
            return False

        try:
            logger.info(f"Loading documents from {file_path}...")
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            self.document_data = data
            logger.info(f"Loaded {len(data)} documents.")
            return True
        except Exception as e:
            logger.error(f"Error loading documents: {e}")
            self.dummy_mode = True
            return False

    def create_vector_store(self):
        """Create a FAISS vector store from loaded documents."""
        if self.dummy_mode or not self.document_data:
            logger.warning("No documents loaded or in dummy mode. Cannot create vector store.")
            return False

        try:
            logger.info("Creating FAISS vector store...")


            documents = []
            for item in self.document_data:
                doc = Document(
                    page_content=item["page_content"],
                    metadata=item["metadata"]
                )
                documents.append(doc)

            # Create vector store
            self.vector_store = FAISS.from_documents(documents, self.embeddings)

            logger.info(f"Created vector store with {len(documents)} documents.")

            # Save index
            if not os.path.exists(FAISS_INDEX_PATH):
                os.makedirs(FAISS_INDEX_PATH)
            self.vector_store.save_local(FAISS_INDEX_PATH)
            logger.info(f"Saved vector store to {FAISS_INDEX_PATH}")

            clear_gpu_memory()
            return True
        except Exception as e:
            logger.error(f"Error creating vector store: {e}")
            self.dummy_mode = True
            return False

    def load_vector_store(self, index_path=FAISS_INDEX_PATH):
        """Load a FAISS vector store from disk."""
        if self.dummy_mode:
            return False

        if not os.path.exists(index_path):
            logger.warning(f"Index path {index_path} not found. Create index first.")
            return False

        try:
            logger.info(f"Loading vector store from {index_path}...")
            self.vector_store = FAISS.load_local(index_path, self.embeddings)
            logger.info("Vector store loaded successfully.")
            return True
        except Exception as e:
            logger.error(f"Error loading vector store: {e}")
            self.dummy_mode = True
            return False

    def retrieve_dialogue_examples(self, query, top_k=3, context_tags=None, emotion_tags=None):
        """Retrieve dialogue examples based on query and optional tags."""
        if self.dummy_mode:
            return []

        if not self.vector_store:
            if not self.load_vector_store():
                logger.warning("Vector store not available. Loading documents and creating index...")
                if self.load_documents() and self.create_vector_store():
                    logger.info("Vector store created successfully.")
                else:
                    self.dummy_mode = True
                    return []

        try:
            logger.info(f"Retrieving examples for query: {query}")

            # Get base retrieval results
            retrieval_results = self.vector_store.similarity_search_with_score(query, k=top_k*3)

            # Further filter by metadata if tags are provided
            if context_tags or emotion_tags:
                filtered_results = []
                for doc, score in retrieval_results:
                    metadata = doc.metadata

                    # Check context tags
                    context_match = True
                    if context_tags:
                        doc_context = set(metadata.get("context_tags", []))
                        query_context = set(context_tags)
                        context_match = bool(doc_context.intersection(query_context))

                    # Check emotion tags
                    emotion_match = True
                    if emotion_tags:
                        doc_emotion = set(metadata.get("emotion_tags", []))
                        query_emotion = set(emotion_tags)
                        emotion_match = bool(doc_emotion.intersection(query_emotion))

                    if context_match and emotion_match:
                        filtered_results.append((doc, score))

                retrieval_results = filtered_results

            # Sort by score and truncate
            retrieval_results = sorted(retrieval_results, key=lambda x: x[1])[:top_k]

            # Extract dialogue turns for each document
            examples = []
            for doc, score in retrieval_results:
                example = {
                    "scene_description": doc.metadata.get("scene_description", ""),
                    "roman_dialogue": doc.metadata.get("roman_dialogue", ""),
                    "devanagari_dialogue": doc.metadata.get("devanagari_dialogue", ""),
                    "context_tags": doc.metadata.get("context_tags", []),
                    "emotion_tags": doc.metadata.get("emotion_tags", []),
                    "relevance_score": float(score),
                    "dialogue_turns": doc.metadata.get("dialogue_turns", [])
                }
                examples.append(example)

            logger.info(f"Retrieved {len(examples)} examples.")
            return examples
        except Exception as e:
            logger.error(f"Error retrieving examples: {e}")
            return []

    def get_hindi_phrases_for_context(self, context, top_k=3):
        """Get relevant Hindi phrases based on the context."""
        if self.dummy_mode:
            # Return default phrases for common scenarios
            market_phrases = [
                {"phrase": "Kitne ka hai?", "meaning": "How much is it?", "devanagari": "कितने का है?"},
                {"phrase": "Thoda kam kar dijiye", "meaning": "Please reduce it a little", "devanagari": "थोड़ा कम कर दीजिए"},
                {"phrase": "Badhiya maal hai", "meaning": "It's good quality", "devanagari": "बढ़िया माल है"}
            ]

            restaurant_phrases = [
                {"phrase": "Menu dikha dijiye", "meaning": "Please show me the menu", "devanagari": "मेनू दिखा दीजिए"},
                {"phrase": "Thoda teekha hai", "meaning": "It's a bit spicy", "devanagari": "थोड़ा तीखा है"},
                {"phrase": "Bill le aayiye", "meaning": "Please bring the bill", "devanagari": "बिल ले आइए"}
            ]

            if "market" in context.lower():
                return market_phrases[:top_k]
            elif "restaurant" in context.lower():
                return restaurant_phrases[:top_k]
            else:
                return market_phrases[:top_k]  # Default to market

        # If RAG is available, extract phrases from retrieved examples
        examples = self.retrieve_dialogue_examples(context, top_k=top_k)

        phrases = []
        for example in examples:
            dialogue_turns = example.get("dialogue_turns", [])

            # Extract short phrases from dialogue turns
            for turn in dialogue_turns:
                text_roman = turn.get("text_roman", "")
                text_devanagari = turn.get("text_devanagari", "")

                # Look for short phrases (3-5 words)
                words = text_roman.split()
                if 3 <= len(words) <= 10:
                    phrases.append({
                        "phrase": text_roman,
                        "devanagari": text_devanagari,
                        "meaning": ""  # We would need translation for this
                    })

        # Return unique phrases, limited to top_k
        unique_phrases = []
        seen_phrases = set()

        for phrase in phrases:
            if phrase["phrase"] not in seen_phrases:
                seen_phrases.add(phrase["phrase"])
                unique_phrases.append(phrase)

                if len(unique_phrases) >= top_k:
                    break

        # If we don't have enough phrases, add default ones
        if len(unique_phrases) < top_k:
            default_phrases = [
                {"phrase": "Kitne ka hai?", "meaning": "How much is it?", "devanagari": "कितने का है?"},
                {"phrase": "Thoda kam kar dijiye", "meaning": "Please reduce it a little", "devanagari": "थोड़ा कम कर दीजिए"},
                {"phrase": "Badhiya maal hai", "meaning": "It's good quality", "devanagari": "बढ़िया माल है"}
            ]

            for phrase in default_phrases:
                if phrase["phrase"] not in seen_phrases and len(unique_phrases) < top_k:
                    seen_phrases.add(phrase["phrase"])
                    unique_phrases.append(phrase)

        return unique_phrases

# LLM Functions
def load_model():
    """Load the LLM with quantization."""
    print(f"Loading model {MODEL_NAME}...")

    # Configure quantization
    if USE_4BIT:
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4"
        )
    else:
        quantization_config = BitsAndBytesConfig(
            load_in_8bit=True
        )

    # Load model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=quantization_config,
        device_map="auto",
        torch_dtype=torch.float16
    )

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

    # Set pad token if not set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

def create_role_locked_prompt(scenario_type, rag_examples=None):
    """Create a prompt that strictly enforces the assistant's role, enhanced with RAG examples."""
    base_system = """You are a Hindi language tutor demonstrating ONLY the {role} role in a {scenario_type} conversation.

CRITICAL ROLE INSTRUCTIONS:
1. You ONLY play the {role} - NEVER respond as the customer.
2. The human user ALWAYS plays the customer.
3. NEVER continue the conversation as the customer.
4. NEVER put "Customer:" or similar labels in your responses.
5. If you notice yourself starting to respond as the customer, STOP IMMEDIATELY.

FORMAT REQUIREMENTS:
1. First line: Short response in Roman Hindi (max 2 sentences)
2. Second line: Same response in Devanagari script
3. NOTHING ELSE.

CONTENT GUIDELINES:
1. Keep responses SHORT and PRACTICAL.
2. Use authentic, everyday Hindi {scenario_type} language.
3. Don't create elaborate stories or explanations.
4. Use REALISTIC Hindi that would be spoken in a real {scenario_type}."""

    scenarios = {
        "market": {
            "role": "shopkeeper",
            "system": base_system.format(role="shopkeeper", scenario_type="market"),
            "examples": [
                "Namaste ji, kya chahiye aapko?\nनमस्ते जी, क्या चाहिए आपको?",
                "Haan ji, ye taza tamatar hai. Pachaas rupaye kilo.\nहां जी, ये ताज़ा टमाटर हैं। पचास रुपये किलो।"
            ]
        },
        "restaurant": {
            "role": "waiter",
            "system": base_system.format(role="waiter", scenario_type="restaurant"),
            "examples": [
                "Namaste ji, kya khaayenge aap?\nनमस्ते जी, क्या खाएंगे आप?",
                "Ji zaroor, paneer butter masala aur do roti. Kuchh aur?\nजी ज़रूर, पनीर बटर मसाला और दो रोटी। कुछ और?"
            ]
        }
    }

    scenario_info = scenarios.get(scenario_type, scenarios["market"])

    # Add RAG examples if available
    rag_content = ""
    if rag_examples:
        rag_content = "\n\nREFERENCE EXAMPLES (use these for authentic Hindi expressions):\n"
        for i, example in enumerate(rag_examples):
            if "dialogue_turns" in example:
                # Extract a few turns for examples
                turns = example.get("dialogue_turns", [])
                if turns:
                    rag_content += f"Example {i+1}:\n"
                    for j, turn in enumerate(turns[:3]):  # Limit to 3 turns
                        speaker = turn.get("speaker", "")
                        text = turn.get("text_roman", "")
                        rag_content += f"{speaker}: {text}\n"
                    rag_content += "\n"

    return scenario_info["system"] + rag_content

def generate_initial_greeting(model, tokenizer, rag_system, scenario_type="market"):
    """Generate just an initial greeting with strict role enforcement and RAG enhancement."""
    # Get relevant dialogue examples for this scenario
    context_tags = None
    if scenario_type == "market":
        context_tags = ["shopping", "market", "bazaar"]
    elif scenario_type == "restaurant":
        context_tags = ["food", "restaurant"]

    examples = []
    if rag_system and not rag_system.dummy_mode:
        examples = rag_system.retrieve_dialogue_examples(
            query=f"greeting in a {scenario_type}",
            top_k=2,
            context_tags=context_tags
        )


    system_prompt = create_role_locked_prompt(scenario_type, examples)

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Start with a typical greeting a {scenario_type} {scenarios[scenario_type]['role']} would use. Keep it short and authentic. ONLY respond as the {scenarios[scenario_type]['role']}."}
    ]

    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    outputs = model.generate(
        input_ids,
        max_new_tokens=100,  # Short to avoid role confusion
        do_sample=True,
        temperature=TEMPERATURE,
        repetition_penalty=1.2,
        eos_token_id=tokenizer.eos_token_id,
    )

    response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)

    response = clean_response(response)

    clear_gpu_memory()
    return response

def continue_dialogue(model, tokenizer, rag_system, user_input, scenario_type="market", in_hindi=False):
    """Continue the conversation with enforced role boundaries and RAG enhancement."""
    # Get relevant dialogue examples for this input
    examples = []
    if rag_system and not rag_system.dummy_mode:
        context_tags = None
        if scenario_type == "market":
            context_tags = ["shopping", "market", "bazaar"]
        elif scenario_type == "restaurant":
            context_tags = ["food", "restaurant"]

        examples = rag_system.retrieve_dialogue_examples(
            query=user_input,
            top_k=2,
            context_tags=context_tags
        )


    system_prompt = create_role_locked_prompt(scenario_type, examples)


    if in_hindi:
        system_prompt += "\n\nNOTE: The customer is responding in Hindi, which is excellent! Encourage them by acknowledging their Hindi usage in your response."

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"The customer says: \"{user_input}\"\n\nRespond ONLY as the {scenarios[scenario_type]['role']} in short, simple Hindi (both Roman and Devanagari). NEVER respond as the customer. Keep your response brief and practical."}
    ]


    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    outputs = model.generate(
        input_ids,
        max_new_tokens=100,  # Short to avoid role confusion
        do_sample=True,
        temperature=TEMPERATURE,
        repetition_penalty=1.3,  # Higher to avoid repetitive patterns that might cause role confusion
        eos_token_id=tokenizer.eos_token_id,
    )

    response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)

    # Clean up response to ensure proper format
    response = clean_response(response)

    clear_gpu_memory()
    return response, examples

def clean_response(response):
    """Clean up the response to ensure proper format and remove role confusion."""
    # Remove any "Shopkeeper:" or "Waiter:" or "Customer:" labels
    response = re.sub(r'(Shopkeeper|Waiter|Customer|Assistant):\s*', '', response)

    lines = response.split('\n')
    cleaned_lines = []

    roman_line = ""
    devanagari_line = ""

    # Simple heuristic - first non-empty line is Roman, second is Devanagari
    for line in lines:
        if line.strip():
            if not roman_line:
                roman_line = line.strip()
            elif not devanagari_line:
                devanagari_line = line.strip()
                break

    # If we found both lines, use them
    if roman_line and devanagari_line:
        return f"{roman_line}\n{devanagari_line}"

    # If we didn't find a clear structure, just return the cleaned original
    return response

def is_hindi(text):
    """Check if the text contains Hindi (either in Devanagari or romanized)."""
    # Check for Devanagari characters
    devanagari_pattern = re.compile(r'[\u0900-\u097F]')
    if devanagari_pattern.search(text):
        return True

    # Check for likely romanized Hindi words
    hindi_romanized_words = [
        'namaste', 'dhanyavad', 'theek', 'haan', 'nahi', 'kya', 'aap', 'mai', 'tum',
        'kitna', 'rupaye', 'paisa', 'khana', 'pani', 'chai', 'acha', 'bahut', 'thoda'
    ]

    text_lower = text.lower()
    for word in hindi_romanized_words:
        if word in text_lower:
            return True

    return False

def get_encouragement_message():
    """Get a random encouragement message to use Hindi."""
    import random
    return random.choice(ENCOURAGEMENT_PHRASES)

# Scenario info
scenarios = {
    "market": {
        "role": "shopkeeper",
        "name": "Market",
        "description": "Practice buying items, haggling prices, and asking about products in a typical Indian market."
    },
    "restaurant": {
        "role": "waiter",
        "name": "Restaurant",
        "description": "Practice ordering food, asking about dishes, and handling restaurant interactions in Hindi."
    }
}

# Main conversation function
def run_hindi_practice(scenario_type="market"):
    print("Initializing Hindi Language Learning System...")

    # Initialize RAG system
    rag_system = None
    try:
        print("Setting up RAG system...")
        rag_system = HindiLearningRAG()

        # Try to load existing vector store
        if not rag_system.load_vector_store():
            if os.path.exists(RAG_DATA_PATH):
                print(f"Found document data at {RAG_DATA_PATH}, loading...")
                if rag_system.load_documents(RAG_DATA_PATH):
                    rag_system.create_vector_store()
            else:
                print(f"RAG data not found at {RAG_DATA_PATH}. Using basic mode.")
                rag_system.dummy_mode = True
    except Exception as e:
        print(f"Error initializing RAG system: {e}")
        rag_system = HindiLearningRAG(dummy_mode=True)

    # Load LLM model
    print(f"Loading model for {scenario_type} conversation practice...")
    model, tokenizer = load_model()

    # Generate initial greeting
    greeting = generate_initial_greeting(model, tokenizer, rag_system, scenario_type)

    # Get useful Hindi phrases for this scenario
    useful_phrases = rag_system.get_hindi_phrases_for_context(f"{scenario_type} conversation", top_k=3)

    # Print setup information
    print(f"\n{'='*50}")
    print(f"HINDI {scenario_type.upper()} PRACTICE")
    print(f"{'='*50}\n")

    # Print Hindi phrases suggestion
    print("Useful Hindi phrases for this scenario:")
    for phrase in useful_phrases:
        print(f"• {phrase['phrase']} - {phrase.get('devanagari', '')}")
        if phrase.get('meaning'):
            print(f"  ({phrase['meaning']})")
    print(f"\n{'-'*50}")

    # Start conversation
    print("Tutor (as shopkeeper/waiter):")
    print(greeting)
    print("\n" + "-"*50)

    user_used_hindi = False
    hindi_encouragement_count = 0

    while True:
        if not user_used_hindi and hindi_encouragement_count % 2 == 0:
            print(f"\n💡 {get_encouragement_message()}")

        user_input = input("You (as customer): ")

        if user_input.lower() in ['exit', 'quit', 'bye']:
            print("\nConversation ended. Dhanyavaad! (Thank you!)")
            break

        current_input_in_hindi = is_hindi(user_input)
        if current_input_in_hindi:
            user_used_hindi = True

        response, examples = continue_dialogue(
            model, tokenizer, rag_system, user_input,
            scenario_type, in_hindi=current_input_in_hindi
        )

        print("\nTutor (as shopkeeper/waiter):")
        print(response)

        if examples and False:
            print("\nRetrieved examples:")
            for i, example in enumerate(examples):
                print(f"Example {i+1}: {example.get('scene_description', '')}")

        print("\n" + "-"*50)

        if not user_used_hindi:
            hindi_encouragement_count += 1


    del model
    clear_gpu_memory()

# Usage example
if __name__ == "__main__":
    scenario = "market"  # or "restaurant"
    run_hindi_practice(scenario)

Initializing Hindi Language Learning System...
Setting up RAG system...


  self.embeddings = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
ERROR:__main__:Error loading vector store: The de-serialization relies loading a pickle file. Pickle files can be modified to deliver a malicious payload that results in execution of arbitrary code on your machine.You will need to set `allow_dangerous_deserialization` to `True` to enable deserialization. If you do this, make sure that you trust the source of the data. For example, if you are loading a file that you created, and know that no one else has modified the file, then this is safe to do. Do not set this to `True` if you are loadin

Found document data at processed_hindi_dialogues.json, loading...
Loading model for market conversation practice...
Loading model Cognitive-Lab/LLama3-Gaja-Hindi-8B-v0.1...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



HINDI MARKET PRACTICE

Useful Hindi phrases for this scenario:
• Kitne ka hai? - कितने का है?
  (How much is it?)
• Thoda kam kar dijiye - थोड़ा कम कर दीजिए
  (Please reduce it a little)
• Badhiya maal hai - बढ़िया माल है
  (It's good quality)

--------------------------------------------------
Tutor (as shopkeeper/waiter):
Namaste bhaiya, kaise ho? Aapka kaam kaisa chal raha hai?
नमस्ते भैया, कैसे हो? आपका काम कैसा चल रहा है?user

--------------------------------------------------

💡 Even simple Hindi words help you learn! / सरल हिंदी शब्द भी आपको सीखने में मदद करते हैं!
You (as customer): काम तो काम है, मुझे कुछ केले चाहिए, उनकी कीमत क्या है?


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



Tutor (as shopkeeper/waiter):
Keliyon ki khaasiyat hai ye sabse acchi hain aur unka price Rs.40 per dozen hota hai.
खरीदने वालों से, हमारे पास सबसे अच्छा और लोकप्रिय फल होते हैं जिन्हें आप खरीद सकते हैं, और इसका दाम एक दर्जन तक ₹४० रुपये तक रहता है।user

--------------------------------------------------
You (as customer): हा हा, तुम सही कह रहे हो। मैं आधा दर्जन लूंगा।


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



Tutor (as shopkeeper/waiter):
Roman Hindi Response
अच्छे, कौन सा प्रकार के चॉकलेट्स चाहिए?

--------------------------------------------------
You (as customer): नहीं, मुझे आधा दर्जन केले चाहिए, मेरे पास अभी सिर्फ़ 20 रुपये हैं


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



Tutor (as shopkeeper/waiter):
Kya pata hai kuchh bhi ho sakta tha? Aapko abhi ek doosra packet mil jayega aur wohi price mein hi dena hoga ₹40 tak.
क्या, कुछ भी हो सकता था। आप एक और बॉक्स लेंगे, वैस्ट अमाउंट पर देने जाने। ₹40 तक।user

--------------------------------------------------
You (as customer): Here you go 40 Rs


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



Tutor (as shopkeeper/waiter):
Tumne kaun sa product chune? ₹40 ke liye kya mil raha hai?
तुमने कौन सा प्रोडक्ट चुने, ₹४० की लिए क्या मिल रहा है?user

--------------------------------------------------


KeyboardInterrupt: Interrupted by user