In [1]:
# ==========================================
# INSTALL LIBRARIES
# ==========================================
import os
try:
    import faster_whisper
    import piper_phonemize
except ImportError:
    print("⏳ Installing Libraries...")
    # 1. Install System Dependencies (Required for Piper)
    !sudo apt-get update -y > /dev/null
    !sudo apt-get install -y espeak-ng > /dev/null

    # 2. Install Python Libraries
    !pip install -q faster-whisper accelerate bitsandbytes gradio
    !pip install -q piper-tts
    !pip install -q git+https://github.com/huggingface/transformers.git

    # 3. Download Hindi Voice (Piper ONNX)
    print("⏳ Downloading Hindi Voice Model...")
    !wget -q -O hindi_voice.onnx https://huggingface.co/rhasspy/piper-voices/resolve/main/hi/hi_IN/pratham/medium/hi_IN-pratham-medium.onnx
    !wget -q -O hindi_voice.onnx.json https://huggingface.co/rhasspy/piper-voices/resolve/main/hi/hi_IN/pratham/medium/hi_IN-pratham-medium.onnx.json

print("✅ Installation Complete.")

⏳ Installing Libraries...
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 5.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 MB[0m [31m24.1 MB/s[0m et

In [2]:
# ==========================================
# LOADING AI MODELS
# ==========================================
import torch
import subprocess
import gradio as gr
import time
from faster_whisper import WhisperModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Configuration
class Config:
    """Static settings read by the model-loading cell and by the bot's TTS step."""

    # --- Speech-to-text -------------------------------------------------
    # Model size string handed to faster-whisper's WhisperModel.
    WHISPER_SIZE = "medium"

    # --- Language model -------------------------------------------------
    # Hugging Face Hub id used for both the tokenizer and the causal LM.
    LLM_ID = "Qwen/Qwen2.5-1.5B-Instruct"

    # --- Text-to-speech -------------------------------------------------
    # Piper voice file passed as --model, and the WAV path passed as --output_file.
    PIPER_VOICE = "hindi_voice.onnx"
    OUTPUT_AUDIO = "reply.wav"

# Loads the three module-level objects the later cells rely on:
# `tokenizer` and `model` (the chat LLM) and `ear_model` (speech-to-text).
print("⏳ Loading Models")

# 1. Load Brain (Qwen 1.5B in 4-bit)
# bitsandbytes quantization settings: 4-bit weights, "nf4" quant type, float16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
# Tokenizer and model are both resolved from the same Config.LLM_ID.
tokenizer = AutoTokenizer.from_pretrained(Config.LLM_ID)
# Device placement is delegated to the library via device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    Config.LLM_ID,
    quantization_config=bnb_config,
    device_map="auto"
)

# 2. Load Ears (Faster-Whisper Medium)
# NOTE(review): device="cuda" is hard-coded here, so this cell presumably needs a GPU
# runtime — confirm before running on CPU-only machines.
ear_model = WhisperModel(Config.WHISPER_SIZE, device="cuda", compute_type="float16")

print("✅ SYSTEM READY")

⏳ Loading Models


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]



config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/338 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

✅ SYSTEM READY


In [3]:
# ==========================================
# THE Hindi BOT V2
# ==========================================
class TurboHindiBot:
    """Hindi voice assistant pipeline: listen (speech-to-text) -> think (LLM) -> speak (TTS).

    The class keeps no state of its own. Its methods use module-level objects created
    in earlier cells: ``ear_model``, ``tokenizer``, ``model`` and ``Config``.
    """

    def listen(self, audio_path):
        """Transcribe an audio file to text, with the language forced to Hindi.

        Args:
            audio_path: Path of the audio file to transcribe. Falsy values
                (``None``, ``""``) short-circuit.

        Returns:
            The transcript as one string, or ``""`` when no path was given.
        """
        if not audio_path:
            return ""

        # beam_size=1 (greedy decoding) was chosen over a wider beam.
        segments, _ = ear_model.transcribe(
            audio_path,
            language="hi",
            beam_size=1,
            initial_prompt="नमस्ते",  # Force Hindi
            condition_on_previous_text=False,
        )
        # Segment texts may carry their own leading/trailing whitespace; trim each piece
        # before joining so the transcript does not contain doubled spaces.
        pieces = (segment.text.strip() for segment in segments)
        return " ".join(piece for piece in pieces if piece)

    def think(self, user_text):
        """Generate a short Hindi reply to ``user_text`` with the chat LLM.

        Args:
            user_text: The user's message.

        Returns:
            The decoded reply with the prompt tokens removed and whitespace stripped.
        """
        messages = [
            {"role": "system", "content": "You are a Hindi assistant. Reply ONLY in Hindi. Keep answers under 20 words."},
            {"role": "user", "content": user_text}
        ]
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        # The model is loaded with device_map="auto", so send the inputs to wherever the
        # model actually lives instead of hard-coding "cuda".
        inputs = tokenizer([text], return_tensors="pt").to(model.device)

        generated_ids = model.generate(
            **inputs,
            max_new_tokens=60,
            temperature=0.3,
            do_sample=True,
        )

        # generate() returns prompt + continuation; keep only the newly generated tokens.
        prompt_len = inputs.input_ids.shape[1]
        response = tokenizer.decode(generated_ids[0][prompt_len:], skip_special_tokens=True)
        return response.strip()

    @staticmethod
    def _clean_for_tts(text):
        """Strip everything Piper should not receive and return a single-line string."""
        import re  # local import: keeps this cell runnable without touching the imports cell

        # Remove English letters
        cleaned = re.sub(r'[a-zA-Z]', '', text)
        # Remove special symbols (keep word chars, whitespace, Devanagari and . , ? !)
        cleaned = re.sub(r'[^\w\s\u0900-\u097F\u002E\u002C\u003F\u0021]', '', cleaned)
        # Collapse newlines and any other whitespace runs into single spaces
        return " ".join(cleaned.split())

    def speak(self, text):
        """Synthesize ``text`` to a WAV file with the Piper command-line tool.

        Args:
            text: Reply text to speak. Falsy values short-circuit.

        Returns:
            ``Config.OUTPUT_AUDIO`` when Piper produced the file, otherwise ``None``
            (empty input, nothing left after cleaning, Piper failure, Piper missing,
            or no output file).
        """
        if not text:
            return None

        # -- Clean the text to prevent Piper TTS crashes --
        clean_text = TurboHindiBot._clean_for_tts(text)

        if not clean_text:
            print("⚠️ Warning: Text was empty after cleaning.")
            return None

        print(f"🗣️ Sending to Piper: {clean_text}")

        # Drop any file left over from a previous turn so a failed synthesis can never
        # be mistaken for a fresh reply by the existence check below.
        if os.path.exists(Config.OUTPUT_AUDIO):
            os.remove(Config.OUTPUT_AUDIO)

        # Argument list + stdin instead of `echo "..." | piper` through a shell:
        # the text is never parsed by a shell, so quoting cannot break the command.
        command = ["piper", "--model", Config.PIPER_VOICE, "--output_file", Config.OUTPUT_AUDIO]

        # Run with error capturing
        try:
            subprocess.run(
                command,
                input=clean_text + "\n",  # same trailing newline `echo` used to add
                check=True,
                capture_output=True,
                text=True,
                encoding="utf-8",
            )
        except subprocess.CalledProcessError as e:
            print(f"❌ PIPER ERROR: {e.stderr}")
            return None
        except FileNotFoundError:
            # Without a shell, a missing executable raises instead of exiting non-zero.
            print("❌ PIPER ERROR: 'piper' executable not found on PATH.")
            return None

        if os.path.exists(Config.OUTPUT_AUDIO):
            return Config.OUTPUT_AUDIO

        print("❌ Error: Audio file was not created.")
        return None

In [None]:
# ==========================================
# THE UI
# ==========================================
# Single bot instance used by run_chat below for its listen/think/speak calls.
bot = TurboHindiBot()

def run_chat(audio_path):
    """Run one voice turn: transcribe the audio, generate a reply, synthesize it.

    Args:
        audio_path: Path to the recorded or uploaded audio, or ``None``.

    Returns:
        ``(user_text, ai_reply, audio_file)``. All three are ``None`` when no audio
        was supplied or when the transcript came back empty.
    """
    nothing = (None, None, None)

    if audio_path is None:
        return nothing

    # Pipeline: ears -> brain -> voice
    heard = bot.listen(audio_path)
    if not heard:
        return nothing

    answer = bot.think(heard)
    spoken = bot.speak(answer)
    return heard, answer, spoken

# Gradio layout: one audio input on top, then transcript / reply text / reply audio.
with gr.Blocks(title="Hindi AI Bot V2") as demo:
    gr.Markdown("#  Hindi AI Bot")
    gr.Markdown("Using **Qwen 1.5B** + **Whisper Medium** + **Piper TTS**")

    with gr.Row():
        audio_in = gr.Audio(
            label="🎤 Record or 📂 Upload Audio",
            type="filepath",
            sources=["microphone", "upload"],
        )

    with gr.Row():
        user_out = gr.Textbox(label="You")
        ai_out = gr.Textbox(label="AI")
        audio_out = gr.Audio(label="Reply", autoplay=True)

    # Both ways of supplying audio run the same handler with the same wiring:
    # first "recording stopped" (mic), then "file uploaded".
    for _register in (audio_in.stop_recording, audio_in.upload):
        _register(run_chat, inputs=[audio_in], outputs=[user_out, ai_out, audio_out])

# Start the Gradio app with a public share link. `time` comes from the imports cell.
print("⏳ Launching UI...")
try:
    demo.queue().launch(share=True, debug=True)

    # Keep Alive: emit a dot every minute so the cell keeps running.
    # NOTE(review): launch(debug=True) appears to block until interrupted, in which case
    # this loop only starts after the server has already stopped — confirm whether the
    # loop is still needed.
    while True:
        time.sleep(60)
        print(".", end="", flush=True)
except KeyboardInterrupt:
    # Interrupting the cell is the normal way to stop; exit without a traceback.
    print("\nStopping UI...")
finally:
    # Always release the server and share tunnel, whichever way the cell ends.
    demo.close()