In [18]:
!pip install -q requests sarvamai

In [19]:
import base64
import os

import requests
from google.colab import userdata
from IPython.display import Audio, display
from sarvamai import SarvamAI

# Sarvam API subscription key, read from the Colab secrets store.
API_KEY = userdata.get("SARVAM_AI_API_KEY")

# Sarvam SDK client; used by translate_text.
client = SarvamAI(api_subscription_key=API_KEY)

# Headers for the JSON REST calls made directly with `requests`.
HEADERS = {
    "api-subscription-key": API_KEY,
    "Content-Type": "application/json"
}

# URL of the ML backend (e.g. an ngrok tunnel), taken from the BACKEND_URL
# environment variable. os.getenv returns None when the variable is unset, so
# export it before running the cells that call the backend.
BACKEND_URL = os.getenv("BACKEND_URL")

In [20]:
# Language code -> voice identifier placed in the TTS request's "voice" field.
# Codes missing from this table fall back to the English voice in text_to_speech.
VOICE_MAP = dict(
    [
        ("en-IN", "en-IN-Standard-1"),
        ("hi-IN", "hi-IN-Standard-1"),
        ("bn-IN", "bn-IN-Standard-1"),
        ("gu-IN", "gu-IN-Standard-1"),
        ("kn-IN", "kn-IN-Standard-1"),
        ("ml-IN", "ml-IN-Standard-1"),
        ("mr-IN", "mr-IN-Standard-1"),
        ("od-IN", "od-IN-Standard-1"),
        ("pa-IN", "pa-IN-Standard-1"),
        ("ta-IN", "ta-IN-Standard-1"),
        ("te-IN", "te-IN-Standard-1"),
    ]
)

In [21]:
def speech_to_text(audio_path):
    """Upload an audio file to Sarvam's speech-to-text endpoint.

    The request uses model "saaras:v3" with mode "translate"; callers treat the
    returned transcript as English text.

    Args:
        audio_path: Path of the audio file to upload. It is sent under the
            file name "audio.wav" with MIME type "audio/wav".

    Returns:
        dict with two keys:
            "english_text": the response's "transcript" field, or None when
                the request failed or the field is absent.
            "language_code": the response's "language_code" field, or
                "en-IN" when it is absent or null.

    Raises:
        OSError: if `audio_path` cannot be opened.
    """
    url = "https://api.sarvam.ai/speech-to-text"

    data = {
        "model": "saaras:v3",
        "mode": "translate"
    }

    # The context manager closes the upload handle even if the request raises;
    # the original left the file open for the life of the kernel.
    with open(audio_path, "rb") as audio_file:
        response = requests.post(
            url,
            headers={"api-subscription-key": API_KEY},
            files={"file": ("audio.wav", audio_file, "audio/wav")},
            data=data
        )

    if response.ok:
        try:
            result = response.json()
        except ValueError:
            # A success status with a non-JSON body: treat as "no transcript".
            print("STT Error Response:", response.text)
            result = {}
    else:
        # Surface the failure instead of silently returning an empty result.
        print("STT Error Response:", response.status_code, response.text)
        result = {}

    if not isinstance(result, dict):
        result = {}

    return {
        "english_text": result.get("transcript"),
        # `or` also covers an explicit null language_code in the response.
        "language_code": result.get("language_code") or "en-IN"
    }

In [22]:
def translate_text(text, source, target):
    """Translate `text` from `source` to `target` with the Sarvam SDK client.

    Args:
        text: Text to translate.
        source: Language code of `text`.
        target: Language code to translate into.

    Returns:
        `text` unchanged when `source` equals `target`; otherwise the
        `translated_text` attribute of the SDK response.
    """
    # Same language on both sides: nothing to translate, skip the API call.
    if source == target:
        return text

    request_args = {
        "input": text,
        "source_language_code": source,
        "target_language_code": target,
        "model": "sarvam-translate:v1",
    }
    translation = client.text.translate(**request_args)
    return translation.translated_text

In [23]:
def text_to_speech(text, lang_code):
    """Request synthesized speech for `text` from Sarvam's text-to-speech endpoint.

    Args:
        text: Text to speak. Empty or None returns None without a request.
        lang_code: Language code used to look up a voice in VOICE_MAP; codes
            not in the map fall back to the English voice.

    Returns:
        The first entry of the response's "audios" list (callers base64-decode
        it), or None when there is no text, the request fails, the body is not
        JSON, or the list is missing or empty.
    """
    # Nothing to synthesize: avoid a pointless API call.
    if not text:
        return None

    voice = VOICE_MAP.get(lang_code, "en-IN-Standard-1")

    url = "https://api.sarvam.ai/text-to-speech"

    payload = {
        "text": text,
        "voice": voice,
        "format": "wav",
        # NOTE(review): Sarvam's published TTS request schema identifies the
        # language through `target_language_code`; it is sent alongside the
        # original fields. Confirm whether `voice`/`format` are honoured by the
        # API. Mirrors the English fallback used for the voice lookup.
        "target_language_code": lang_code if lang_code in VOICE_MAP else "en-IN"
    }

    response = requests.post(url, headers=HEADERS, json=payload)

    if not response.ok:
        # Previously an API error was indistinguishable from "no audio".
        print("TTS Error Response:", response.status_code, response.text)
        return None

    try:
        result = response.json()
    except ValueError:
        print("TTS Error Response:", response.text)
        return None

    # `or []` covers a missing key and an explicit null; the emptiness check
    # avoids the IndexError the original raised on an empty list.
    audios = result.get("audios") or []
    return audios[0] if audios else None

In [24]:
def call_ml_backend(english_text, timeout=60):
    """POST an English query to the ML backend and return its reply text.

    Args:
        english_text: Query sent as the "message" field of the JSON body.
        timeout: Seconds to wait for the backend before giving up. Without a
            timeout, an unreachable tunnel would hang the cell indefinitely.

    Returns:
        The "reply" field of the backend's JSON response; "No reply" when the
        field is absent; "Backend error." when the request cannot be made
        (including an unset BACKEND_URL), times out, returns a non-200 status,
        or returns a body that is not JSON.
    """
    try:
        response = requests.post(
            BACKEND_URL,
            json={"message": english_text},   # send as a JSON body
            timeout=timeout
        )
    except requests.RequestException as exc:
        # Connection failures, timeouts and invalid URLs follow the same
        # fallback path as a non-200 status.
        print("Backend request failed:", exc)
        return "Backend error."

    print("Status Code:", response.status_code)

    if response.status_code != 200:
        print("Error Response:", response.text)
        return "Backend error."

    try:
        result = response.json()
    except ValueError:
        # 200 with a non-JSON body (e.g. a tunnel's HTML interstitial page).
        print("Error Response:", response.text)
        return "Backend error."

    print("Backend Response:", result)

    return result.get("reply", "No reply")

In [25]:
# Install ffmpeg; it is used at the end of this cell to convert the browser
# recording to WAV.
!apt-get -qq install ffmpeg -y

from IPython.display import Javascript
from google.colab import output
import base64

# JavaScript evaluated in the notebook front end: opens the microphone, records
# with MediaRecorder for 5000 ms, and returns the recorded bytes as base64 text.
record_js = """
async function recordAudio() {
  const stream = await navigator.mediaDevices.getUserMedia({audio: true});
  const recorder = new MediaRecorder(stream);
  let chunks = [];

  recorder.ondataavailable = e => chunks.push(e.data);
  recorder.start();

  await new Promise(resolve => setTimeout(resolve, 5000)); // 5 sec recording
  recorder.stop();

  await new Promise(resolve => recorder.onstop = resolve);

  const blob = new Blob(chunks);
  const arrayBuffer = await blob.arrayBuffer();
  const base64String = btoa(
    new Uint8Array(arrayBuffer)
      .reduce((data, byte) => data + String.fromCharCode(byte), '')
  );

  return base64String;
}
recordAudio();
"""

# Run the script in the browser and decode the base64 text it hands back.
audio_base64 = output.eval_js(record_js)
audio_bytes = base64.b64decode(audio_base64)

# Save the raw recording.
# NOTE(review): the file is named .webm, but the actual container is whatever
# the browser's MediaRecorder produces by default; confirm.
with open("recorded.webm", "wb") as f:
    f.write(audio_bytes)

# Convert to 16 kHz (-ar 16000) mono (-ac 1) WAV; -y overwrites an existing file.
!ffmpeg -y -i recorded.webm -ar 16000 -ac 1 recorded.wav

print("Recording saved as recorded.wav")

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

In [26]:
def process_audio(audio_path):
    """Run the voice pipeline for one recording: STT -> backend -> translate -> TTS.

    Args:
        audio_path: Path of the audio file passed to speech_to_text.

    Returns:
        None. Progress is printed, and the synthesized reply (when TTS returns
        audio) is played inline with IPython.display.Audio.
    """
    # 1. Speech-to-text
    stt = speech_to_text(audio_path)
    english_text = stt["english_text"]
    # Fall back to English if no language code came back.
    user_lang = stt["language_code"] or "en-IN"

    print("Detected Language:", user_lang)
    print("English Text:", english_text)

    # Without a transcript there is nothing meaningful to ask the backend;
    # the original forwarded None as the message.
    if not english_text:
        print("No speech recognized; skipping backend call.")
        return

    # 2. ML backend
    english_response = call_ml_backend(english_text)

    # 3. Translate the reply back into the speaker's language
    if user_lang != "en-IN":
        final_text = translate_text(english_response, "en-IN", user_lang)
    else:
        final_text = english_response

    print("Final Text:", final_text)

    # 4. Text-to-speech
    audio_base64 = text_to_speech(final_text, user_lang)

    if audio_base64:
        audio_bytes = base64.b64decode(audio_base64)
        display(Audio(audio_bytes, autoplay=True))

In [27]:
def process_text(user_text):
    """Run the text pipeline: detect language -> English -> backend -> translate -> TTS.

    Args:
        user_text: The user's query in any language. Empty, whitespace-only,
            or None input returns immediately without any API call.

    Returns:
        None. Progress is printed, and the synthesized reply (when TTS returns
        audio) is played inline with IPython.display.Audio.
    """
    # Nothing to process: avoid three API round-trips on blank input.
    if not user_text or not user_text.strip():
        print("No text provided; nothing to process.")
        return

    # Detect the language once
    lid_url = "https://api.sarvam.ai/text-lid"
    lid_response = requests.post(
        lid_url,
        headers=HEADERS,
        json={"input": user_text}
    )

    try:
        lid_result = lid_response.json()
    except ValueError:
        # Non-JSON body: fall back to English, as for a missing field.
        print("Language detection failed:", lid_response.status_code, lid_response.text)
        lid_result = {}

    # `or` also covers an explicit null language_code, which would otherwise
    # reach translate_text as None.
    user_lang = lid_result.get("language_code") or "en-IN"

    print("Detected Language:", user_lang)

    # Translate to English if needed
    if user_lang != "en-IN":
        english_text = translate_text(user_text, user_lang, "en-IN")
    else:
        english_text = user_text

    print("English Text:", english_text)

    # ML backend
    english_response = call_ml_backend(english_text)

    # Translate the reply back into the user's language
    if user_lang != "en-IN":
        final_text = translate_text(english_response, "en-IN", user_lang)
    else:
        final_text = english_response

    print("Final Text:", final_text)

    # Text-to-speech
    audio_base64 = text_to_speech(final_text, user_lang)

    if audio_base64:
        audio_bytes = base64.b64decode(audio_base64)
        display(Audio(audio_bytes, autoplay=True))

In [28]:
# Run the voice pipeline on the WAV file produced by the recording cell.
process_audio("recorded.wav")

Detected Language: te-IN
English Text: Leaf curl disease has appeared in the crop. How to prevent it?
Status Code: 200
Backend Response: {'confidence': 1, 'reply': 'Check for bollworm infestation. Use pheromone traps and spray recommended insecticide if needed.', 'source': 'rule_based'}
Final Text: బొల్లి పురుగుల బెడద ఉందేమో చూడండి. ఫెరోమోన్ ట్రాప్‌లను ఉపయోగించండి మరియు అవసరమైతే సిఫార్సు చేసిన పురుగుమందును పిచికారీ చేయండి.


In [29]:
# Run the text pipeline with an English query.
process_text("Leaf curl disease in chilli")

Detected Language: en-IN
English Text: Leaf curl disease in chilli
Status Code: 200
Backend Response: {'confidence': 1, 'reply': 'Spray neem-based insecticide or recommended chemical if damage is severe.', 'source': 'rule_based'}
Final Text: Spray neem-based insecticide or recommended chemical if damage is severe.


In [30]:
# Run the text pipeline with a Hindi query. The literal was mis-encoded
# (UTF-8 bytes read as MacRoman) and is restored to Devanagari here.
process_text("मिर्च में लीफ कर्ल रोग")

Detected Language: hi-IN
English Text: Leaf curl disease in chilies
Status Code: 200
Backend Response: {'confidence': 1, 'reply': 'Spray neem-based insecticide or recommended chemical if damage is severe.', 'source': 'rule_based'}
Final Text: यदि नुकसान गंभीर है तो नीम आधारित कीटनाशक या अनुशंसित रसायन का छिड़काव करें।
