In [14]:
!pip install git+https://github.com/openai/whisper.git

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-wue470bf
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-wue470bf
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done


In [15]:
!pip install langchain



In [16]:
import os
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
import whisper
from langchain.chains.base import Chain
from typing import Dict, Any

In [17]:
# One-time model setup (this can take a while).

# Speech recognition: Whisper "tiny" checkpoint; swap the size string if needed.
asr_model = whisper.load_model("tiny")

# Translation: mBART-50 many-to-many checkpoint plus its matching tokenizer.
translation_model = MBartForConditionalGeneration.from_pretrained(
    "facebook/mbart-large-50-many-to-many-mmt"
)
tokenizer = MBart50TokenizerFast.from_pretrained(
    "facebook/mbart-large-50-many-to-many-mmt"
)

  checkpoint = torch.load(fp, map_location=device)


In [18]:
# Language codes understood by the mBART-50 many-to-many checkpoint.
_MBART50_CODES = (
    "ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX",
    "gu_IN", "hi_IN", "it_IT", "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV",
    "my_MM", "ne_NP", "nl_XX", "ro_RO", "ru_RU", "si_LK", "tr_TR", "vi_VN",
    "zh_CN", "af_ZA", "az_AZ", "bn_IN", "fa_IR", "he_IL", "hr_HR", "id_ID",
    "ka_GE", "km_KH", "mk_MK", "ml_IN", "mn_MN", "mr_IN", "pl_PL", "ps_AF",
    "pt_XX", "sv_SE", "sw_KE", "ta_IN", "te_IN", "th_TH", "tl_XX", "uk_UA",
    "ur_PK", "xh_ZA", "gl_ES", "sl_SI",
)

# Lookup keyed by the two-letter prefix before the underscore ("hi" -> "hi_IN").
_ISO_TO_MBART = {code.split("_", 1)[0]: code for code in _MBART50_CODES}


def transcribe_audio(audio_path: str) -> "tuple[str, str]":
    """
    Transcribe an audio file with Whisper and report its language as an MBart code.

    Args:
      audio_path: path of the audio file handed to ``asr_model.transcribe``.

    Returns:
      - transcription (str): the ``"text"`` entry of the Whisper result.
      - detected_language (str): MBart language code (e.g., "en_XX", "fr_XX").

    The ``"language"`` entry of the Whisper result is assumed to be a two-letter
    code. Every mBART-50 language is mapped, so audio in e.g. Japanese or
    Portuguese is no longer mislabelled as English. A missing or unmapped
    language still falls back to "en_XX", as before.
    """
    result = asr_model.transcribe(audio_path)
    detected_iso = result.get("language", "en")  # default to English if not detected
    detected_lang = _ISO_TO_MBART.get(detected_iso, "en_XX")
    return result["text"], detected_lang

In [19]:
def translate_text(text: str, src_lang: str, tgt_lang: str) -> str:
    """
    Translates the input text from src_lang to tgt_lang using MBart.

    Args:
      text: text to translate.
      src_lang: MBart code of the text's language (e.g., "hi_IN").
      tgt_lang: MBart code of the desired output language (e.g., "en_XX").

    Returns:
      The first decoded sequence produced by the model, or "" when ``text`` is
      empty or whitespace-only.

    Raises:
      KeyError: if ``tgt_lang`` is not a key of ``tokenizer.lang_code_to_id``.
    """
    # Nothing to translate: skip the tokenizer/model round trip entirely.
    if not text or not text.strip():
        return ""
    # The tokenizer reads the source language from this attribute.
    tokenizer.src_lang = src_lang
    # Generation is steered to the target language by forcing its code as the first token.
    forced_bos_token_id = tokenizer.lang_code_to_id[tgt_lang]
    encoded_text = tokenizer(text, return_tensors="pt")
    generated_tokens = translation_model.generate(**encoded_text, forced_bos_token_id=forced_bos_token_id)
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

In [20]:
from typing import ClassVar

class AudioTranslationChain(Chain):
    """LangChain chain that runs speech-to-text and then translates the transcript."""
    # Keys read from / written to the chain's input and output dicts.
    input_keys: ClassVar[list] = ["audio_path", "target_lang"]
    output_keys: ClassVar[list] = ["transcription", "detected_lang", "translation"]

    def _call(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """
        Transcribe ``inputs["audio_path"]`` and translate it to ``inputs["target_lang"]``.

        ``target_lang`` is an MBart code such as "en_XX" or "fr_XX". When the
        detected language already equals the target, the transcript is returned
        unchanged as the translation.
        """
        source_path = inputs["audio_path"]
        wanted_lang = inputs["target_lang"]

        # 1) speech -> text, plus the detected language
        text, spoken_lang = transcribe_audio(source_path)

        # 2) translate only when the languages differ
        translated = (
            text
            if spoken_lang == wanted_lang
            else translate_text(text, src_lang=spoken_lang, tgt_lang=wanted_lang)
        )

        return dict(
            transcription=text,
            detected_lang=spoken_lang,
            translation=translated,
        )

    @property
    def _chain_type(self) -> str:
        """Identifier string for this chain type."""
        return "audio_translation_chain"

In [21]:
# Example usage:
if __name__ == "__main__":
    # Provide your audio file path and desired target language (using MBart language codes)
    input_data = {
        "audio_path": "/content/videoplayback (2).m4a",
        "target_lang": "en_XX"  # e.g., translate to English
    }
    chain = AudioTranslationChain()
    # Calling the chain object directly (`chain(...)`) is deprecated in LangChain;
    # `invoke` is the supported entry point and returns the same result dict.
    result = chain.invoke(input_data)
    print("Detected Language:", result["detected_lang"])
    print("Transcription:", result["transcription"])
    print("Translation:", result["translation"])

Detected Language: hi_IN
Transcription:  Yaadash bhekit ne jeep jeezo te kis jeez ko poor izindeki bhol ne kis ko shish karo ho bhol te ne kabe ek choti sit jeez yaat kar ne kis ko shish karo te yaat ne jeat ne
Translation: Yaadash bhekit ne jeep jeezo te kis jeez ko poor izindeki bhol ne kis koshish karo ho bhol te ne kabe ek choti sit jeez yaat kar ne kis koshish karo te yaat ne jeat ne


In [22]:
!pip install gradio



In [23]:
import os
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
import whisper
from langchain.chains.base import Chain
from typing import Dict, Any, ClassVar
import gradio as gr

# -------------------------------
# Load models once (this may take some time)

# Whisper "tiny" checkpoint: the small ASR model, chosen for quick testing.
asr_model = whisper.load_model("tiny")

# mBART-50 many-to-many translation model and the tokenizer that goes with it.
translation_model = MBartForConditionalGeneration.from_pretrained(
    "facebook/mbart-large-50-many-to-many-mmt"
)
tokenizer = MBart50TokenizerFast.from_pretrained(
    "facebook/mbart-large-50-many-to-many-mmt"
)

# -------------------------------
# Transcription / translation helpers and the LangChain chain

# Language codes understood by the mBART-50 many-to-many checkpoint.
_MBART50_CODES = (
    "ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX",
    "gu_IN", "hi_IN", "it_IT", "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV",
    "my_MM", "ne_NP", "nl_XX", "ro_RO", "ru_RU", "si_LK", "tr_TR", "vi_VN",
    "zh_CN", "af_ZA", "az_AZ", "bn_IN", "fa_IR", "he_IL", "hr_HR", "id_ID",
    "ka_GE", "km_KH", "mk_MK", "ml_IN", "mn_MN", "mr_IN", "pl_PL", "ps_AF",
    "pt_XX", "sv_SE", "sw_KE", "ta_IN", "te_IN", "th_TH", "tl_XX", "uk_UA",
    "ur_PK", "xh_ZA", "gl_ES", "sl_SI",
)

# Two-letter prefix -> full MBart code, e.g. "fr" -> "fr_XX".
_ISO_TO_MBART = {code.split("_", 1)[0]: code for code in _MBART50_CODES}


def transcribe_audio(audio_path: str) -> "tuple[str, str]":
    """
    Uses Whisper to transcribe audio and detect the language.

    Args:
      audio_path: path of the audio file passed to ``asr_model.transcribe``.

    Returns:
      - transcription (str): the transcribed text.
      - detected_language (str): MBart language code (e.g., "en_XX", "fr_XX").

    The lookup covers every mBART-50 language rather than a hand-picked few, so
    a detected language such as "ja" or "pt" maps to its own MBart code instead
    of being treated as English. A missing or unmapped language still falls
    back to "en_XX".
    """
    result = asr_model.transcribe(audio_path)
    detected_iso = result.get("language", "en")  # default to English
    detected_lang = _ISO_TO_MBART.get(detected_iso, "en_XX")
    return result["text"], detected_lang

def translate_text(text: str, src_lang: str, tgt_lang: str) -> str:
    """
    Translates the input text from src_lang to tgt_lang using MBart.

    Args:
      text: text to translate.
      src_lang: MBart code of the input language (e.g., "hi_IN").
      tgt_lang: MBart code of the output language (e.g., "en_XX").

    Returns:
      The first decoded sequence from the model. An empty or whitespace-only
      ``text`` returns "" without calling the tokenizer or the model.

    Raises:
      KeyError: if ``tgt_lang`` is not a key of ``tokenizer.lang_code_to_id``.
    """
    # An empty transcript has nothing to translate; return early.
    if not text or not text.strip():
        return ""
    tokenizer.src_lang = src_lang  # Set source language for tokenizer
    # Forcing the target language code as the first generated token selects the output language.
    forced_bos_token_id = tokenizer.lang_code_to_id[tgt_lang]
    encoded_text = tokenizer(text, return_tensors="pt")
    generated_tokens = translation_model.generate(**encoded_text, forced_bos_token_id=forced_bos_token_id)
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

class AudioTranslationChain(Chain):
    """Chain with two stages: Whisper transcription, then MBart translation."""
    # Names of the entries expected in the input dict and produced in the output dict.
    input_keys: ClassVar[list] = ["audio_path", "target_lang"]
    output_keys: ClassVar[list] = ["transcription", "detected_lang", "translation"]

    def _call(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """
        Run both stages for one request.

        Reads "audio_path" and "target_lang" (an MBart code such as "en_XX")
        from ``inputs``. The transcript is passed through untouched when its
        detected language already matches the target.
        """
        source_path = inputs["audio_path"]
        wanted_lang = inputs["target_lang"]

        # Stage 1: transcription and language detection
        text, spoken_lang = transcribe_audio(source_path)

        # Stage 2: translation, skipped when source and target languages match
        if spoken_lang == wanted_lang:
            translated = text
        else:
            translated = translate_text(text, src_lang=spoken_lang, tgt_lang=wanted_lang)

        return dict(
            transcription=text,
            detected_lang=spoken_lang,
            translation=translated,
        )

    @property
    def _chain_type(self) -> str:
        """Identifier string for this chain type."""
        return "audio_translation_chain"

# -------------------------------
# Gradio Interface Function

def process_audio(audio_path: str, target_lang: str):
    """
    Gradio callback: run AudioTranslationChain on one audio file.

    Args:
      audio_path: path of the uploaded audio file; empty/None when the form is
        submitted without audio.
      target_lang: MBart code of the language to translate into.

    Returns:
      Tuple of (detected language, transcription, translation), in the order
      of the interface's output boxes.

    Raises:
      gr.Error: if no audio file was provided, so the UI shows a readable
        message instead of a failure from inside the transcription step.
    """
    if not audio_path:
        raise gr.Error("Please upload an audio file before submitting.")
    chain = AudioTranslationChain()
    # `invoke` replaces the deprecated direct call `chain(...)`; same result dict.
    result = chain.invoke({"audio_path": audio_path, "target_lang": target_lang})
    return result["detected_lang"], result["transcription"], result["translation"]

# MBart language codes offered in the target-language dropdown.
target_lang_choices = ["en_XX", "fr_XX", "de_DE", "es_XX", "hi_IN"]

# Gradio UI: an audio upload and a language dropdown feed process_audio,
# whose three return values fill the three text boxes.
iface = gr.Interface(
    title="Multilingual Speech Translation",
    description="Upload an audio file and select the target language. The app will transcribe the audio using Whisper and then translate the transcription using Facebook's MBart-50.",
    fn=process_audio,
    inputs=[
        gr.Audio(label="Upload Audio File", type="filepath"),
        gr.Dropdown(label="Target Language", choices=target_lang_choices, value="en_XX"),
    ],
    outputs=[
        gr.Textbox(label=caption)
        for caption in ("Detected Language", "Transcription", "Translation")
    ],
)

# Start the web UI when this cell/script is executed directly.
if __name__ == "__main__":
    iface.launch()


  checkpoint = torch.load(fp, map_location=device)


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ec172e615c6b317e06.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
