In [5]:
!pip install torch transformers librosa gradio

import re

import gradio as gr
import librosa
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoTokenizer, AutoModelForSeq2SeqLM

# Speech-recognition checkpoints on the Hugging Face Hub, keyed by the language
# each one transcribes.
_ASR_CHECKPOINTS = {
    "hindi": "ai4bharat/indicwav2vec-hindi",
    "english": "facebook/wav2vec2-large-960h",
    "malayalam": "gvs/wav2vec2-large-xlsr-malayalam",
}

# For every language, load the processor and then the CTC model from the same
# checkpoint. Result: asr_models[language] -> {"processor": ..., "model": ...}.
asr_models = {
    language: {
        "processor": Wav2Vec2Processor.from_pretrained(checkpoint),
        "model": Wav2Vec2ForCTC.from_pretrained(checkpoint),
    }
    for language, checkpoint in _ASR_CHECKPOINTS.items()
}






Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The ASR models used to transcribe audio in each supported language.

In [6]:
# Translation checkpoints keyed by SOURCE language. Judging by the checkpoint
# names, both translate into English (TODO confirm for the mBART fine-tune).
_TRANSLATION_CHECKPOINTS = {
    "hindi": "Helsinki-NLP/opus-mt-hi-en",
    "malayalam": "ArunIcfoss/mbart-large-50-many-to-many-mmt-ICFOSS-Malayalam_English_Translation",
}

# Seq2seq models first, then their tokenizers, each looked up by source language.
translation_models = {
    language: AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    for language, checkpoint in _TRANSLATION_CHECKPOINTS.items()
}
translation_tokenizers = {
    language: AutoTokenizer.from_pretrained(checkpoint)
    for language, checkpoint in _TRANSLATION_CHECKPOINTS.items()
}

# Summarization checkpoints keyed by the language of the text to summarize.
_SUMMARIZATION_CHECKPOINTS = {
    "hindi": "google/mt5-base",
    "english": "t5-base",
    "malayalam": "ai4bharat/IndicBART",
}

# Seq2seq models first, then the matching tokenizers, both keyed by language.
summarization_models = {
    language: AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    for language, checkpoint in _SUMMARIZATION_CHECKPOINTS.items()
}
summarization_tokenizers = {
    language: AutoTokenizer.from_pretrained(checkpoint)
    for language, checkpoint in _SUMMARIZATION_CHECKPOINTS.items()
}




The translation and summarization models for the regional languages (Hindi and Malayalam) and the global language (English).

In [7]:
def clean_output(text):
    """Remove leftover sentinel markers from generated text and trim whitespace.

    Every ``<extra_id_N>`` marker is removed regardless of its index N (the
    previous version only handled ``<extra_id_0>``, so higher-numbered markers
    leaked into the output). The literal ``<tero_id_0>`` is still removed as
    before.

    Args:
        text: Decoded model output.

    Returns:
        ``text`` without the markers and without leading/trailing whitespace.
    """
    without_sentinels = re.sub(r"<extra_id_\d+>", "", text)
    return without_sentinels.replace("<tero_id_0>", "").strip()

# Function for transcription
def transcribe_audio(audio_path, language):
    """Transcribe an audio file with the ASR model registered for ``language``.

    Args:
        audio_path: Path of the audio file to transcribe. A falsy value
            (``None`` or ``""``) is treated as "no file provided".
        language: Key into ``asr_models`` selecting the processor/model pair.

    Returns:
        The decoded transcription on success. On failure, a message that
        starts with ``"Transcription Error:"`` (errors are returned, not
        raised, so the caller can display them).
    """
    # Report the two most likely user mistakes with a clear message instead of
    # a raw exception string.
    if not audio_path:
        return "Transcription Error: no audio file provided."
    if language not in asr_models:
        return f"Transcription Error: unsupported language '{language}'."

    target_rate = 16000
    try:
        # librosa resamples to `sr` while loading, so the returned rate always
        # equals target_rate and needs no separate check.
        audio, _ = librosa.load(audio_path, sr=target_rate)
        if len(audio) == 0:
            return "Transcription Error: audio file contains no samples."

        # Process the audio
        processor = asr_models[language]["processor"]
        model = asr_models[language]["model"]
        input_values = processor(audio, return_tensors="pt", sampling_rate=target_rate).input_values

        # Get predictions (inference only, so gradients are not tracked)
        with torch.no_grad():
            logits = model(input_values).logits

        # Greedy decoding: most likely token per frame, then decode to text
        predicted_ids = torch.argmax(logits, dim=-1)
        return processor.batch_decode(predicted_ids)[0]

    except Exception as e:
        return f"Transcription Error: {str(e)}"

# Function for translation
def translate_text(text, source_language, target_language):
    """Translate ``text`` from ``source_language`` into ``target_language``.

    The model and tokenizer are looked up by source language in
    ``translation_models`` / ``translation_tokenizers``. Those registries hold
    one checkpoint per source language, and the checkpoint names indicate that
    each translates into English, so any other target is rejected explicitly
    rather than silently returning English text.

    Args:
        text: Text to translate.
        source_language: Language of ``text``; must be a key of the
            translation registries.
        target_language: Requested output language; only ``"english"`` is
            supported.

    Returns:
        The translated text, ``""`` for empty/whitespace-only input, or a
        message starting with ``"Translation Error:"`` on failure.
    """
    if source_language not in translation_models or source_language not in translation_tokenizers:
        return (
            "Translation Error: no translation model is available for "
            f"source language '{source_language}'."
        )
    if target_language != "english":
        return (
            f"Translation Error: translating '{source_language}' to '{target_language}' "
            "is not supported; the loaded models only translate into english."
        )
    # Nothing to translate; avoid asking the model to generate from empty input.
    if not text or not text.strip():
        return ""

    try:
        tokenizer = translation_tokenizers[source_language]
        model = translation_models[source_language]

        # Tokenize input text
        inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)

        # Generate translation; forward the attention mask together with the
        # token ids so generate() does not have to infer it.
        outputs = model.generate(
            inputs["input_ids"],
            attention_mask=inputs.get("attention_mask"),
            max_length=512,
            num_beams=5,
            repetition_penalty=1.5,
            length_penalty=1.0,
            early_stopping=True,
        )

        # Decode the translation
        return tokenizer.decode(outputs[0], skip_special_tokens=True)

    except Exception as e:
        return f"Translation Error: {str(e)}"

# Function for summarization
def summarize_text(text, language):
    """Summarize ``text`` with the summarization model registered for ``language``.

    If the model's config declares a summarization task prefix (for example
    T5 checkpoints ship ``"summarize: "`` in ``task_specific_params``), the
    prefix is prepended to the input; without it such multi-task models are
    not told which task to perform.

    Args:
        text: Text to summarize.
        language: Key into ``summarization_models`` / ``summarization_tokenizers``.

    Returns:
        The cleaned summary, ``""`` for empty/whitespace-only input, or a
        message starting with ``"Summarization Error:"`` on failure.
    """
    if language not in summarization_models or language not in summarization_tokenizers:
        return f"Summarization Error: unsupported language '{language}'."
    # Nothing to summarize; avoid asking the model to generate from empty input.
    if not text or not text.strip():
        return ""

    try:
        tokenizer = summarization_tokenizers[language]
        model = summarization_models[language]

        # Use the task prefix the checkpoint itself declares, if any.
        task_params = getattr(model.config, "task_specific_params", None) or {}
        prefix = (task_params.get("summarization") or {}).get("prefix") or ""

        # Prepare input for summarization
        inputs = tokenizer(prefix + text, return_tensors="pt", max_length=1024, truncation=True)

        # Generate summary; forward the attention mask together with the token ids.
        outputs = model.generate(
            inputs["input_ids"],
            attention_mask=inputs.get("attention_mask"),
            max_length=150,
            num_beams=5,
            repetition_penalty=2.0,
            length_penalty=1.0,
            early_stopping=True,
        )

        # Decode and clean up the summary
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return clean_output(summary)
    except Exception as e:
        return f"Summarization Error: {str(e)}"

# Updated process_audio function to handle matching languages
def process_audio(audio_path, language, target_lang):
    """Run the full pipeline: transcribe, translate, and summarize an audio file.

    Args:
        audio_path: Path of the audio file, forwarded to ``transcribe_audio``.
        language: Source language of the audio.
        target_lang: Language requested for the translated outputs.

    Returns:
        A 4-tuple ``(transcription, translation, source_summary, target_summary)``.
        Entries that do not apply are the string ``"N/A"``: all but the first
        when transcription fails, both translated entries when source and
        target languages match, and the target summary when summarization failed.
    """
    transcription = transcribe_audio(audio_path, language)
    # Detect failures by the error *prefixes* transcribe_audio returns. A plain
    # substring test for "Error" would also reject genuine transcripts that
    # merely contain that word.
    if transcription.startswith(("Error:", "Transcription Error:")):
        return transcription, "N/A", "N/A", "N/A"

    # Same source and target language: nothing to translate, only summarize.
    if language == target_lang:
        return transcription, "N/A", summarize_text(transcription, language), "N/A"

    # Translate transcription directly
    translation = translate_text(transcription, language, target_lang)
    # Summarize in the source language
    source_summary = summarize_text(transcription, language)

    if source_summary.startswith("Summarization Error:"):
        # Do not feed an error message to the translator as if it were a summary.
        target_summary = "N/A"
    else:
        # Translate the source summary to the target language
        target_summary = clean_output(translate_text(source_summary, language, target_lang))

    return transcription, translation, source_summary, target_summary



The functions needed to transcribe, translate, and summarize the input audio.

In [8]:
def gradio_interface(audio, language, target_lang):
    """Gradio callback: forward the UI inputs to ``process_audio`` unchanged.

    Args:
        audio: Value of the audio input component.
        language: Selected source language.
        target_lang: Selected target language.

    Returns:
        Whatever ``process_audio`` returns, mapped by Gradio onto the output
        components in order.
    """
    pipeline_outputs = process_audio(audio, language, target_lang)
    return pipeline_outputs

# Launch Gradio App
# Inputs, in the order gradio_interface receives them: audio file path,
# source language, target language.
input_components = [
    gr.Audio(type="filepath", label="Upload Audio File"),
    gr.Dropdown(choices=["hindi", "english", "malayalam"], label="Source Language"),
    gr.Dropdown(choices=["english", "hindi", "malayalam"], label="Target Language"),
]

# Outputs, one text box per element of the tuple returned by the callback.
output_components = [
    gr.Textbox(label=output_label)
    for output_label in (
        "Transcription",
        "Translation of Transcription",
        "Source Summary",
        "Target Summary",
    )
]

interface = gr.Interface(
    fn=gradio_interface,
    inputs=input_components,
    outputs=output_components,
    title="Vaani - A Speech-To-Text Summarizer",
    description="Upload an audio file, select the source language, and choose the target language. Outputs include transcription, its translation, and summaries in both languages.",
)

# debug=True is kept from the original launch call.
interface.launch(debug=True)

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://a9fdc005bfd5844971.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://a9fdc005bfd5844971.gradio.live




The web interface where you can upload audio and receive the outputs you need.