In [1]:
import gradio as gr
from transformers import (
    Wav2Vec2ConformerForCTC, 
    Wav2Vec2Processor, 
    pipeline
)
import torch
import librosa
import sounddevice as sd
import warnings
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class AudioModel:
    """Bundle a speech-to-text model and a text-to-speech pipeline.

    Speech-to-text uses a ``Wav2Vec2Processor`` together with a
    ``Wav2Vec2ConformerForCTC`` model; text-to-speech uses a ``transformers``
    ``"text-to-speech"`` pipeline whose output is played via ``sounddevice``.
    """

    # Sampling rate (Hz) audio is resampled to before it reaches the S2T processor.
    S2T_SAMPLE_RATE = 16000

    def __init__(
        self,
        s2t_model_name_or_path: str = "facebook/wav2vec2-conformer-rel-pos-large-960h-ft",
        t2s_model_name_or_path: str = "suno/bark-small",
        s2t_is_gpu: bool = False,
        t2s_is_gpu: bool = False
    ) -> None:
        """Load both models.

        Args:
            s2t_model_name_or_path: Checkpoint passed to ``from_pretrained`` for
                both the processor and the CTC model.
            t2s_model_name_or_path: Checkpoint passed to the text-to-speech pipeline.
            s2t_is_gpu: Place the speech-to-text model on ``"cuda"`` instead of ``"cpu"``.
            t2s_is_gpu: Place the text-to-speech pipeline on ``"cuda"`` instead of ``"cpu"``.
        """
        self.s2t_device = 'cuda' if s2t_is_gpu else "cpu"
        self.t2s_device = 'cuda' if t2s_is_gpu else "cpu"
        self.s2t_processor = Wav2Vec2Processor.from_pretrained(s2t_model_name_or_path)
        self.s2t_model = Wav2Vec2ConformerForCTC.from_pretrained(s2t_model_name_or_path).to(self.s2t_device)
        self.t2s_model = pipeline("text-to-speech", t2s_model_name_or_path, device=self.t2s_device)

    def s2t_transcribe(
        self,
        audio_input: tuple
    ) -> str:
        """Transcribe one recording to text.

        Args:
            audio_input: ``(sample_rate, samples)`` pair, or ``None`` when
                nothing was recorded. ``samples`` is array-like; a
                multi-dimensional array is treated as having the channel on
                its last axis and is averaged down to mono.

        Returns:
            The decoded transcript, or ``""`` when there is no audio to decode.
        """
        if audio_input is None:
            return ""

        sr, audio = audio_input
        audio = np.asarray(audio).astype(np.float32)
        if audio.size == 0:
            return ""

        if audio.ndim > 1:
            # assumes (samples, channels) layout, as a Gradio numpy Audio
            # component delivers it — TODO confirm for other callers
            audio = audio.mean(axis=-1)

        # Peak-normalise, but leave pure silence untouched: dividing by a zero
        # peak would fill the signal with NaNs.
        peak = np.max(np.abs(audio))
        if peak > 0:
            audio /= peak

        if sr != self.S2T_SAMPLE_RATE:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=self.S2T_SAMPLE_RATE)
        input_values = self.s2t_processor(
            audio, sampling_rate=self.S2T_SAMPLE_RATE, return_tensors="pt"
        ).input_values

        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            logits = self.s2t_model(input_values.to(self.s2t_device)).logits

        # Greedy CTC decoding: most likely token per frame, then let the
        # processor turn the id sequence into text.
        pred_ids = torch.argmax(logits, dim=-1)
        pred_transcript = self.s2t_processor.batch_decode(pred_ids)[0]
        print(pred_transcript)
        return pred_transcript

    def t2s_transcribe(
        self,
        text: str
    ) -> None:
        """Synthesise ``text`` and play it, blocking until playback ends.

        Args:
            text: Text to speak. ``None``, empty or whitespace-only text is
                ignored so the pipeline is never asked to synthesise nothing.
        """
        if not text or not text.strip():
            return

        speech = self.t2s_model(text, forward_params={"do_sample": True})
        print("Speaking...")
        # NOTE(review): indexing [0] presumably selects the single waveform
        # from a (1, n_samples) pipeline output — confirm against the pipeline.
        sd.play(speech["audio"][0], speech['sampling_rate'])
        sd.wait()

In [4]:
def s2t(audio):
    """Gradio callback: transcribe ``audio`` with the module-level ``model``.

    Args:
        audio: Value forwarded unchanged to ``model.s2t_transcribe``.

    Returns:
        Whatever ``model.s2t_transcribe`` returns for ``audio``.
    """
    return model.s2t_transcribe(audio)

def t2s(text):
    """Gradio callback: speak ``text`` through the module-level ``model``.

    Args:
        text: Value forwarded unchanged to ``model.t2s_transcribe``.
    """
    # `model` is only read here, so no `global` declaration is required.
    model.t2s_transcribe(text)
    
# Load both models once, using the AudioModel defaults.
model = AudioModel()

with gr.Blocks() as demo:
    # NOTE(review): `source=` is the Gradio 3.x keyword; Gradio 4.x spells it
    # `sources=[...]` — confirm against the installed Gradio version.
    audio = gr.Audio(source='microphone', type='numpy')
    text = gr.Text(interactive=False)

    # When a recording stops: transcribe it into `text`, then speak `text` back.
    audio.stop_recording(
        fn=s2t, inputs=audio, outputs=text
    ).then(
        fn=t2s, inputs=text, outputs=None
    )

demo.launch()

Some weights of the model checkpoint at facebook/wav2vec2-conformer-rel-pos-large-960h-ft were not used when initializing Wav2Vec2ConformerForCTC: ['wav2vec2_conformer.encoder.pos_conv_embed.conv.weight_v', 'wav2vec2_conformer.encoder.pos_conv_embed.conv.weight_g']
- This IS expected if you are initializing Wav2Vec2ConformerForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ConformerForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ConformerForCTC were not initialized from the model checkpoint at facebook/wav2vec2-conformer-rel-pos-large-960h-ft and are newly initialized: ['wav2vec2_conformer.encoder.pos_conv_embed.conv.parametrizations.weight.original0', '

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


HENO HANO HENO HELO HELO
Speaking...


In [1]:
import gradio as gr
import os
import time

# Chatbot demo with multimodal input (text, markdown, LaTeX, code blocks, image, audio, & video). Plus shows support for streaming text.


def add_text(history, text):
    """Append a user message to the chat and lock the input box.

    Args:
        history: Current chat history; it is not modified in place.
        text: Message typed by the user.

    Returns:
        A pair ``(new_history, textbox)``: a new history ending in
        ``(text, None)``, and a ``gr.Textbox`` with an empty value and
        ``interactive=False``.
    """
    extended = history + [(text, None)]
    locked_box = gr.Textbox(value="", interactive=False)
    return extended, locked_box


def add_file(history, file):
    """Append an uploaded file to the chat as a bot-side entry.

    Prints the file's ``name`` and the resulting history.

    Args:
        history: Current chat history; it is not modified in place.
        file: Object exposing the uploaded file's path as ``.name``.

    Returns:
        A new history ending in ``(None, (file.name,))``.
    """
    print(file.name)
    file_entry = (None, (file.name,))
    extended = history + [file_entry]
    print(extended)
    return extended


def bot(history):
    """Stream a fixed reply into the last chat entry, one character at a time.

    The reply slot of ``history[-1]`` is first reset to ``""``; then, for each
    character of the reply, the slot grows by one character, the generator
    sleeps 0.05 s and yields ``history``.

    Args:
        history: Chat history whose last entry supports item assignment.

    Yields:
        The same ``history`` object after each added character.
    """
    reply = "**That's cool!**"
    history[-1][1] = ""
    for shown in range(1, len(reply) + 1):
        history[-1][1] = reply[:shown]
        time.sleep(0.05)
        yield history


with gr.Blocks() as demo:
    # Seed the chat with a single bot-side file message.
    # NOTE(review): this is an absolute path into one user's Gradio temp
    # directory; it will not exist on another machine — replace before sharing.
    chatbot = gr.Chatbot(
        [(None, (r"C:\Users\minhd\AppData\Local\Temp\gradio\5a683f8a6f45f11742a0c046be5e9bb163010192\Recording.mp3",))],
        elem_id="chatbot",
    )

    with gr.Row():
        txt = gr.Textbox(
            scale=4,
            show_label=False,
            placeholder="Enter text and press enter, or upload an image",
            container=False,
        )
        btn = gr.UploadButton("📁", file_types=["file"])

    # Submit: add the user's text (which also locks the textbox), then stream
    # the bot reply, then make the textbox interactive again.
    txt_msg = txt.submit(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
        bot, chatbot, chatbot, api_name="bot_response"
    )
    txt_msg.then(lambda: gr.Textbox(interactive=True), None, [txt], queue=False)

    # Upload: add the file to the chat; no bot reply is chained here.
    file_msg = btn.upload(add_file, [chatbot, btn], [chatbot], queue=False)
    # The former bare `gr.update("chatbot")` call was removed: its return value
    # was discarded, so it had no effect on the UI.

demo.queue()
if __name__ == "__main__":
    demo.launch(allowed_paths=["avatar.png"])


  from .autonotebook import tqdm as notebook_tqdm


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


C:\Users\minhd\AppData\Local\Temp\gradio\37ba341e1e3abb1d48c9755cca23fc85ce6c5dc1\en_test.wav
[[None, ('C:\\Users\\minhd\\AppData\\Local\\Temp\\gradio\\5a683f8a6f45f11742a0c046be5e9bb163010192\\Recording.mp3',)], (None, ('C:\\Users\\minhd\\AppData\\Local\\Temp\\gradio\\37ba341e1e3abb1d48c9755cca23fc85ce6c5dc1\\en_test.wav',))]


In [1]:
import gradio as gr

# Static conversation shown by the Chatbot: each entry is a
# (user_message, bot_message) pair, with None for an empty side.
# Entries wrapped in a one-element tuple carry a file path instead of text.
seed_messages = [
    ("Show me an image and an audio file", "Here is an image"),
    (None, ("dog.jpg",)),
    (None, "And here is an audio file:"),
    # NOTE(review): absolute path into one user's Gradio temp directory —
    # not portable to other machines.
    ((r"C:\Users\minhd\AppData\Local\Temp\gradio\0da9d8e97a4c1ac5dea616b6c960ae5e46950c98\input.wav",), None),
]

with gr.Blocks() as demo:
    gr.Chatbot(seed_messages)

demo.launch()

  from .autonotebook import tqdm as notebook_tqdm


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


