# Install

In [1]:
# Set to True to install the packages listed in requirements.txt when the
# install cell below is run; leave False when the environment is already set up.
INSTALL = False

In [2]:
if INSTALL:
    ! pip install -r requirements.txt > /dev/null

In [3]:
from ipdb import set_trace

# Main

In [4]:
# Whisper checkpoint name passed to whisper.load_model() when the model is first loaded.
DEFAULT_MODEL_SIZE = "tiny"
# Initial value of the "Rate of transcription" slider; transcribe() sleeps for
# this many seconds before decoding each chunk.
DEFAULT_DELAY = 2

In [5]:
import os
import gradio as gr
import whisper
from whisper import tokenizer
import time

In [6]:
# Shared notebook state used by the transcription callback.
# One {'chunk', 'result'} dict is appended per transcribed chunk, oldest first.
audio_chunks = []

# Name of the Whisper checkpoint currently loaded, and the loaded model itself.
current_size = DEFAULT_MODEL_SIZE
model = whisper.load_model(current_size)

In [7]:
def get_full_transcript(model, audio_chunks):
    """Join the decoded text of every chunk into one space-separated string.

    Args:
        model: Unused; kept so the signature matches the other helpers.
        audio_chunks: List of dicts whose ``'result'`` value exposes ``.text``.

    Returns:
        The chunk texts joined with single spaces, or ``""`` when the list
        is empty.
    """
    if len(audio_chunks) < 1:
        return ""

    return " ".join(entry['result'].text for entry in audio_chunks)

In [8]:
def transcribe_chunk(model, audio_chunks, audio):
    """Decode one audio chunk with Whisper, primed with the previous chunk's text.

    Args:
        model: Loaded Whisper model; the spectrogram is moved to ``model.device``.
        audio_chunks: Previously transcribed chunks, oldest first. Each entry is
            a dict whose ``'result'`` value exposes a ``.text`` attribute. The
            list is only read here, never modified.
        audio: Audio input accepted by ``whisper.load_audio`` (the Gradio
            callback passes a file path).

    Returns:
        The value returned by ``whisper.decode`` for this chunk.
    """
    # Load the audio and pad/trim it to fit 30 seconds.
    samples = whisper.pad_or_trim(whisper.load_audio(audio))

    # Make the log-Mel spectrogram and move it to the same device as the model.
    mel = whisper.log_mel_spectrogram(samples).to(model.device)

    # Prompt the decoder with the text of the most recent chunk so consecutive
    # chunks stay coherent. `audio_chunks[-1]` is the latest entry; an empty
    # history yields an empty prompt.
    previous_text = audio_chunks[-1]['result'].text if audio_chunks else ""

    options = whisper.DecodingOptions(
        language="en",  # language detection is skipped; English is assumed
        prompt=previous_text,
    )

    return whisper.decode(model, mel, options)

In [9]:
def transcribe(chunk, state, delay):
    """Gradio callback: transcribe one streamed chunk and refresh the transcript.

    Args:
        chunk: The recorded audio chunk (a file path, since the Audio input
            uses ``type="filepath"``); forwarded to ``transcribe_chunk``.
        state: Session dict; its ``'transcription'`` key is overwritten with
            the full transcript.
        delay: Seconds to sleep before decoding, taken from the slider.

    Returns:
        A ``(transcript, state)`` tuple matching the interface outputs.

    Side effects:
        Appends ``{'chunk', 'result'}`` to the module-level ``audio_chunks``
        list and prints progress to stdout.
    """
    print("@transcribe")

    # Wait before decoding; the slider controls how long.
    time.sleep(delay)
    print(f"delaying: {delay}")

    print("Transcribing chunk...")

    # `model` and `audio_chunks` are the notebook-level globals (read and
    # mutated in place, never rebound).
    decoded = transcribe_chunk(model, audio_chunks, chunk)
    audio_chunks.append({'chunk': chunk, 'result': decoded})

    full_text = get_full_transcript(model, audio_chunks)
    state['transcription'] = full_text

    # Log the new chunk followed by the running transcript.
    print("=====================================")
    print(decoded.text)
    print("-------------------------------------")
    print(full_text)
    print()

    return state['transcription'], state

In [10]:
title = "two-way-speech"
description = "A demo of two-way-speech"

#model_size = gr.Dropdown(label="Model size", choices=['base', 'tiny', 'small', 'medium', 'large'], value=DEFAULT_MODEL_SIZE)

# Seconds transcribe() waits before decoding each chunk. Uses the top-level
# gr.Slider component (with `value=`) like the other components in this cell,
# instead of the deprecated gr.inputs.Slider(default=...).
delay_slider = gr.Slider(minimum=1, maximum=5, value=DEFAULT_DELAY, label="Rate of transcription")

transcription_tb = gr.Textbox(label="Transcription", lines=10, max_lines=500)

# Session state passed into and returned from transcribe() on every call.
state = gr.State({"transcription": ""})

gr.Interface(
    fn=transcribe,
    # Order must match transcribe(chunk, state, delay).
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True),
        state,
        delay_slider,
    ],
    # Order must match transcribe()'s (transcript, state) return value.
    outputs=[
        transcription_tb,
        state,
    ],
    live=True,
    allow_flagging='never',
    title=title,
    description=description,
).launch(
    # enable_queue=True,
    debug=True,
    # NOTE: share=True also publishes the demo on a temporary public URL.
    share=True,
)



Running on local URL:  http://127.0.0.1:7863
Running on public URL: https://12534.gradio.app

This share link expires in 72 hours. For free permanent hosting, check out Spaces: https://huggingface.co/spaces


@transcribe
delaying: 3
Transcribing chunk...
you

you
-------------------------------------
you

@transcribe
delaying: 3
Transcribing chunk...
Okay, so we're going to try and make sure that we're going to get the right amount of

Okay, so we're going to try and make sure that we're going to get the right amount of
-------------------------------------
you Okay, so we're going to try and make sure that we're going to get the right amount of

Keyboard interruption in main thread... closing server.


(<gradio.routes.App at 0x7f5455695090>,
 'http://127.0.0.1:7863/',
 'https://12534.gradio.app')

In [11]:
# Inspect the chunks collected while the demo was running: each entry holds the
# recorded file path ('chunk') and its decoding result ('result').
audio_chunks

[{'chunk': '/tmp/audiobwbn4hv6.wav',
  'result': DecodingResult(audio_features=tensor([[-0.6465, -0.8188, -0.3569,  ...,  0.5605, -0.9712,  0.6470],
          [-0.2820, -0.8750,  0.4353,  ...,  0.7393, -0.2871,  0.5088],
          [ 0.1086, -0.6426,  0.1007,  ...,  0.6079, -0.2150,  0.2878],
          ...,
          [-0.2246, -0.9438,  0.3901,  ...,  0.3325, -0.5723,  0.2727],
          [-0.6548, -0.7056,  0.1461,  ...,  0.3655, -0.2693,  0.1015],
          [-0.7979, -0.5146,  0.4526,  ...,  0.2416, -1.0088, -1.0889]],
         device='cuda:0', dtype=torch.float16), language='en', language_probs=None, tokens=[50364, 291, 50464], text='you', avg_logprob=-0.9292147159576416, no_speech_prob=0.9420355558395386, temperature=0.0, compression_ratio=0.2727272727272727)},
 {'chunk': '/tmp/audioyt9ozeqg.wav',
  'result': DecodingResult(audio_features=tensor([[-1.4961, -0.3057, -0.6582,  ..., -0.9312,  0.4146, -0.4819],
          [-1.7656, -0.3223,  0.3740,  ..., -0.5493, -0.0768, -0.3401],
     