# Install

In [1]:
# Set to True to install the packages listed in requirements.txt when the next cell runs.
INSTALL = False

In [2]:
if INSTALL:
    ! pip install -r requirements.txt > /dev/null

In [3]:
from ipdb import set_trace

# Main

In [4]:
# Whisper checkpoint name handed to whisper.load_model when the model is first loaded.
DEFAULT_MODEL_SIZE = "tiny"
# Initial value of the "Rate of transcription" slider in the Gradio UI.
DEFAULT_DELAY = 5

In [5]:
import os
import gradio as gr
import whisper
from whisper import tokenizer
import time

In [6]:
# Name of the Whisper checkpoint that is currently loaded.
current_size = DEFAULT_MODEL_SIZE
model = whisper.load_model(current_size)
# One dict per transcribed recording: {'chunk': <audio input>, 'result': <decode result>}.
audio_chunks = []

In [7]:
def get_full_transcript(model, audio_chunks):
    """Concatenate the decoded text of every recorded chunk.

    Args:
        model: Not used by this function; kept for the existing call signature.
        audio_chunks: Sequence of dicts whose 'result' entry exposes a
            ``text`` attribute.

    Returns:
        The chunk texts in order, separated by single spaces, or "" when
        there are no chunks.
    """
    if not len(audio_chunks):
        return ""

    return " ".join(entry['result'].text for entry in audio_chunks)

In [8]:
def transcribe_chunk(model, audio_chunks, audio):
    """Decode a single audio recording with Whisper.

    Args:
        model: Loaded Whisper model; the spectrogram is moved to ``model.device``.
        audio_chunks: Previously transcribed chunks (dicts with a 'result'
            entry). Only consulted to find the most recent chunk, which the
            currently disabled ``prompt`` option would use.
        audio: Audio input handed to ``whisper.load_audio``.

    Returns:
        Whatever ``whisper.decode`` returns for the padded/trimmed audio.
    """
    # load audio and pad/trim it to fit 30 seconds
    waveform = whisper.pad_or_trim(whisper.load_audio(audio))

    # make log-Mel spectrogram and move to the same device as the model
    mel = whisper.log_mel_spectrogram(waveform).to(model.device)

    # detect the spoken language (only reported; decoding below is pinned to "en")
    _, probs = model.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")

    # Most recent chunk, or None on the first call. Note this must be
    # audio_chunks[-1]; slicing with [:-1] and taking [0] would select the
    # FIRST chunk (and nothing at all when exactly one chunk exists).
    last_chunk = audio_chunks[-1] if audio_chunks else None

    # decode the audio
    options = whisper.DecodingOptions(
        language="en",
        # Conditioning on the previous chunk's text is intentionally still off:
        #prompt=last_chunk['result'].text if last_chunk else None
        #suppress_tokens
    )

    return whisper.decode(model, mel, options)

In [9]:
def transcribe(chunk, state, delay):
    """Gradio callback: transcribe one recorded chunk and extend the transcript.

    Reads the module-level ``model`` and appends to the module-level
    ``audio_chunks`` list.

    Args:
        chunk: Audio input for this call (the UI is configured with
            ``type="filepath"``). May be None — presumably when the callback
            fires without a recording; in that case nothing is decoded.
        state: Per-session dict with 'transcription' and 'debug' string
            entries; updated in place.
        delay: Value of the "Rate of transcription" slider, in seconds.

    Returns:
        Tuple ``(transcription, debug, state)`` matching the interface outputs.
    """
    print("@transcribe")

    state['debug'] += "@transcribe "

    # Nothing recorded: return the current state instead of crashing in
    # whisper.load_audio(None).
    if chunk is None:
        return state['transcription'], state['debug'], state

    # NOTE(review): the "- 1" comes from the original code ("Why -1?" was left
    # as an open question there). Clamp so a delay below 1 cannot make
    # time.sleep raise ValueError on a negative value.
    time.sleep(max(delay - 1, 0))
    print(f"delaying: {delay}")

    print("Transcribing chunk...")

    result = transcribe_chunk(model, audio_chunks, chunk)

    # audio_chunks is only mutated, never rebound, so no `global` is needed.
    audio_chunks.append({
        'chunk': chunk,
        'result': result
    })

    transcript = get_full_transcript(model, audio_chunks)

    state['transcription'] = transcript

    print("=====================================")
    print(result.text)
    print("-------------------------------------")
    print(transcript)
    print()

    return state['transcription'], state['debug'], state

In [None]:
title = "two-way-speech"
description = "A demo of two-way-speech"

# gr.Slider is the current component API; the legacy gr.inputs.* namespace
# (and its `default=` keyword) is deprecated. `value=` sets the initial position.
delay_slider = gr.Slider(minimum=1, maximum=5, value=DEFAULT_DELAY, label="Rate of transcription")

transcription_tb = gr.Textbox(label="Transcription", lines=10, max_lines=500)

debug_tb = gr.Textbox(label="Debug", lines=50, max_lines=200)

# Per-session dict threaded through `transcribe` as both an input and an output.
state = gr.State({"transcription": "", "debug": ""})

gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True),
        state,
        delay_slider,
    ],
    outputs=[
        transcription_tb,
        debug_tb,
        state
    ],
    live=True,
    allow_flagging='never',
    title=title,
    description=description,
).launch(
    # enable_queue=True,
    debug=True,
    # share=True also requests a public gradio.app URL for this demo.
    share=True
  )

"Started"



Running on local URL:  http://127.0.0.1:7863
Running on public URL: https://29396.gradio.app

This share link expires in 72 hours. For free permanent hosting, check out Spaces: https://huggingface.co/spaces


@transcribe
delaying: 5
Transcribing chunk...
Detected language: en
you
-------------------------------------
you

@transcribe
delaying: 5
Transcribing chunk...
Detected language: en
Okay, see if debug updates is printing.
-------------------------------------
you Okay, see if debug updates is printing.

@transcribe
delaying: 5
Transcribing chunk...
Detected language: en
There we go. Watch what we got. A weird one. Oh well.
-------------------------------------
you Okay, see if debug updates is printing. There we go. Watch what we got. A weird one. Oh well.



In [None]:
# Show the transcript accumulated so far from the module-level audio_chunks list.
get_full_transcript(model, audio_chunks)

In [None]:
import IPython

def display_audio_chunk(index):
    """Return an IPython audio player for the recorded chunk at ``index``.

    Reads the module-level ``audio_chunks`` list.
    """
    entry = audio_chunks[index]
    return IPython.display.Audio(entry['chunk'])

def display_audio_chunks():
    """Render an IPython audio player for every entry in ``audio_chunks``, in order."""
    for entry in audio_chunks:
        player = IPython.display.Audio(entry['chunk'])
        IPython.display.display(player)

In [None]:
# Render an audio player for each chunk recorded during the session.
display_audio_chunks()