# Install

In [None]:
import os

# Install the requirements only once: the INSTALLED marker file is created
# after a successful install, so re-running this cell skips pip entirely.
if not os.path.exists("INSTALLED"):
    # `&&` writes the marker only when pip exits successfully; pip's stdout
    # is redirected to /dev/null to keep the cell output quiet.
    ! pip install -r requirements.txt > /dev/null && touch INSTALLED

In [None]:
from ipdb import set_trace

# Config

In [None]:
# Whisper checkpoint name handed to whisper.load_model() when the model is loaded.
DEFAULT_MODEL_SIZE = "base"
# Initial value of the delay slider in the UI; the slider value reaches
# transcribe() as its `delay` argument.
DEFAULT_DELAY = 2

# Main

In [None]:
import os
import time
from copy import copy
from datetime import datetime
from datetime import timedelta
from timeit import default_timer as timer

import numpy as np

import gradio as gr
import whisper
from whisper import tokenizer

In [None]:
# Load gradio's IPython extension into this notebook session.
%load_ext gradio

In [None]:
# Notebook-wide state shared by the cells below.
current_size = DEFAULT_MODEL_SIZE
# Load the Whisper model once up front; every transcription call reuses it.
model = whisper.load_model(current_size)
# One dict per processed chunk: {'chunk': <audio input>, 'result': <decode result>}.
audio_chunks = []
# Accumulated audio for the streaming variant of transcribe(); stays False
# until the first chunk has been received.
audio_stream = False

In [None]:
import IPython

def display_audio_chunk(index):
    """Return an IPython audio player for the chunk stored at ``index``.

    Reads ``audio_chunks[index]`` from the notebook-level list and wraps its
    ``'chunk'`` entry in ``IPython.display.Audio``.
    """
    return IPython.display.Audio(audio_chunks[index]['chunk'])

def display_audio_chunks():
    """Render an audio player for every entry of the notebook-level ``audio_chunks``."""
    # Lazy generator: each player is built right before it is displayed,
    # keeping the create/display interleaving of a plain loop.
    players = (IPython.display.Audio(entry['chunk']) for entry in audio_chunks)
    for player in players:
        IPython.display.display(player)

In [None]:
def get_full_transcript(model, audio_chunks):
    """Join the decoded text of every chunk into one space-separated string.

    Args:
        model: Unused; kept so the signature stays the same for callers.
        audio_chunks: List of dicts whose ``'result'`` value exposes a
            ``.text`` attribute.

    Returns:
        The chunk texts joined with single spaces, or ``""`` for an empty list.
    """
    return " ".join(entry['result'].text for entry in audio_chunks)

In [None]:
def transcribe_chunk(model, audio_chunks, audio):
    """Decode one audio chunk with Whisper, primed with the previous chunk's text.

    Args:
        model: A loaded Whisper model; its ``device`` receives the spectrogram.
        audio_chunks: Chunks processed so far, each a dict whose ``'result'``
            value exposes ``.text``. Only the most recent entry is consulted.
        audio: Input accepted by ``whisper.load_audio``.

    Returns:
        The value returned by ``whisper.decode`` for this chunk.
    """
    # Load the audio and pad/trim it to the fixed 30-second window.
    samples = whisper.pad_or_trim(whisper.load_audio(audio))

    # Build the log-Mel spectrogram on the same device as the model.
    mel = whisper.log_mel_spectrogram(samples).to(model.device)

    # The caller appends the new chunk only after this function returns, so
    # the last stored entry is the chunk that immediately precedes this one.
    # (The former `audio_chunks[:-1]` + `[0]` picked the oldest chunk instead,
    # and nothing at all when exactly one chunk existed.)
    previous = audio_chunks[-1] if audio_chunks else None
    prompt = previous['result'].text if previous else ""

    # The language is fixed to English; the previous text is passed as prompt.
    options = whisper.DecodingOptions(language="en", prompt=prompt)

    return whisper.decode(model, mel, options)

In [None]:
def transcribe(chunk, state, delay):
    """Transcribe one recorded chunk and return the updated running transcript.

    Waits ``delay`` seconds (printing a progress dot every 0.1 s), decodes the
    chunk with ``transcribe_chunk``, appends it to the notebook-level
    ``audio_chunks`` list and rebuilds the full transcript.

    NOTE: a later cell in this notebook defines another function with the
    same name; whichever cell ran last is the one the UI will call.

    Args:
        chunk: Audio input forwarded unchanged to ``transcribe_chunk``.
        state: Dict with ``'transcription'`` and ``'debug'`` keys; it is
            mutated in place and also returned.
        delay: Seconds to wait before transcribing.

    Returns:
        Tuple ``(transcription, debug, state)``.
    """
    print("@transcribe")

    print(f"delaying: {delay}")

    # Fixed-length ticks so the total wait is `delay` seconds. Sleeping
    # `delay / 10` on each of `delay * 10` iterations waited delay**2 seconds.
    tick_seconds = 0.1
    for _ in range(round(delay * 10)):
        print(".", end='', flush=True)
        time.sleep(tick_seconds)

    print("Transcribing chunk...")

    start = timer()
    result = transcribe_chunk(model, audio_chunks, chunk)
    end = timer()
    print(timedelta(seconds=end - start))

    # Stored only after decoding, so transcribe_chunk sees just the
    # earlier chunks when it builds its prompt.
    audio_chunks.append({
        'chunk': chunk,
        'result': result,
    })

    transcript = get_full_transcript(model, audio_chunks)

    state['debug'] += "@transcribe "
    state['transcription'] = transcript

    print("=====================================")
    print(result.text)
    print("-------------------------------------")
    print(transcript)
    print()

    return state['transcription'], state['debug'], state

In [None]:
def transcribe(chunk, state, delay):
    """Accumulate streamed audio into the notebook-level ``audio_stream``.

    Prints a UTC timestamp, appends the chunk's samples to the audio gathered
    so far, prints the accumulated sample count and returns the state as-is
    (no transcription happens here).

    Args:
        chunk: Indexable pair ``(sample_rate, samples)`` where ``samples`` is
            a NumPy array, or ``None`` when there is no audio to add.
            NOTE(review): a string file path is not supported here - confirm
            the microphone component is configured to deliver NumPy data.
        state: Dict with ``'transcription'`` and ``'debug'`` keys; returned
            unchanged.
        delay: Unused; kept for signature compatibility with the UI inputs.

    Returns:
        Tuple ``(transcription, debug, state)``.
    """
    global audio_stream

    print(datetime.utcnow().isoformat(sep=' ', timespec='milliseconds'))

    # Nothing to accumulate; leave the stream and the state untouched.
    if chunk is None:
        return state['transcription'], state['debug'], state

    sr, samples = chunk[0], chunk[1]

    if not audio_stream:
        # First chunk: start the stream with this chunk's data.
        audio_stream = (sr, samples)
    else:
        # np.concatenate takes ONE sequence of arrays, not separate arguments.
        audio_stream = (sr, np.concatenate((audio_stream[1], samples)))

    # The stream is a 2-tuple, so the samples live at index 1.
    print(audio_stream[1].size)

    return state['transcription'], state['debug'], state

# Interface

In [None]:
# Build the Gradio UI: microphone + delay slider in, transcript + debug log out.
title = "two-wai"
description = "desc"

# Slider whose value is passed to transcribe() as `delay`.
delay_slider = gr.Slider(minimum=0, maximum=10, value=DEFAULT_DELAY, label="Rate of transcription")

# Output box for the running transcript.
transcription_tb = gr.Textbox(label="Transcription", lines=10, max_lines=500)

# Output box for the accumulated debug string.
debug_tb = gr.Textbox(label="Debug", lines=10, max_lines=200)

# State dict handed to transcribe() and returned by it on every call;
# it carries the 'transcription' and 'debug' strings between calls.
state = gr.State({"transcription": "", "debug": ""})

interface = gr.Interface(
    fn=transcribe,
    inputs=[
        # `type` selects the format the audio is converted to before it reaches
        # the prediction function: "numpy" yields a tuple
        # (int sample rate, numpy.array with the data), while
        # "filepath" passes a str path to a temporary file containing the audio.
        # NOTE(review): two cells define `transcribe`; the later one indexes
        # its chunk as (sample_rate, samples), which does not match
        # "filepath" - confirm which variant this UI is meant to drive.
        gr.Microphone(type="filepath", streaming=True),
        state,
        delay_slider,
    ],
    outputs=[
        # Order matches the tuple returned by transcribe():
        # (transcription, debug, state).
        transcription_tb,
        debug_tb,
        state
    ],
    live=True,
    allow_flagging='never',
    title=title,
    description=description,
)

In [None]:
# Start the Gradio app built in the previous cell.
# NOTE(review): the exact effect of these flags depends on the installed
# gradio version (e.g. `enable_queue` may no longer be accepted in newer
# releases) - confirm against the version pinned in requirements.txt.
interface.launch(
    enable_queue=True,
    debug=True,
    share=True,
    inline=False
)

# Debug

In [None]:
# Debug helper: once at least one chunk has been recorded, print the combined
# transcript and render an audio player for every stored chunk.
if audio_chunks:
    transcript = get_full_transcript(model, audio_chunks)
    print(transcript)
    display_audio_chunks()