In [1]:
! pip install git+https://github.com/openai/whisper.git -q

[K     |████████████████████████████████| 4.9 MB 38.7 MB/s 
[K     |████████████████████████████████| 6.6 MB 54.0 MB/s 
[K     |████████████████████████████████| 163 kB 49.2 MB/s 
[?25h  Building wheel for whisper (setup.py) ... [?25l[?25hdone


In [2]:
import whisper

base_model = whisper.load_model("base")


100%|████████████████████████████████████████| 139M/139M [00:01<00:00, 120MiB/s]


In [3]:
!wget -O audio.mp3 http://www.moviesoundclips.net/movies1/darkknightrises/darkness.mp3

--2022-09-30 10:08:29--  http://www.moviesoundclips.net/movies1/darkknightrises/darkness.mp3
Resolving www.moviesoundclips.net (www.moviesoundclips.net)... 198.54.115.219
Connecting to www.moviesoundclips.net (www.moviesoundclips.net)|198.54.115.219|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 168872 (165K) [audio/mpeg]
Saving to: ‘audio.mp3’


2022-09-30 10:08:30 (141 KB/s) - ‘audio.mp3’ saved [168872/168872]



In [4]:
from IPython.display import Audio
Audio("/content/audio.mp3")

## Original transcription
"Oh, you think darkness is your ally. But you merely adopted the dark. I was born in it, molded by it. I didn't see the light until I was already a man, by then it was nothing to me but blinding!"

In [5]:
result = base_model.transcribe("/content/audio.mp3")
print(result["text"])

 Oh you think darkness is your ally? Are you merely adopted the dark? I was born in it, more lived by it. I didn't see the light until I was already a man, but then it was nothing to me but... I'm a man...


In [7]:

# load audio and pad/trim it to fit 30 seconds
audio = whisper.load_audio("/content/audio.mp3")
audio = whisper.pad_or_trim(audio)

# make log-Mel spectrogram and move to the same device as the model
mel = whisper.log_mel_spectrogram(audio).to(base_model.device)

# detect the spoken language
_, probs = base_model.detect_language(mel)
print(f"Detected language: {max(probs, key=probs.get)}")

# decode the audio
options = whisper.DecodingOptions()
result = whisper.decode(base_model, mel, options)

# print the recognized text
print(result.text)

Detected language: en
Oh, you think darkness is your ally? You merely adopted the dark. I was born in it, more lived by it. I didn't see the light until I was already a man, but then it was nothing to me but bright!


In [8]:
! pip install gradio -q

[K     |████████████████████████████████| 5.3 MB 38.0 MB/s 
[K     |████████████████████████████████| 212 kB 66.8 MB/s 
[K     |████████████████████████████████| 57 kB 5.3 MB/s 
[K     |████████████████████████████████| 270 kB 65.5 MB/s 
[K     |████████████████████████████████| 84 kB 4.0 MB/s 
[K     |████████████████████████████████| 2.3 MB 46.7 MB/s 
[K     |████████████████████████████████| 84 kB 3.4 MB/s 
[K     |████████████████████████████████| 54 kB 3.4 MB/s 
[K     |████████████████████████████████| 112 kB 66.8 MB/s 
[K     |████████████████████████████████| 55 kB 4.0 MB/s 
[K     |████████████████████████████████| 63 kB 2.3 MB/s 
[K     |████████████████████████████████| 80 kB 10.7 MB/s 
[K     |████████████████████████████████| 68 kB 6.4 MB/s 
[K     |████████████████████████████████| 46 kB 4.0 MB/s 
[K     |████████████████████████████████| 856 kB 62.0 MB/s 
[K     |████████████████████████████████| 594 kB 75.2 MB/s 
[K     |████████████████████████████████

In [9]:
import gradio as gr 
import time

In [12]:
def inference(audio):
    
    time.sleep(3)
    # load audio and pad/trim it to fit 30 seconds
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    # make log-Mel spectrogram and move to the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(base_model.device)

        # decode the audio
    options = whisper.DecodingOptions(without_timestamps=True)
    result = whisper.decode(base_model, mel, options)
    return result.text



In [11]:

gr.Interface(
    title = 'Whisper-app', 
    fn=inference, 
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath")
    ],
    outputs=[
        "textbox"
    ],
    live=True).launch()

  "Usage of gradio.inputs is deprecated, and will not be supported in the future, please import your components from gradio.components",


Hint: Set streaming=True for Audio component to use live streaming.
Colab notebook detected. To show errors in colab notebook, set `debug=True` in `launch()`
Your interface requires microphone or webcam permissions - this may cause issues in Colab. Use the External URL in case of issues.
Running on public URL: https://22655.gradio.app

This share link expires in 72 hours. For free permanent hosting, check out Spaces: https://huggingface.co/spaces


(<gradio.routes.App at 0x7fb14e45ed10>,
 'http://127.0.0.1:7860/',
 'https://22655.gradio.app')