In [1]:
! pip install git+https://github.com/openai/whisper.git -q
! pip install gradio -q
! pip install jiwer -q

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.4/17.4 MB[0m [31m83.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.0/57.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m288.1/288.1 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.3/75.3 kB[0m [31m10.0

In [2]:
import whisper
import time
# language = "Malay" 
import gradio as gr
from jiwer import wer

In [3]:
# Function to transcribe the speech
def transcribe1(micRecord, audioFileUpload, model_size):
    """Transcribe one audio clip with Whisper and reveal the feedback row.

    Args:
        micRecord: Filepath of the fallback audio input, or None. It is used
            only when ``audioFileUpload`` is None.
        audioFileUpload: Filepath of the preferred audio input, or None.
        model_size: Name of the Whisper checkpoint to load (for example
            "tiny" or "base").

    Returns:
        dict mapping the ``output_transcript`` component to the decoded text
        and ``hiddenRow2`` to an update that makes it visible.

    Raises:
        gr.Error: If neither audio input is set, or if ``model_size`` is not
            a model name known to Whisper (e.g. the dropdown placeholder).

    Side effects:
        Stores the decoded text in the module-level ``hypothesis`` so that
        ``calculate_WER`` can compare it against a reference transcript.
    """
    global hypothesis

    source = audioFileUpload if audioFileUpload is not None else micRecord
    if source is None:
        # Without this guard whisper.load_audio(None) fails with an opaque error.
        raise gr.Error("Please upload an audio file or record your voice before transcribing.")

    if model_size not in whisper.available_models():
        # Catches None and any placeholder text that is not a real model name.
        raise gr.Error("Please select a Whisper model before transcribing.")

    # NOTE: the checkpoint is loaded again on every call.
    model = whisper.load_model(model_size)

    audio = whisper.load_audio(source)
    # whisper.decode handles a single fixed-length window, so the clip is
    # padded or trimmed to Whisper's 30-second window; longer audio is cut off.
    audio = whisper.pad_or_trim(audio)
    # Build the spectrogram with the number of mel bins this checkpoint
    # expects instead of relying on the library default.
    mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)
    hypothesis = result.text

    return {
        output_transcript: hypothesis,
        hiddenRow2: gr.update(visible=True)
    }
        
# Function to calculate the Word Error Rate
def calculate_WER(reference):
    """Compute the Word Error Rate of the latest transcription, in percent.

    Args:
        reference: Ground-truth transcript typed by the user.

    Returns:
        dict mapping the ``output_wer`` component to ``wer(...) * 100``.

    Raises:
        gr.Error: If ``reference`` is empty or only whitespace, or if no
            transcription has been stored in ``hypothesis`` yet.

    Note:
        The transcription is read from the module-level ``hypothesis``
        (written by ``transcribe1``), not passed in as an argument.
    """
    if reference is None or not reference.strip():
        # jiwer rejects an empty reference; report it in the UI instead.
        raise gr.Error("Please enter the reference text transcript before calculating the WER.")

    transcript = globals().get("hypothesis")
    if transcript is None:
        raise gr.Error("Please transcribe an audio recording before calculating the WER.")

    error = wer(reference, transcript) * 100

    return {
        output_wer: error
    }

# Reveal the WER calculation section
def vis_calculate_WER():
    """Return a Gradio update that makes the bound output component visible."""
    show_section = gr.update(visible=True)
    return show_section

# Hide the conclusion message
def vis_hide_concl():
    """Return a Gradio update that hides the bound output component."""
    hide_section = gr.update(visible=False)
    return hide_section

# Reveal the conclusion message
def vis_conclude():
    """Return a Gradio update that makes the bound output component visible."""
    show_section = gr.update(visible=True)
    return show_section

# Hide the WER calculation section
def vis_hide_calc():
    """Return a Gradio update that hides the bound output component."""
    hide_section = gr.update(visible=False)
    return hide_section

# "Overview" page: a single static Markdown panel describing the project,
# the Whisper model sizes, and the WER formula. It has no inputs or handlers.
with gr.Blocks() as systemOverview:

  # User-facing text; the HTML <br> tags are part of the rendered content.
  gr.Markdown(
        """# Benchmarking Whisper OpenAI For Sarawak Languages
        <br> The Whisper model that is based on the end-to-end (E2E), encoder-decoder transformer has the potential to serve as a very effective model in facilitating rapid breakthroughs in automatic speech recognition (ASR) for under-resourced languages of Sarawak, namely the Sarawak Malay, Iban, Melanau, and the Bidayuh dialects of Jagoi and Bukar Sadong. This developed system integrates the aforementioned model, Whisper, and the JIWER package with the aim of measuring the recognition accuracy of Whisper on under-resourced Sarawak languages. The results may then serve as a benchmark to indicate the recent advances in ASR for under-resourced Sarawak languages.   
        
        
        <br> Fundamentally, there are a number of models for Whisper that correspond to the parameter size namely the tiny, base, small, medium, and large models. They are distinguished according to the size of parameters that are 39 million, 74 million, 244 million, 769 million and 1.55 billion respectively.
        
        
        <br> The accuracy of Whisper for Sarawak languages is evaluated based on the Word Error Rate (WER). This evaluation metric consists of word substitutions (S), deletions (D), insertions (I), and number of words (N), where it is calculated by summing the word substitutions, deletions, and insertions together, subsequently dividing the result by the number of words. In essence, the WER cannot be a negative number, but it could be above 100% in instances where there exist more errors than words in the reference. The formula is as follows: 
        
        WER = (S + D + I) / N

    
        <br> Created by Gerald E.
        """
  )


# "Transcriber" page: pick a model, supply audio, transcribe it, and
# optionally compute the WER against a user-supplied reference transcript.
with gr.Blocks() as audioRec:

  gr.Markdown(
        """# Benchmarking Whisper OpenAI For Sarawak Languages
        <br> User Guide
        1. Select a Whisper model from the dropdown options according to the parameter size.
        2. Upload an audio file (M4A, MP3, MP4, MPEG, MPGA, WAV and WEBM) OR record your voice on your microphone.
        3. Hit the "Transcribe" button to prompt Whisper to transcribe the speech recording.
        4. You may also calculate the WER for an inaccurate transcription upon obtaining Whisper's transcription by hitting the "Inaccurate Transcription, Calculate WER" button.
        5. Enter the reference text transcript, then hit the "Calculate" button.        
        """
  )

  # NOTE: the initial value is placeholder text that is not one of the
  # choices; it is passed to transcribe1 unchanged if no model is picked.
  input_model_type = gr.Dropdown(label="Select The Whisper Model According To Parameter Size",value="Please Select A Model",choices=["tiny", "base", "small", "medium", "large"])
  input_audio1 = gr.Audio(source="upload", type="filepath", label="Upload Audio File")
  gr.Markdown("OR")
  input_audio2 = gr.Audio(label="Record Your Voice On Your Microphone",source="microphone", type="filepath")
  btn_submit2  = gr.Button("Transcribe")
  output_transcript = gr.Textbox(type="text", label="Whisper's Transcription Result")

  # Feedback row, revealed by transcribe1 once a transcription exists.
  with gr.Column(visible=False) as hiddenRow2:
      gr.Markdown("Is Whisper's transcription accurate?")
      with gr.Row() as selectionBtn:
        btn_submit3 = gr.Button("Accurate Transcription")
        btn_submit4 = gr.Button("Inaccurate Transcription, Calculate WER")

  # WER calculation section, revealed by the "Inaccurate Transcription" button.
  with gr.Column(visible=False) as hiddenRow3: 
      input_ref = gr.Textbox(label="Reference Text Transcript", placeholder="Please Enter The Reference Text Transcript")
      btn_submit5 = gr.Button("Calculate")
      output_wer = gr.Textbox(label="Word Error Rate (%)")

  # Closing message, revealed by the "Accurate Transcription" button.
  with gr.Row(visible=False) as hiddenRowConclude: 
      gr.Markdown("Great! Thank you for using this transcriber, be seeing you.")

  # Inputs are passed positionally to transcribe1(micRecord, audioFileUpload,
  # model_size): the microphone recording must come first and the uploaded
  # file second, so that the upload is preferred when both are present.
  btn_submit2.click(
        transcribe1,
        [input_audio2, input_audio1, input_model_type],
        [output_transcript, hiddenRow2]
    )

  # "Accurate": show the closing message and hide the WER section.
  btn_submit3.click(
        vis_conclude,
        [],
        hiddenRowConclude
    )

  btn_submit3.click(
        vis_hide_calc,
        [],
        hiddenRow3
    )

  # "Inaccurate": show the WER section and hide the closing message.
  btn_submit4.click(
        vis_calculate_WER,
        [],
        hiddenRow3
    )
  
  btn_submit4.click(
        vis_hide_concl,
        [],
        hiddenRowConclude
    )

  btn_submit5.click(
        calculate_WER,
        input_ref,
        output_wer
    )


# Bundle the two pages into one tabbed app and serve it via a public share link.
demo1 = gr.TabbedInterface(
    interface_list=[systemOverview, audioRec],
    tab_names=["Overview", "Transcriber"],
)

demo1.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://2c1a94584b8a44fe55.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces


