In [1]:
%%capture
!pip install gradio gtts pydub faster-whisper jiwer tensorflow matplotlib numpy 

In [None]:
import os
import gradio as gr
import moonshine
import warnings
import tempfile
import time
import matplotlib.pyplot as plt
import jiwer
import numpy as np
from gtts import gTTS
from pydub import AudioSegment
from faster_whisper import WhisperModel

In [3]:
# Notebook-wide configuration.
# Faster-Whisper model size identifier.
model_name = "tiny"
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')
# Chart PNGs are written under ./charts; make sure the folder exists.
os.makedirs('charts', exist_ok=True)

In [4]:
# Per-run history shared by the functions below. Re-running this cell resets it.
# Transcription latency of each run, in seconds.
cumulative_moonshine_times, cumulative_faster_whisper_times = [], []
# Word error rate of each run.
cumulative_moonshine_wers, cumulative_faster_whisper_wers = [], []
# Latency gap between the two models for each run (relative and absolute).
cumulative_percentage_differences, cumulative_absolute_differences = [], []

In [5]:
def google_text_to_speech(text: str, language: str = 'en') -> str:
    """Converts text to speech using Google Text To Speech and saves it as a .wav file.

    gTTS only produces MP3 data, so the speech is first saved as a temporary
    MP3 and then re-encoded to a genuine WAV file with pydub. Previously the
    MP3 bytes were written straight into a file that merely carried a ".wav"
    suffix.

    Args:
        text (str): The text to convert to speech.
        language (str): The language code for the speech. Default is 'en' for English.

    Returns:
        str: The path to the saved .wav audio file. The file is not deleted
            automatically; the caller owns it.
    """
    tts = gTTS(text=text, lang=language, slow=False)

    # Reserve a path for the intermediate MP3; the handle is closed right away
    # so gTTS can reopen the file by name.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as mp3_handle:
        mp3_path = mp3_handle.name

    try:
        tts.save(mp3_path)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as wav_handle:
            wav_path = wav_handle.name
        # Re-encode so the returned file really contains WAV audio.
        AudioSegment.from_file(mp3_path, format="mp3").export(wav_path, format="wav").close()
    finally:
        # The MP3 is only an intermediate artefact; never leave it behind.
        os.remove(mp3_path)

    return wav_path

In [6]:
def transcribe_with_moonshine(audio_file: str) -> tuple[str, float]:
    """Transcribes audio using the Moonshine model.

    The audio is cut into consecutive segments of at most 30 seconds, each
    segment is exported as a WAV file and transcribed on its own, and the
    partial transcriptions are joined with single spaces. Segment files live
    in a scratch directory that is removed before returning, so repeated runs
    no longer pile up temporary files.

    Args:
        audio_file (str): The path to the audio file to transcribe.

    Returns:
        tuple: A tuple containing the transcribed text and processing time
            in seconds. The time covers segment export and transcription,
            not the initial load of ``audio_file``.
    """
    audio = AudioSegment.from_file(audio_file)
    segment_duration = 30 * 1000  # pydub lengths and slices are in milliseconds
    transcriptions = []

    with tempfile.TemporaryDirectory() as scratch_dir:
        start_time = time.time()

        for index, offset in enumerate(range(0, len(audio), segment_duration)):
            segment_path = os.path.join(scratch_dir, f"segment_{index}.wav")
            segment = audio[offset:offset + segment_duration]
            # export() returns the file handle it opened; close it explicitly
            # so the scratch directory can be removed on every platform.
            segment.export(segment_path, format="wav").close()
            # moonshine.transcribe returns a sequence; the first item is used.
            transcriptions.append(moonshine.transcribe(segment_path)[0])

        # Stop the clock before the scratch directory is cleaned up.
        processing_time = time.time() - start_time

    return ' '.join(transcriptions), processing_time

In [7]:
def transcribe_with_faster_whisper(audio_file: str, model_name: str) -> tuple[str, float]:
    """Run Faster-Whisper on an audio file and time the transcription.

    The model is instantiated on CPU with int8 computation before the timer
    starts, so model loading is not part of the reported latency.

    Args:
        audio_file (str): Path of the audio file to transcribe.
        model_name (str): Model size or path handed to ``WhisperModel``.

    Returns:
        tuple: ``(transcription, processing_time)``: the segment texts joined
            with single spaces, and the elapsed wall-clock seconds.
    """
    whisper = WhisperModel(model_size_or_path=model_name, device="cpu", compute_type="int8")

    started_at = time.time()
    chunks, _info = whisper.transcribe(audio_file, beam_size=5, language='en')
    # faster-whisper documents `chunks` as a lazy generator, so the join must
    # stay inside the timed region.
    text = ' '.join(chunk.text for chunk in chunks)
    elapsed = time.time() - started_at

    return text, elapsed

In [8]:
def calculate_wer(reference: str, hypothesis: str) -> tuple[float, str]:
    """Calculates the Word Error Rate (WER) between reference and hypothesis transcriptions.

    Both texts go through the same normalization before scoring: lower-casing,
    expansion of common English contractions, punctuation removal, collapsing
    of repeated spaces, and trimming of surrounding whitespace.

    Args:
        reference (str): The reference transcription.
        hypothesis (str): The transcribed hypothesis.

    Returns:
        tuple: A tuple containing the WER and the normalized hypothesis transcription.

    Note:
        A reference that is empty after normalization cannot be scored; jiwer
        is expected to raise an error in that case, so callers should reject
        empty input beforehand.
    """
    # Order matters:
    #  - lower-case first, because jiwer's contraction patterns are written in
    #    lower case and would otherwise mis-expand capitalised forms;
    #  - expand contractions before punctuation removal deletes the apostrophes;
    #  - clean up whitespace last, since removing punctuation (e.g. a
    #    free-standing dash) can leave consecutive spaces behind.
    transformation = jiwer.Compose([
        jiwer.ToLowerCase(),
        jiwer.ExpandCommonEnglishContractions(),
        jiwer.RemovePunctuation(),
        jiwer.RemoveMultipleSpaces(),
        jiwer.Strip(),
        jiwer.RemoveEmptyStrings(),
    ])

    transformed_reference = transformation(reference)
    transformed_hypothesis = transformation(hypothesis)

    error_rate = jiwer.wer(reference=transformed_reference, hypothesis=transformed_hypothesis)
    return error_rate, transformed_hypothesis

In [9]:
def create_comparison_chart(moonshine_time: float, faster_whisper_time: float, wer_moonshine: float, wer_whisper: float) -> str:
    """Creates a comparison chart for latency and WER.

    Draws two side-by-side bar charts (latency on the left, WER on the right)
    and saves them as one PNG, overwriting the chart from the previous run.
    The figure is closed after saving so repeated runs do not accumulate open
    matplotlib figures.

    Args:
        moonshine_time (float): Processing time for Moonshine.
        faster_whisper_time (float): Processing time for Faster-Whisper.
        wer_moonshine (float): WER for Moonshine.
        wer_whisper (float): WER for Faster-Whisper.

    Returns:
        str: The path to the saved comparison chart image.
    """
    models = ['Moonshine', 'Faster-Whisper Tiny']
    chart_path = 'charts/latency_and_wer_comparison.png'

    # Keep the function usable even if the setup cell was skipped.
    os.makedirs('charts', exist_ok=True)

    # Explicit figure/axes objects avoid relying on pyplot's implicit
    # "current figure" state.
    fig, (latency_ax, wer_ax) = plt.subplots(1, 2, figsize=(12, 6))
    try:
        latency_ax.bar(models, [moonshine_time, faster_whisper_time], color=['red', 'yellow'])
        latency_ax.set_ylabel('Processing Time (seconds)')
        latency_ax.set_title('Latency Comparison')

        wer_ax.bar(models, [wer_moonshine, wer_whisper], color=['orange', 'green'])
        wer_ax.set_ylabel('Word Error Rate (WER)')
        wer_ax.set_title('WER Comparison')

        fig.tight_layout()
        fig.savefig(chart_path)
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)

    return chart_path

In [10]:
def update_cumulative_statistics(moonshine_time: float, faster_whisper_time: float, wer_moonshine: float, wer_whisper: float) -> None:
    """Record one run's latency and WER figures in the module-level history lists.

    Args:
        moonshine_time (float): Processing time for Moonshine.
        faster_whisper_time (float): Processing time for Faster-Whisper.
        wer_moonshine (float): WER for Moonshine.
        wer_whisper (float): WER for Faster-Whisper.
    """
    # Pair each history list with the value that belongs to it, then append.
    run_record = (
        (cumulative_moonshine_times, moonshine_time),
        (cumulative_faster_whisper_times, faster_whisper_time),
        (cumulative_moonshine_wers, wer_moonshine),
        (cumulative_faster_whisper_wers, wer_whisper),
    )
    for history, value in run_record:
        history.append(value)

In [11]:
def create_cumulative_chart() -> str:
    """Creates a cumulative chart for average latency and WER.

    Averages the module-level run histories and draws two side-by-side bar
    charts (average latency on the left, average WER on the right). The PNG
    overwrites the one from the previous call, and the figure is closed after
    saving so repeated runs do not accumulate open matplotlib figures.

    Returns:
        str: The path to the saved cumulative chart image.
    """
    # The histories are only read here, so no `global` declaration is needed.
    models = ['Moonshine', 'Faster-Whisper Tiny']
    chart_path = 'charts/cumulative_latency_and_wer_comparison.png'

    avg_moonshine_time = np.mean(cumulative_moonshine_times)
    avg_faster_whisper_time = np.mean(cumulative_faster_whisper_times)
    avg_moonshine_wer = np.mean(cumulative_moonshine_wers)
    avg_faster_whisper_wer = np.mean(cumulative_faster_whisper_wers)

    # Keep the function usable even if the setup cell was skipped.
    os.makedirs('charts', exist_ok=True)

    # Explicit figure/axes objects avoid relying on pyplot's implicit
    # "current figure" state.
    fig, (latency_ax, wer_ax) = plt.subplots(1, 2, figsize=(12, 6))
    try:
        latency_ax.bar(models, [avg_moonshine_time, avg_faster_whisper_time], color=['red', 'yellow'])
        latency_ax.set_ylabel('Average Processing Time (seconds)')
        latency_ax.set_title('Average Latency Over All Runs')

        wer_ax.bar(models, [avg_moonshine_wer, avg_faster_whisper_wer], color=['orange', 'green'])
        wer_ax.set_ylabel('Average WER')
        wer_ax.set_title('Average WER Over All Runs')

        fig.tight_layout()
        fig.savefig(chart_path)
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)

    return chart_path


In [12]:
def main(text: str = None) -> tuple:
    """Main function to perform text-to-speech, transcription, WER calculation, and chart creation.

    Args:
        text (str, optional): Input text for conversion.

    Returns:
        tuple: Always seven items, in the order of the Gradio outputs:
            audio file path, normalized Moonshine transcription, normalized
            Faster-Whisper transcription, per-run statistics text, per-run
            chart path, cumulative statistics text, cumulative chart path.
            When ``text`` is missing, empty, or only whitespace, nothing is
            synthesized and the statistics slot carries a prompt asking for
            input.
    """
    # Gradio submits an empty textbox as "" rather than None, and the
    # interface expects one value per output component, so blank input gets a
    # full-width placeholder result.
    if text is None or not text.strip():
        return (None, "", "", "Please provide text input.", None, "", None)

    audio_file = google_text_to_speech(text)

    moonshine_transcription, moonshine_time = transcribe_with_moonshine(audio_file)
    faster_whisper_transcription, faster_whisper_time = transcribe_with_faster_whisper(audio_file, "tiny")

    wer_moonshine, normalized_moonshine_transcription = calculate_wer(text, moonshine_transcription)
    wer_whisper, normalized_faster_whisper_transcription = calculate_wer(text, faster_whisper_transcription)

    update_cumulative_statistics(moonshine_time, faster_whisper_time, wer_moonshine, wer_whisper)

    # Latency gap, expressed relative to the faster of the two models.
    fastest_time = min(moonshine_time, faster_whisper_time)
    absolute_difference = abs(moonshine_time - faster_whisper_time)
    percentage_difference = (absolute_difference / fastest_time) * 100 if fastest_time > 0 else 0

    cumulative_percentage_differences.append(percentage_difference)
    cumulative_absolute_differences.append(absolute_difference)

    chart_file = create_comparison_chart(moonshine_time, faster_whisper_time, wer_moonshine, wer_whisper)
    cumulative_chart_file = create_cumulative_chart()

    # The percentage is divided by 100 so it is displayed as a multiplier ("x").
    statistics = (f"Percentage Difference Latency: {percentage_difference/100:.2f}x\n"
                  f"Absolute Difference Latency: {absolute_difference:.2f} seconds\n"
                  f"\n"
                  f"Moonshine Latency: {moonshine_time:.2f} seconds\n"
                  f"Faster-Whisper Tiny Latency: {faster_whisper_time:.2f} seconds\n"
                  f"\n"
                  f"WER Moonshine: {wer_moonshine*100:.2f}%\n"
                  f"WER Faster-Whisper Tiny: {wer_whisper*100:.2f}%")

    total_runs = len(cumulative_moonshine_times)
    cumulative_statistics = (f"Total Runs: {total_runs}\n"
                             f"\n"
                             f"Average Percentage Difference Latency: {np.mean(cumulative_percentage_differences)/100:.2f}x\n"
                             f"Average Absolute Difference Latency: {np.mean(cumulative_absolute_differences):.2f} seconds\n"
                             f"\n"
                             f"Average Moonshine Latency: {np.mean(cumulative_moonshine_times):.2f} seconds\n"
                             f"Average Faster-Whisper Tiny Latency: {np.mean(cumulative_faster_whisper_times):.2f} seconds\n"
                             f"\n"
                             f"Average Moonshine WER: {np.mean(cumulative_moonshine_wers)*100:.2f}%\n"
                             f"Average Faster-Whisper Tiny WER: {np.mean(cumulative_faster_whisper_wers)*100:.2f}%")

    return (audio_file, normalized_moonshine_transcription, normalized_faster_whisper_transcription,
            statistics, chart_file, cumulative_statistics, cumulative_chart_file)

In [None]:
# Build the Gradio UI: one text input feeding `main`, whose seven return values
# map one-to-one, in order, onto the output components below.
iface = gr.Interface(
    fn=main,
    inputs=[
        gr.Textbox(lines=5, placeholder="Enter text here (English only)", label="📝 Text Input (English only)")
    ],
    outputs=[
        gr.Audio(label="🔊 Google Text to Speech Audio Output"),
        gr.Textbox(label="📝 Moonshine Transcription Output"),
        gr.Textbox(label="📝 Faster-Whisper Tiny Transcription Output"),
        gr.Textbox(label="📈 Statistics Output"),
        gr.Image(label="📊 Chart"),
        gr.Textbox(label="📈 Cumulative Statistics Output"),
        gr.Image(label="📊 Cumulative Chart")
    ],
    title="Audio Transcription Benchmark: Moonshine vs Faster-Whisper Tiny",
    # Raw string: the text contains "\(" and "\)", which are invalid escape
    # sequences in a normal string literal. The resulting text is unchanged.
    description=r"""
    Compare Latency and WER for Each Transcription Run: The comparator works by taking textual inputs that are converted into .wav audio files using Google Text To Speech 🔊. These audio files are then used as inputs for the two models, Moonshine 🗣️ and Whisper Tiny 💬. Each model will be evaluated and compared for individual runs and across all runs that the user wants to conduct, providing both single and cumulative statistics Charts 📊 regarding latency and Word Error Rate (WER) 📈. The WER is evaluated using the **jiwer** library, which assesses the accuracy of the transcription by comparing the reference input text with the generated transcription. The WER calculation is based on the formula:

    WER = (S + D + I) / N

    where \( S \) is the number of substitutions, \( D \) is the number of deletions, \( I \) is the number of insertions, and \( N \) is the total number of words in the reference text.

    Before the evaluation, several normalization steps are applied to the transcription results to ensure consistency and accuracy ✨. These steps include expanding common English contractions to their full forms, removing any empty strings, converting all text to lowercase, eliminating multiple spaces, stripping leading and trailing whitespace, and removing punctuation to create a standardized basis for comparison and enhancing the reliability of the WER measurement.

    Additionally, you can save the results (statistics and charts) of each run and cumulative results by pressing the "Flag" button. The results will be saved in the local `.gradio` directory, in subdirectories corresponding to the fields and chat IDs.
    """
    )

# NOTE(review): share=True asks Gradio for a publicly reachable link — confirm
# that exposing this demo outside the local machine is intended.
iface.launch(share=True)