# Exercise 4
This exercise performs speech recognition on 22 audio files using Mozilla DeepSpeech and reports the word error rate (WER) of each transcription. Additional audio processing (pitch shifting and gain) is applied to some of the audio files to improve their WER.

First, we need to install and import the libraries required by the application.

In [None]:
pip install deepspeech

In [None]:
pip install librosa --user

In [None]:
pip install pydub

In [None]:
pip install prettytable

In [None]:
# Import libraries
import os
from deepspeech import Model
import librosa as lr
import numpy as np
import pydub
from IPython.display import Audio
from prettytable import PrettyTable

Next, the file paths of the models, scorers, and audio files are declared for each of the languages (English, Spanish and Italian). Sentences for each of the audio files are also declared for WER calculation.

In [None]:
# Configuration for every language: model path, scorer path, and the clips to
# transcribe. Each clip is written as a (audio file path, reference sentence)
# pair so a recording and the text used to score it always stay side by side;
# the parallel *_audio_files / *_sentences lists are derived from the pairs.
#
# NOTE(review): a few reference sentences below (e.g. "ahora es miedo",
# "dove e il pancone") read like recognizer output rather than the spoken
# prompt -- confirm them against the recordings.

# English language configuration
en_model = "Models/deepspeech-0.9.3-models.pbmm"
en_scorer = "Models/deepspeech-0.9.3-models.scorer"
_en_clips = [
    ("Audio_Files/EN/checkin.wav", "where is the check in desk"),
    ("Audio_Files/EN/checkin_child.wav", "where is the check in desk"),
    ("Audio_Files/EN/parents.wav", "i have lost my parents"),
    ("Audio_Files/EN/parents_child.wav", "i have lost my parents"),
    ("Audio_Files/EN/suitcase.wav", "please i have lost my suitcase"),
    ("Audio_Files/EN/suitcase_child.wav", "please i have lost my suitcase"),
    ("Audio_Files/EN/what_time.wav", "what time is my plane"),
    ("Audio_Files/EN/what_time_child.wav", "what time is my plane"),
    ("Audio_Files/EN/where.wav", "where are the restaurants and shops"),
    ("Audio_Files/EN/where_child.wav", "where are the restaurants and shops"),
]
en_audio_files = [clip_path for clip_path, _ in _en_clips]
en_sentences = [clip_text for _, clip_text in _en_clips]

# Spanish language configuration
es_model = "Models/output_graph_es.pbmm"
es_scorer = "Models/kenlm_es.scorer"
_es_clips = [
    ("Audio_Files/ES/checkin_es.wav", "donde estan los mostradores"),
    ("Audio_Files/ES/parents_es.wav", "he perdido a mis padres"),
    ("Audio_Files/ES/suitcase_es.wav", "por favor he perdido mi maleta"),
    ("Audio_Files/ES/what_time_es.wav", "ahora es miedo"),
    ("Audio_Files/ES/where_es.wav", "donde estan los restaurantes en las tierras"),
]
es_audio_files = [clip_path for clip_path, _ in _es_clips]
es_sentences = [clip_text for _, clip_text in _es_clips]

# Italian language configuration
it_model = "Models/output_graph_it.pbmm"
it_scorer = "Models/kenlm_it.scorer"
_it_clips = [
    ("Audio_Files/IT/checkin_it.wav", "dove e il pancone"),
    ("Audio_Files/IT/parents_it.wav", "ho perso i miei genitori"),
    ("Audio_Files/IT/suitcase_it.wav", "per favore ho perso la mia valigia"),
    ("Audio_Files/IT/what_time_it.wav", "a che ora e mio aereo"),
    ("Audio_Files/IT/where_it.wav", "dove sono ristoranti negozi"),
]
it_audio_files = [clip_path for clip_path, _ in _it_clips]
it_sentences = [clip_text for _, clip_text in _it_clips]

# Recorded sentences configuration (no model of their own is declared here)
_recorded_clips = [
    ("Audio_Files/your_sentence1.wav", "where is the nearest toilet"),
    ("Audio_Files/your_sentence2.wav", "how do i get to gate twenty five"),
]
recorded_audio_files = [clip_path for clip_path, _ in _recorded_clips]
recorded_sentences = [clip_text for _, clip_text in _recorded_clips]

The file paths of the models, scorers and audio files are then checked using assertions to ensure that they exist.

In [None]:
# Assertions to check the existence of required files.
# Every path is checked in the same order as it is declared: per language the
# model, then the scorer, then the audio files; the recorded sentences last.
# The first missing path aborts the cell with "<path> not found".
required_files = [
    en_model, en_scorer, *en_audio_files,
    es_model, es_scorer, *es_audio_files,
    it_model, it_scorer, *it_audio_files,
    *recorded_audio_files,
]

for required_file in required_files:
    assert os.path.exists(required_file), f"{required_file} not found"

If none of the assertions fail, the models are loaded into the application.

In [None]:
# Load DeepSpeech models for English, Spanish, and Italian
def _load_model(model_path, scorer_path):
    """Create a DeepSpeech Model from model_path and enable the external scorer at scorer_path."""
    loaded_model = Model(model_path)
    loaded_model.enableExternalScorer(scorer_path)
    return loaded_model


en_ds = _load_model(en_model, en_scorer)
es_ds = _load_model(es_model, es_scorer)
it_ds = _load_model(it_model, it_scorer)

The speech_recognition function performs speech recognition on each of the audio files.

The function takes in a list of audio files, a model and a list of correct sentences, and performs speech-to-text (STT) on the audio files using DeepSpeech. The interpreted sentences are then compared with the correct sentences to calculate the WER. Finally, a list of all the WERs is returned.

Audio effects such as pitch shifting and gain are applied to some of the audio files before STT to improve their WER.

In [None]:
# Function for speech recognition using DeepSpeech

# Base names of the recordings that are pitch-shifted before recognition.
_PITCH_SHIFTED_FILES = frozenset({"checkin.wav", "checkin_child.wav", "suitcase_child.wav"})
# Base names of the recordings whose volume is raised before recognition.
_BOOSTED_FILES = frozenset({"what_time_it.wav"})
# Amount of pitch shift, in semitones (negative lowers the pitch).
_PITCH_SHIFT_SEMITONES = -2
# Amount of gain, in dB, added to the boosted recordings.
_BOOST_DB = 10


def _load_pcm16(path, sample_rate):
    """Load the audio file at path with librosa at sample_rate and return int16 samples."""
    samples = lr.load(path, sr=sample_rate)[0]
    # Scale the floating-point samples to the 16-bit range the model is fed with
    # (assumes librosa returns values within [-1, 1] -- TODO confirm).
    return (samples * 32767).astype(np.int16)


def _shift_pitch(samples, semitones):
    """Resample samples by linear interpolation with a step of 2 ** (semitones / 12).

    A step below 1 (negative semitones) produces more samples than the input;
    at an unchanged frame rate that lowers the pitch and lengthens the clip.
    """
    if len(samples) == 0:
        # np.interp cannot interpolate over an empty signal.
        return samples
    step = 2 ** (semitones / 12)
    positions = np.arange(0, len(samples), step)
    return np.interp(positions, np.arange(len(samples)), samples).astype(np.int16)


def _apply_gain(samples, sample_rate, gain_db):
    """Return samples with gain_db of gain applied through pydub."""
    segment = pydub.AudioSegment(samples.tobytes(), frame_rate=sample_rate,
                                 sample_width=2, channels=1)
    # pydub treats "segment + number" as a volume change in dB.
    return np.array((segment + gain_db).get_array_of_samples())


def _word_error_rate(reference, hypothesis):
    """Return the word error rate of hypothesis against reference, in percent.

    The error count is the word-level edit distance (substitutions, deletions
    and insertions), so word order matters. With an empty reference the rate
    is 0.0 when the hypothesis is empty too and 100.0 otherwise.
    """
    reference_words = reference.split()
    hypothesis_words = hypothesis.split()
    if not reference_words:
        return 0.0 if not hypothesis_words else 100.0

    # Edit distance computed row by row; previous_row[k] is the distance between
    # the reference words consumed so far and the first k hypothesis words.
    previous_row = list(range(len(hypothesis_words) + 1))
    for row, reference_word in enumerate(reference_words, start=1):
        current_row = [row]
        for col, hypothesis_word in enumerate(hypothesis_words, start=1):
            substitution = previous_row[col - 1] + (reference_word != hypothesis_word)
            deletion = previous_row[col] + 1
            insertion = current_row[col - 1] + 1
            current_row.append(min(substitution, deletion, insertion))
        previous_row = current_row

    return previous_row[-1] / len(reference_words) * 100


def speech_recognition(audio_files, ds, sentences):
    """Transcribe each audio file with DeepSpeech and score it against its sentence.

    For every file the audio is loaded at the model's sample rate, optionally
    pitch-shifted or boosted (selected by the file's base name), transcribed
    with ds.stt, and compared with the matching entry of sentences. The file
    name, an audio player, both texts and the WER are displayed for each file.

    Args:
        audio_files: Paths of the audio files to transcribe.
        ds: DeepSpeech model providing sampleRate() and stt().
        sentences: Reference sentence for each audio file, in the same order.

    Returns:
        A list with the WER (in percent) of each audio file.

    Raises:
        ValueError: If there are fewer sentences than audio files.
    """
    if len(sentences) < len(audio_files):
        raise ValueError("each audio file needs a reference sentence")

    # Initialize a list to store Word Error Rates (WERs)
    wers = []

    for audio_file, sentence in zip(audio_files, sentences):
        # Identify the file by its base name so the directory layout is irrelevant
        file_name = os.path.basename(audio_file)
        sample_rate = ds.sampleRate()

        # Load and preprocess the audio file
        samples = _load_pcm16(audio_file, sample_rate)

        # Audio-specific processing based on file names
        if file_name in _PITCH_SHIFTED_FILES:
            samples = _shift_pitch(samples, _PITCH_SHIFT_SEMITONES)
        elif file_name in _BOOSTED_FILES:
            samples = _apply_gain(samples, sample_rate, _BOOST_DB)

        # Perform speech-to-text (STT) using DeepSpeech
        res = ds.stt(samples)

        # Calculate Word Error Rate (WER) between the original and interpreted texts
        wer = _word_error_rate(sentence, res)
        wers.append(wer)

        # Display information about the current recognition
        print(file_name)
        display(Audio(audio_file))
        print(f"Original text: {sentence}")
        print(f"Interpreted text: {res}")
        print(f"WER: {wer}%")
        print("--------------------------------------------------------")

    return wers

Lastly, the speech_recognition function is called on the audio files, and their WERs are displayed in a table using PrettyTable.

In [None]:
# Run speech recognition for every group of audio files and tabulate the WERs.
def _recognize_group(heading, group_files, ds, group_sentences):
    """Print heading underlined with dashes, then return the WERs of group_files."""
    print(heading)
    print("-" * len(heading))
    return speech_recognition(group_files, ds, group_sentences)


# Perform speech recognition for English audio files
en_wers = _recognize_group("English", en_audio_files, en_ds, en_sentences)

# Perform speech recognition for Spanish audio files
es_wers = _recognize_group("Spanish", es_audio_files, es_ds, es_sentences)

# Perform speech recognition for Italian audio files
it_wers = _recognize_group("Italian", it_audio_files, it_ds, it_sentences)

# Perform speech recognition for recorded sentences in English
recorded_wers = _recognize_group("Recorded sentences", recorded_audio_files, en_ds, recorded_sentences)

# Create a PrettyTable to display the WERs in a tabular format
table = PrettyTable(["Language", "File", "WER"])

# One row per audio file; the recorded sentences are listed as English because
# they are transcribed with the English model.
for language, group_files, group_wers in (
    ("English", en_audio_files, en_wers),
    ("Spanish", es_audio_files, es_wers),
    ("Italian", it_audio_files, it_wers),
    ("English", recorded_audio_files, recorded_wers),
):
    for audio_file, wer in zip(group_files, group_wers):
        # The base name is used instead of a fixed slice offset so the File
        # column stays correct if the audio directories are renamed or moved.
        table.add_row([language, os.path.basename(audio_file), f"{wer}%"])

# Display the final table
print(table)