# ASR Fine Tuning

In this notebook I will record myself reading pieces of text so that I can evaluate ASR on my voice and build some fine tuning data for ASR also.

In [1]:
import os
import sys
from glob import glob
import librosa
import ipywidgets
import IPython.display as ipd
import numpy as np
import sounddevice as sd
import soundfile as sf
import string
from tqdm.notebook import tqdm
import wave

# For accessing transcribers later
sys.path.append(os.path.dirname(os.getcwd()))

In [2]:
with open('ASR_Sentences.txt') as f:
    text_sents = [line.strip() for line in f]
len(text_sents)

99

In [3]:
import pyaudio
import wave
import IPython.display as ipd
import ipywidgets as widgets
import threading

recording_thread = None
stream = None
def record():
    global stream
    while recording:
        data = stream.read(1024, exception_on_overflow=False)
        frames.append(data)

# Function to start recording audio
def start_recording(button):
    global recording, current_sentence_index, frames, recording_thread, stream
    recording = True
    button_start.disabled = True
    button_stop.disabled = False
    frames = []
    sentence_text.value = sentences[current_sentence_index]
    stream = p.open(format=pyaudio.paInt16,
                channels=1,
                rate=16000,
                input=True,
                frames_per_buffer=1024,
                stream_callback=None)
    stream.start_stream()
    recording_thread = threading.Thread(target=record)
    recording_thread.start()

# Function to stop recording audio
def stop_recording(button):
    global recording, current_sentence_index, recording_thread, stream
    recording = False
    recording_thread.join()
    button_start.disabled = False
    button_stop.disabled = True
    stream.stop_stream()
    stream.close()
    sentence_text.value = "Recording complete."
    write_audio(frames, f'fine_tuning_data/recording_sentence_{current_sentence_index+1}.wav')

# Function to write audio frames to wav file
def write_audio(frames, filename):
    p = pyaudio.PyAudio()
    wf = wave.open(filename, 'wb')
    wf.setnchannels(1)
    wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
    wf.setframerate(16000)
    wf.writeframes(b''.join(frames))
    wf.close()

# Function to move to next sentence
def next_sentence(button):
    global current_sentence_index
    if current_sentence_index < len(sentences) - 1:
        current_sentence_index += 1
        sentence_text.value = sentences[current_sentence_index]
    else:
        sentence_text.value = "All sentences recorded."

# List of sentences
sentences = text_sents
current_sentence_index = 0

# Create buttons
button_start = widgets.Button(description="Start Recording")
button_stop = widgets.Button(description="Stop Recording")
button_stop.disabled = True
button_next = widgets.Button(description="Next")
button_next.disabled = False

# Add button click event handlers
button_start.on_click(start_recording)
button_stop.on_click(stop_recording)
button_next.on_click(next_sentence)

# Create sentence text widget
sentence_text = widgets.Textarea(value=sentences[current_sentence_index],
                                  description='Sentence:',
                                  layout={'height': '100px'}) 
# Display widgets
display(sentence_text)
display(button_start)
display(button_stop)
display(button_next)

recording = False

# Create PyAudio stream
p = pyaudio.PyAudio()
frames = []

Textarea(value='Hello', description='Sentence:', layout=Layout(height='100px'))

Button(description='Start Recording', style=ButtonStyle())

Button(description='Stop Recording', disabled=True, style=ButtonStyle())

Button(description='Next', style=ButtonStyle())

In [4]:
files = glob('fine_tuning_data/recording_sentence_*.wav')

with open('fine_tuning_data/train.txt', 'w') as f:
    f.write('path|transcript|duration\n')
    for fname in files:
        index = int(fname.split('_')[-1].split('.')[0])
        y, sr = librosa.load(fname)
        duration = len(y) / sr
        f.write(f'{fname}|{text_sents[index - 1]}|{duration}\n')


## Evaluation

In [5]:
def word_error_rate(ref, hyp):
    """
    Computes Word Error Rate (WER) while ignoring punctuation, even if it occurs within a word.

    Args:
        ref (str): Reference string
        hyp (str): Hypothesis string

    Returns:
        float: Word Error Rate (WER) normalized to [0, 1]
    """
    # Remove punctuation from reference and hypothesis strings
    ref = ref.translate(str.maketrans('', '', string.punctuation))
    hyp = hyp.translate(str.maketrans('', '', string.punctuation))

    # Split reference and hypothesis into words
    ref_words = ref.split()
    hyp_words = hyp.split()

    # Compute WER
    num_errors = len(set(ref_words) ^ set(hyp_words)) # XOR to get unique words
    wer = float(num_errors) / len(ref_words)

    return wer

### Evaluate Vosk Transcriber

In [6]:
from source.speech_to_text.pretrained_vosk import PretrainedVoskTranscriber


transcriber = PretrainedVoskTranscriber(
    model_path='vosk-model-en-us-0.22',
    sampling_rate=22050
)

word_error_rates = []
    
for fname in tqdm(files):
    index = int(fname.split('_')[-1].split('.')[0])
    with wave.open(fname, 'rb') as wf:
        buffer = wf.readframes(wf.getnframes())
        actual = transcriber.transcribe(buffer)
    expected = text_sents[index - 1]
    word_error_rates.append(word_error_rate(expected, actual))
np.mean(np.array(word_error_rates))

LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=13 max-active=7000 lattice-beam=6
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:11:12:13:14:15
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 0 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 0 orphan components.
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from vosk-model-en-us-0.22/ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:279) Loading HCLG from vosk-model-en-us-0.22/graph/HCLG.fst
LOG (VoskAPI:ReadDataFiles():model.cc:294) Loading words from vosk-model-en-us-0.22/graph/words.txt
LOG (VoskAPI:ReadDataFiles():model.cc:303) Loading winfo vosk-model-en-us-0.22/graph/phones/word_boundary.int
LOG (VoskAPI:ReadDataFiles():model.cc:310) Loading subtract 

  0%|          | 0/99 [00:00<?, ?it/s]

0.6804708503474562

In [8]:
from source.speech_to_text.pretrained_whisper import PretrainedWhisperTranscriber


transcriber = PretrainedWhisperTranscriber()

for fname in tqdm(files):
    index = int(fname.split('_')[-1].split('.')[0])
    y, sr = sf.read(fname, dtype='float32')
    y = librosa.resample(y, sr, target_sr=16000)
    actual = transcriber.transcribe(y)
    expected = text_sents[index - 1]
    word_error_rates.append(word_error_rate(expected, actual))
np.mean(np.array(word_error_rates))

  0%|          | 0/99 [00:00<?, ?it/s]

0.3513062611027567