# ASR Fine Tuning

In this notebook I record myself reading passages of text so that I can fine-tune an automatic speech recognition (ASR) model on my own voice.

In [1]:
# We need NLTK for sentence tokenisation
!pip install nltk
import nltk

nltk.download('punkt')

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/declanatkins/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [19]:
import os
import librosa
import ipywidgets
import IPython.display as ipd
import numpy as np
import sounddevice as sd
import soundfile as sf
import string

In [3]:
# First chapter of Dubliners by James Joyce
# Decode explicitly as UTF-8: the file contains typographic quotes, and the
# platform default encoding is not UTF-8 everywhere.
with open('dubliners_chapter_1.txt', encoding='utf-8') as f:
    raw_text = f.read()

# Flatten the text to a single line and map typographic quotes to ASCII so
# that the later string.punctuation strip removes them. The left single
# quote is included alongside the right one so neither survives that strip.
ascii_quotes = str.maketrans({
    '\n': ' ',
    '“': '"',
    '”': '"',
    '‘': "'",
    '’': "'",
})
text = raw_text.translate(ascii_quotes)

# Show a preview only; printing the whole chapter floods the cell output
print(text[:1000])

There was no hope for him this time: it was the third stroke. Night after night I had passed the house (it was vacation time) and studied the lighted square of window: and night after night I had found it lighted in the same way, faintly and evenly. If he was dead, I thought, I would see the reflection of candles on the darkened blind for I knew that two candles must be set at the head of a corpse. He had often said to me: "I am not long for this world," and I had thought his words idle. Now I knew they were true. Every night as I gazed up at the window I said softly to myself the word paralysis. It had always sounded strangely in my ears, like the word gnomon in the Euclid and the word simony in the Catechism. But now it sounded to me like the name of some maleficent and sinful being. It filled me with fear, and yet I longed to be nearer to it and to look upon its deadly work.  Old Cotter was sitting at the fire, smoking, when I came downstairs to supper. While my aunt was ladling out

In [12]:
# Split the chapter into sentences, then delete every ASCII punctuation
# character from each sentence.
strip_punctuation = str.maketrans('', '', string.punctuation)
text_sents = [
    sentence.translate(strip_punctuation)
    for sentence in nltk.tokenize.sent_tokenize(text)
]

In [13]:
# Preview the first 20 sentences after punctuation removal
text_sents[:20]

['There was no hope for him this time it was the third stroke',
 'Night after night I had passed the house it was vacation time and studied the lighted square of window and night after night I had found it lighted in the same way faintly and evenly',
 'If he was dead I thought I would see the reflection of candles on the darkened blind for I knew that two candles must be set at the head of a corpse',
 'He had often said to me I am not long for this world and I had thought his words idle',
 'Now I knew they were true',
 'Every night as I gazed up at the window I said softly to myself the word paralysis',
 'It had always sounded strangely in my ears like the word gnomon in the Euclid and the word simony in the Catechism',
 'But now it sounded to me like the name of some maleficent and sinful being',
 'It filled me with fear and yet I longed to be nearer to it and to look upon its deadly work',
 'Old Cotter was sitting at the fire smoking when I came downstairs to supper',
 'While my aunt

In [14]:
# We should also record multiple instances of the activation phrase

activation_word = 'hello'
n_activation_prompts = 5

# Re-running this cell must not stack another batch of prompts on top of the
# ones already at the front of the list (that would shift every sentence
# index), so skip any leading activation prompts before prepending.
first_sentence = 0
while first_sentence < len(text_sents) and text_sents[first_sentence] == activation_word:
    first_sentence += 1

text_sents = n_activation_prompts * [activation_word] + text_sents[first_sentence:]

In [16]:
import pyaudio
import wave
import IPython.display as ipd
import ipywidgets as widgets
import threading

# Handles to the background worker and the open input stream; both are
# assigned by start_recording.
recording_thread = stream = None


def record():
    """Worker loop that buffers microphone audio while recording is active.

    Reads 1024-frame chunks from the module-level `stream` (with
    exception_on_overflow=False) and appends each one to the module-level
    `frames` list, until the module-level `recording` flag is falsy.
    """
    while recording:
        chunk = stream.read(1024, exception_on_overflow=False)
        frames.append(chunk)

# Function to start recording audio
def start_recording(button):
    """Click handler that begins capturing audio for the current sentence.

    Raises the `recording` flag, updates the button states, empties the
    frame buffer, shows the sentence to read, opens an input stream on the
    shared PyAudio instance `p` and launches a thread running `record`.

    Args:
        button: The widget that fired the click event (not used).
    """
    global recording, frames, recording_thread, stream

    # The flag must be set before the worker starts: record() loops only
    # while it is true.
    recording = True

    button_start.disabled = True
    button_stop.disabled = False
    button_repeat.disabled = True

    frames = []
    sentence_text.value = sentences[current_sentence_index]

    # Mono, 16-bit samples at 16 kHz, read in 1024-frame buffers
    stream_settings = dict(
        format=pyaudio.paInt16,
        channels=1,
        rate=16000,
        input=True,
        frames_per_buffer=1024,
        stream_callback=None,
    )
    stream = p.open(**stream_settings)
    stream.start_stream()

    recording_thread = threading.Thread(target=record)
    recording_thread.start()

# Function to stop recording audio
def stop_recording(button):
    """Click handler that ends the capture and saves it as a WAV file.

    Lowers the `recording` flag, waits for the worker thread to finish,
    updates the button states, stops and closes the input stream, and
    writes the buffered frames to a file numbered after the current
    sentence (1-based).

    Args:
        button: The widget that fired the click event (not used).
    """
    global recording

    # Lower the flag first so record() leaves its loop, then wait for it to
    # finish before the stream is torn down.
    recording = False
    recording_thread.join()

    button_start.disabled = False
    button_stop.disabled = True
    button_repeat.disabled = False

    stream.stop_stream()
    stream.close()

    sentence_text.value = "Recording complete."
    output_path = f'fine_tuning_data/recording_sentence_{current_sentence_index+1}.wav'
    write_audio(frames, output_path)

# Function to write audio frames to wav file
def write_audio(frames, filename, sample_rate=16000, channels=1, sample_width=2):
    """Write raw PCM chunks to a WAV file.

    Args:
        frames: Iterable of bytes objects holding raw PCM data; the chunks
            are concatenated in order. May be empty, which produces a valid
            WAV file with zero frames.
        filename: Path of the WAV file to create (overwritten if present).
        sample_rate: Frame rate in Hz. Defaults to 16000, the rate
            start_recording opens the input stream with.
        channels: Number of audio channels. Defaults to 1 (mono).
        sample_width: Bytes per sample. Defaults to 2, the width of the
            16-bit paInt16 format start_recording records in.
    """
    # The sample width is a parameter rather than something queried from a
    # fresh pyaudio.PyAudio() instance, so saving a file does not open an
    # audio session that is never terminated.
    # The context manager closes the file even if writing raises.
    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sample_width)
        wf.setframerate(sample_rate)
        wf.writeframes(b''.join(frames))

# Function to move to next sentence
def next_sentence(button):
    """Click handler that advances the prompt to the following sentence.

    When the last sentence is already showing, the index is left unchanged
    and a completion message is displayed instead.

    Args:
        button: The widget that fired the click event (not used).
    """
    global current_sentence_index

    last_index = len(sentences) - 1
    if current_sentence_index >= last_index:
        sentence_text.value = "All sentences recorded."
        return

    current_sentence_index += 1
    sentence_text.value = sentences[current_sentence_index]

# List of sentences
sentences = text_sents
current_sentence_index = 0

# stop_recording saves into this folder; create it up front so the first
# save does not fail when the folder does not exist yet
os.makedirs('fine_tuning_data', exist_ok=True)

# Recording state shared by the click handlers and the record() worker
recording = False
frames = []

# Create PyAudio stream
p = pyaudio.PyAudio()

# Create buttons
button_start = widgets.Button(description="Start Recording")
button_stop = widgets.Button(description="Stop Recording", disabled=True)
# start_recording and stop_recording toggle button_repeat, so it has to
# exist or this cell raises NameError on a fresh kernel.
# NOTE(review): it is neither displayed nor wired to a handler here, which
# matches the four widgets this cell has always shown - confirm the intended
# behaviour of "Repeat" before exposing it.
button_repeat = widgets.Button(description="Repeat", disabled=True)
button_next = widgets.Button(description="Next", disabled=False)

# Add button click event handlers
button_start.on_click(start_recording)
button_stop.on_click(stop_recording)
button_next.on_click(next_sentence)

# Create sentence text widget
sentence_text = widgets.Textarea(value=sentences[current_sentence_index],
                                  description='Sentence:',
                                  layout={'height': '100px'})
# Display widgets
display(sentence_text)
display(button_start)
display(button_stop)
display(button_next)

Textarea(value='hello', description='Sentence:', layout=Layout(height='100px'))

Button(description='Start Recording', style=ButtonStyle())

Button(description='Stop Recording', disabled=True, style=ButtonStyle())

Button(description='Next', style=ButtonStyle())

In [18]:
# Zero-based index of the sentence currently shown in the recorder widget
current_sentence_index

90

In [22]:
# Write a pipe-delimited training manifest: one row per recorded sentence
# with the audio path, its transcript and its duration in seconds.
#
# Rows are driven by which recordings exist on disk, not by the widget's
# current_sentence_index. The index-based loop dropped the sentence currently
# shown even if it had been recorded, and crashed on any sentence that was
# skipped with "Next".
with open('fine_tuning_data/train.txt', 'w', encoding='utf-8') as f:
    f.write('path|transcript|duration\n')
    for i, transcript in enumerate(text_sents):
        # Recordings are numbered from 1 (sentence index + 1)
        file_name = f'fine_tuning_data/recording_sentence_{i+1}.wav'
        if not os.path.exists(file_name):
            continue
        # sr=None keeps the file's native sample rate, so the audio is not
        # resampled just to measure its length
        y, sr = librosa.load(file_name, sr=None)
        duration = len(y) / sr
        f.write(f'{file_name}|{transcript}|{duration}\n')
