In [1]:
# Mount your personal Google Drive so that the files saved by later cells
# (the recordings and the transcript list) are written into Drive.

from google.colab import drive
drive.mount('/content/drive')  # later cells write under /content/drive/MyDrive/record

Mounted at /content/drive


In [2]:
!pip install -q https://github.com/tugstugi/dl-colab-notebooks/archive/colab_utils.zip
!pip install noisereduce

# download dataset for script reading
!pip install git+https://github.com/huggingface/datasets 

[?25l[K     - 3.0 kB 8.3 MB/s
[?25h  Building wheel for Deep-Learning-Colab-Notebook-Utils (setup.py) ... [?25l[?25hdone
Collecting noisereduce
  Downloading noisereduce-2.0.0-py3-none-any.whl (15 kB)
Installing collected packages: noisereduce
Successfully installed noisereduce-2.0.0
Collecting git+https://github.com/huggingface/datasets
  Cloning https://github.com/huggingface/datasets to /tmp/pip-req-build-2flqjr1r
  Running command git clone -q https://github.com/huggingface/datasets /tmp/pip-req-build-2flqjr1r
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 4.3 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2021.8.1-py3-none-any.whl (119 kB)
[K     |████████████████████████████████| 119 kB 47.1 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.7.4.post0-cp37-cp37m-manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 59.5 MB/s 
[?25hC

In [3]:
from IPython.display import display, Audio, clear_output
import ipywidgets as widgets
from dl_colab_notebooks.audio import record_audio, upload_audio
from scipy.io.wavfile import write, read
import time
from google.colab import output

In [8]:
# Download script

from datasets import load_dataset, load_metric
import pandas as pd

# Load TIMIT and strip every column named below; the rest of the notebook only
# ever reads the 'text' column of the resulting frame.
_columns_to_drop = [
    "phonetic_detail",
    "word_detail",
    "dialect_region",
    "id",
    "sentence_type",
    "speaker_id",
    'file',
]
timit = load_dataset("timit_asr")
timit = timit.remove_columns(_columns_to_drop)

# Turn the training split into a DataFrame for the cells that follow.
_train_split = timit['train']
df = pd.DataFrame(_train_split)

In [9]:
# Show the first ten prompts as a quick sanity check of the loaded text.
print(df['text'].head(10))

0              Would such an act of refusal be useful?
1         Don't ask me to carry an oily rag like that.
2    Butterscotch fudge goes well with vanilla ice ...
3    She had your dark suit in greasy wash water al...
4                                      I honor my mom.
5            Ambidextrous pickpockets accomplish more.
6    Pledge to participate in Nevada's aquatic comp...
7                      We'll talk over at your office.
8    Soil redeposition is evaluated by washing clea...
9    The groundhog clearly saw his shadow, but stay...
Name: text, dtype: object


Some scripts appear more than once in the dataset, so keep only the unique ones.

In [23]:
print(len(df['text']))

# Drop repeated prompts, comparing on the transcript column explicitly.
# Then renumber the rows so the index is a gap-free 0..N-1 range: later cells
# use the index label as the recording number and inside the output_<i>.wav
# file name, so gaps left by the removed rows would make those numbers jump.
df = df.drop_duplicates(subset=['text']).reset_index(drop=True)

print('Total length of script data :', len(df['text']))

1736
Total length of script data : 1736


Decide how many sentences you want to record. The more sentences you record, the better the resulting quality will be. Recording 100 sentences normally takes about 30 minutes, so choose the size of your dataset accordingly.

How to record the speech data:
0. Set the variable 'count', which determines the size of the speech dataset.
1. Before each sentence is recorded there is a 3-second standby time. During that time, read the displayed script and get ready to record.
2. After 3 seconds a new message appears, 'Starting recording for (6) seconds...', which means that recording has started.
3. Read the displayed sentence loudly and clearly. If the recording time is too short, increase the 'record_seconds' value.

In [19]:
#@title Record

import noisereduce as nr
import librosa
import numpy as np

# Sample rate handed to record_audio(), the noise reducer, the Audio player
# and the WAV writer in the cells below.
SAMPLE_RATE = 22050
# NOTE(review): not read by any cell visible here — presumably a leftover from a
# record/upload switch; confirm before removing.
record_or_upload = "Record"
# Length of every recording, in seconds; exposed as a Colab form field.
record_seconds =   6#@param {type:"number", min:1, max:10, step:1}

def _record_audio(b):
  """Record one clip into the global `audio` and play it back immediately.

  The cell output is cleared first, then `record_seconds` seconds are captured
  at `SAMPLE_RATE` and shown as an autoplaying audio widget.

  `b` is not used by the body; presumably it is the widget object passed to a
  button-click callback — TODO confirm against the caller.
  """
  global audio
  clear_output()
  audio = record_audio(record_seconds, sample_rate=SAMPLE_RATE)
  player = Audio(audio, rate=SAMPLE_RATE, autoplay=True)
  display(player)

import os

# Please set the value
count = 100

# Folder on Drive that receives every recording. The WAV writer does not create
# missing folders, so create this one on first use. `os.mkdir` (not `makedirs`)
# is deliberate: it still fails loudly when the parent is missing, i.e. when
# Drive has not been mounted, instead of silently writing to the local disk.
record_dir = '/content/drive/MyDrive/record'
if not os.path.isdir(record_dir):
  os.mkdir(record_dir)

# Row labels of the prompts to record; there may be fewer than `count` of them.
prompt_labels = df.index[:count]
total = len(prompt_labels)

for position, i in enumerate(prompt_labels):
  # Progress is counted by position. The row label `i` may skip numbers (for
  # example after duplicates were dropped), so it is only used to look up the
  # prompt and to name the file.
  print(f'Output {position}/{total - 1}')
  print('''
  Recording will be started in 3 seconds
  ↓Please read below sentence↓
  ''')
  print(df['text'][i])
  time.sleep(3)  # standby time announced in the message above
  wav_path = os.path.join(record_dir, 'output_' + str(i) + '.wav')
  audio = record_audio(record_seconds, sample_rate=SAMPLE_RATE)
  reduced_noise = nr.reduce_noise(y=audio, sr=SAMPLE_RATE) # reduce noise
  reduced_noise_trim, index = librosa.effects.trim(reduced_noise) # trim silent part
  write(wav_path, SAMPLE_RATE, reduced_noise_trim)
  print('file saved')
  time.sleep(2)  # leave the confirmation visible briefly before clearing
  output.clear()
  

Output 0/99

  Recording will be started in 3 seconds
  ↓Please read below sentence↓
  
Would such an act of refusal be useful?
Starting recording for 6 seconds...


<IPython.core.display.Javascript object>

KeyboardInterrupt: ignored

You can play back a saved .wav file to check whether the recording was successful.

In [None]:
# Play one saved clip back from Drive to verify the recording.
# NOTE(review): the number in the file name is a prompt label; change it to a
# prompt that has actually been recorded.
_check_wav = '/content/drive/MyDrive/record/output_13.wav'
_player = Audio(_check_wav, rate=SAMPLE_RATE, autoplay=True)
display(_player)

After recording, create a .txt file that pairs each audio file name with its script.

In [33]:
# Preprocess text

import re
# Punctuation removed from every prompt. The apostrophe is not in the set, so
# contractions keep theirs. Written as a raw string: the previous non-raw
# literal relied on invalid escape sequences such as "\,", which newer Python
# versions warn about.
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"]'


def _normalize_script(text):
    """Return `text` without the ignored punctuation, lower-cased, ending in "."."""
    return re.sub(chars_to_ignore_regex, '', text).lower() + "."


# Assign the whole column in one step. The previous per-row form
# `df['text'][i] = ...` is chained assignment: pandas warns about it and, with
# Copy-on-Write enabled, it leaves `df` unchanged.
df['text'] = df['text'].map(_normalize_script)

In [None]:
# Build one "wav path|script" entry for each of the first `count` prompts.
texts = [
    'wavs/output_' + str(label) + '.wav' + '|' + df['text'][label]
    for label in df.index[:count]
]

In [None]:
# Write the collected entries to list.txt on Drive, one entry per line.
_list_path = "/content/drive/MyDrive/record/list.txt"
with open(_list_path, "w") as f:
    f.writelines("%s\n" % entry for entry in texts)

As the last step, convert the .wav files to 16-bit PCM.

In [None]:
import soundfile
import glob

# Re-encode every .wav in the Drive recording folder in place with the
# PCM_16 subtype, keeping the sample rate read from each file.
wavs = glob.glob('/content/drive/MyDrive/record/*.wav')
for wav_file in wavs:
    samples, file_rate = soundfile.read(wav_file)
    soundfile.write(wav_file, samples, file_rate, subtype='PCM_16')