[![Works with Edge Impulse](../.assets/images/ei-badge.svg)](http://edgeimpulse.com) [![Open in Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/edgeimpulse/notebooks/blob/main/notebooks/03-generate-keyword-spotting-dataset.ipynb) 

# Use Google Text-To-Speech to generate a dataset for keyword spotting

## 1. Obtain an API key from your project

In [53]:

import os
# Insert the path to your service account API key json file here, for google cloud
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../jim-sandbox-369711-9493c9a29d62.json'
# Keyword or short sentence and label (e.g. 'hello world')
keyword = [
    {'string':' infineon','label':'infineon'},
    {'string':' arrow','label':'arrow'},
           ]
# Languages, remove as appropriate
# languages = [
#     'ar-XA', 'bn-IN',  'en-GB',  'fr-CA',
#     'en-US', 'es-ES',  'fi-FI',  'gu-IN',
#     'ja-JP', 'kn-IN',  'ml-IN',  'sv-SE',
#     'ta-IN', 'tr-TR',  'cs-CZ',  'de-DE',
#     'en-AU', 'en-IN',  'fr-FR',  'hi-IN',
#     'id-ID', 'it-IT',  'ko-KR',  'ru-RU',
#     'uk-UA', 'cmn-CN', 'cmn-TW', 'da-DK',
#     'el-GR', 'fil-PH', 'hu-HU',  'nb-NO',
#     'nl-NL', 'pt-PT',  'sk-SK',  'vi-VN',
#     'pl-PL', 'pt-BR',  'ca-ES',  'yue-HK',
#     'af-ZA', 'bg-BG',  'lv-LV',  'ro-RO',
#     'sr-RS', 'th-TH',  'te-IN',  'is-IS'
# ]
languages = [
    'en-GB',  'fr-CA',
    'en-US', 'es-ES',  'gu-IN',
    'ja-JP', 'kn-IN',  'ml-IN',  'sv-SE',
    'ta-IN', 'tr-TR',  'cs-CZ',  'de-DE',
    'en-AU', 'en-IN',  'fr-FR',  'hi-IN',
    'id-ID', 'it-IT',  'ko-KR',  'ru-RU',
    'uk-UA', 'cmn-CN', 'cmn-TW', 'da-DK',
    'el-GR', 'fil-PH', 'hu-HU',  'nb-NO',
    'nl-NL', 'pt-PT',  'sk-SK',  'vi-VN',
    'pl-PL', 'pt-BR',  'ca-ES',  'yue-HK',
    'af-ZA', 'bg-BG',  'lv-LV',  'ro-RO',
    'sr-RS', 'th-TH',  'te-IN',  'is-IS'
]
# Number of keywords to generate
count = 2
# Out length (default: 00:01)
out_length = 2
# Pitches to generate
pitches = [-2, 0, 2]
# Voice genders to use
genders = ["NEUTRAL", "FEMALE", "MALE"]
# Speaking rates to use
speakingRates = [0.9, 1, 1.1]


In [54]:
import shutil
import json
import time
from google.cloud import texttospeech

all_opts = []
for p in pitches:
    for g in genders:
        for l in languages:
            for s in speakingRates:
                for kw in keyword:
                    all_opts.append({
                            "pitch": p,
                            "gender": g,
                            "language": l,
                            "speakingRate": s,
                            "text": kw['string'],
                            "label": kw['label']
                        })

if len(all_opts) > count:
    selectEvery = len(all_opts) // count
    selectNext = 0
    all_opts = [all_opts[ix] for ix in range(len(all_opts)) if ix > selectNext and (selectNext := selectNext + selectEvery) or True]

if os.path.exists('out-wav'):
    shutil.rmtree('out-wav')
os.makedirs('out-wav', exist_ok=True)

downloaded_files = []

# Instantiates a client
client = texttospeech.TextToSpeechClient()

ix = 0
for o in all_opts:
    ix += 1
    # Set the text input to be synthesized
    synthesis_input = texttospeech.SynthesisInput(text=o['text'])
    # Build the voice request
    voice = texttospeech.VoiceSelectionParams(
        language_code=o['language'],
        ssml_gender=o['gender']
    )
    # Select the type of audio file you want returned
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.LINEAR16,
        pitch=o['pitch'],
        speaking_rate=o['speakingRate'],
        sample_rate_hertz=16000
    )
    # Perform the text-to-speech request on the text input with the selected
    # voice parameters and audio file type
    

    wav_file_name = f"out-wav/{o['label']}.{o['language']}-{o['gender']}-{o['pitch']}-{o['speakingRate']}.tts.wav"

    if not os.path.exists(wav_file_name):
        print(f"[{ix}/{len(all_opts)}] Text-to-speeching...")
        response = client.synthesize_speech(
            input=synthesis_input, voice=voice, audio_config=audio_config
        )
        with open(wav_file_name, "wb") as f:
            f.write(response.audio_content)
        has_hit_api = True

    downloaded_files.append({
        "path": str(wav_file_name),
        "label": o['label'],
        "category": "split",
        "metadata": {
            "pitch": o['pitch'],
            "gender": o['gender'],
            "language": o['language'],
            "speakingRate": o['speakingRate'],
            "text": o['text'],
            "imported_from": "Google Cloud TTS"
        }
    })

    if has_hit_api:
        time.sleep(2)

print("Done text-to-speeching")
print("")

input_file = os.path.join(os.path.join(os.path.sep, 'tmp', 'ei-s3-sync-'), 'input.json')
os.makedirs(input_file.parent, exist_ok=True)

info_file = {
    "version": 1,
    "files": downloaded_files
}

with open(input_file, "w") as f:
    json.dump(info_file, f)


[1/2430] Text-to-speeching...
[2/2430] Text-to-speeching...
[3/2430] Text-to-speeching...
[4/2430] Text-to-speeching...
[5/2430] Text-to-speeching...
[6/2430] Text-to-speeching...
[7/2430] Text-to-speeching...
[8/2430] Text-to-speeching...
[9/2430] Text-to-speeching...
[10/2430] Text-to-speeching...
[11/2430] Text-to-speeching...
[12/2430] Text-to-speeching...
[13/2430] Text-to-speeching...
[14/2430] Text-to-speeching...
[15/2430] Text-to-speeching...
[16/2430] Text-to-speeching...
[17/2430] Text-to-speeching...
[18/2430] Text-to-speeching...
[19/2430] Text-to-speeching...
[20/2430] Text-to-speeching...
[21/2430] Text-to-speeching...
[22/2430] Text-to-speeching...
[23/2430] Text-to-speeching...
[24/2430] Text-to-speeching...
[25/2430] Text-to-speeching...
[26/2430] Text-to-speeching...
[27/2430] Text-to-speeching...
[28/2430] Text-to-speeching...
[29/2430] Text-to-speeching...
[30/2430] Text-to-speeching...
[31/2430] Text-to-speeching...
[32/2430] Text-to-speeching...
[33/2430] Text-to

InternalServerError: 500 Internal error encountered.

In [31]:
import os
import requests
import io
import random
import pydub
from pydub import AudioSegment

# Set parameters
voice_dir = 'out-wav'
# Creative commons background noise from freesound.org:https://freesound.org/people/Astounded/sounds/483561/
noise_url = 'https://cdn.freesound.org/previews/483/483561_10201334-lq.ogg'
output_dir = 'out-noisy'
num_copies = 3  # Number of noisy copies to create for each input sample
max_noise_level = -6  # Maximum noise level to add in dBFS (negative value)

# Check if output directory exists and create it if it doesn't
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Download background noise file
response = requests.get(noise_url)
response.raise_for_status()
noise_audio = AudioSegment.from_file(io.BytesIO(response.content), format='ogg')

# Loop through voice samples in directory
for file_name in os.listdir(voice_dir):
    if file_name.endswith('.wav'):
        # Load voice sample
        voice_file = os.path.join(voice_dir, file_name)
        voice_audio = AudioSegment.from_file(voice_file)

        for i in range(num_copies):
            # Select random section of noise and random noise level
            start_time = random.randint(0, len(noise_audio) - len(voice_audio))
            end_time = start_time + len(voice_audio)
            noise_level = random.uniform(max_noise_level, 0)

            # Extract selected section of noise and adjust volume
            noise_segment = noise_audio[start_time:end_time]
            noise_segment = noise_segment - abs(noise_level)

            # Mix voice sample with noise segment
            mixed_audio = voice_audio.overlay(noise_segment)

            # Create new file name for mixed audio
            output_file = file_name.split('.wav')[0] + f'_mixed_{i}.wav'
            output_path = os.path.join(output_dir, output_file)

            # Save mixed audio to file
            mixed_audio.export(output_path, format='wav')

            print(f'Saved mixed audio to {output_path}')

Saved mixed audio to out-noisy/helloworld.bn-IN-FEMALE--10-1.25.tts_mixed_0.wav
Saved mixed audio to out-noisy/helloworld.bn-IN-FEMALE--10-1.25.tts_mixed_1.wav
Saved mixed audio to out-noisy/helloworld.bn-IN-FEMALE--10-1.25.tts_mixed_2.wav
Saved mixed audio to out-noisy/helloworld.en-GB-FEMALE--10-1.tts_mixed_0.wav
Saved mixed audio to out-noisy/helloworld.en-GB-FEMALE--10-1.tts_mixed_1.wav
Saved mixed audio to out-noisy/helloworld.en-GB-FEMALE--10-1.tts_mixed_2.wav
Saved mixed audio to out-noisy/helloworld.en-GB-FEMALE--10-1.25.tts_mixed_0.wav
Saved mixed audio to out-noisy/helloworld.en-GB-FEMALE--10-1.25.tts_mixed_1.wav
Saved mixed audio to out-noisy/helloworld.en-GB-FEMALE--10-1.25.tts_mixed_2.wav
Saved mixed audio to out-noisy/helloworld.bn-IN-FEMALE--10-1.tts_mixed_0.wav
Saved mixed audio to out-noisy/helloworld.bn-IN-FEMALE--10-1.tts_mixed_1.wav
Saved mixed audio to out-noisy/helloworld.bn-IN-FEMALE--10-1.tts_mixed_2.wav
Saved mixed audio to out-noisy/helloworld.ar-XA-FEMALE--10

In [27]:
import requests
import os
import random
from pydub import AudioSegment

# Creative commons background noise from freesound.org:https://freesound.org/people/Astounded/sounds/483561/
sound_url = 'https://cdn.freesound.org/previews/483/483561_10201334-lq.ogg'
output_file = 'sound_file.ogg'

# Download sound file
response = requests.get(sound_url)
response.raise_for_status()
sound_data = response.content

# Save sound file to disk
with open(output_file, 'wb') as f:
    f.write(sound_data)

print(f'Saved sound file to {output_file}')

# Set parameters
input_folder = 'out-wav'
output_folder = 'out-noisy'
db_reduction = 20  # dB reduction of background noise relative to voice sample
num_copies = 3  # number of noisy copies to create for each input file

# Load background noise
background_noise = AudioSegment.from_file(output_file)

# Iterate over input files
for filename in os.listdir(input_folder):
    if filename.endswith('.wav'):
        # Load voice sample
        voice_sample = AudioSegment.from_file(os.path.join(input_folder, filename))

        # Randomly select start point for background noise
        start_time = random.randint(0, len(background_noise) - len(voice_sample))

        # Extract noise segment with same length as voice sample
        noise_segment = background_noise[start_time:start_time + len(voice_sample)]

        # Reduce noise level relative to voice sample
        noise_segment = noise_segment - db_reduction

        # Mix noise into voice sample
        for i in range(num_copies):
            # Add noise to voice sample
            noisy_sample = voice_sample.overlay(noise_segment)

            # Save noisy sample to output folder
            output_filename = f'{os.path.splitext(filename)[0]}_noisy_{i+1}.wav'
            output_path = os.path.join(output_folder, output_filename)
            noisy_sample.export(output_path, format='wav')

Saved sound file to sound_file.ogg


FileNotFoundError: [Errno 2] No such file or directory: 'out-noisy/helloworld.bn-IN-FEMALE--10-1.25.tts_noisy_1.wav'

In [65]:
import shutil
import json
import subprocess
import time
from google.cloud import texttospeech
import os
import requests
import io
import random
import pydub
from pydub import AudioSegment

# Set parameters
voice_dir = 'out-wav'
# Creative commons background noise from freesound.org:https://freesound.org/people/Astounded/sounds/483561/
noise_url = 'https://cdn.freesound.org/previews/483/483561_10201334-lq.ogg'
output_folder = 'out-noisy'
num_copies = 3  # Number of noisy copies to create for each input sample
max_noise_level = -6  # Maximum noise level to add in dBFS (negative value)

# Check if output directory exists and create it if it doesn't
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    

# Download background noise file
response = requests.get(noise_url)
response.raise_for_status()
noise_audio = AudioSegment.from_file(io.BytesIO(response.content), format='ogg')


all_opts = []
for p in pitches:
    for g in genders:
        for l in languages:
            for s in speakingRates:
                for kw in keyword:
                    all_opts.append({
                            "pitch": p,
                            "gender": g,
                            "language": l,
                            "speakingRate": s,
                            "text": kw['string'],
                            "label": kw['label']
                        })

if len(all_opts) > count:
    selectEvery = len(all_opts) // count
    selectNext = 0
    all_opts = [all_opts[ix] for ix in range(len(all_opts)) if ix > selectNext and (selectNext := selectNext + selectEvery) or True]

# if os.path.exists(voice_dir):
#     shutil.rmtree(voice_dir)
# os.makedirs(voice_dir, exist_ok=True)
# Check if output directory exists and create it if it doesn't
if not os.path.exists(voice_dir):
    os.makedirs(voice_dir)
downloaded_files = []

# Instantiates a client
client = texttospeech.TextToSpeechClient()

ix = 0
for o in all_opts:
    ix += 1
    # Set the text input to be synthesized
    synthesis_input = texttospeech.SynthesisInput(text=o['text'])
    # Build the voice request
    voice = texttospeech.VoiceSelectionParams(
        language_code=o['language'],
        ssml_gender=o['gender']
    )
    # Select the type of audio file you want returned
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.LINEAR16,
        pitch=o['pitch'],
        speaking_rate=o['speakingRate'],
        sample_rate_hertz=16000
    )
    # Perform the text-to-speech request on the text input with the selected
    # voice parameters and audio file type
    

    wav_file_name = f"{voice_dir}/{o['label']}.{o['language']}-{o['gender']}-{o['pitch']}-{o['speakingRate']}.tts.wav"

    if not os.path.exists(wav_file_name):
        print(f"[{ix}/{len(all_opts)}] Text-to-speeching...")
        response = client.synthesize_speech(
            input=synthesis_input, voice=voice, audio_config=audio_config
        )
        with open(wav_file_name, "wb") as f:
            f.write(response.audio_content)
        has_hit_api = True
    else:
        print(f'skipping {wav_file_name}')
        has_hit_api = False

    # Load voice sample
    voice_audio = AudioSegment.from_file(wav_file_name)

    for i in range(num_copies):
        # Save noisy sample to output folder
        output_filename = f"{o['label']}.{o['language']}-{o['gender']}-{o['pitch']}-{o['speakingRate']}_noisy_{i+1}.wav"
        output_path = os.path.join(output_folder, output_filename)
        if not os.path.exists(output_path):
            # Select random section of noise and random noise level
            start_time = random.randint(0, len(noise_audio) - len(voice_audio))
            end_time = start_time + len(voice_audio)
            noise_level = random.uniform(max_noise_level, 0)

            # Extract selected section of noise and adjust volume
            noise_segment = noise_audio[start_time:end_time]
            noise_segment = noise_segment - abs(noise_level)

            # Mix voice sample with noise segment
            mixed_audio = voice_audio.overlay(noise_segment)
            # Save mixed audio to file
            mixed_audio.export(output_path, format='wav')

            print(f'Saved mixed audio to {output_path}')
        else:
            print(f'skipping {output_path}')
        downloaded_files.append({
            "path": str(output_filename),
            "label": o['label'],
            "category": "split",
            "metadata": {
                "pitch": str(['pitch']),
                "gender": str(o['gender']),
                "language": str(o['language']),
                "speakingRate": str(o['speakingRate']),
                "text": o['text'],
                "imported_from": "Google Cloud TTS"
            }
        })

    if has_hit_api:
        time.sleep(0.5)

print("Done text-to-speeching")
print("")

input_file = os.path.join(output_folder, 'input.json')
info_file = {
    "version": 1,
    "files": downloaded_files
}

with open(input_file, "w") as f:
    json.dump(info_file, f)

skipping out-wav/infineon.en-GB-NEUTRAL--2-0.9.tts.wav
Saved mixed audio to out-noisy/infineon.en-GB-NEUTRAL--2-0.9_noisy_1.wav
Saved mixed audio to out-noisy/infineon.en-GB-NEUTRAL--2-0.9_noisy_2.wav
Saved mixed audio to out-noisy/infineon.en-GB-NEUTRAL--2-0.9_noisy_3.wav
skipping out-wav/arrow.en-GB-NEUTRAL--2-0.9.tts.wav
Saved mixed audio to out-noisy/arrow.en-GB-NEUTRAL--2-0.9_noisy_1.wav
Saved mixed audio to out-noisy/arrow.en-GB-NEUTRAL--2-0.9_noisy_2.wav
Saved mixed audio to out-noisy/arrow.en-GB-NEUTRAL--2-0.9_noisy_3.wav
skipping out-wav/infineon.en-GB-NEUTRAL--2-1.tts.wav
Saved mixed audio to out-noisy/infineon.en-GB-NEUTRAL--2-1_noisy_1.wav
Saved mixed audio to out-noisy/infineon.en-GB-NEUTRAL--2-1_noisy_2.wav
Saved mixed audio to out-noisy/infineon.en-GB-NEUTRAL--2-1_noisy_3.wav
skipping out-wav/arrow.en-GB-NEUTRAL--2-1.tts.wav
Saved mixed audio to out-noisy/arrow.en-GB-NEUTRAL--2-1_noisy_1.wav
Saved mixed audio to out-noisy/arrow.en-GB-NEUTRAL--2-1_noisy_2.wav
Saved mixed 

In [5]:

from pydub import AudioSegment
import random
import os, requests, io

# specify the input file path

# specify the number of samples you want to extract
num_samples = 400

# specify the length of each sample in milliseconds
sample_length_ms = 1000

# open the input file
# Creative commons background noise from freesound.org:https://freesound.org/people/Astounded/sounds/483561/
noise_url = 'https://cdn.freesound.org/previews/483/483561_10201334-lq.ogg'

# Check if output directory exists and create it if it doesn't
if not os.path.exists('background'):
    os.makedirs('background')
# Download background noise file
response = requests.get(noise_url)
response.raise_for_status()
audio = AudioSegment.from_file(io.BytesIO(response.content), format='ogg')

# get the duration of the audio in milliseconds
audio_duration_ms = len(audio)

# calculate the maximum start time for each sample
max_start_time_ms = audio_duration_ms - sample_length_ms

# initialize a list to hold the extracted samples
samples = []

# extract X number of 1-second samples
for i in range(num_samples):
    # generate a random start time for each sample
    start_time_ms = random.randint(0, max_start_time_ms)
    end_time_ms = start_time_ms + sample_length_ms
    
    # extract the sample
    sample = audio[start_time_ms:end_time_ms]
    
    # append the sample to the list
    samples.append(sample)
    
# export each sample to a WAV file
for i, sample in enumerate(samples):
    output_file_path = f"background/background.sample_{i+1}.wav"
    sample.export(output_file_path, format="wav")