# [Text To Speech] Synthetic Audio Dataset Creation

This sample demonstrates how to use the Azure AI Speech API to generate a synthetic audio dataset from text.

> ✨ **_Note_** <br>
> Please check the supported languages and region availability before you get started - https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts / https://learn.microsoft.com/en-us/azure/ai-services/speech-service/regions

## Prerequisites

Configure a Python virtual environment for 3.10 or later:

1.  Open the Command Palette (Ctrl+Shift+P).
1.  Search for Python: Create Environment.
1.  Select Venv / Conda and choose where to create the new environment.
1.  Select the Python interpreter version. Create with version 3.10 or later.


## 1. Test Text to Speech Using the Speech SDK


In [None]:
import azure.cognitiveservices.speech as speechsdk
import os
import time
import json
import html
from dotenv import load_dotenv
load_dotenv()

SPEECH_KEY = os.getenv("AZURE_AI_SPEECH_API_KEY")
SPEECH_REGION = os.getenv("AZURE_AI_SPEECH_REGION")
CUSTOM_SPEECH_LANG = os.getenv("CUSTOM_SPEECH_LANG")
CUSTOM_SPEECH_LOCALE = os.getenv("CUSTOM_SPEECH_LOCALE")
TTS_FOR_TRAIN = os.getenv("TTS_FOR_TRAIN")
TTS_FOR_EVAL = os.getenv("TTS_FOR_EVAL")

synthetic_text_file = ""
%store -r synthetic_text_file
try:
    synthetic_text_file
except NameError:
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] Please run the previous notebook again.")
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

print("=== Azure AI Speech Info ===")
print(f"SPEECH_REGION={SPEECH_REGION}")
print(f"CUSTOM_SPEECH_LANG={CUSTOM_SPEECH_LANG}")
print(f"CUSTOM_SPEECH_LOCALE={CUSTOM_SPEECH_LOCALE}")  
print(f"TTS_FOR_TRAIN={TTS_FOR_TRAIN}") 
print(f"TTS_FOR_EVAL={TTS_FOR_EVAL}") 
print(f"Synthetic Text File={synthetic_text_file}")

Create an instance of a speech config with specified subscription key and service region.
Replace with your own subscription key and service region (e.g., "westus").


In [2]:
# Speech SDK configuration built from the key and region read from the environment.
speech_config = speechsdk.SpeechConfig(subscription=SPEECH_KEY, region=SPEECH_REGION)
# Audio output is configured to use the default speaker.
audio_config = speechsdk.audio.AudioOutputConfig(use_default_speaker=True)

In [3]:
# Synthesizer instance reused by the text and SSML synthesis calls in the cells below.
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)

In [None]:
# Read the sentence that the synthesis test cells below will speak.
print("Type some text that you want to speak...")
text = input()

### Test Speak text and Store the output stream to a wav file (speak_text_async)


In [19]:
# Number of synthesis attempts before giving up.
MIN_RETRIES = 2

result = None
last_error = None
for attempt in range(1, MIN_RETRIES + 1):
    try:
        result = speech_synthesizer.speak_text_async(text).get()
        # Stop after the first successful call; without this the text would be
        # synthesized (and played) once per loop iteration.
        break
    except Exception as e:
        last_error = e
        print(f"[WARN] Speech synthesis attempt {attempt}/{MIN_RETRIES} failed: {e}")
        # Only wait when another attempt will follow.
        if attempt < MIN_RETRIES:
            time.sleep(10)

if result is None:
    # Fail here with the real cause rather than with an undefined or stale
    # `result` in the next cell.
    raise RuntimeError(f"Speech synthesis failed after {MIN_RETRIES} attempts") from last_error

In [None]:
import os

output_dir = "sample"

# Inspect the outcome of the speak_text_async call made in the previous cell.
outcome = result.reason
if outcome == speechsdk.ResultReason.SynthesizingAudioCompleted:
    print(f"Speech synthesized to speaker for text [{text}]")
    stream = speechsdk.AudioDataStream(result)
    # Create the sample folder on first use.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    wav_path = os.path.join(output_dir, "result_text.wav")
    stream.save_to_wav_file(wav_path)
elif outcome == speechsdk.ResultReason.Canceled:
    cancellation_details = result.cancellation_details
    print(f"Speech synthesis canceled: {cancellation_details.reason}")
    is_error = cancellation_details.reason == speechsdk.CancellationReason.Error
    if is_error and cancellation_details.error_details:
        print(f"Error details: {cancellation_details.error_details}")
    # Printed for every cancellation, not only for errors.
    print("Did you update the subscription info?")

### Test Speak SSML and Store the output stream to a wav file (speak_ssml_async)

-   To find the TTS voice ID for a specific language, browse the voice gallery: https://speech.microsoft.com/portal?projecttype=voicegallery


In [8]:
# Default TTS voice for English. To get the TTS voice id for other languages,
# check the voice gallery for more options.
default_tts_voice = 'en-US-JennyMultilingualV2Neural'
lang = "en-US"

# The SSML namespace URI is the http:// form given in the SSML specification;
# it is an identifier, not a link to fetch, so it must not be changed to https.
# html.escape() keeps characters such as & and < in the typed text from
# breaking the XML markup.
ssml = f"""<speak version='1.0'  xmlns="http://www.w3.org/2001/10/synthesis" xml:lang='{lang}'>
                     <voice name='{default_tts_voice}'>
                             {html.escape(text)}
                     </voice>
                   </speak>"""

In [9]:
# Make sure the sample folder exists before writing into it.
os.makedirs(output_dir, exist_ok=True)

speech_sythesis_result = speech_synthesizer.speak_ssml_async(ssml).get()

# Only save audio when synthesis completed; otherwise report why it did not,
# mirroring the handling used for the plain-text test above.
if speech_sythesis_result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
    stream = speechsdk.AudioDataStream(speech_sythesis_result)
    stream.save_to_wav_file(os.path.join(output_dir, "result_ssml.wav"))
elif speech_sythesis_result.reason == speechsdk.ResultReason.Canceled:
    cancellation_details = speech_sythesis_result.cancellation_details
    print(f"Speech synthesis canceled: {cancellation_details.reason}")
    if (cancellation_details.reason == speechsdk.CancellationReason.Error
            and cancellation_details.error_details):
        print(f"Error details: {cancellation_details.error_details}")
else:
    print(f"Unexpected synthesis result: {speech_sythesis_result.reason}")

## 2. Generate synthetic dataset and manifest.txt file as a training dataset


### Set the ssml template for the dataset

-   The ssml template is used to generate the synthetic dataset. Here is the reference for the ssml template: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup


In [5]:
def get_audio_file_by_speech_synthesis(text, file_path, lang, default_tts_voice):
    """Synthesize ``text`` with the given voice and save the audio as a WAV file.

    Uses the notebook-level ``speech_synthesizer``.

    Args:
        text: Plain text to speak; it is HTML-escaped before being embedded
            in the SSML document.
        file_path: Destination path of the WAV file.
        lang: Value of the SSML ``xml:lang`` attribute.
        default_tts_voice: Voice name placed in the SSML ``<voice>`` element.

    Returns:
        True when the audio was synthesized and written to ``file_path``;
        False when synthesis did not complete (nothing is written then).
    """
    # The namespace URI is the http:// form from the SSML specification.
    ssml = f"""<speak version='1.0'  xmlns="http://www.w3.org/2001/10/synthesis" xml:lang='{lang}'>
                     <voice name='{default_tts_voice}'>
                             {html.escape(text)}
                     </voice>
                   </speak>"""
    synthesis_result = speech_synthesizer.speak_ssml_async(ssml).get()

    if synthesis_result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
        # Report the failure instead of saving audio from an incomplete result.
        print(f"[ERROR] Speech synthesis failed for {file_path}: {synthesis_result.reason}")
        if synthesis_result.reason == speechsdk.ResultReason.Canceled:
            details = synthesis_result.cancellation_details
            if details.error_details:
                print(f"Error details: {details.error_details}")
        return False

    stream = speechsdk.AudioDataStream(synthesis_result)
    stream.save_to_wav_file(file_path)
    return True

### Generate synthetic wav dataset and manifest for Specific language from the synthetic_text_file


In [None]:
import datetime

# Check https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=stt for supported locale
language = CUSTOM_SPEECH_LOCALE

output_dir = "synthetic_data"
DELETE_OLD_DATA = True

os.makedirs(output_dir, exist_ok=True)

if DELETE_OLD_DATA:
    # Remove regular files only; os.remove() raises on sub-directories.
    for file in os.listdir(output_dir):
        old_path = os.path.join(output_dir, file)
        if os.path.isfile(old_path):
            os.remove(old_path)

# TTS_FOR_TRAIN is a comma-separated list of voices. Strip surrounding blanks
# so "voiceA, voiceB" works, and tolerate an unset variable.
train_tts_voices = [v.strip() for v in (TTS_FOR_TRAIN or "").split(',') if v.strip()]
if not train_tts_voices:
    print("[ERROR] TTS_FOR_TRAIN is empty; no training audio will be generated.")

manifest_path = os.path.join(output_dir, "manifest.txt")

# Each line of the text file is one JSON record; one WAV per record and voice.
for tts_voice in train_tts_voices:
    with open(synthetic_text_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # blank lines are not records
            try:
                expression = json.loads(line)
                no = expression['no']
                text = expression[language]
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON on line: {line}")
                print(e)
                continue
            except (KeyError, TypeError) as e:
                # The record lacks 'no' or the locale key (or is not an object).
                print(f"Skipping line without expected fields ({e!r}): {line}")
                continue

            timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
            file_name = f"{no}_locale_{language}_speaker_{tts_voice}_{timestamp}.wav"
            print(f"Generating {file_name}")
            synthesized = get_audio_file_by_speech_synthesis(text, os.path.join(output_dir, file_name), language, tts_voice)
            # An explicit False return means synthesis failed; any other value
            # (including None) counts as success.
            if synthesized is False:
                print(f"[WARN] Skipping manifest entry for {file_name}: synthesis failed.")
                continue
            # Manifest format: <file name>\t<transcript>, one line per audio file.
            with open(manifest_path, 'a', encoding='utf-8') as manifest_file:
                manifest_file.write(f"{file_name}\t{text}\n")

### Play WAV Files to test result in Output Folder

-   Use the os library to list all WAV files in the output folder.


In [None]:
import os
from IPython.display import Audio, display

files = os.listdir(output_dir)

# Keep only the WAV files, ordered by the numeric 'no' prefix of the file name.
wav_files = sorted(
    (name for name in files if name.endswith('.wav')),
    key=lambda name: int(name.split('_')[0]),
)
wav_files

### Play WAV Files

-   Use IPython.display.Audio to play each WAV file listed in the output folder.


In [None]:
# Render an audio player for the first three WAV files of the output folder.
for wav_name in wav_files[:3]:
    display(Audio(filename=os.path.join(output_dir, wav_name)))

## 3. Audio Data Augmentation

In this section, we will give various modulations to the synthetic data through audio data augmentation.

[Audiomentations](https://github.com/iver56/audiomentations) is a Python library for audio data augmentation, commonly used in machine learning projects to improve model robustness by creating variations in training data. It provides a range of transforms like adding noise, time stretching, pitch shifting, and applying reverb. The library is designed for ease of use, allowing users to apply multiple transformations to audio samples with customizable parameters. Audiomentations supports both mono and stereo audio and integrates seamlessly with common audio processing workflows. It's lightweight, efficient, and helps simulate real-world audio conditions for better generalization in models.


In [17]:
from audiomentations import (
    AddBackgroundNoise, OneOf, Compose, Aliasing, AddGaussianNoise, LoudnessNormalization, TimeStretch, PitchShift, Shift, Gain, GainTransition, 
    BandPassFilter, BandStopFilter, AddGaussianSNR, AddColorNoise, LowPassFilter, LowShelfFilter, HighPassFilter, HighShelfFilter, TimeStretch,
    PitchShift, Shift, AdjustDuration, ClippingDistortion, AirAbsorption, PeakingFilter, Normalize
)
import numpy as np

# Stage 1: background noise from the sample recording, configured either with
# absolute RMS bounds or with SNR bounds.
_background_noise_stage = OneOf(
    [
        AddBackgroundNoise(
            sounds_path="noise_sample/bg-noise.mp3",
            noise_rms="absolute",
            min_absolute_rms_db=-30,
            max_absolute_rms_db=-10,
        ),
        AddBackgroundNoise(
            sounds_path="noise_sample/bg-noise.mp3",
            min_snr_db=2,
            max_snr_db=4,
        ),
    ],
    p=0.3,
)

# Stage 2: synthetic noise, loudness normalization or aliasing.
_noise_stage = OneOf(
    [
        AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=1.0),
        AddGaussianSNR(min_snr_db=5.0, max_snr_db=40.0, p=1.0),
        LoudnessNormalization(p=1.0),
        Aliasing(p=1.0),
    ],
    p=0.3,
)

# Stage 3: filters and distortion.
_filter_stage = OneOf(
    [
        LowPassFilter(p=1.0),
        LowShelfFilter(p=1.0),
        HighPassFilter(p=1.0),
        HighShelfFilter(p=1.0),
        BandPassFilter(p=1.0),
        BandStopFilter(p=1.0),
        ClippingDistortion(p=0.8),
        AirAbsorption(p=0.8),
        PeakingFilter(p=0.8),
    ],
    p=0.6,
)

# Stage 4: gain, duration, tempo, pitch and shift changes.
_gain_and_timing_stage = OneOf(
    [
        Gain(min_gain_db=-6.0, max_gain_db=6.0, p=1.0),
        GainTransition(p=1.0),
        AdjustDuration(duration_seconds=5.0, p=0.5),
        AdjustDuration(duration_seconds=5.0, padding_position="start", padding_mode="wrap", p=0.5),
        AdjustDuration(duration_seconds=5.0, padding_position="start", padding_mode="reflect", p=0.5),
        TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
        PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
        Shift(p=0.5),
    ],
    p=0.3,
)

# The full augmentation pipeline: the four stages above, applied in order.
augment = Compose([
    _background_noise_stage,
    _noise_stage,
    _filter_stage,
    _gain_and_timing_stage,
])

In [None]:
from scipy.io import wavfile
from audiomentations.core.audio_loading_utils import load_sound_file

output_dir_aug = "synthetic_data_aug"
NUM_AUGMENTS = 5

os.makedirs(output_dir_aug, exist_ok=True)

# Write NUM_AUGMENTS augmented variants of every synthetic WAV file.
# NOTE(review): manifest.txt is not extended here, so the augmented files get
# no transcript line unless one is added elsewhere - confirm.
# NOTE(review): wavfile.write() stores the array with its own dtype; if the
# loader returns float samples the output is a float WAV rather than 16-bit
# PCM - confirm this is accepted by the training service.
for wav_file in wav_files:
    file_path = os.path.join(output_dir, wav_file)
    samples, sample_rate = load_sound_file(file_path, sample_rate=None, mono=False)

    # For 2-D audio, make the longer axis the last one before augmenting
    # (presumably channels-first, as the augmentation library expects).
    if samples.ndim == 2 and samples.shape[0] > samples.shape[1]:
        samples = samples.transpose()

    # Drop the ".wav" suffix so outputs are "<name>_aug_<i>.wav" rather than
    # "<name>.wav_aug_<i>.wav".
    base_name = os.path.splitext(wav_file)[0]

    for aug_idx in range(NUM_AUGMENTS):
        # Run the randomized pipeline once per variant; calling it a single
        # time outside this loop would write NUM_AUGMENTS identical files.
        # A copy is passed so every variant starts from the unmodified audio.
        augmented_samples = augment(samples=samples.copy(), sample_rate=sample_rate)
        # Swap the axes back for writing (2-D audio only).
        if augmented_samples.ndim == 2:
            augmented_samples = augmented_samples.transpose()

        output_file_path = os.path.join(output_dir_aug, f"{base_name}_aug_{aug_idx}.wav")
        wavfile.write(output_file_path, rate=sample_rate, data=augmented_samples)

In [None]:
# Copy every regular file from the original output folder (the WAV files and
# manifest.txt) into the augmented folder.
import glob
import shutil

# A plain loop is used because the copies are side effects, not a list to keep.
for src_path in glob.glob(f"{output_dir}/*"):
    if os.path.isfile(src_path):
        shutil.copy2(src_path, output_dir_aug)

## 4. Create a Zip file for custom model training


-   For audio + human-labeled data (Acoustic type) used to train a custom speech model, you need to create a zip file containing the audio files and the corresponding transcript text file.
-   Here is an example of the structure of the labeled text file.

    > ```text
    > audio1.wav	Content like data, models, tests, and endpoints are organized into Projects in the Custom Speech portal. Each project is specific to a domain and country/language. For example, you may create a project for call centers that use English in the United States. To create your first project, select the Speech-to-text/Custom speech, then click New Project. Follow the instructions provided by the wizard to create your project. After you've created a project, you should see four tabs: Data, Testing, Training, and Deployment. Use the links provided in Next steps to learn how to use each tab.
    > audio2.wav	Custom Speech provides tools that allow you to visually inspect the recognition quality of a model by comparing audio data with the corresponding recognition result. From the Custom Speech portal, you can play back uploaded audio and determine if the provided recognition result is correct. This tool allows you to quickly inspect quality of Microsoft's baseline speech-to-text model or a trained custom model without having to transcribe any audio data.
    > ```

    ```

    ```


In [None]:
import zipfile
import shutil

DELETE_OLD_DATA = True
USE_AUGMENTED_DATA = True
if USE_AUGMENTED_DATA:
    output_dir = output_dir_aug

train_dataset_dir = "train_dataset"
if not os.path.exists(train_dataset_dir):
    os.makedirs(train_dataset_dir)

if(DELETE_OLD_DATA):
    for file in os.listdir(train_dataset_dir):
        os.remove(os.path.join(train_dataset_dir, file))    

timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
zip_filename = f'train_{language}_{timestamp}.zip'
with zipfile.ZipFile(zip_filename, 'w') as zipf:
    for file in files:
        zipf.write(os.path.join(output_dir, file), file)

print(f"Created zip file: {zip_filename}")

shutil.move(zip_filename, os.path.join(train_dataset_dir, zip_filename))
print(f"Moved zip file to: {os.path.join(train_dataset_dir, zip_filename)}")
print(f"Moved zip file to: {os.path.join(train_dataset_dir, zip_filename)}")
train_dataset_path = {os.path.join(train_dataset_dir, zip_filename)}
%store train_dataset_path

## 5. Generate Evaluation dataset for custom model evaluation


In [None]:
import datetime

# Check https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=stt for supported locale
language = CUSTOM_SPEECH_LOCALE

# NOTE(review): this cell repeats the training-data generation (TTS_FOR_TRAIN
# voices) into "synthetic_data_daekeun" instead of "synthetic_data", although
# it sits under the evaluation heading. It looks like a leftover - confirm
# whether it is still needed.
output_dir = "synthetic_data_daekeun"
DELETE_OLD_DATA = True

os.makedirs(output_dir, exist_ok=True)

if DELETE_OLD_DATA:
    # Remove regular files only; os.remove() raises on sub-directories.
    for file in os.listdir(output_dir):
        old_path = os.path.join(output_dir, file)
        if os.path.isfile(old_path):
            os.remove(old_path)

# TTS_FOR_TRAIN is a comma-separated list of voices. Strip surrounding blanks
# so "voiceA, voiceB" works, and tolerate an unset variable.
train_tts_voices = [v.strip() for v in (TTS_FOR_TRAIN or "").split(',') if v.strip()]
if not train_tts_voices:
    print("[ERROR] TTS_FOR_TRAIN is empty; no audio will be generated.")

manifest_path = os.path.join(output_dir, "manifest.txt")

# Each line of the text file is one JSON record; one WAV per record and voice.
for tts_voice in train_tts_voices:
    with open(synthetic_text_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # blank lines are not records
            try:
                expression = json.loads(line)
                no = expression['no']
                text = expression[language]
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON on line: {line}")
                print(e)
                continue
            except (KeyError, TypeError) as e:
                # The record lacks 'no' or the locale key (or is not an object).
                print(f"Skipping line without expected fields ({e!r}): {line}")
                continue

            timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
            file_name = f"{no}_locale_{language}_speaker_{tts_voice}_{timestamp}.wav"
            print(f"Generating {file_name}")
            synthesized = get_audio_file_by_speech_synthesis(text, os.path.join(output_dir, file_name), language, tts_voice)
            # An explicit False return means synthesis failed; any other value
            # (including None) counts as success.
            if synthesized is False:
                print(f"[WARN] Skipping manifest entry for {file_name}: synthesis failed.")
                continue
            # Manifest format: <file name>\t<transcript>, one line per audio file.
            with open(manifest_path, 'a', encoding='utf-8') as manifest_file:
                manifest_file.write(f"{file_name}\t{text}\n")

In [None]:
import datetime

print(TTS_FOR_EVAL)

eval_output_dir = "synthetic_eval_data"
DELETE_OLD_DATA = True

os.makedirs(eval_output_dir, exist_ok=True)

if DELETE_OLD_DATA:
    # Remove regular files only; os.remove() raises on sub-directories.
    for file in os.listdir(eval_output_dir):
        old_path = os.path.join(eval_output_dir, file)
        if os.path.isfile(old_path):
            os.remove(old_path)

# TTS_FOR_EVAL is a comma-separated list of voices. Strip surrounding blanks
# so "voiceA, voiceB" works, and tolerate an unset variable.
eval_tts_voices = [v.strip() for v in (TTS_FOR_EVAL or "").split(',') if v.strip()]
if not eval_tts_voices:
    print("[ERROR] TTS_FOR_EVAL is empty; no evaluation audio will be generated.")

eval_manifest_path = os.path.join(eval_output_dir, "manifest.txt")

# Each line of the text file is one JSON record; one WAV per record and voice.
for tts_voice in eval_tts_voices:
    with open(synthetic_text_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # blank lines are not records
            try:
                expression = json.loads(line)
                no = expression['no']
                text = expression[language]
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON on line: {line}")
                print(e)
                continue
            except (KeyError, TypeError) as e:
                # The record lacks 'no' or the locale key (or is not an object).
                print(f"Skipping line without expected fields ({e!r}): {line}")
                continue

            timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
            # The evaluation file name does not include the voice name.
            file_name = f"{no}_{language}_{timestamp}.wav"
            synthesized = get_audio_file_by_speech_synthesis(text, os.path.join(eval_output_dir, file_name), language, tts_voice)
            # An explicit False return means synthesis failed; any other value
            # (including None) counts as success.
            if synthesized is False:
                print(f"[WARN] Skipping manifest entry for {file_name}: synthesis failed.")
                continue
            # Manifest format: <file name>\t<transcript>, one line per audio file.
            with open(eval_manifest_path, 'a', encoding='utf-8') as manifest_file:
                manifest_file.write(f"{file_name}\t{text}\n")

### Create a zip file for the evaluation dataset and move it to the eval_dataset directory


In [None]:
import os
from IPython.display import Audio, display
import zipfile
import shutil

DELETE_OLD_DATA = True

eval_dataset_dir = "eval_dataset"
if not os.path.exists(eval_dataset_dir):
    os.makedirs(eval_dataset_dir)

if(DELETE_OLD_DATA):
    for file in os.listdir(eval_dataset_dir):
        os.remove(os.path.join(eval_dataset_dir, file))    

files = os.listdir(eval_output_dir)
wav_files = [file for file in files if file.endswith('.wav')]

# Sort wav_files by 'no' in ascending order
wav_files.sort(key=lambda x: int(x.split('_')[0]))

timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
zip_filename = f'eval_{language}_{timestamp}.zip'
with zipfile.ZipFile(zip_filename, 'w') as zipf:
    for file in files:
        zipf.write(os.path.join(eval_output_dir, file), file)

print(f"Created zip file: {zip_filename}")


shutil.move(zip_filename, os.path.join(eval_dataset_dir, zip_filename))
print(f"Moved zip file to: {os.path.join(eval_dataset_dir, zip_filename)}")
eval_dataset_path = {os.path.join(eval_dataset_dir, zip_filename)}
%store eval_dataset_path