# [Text To Speech] Synthetic Audio Dataset Creation

This sample demonstrates how to use the Azure AI Speech API to generate a synthetic audio dataset from text.

> ✨ **_Note_** <br>
> Please check the supported languages and region availability before you get started - https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts / https://learn.microsoft.com/en-us/azure/ai-services/speech-service/regions

## Prerequisites

Configure a Python virtual environment for 3.10 or later:

1.  Open the Command Palette (Ctrl+Shift+P).
1.  Search for Python: Create Environment.
1.  Select Venv / Conda and choose where to create the new environment.
1.  Select the Python interpreter version. Create with version 3.10 or later.


## 1. Set up Speech SDK


In [None]:
import azure.cognitiveservices.speech as speechsdk
import os
import html
import time
import json
from dotenv import load_dotenv
from openai import AzureOpenAI
from azure.ai.inference import ChatCompletionsClient
from azure.ai.inference.models import SystemMessage
from azure.ai.inference.models import UserMessage
from azure.core.credentials import AzureKeyCredential

# Load settings from a .env file into the process environment.
load_dotenv()

# True  -> use the Azure OpenAI client (AZURE_OPENAI_* settings).
# False -> use the Azure AI Inference client (AZURE_PHI3.5_* settings).
USE_AOAI = True

# Azure AI Speech settings.
SPEECH_KEY = os.getenv("AZURE_AI_SPEECH_API_KEY")
SPEECH_REGION = os.getenv("AZURE_AI_SPEECH_REGION")
CUSTOM_SPEECH_LANG = os.getenv("CUSTOM_SPEECH_LANG")
CUSTOM_SPEECH_LOCALE = os.getenv("CUSTOM_SPEECH_LOCALE")
TTS_FOR_QUESTION = os.getenv("TTS_FOR_QUESTION")
TTS_FOR_ANSWER = os.getenv("TTS_FOR_ANSWER")

# Azure AI Inference (Phi-3.5) settings.
phi_api_endpoint = os.getenv("AZURE_PHI3.5_ENDPOINT")
phi_api_key = os.getenv("AZURE_PHI3.5_API_KEY")
phi_deployment_name = os.getenv("AZURE_PHI3.5_DEPLOYMENT_NAME")

# Azure OpenAI settings.
aoai_api_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
aoai_api_key = os.getenv("AZURE_OPENAI_API_KEY")
aoai_api_version = os.getenv("AZURE_OPENAI_API_VERSION")
aoai_deployment_name = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")

# Keep only the part of each endpoint up to and including "/models".
# os.getenv() returns None for an unset variable (typically the endpoint of
# the client that is NOT selected), so check for a value first instead of
# letting the substring test raise TypeError.
if phi_api_endpoint and "/models" in phi_api_endpoint:
    phi_api_endpoint = phi_api_endpoint.split("/models")[0] + "/models"

if aoai_api_endpoint and "/models" in aoai_api_endpoint:
    aoai_api_endpoint = aoai_api_endpoint.split("/models")[0] + "/models"

print("=== Azure AI Speech Info ===")
print(f"SPEECH_REGION={SPEECH_REGION}")
print(f"CUSTOM_SPEECH_LANG={CUSTOM_SPEECH_LANG}")
print(f"CUSTOM_SPEECH_LOCALE={CUSTOM_SPEECH_LOCALE}")
print(f"TTS_FOR_QUESTION={TTS_FOR_QUESTION}")
print(f"TTS_FOR_ANSWER={TTS_FOR_ANSWER}\n")

try:
    if USE_AOAI:
        client = AzureOpenAI(
            azure_endpoint=aoai_api_endpoint,
            api_key=aoai_api_key,
            api_version=aoai_api_version,
        )

        print("=== Initialized AzureOpenAI client ===")
        print(f"AZURE_OPENAI_ENDPOINT={aoai_api_endpoint}")
        print(f"AZURE_OPENAI_API_VERSION={aoai_api_version}")
        print(f"AZURE_OPENAI_DEPLOYMENT_NAME={aoai_deployment_name}")

    else:
        # NOTE: pass the trimmed ".../models" endpoint. The full URL form, e.g.
        # https://aoai-services1.services.ai.azure.com/models/chat/completions?api-version=2024-05-01-preview
        # runs into a 500 error.
        client = ChatCompletionsClient(
            endpoint=phi_api_endpoint,
            credential=AzureKeyCredential(phi_api_key),
        )

        print("=== Initialized AI Inference client ===")
        print(f"AZURE_PHI3.5_ENDPOINT={phi_api_endpoint}")
        # Never echo the secret itself; only report whether a value is set.
        print(f"AZURE_PHI3.5_API_KEY={'***' if phi_api_key else None}")
        print(f"AZURE_PHI3.5_DEPLOYMENT_NAME={phi_deployment_name}")

except (ValueError, TypeError) as e:
    # Invalid or missing client settings are reported here; `client` stays
    # undefined in that case, so fix the settings and re-run this cell.
    print(e)


Create an instance of a speech config with the specified subscription key and service region.
Provide your own subscription key and service region (e.g., "westus") through the `AZURE_AI_SPEECH_API_KEY` and `AZURE_AI_SPEECH_REGION` environment variables.


In [12]:
# Speech service configuration built from the key and region constants.
speech_config = speechsdk.SpeechConfig(
    subscription=SPEECH_KEY,
    region=SPEECH_REGION,
)
# Audio output configuration that targets the default speaker.
audio_config = speechsdk.audio.AudioOutputConfig(
    use_default_speaker=True,
)

In [None]:
# Synthesizer combining the speech configuration with the audio output target.
speech_synthesizer = speechsdk.SpeechSynthesizer(
    speech_config=speech_config,
    audio_config=audio_config,
)

## 2. Generate Synthetic Question and Answer Text Dataset


In [None]:
import os
from azure.ai.inference.models import SystemMessage
from azure.ai.inference.models import UserMessage

# Number of JSONL lines requested from the model.
NUM_SAMPLES = 2

# System prompt: describes the kind of text wanted and the expected line format.
system_message = """
Generate plain text sentences of #topic# related text to improve the recognition of domain-specific words and phrases.
Domain-specific words can be uncommon or made-up words, but their pronunciation must be straightforward to be recognized. 
Use text data that's close to the expected spoken utterances. The nummber of utterances per line should be 1. 
Here is examples of the expected format:
{"no": 1, "string": "string", "string": "string", "string": "string"}
{"no": 2, "string": "string", "string": "string", "string": "string"} 
"""

# Subject of the generated utterances.
topic = f"""
Contoso Electronics call center question and answer related expected spoken utterances for {CUSTOM_SPEECH_LANG} and English languages.
"""

# Output instructions: line count, required keys and "bare JSONL only".
question = f"""
create {NUM_SAMPLES} lines of jsonl of the topic in {CUSTOM_SPEECH_LANG} and english. jsonl format is required. use 'no' as number 'type' including 'question' or 'answer' and '{CUSTOM_SPEECH_LOCALE}', 'en-US' keys for the languages.
only include the lines as the result. Do not include ```jsonl, ``` and blank line in the result. 
"""

# User prompt: the topic followed by the output instructions.
user_message = f"""
#topic#: {topic}
Question: {question}
"""

# Sampling settings shared by both client types.
sampling_options = {"temperature": 0.8, "max_tokens": 1024, "top_p": 0.1}

if not USE_AOAI:
    # Azure AI Inference client: typed message objects.
    response = client.complete(
        messages=[
            SystemMessage(content=system_message),
            UserMessage(content=user_message),
        ],
        # Simply change the model name for the appropriate model "Phi-3.5-mini-instruct" or "Phi-3.5-vision-instruct"
        model=phi_deployment_name,
        **sampling_options,
    )
else:
    # Azure OpenAI client: role/content dictionaries.
    response = client.chat.completions.create(
        model=aoai_deployment_name,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
        ],
        **sampling_options,
    )

# Text of the first choice; the next section parses it line by line.
content = response.choices[0].message.content
print(content)

print("Usage Information:")
# NOTE: cached-token details (response.usage.prompt_tokens_details.cached_tokens)
# are only supported by o1 models, so they are not printed here.
usage = response.usage
print(f"Completion Tokens: {usage.completion_tokens}")
print(f"Prompt Tokens: {usage.prompt_tokens}")
print(f"Total Tokens: {usage.total_tokens}")

## 3. Generate Synthetic Speaker Identification Audio Dataset


In [None]:
import datetime

languages = [CUSTOM_SPEECH_LOCALE]  # List of languages to generate audio files
output_dir = "synthetic_two_speaker_data"
DELETE_OLD_DATA = True

# exist_ok makes this safe whether or not the folder already exists.
os.makedirs(output_dir, exist_ok=True)

if DELETE_OLD_DATA:
    for file in os.listdir(output_dir):
        old_path = os.path.join(output_dir, file)
        # os.remove() fails on directories, so only plain files are deleted.
        if os.path.isfile(old_path):
            os.remove(old_path)

manifest_path = os.path.join(output_dir, "manifest.txt")

# Split the generated text once. Blank lines are dropped so that a stray empty
# line cannot shift the question/answer pairing.
jsonl_lines = [line for line in content.strip().split('\n') if line.strip()]

# Lines are consumed in pairs: line i is the question, line i + 1 the answer.
for i in range(0, len(jsonl_lines), 2):
    try:
        question_line = jsonl_lines[i]
        answer_line = jsonl_lines[i + 1]

        question_expression = json.loads(question_line)
        answer_expression = json.loads(answer_line)

        question_no = question_expression['no']
        question_text = question_expression[CUSTOM_SPEECH_LOCALE]
        question_tts_voice = TTS_FOR_QUESTION

        answer_text = answer_expression[CUSTOM_SPEECH_LOCALE]
        answer_tts_voice = TTS_FOR_ANSWER

        combined_text = f"{question_text} {answer_text}"
        timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        file_name = f"{question_no}_{CUSTOM_SPEECH_LOCALE}_{timestamp}.wav"
        print(f"Generating {file_name}")

        # One SSML document with two <voice> elements, so question and answer
        # are spoken by different voices within the same audio file.
        # The SSML namespace URI is "http://..." (not "https://...").
        # html.escape() keeps characters such as & and < from breaking the XML.
        ssml = f"""<speak version='1.0'  xmlns="http://www.w3.org/2001/10/synthesis" xml:lang='{CUSTOM_SPEECH_LOCALE}'>
                        <voice name='{question_tts_voice}'>
                                {html.escape(question_text)}
                        </voice>
                        <voice name='{answer_tts_voice}'>
                                {html.escape(answer_text)}
                        </voice>
                    </speak>"""

        speech_sythesis_result = speech_synthesizer.speak_ssml_async(ssml).get()

        # Only keep audio from a completed synthesis; otherwise report why it
        # failed and skip both the WAV file and its manifest entry.
        if speech_sythesis_result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
            print(f"Speech synthesis failed for {file_name}: {speech_sythesis_result.reason}")
            if speech_sythesis_result.reason == speechsdk.ResultReason.Canceled:
                cancellation_details = speech_sythesis_result.cancellation_details
                print(f"Cancellation reason: {cancellation_details.reason}")
                if cancellation_details.reason == speechsdk.CancellationReason.Error:
                    print(f"Error details: {cancellation_details.error_details}")
            continue

        stream = speechsdk.AudioDataStream(speech_sythesis_result)
        stream.save_to_wav_file(os.path.join(output_dir, file_name))

        # Manifest line format: <wav file name><TAB><question and answer text>
        with open(manifest_path, 'a', encoding='utf-8') as manifest_file:
            manifest_file.write(f"{file_name}\t{combined_text}\n")
    except (json.JSONDecodeError, IndexError, KeyError) as e:
        # JSONDecodeError: a line is not valid JSON.
        # IndexError: a question line has no answer line after it.
        # KeyError: 'no' or the locale key is missing from a parsed line.
        print(f"Error processing lines {i} and {i + 1}")
        print(e)

## 4. Test the speaker diarization with the generated audio


In [None]:
import os
from IPython.display import Audio, display

# Every entry of the output folder.
files = os.listdir(output_dir)

# Keep only the entries whose name ends with ".wav".
wav_files = list(filter(lambda name: name.endswith('.wav'), files))

# Order by the integer that precedes the first underscore of each name.
wav_files.sort(key=lambda name: int(name.split('_')[0]))
wav_files

In [None]:
# Play each WAV file in the output folder
for wav_file in wav_files[:2]:
    file_path = os.path.join(output_dir, wav_file)
    display(Audio(filename=file_path))

In [9]:
def conversation_transcriber_recognition_canceled_cb(evt: speechsdk.SessionEventArgs):
    """Print a fixed notice when called; the event argument is not inspected."""
    print("Canceled event")

def conversation_transcriber_session_stopped_cb(evt: speechsdk.SessionEventArgs):
    """Print a fixed notice when called; the event argument is not inspected."""
    print("SessionStopped event")

def conversation_transcriber_transcribed_cb(evt: speechsdk.SpeechRecognitionEventArgs):
    """Print the result carried by a "transcribed" event.

    Recognized speech prints its text and speaker id; a no-match result prints
    the no-match details. Any other result reason prints only the header.
    """
    print("\nTRANSCRIBED:")
    result = evt.result
    reason = result.reason

    if reason == speechsdk.ResultReason.RecognizedSpeech:
        print(f"\tText={result.text}")
        print(f"\tSpeaker ID={result.speaker_id}\n")
        return

    if reason == speechsdk.ResultReason.NoMatch:
        print(f"\tNOMATCH: Speech could not be TRANSCRIBED: {result.no_match_details}")

def conversation_transcriber_transcribing_cb(evt: speechsdk.SpeechRecognitionEventArgs):
    """Print the text and speaker id carried by a "transcribing" event."""
    print("TRANSCRIBING:")
    result = evt.result
    print(f"\tText={result.text}")
    print(f"\tSpeaker ID={result.speaker_id}")

def conversation_transcriber_session_started_cb(evt: speechsdk.SessionEventArgs):
    """Print a fixed notice when called; the event argument is not inspected."""
    print("SessionStarted event")

def speech_recognition_from_file(file_path: str, lang: str) -> None:
    """Transcribe one audio file with a ConversationTranscriber and print the events.

    The call blocks until the transcriber raises a session-stopped or canceled
    event, then stops the transcriber. Output is produced by the module-level
    ``conversation_transcriber_*_cb`` callbacks.

    Args:
        file_path: Path of the audio file used as transcription input.
        lang: Value passed as ``speech_recognition_language`` to the speech config.
    """
    import threading  # local import: only this function needs it

    speech_config = speechsdk.SpeechConfig(
        subscription=SPEECH_KEY,
        region=SPEECH_REGION,
        speech_recognition_language=lang,
    )
    # Ask the service to diarize intermediate results as well.
    speech_config.set_property(
        property_id=speechsdk.PropertyId.SpeechServiceResponse_DiarizeIntermediateResults,
        value='true',
    )
    audio_config = speechsdk.AudioConfig(filename=file_path)
    conversation_transcriber = speechsdk.transcription.ConversationTranscriber(
        speech_config=speech_config,
        audio_config=audio_config,
    )

    # Set by stop_cb (called from an SDK event) and awaited below.
    transcribing_done = threading.Event()

    def stop_cb(evt: speechsdk.SessionEventArgs):
        """Signal the waiting caller that transcription has ended."""
        print('CLOSING on {}'.format(evt))
        transcribing_done.set()

    # Connect callbacks to the events fired by the conversation transcriber
    conversation_transcriber.transcribed.connect(conversation_transcriber_transcribed_cb)
    conversation_transcriber.transcribing.connect(conversation_transcriber_transcribing_cb)
    conversation_transcriber.session_started.connect(conversation_transcriber_session_started_cb)
    conversation_transcriber.session_stopped.connect(conversation_transcriber_session_stopped_cb)
    conversation_transcriber.canceled.connect(conversation_transcriber_recognition_canceled_cb)
    # stop transcribing on either session stopped or canceled events
    conversation_transcriber.session_stopped.connect(stop_cb)
    conversation_transcriber.canceled.connect(stop_cb)

    conversation_transcriber.start_transcribing_async()

    try:
        # Wait in short slices rather than indefinitely so that an interrupt
        # of the running cell is still noticed promptly.
        while not transcribing_done.wait(timeout=0.5):
            pass
    finally:
        # Always stop the transcriber, even when the wait was interrupted, and
        # block until the stop request has completed.
        conversation_transcriber.stop_transcribing_async().get()

In [None]:
# Run the transcription helper on the first three generated WAV files.
for wav_file in wav_files[:3]:
    audio_path = os.path.join(output_dir, wav_file)
    speech_recognition_from_file(audio_path, CUSTOM_SPEECH_LOCALE)