# Text to Speech

## Install and Imports

In [None]:
!pip install azure-cognitiveservices-speech python-dotenv pydub google-cloud-texttospeech ibm-watson ibm-cloud-sdk-core

In [None]:
from dotenv import load_dotenv
import os
import azure.cognitiveservices.speech as speechsdk
from google.cloud import texttospeech
import boto3
import time
from pydub import AudioSegment
from datetime import datetime
import pandas as pd
from pathlib import Path
from IPython.display import display
import textwrap
from ibm_watson import TextToSpeechV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

In [None]:
# Google Drive locations: the folder used for the benchmark artifacts and the
# .env file that is expected to hold the API credentials.
data_path = Path('/content/drive/My Drive/TCC_data/speech2Text')
env_path = Path('/content/drive/My Drive/Colab Notebooks', '.env')

# Export the variables from the .env file so os.getenv can read them later.
load_dotenv(dotenv_path=env_path)

In [None]:
# Shared wrapper that breaks text into lines of at most 80 columns.
wrapper = textwrap.TextWrapper(width=80)


def wrap_print(text):
    """Print ``text`` wrapped by the module-level ``wrapper``.

    Nothing is printed when wrapping yields no lines (e.g. empty text).
    """
    wrapped_lines = wrapper.wrap(text)
    if wrapped_lines:
        print('\n'.join(wrapped_lines))

In [None]:
def play_audio_file(file_path):
    """Load the audio stored at ``file_path`` and return it.

    The file is opened in binary mode and the open handle (not the path) is
    handed to ``AudioSegment.from_file``.

    Args:
        file_path: Path of the audio file to load.

    Returns:
        The object produced by ``AudioSegment.from_file``.
    """
    with open(file_path, 'rb') as audio_stream:
        return AudioSegment.from_file(audio_stream)

In [None]:
def generate_filename(base_name, ext, root='.'):
    """Build a timestamped file path of the form ``root/base_name_<stamp>.ext``.

    Args:
        base_name: Prefix of the file name (e.g. the API being benchmarked).
        ext: File extension, without the leading dot.
        root: Directory the file belongs to; a ``str`` or a ``Path``.

    Returns:
        The path as a string, with its components joined by ``/``.
    """
    # Microseconds keep names distinct when several files are generated within
    # the same second (otherwise a later file overwrites an earlier one), and
    # dashes replace colons because ':' is rejected or reinterpreted by some
    # filesystems and command-line tools.
    stamp = datetime.now().strftime('%d-%m-%Y_%H-%M-%S-%f')
    return '{}/{}_{}.{}'.format(str(root), base_name, stamp, ext)

In [None]:
# Quick look at the kind of path produced for files in the tts_tests folder.
generate_filename('a', 'wav', root=data_path / 'tts_tests')

## Azure Cognitive Services

In [None]:
def syntesize_text_azure(text, audio_filename):
    """Synthesize ``text`` with Azure Cognitive Services into ``audio_filename``.

    The subscription key and region are read from the ``AZURE_KEY`` and
    ``AZURE_REGION`` environment variables
    (region identifiers: https://aka.ms/speech/sdkregion).

    Args:
        text: Plain text to speak. It is XML-escaped before being embedded in
            the SSML document, so markup inside it is spoken literally.
        audio_filename: Path of the audio file the SDK writes to.

    Returns:
        The result of ``play_audio_file(audio_filename)``.

    Raises:
        RuntimeError: If the service does not report a completed synthesis.
    """
    # Imported locally so this function brings its own dependency into scope.
    from xml.sax.saxutils import escape

    speech_key, service_region = os.getenv("AZURE_KEY"), os.getenv("AZURE_REGION")
    speech_language = "pt-BR"
    speech_config = speechsdk.SpeechConfig(
        subscription=speech_key,
        region=service_region,
        speech_recognition_language=speech_language
    )

    SSML = """
    <speak xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="http://www.w3.org/2001/mstts" xmlns:emo="http://www.w3.org/2009/10/emotionml" version="1.0" xml:lang="en-US">
        <voice name="pt-BR-HeloisaRUS">
            <mstts:express-as style="General">
                <prosody rate="0%" pitch="0%">
                {}
                </prosody>
            </mstts:express-as>
        </voice>
    </speak>
    """

    # Route the synthesized audio to the requested file.
    audio_output = speechsdk.audio.AudioOutputConfig(filename=audio_filename)

    speech_synthesizer = speechsdk.SpeechSynthesizer(
        speech_config=speech_config, audio_config=audio_output
    )

    # Escape '&', '<' and '>' so arbitrary text cannot break the SSML document.
    ssml = SSML.format(escape(text))
    result = speech_synthesizer.speak_ssml_async(ssml).get()

    # Fail with the service's own diagnosis instead of letting a missing or
    # incomplete audio file surface later as an unrelated decoding error.
    if result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
        message = 'Azure speech synthesis failed: {}'.format(result.reason)
        if result.reason == speechsdk.ResultReason.Canceled:
            details = result.cancellation_details
            message = '{} ({}: {})'.format(
                message, details.reason, details.error_details
            )
        raise RuntimeError(message)

    return play_audio_file(audio_filename)

## Google Cloud Text To Speech

In [None]:
def syntesize_text_gcloud(text, audio_filename):
    """Synthesize ``text`` with Google Cloud Text-to-Speech into ``audio_filename``.

    The client is built from the service-account JSON file whose path is read
    from the ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable.

    Args:
        text: Text to synthesize.
        audio_filename: Path the returned audio bytes are written to.

    Returns:
        The result of ``play_audio_file(audio_filename)``.
    """
    credentials_path = os.getenv('GOOGLE_APPLICATION_CREDENTIALS')
    tts_client = texttospeech.TextToSpeechClient.from_service_account_json(
        credentials_path
    )

    # Request: the input text, the pt-BR voice (gender given as NEUTRAL) and
    # LINEAR16 as the audio encoding.
    request = dict(
        input=texttospeech.SynthesisInput(text=text),
        voice=texttospeech.VoiceSelectionParams(
            language_code='pt-BR',
            name='pt-BR-Standard-A',
            ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL,
        ),
        audio_config=texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.LINEAR16,
        ),
    )
    synthesis = tts_client.synthesize_speech(**request)

    # audio_content is binary, so the file is written in binary mode.
    with open(audio_filename, 'wb') as audio_file:
        audio_file.write(synthesis.audio_content)

    return play_audio_file(audio_filename)

## AWS Polly

In [None]:
def syntesize_text_aws(text, audio_filename):
    """Synthesize ``text`` with AWS Polly into ``audio_filename``.

    Credentials and region are read from the ``AWS_ACCESS_KEY_ID``,
    ``AWS_SECRET_ACCESS_KEY`` and ``AWS_REGION`` environment variables.

    Args:
        text: Text to synthesize.
        audio_filename: Path the MP3 audio returned by Polly is written to.

    Returns:
        The result of ``play_audio_file(audio_filename)``.
    """
    AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID')
    AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY')
    AWS_REGION = os.getenv('AWS_REGION')
    polly_client = boto3.Session(
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        region_name=AWS_REGION
    ).client('polly')

    response = polly_client.synthesize_speech(
        VoiceId='Vitoria',
        OutputFormat='mp3',
        Text=text
    )

    # The audio arrives as a streaming body backed by the HTTP connection;
    # close it explicitly so the connection is released even if reading fails.
    audio_stream = response['AudioStream']
    try:
        audio_bytes = audio_stream.read()
    finally:
        audio_stream.close()

    with open(audio_filename, 'wb') as out:
        out.write(audio_bytes)

    return play_audio_file(audio_filename)

## IBM Watson

In [None]:
def syntesize_text_watson(text, audio_filename):
    """Synthesize ``text`` with IBM Watson Text to Speech into ``audio_filename``.

    The API key and service URL are read from the ``WATSON_KEY`` and
    ``WATSON_URL`` environment variables.

    Args:
        text: Text to synthesize.
        audio_filename: Path the returned audio (requested as ``audio/wav``)
            is written to.

    Returns:
        The result of ``play_audio_file(audio_filename)``.
    """
    api_key = os.getenv('WATSON_KEY')
    service_url = os.getenv('WATSON_URL')

    tts_service = TextToSpeechV1(authenticator=IAMAuthenticator(api_key))
    tts_service.set_service_url(service_url)

    # The output file is opened before the request is issued.
    with open(audio_filename, 'wb') as audio_file:
        response = tts_service.synthesize(
            text,
            voice='pt-BR_IsabelaVoice',
            accept='audio/wav',
        )
        audio_file.write(response.get_result().content)

    return play_audio_file(audio_filename)

## Tests

In [None]:
# Portuguese sample texts fed to every provider in the benchmark.
texts = [
    "Heloísa é a professora de uma turma de crianças de 6 anos. Ela adora contar histórias para seus alunos, no final do horário, as crianças sentam-se no chão de frente para a professora, ficam quietinhas e curiosas, aguardam",
    "Margaridinha  Branca dormia na sua casa, embaixo da terra.",
    "O rato roeu a roupa do rei de roma, a rainha com raiva rasgou o resto.",
    # One text split across two literals; the trailing space keeps the
    # fragments from fusing into "surgindoe" when they are concatenated.
    (
        "Dona Aranha subiu pela parede, veio a chuva forte e a derrubou, já passou a chuva e o sol já vem surgindo "
        "e a dona aranha continua a subir. Ela é teimosa,desobediente. sobe, sobe, sobe nunca está contente!"
    ),
    "Celina ama os animais. Ela tem uma gatinha chamada Viola.",
]

# Character count of each text, in the same order as ``texts``.
lengths = [len(text) for text in texts]

In [None]:
def benchmark():
    """Synthesize every text in ``texts`` with each provider and time the calls.

    For each provider, every text is synthesized into a freshly named file
    under ``data_path / 'tts_tests'`` and the resulting audio is displayed in
    the notebook.

    Returns:
        A dict keyed by provider name. Each value holds the provider's
        ``'call'`` function and ``'ext'`` file extension plus four parallel
        lists: ``'file'`` (file name without directory), ``'text'``,
        ``'length'`` and ``'time'`` (seconds spent inside the provider call).
    """
    benchmark_apis = {
        'azure_cognitive_services': {
            'call': syntesize_text_azure,
            'ext': 'wav'
        },
        'watson_text_to_speech': {
            'call': syntesize_text_watson,
            'ext': 'wav'
        },
        'aws_polly': {
            'call': syntesize_text_aws,
            'ext': 'mp3'
        },
        'gcloud_text_to_speech': {
            'call': syntesize_text_gcloud,
            'ext': 'wav'
        },
    }

    # One result list per column, filled in lockstep below.
    for api_results in benchmark_apis.values():
        for col in ['file', 'text', 'length', 'time']:
            api_results[col] = []

    for api, api_results in benchmark_apis.items():

        print(api)
        # perf_counter is monotonic, so measured durations are not distorted
        # by system clock adjustments.
        start_api = time.perf_counter()
        api_call = api_results['call']
        ext = api_results['ext']

        for text, length in zip(texts, lengths):
            file_name = generate_filename(api, ext, root=data_path / 'tts_tests')

            start_syntesis = time.perf_counter()
            audio = api_call(text, file_name)
            total_time_syntesis = time.perf_counter() - start_syntesis

            # Render the audio only after the clock has stopped, so notebook
            # display work does not inflate the provider's synthesis time.
            display(audio)

            # Store only the file name, not the directory it lives in.
            api_results['file'].append(file_name.split('/')[-1])
            api_results['length'].append(length)
            api_results['text'].append(text)
            api_results['time'].append(total_time_syntesis)

        # The per-provider total covers the whole loop, display included.
        total_time_api = time.perf_counter() - start_api
        print('took {} seconds'.format(total_time_api))
        print('#' * 80)

    return benchmark_apis

In [None]:
# Run the benchmark: every sample text is synthesized once per provider and
# the per-provider measurements are kept for the cells below.
results = benchmark()

In [None]:
# One DataFrame per provider, keeping only the measurement columns (the
# 'call' and 'ext' bookkeeping entries are dropped).
dataframes = {
    api: pd.DataFrame(api_results).drop(columns=['call', 'ext'])
    for api, api_results in results.items()
}

In [None]:
# Save each provider's measurements as <provider>.csv in the tts_tests folder.
for api, df in dataframes.items():
    df.to_csv((data_path / 'tts_tests').joinpath(f'{api}.csv'), index=False)