### Speech resource configuration

In [1]:
import os

import azure.cognitiveservices.speech as speechsdk
from azure.core.credentials import AzureKeyCredential
from dotenv import load_dotenv

# Load settings from a local .env file before reading them below.
load_dotenv(override=True)

# Index os.environ directly (rather than .get) so that a missing setting fails
# right here with a KeyError naming the variable, instead of handing None to
# the credential / SDK constructors further down.
speech_api_key = os.environ["AZURE_SPEECH_API_KEY"]
speech_region = os.environ["AZURE_SPEECH_REGION"]  # Note this takes a REGION instead of an endpoint!
credential = AzureKeyCredential(speech_api_key)

# Speech config is the speech RESOURCE config, used for both Text-To-Speech and Speech-To-Text
speech_config = speechsdk.SpeechConfig(
    subscription=speech_api_key, region=speech_region
)

### Speech-To-Text

In [2]:
# The audio input config selects the audio DEVICE used for Speech-To-Text.
# Here it is the default microphone; a file could be used instead.
audio_input_config = speechsdk.audio.AudioConfig(
    use_default_microphone=True
)

# The speech recognizer is the "client" that performs Speech-To-Text. It pairs
# the speech resource config with the audio input config.
speech_recognizer = speechsdk.SpeechRecognizer(
    audio_config=audio_input_config,
    speech_config=speech_config,
)

In [15]:
# Synchronous (blocking) single recognition: the prints below only run once
# recognize_once() has returned. It stops on silence or after 15 seconds.
speech_result = speech_recognizer.recognize_once()

print("Code waits...")

print(speech_result)

Code waits
SpeechRecognitionResult(result_id=ab993028145c430d97abc26f8efd6bf3, text="Could waits for me to say something and it'll spit something out at the other end.", reason=ResultReason.RecognizedSpeech)


In [6]:
# Asynchronous (non-blocking) single recognition: the call returns a future
# straight away. Recognition stops on silence or after 15 seconds.
speech_recognition_result_future = speech_recognizer.recognize_once_async()

print("Code keeps going")

# .get() is the point where we wait for the result.
speech_recognition_result = speech_recognition_result_future.get()
print(speech_recognition_result)

Code keeps going
SpeechRecognitionResult(result_id=66bad5855cc24991ac8cfc459d171cf4, text="Or I can do it this way where it returns to future promise I guess.", reason=ResultReason.RecognizedSpeech)


In [13]:
# Continuous recognition. Requires connecting callbacks for handling various events.

# Note, this async method doesn't play nicely with my notebook, so the
# recognizer is rebuilt here, inside the same cell that uses it.
speech_recognizer = speechsdk.SpeechRecognizer(
    audio_config=audio_input_config,
    speech_config=speech_config,
)

#I'm just printing all events, but this could be much more complicated event handling.
def on_recognizing(event):
    """Print the text carried by an in-progress `recognizing` event.

    Args:
        event: Event object exposing `result.text`.
    """
    interim_text = event.result.text
    print("Recognizing: {}".format(interim_text))

def on_recognized(event):
    """Print the text carried by a `recognized` event.

    Args:
        event: Event object exposing `result.text`.
    """
    final_text = event.result.text
    print("Recognized: {}".format(final_text))

def on_canceled(event):
    """Print why recognition was canceled.

    The cancellation reason is read from `event.cancellation_details` rather
    than from a `reason` attribute on the event itself. When the details carry
    a non-empty `error_details` message, it is printed on a second line.

    Args:
        event: Canceled-event object; expected to expose
            `cancellation_details` with `reason` and `error_details`.
    """
    details = getattr(event, "cancellation_details", None)
    if details is None:
        # Nothing to report beyond the fact that a cancel event fired.
        print("Canceled: no cancellation details available")
        return

    print(f"Canceled: {details.reason}")
    error_details = getattr(details, "error_details", None)
    if error_details:
        print(f"Error details: {error_details}")

def on_session_started(event):
    """Announce that a session has started; the event payload is not used."""
    message = "Session started."
    print(message)

def on_session_stopped(event):
    """Announce that a session has stopped; the event payload is not used."""
    message = "Session stopped."
    print(message)

import time

# Hook each event fired by the speech recognizer up to its printing handler,
# in the same order as they are listed here.
for event_signal, handler in (
    (speech_recognizer.recognizing, on_recognizing),
    (speech_recognizer.recognized, on_recognized),
    (speech_recognizer.canceled, on_canceled),
    (speech_recognizer.session_started, on_session_started),
    (speech_recognizer.session_stopped, on_session_stopped),
):
    event_signal.connect(handler)

print("Starting continuous recognition...")
speech_recognizer.start_continuous_recognition_async()

# Let recognition run for 10 seconds, then request that it stop.
time.sleep(10)
print("Stopping continuous recognition...")
speech_recognizer.stop_continuous_recognition_async()

Starting continuous recognition...
Session started.
Recognizing: and
Recognizing: and trying again
Recognizing: and trying again this
Recognizing: and trying again this way so it looks
Recognizing: and trying again this way so it looks good that's
Recognizing: and trying again this way so it looks good that's great
Recognized: And trying again this way so it looks good. That's great.
Stopping continuous recognition...
Session stopped.
None


### Text-To-Speech

In [3]:
## Text-To-Speech

# The audio output config selects the audio DEVICE used for Text-To-Speech.
# Here it is the default speaker; an audio file could be used instead.
audio_output_config = speechsdk.audio.AudioOutputConfig(
    use_default_speaker=True
)

# The speech synthesizer is the "client" that performs Text-To-Speech. It pairs
# the speech resource config with the audio output config.
speech_synthesizer = speechsdk.SpeechSynthesizer(
    audio_config=audio_output_config,
    speech_config=speech_config,
)

In [3]:
# Synchronous (blocking) single synthesis: the print below runs only after
# speak_text() has returned.
speech_synthesizer.speak_text(
    "I am a friendly robot and I do not intend to take over the world."
)

print("Code waits...")

Code waits...


In [24]:
# Asynchronous (non-blocking) single synthesis: speak_text_async() hands back
# a future, so the print below runs without waiting for the result.
result_future = speech_synthesizer.speak_text_async(
    "I am a friendly robot and I do not intend to take over the world."
)

print("Code continues...")

# .get() is the point where we wait for the synthesis result.
result = result_future.get()
print(result)

Code continues...
SpeechSynthesisResult(result_id=464d91f12ea44646a67b74046f459784, reason=ResultReason.SynthesizingAudioCompleted, audio_length=170046)


In [31]:
import time

# Other async methods too: kick off speaking without waiting on the result...
speech_synthesizer.start_speaking_text_async(
    "one, two, three, four, five, six, seven, eight..."
)

# ...then, three seconds later, request that speaking stop.
time.sleep(3)

speech_synthesizer.stop_speaking_async()

<azure.cognitiveservices.speech.ResultFuture at 0x2392e1c5b20>

In [23]:
# First voice: set it on the shared speech config, then build a new
# synthesizer from that config before speaking.
speech_config.speech_synthesis_voice_name = "en-US-AshleyNeural"

speech_synthesizer = speechsdk.SpeechSynthesizer(
    audio_config=audio_output_config,
    speech_config=speech_config,
)
speech_synthesizer.speak_text(
    "I am a friendly robot and I do not intend to take over the world."
)


# Second voice: same steps with a different voice name.
speech_config.speech_synthesis_voice_name = "en-US-AIGenerate1Neural"

speech_synthesizer = speechsdk.SpeechSynthesizer(
    audio_config=audio_output_config,
    speech_config=speech_config,
)
speech_synthesizer.speak_text(
    "I am a friendly robot and I do not intend to take over the world."
)

<azure.cognitiveservices.speech.SpeechSynthesisResult at 0x20a1392e8e0>

In [56]:
# Switch the shared speech config to another voice, then build a new
# synthesizer from that config before speaking.
speech_config.speech_synthesis_voice_name = "en-US-NancyNeural"

speech_synthesizer = speechsdk.SpeechSynthesizer(
    audio_config=audio_output_config,
    speech_config=speech_config,
)
speech_synthesizer.speak_text(
    "I am a friendly robot and I do not intend to take over the world."
)

<azure.cognitiveservices.speech.SpeechSynthesisResult at 0x20a13caf970>

In [59]:
# SSML to change pitch, speaking style, rate...
# The voice is named inside the SSML document itself. An outer express-as
# style wraps a pitch-shifted prosody block, and a nested express-as
# ("whispering") with its own prosody (higher pitch, slower rate) is applied
# to the single word "definitely".
ssml_text = """
<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='http://www.w3.org/2001/mstts' xml:lang='en-US'>
  <voice name='en-US-NancyNeural'>
    <mstts:express-as style='unfriendly'>
        <prosody pitch='+15Hz'>
            Oh, I am such a friendly robot, and I
            <mstts:express-as style='whispering' styledegree="2">
                <prosody pitch='+20Hz' rate='-20%'>
                    definitely
                </prosody>
            </mstts:express-as>
            don't want to take over the world!
        </prosody>
    </mstts:express-as>
  </voice>
</speak>
"""
# speak_ssml() takes the SSML document as a string (instead of plain text).
speech_synthesizer.speak_ssml(ssml_text)


<azure.cognitiveservices.speech.SpeechSynthesisResult at 0x20a13bfac10>

In [87]:
# SSML to change language.
# Three variations are spoken one after another; `ssml_text` is reassigned
# before each speak_ssml() call.

# 1) Note multilingual voice name, and de-DE language tag (German text).
ssml_text = """
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="en-US">
    <voice name="en-US-AvaMultilingualNeural">
        <lang xml:lang="de-DE">
            Ich bin ein freundlicher Roboter und habe nicht vor, die Welt zu erobern
        </lang>
    </voice>
</speak>
"""
speech_synthesizer.speak_ssml(ssml_text)

# 2) Same multilingual voice and German text, but note the en-US language tag.
ssml_text = """
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="en-US">
    <voice name="en-US-AvaMultilingualNeural">
        <lang xml:lang="en-US">
            Ich bin ein freundlicher Roboter und habe nicht vor, die Welt zu erobern.
        </lang>
    </voice>
</speak>
"""
speech_synthesizer.speak_ssml(ssml_text)

# 3) Note the de-DE voice, here given English text under an en-US language tag.
ssml_text = """
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="en-US">
    <voice name="de-DE-TanjaNeural">
        <lang xml:lang="en-US">
            I am a friendly robot and I do not intend to take over the world.
        </lang>
    </voice>
</speak>
"""
speech_synthesizer.speak_ssml(ssml_text)

<azure.cognitiveservices.speech.SpeechSynthesisResult at 0x20a13d696a0>

### Speech Translation

In [10]:
# The speech translation config is the speech RESOURCE config for translation.
# Note how its arguments are identical to the normal speech resource config:
#   speech_config = speechsdk.SpeechConfig(
#       subscription=speech_api_key, region=speech_region
#   )
speech_translation_config = speechsdk.translation.SpeechTranslationConfig(
    subscription=speech_api_key,
    region=speech_region,
)

# However, it has some extra settings: the language to recognize and the
# languages to translate into.
speech_translation_config.speech_recognition_language = "en-US"
for target_language in ("fr", "de"):
    speech_translation_config.add_target_language(target_language)

# Build a default-microphone audio input config again (same settings as the
# one created earlier for Speech-To-Text).
audio_input_config = speechsdk.audio.AudioConfig(use_default_microphone=True)

# The recognizer client takes a translation config and an audio config.
translation_recognizer = speechsdk.translation.TranslationRecognizer(
    audio_config=audio_input_config,
    translation_config=speech_translation_config,
)

In [11]:
# Single blocking translation recognition: recognize_once() returns the
# result directly, which is then printed.
translation_result = translation_recognizer.recognize_once()
print(translation_result)

TranslationRecognitionResult(result_id=e15af6e15cca4dbdb3db2ce37a5ca2b7, translations={'fr': 'Il s’agit de tester la traduction unique.', 'de': 'Dies ist das Testen der einzelnen Übersetzung.'}, reason=ResultReason.TranslatedSpeech)


In [None]:
# The translation recognizer also has the other methods you'd expect from the
# speech recognizer. They are only called here to show that they exist; the
# return values are not used.
translation_recognizer.recognize_once_async()
translation_recognizer.start_continuous_recognition()
translation_recognizer.stop_continuous_recognition()

In [15]:
# Speech to translated speech example
translation_result = translation_recognizer.recognize_once()

# Mapping voices to languages
voices_lookup = {
    "de": "de-DE-KlarissaNeural",
    "fr": "fr-FR-MauriceNeural"
}

for language, translation in translation_result.translations.items():
    print(language, translation)

    # Set voice based on language to translate to.
    voice_name = voices_lookup.get(language)
    if voice_name is None:
        # No voice is mapped for this language, so there is no voice name to
        # assign to the speech config; report it and move on to the next one
        # instead of passing None along.
        print(f"No voice configured for '{language}'; skipping speech synthesis.")
        continue
    speech_config.speech_synthesis_voice_name = voice_name

    # Construct synthesizer with correct voice.
    speech_synthesizer = speechsdk.SpeechSynthesizer(
        speech_config=speech_config, audio_config=audio_output_config
    )

    # Speak.
    speech_synthesizer.speak_text(translation)
    

fr Je suis un robot amical et je ne veux pas conquérir le monde.
de Ich bin ein freundlicher Roboter und möchte nicht die Weltherrschaft übernehmen.


### Text Translation

In [35]:
# Note: Translator SDK is currently in beta, hopefully it's improved.
from azure.ai.translation.text import TextTranslationClient, TranslatorCredential
from dotenv import load_dotenv

# Load settings from the local .env file before reading the translator values.
load_dotenv(override=True)

translator_api_key = os.environ.get("AZURE_TRANSLATOR_API_KEY")
translator_region = os.environ.get("AZURE_TRANSLATOR_REGION")
translator_endpoint = os.environ.get("AZURE_TRANSLATOR_ENDPOINT")

# Note this is NOT the default AzureKeyCredential: the credential is built from
# the key AND the region, and the client additionally takes an endpoint.
credential = TranslatorCredential(translator_api_key, translator_region)
text_translator = TextTranslationClient(
    credential=credential,
    endpoint=translator_endpoint,
)

In [58]:
from azure.ai.translation.text.models import InputTextItem

# translate() takes a list of "InputTextItem" models. A single item is sent
# here, so only the first entry of the response is kept.
source_item = InputTextItem(
    text="I am a friendly robot, and I do not intend to take over the world"
)
translation_result = text_translator.translate(
    content=[source_item],
    to=["de", "fr"],
)[0]

print("Translation Result:")
print(translation_result, "\n")

Translation Result:
{'detectedLanguage': {'language': 'en', 'score': 1.0}, 'translations': [{'text': 'Ich bin ein freundlicher Roboter und habe nicht die Absicht, die Weltherrschaft zu übernehmen', 'to': 'de'}, {'text': 'Je suis un robot amical, et je n’ai pas l’intention de conquérir le monde', 'to': 'fr'}]} 



In [68]:
# Transliteration: convert Japanese text from the "Jpan" script to the "Latn"
# script. A single item is sent, so only the first response entry is kept.
japanese_item = InputTextItem(text="こんにちは")
transliteration = text_translator.transliterate(
    content=[japanese_item],
    language="ja",
    from_script="Jpan",
    to_script="Latn",
)[0]

print("Transliterated text: '{}'.".format(transliteration.text))

Transliterated text: 'Kon'nichiwa​'.


In [73]:
# What? This isn't implemented?
# The client used in this notebook has no `detect` method, so the call raises
# AttributeError. The error is the point of this cell, so it is caught and
# displayed rather than left to abort a "Run All". (The translate() response
# printed earlier in this notebook does include a 'detectedLanguage' entry.)
try:
    text_translator.detect()
except AttributeError as error:
    print(f"{type(error).__name__}: {error}")

AttributeError: 'TextTranslationClient' object has no attribute 'detect'