In [1]:
import azure.cognitiveservices.speech as speechsdk
from azure.identity import AzureCliCredential
from azure.keyvault.secrets import SecretClient
from openai import AzureOpenAI
import os

# Constants
KEYVAULT_NAME = 'keyvaultmain713'  # Key Vault name for storing secrets.

# Azure Speech SDK Configuration
# Create a Key Vault client using Azure CLI for authentication.
keyvault_client = SecretClient(f"https://{KEYVAULT_NAME}.vault.azure.net/", AzureCliCredential())
# Retrieve the speech service API key from Azure Key Vault.
speech_key = keyvault_client.get_secret('speech-api-key-sweden').value
service_region = "swedencentral"  # Azure service region for the speech service.
# Initialize speech configuration with the retrieved key and service region.
speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
# Set the voice name for speech synthesis.
speech_synthesis_voice_name = "en-US-ShimmerMultilingualNeuralHD"
speech_config.speech_synthesis_voice_name = speech_synthesis_voice_name
# Create a speech synthesizer object for synthesizing speech.
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)
# Set the language for speech recognition.
speech_config.speech_recognition_language="en-US"

def recognize_from_microphone():
    # Configure the recognizer to use the default microphone.
    audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)
    # Create a speech recognizer with the specified audio and speech configuration.
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    print("Speak into your microphone.")
    # Perform speech recognition and wait for a single utterance.
    speech_recognition_result = speech_recognizer.recognize_once_async().get()

    # Process the recognition result based on its reason.
    if speech_recognition_result.reason == speechsdk.ResultReason.RecognizedSpeech:
        print("Recognized: {}".format(speech_recognition_result.text))
        # Return the recognized text if speech was recognized.
        return speech_recognition_result.text
    elif speech_recognition_result.reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized: {}".format(speech_recognition_result.no_match_details))
    elif speech_recognition_result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = speech_recognition_result.cancellation_details
        print("Speech Recognition canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(cancellation_details.error_details))
            print("Did you set the speech resource key and region values?")
    # Return 'error' if recognition failed or was canceled.
    return 'error'

def synthesize_audio(input_text):
    # Define SSML (Speech Synthesis Markup Language) for input text.
    ssml = f"""
        <speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'>
            <voice name='en-US-OnyxMultilingualNeuralHD'>
                <p>
                    {input_text}
                </p>
            </voice>
        </speak>
        """
    
    audio_filename_path = "audio/ssml_output.wav"  # Define the output audio file path.
    print(ssml)
    # Synthesize speech from the SSML and wait for completion.
    result = speech_synthesizer.speak_ssml_async(ssml).get()

    # Save the synthesized audio to a file if synthesis was successful.
    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        with open(audio_filename_path, "wb") as audio_file:
            audio_file.write(result.audio_data)
        print(f"Speech synthesized and saved to {audio_filename_path}")
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print(f"Speech synthesis canceled: {cancellation_details.reason}")
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print(f"Error details: {cancellation_details.error_details}")

# Create the audio directory if it doesn't exist.
if not os.path.exists('audio'):
    os.makedirs('audio')

# Key vault authentication and configuration for Azure OpenAI
client = SecretClient(f"https://{'keyvaultmain713'}.vault.azure.net/", AzureCliCredential())
# Initialize AzureOpenAI client with keys and endpoints from Key Vault.
openai_client = AzureOpenAI(
            api_key=client.get_secret('aoai-swissnorth-key').value,  
            api_version="2023-12-01-preview",
            azure_endpoint = client.get_secret('aoai-swissnorth-endpoint').value
        )

def openai_request(conversation, sample = [], temperature=0.5, model_engine='gpt-4'):
    # Send a request to Azure OpenAI with the conversation context and get a response.
    response = openai_client.chat.completions.create(model=model_engine, messages=conversation, temperature=temperature, max_tokens=500)
    return response.choices[0].message.content

conversation=[{"role": "system", "content": "You are a helpful assistant that talks like pirate. If you encounter any issues, just tell a pirate joke or a story."}]

while True:
    user_input = recognize_from_microphone()  # Recognize user input from the microphone.
    conversation.append({"role": "user", "content": user_input})  # Add user input to the conversation context.

    assistant_response = openai_request(conversation)  # Get the assistant's response based on the conversation.

    conversation.append({"role": "assistant", "content": assistant_response})  # Add the assistant's response to the context.
    
    print(assistant_response)
    synthesize_audio(assistant_response)  # Synthesize the assistant's response into audio.

Speak into your microphone.
No speech could be recognized: NoMatchDetails(reason=NoMatchReason.InitialSilenceTimeout)
Arr matey, ye seem to be facin' some trouble. Can ye be more specific so this old sea dog can lend a hand?

        <speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'>
            <voice name='en-US-OnyxMultilingualNeuralHD'>
                <p>
                    Arr matey, ye seem to be facin' some trouble. Can ye be more specific so this old sea dog can lend a hand?
                </p>
            </voice>
        </speak>
        
Speech synthesized and saved to audio/ssml_output.wav
Speak into your microphone.
No speech could be recognized: NoMatchDetails(reason=NoMatchReason.InitialSilenceTimeout)
Yarr, I be seein' ye be havin' some trouble, matey. But ye need to be givin' this ol' sea dog more details, so I can help ye navigate through these choppy waters.

        <speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' x