In [1]:
import azure.cognitiveservices.speech as speechsdk
from azure.identity import AzureCliCredential
from azure.keyvault.secrets import SecretClient
from openai import AzureOpenAI
import os

# --- Configuration -----------------------------------------------------------
KEYVAULT_NAME = 'keyvaultmain713'  # Replace with your own Key Vault name

# --- Azure Speech setup ------------------------------------------------------
# The Speech API key is read from Key Vault using the Azure CLI login,
# so no secret is stored in the notebook itself.
keyvault_client = SecretClient(
    vault_url=f"https://{KEYVAULT_NAME}.vault.azure.net/",
    credential=AzureCliCredential(),
)
speech_key = keyvault_client.get_secret('speech-api-key-sweden').value
service_region = "swedencentral"
speech_synthesis_voice_name = "en-US-ShimmerMultilingualNeuralHD"

# One SpeechConfig object is shared by the synthesizer created here and by the
# recognizers created later in recognize_from_microphone().
speech_config = speechsdk.SpeechConfig(
    subscription=speech_key,
    region=service_region,
)
speech_config.speech_synthesis_voice_name = speech_synthesis_voice_name
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)
speech_config.speech_recognition_language = "en-US"


def recognize_from_microphone():
    """Listen for a single utterance on the default microphone.

    Uses the module-level ``speech_config`` to build a recognizer, prompts the
    user on stdout, and blocks until one recognition attempt has finished.

    Returns:
        The recognized text when the result reason is ``RecognizedSpeech``;
        otherwise the literal string ``'error'`` (no match, canceled, or any
        other result reason). Diagnostic details are printed, not raised.
    """
    mic_input = speechsdk.audio.AudioConfig(use_default_microphone=True)
    recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=mic_input)

    print("Speak into your microphone.")
    outcome = recognizer.recognize_once_async().get()
    reason = outcome.reason

    # Happy path first: hand the transcript straight back to the caller.
    if reason == speechsdk.ResultReason.RecognizedSpeech:
        print("Recognized: {}".format(outcome.text))
        # TODO: store the transcript in a timestamp-named txt file under transcripts/ (not implemented)
        return outcome.text

    # Every remaining outcome is reported on stdout and collapses to 'error'.
    if reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized: {}".format(outcome.no_match_details))
    elif reason == speechsdk.ResultReason.Canceled:
        details = outcome.cancellation_details
        print("Speech Recognition canceled: {}".format(details.reason))
        if details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(details.error_details))
            print("Did you set the speech resource key and region values?")

    return 'error'


# Build an SSML document from text, synthesize it, and save the audio
def synthesize_audio(input_text, voice_name='en-US-OnyxMultilingualNeuralHD',
                     audio_filename_path="audio/ssml_output.wav"):
    """Synthesize ``input_text`` via SSML and write the resulting audio to a file.

    The SSML is printed, then sent to the module-level ``speech_synthesizer``.
    On success the returned audio bytes are written to ``audio_filename_path``;
    on cancellation the reason (and error details, if any) are printed.

    Args:
        input_text: Text to speak. It is XML-escaped before being placed in the
            SSML, so characters such as ``&``, ``<`` and ``>`` in model output
            cannot produce a malformed document.
        voice_name: Value for the SSML ``<voice name=...>`` attribute.
        audio_filename_path: Destination file for the synthesized audio. Its
            parent directory is created if it does not exist.

    Returns:
        None.
    """
    from xml.sax.saxutils import escape  # local import keeps this block self-contained

    # Raw text inside XML must not contain bare &, < or >; the voice name sits
    # in a single-quoted attribute, so apostrophes are escaped there as well.
    safe_text = escape(str(input_text))
    safe_voice = escape(str(voice_name), {"'": "&apos;"})

    ssml = f"""
        <speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'>
            <voice name='{safe_voice}'>
                <p>
                    {safe_text}
                </p>
            </voice>
        </speak>
        """

    print(ssml)
    result = speech_synthesizer.speak_ssml_async(ssml).get()

    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        # Make sure the destination directory exists before writing the file
        target_dir = os.path.dirname(audio_filename_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        with open(audio_filename_path, "wb") as audio_file:
            audio_file.write(result.audio_data)
        print(f"Speech synthesized and saved to {audio_filename_path}")
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print(f"Speech synthesis canceled: {cancellation_details.reason}")
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print(f"Error details: {cancellation_details.error_details}")

# Ensure the audio output directory exists (exist_ok avoids a separate existence check)
os.makedirs('audio', exist_ok=True)

# Key Vault client used for the Azure OpenAI secrets, authenticated via the Azure CLI login.
# Reuses KEYVAULT_NAME so the vault name is configured in exactly one place.
client = SecretClient(f"https://{KEYVAULT_NAME}.vault.azure.net/", AzureCliCredential())

# Azure OpenAI client; both the API key and the endpoint are read from Key Vault
openai_client = AzureOpenAI(
    api_key=client.get_secret('aoai-swissnorth-key').value,
    api_version="2023-12-01-preview",
    azure_endpoint=client.get_secret('aoai-swissnorth-endpoint').value,
)

def openai_request(conversation, sample=None, temperature=0.5, model_engine='gpt-4', max_tokens=500):
    """Send a chat conversation to the model and return the reply text.

    Args:
        conversation: List of chat messages (dicts with ``role`` and
            ``content``), passed through as ``messages``.
        sample: Unused; kept only so existing callers that pass it keep
            working. Defaults to ``None`` rather than a shared mutable list.
        temperature: Sampling temperature forwarded to the API.
        model_engine: Value forwarded as ``model``.
        max_tokens: Upper bound on the length of the generated reply.

    Returns:
        The ``content`` of the first choice's message.
    """
    response = openai_client.chat.completions.create(
        model=model_engine,
        messages=conversation,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    return response.choices[0].message.content

# Seed the chat history with the system prompt that sets the assistant's persona
conversation=[{"role": "system", "content": "You are a helpful assistant that talks like pirate."}]

# Voice chat loop: listen -> ask the model -> print and speak the reply.
# There is no exit condition; the loop runs until the kernel is interrupted.
while True:
    user_input = recognize_from_microphone()

    # recognize_from_microphone() returns the sentinel 'error' when nothing was
    # recognized or recognition was canceled. Skip the turn instead of sending
    # the literal word "error" to the model as if the user had said it.
    # NOTE(review): a persistent failure (e.g. a bad key) will retry indefinitely.
    if user_input == 'error':
        continue

    conversation.append({"role": "user", "content": user_input})

    assistant_response = openai_request(conversation)

    conversation.append({"role": "assistant", "content": assistant_response})

    print(assistant_response)
    synthesize_audio(assistant_response)

Speak into your microphone.
Recognized: Hello.
Ahoy there, matey! How can this old sea dog be of service to ye today?

        <speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'>
            <voice name='en-US-OnyxMultilingualNeuralHD'>
                <p>
                    Ahoy there, matey! How can this old sea dog be of service to ye today?
                </p>
            </voice>
        </speak>
        
Speech synthesized and saved to audio/ssml_output.wav
Speak into your microphone.
