In [1]:
import os
import azure.cognitiveservices.speech as speechsdk
from openai import AzureOpenAI
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the process environment,
# then read the Azure OpenAI connection settings from it.
load_dotenv()
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_VERSION = os.environ.get("AZURE_OPENAI_API_VERSION")
AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_GPT4_DEPLOYMENT_NAME = os.environ.get("AZURE_OPENAI_GPT4_DEPLOYMENT_NAME")

# Azure OpenAI client built from the settings above.
client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_API_KEY,
    api_version=AZURE_OPENAI_API_VERSION,
)

In [2]:
# Azure Speech credentials are read from the environment.
SPEECH_KEY = os.getenv("SPEECH_KEY")
SPEECH_REGION = os.getenv("SPEECH_REGION")
engine_name = "test"

speech_config = speechsdk.SpeechConfig(subscription=SPEECH_KEY, region=SPEECH_REGION)
# Language used for text-to-speech synthesis.
speech_config.speech_synthesis_language = "en-US"
# Language used for speech-to-text recognition.
speech_config.speech_recognition_language = "en-US"

# Make sure the SDK log directory exists. exist_ok=True keeps this cell safe to
# re-run and avoids the check-then-create race of os.path.exists + os.makedirs.
os.makedirs("./log", exist_ok=True)
speech_config.set_property(speechsdk.PropertyId.Speech_LogFilename, "./log/log.txt")

# Voice used by the synthesizer created below.
speech_config.speech_synthesis_voice_name = "en-US-JennyMultilingualNeural"
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)

In [3]:
def speech_to_text():
    """Recognize a single utterance from the default microphone.

    Uses the module-level ``speech_config`` for credentials and language.

    Returns:
        str: The recognized text on success,
        ``"Sorry, I didn't catch that."`` when nothing could be matched,
        ``"Recognition canceled."`` when the recognition was canceled, and an
        empty string for any other result reason.
    """
    # Capture audio from the default microphone.
    audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)

    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
    print("Say something...")

    # Blocks until one utterance has been processed.
    result = speech_recognizer.recognize_once_async().get()

    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        return result.text
    if result.reason == speechsdk.ResultReason.NoMatch:
        return "Sorry, I didn't catch that."
    if result.reason == speechsdk.ResultReason.Canceled:
        # Surface why the recognition was canceled (same reporting as
        # speech_from_file); otherwise the cause is silently lost.
        cancellation_details = result.cancellation_details
        print("Speech Recognition canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(cancellation_details.error_details))
        return "Recognition canceled."
    return ""

In [4]:
def speech_from_file(filename, language="en-US"):
    """Transcribe a single utterance from an audio file.

    Args:
        filename: Path of the audio file passed to the Speech SDK as input.
        language: Recognition language handed to the recognizer. Defaults to
            ``"en-US"``, the value that was previously hard-coded.

    Returns:
        str: The recognized text, or an empty string when nothing was
        recognized or the recognition was canceled (details are printed).
    """
    audio_config = speechsdk.audio.AudioConfig(filename=filename)
    # Recognizer that reads from the file instead of a microphone.
    speech_recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config, language=language, audio_config=audio_config)

    # recognize_once() returns after a single utterance: the end of the utterance
    # is determined by listening for silence at the end or until a maximum of
    # about 30 seconds of audio is processed. That makes it suitable only for
    # single-shot recognition such as a command or query; for long-running
    # multi-utterance recognition use start_continuous_recognition() instead.
    result = speech_recognizer.recognize_once()

    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        return result.text
    elif result.reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized: {}".format(result.no_match_details))
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print("Speech Recognition canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(cancellation_details.error_details))
    return ""


In [5]:
def extract_entities(text):
    """Ask the chat model to extract entities from ``text`` as JSON.

    The request uses the module-level ``client`` and the deployment named by
    ``AZURE_OPENAI_GPT4_DEPLOYMENT_NAME`` with the ``json_object`` response
    format.

    Args:
        text: The text (e.g. a call transcript) sent as the user message.

    Returns:
        The content of the first choice's message; the model is asked to
        answer with a JSON object shaped like the example in the prompt.
    """
    response = client.chat.completions.create(
        model=AZURE_OPENAI_GPT4_DEPLOYMENT_NAME,
        messages=[
            # The example is written as valid JSON (quoted keys) so the shape
            # shown to the model matches the JSON output being requested.
            {"role": "system", "content": """
             Extract the entities from the text and provide only JSON output.

             {"caller_name": "John", "call_purpose": "meeting"}

             """},
            {"role": "user", "content": text},
        ],
        response_format={"type": "json_object"},
        # Same sampling setting as evaluate_sentiment() and translate(), so
        # repeated extractions on the same text vary as little as possible.
        temperature=0,
    )

    return response.choices[0].message.content

In [6]:
# Define the text-to-speech function
def text_to_speech(text):
    """Speak ``text`` with the module-level ``speech_synthesizer``.

    Args:
        text: The text to synthesize.

    Returns:
        bool: ``True`` when the synthesis completed, ``False`` when it did not
        or when any exception was raised (the problem is printed, not raised).
    """
    try:
        # Blocks until the synthesis request has finished.
        result = speech_synthesizer.speak_text_async(text).get()
        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            print("Text-to-speech conversion successful.")
            return True

        print(f"Error synthesizing audio: {result}")
        if result.reason == speechsdk.ResultReason.Canceled:
            # The result object alone does not say why the request was
            # canceled; report the reason and, for errors, the details.
            cancellation_details = result.cancellation_details
            print(f"Speech synthesis canceled: {cancellation_details.reason}")
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                print(f"Error details: {cancellation_details.error_details}")
        return False
    except Exception as ex:
        # Deliberately best-effort: report the failure and keep the notebook running.
        print(f"Error synthesizing audio: {ex}")
        return False

In [7]:
def evaluate_sentiment(text):
    """Ask the chat model to classify the sentiment of ``text``.

    The system prompt requests a single category (positive, negative, or
    neutral); whatever the model answers is returned unmodified.

    Args:
        text: The text sent as the user message.

    Returns:
        The content of the first choice's message.
    """
    system_message = """
    You are an AI assistant that helps recognize the sentiment in a given text.
    1. Evaluate the given text and provide the category of the sentiment as either positive, negative, or neutral.
    2. Do not provide any additional examples to the output, just the category.
    """

    conversation = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": text},
    ]
    completion = client.chat.completions.create(
        model=AZURE_OPENAI_GPT4_DEPLOYMENT_NAME,
        messages=conversation,
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [8]:
def translate(text, target_language):
    """Ask the chat model to translate ``text`` into ``target_language``.

    Args:
        text: The text sent as the user message.
        target_language: Name of the language, inserted into the system
            prompt; it must be a string.

    Returns:
        The content of the first choice's message.
    """
    # Assemble the prompt from its fixed parts around the language name.
    system_message = "".join((
        "You are a helpful assistant that translates text into ",
        target_language,
        ".\n    Answer in a clear and concise manner only translating the text.\n    Text:\n    ",
    ))

    conversation = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": text},
    ]
    completion = client.chat.completions.create(
        model=AZURE_OPENAI_GPT4_DEPLOYMENT_NAME,
        messages=conversation,
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [9]:
# Transcribe the sample call recording and show the recognized text.
source = "./data/Call1_separated_16k_health_insurance.wav"
text = speech_from_file(source)
print("Transcription: {}".format(text))

Transcription: Hello, thank you for calling Contoso, who am I speaking with today? Hi, my name is Mary Rondo. I'm trying to enroll myself with Contuso. Hi Mary. Uh, are you calling because you need health insurance? Yes, Yeah, I'm calling to sign up for insurance. Great. Uh, if you can answer a few questions, uh, we can get you signed up in a jiffy. OK. Umm, So, uh, what's your full name?


In [10]:
# Classify the sentiment of the transcript with the Azure OpenAI chat model.
response = evaluate_sentiment(text)
print("Sentiment: {}".format(response))

Sentiment: neutral


In [11]:
# Ask the chat model for the entities in the transcript and show its answer.
entities = extract_entities(text)
print("Entities: {}".format(entities))

Entities: {
  "caller_name": "Mary Rondo",
  "call_purpose": "sign up for insurance"
}


In [12]:
# Have the Azure OpenAI chat model translate the transcript into Spanish.
translated_text = translate(text, "Spanish")
print("Translated to Spanish: {}".format(translated_text))

Translated to Spanish: Hola, gracias por llamar a Contoso, ¿con quién tengo el gusto de hablar hoy? Hola, mi nombre es Mary Rondo. Estoy intentando inscribirme en Contuso. Hola Mary. Eh, ¿llamas porque necesitas seguro de salud? Sí, sí, estoy llamando para inscribirme en un seguro. Genial. Eh, si puedes responder algunas preguntas, eh, podemos inscribirte en un momento. OK. Eh, entonces, ¿cuál es tu nombre completo?
