*This notebook primarily uses `azure.ai.textanalytics` classes to meet the analytical requirements of this capstone project.*

In [None]:
import re
import textgrid

def extract_text_from_textgrid(file_path, tier_name=None):
    """Return the non-empty marks of one TextGrid tier joined by single spaces.

    Args:
        file_path: Path of the TextGrid file to load.
        tier_name: Name of the tier to read. When falsy, the first tier of the
            file is used.

    Returns:
        A string made of every mark that contains non-whitespace text, in tier
        order, separated by a single space.

    Raises:
        ValueError: If ``tier_name`` is given but no tier with that name exists.
    """
    # Load the TextGrid file
    tg = textgrid.TextGrid.fromFile(file_path)

    if tier_name:
        tier = tg.getFirst(tier_name)
        # A lookup miss yields None; fail here with a clear message instead of
        # a "NoneType is not iterable" error further down.
        if tier is None:
            raise ValueError(f"Tier {tier_name!r} not found in {file_path!r}")
    else:
        # If no specific tier is mentioned, extract from the first tier
        tier = tg[0]

    # Keep marks that contain text; a mark of None is treated like an empty one.
    return " ".join(
        interval.mark
        for interval in tier
        if interval.mark and interval.mark.strip()
    )

def remove_intents(text):
    """
    Strip angle-bracket annotations such as <UNSURE> or <UNIN/> from the text.

    Whitespace runs left behind by the removed tags are collapsed to a single
    space, and leading/trailing whitespace is trimmed.
    """
    # Anything from '<' up to the next '>' (brackets included) counts as a tag.
    tag_pattern = re.compile(r'<[^>]*>')
    whitespace_run = re.compile(r'\s+')

    without_tags = tag_pattern.sub('', text)
    return whitespace_run.sub(' ', without_tags).strip()

In [1]:
import os
import time
from dotenv import load_dotenv
import azure.cognitiveservices.speech as speechsdk

In [2]:
# Speech Service subscription settings.
# load_dotenv() runs first, before the two environment lookups below
# (e.g. the Speech SDK API key and region).
load_dotenv()
AZURE_SPEECH_KEY = os.environ.get("SPEECHSDK_API_KEY")
AZURE_SERVICE_REGION = os.environ.get("SPEECHSDK_REGION")

In [4]:
# Initialize Speech Service
def initialize_speech_service(audio_file_path):
    """Build a speech recognizer that reads its audio from ``audio_file_path``.

    The speech configuration is created from the module-level
    AZURE_SPEECH_KEY and AZURE_SERVICE_REGION values.
    """
    recognizer = speechsdk.SpeechRecognizer(
        speech_config=speechsdk.SpeechConfig(
            subscription=AZURE_SPEECH_KEY,
            region=AZURE_SERVICE_REGION,
        ),
        audio_config=speechsdk.audio.AudioConfig(filename=audio_file_path),
    )
    return recognizer

# Perform transcription on the audio file
def transcribe_continuous_audio_file(recognizer):
    """Run continuous recognition until it ends and return the joined transcript.

    Args:
        recognizer: A speech recognizer exposing ``recognized``, ``canceled``
            and ``session_stopped`` signals plus
            ``start_continuous_recognition`` / ``stop_continuous_recognition``.

    Returns:
        All non-empty recognized texts, in arrival order, joined by one space.

    The function blocks (polling every half second) until the recognizer
    reports cancellation or a stopped session, or until the user interrupts
    it with Ctrl+C / kernel interrupt.
    """
    import time  # local import keeps this cell runnable on its own

    recognized_speech = []

    # Flipped by the callbacks below once the recognizer has finished.
    done = False

    def handle_recognized(evt):
        text = evt.result.text
        print(f"Recognized: {text}")
        # Skip empty results so they do not leave double spaces in the transcript.
        if text:
            recognized_speech.append(text)

    def handle_canceled(evt):
        nonlocal done
        # The cancellation reason and error text live on cancellation_details;
        # evt.result.reason is a ResultReason and never equals a CancellationReason.
        details = evt.cancellation_details
        print(f"Recognition canceled: {details.reason}")
        if details.reason == speechsdk.CancellationReason.Error:
            print(f"Error details: {details.error_details}")
        done = True

    def handle_session_stopped(evt):
        nonlocal done
        done = True

    # Attach handlers for recognized results, cancellations and session end
    recognizer.recognized.connect(handle_recognized)
    recognizer.canceled.connect(handle_canceled)
    recognizer.session_stopped.connect(handle_session_stopped)

    # Start continuous recognition; results arrive asynchronously via the handlers.
    recognizer.start_continuous_recognition()
    print("Transcribing...")

    # Wait for completion (i.e., done = True)
    try:
        while not done:
            time.sleep(.5)
    except KeyboardInterrupt:
        print("Transcription stopped by user.")
    finally:
        # Stop from the waiting thread rather than from inside an SDK callback,
        # where a blocking stop call can stall the callback thread.
        recognizer.stop_continuous_recognition()

    return " ".join(recognized_speech)  # Return all the transcriptions concatenated

### Raw Transcription

In [5]:
# Audio file to transcribe. Set the PRIMOCK_AUDIO_PATH environment variable to
# point at another recording without editing this cell; when it is unset, the
# original hard-coded local path is used.
audio_file_path = os.getenv(
    "PRIMOCK_AUDIO_PATH",
    r"C:\Users\zandr\OneDrive\Desktop\NUS Work\BT4103\faith-speechsdk\speechsdk\data\primock\day1_consultation01_doctor.wav",
)
recognizer = initialize_speech_service(audio_file_path)
hypothesis_text = transcribe_continuous_audio_file(recognizer)

Transcribing...
Recognized: Hello.
Recognized: Hi. Yeah. OK. Hello. Good morning. So how can I help you this morning?
Recognized: Yeah, I'm sorry to hear that. And when you say diarrhoea, what do you mean by diarrhea? Do you mean you're going to the toilet more often or are your stools more loose?
Recognized: OK. And how many times a day are you going, let's say over the last couple of days?
Recognized: 6-7 times a day and you mentioned this mainly water tree. Have you noticed any other things like blood in your stools?
Recognized: OK. And you mentioned you've had some pain in your tummy as well. Whereabouts is the pain exactly?
Recognized: One side. And what side is that?
Recognized: That's right. OK. And can you describe the pain to me?
Recognized: OK. And there's a pain. Is that is it there all the time or does it come and go?
Recognized: Does the pain move anywhere else because on between your back?
Recognized: OK, fine. And you mentioned you've been feeling quite weak and shaky as

### Summarizing with `AbstractiveSummary`

In [15]:
# Text Analytics settings read from the environment; each is None when unset.
text_analytics_key = os.environ.get("text_analytics_key")
text_analytics_endpoint = os.environ.get("text_analytics_endpoint")

In [16]:
from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential

# authenticate client
def authenticate_client():
    """Create a TextAnalyticsClient from the module-level key and endpoint.

    Returns:
        A TextAnalyticsClient built with ``text_analytics_endpoint`` and an
        AzureKeyCredential wrapping ``text_analytics_key``.

    Raises:
        ValueError: If the key or the endpoint is missing or empty. Without
            this check, ``str(None)`` would pass the literal text "None" to
            the client and the mistake would only surface on the first request.
    """
    if not text_analytics_key or not text_analytics_endpoint:
        raise ValueError(
            "text_analytics_key and text_analytics_endpoint must both be set "
            "before creating the Text Analytics client"
        )
    ta_credential = AzureKeyCredential(str(text_analytics_key))
    return TextAnalyticsClient(
        endpoint=str(text_analytics_endpoint),
        credential=ta_credential,
    )

# Create the Text Analytics client via authenticate_client() and keep it as `client`.
client = authenticate_client()

In [47]:
# Lower-case the transcript and cut it into fragments at every '. '.
hypothesis_text_list = hypothesis_text.lower().split('. ')
print(hypothesis_text_list)
# remove predefined list of common unmeaningful words
words_to_remove = ['ok', 'yeah', 'yea', 'ya', 'hello', 'hi', 'bye', 'oh']

# Build the lookup once; a fragment is dropped only when, after trimming, it is
# exactly one of the filler words.
filler_words = {filler.lower() for filler in words_to_remove}
trimmed_fragments = (fragment.strip() for fragment in hypothesis_text_list)
hypothesis_text_list_cleaned = [
    fragment for fragment in trimmed_fragments if fragment.lower() not in filler_words
]
print(hypothesis_text_list_cleaned)

['hello', 'hi', 'yeah', 'ok', 'hello', 'good morning', "so how can i help you this morning? yeah, i'm sorry to hear that", "and when you say diarrhoea, what do you mean by diarrhea? do you mean you're going to the toilet more often or are your stools more loose? ok", "and how many times a day are you going, let's say over the last couple of days? 6-7 times a day and you mentioned this mainly water tree", 'have you noticed any other things like blood in your stools? ok', "and you mentioned you've had some pain in your tummy as well", 'whereabouts is the pain exactly? one side', "and what side is that? that's right", 'ok', 'and can you describe the pain to me? ok', "and there's a pain", 'is that is it there all the time or does it come and go? does the pain move anywhere else because on between your back? ok, fine', "and you mentioned you've been feeling quite weak and shaky as well", "what do you mean by shaky? do you mean you've been having, have you been feeling feverish, for example?

In [56]:
# remove those very short sentences

# Keep only the fragments made of more than 10 space-separated tokens.
long_hypothesis_text_list_cleaned = [
    sentence
    for sentence in hypothesis_text_list_cleaned
    if len(sentence.split(" ")) > 10
]

long_hypothesis_text_list_cleaned

["so how can i help you this morning? yeah, i'm sorry to hear that",
 "and when you say diarrhoea, what do you mean by diarrhea? do you mean you're going to the toilet more often or are your stools more loose? ok",
 "and how many times a day are you going, let's say over the last couple of days? 6-7 times a day and you mentioned this mainly water tree",
 'have you noticed any other things like blood in your stools? ok',
 "and you mentioned you've had some pain in your tummy as well",
 'is that is it there all the time or does it come and go? does the pain move anywhere else because on between your back? ok, fine',
 "and you mentioned you've been feeling quite weak and shaky as well",
 "what do you mean by shaky? do you mean you've been having, have you been feeling feverish, for example? measure your temperature then",
 "and any vomiting at all? you stop vomiting again and would you vomit? i know it's not nice thing to talk about but was it just normal food colour? yeah",
 'and there w

In [69]:
# Join the retained sentences into one single string so that only one summary is
# produced. Original note: the service's max records is 25, so everything is
# simply combined into one document (limit not verified here).
combined_dialogue = " ".join(long_hypothesis_text_list_cleaned)
[combined_dialogue]  # shown wrapped in a list, the same form passed to the client calls

["so how can i help you this morning? yeah, i'm sorry to hear that and when you say diarrhoea, what do you mean by diarrhea? do you mean you're going to the toilet more often or are your stools more loose? ok and how many times a day are you going, let's say over the last couple of days? 6-7 times a day and you mentioned this mainly water tree have you noticed any other things like blood in your stools? ok and you mentioned you've had some pain in your tummy as well is that is it there all the time or does it come and go? does the pain move anywhere else because on between your back? ok, fine and you mentioned you've been feeling quite weak and shaky as well what do you mean by shaky? do you mean you've been having, have you been feeling feverish, for example? measure your temperature then and any vomiting at all? you stop vomiting again and would you vomit? i know it's not nice thing to talk about but was it just normal food colour? yeah and there was no blood in your vomits, is that 

In [68]:
# Read the Text Analytics settings (after loading the .env file) and build the
# client used by this cell and by the analysis cells that follow.
load_dotenv()
key = str(os.getenv("text_analytics_key"))
endpoint = str(os.getenv("text_analytics_endpoint"))

text_analytics_client = TextAnalyticsClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(key),
)

# Submit the combined dialogue as a single document and wait for the result.
poller = text_analytics_client.begin_abstract_summary([combined_dialogue])
abstract_summary_results = poller.result()

for result in abstract_summary_results:
    # Report per-document errors first, then print successful summaries.
    if result.is_error is True:
        print("...Is an error with code '{}' and message '{}'".format(
            result.error.code, result.error.message
        ))
    elif result.kind == "AbstractiveSummarization":
        print("Summaries abstracted:")
        # Plain loop: printing is a side effect, so no throwaway list is built.
        for summary in result.summaries:
            print(f"{summary.text}\n")

Summaries abstracted:
The document is a conversation between a doctor and a patient who is experiencing various symptoms including diarrhea, stomach pain, weakness, and vomiting. The doctor asks a series of questions to understand the symptoms better, including the frequency of bathroom visits, the nature of the symptoms, and any potential triggers. The doctor also inquires about the patient's overall health, medication use, and lifestyle. The doctor suggests that the symptoms might be due to a stomach bug or infection, possibly caused by a Chinese take-away, and recommends conservative management, including hydration, paracetamol, and a few days of rest. If the symptoms persist, the doctor proposes a follow-up visit.



### Sentiment Analysis with `TextAnalyticsClient`

In [70]:
# Sentiment is requested for one document: the combined dialogue.
documents = [combined_dialogue]
result = text_analytics_client.analyze_sentiment(documents, show_opinion_mining=True)

# Drop any document the service reported as an error.
docs = [doc for doc in result if not doc.is_error]

print("Let's visualize the sentiment of each of these documents")
for idx, doc in enumerate(docs):
    document_text = documents[idx]
    print(f"Document text: {document_text}")
    print(f"Overall sentiment: {doc.sentiment}")

Let's visualize the sentiment of each of these documents
Document text: so how can i help you this morning? yeah, i'm sorry to hear that and when you say diarrhoea, what do you mean by diarrhea? do you mean you're going to the toilet more often or are your stools more loose? ok and how many times a day are you going, let's say over the last couple of days? 6-7 times a day and you mentioned this mainly water tree have you noticed any other things like blood in your stools? ok and you mentioned you've had some pain in your tummy as well is that is it there all the time or does it come and go? does the pain move anywhere else because on between your back? ok, fine and you mentioned you've been feeling quite weak and shaky as well what do you mean by shaky? do you mean you've been having, have you been feeling feverish, for example? measure your temperature then and any vomiting at all? you stop vomiting again and would you vomit? i know it's not nice thing to talk about but was it just no

### Recognising healthcare entities with `begin_analyze_healthcare_entities` and `HealthcareEntityRelation`

In [71]:
from azure.ai.textanalytics import HealthcareEntityRelation

def _print_healthcare_entity(entity):
    """Print one recognised entity, plus its data sources and assertion when present."""
    print(f"Entity: {entity.text}")
    print(f"...Normalized Text: {entity.normalized_text}")
    print(f"...Category: {entity.category}")
    print(f"...Subcategory: {entity.subcategory}")
    print(f"...Offset: {entity.offset}")
    print(f"...Confidence score: {entity.confidence_score}")
    if entity.data_sources is not None:
        print("...Data Sources:")
        for source in entity.data_sources:
            print(f"......Entity ID: {source.entity_id}")
            print(f"......Name: {source.name}")
    if entity.assertion is not None:
        print("...Assertion:")
        print(f"......Conditionality: {entity.assertion.conditionality}")
        print(f"......Certainty: {entity.assertion.certainty}")
        print(f"......Association: {entity.assertion.association}")


def _print_healthcare_relation(relation):
    """Print a relation's type followed by each role and the entity filling it."""
    print(f"Relation of type: {relation.relation_type} has the following roles")
    for role in relation.roles:
        print(f"...Role '{role.name}' with entity '{role.entity.text}'")


# Submit the combined dialogue as a single document and wait for the analysis.
poller = text_analytics_client.begin_analyze_healthcare_entities([combined_dialogue])
result = poller.result()

# Keep only the documents that were processed without error.
docs = [doc for doc in result if not doc.is_error]

print("Let's first visualize the outputted healthcare result:")
for doc in docs:
    for entity in doc.entities:
        _print_healthcare_entity(entity)
    for relation in doc.entity_relations:
        _print_healthcare_relation(relation)
    print("------------------------------------------")

print("Now, let's get all of medication dosage relations from the documents")
dosage_of_medication_relations = []
for doc in docs:
    dosage_of_medication_relations.extend(
        entity_relation
        for entity_relation in doc.entity_relations
        if entity_relation.relation_type == HealthcareEntityRelation.DOSAGE_OF_MEDICATION
    )

Let's first visualize the outputted healthcare result:
Entity: morning
...Normalized Text: None
...Category: Time
...Subcategory: None
...Offset: 27
...Confidence score: 0.86
Entity: diarrhoea
...Normalized Text: Diarrhea
...Category: SymptomOrSign
...Subcategory: None
...Offset: 82
...Confidence score: 0.98
...Data Sources:
......Entity ID: C0011991
......Name: UMLS
......Entity ID: 0000005512
......Name: AOD
......Entity ID: BI00232
......Name: BI
......Entity ID: B03.3
......Name: CCC
......Entity ID: 1017488
......Name: CCPSS
......Entity ID: 0000003866
......Name: CHV
......Entity ID: 237
......Name: COSTAR
......Entity ID: 65228
......Name: CPM
......Entity ID: 1248-4166
......Name: CSP
......Entity ID: DIARRHEA
......Name: CST
......Entity ID: U000964
......Name: DXP
......Entity ID: HP:0002014
......Name: HPO
......Entity ID: R19.7
......Name: ICD10CM
......Entity ID: 787.91
......Name: ICD9CM
......Entity ID: 62315008
......Name: ICNP
......Entity ID: D11
......Name: ICPC
....

This step categorises entities, which can then be used for further analysis. See the examples below.
```
Entity: last couple of days
...Normalized Text: None
...Category: Time
...Subcategory: None
...Offset: 266
...Confidence score: 0.93
Entity: 6-7 times a day
...Normalized Text: None
...Category: Frequency
...Subcategory: None
...Offset: 287
...Confidence score: 0.98
Entity: blood in your stools
...Normalized Text: None
...Category: SymptomOrSign
...Subcategory: None
...Offset: 383
...Confidence score: 0.92
```

Classification can be done with `begin_multi_label_classify` in the future.

### `recognize_entities`
This can also be used for classification, similar to the above, except that it is not healthcare-specific.

In [73]:
import typing

# General (non-healthcare) entity recognition over the single combined document.
documents = [combined_dialogue]
result = [
    review
    for review in text_analytics_client.recognize_entities(documents)
    if not review.is_error
]

# Maps each entity text categorised as 'Organization' to the documents mentioning it.
organization_to_reviews: typing.Dict[str, typing.List[str]] = {}

for idx, review in enumerate(result):
    for entity in review.entities:
        print(f"Entity '{entity.text}' has category '{entity.category}'")
        if entity.category != 'Organization':
            continue
        organization_to_reviews.setdefault(entity.text, []).append(documents[idx])

for organization, reviews in organization_to_reviews.items():
    joined_reviews = "\n\n".join(reviews)
    print(
        "\n\nOrganization '{}' has left us the following review(s): {}".format(
            organization, joined_reviews
        )
    )

Entity 'this morning' has category 'DateTime'
Entity 'toilet' has category 'Location'
Entity 'stools' has category 'Product'
Entity 'last couple of days' has category 'DateTime'
Entity '6-7' has category 'Quantity'
Entity 'a day' has category 'DateTime'
Entity 'measure' has category 'Skill'
Entity 'vomiting' has category 'Skill'
Entity 'food' has category 'Product'
Entity 'vomiting' has category 'Event'
Entity 'foods' has category 'Product'
Entity 'foods' has category 'Product'
Entity 'people' has category 'PersonType'
Entity 'ate' has category 'Skill'
Entity 'asthma' has category 'Product'
Entity 'bowel' has category 'Skill'
Entity 'inhalers' has category 'Product'
Entity 'medications' has category 'Product'
Entity 'working' has category 'Skill'
Entity 'at the moment' has category 'DateTime'
Entity 'last three days' has category 'DateTime'
Entity 'home' has category 'Location'
Entity 'wife' has category 'PersonType'
Entity 'two' has category 'Quantity'
Entity 'children' has category '