In [1]:
#conda activate text-analytics

import os
import json as js
import requests, uuid
import re, math
from datetime import datetime, timedelta
from docx import Document
from docx.enum.text import WD_COLOR_INDEX
from azure.core.credentials import AzureKeyCredential
from azure.ai.textanalytics import TextAnalyticsClient
from azure.ai.translation.document import DocumentTranslationClient
from azure.storage.blob import BlobServiceClient
from azure.storage.blob._shared_access_signature import BlobSharedAccessSignature

#Loads the notebook settings from the local config.json file into `config`.
#The cells below read folder paths, service endpoints/keys and storage
#account details from this dict.
with open('./config.json','r') as file:
    config = js.load(file)

<h1>Demo setup </h1>

Before running this demo, you need a few things set up:

<h4>Azure Resources</h4>
<ul>
<li> <a href="https://docs.microsoft.com/en-us/azure/cognitive-services/text-analytics/overview">Azure Text Analytics</a> </li>
<li> <a href="https://docs.microsoft.com/en-us/azure/cognitive-services/translator/document-translation/get-started-with-document-translation?WT.mc_id=Portal-Microsoft_Azure_ProjectOxford&tabs=csharp">Azure Translator</a> </li>
<ul>
<li>Create a SAS token policy to use Document Translator <a href="https://docs.microsoft.com/en-us/azure/cognitive-services/translator/document-translation/create-sas-tokens?tabs=Containers">here</a></li>
</ul>
</ul>

<h4> Folder Structure </h4>
In the root folder:
<ul>
<li> "data" folder, inside the data folder: </li>
<ul>
<li> "original" folder (where your original pdf files will be)</li>
<li> "output" folder (where the highlighted output docx file will be stored)</li>
<li> "preprocessed" folder (where your docx file will be)</li>
<li> "translated" folder (where you can download your translated docx)</li>
</ul>
<li> "config.json" file, with all the paths and keys </li>
<li> notebook </li>
</ul>

<h1> Helper function definitions </h1>

In [None]:
#Local helpers

################################
# Lists files in folder        #
################################
def FilesInFolder(path):
    """Return the names of the entries found directly inside `path`.

    Thin wrapper around os.listdir: the result holds bare names (no
    directory prefix), in whatever order the OS reports them.
    """
    return os.listdir(path)

################################
#Extracts text from docx file  #
################################
def ExtractTextFromLocal(docPath):
    """Open the docx file at `docPath` and return its paragraph texts.

    Returns a list with one string per paragraph, in document order.
    """
    return [paragraph.text for paragraph in Document(docPath).paragraphs]

##############################################
# Highlights entities found in the docx file #
##############################################
def EntitySearch(document,keyword,dstPath,color):
    """Highlight every occurrence of `keyword` in `document`, then save it.

    The previous implementation cleared each matching run and re-added only
    the keyword, which deleted all the surrounding text from the document.
    This version splits a matching run into consecutive runs (text before,
    keyword, text after, ...) that keep the original run formatting, and
    highlights only the keyword runs.

    Args:
        document: an open python-docx Document; it is modified in place.
        keyword: text to highlight. An empty keyword highlights nothing.
        dstPath: path the (modified) document is saved to.
        color: 'y' for yellow, anything else for green.

    Note: a keyword that is spread over several runs of a paragraph is not
    highlighted, because matching is done run by run.
    """
    # Function-scope imports: only this helper needs them.
    import copy
    from docx.text.run import Run

    highlight = WD_COLOR_INDEX.YELLOW if color == 'y' else WD_COLOR_INDEX.GREEN

    # An empty keyword is "in" every string and str.split("") raises
    # ValueError, so there is nothing sensible to highlight.
    if keyword:
        for p in document.paragraphs:
            if keyword not in p.text:
                continue
            # Snapshot the runs: new runs are inserted while iterating.
            for run in list(p.runs):
                if keyword not in run.text:
                    continue
                segments = run.text.split(keyword)
                # Rebuild the run text as an ordered list of
                # (text, is_keyword) pieces, skipping empty segments.
                pieces = []
                for i, segment in enumerate(segments):
                    if segment:
                        pieces.append((segment, False))
                    if i < len(segments) - 1:
                        pieces.append((keyword, True))
                # Pristine copy of the run XML, used as the formatting
                # template for every extra run that gets inserted.
                # NOTE: `_r` / `_parent` are python-docx internals; there is
                # no public API to insert a run in the middle of a paragraph.
                template = copy.deepcopy(run._r)
                current = run
                for index, (piece_text, is_keyword) in enumerate(pieces):
                    if index > 0:
                        new_r = copy.deepcopy(template)
                        current._r.addnext(new_r)
                        current = Run(new_r, run._parent)
                    current.text = piece_text
                    if is_keyword:
                        current.font.highlight_color = highlight
    document.save(dstPath)

################################################
# Takes extracted document text and divides it #
# in max n parts to avoid API limits           #
################################################
def ChunkText(text,n):
    """Group a list of paragraph strings into at most `n` text chunks.

    Each chunk is the concatenation of consecutive paragraphs, every
    paragraph preceded by a single space (so a chunk starts with a space,
    as it always did for the first chunk).

    The previous inner loop used `range(x, x+max_size, x+1)`: the step was
    `x+1` instead of 1, so every chunk after the first contained a single
    paragraph and the remaining ones were silently dropped. Slicing keeps
    all paragraphs and cannot run past the end of the list.

    Args:
        text: list of strings (e.g. the paragraphs of a document).
        n: maximum number of chunks to produce; must be >= 1.

    Returns:
        List of chunk strings; empty list when `text` is empty.

    Raises:
        ValueError: if `n` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")
    if not text:
        # Avoids a zero chunk size (range() rejects a step of 0).
        return []
    max_size = math.ceil(len(text)/n)
    chunks_text = []
    for start in range(0,len(text),max_size):
        chunks_text.append(" " + " ".join(text[start:start+max_size]))
    return chunks_text

In [None]:
#Blob helpers

######################################
# Creates connection to blob storage #
######################################
def ConnectToBlobStorage(connectionString):
    """Build and return a BlobServiceClient from a storage connection string."""
    return BlobServiceClient.from_connection_string(connectionString)

###################################
# Lists container in blob storage #
###################################
def GetContainersInStorage(blob_service_client):
    """Return whatever `blob_service_client.list_containers()` yields."""
    return blob_service_client.list_containers()

#####################################
# Lists blobs in specific container #
#####################################
def GetBlobsInContainer(blobClient,containerName):
    """Return the blob listing of the container named `containerName`.

    `blobClient` is the storage service client; the container client is
    obtained from it and its `list_blobs()` result is returned as is.
    """
    return blobClient.get_container_client(containerName).list_blobs()

###############################################
# Downloads locally specific blob in storage  #
###############################################
def DownloadBlobLocally(blobServiceClient,containerName,fileName,downloadPath):
    """Download one blob to a local folder and return the local file path.

    Args:
        blobServiceClient: storage service client used to reach the blob.
        containerName: container that holds the blob.
        fileName: blob name; also used as the local file name.
        downloadPath: local destination folder, with or without a trailing
            path separator.

    Returns:
        Path of the file that was written.
    """
    #creates blob client
    blob_client = blobServiceClient.get_blob_client(container=containerName,blob=fileName)
    print("Downloading from Azure Storage as blob: " + fileName)
    #downloads file
    # os.path.join instead of plain concatenation: a folder configured
    # without a trailing separator used to yield "<folder><file>".
    download_file_path = os.path.join(downloadPath, fileName)
    with open(download_file_path,"wb") as download_file:
        download_file.write(blob_client.download_blob().readall())
    print("Blob downloaded in the following folder: "+download_file_path)
    return download_file_path

########################################################
# Uploads local file to specific container in storage  #
########################################################
def UploadFileToBlob(blob_service_client,containerName,fileName,filePath,overwrite=False):
    """Upload a local file to a container as the blob `fileName`.

    Args:
        blob_service_client: storage service client used to reach the blob.
        containerName: destination container.
        fileName: name the blob gets in the container.
        filePath: path of the local file to upload.
        overwrite: forwarded to `upload_blob`. The default (False) keeps the
            previous behaviour; pass True to replace a blob that already
            exists, e.g. when the notebook is run again on the same files.
    """
    #creates blob client
    blob_client = blob_service_client.get_blob_client(container=containerName,blob=fileName)
    print("Uploading to Azure Storage as blob: " + fileName)
    #uploads file
    with open(filePath,"rb") as upload_file:
        blob_client.upload_blob(upload_file,overwrite=overwrite)
    print("Blob uploaded!")

##########################################
# Gets blob url from specific container  #
##########################################
def GetBlobURL(blob_service_client,blob_name,container_name):
    """Return the `url` attribute of the client for `blob_name` in `container_name`."""
    return blob_service_client.get_blob_client(container=container_name,blob=blob_name).url

###########################################
# Creates SAS signature for specific blob #
###########################################
def CreateSASSignature(container_name, blob_name, permissions,expiry):
    """Generate a SAS token for one blob.

    The storage account name and key are read from the notebook-level
    `config` dict ("sa_account_name", "sa_key").

    Args:
        container_name: container that holds the blob.
        blob_name: blob the token is issued for.
        permissions: permission string for the token. It used to be ignored
            in favour of a hard-coded "rw"; it is now honoured, and "rw" is
            only the fallback when nothing (None / empty) is passed.
        expiry: expiry passed through to the signature generator.

    Returns:
        The SAS token produced by `generate_blob`.
    """
    blob_shared_access_signature = BlobSharedAccessSignature(config["sa_account_name"],config["sa_key"])
    sas_token = blob_shared_access_signature.generate_blob(
        container_name,
        blob_name,
        expiry=expiry,
        permission=permissions or "rw",
    )
    return sas_token


In [None]:
#Text Analytics helpers

#############################
# Detects document language #
#############################
def LanguageDetection(text_analytics_client,full_text):
    """Run language detection on `full_text` and print the detected language.

    The language printed is that of the first result that is not an error.
    Indexing result [0] directly, as before, crashed when the first input
    (e.g. an empty leading paragraph) came back as an error result, or when
    no result was returned at all.

    Args:
        text_analytics_client: client exposing `detect_language`.
        full_text: the documents to analyse (the notebook passes the list
            of paragraph strings of one file).

    Returns:
        The unmodified result of `detect_language` (error results included).
    """
    language_detected = text_analytics_client.detect_language(full_text)
    first_success = next((doc for doc in language_detected if not doc.is_error), None)
    if first_success is None:
        print("Language detected: unknown (no document could be analysed)")
    else:
        print("Language detected: {}".format(first_success.primary_language))
    return language_detected

############################################
# Translates documents online (from blob)  #
############################################
def DocumentTranslation(transaltor_client,source_url,target_url,language):
    """Start a document translation and report the per-document outcome.

    Calls `begin_translation(source_url, target_url, language)` on the
    client and walks the documents of the poller's result. ID and status are
    printed for each document; a failed one also gets its error code and
    message printed and is skipped.

    Returns:
        The translated-document URL of the first document whose status is
        "Succeeded" (later documents are not inspected), or None when no
        document succeeded.
    """
    translation_job = transaltor_client.begin_translation(source_url,target_url,language)
    for translated in translation_job.result():
        print("Document ID: {}".format(translated.id))
        print("Document status: {}".format(translated.status))
        if translated.status != "Succeeded":
            print("Error Code: {}, Message: {}\n".format(translated.error.code, translated.error.message))
            continue
        print("Source document location: {}".format(translated.source_document_url))
        print("Translated document location: {}".format(translated.translated_document_url))
        print("Translated to language: {}\n".format(translated.translated_to))
        return translated.translated_document_url
    return None

#######################################
# Extracts health entities from text  #
#######################################
def TextAnalyticsForHealth(text_analytics_client,text):
    """Extract healthcare entities from a batch of texts.

    Submits `text` through `begin_analyze_healthcare_entities`, takes the
    poller's result and discards the per-document results flagged as errors.

    Returns:
        A tuple `(docs, health_entities)`: the non-error document results,
        and a flat list with the `text` of every entity found in them (in
        document order, then entity order).
    """
    analysis = text_analytics_client.begin_analyze_healthcare_entities(text).result()
    docs = [item for item in analysis if not item.is_error]
    print("Healthcare results:")
    # Each entity result also carries more detail than is collected here
    # (the code only keeps `entity.text`).
    health_entities = [entity.text for doc in docs for entity in doc.entities]
    return docs,health_entities

###########################
# Extracts PII from text  #
###########################
def TextAnalyticsPII(text_analytics_client,text,language):
    """Recognise PII entities in a batch of texts.

    Calls `recognize_pii_entities(text, language=language)` and drops the
    per-document results flagged as errors.

    Returns:
        A tuple `(result, pii_entities)`: the non-error document results,
        and a flat list with the `text` of every PII entity found in them.
    """
    response = text_analytics_client.recognize_pii_entities(text, language=language)
    result = [item for item in response if not item.is_error]
    pii_entities = [entity.text for doc in result for entity in doc.entities]
    return result,pii_entities

#############################################
# Execute all the steps to: extracts text   #
# from docx, translates it, extracts health #
#    entities, highlights them in docx,     #
#    extracts PII entities from docx        #
#############################################
def TextAnalyticsOnDocs(docs_folder,translated_folder,output_folder,ta_client,tr_client,blob_client,container_name,source_url,target_url,n_chunks=5):
    """Run the whole pipeline on every file found in `docs_folder`.

    For each file: detect its language, upload it to blob storage, translate
    it to English, download the translated copy, extract healthcare entities
    and PII from the translated text, and highlight both in the docx saved
    under `output_folder` (health entities in yellow, PII in green).

    Args:
        docs_folder: local folder with the docx files to process.
        translated_folder: local folder the translated docx is downloaded to.
        output_folder: local folder the highlighted docx is saved to.
        ta_client: Text Analytics client.
        tr_client: Document Translation client.
        blob_client: blob storage service client.
        container_name: container the local file is uploaded to.
        source_url: source container URL passed to the translation call.
        target_url: target container URL passed to the translation call.
        n_chunks: maximum number of chunks the translated text is split into
            before it is sent to the health and PII analyses. The health
            step used to read an undefined name `n` (NameError on a fresh
            kernel); the default 5 is the value the PII step already used.

    Note: the translated file is downloaded from a container literally named
    "output" — presumably the one `target_url` points to; verify against
    your storage setup.
    """
    #Check files locally
    file_list = FilesInFolder(docs_folder)
    for f in file_list:
        print("File processed {}".format(f))
        #Stores file path
        file_path = os.path.join(docs_folder, f)
        #Extract file text
        extracted_text = ExtractTextFromLocal(file_path)
        ########################
        #       STEP 1         #
        #  LANGUAGE DETECTION  #
        ########################
        #Only used for its printed report; translation always targets "en"
        LanguageDetection(ta_client,extracted_text)
        ########################
        #       STEP 2         #
        # DOCUMENT TRANSLATION #
        ########################
        #Upload file to Blob storage
        UploadFileToBlob(blob_client,container_name,f,file_path)
        #Translate document
        DocumentTranslation(tr_client,source_url,target_url,"en")
        #################################
        #             STEP 3            #
        # DOWNLOAD AND EXTRACT ENTITIES #
        #################################
        #Download file
        translated_file_path = DownloadBlobLocally(blob_client,"output",f,translated_folder)
        fully_translated_text = ExtractTextFromLocal(translated_file_path)
        #Format text to be sent to the TA
        ct = ChunkText(fully_translated_text,n_chunks)
        #Extract entities
        entities, _ = TextAnalyticsForHealth(ta_client,ct)
        #################################
        #             STEP 4            #
        #   HIGHLIGHT ENTITIES IN DOCX  #
        #################################
        translated_document = Document(translated_file_path)
        #Highlighted doc path
        highlighted_doc_path = os.path.join(output_folder, f)
        for d in entities:
            for entity in d.entities:
                print(entity.text)
                EntitySearch(translated_document,entity.text,highlighted_doc_path,'y')
        #################################
        #             STEP 5            #
        #              PII              #
        #################################
        pii_extraction = ChunkText(fully_translated_text,n_chunks)
        results, _ = TextAnalyticsPII(ta_client,pii_extraction,"en")
        #################################
        #             STEP 6            #
        #         HIGHLIGHT PII         #
        #################################
        for r in results:
            for entity in r.entities:
                #Uncomment to show results
                #print(entity.text)
                EntitySearch(translated_document,entity.text,highlighted_doc_path,'g')

<h1> Code to run Text Analytics for Health on specific document </h1>

In [None]:
#Settings and service clients used by the run cell below.
#Everything except CONTAINER_NAME is read from the `config` dict loaded
#from config.json in the first cell.

#Local folders (ORIGINAL_FOLDER is not referenced by the cells shown here)
DOCS_FOLDER = config["preprocessed_path"]
ORIGINAL_FOLDER =config["pdfs_path"]
OUTPUT_FOLDER = config["output_path"]
TRANSLATED_FOLDER = config["translated_path"]
#Source/target URLs handed to the document translation call
SOURCE_URL = config["sa_source_url"]
TARGET_URL = config["sa_target_url"]

#Connect to Azure Text Analytics service
ta_credentials = AzureKeyCredential(config["text_analytics_key"])
TEXT_ANALYTICS_CLIENT = TextAnalyticsClient(endpoint=config["text_analytics_endpoint"],credential=ta_credentials)

#Connect to Azure Translator service
tr_credentials = AzureKeyCredential(config["translator_key"])
TRANSLATOR_CLIENT= DocumentTranslationClient(endpoint=config["translator_documents_endpoint"],credential=tr_credentials)

#Blob storage connection
BLOBSERVICECLIENT = ConnectToBlobStorage(config["sa_connectionstring"])
#Hardcoded name of the container the local docx files are uploaded to
CONTAINER_NAME = "data"

In [None]:
#Analyze docx with Azure Text Analytics for Health:
#runs the full pipeline (language detection, translation, health entity and
#PII extraction, highlighting) on every file in DOCS_FOLDER
TextAnalyticsOnDocs(DOCS_FOLDER,TRANSLATED_FOLDER,OUTPUT_FOLDER,TEXT_ANALYTICS_CLIENT,TRANSLATOR_CLIENT,BLOBSERVICECLIENT,CONTAINER_NAME,SOURCE_URL,TARGET_URL)