In [3]:
#conda activate text-analytics
import os
import json as js
import requests, uuid
import re, math
from datetime import datetime, timedelta
from docx import Document
from docx.enum.text import WD_COLOR_INDEX
from azure.core.credentials import AzureKeyCredential
from azure.ai.textanalytics import TextAnalyticsClient
from azure.ai.translation.document import DocumentTranslationClient
from azure.storage.blob import BlobServiceClient
from azure.storage.blob._shared_access_signature import BlobSharedAccessSignature

#Loads the service keys, endpoints and folder paths used by the notebook.
#The encoding is given explicitly so the result does not depend on the
#platform default; 'utf-8-sig' reads UTF-8 with or without a leading BOM.
with open('./config.json','r',encoding='utf-8-sig') as file:
    config = js.load(file)

In [4]:
#Local helpers
def FilesInFolder(path):
    """Return the names of the entries found directly inside `path`."""
    return os.listdir(path)

def ExtractTextFromLocal(docPath):
    """Open the document at `docPath` and return the text of each paragraph as a list."""
    return [paragraph.text for paragraph in Document(docPath).paragraphs]

def EntitySearch(document,keyword,dstPath):
    """Highlight every run that contains `keyword` and save the document.

    Parameters:
        document: document object exposing `paragraphs` (each with `text` and
            `runs`) and `save(path)`.
        keyword: text to look for; an empty keyword highlights nothing.
        dstPath: path the (possibly modified) document is saved to.

    The previous implementation cleared each matching run and wrote back only
    the keyword, which deleted the text surrounding every match. The run text
    is now left untouched and only its highlight colour is changed.

    Limitations: the whole run containing the keyword is highlighted, and a
    keyword split across several runs is not detected.
    """
    #An empty keyword would match every run, so it is skipped explicitly
    if keyword:
        for p in document.paragraphs:
            #Cheap paragraph-level filter before inspecting individual runs
            if keyword not in p.text:
                continue
            for run in p.runs:
                if keyword in run.text:
                    run.font.highlight_color = WD_COLOR_INDEX.YELLOW
    document.save(dstPath)

def ChunkText(text, max_chunks=10):
    """Group a list of paragraph strings into at most `max_chunks` strings.

    Parameters:
        text: sequence of strings (one per paragraph).
        max_chunks: upper bound on the number of chunks returned; defaults to
            10, the value that was previously hard-coded.

    Returns:
        A list of strings. Each chunk is its paragraphs joined by single
        spaces and prefixed with one space (the format the original
        concatenation produced). An empty input yields an empty list.

    Raises:
        ValueError: if `max_chunks` is smaller than 1.

    The previous inner loop stepped by `x+1`, so every chunk after the first
    kept only part of its paragraphs; slicing keeps all of them.
    """
    if max_chunks < 1:
        raise ValueError("max_chunks must be at least 1")
    #Empty input would otherwise give a chunk size of 0 and an invalid range step
    if not text:
        return []
    max_size = math.ceil(len(text)/max_chunks)
    chunks_text = []
    for start in range(0,len(text),max_size):
        #Slicing never runs past the end, so the last chunk may be shorter
        chunks_text.append(" " + " ".join(text[start:start+max_size]))
    return chunks_text

In [5]:
#Blob helpers
def ConnectToBlobStorage(connectionString):
    """Build and return a BlobServiceClient from the storage account connection string."""
    return BlobServiceClient.from_connection_string(connectionString)

def GetContainersInStorage(blob_service_client):
    """Return whatever `list_containers()` yields for the given service client."""
    return blob_service_client.list_containers()

def GetBlobsInContainer(blobClient,containerName):
    """Return the blob listing of the container named `containerName`.

    `blobClient` is the service-level client; the container client is
    obtained from it and its `list_blobs()` result is returned unchanged.
    """
    return blobClient.get_container_client(containerName).list_blobs()

def DownloadBlobLocally(blobServiceClient,containerName,fileName,downloadPath):
    """Download one blob into a local folder and return the local file path.

    Parameters:
        blobServiceClient: service client used to obtain the blob client.
        containerName: container that holds the blob.
        fileName: blob name; also used as the local file name.
        downloadPath: local folder the file is written to.

    Returns:
        The path of the downloaded file.

    The path is built with os.path.join, so `downloadPath` works with or
    without a trailing separator (a folder that already ends with one yields
    the same path as the former string concatenation).
    """
    #creates blob client
    blob_client = blobServiceClient.get_blob_client(container=containerName,blob=fileName)
    print("Downloading from Azure Storage as blob: " + fileName)
    download_file_path = os.path.join(downloadPath,fileName)
    #fetch the content first so a failed download does not leave an empty file behind
    content = blob_client.download_blob().readall()
    with open(download_file_path,"wb") as download_file:
        download_file.write(content)
    print("Blob downloaded in the following folder: "+download_file_path)
    return download_file_path

def UploadFileToBlob(blob_service_client,containerName,fileName,filePath,overwrite=True):
    """Upload a local file to blob storage.

    Parameters:
        blob_service_client: service client used to obtain the blob client.
        containerName: destination container.
        fileName: name the blob is stored under.
        filePath: path of the local file to upload.
        overwrite: forwarded to `upload_blob`. Defaults to True so that
            re-running the notebook replaces a blob left by an earlier run
            instead of failing because it already exists.
    """
    #creates blob client
    blob_client = blob_service_client.get_blob_client(container=containerName,blob=fileName)
    print("Uploading to Azure Storage as blob: " + fileName)
    #uploads file
    with open(filePath,"rb") as upload_file:
        blob_client.upload_blob(upload_file,overwrite=overwrite)
    print("Blob uploaded!")

def GetBlobURL(blob_service_client,blob_name,container_name):
    """Return the `url` attribute of the blob client for `blob_name` in `container_name`."""
    return blob_service_client.get_blob_client(container=container_name,blob=blob_name).url

def CreateSASSignature(container_name, blob_name, permissions,expiry):
    """Generate a SAS token for one blob.

    Parameters:
        container_name: container that holds the blob.
        blob_name: blob the token applies to.
        permissions: permissions to grant. Previously this argument was
            ignored and "rw" was always granted; it is now honoured. A falsy
            value (None or "") falls back to "rw", the former fixed value.
        expiry: expiry forwarded to `generate_blob`.

    Returns:
        The SAS token produced by `generate_blob`.

    The account name and key are read from the notebook-level `config`
    ("sa_account_name" and "sa_key").
    """
    blob_shared_access_signature = BlobSharedAccessSignature(config["sa_account_name"],config["sa_key"])
    sas_token = blob_shared_access_signature.generate_blob(container_name,blob_name,expiry=expiry,permission=permissions or "rw")
    return sas_token


In [11]:
#Text Analytics helpers
def LanguageDetection(text_analytics_client,full_text):
    """Run language detection and print the language of the first usable result.

    Parameters:
        text_analytics_client: client exposing `detect_language`.
        full_text: documents to analyse (the caller passes a list of
            paragraph strings).

    Returns:
        The unmodified result of `detect_language`, including error entries.

    The printed language comes from the first result whose `is_error` flag is
    false. The first entry used to be read unconditionally, which failed when
    it was an error result; "None" is printed when no document succeeded.
    """
    language_detected = text_analytics_client.detect_language(full_text)
    first_ok = next((doc for doc in language_detected if not doc.is_error), None)
    primary_language = first_ok.primary_language if first_ok is not None else None
    print("Language detected: {}".format(primary_language))
    return language_detected

def DocumentTranslation(transaltor_client,source_url,target_url,language):
    """Start a translation job, wait for it and report each document's outcome.

    Parameters:
        transaltor_client: client exposing `begin_translation`.
        source_url: source location handed to the translation job.
        target_url: target location handed to the translation job.
        language: target language handed to the translation job.

    Returns:
        The translated URL of the first document whose status is "Succeeded".
        Documents after that one are not inspected. None is returned when no
        document succeeded.
    """
    translation_job = transaltor_client.begin_translation(source_url,target_url,language)
    #result() blocks until the job has finished
    outcomes = translation_job.result()
    for outcome in outcomes:
        print("Document ID: {}".format(outcome.id))
        print("Document status: {}".format(outcome.status))
        if outcome.status != "Succeeded":
            print("Error Code: {}, Message: {}\n".format(outcome.error.code, outcome.error.message))
            continue
        print("Source document location: {}".format(outcome.source_document_url))
        print("Translated document location: {}".format(outcome.translated_document_url))
        print("Translated to language: {}\n".format(outcome.translated_to))
        return outcome.translated_document_url
    return None

def TextAnalyticsForHealth(text_analytics_client,text):
    """Run healthcare entity analysis and return the documents that succeeded.

    Parameters:
        text_analytics_client: client exposing `begin_analyze_healthcare_entities`.
        text: documents to analyse (the caller passes a list of text chunks).

    Returns:
        A list with every result whose `is_error` flag is false, in the order
        the service returned them.
    """
    #result() blocks until the analysis job has finished
    analysis = text_analytics_client.begin_analyze_healthcare_entities(text).result()
    successful_docs = []
    for item in analysis:
        if item.is_error:
            continue
        successful_docs.append(item)
    print("Healthcare results:")
    return successful_docs

def TextAnalyticsPII(text_analytics_client,text,language,batch_size=5):
    """Detect PII entities in a list of documents and print them.

    Parameters:
        text_analytics_client: client exposing `recognize_pii_entities`.
        text: list of document strings (the caller passes paragraph texts).
        language: language code forwarded to the service.
        batch_size: maximum number of documents sent per request. The service
            rejects requests with too many records (InvalidDocumentBatch), so
            the input is sent in slices instead of one call. The default of 5
            follows the documented per-request limit for PII detection
            (TODO confirm for the deployed API version).

    Raises:
        ValueError: if `batch_size` is smaller than 1.

    Results flagged with `is_error` are skipped. Nothing is returned; an
    empty `text` makes no service call.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    for start in range(0,len(text),batch_size):
        response = text_analytics_client.recognize_pii_entities(text[start:start+batch_size], language=language)
        for doc in response:
            if doc.is_error:
                continue
            for entity in doc.entities:
                print("...Entity: {}".format(entity.text))
                print("......Category: {}".format(entity.category))
                print("......Confidence Score: {}".format(entity.confidence_score))
                print("......Offset: {}".format(entity.offset))

def TextAnalyticsOnDocs(docs_folder,translated_folder,ta_client,tr_client,blob_client,container_name,source_url,target_url,output_folder=None):
    """Run the full pipeline on every file found in `docs_folder`.

    For each file: detect the language, upload the file to blob storage,
    translate it to English, download the translated copy from the "output"
    container, extract healthcare entities, highlight them in the translated
    document and finally run PII detection on the translated text.

    Parameters:
        docs_folder: local folder with the documents to process.
        translated_folder: local folder the translated documents are saved to.
        ta_client: Text Analytics client.
        tr_client: document translation client.
        blob_client: blob service client.
        container_name: container the source documents are uploaded to.
        source_url: source location for the translation job.
        target_url: target location for the translation job.
        output_folder: optional folder for the highlighted documents. When
            given, each document is saved there under its own file name.
            When omitted, the legacy "ExtractedEntities.docx" in the working
            directory is used, which every processed file overwrites.
    """
    for f in FilesInFolder(docs_folder):
        print("File processed {}".format(f))
        #join instead of concatenation so a missing trailing separator is harmless
        file_path = os.path.join(docs_folder,f)
        extracted_text = ExtractTextFromLocal(file_path)

        #STEP 1 - language detection (the helper prints the detected language)
        LanguageDetection(ta_client,extracted_text)

        #STEP 2 - upload the source file, then translate to English
        UploadFileToBlob(blob_client,container_name,f,file_path)
        DocumentTranslation(tr_client,source_url,target_url,"en")

        #STEP 3 - download the translated file and extract healthcare entities
        translated_file_path = DownloadBlobLocally(blob_client,"output",f,translated_folder)
        fully_translated_text = ExtractTextFromLocal(translated_file_path)
        #paragraphs are grouped into a few chunks before being sent to the service
        ct = ChunkText(fully_translated_text)
        entities = TextAnalyticsForHealth(ta_client,ct)

        #STEP 4 - highlight the entities in the translated document
        if output_folder is None:
            highlighted_path = "ExtractedEntities.docx"
        else:
            highlighted_path = os.path.join(output_folder,f)
        translated_document = Document(translated_file_path)
        for d in entities:
            for entity in d.entities:
                print(entity.text)
                EntitySearch(translated_document,entity.text,highlighted_path)

        #STEP 5 - PII detection on the translated paragraphs
        TextAnalyticsPII(ta_client,fully_translated_text,"en")

In [7]:
#Notebook-wide settings and service clients, all read from `config`
#Local folders and storage URLs; DOCS_FOLDER, TRANSLATED_FOLDER, SOURCE_URL
#and TARGET_URL are the ones handed to TextAnalyticsOnDocs in the call below.
#NOTE(review): ORIGINAL_FOLDER and OUTPUT_FOLDER are not used by any visible cell.
DOCS_FOLDER = config["preprocessed_path"]
ORIGINAL_FOLDER =config["pdfs_path"]
OUTPUT_FOLDER = config["output_path"]
TRANSLATED_FOLDER = config["translated_path"]
SOURCE_URL = config["sa_source_url"]
TARGET_URL = config["sa_target_url"]

#Connect to Azure Text Analytics service (key and endpoint come from config)
ta_credentials = AzureKeyCredential(config["text_analytics_key"])
TEXT_ANALYTICS_CLIENT = TextAnalyticsClient(endpoint=config["text_analytics_endpoint"],credential=ta_credentials)

#Connect to Azure Translator service (document translation endpoint)
tr_credentials = AzureKeyCredential(config["translator_key"])
TRANSLATOR_CLIENT= DocumentTranslationClient(endpoint=config["translator_documents_endpoint"],credential=tr_credentials)

#Blob storage connection
BLOBSERVICECLIENT = ConnectToBlobStorage(config["sa_connectionstring"])
#Container the source documents are uploaded to
CONTAINER_NAME = "data"

In [10]:
#Run the whole pipeline on every document found in DOCS_FOLDER
TextAnalyticsOnDocs(
    docs_folder=DOCS_FOLDER,
    translated_folder=TRANSLATED_FOLDER,
    ta_client=TEXT_ANALYTICS_CLIENT,
    tr_client=TRANSLATOR_CLIENT,
    blob_client=BLOBSERVICECLIENT,
    container_name=CONTAINER_NAME,
    source_url=SOURCE_URL,
    target_url=TARGET_URL,
)

File processed LDO_AOPR_1.docx
Language detected: {'name': 'English', 'iso6391_name': 'en', 'confidence_score': 1.0}
Uploading to Azure Storage as blob: LDO_AOPR_1.docx
Blob uploaded!
Document ID: 0007e980-0000-0000-0000-000000000000
Document status: Succeeded
Source document location: https://guscianchealthsa.blob.core.windows.net/data/LDO_AOPR_1.docx
Translated document location: https://guscianchealthsa.blob.core.windows.net/output/LDO_AOPR_1.docx
Translated to language: en

Downloading from Azure Storage as blob: LDO_AOPR_1.docx
Blob downloaded in the following folder: C://Users//guscianc//source//repos//TextAnalytics//data//translated//LDO_AOPR_1.docx


HttpResponseError: (InvalidDocumentBatch) Batch request contains too many records. Max 10 records are permitted.
Code: InvalidDocumentBatch
Message: Batch request contains too many records. Max 10 records are permitted.