In [1]:
PROJECT_ID = 'dwh-siloam'
REGION = 'asia-southeast1'
print(f"Project ID: {PROJECT_ID}\nRegion: {REGION}")

# Initialize Vertex AI
from pathlib import Path
import vertexai
from google.cloud import aiplatform

print(f"Checking Credentials...")
if not any((Path.cwd()/"service_account").glob('*.json')):
    print("Service account folder is empty. Fallback using default gcloud account")
    aiplatform.init(project=PROJECT_ID, location=REGION)
    vertexai.init(project=PROJECT_ID, location=REGION)
else:
    print('Using service account credentials from service_account folder')
    from google.oauth2 import service_account
    sa_file = list((Path.cwd()/"service_account").glob('*.json'))[0]
    print(f"Using service account file: {sa_file}")
    credentials = service_account.Credentials.from_service_account_file(sa_file)
    aiplatform.init(project=PROJECT_ID, location=REGION, credentials=credentials)
    vertexai.init(project=PROJECT_ID, location=REGION, credentials=credentials)

# Import libraries
from langchain_google_vertexai import VertexAI, ChatVertexAI, create_structured_runnable
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

from typing import List, Optional
import requests

from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.prompts import MessagesPlaceholder
from settings import CopilotSettings
import time

import io
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader

Project ID: dwh-siloam
Region: asia-southeast1
Checking Credentials...
Using service account credentials from service_account folder
Using service account file: /Users/donnymirzaadhitama/workspace/others/chatbot-llm/service_account/dwh-siloam-99402e61edd2.json


In [2]:
config = CopilotSettings()

## Testing Azure Document Intelligence

In [3]:
"""
This code sample shows Prebuilt Document operations with the Azure Form Recognizer client library. 
The async versions of the samples require Python 3.6 or later.

To learn more, please visit the documentation - Quickstart: Form Recognizer Python client library SDKs
https://learn.microsoft.com/azure/applied-ai-services/form-recognizer/quickstarts/get-started-v3-sdk-rest-api?view=doc-intel-3.1.0&pivots=programming-language-python
"""

from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient

"""
Remember to remove the key from your code when you're done, and never post it publicly. For production, use
secure methods to store and access your credentials. For more information, see 
https://docs.microsoft.com/en-us/azure/cognitive-services/cognitive-services-security?tabs=command-line%2Ccsharp#environment-variables-and-application-configuration
"""
endpoint = config.AZURE_DOCS_INT_ENDPOINT
key = config.AZURE_DOCS_INT_API_KEY

# sample document
form_url = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-REST-api-samples/master/curl/form-recognizer/sample-layout.pdf"
file_path = "/Users/donnymirzaadhitama/workspace/bithealth/training_data/FPKS/2022082011446651_MRCCC_FPKS_VALE INDONESIA PT_OPA2208190466_KAMRULLAH ILYAS-1.pdf"
in_file = io.BytesIO(
    open(file_path, 'rb').read()
)

document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )
    
# poller = document_analysis_client.begin_analyze_document_from_url(
#     model_id="prebuilt-document",
#     document_url=form_url
# )
poller = document_analysis_client.begin_analyze_document(
    model_id="prebuilt-document",
    document=in_file
)

result = poller.result()

print("----Key-value pairs found in document----")
for kv_pair in result.key_value_pairs:
    if kv_pair.key and kv_pair.value:
        print("Masuk if: Key '{}': Value: '{}'".format(kv_pair.key.content, kv_pair.value.content))
    else:
        print("Masuk else: Key '{}': Value:".format(kv_pair.key.content))

print("----------------------------------------")


----Key-value pairs found in document----
Masuk if: Key 'Sex:': Value: 'Male /'
Masuk if: Key 'Ph:': Value: '085399799077'
Masuk if: Key 'MR No:': Value: 'MRCCC.00-42-94-47'
Masuk if: Key 'DOB:': Value: '31-Dec-1971'
Masuk if: Key 'Kepada TS :': Value: 'dr.'
Masuk if: Key 'Internal': Value: ':selected:'
Masuk if: Key 'Eksternal': Value: ':unselected:'
Masuk if: Key 'Mohon konsultasi dan tindak lanjut untuk pasien dengan :': Value: 'Keluhan utama
Bah- Kin'
Masuk if: Key 'Hasil pemeriksaan yang ditemukan
:': Value: 'difama
Km .'
Masuk if: Key 'Diagnosa
:': Value: 'Saf Smilde'
Masuk if: Key 'Konsultasi satu kali': Value: ':unselected:'
Masuk if: Key 'Rawat bersama': Value: ':selected:'
Masuk if: Key 'Alih rawat': Value: ':unselected:'
Masuk if: Key 'Tanggal :': Value: '1120'
----------------------------------------


## Create object document loader

In [98]:
# from langchain_text_splitters.character import RecursiveCharacterTextSplitter, TextSplitter
# from typing import Any, List, Optional, Iterable
# from azure.ai.formrecognizer._models import AnalyzeResult
# from langchain_core.documents import Document

# class AnalyzeDocumentIntelligence(RecursiveCharacterTextSplitter):
#     def __init__(
#         self,
#         text_splitter: TextSplitter = RecursiveCharacterTextSplitter,
#         model_id: str = "prebuilt-document",
#         endpoint: Optional[str] = None,
#         key: Optional[str] = None,
#         ## nanti kasih embedding, char splitter, fileloader (source file), dataanalyzer

#         **kwargs: Any,
#     ) -> None:
#         # """Create a new TextSplitter."""
#         # super().__init__(keep_separator=keep_separator, **kwargs)
#         # self._separators = separators or ["\n\n", "\n", " ", ""]
#         # self._is_separator_regex = is_separator_regex
        
#         self.endpoint = endpoint
#         self.api_key = key
#         self.model_id = model_id

#     def analyze_document(
#         self,
#         file_path: str = None,
#         file_url: str = None,
#     ) -> AnalyzeResult:
#         document_analysis_client = DocumentAnalysisClient(
#             endpoint=self.endpoint, 
#             credential=AzureKeyCredential(self.api_key)
#         )
#         if file_url:
#             self.source = file_url
#             poller = document_analysis_client.begin_analyze_document_from_url(
#                 model_id=self.model_id,
#                 document_url=file_url
#             )
#         elif file_path:
#             self.source = file_path
#             in_file = io.BytesIO(
#                 open(file_path, 'rb').read()
#             )
#             poller = document_analysis_client.begin_analyze_document(
#                 model_id=self.model_id,
#                 document=in_file
#             )
#         else:
#             raise ValueError("Either 'file_path' or 'file_url' must be provided.")

#         return poller.result()
    
#     def _get_dict_from_key_value_poller(
#         self,
#         poller_result: AnalyzeResult,
#         source: str = None,
#     ) -> dict:
#         dict_poller = {}
#         for kv_pair in poller_result.key_value_pairs:
#             if kv_pair.key not in dict_poller:
#                 dict_poller[kv_pair.key.content] = []

#             if kv_pair.key and kv_pair.value:
#                 dict_poller[kv_pair.key.content].append(kv_pair.value.content)
#             else:
#                 dict_poller[kv_pair.key.content].append(None)

#         dict_poller["source"] = source
#         return dict_poller

#     def split_documents(
#         self, 
#         documents: Iterable[AnalyzeResult]
#     ) -> List[Document]:
#         """Note: Replaced the inherited split_documents method with additional metadata values"""
#         texts, metadatas = [], []
#         for doc in documents:
#             texts.append(doc.content)
#             dict_poller = self._get_dict_from_key_value_poller(doc, self.source)
#             metadatas.append(dict_poller)
#         return self.create_documents(texts, metadatas=metadatas)

In [None]:
from abc import ABC, abstractmethod
from azure.ai.formrecognizer._models import AnalyzeResult
from langchain_text_splitters.character import RecursiveCharacterTextSplitter, TextSplitter
from typing import Any, List, Optional, Iterable
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient

class DocumentAnalyzer(ABC):
    @abstractmethod
    def analyze(
        self,
        file_path: str = None,
        file_url: str = None,
        **kwargs,
    ) -> AnalyzeResult:
        pass

class UnstructuredDocumentAnalyzer(DocumentAnalyzer):
    def __init__(
        self,
        text_splitter,
        data_loader,
        data_analyzer,
        embedding,
        endpoint: Optional[str] = None,
        key: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__()
        self.text_splitter = text_splitter
        self.data_loader = data_loader
        self.data_analyzer = data_analyzer
        self.embedding = embedding,
        self.endpoint = endpoint,
        self.api_key = key

    def analyze(
        self,
        file_path: str = None,
        file_url: str = None,
        **kwargs,
    ) -> AnalyzeResult:
        pass

In [100]:
# file_path = "/Users/donnymirzaadhitama/workspace/bithealth/training_data/FPPK/2022110213021016_MRCCC_FPPK_PERTAMINA INTERNASIONAL EP PT ADMEDIKA_OPA2210270146_RIA NOVERIA.pdf"

# adi = AnalyzeDocumentIntelligence(
#     chunk_size=1000,
#     chunk_overlap=20,
#     length_function=len,
#     is_separator_regex=False,
#     endpoint=config.AZURE_DOCS_INT_ENDPOINT,
#     key=config.AZURE_DOCS_INT_API_KEY,
# )

# result = adi.analyze_document(file_path=file_path)
# doc_splitted = adi.split_documents([result])
# # print(doc_splitted[1].page_content)

In [3]:
from langchain_community.document_loaders.doc_intelligence import AzureAIDocumentIntelligenceLoader

In [5]:
print(config.AZURE_DOCS_INT_ENDPOINT)
print(config.AZURE_DOCS_INT_API_KEY)

https://southeastasia.api.cognitive.microsoft.com/
4db9e2e800be4a2ca93034b8baa12b74


In [16]:
file_path = "/Users/donnymirzaadhitama/workspace/bithealth/training_data/FPKS/2022082011446651_MRCCC_FPKS_VALE INDONESIA PT_OPA2208190466_KAMRULLAH ILYAS-1.pdf"
analysis_features = [
    # "ocrHighResolution", 
    "keyValuePairs",
]

loader = AzureAIDocumentIntelligenceLoader(
    file_path=file_path,
    api_endpoint=config.AZURE_DOCS_INT_ENDPOINT,
    api_key=config.AZURE_DOCS_INT_API_KEY,
    api_version="2024-02-29-preview",
    api_model="prebuilt-layout",
    mode="single",
    analysis_features=analysis_features,
)

In [13]:
from langchain_text_splitters.character import RecursiveCharacterTextSplitter, TextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)

In [17]:
# result = loader.load()
result = loader.load_and_split(text_splitter=text_splitter)
result

[Document(page_content='Siloam Hospitals\nSiloam Hospitals\nPERMOHONAN KONSULTASI\nKAMRULLAH ILYAS Sex: Male / Ph: 085399799077 MR No: MRCCC.00-42-94-47 / DOB: 31-Dec-1971 ( OPA2208190466 / 19-Agt-2022 11:00 Dr. dr. Andri Lubis, SPOT (K) / VALE INDONESIA, PT\nKepada TS : dr. Rehorarios Media :selected: Internal :unselected: Eksternal\nMohon konsultasi dan tindak lanjut untuk pasien dengan :\nKeluhan utama :\n-\nHasil pemeriksaan yang ditemukan :\nBah - Kini\ndizer\n..........\nDiagnosa :\nObat dan tindakan yang diberikan\n:\nMen van\ndenTs\nTerima kasih atas bantuan & kerjasamanya. :unselected: Konsultasi satu kali :selected: Rawat bersama :unselected: Alih rawat\nDokter yang merawat\nTanda Tangan & Nama Lengkap\nTanggal : Jam :\n19/8-20\n1120\nSHAMA 1511.47a/1')]

In [21]:
# print(result[0].page_content)
# print(result[0].page_content)
result[0].dict()

{'page_content': 'Siloam Hospitals\nSiloam Hospitals\nPERMOHONAN KONSULTASI\nKAMRULLAH ILYAS Sex: Male / Ph: 085399799077 MR No: MRCCC.00-42-94-47 / DOB: 31-Dec-1971 ( OPA2208190466 / 19-Agt-2022 11:00 Dr. dr. Andri Lubis, SPOT (K) / VALE INDONESIA, PT\nKepada TS : dr. Rehorarios Media :selected: Internal :unselected: Eksternal\nMohon konsultasi dan tindak lanjut untuk pasien dengan :\nKeluhan utama :\n-\nHasil pemeriksaan yang ditemukan :\nBah - Kini\ndizer\n..........\nDiagnosa :\nObat dan tindakan yang diberikan\n:\nMen van\ndenTs\nTerima kasih atas bantuan & kerjasamanya. :unselected: Konsultasi satu kali :selected: Rawat bersama :unselected: Alih rawat\nDokter yang merawat\nTanda Tangan & Nama Lengkap\nTanggal : Jam :\n19/8-20\n1120\nSHAMA 1511.47a/1',
 'metadata': {},
 'type': 'Document'}

In [27]:
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
from azure.core.credentials import AzureKeyCredential
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers import AzureAIDocumentIntelligenceParser

document_intelligence_client = DocumentIntelligenceClient(
        endpoint=config.AZURE_DOCS_INT_ENDPOINT, 
        credential=AzureKeyCredential(config.AZURE_DOCS_INT_API_KEY),
        # api_version="2023-07-31"
)
parser = AzureAIDocumentIntelligenceParser(  # type: ignore[misc]
            api_endpoint=config.AZURE_DOCS_INT_ENDPOINT,
            api_key=config.AZURE_DOCS_INT_API_KEY,
            api_version="2024-02-29-preview",
            api_model="prebuilt-layout",
            mode="markdown",
            analysis_features=analysis_features,
)
formUrl = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-REST-api-samples/master/curl/form-recognizer/sample-layout.pdf"
file_path = "/Users/donnymirzaadhitama/workspace/bithealth/training_data/FPKS/2022082011446651_MRCCC_FPKS_VALE INDONESIA PT_OPA2208190466_KAMRULLAH ILYAS-1.pdf"
in_file = io.BytesIO(
    open(file_path, 'rb').read()
)
blob = Blob.from_path(file_path)
blob = parser.parse(blob)
with blob.as_bytes_io() as file_obj:
    poller = document_intelligence_client.begin_analyze_document(
            "prebuilt-layout",
            AnalyzeDocumentRequest(
                bytes_source=file_obj,
            #     url_source=formUrl
            )
    )

AttributeError: 'list' object has no attribute 'as_bytes_io'

In [28]:
blob

[Document(page_content='<figure>\n\n![](figures/0)\n\n<!-- FigureContent="Siloam Hospitals" -->\n\n</figure>\n\n\nSiloam Hospitals\n\nPERMOHONAN KONSULTASI\n\nKAMRULLAH ILYAS Sex: Male / Ph: 085399799077 MR No: MRCCC.00-42-94-47 / DOB: 31-Dec-1971 ( OPA2208190466 / 19-Agt-2022 11:00 Dr. dr. Andri Lubis, SPOT (K) / VALE INDONESIA, PT\n\nKepada TS : dr. Rehorarios Media :selected: Internal :unselected: Eksternal\n\nMohon konsultasi dan tindak lanjut untuk pasien dengan :\n\nKeluhan utama :\n\n\\-\n\nHasil pemeriksaan yang ditemukan :\n\nBah - Kini\n\ndizer\n\n..........\n\nDiagnosa :\n\nObat dan tindakan yang diberikan\n\n:\n\nMen van\n\ndenTs\n\nTerima kasih atas bantuan & kerjasamanya. :unselected: Konsultasi satu kali :selected: Rawat bersama\n :unselected:\nAlih rawat\n\nDokter yang merawat\n\nTanda Tangan & Nama Lengkap\n\n| Tanggal : Jam : | 19/8-20 |\n|| 1120 |\n\n<!-- PageFooter="SHAMA 1511.47a/1" -->\n')]