In [1]:
# Google Cloud project and region passed to aiplatform.init / vertexai.init below.
PROJECT_ID = 'dwh-siloam'
REGION = 'asia-southeast1'
print(f"Project ID: {PROJECT_ID}\nRegion: {REGION}")

# Initialize Vertex AI
from pathlib import Path
import vertexai
from google.cloud import aiplatform

print("Checking Credentials...")
# Scan the folder once; sorting makes the selected key deterministic when
# more than one JSON key file is present (glob order is filesystem-dependent).
sa_files = sorted((Path.cwd() / "service_account").glob('*.json'))
if not sa_files:
    # No key file found: rely on whatever default credentials the environment provides.
    print("Service account folder is empty. Fallback using default gcloud account")
    aiplatform.init(project=PROJECT_ID, location=REGION)
    vertexai.init(project=PROJECT_ID, location=REGION)
else:
    print('Using service account credentials from service_account folder')
    from google.oauth2 import service_account
    sa_file = sa_files[0]
    print(f"Using service account file: {sa_file}")
    credentials = service_account.Credentials.from_service_account_file(sa_file)
    # Both SDK entry points are initialised with the same explicit credentials.
    aiplatform.init(project=PROJECT_ID, location=REGION, credentials=credentials)
    vertexai.init(project=PROJECT_ID, location=REGION, credentials=credentials)

# Import libraries
from langchain_google_vertexai import VertexAI, ChatVertexAI, create_structured_runnable
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

from typing import List, Optional
import requests

from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.prompts import MessagesPlaceholder
from settings import CopilotSettings
import time

import io
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader

Project ID: dwh-siloam
Region: asia-southeast1
Checking Credentials...
Using service account credentials from service_account folder
Using service account file: /Users/donnymirzaadhitama/workspace/others/chatbot-llm/service_account/dwh-siloam-99402e61edd2.json


In [2]:
# Instantiate the project settings object; later cells read the Azure Document
# Intelligence endpoint and API key from it.
config = CopilotSettings()

## Testing Azure Document Intelligence

In [3]:
"""
This code sample shows Prebuilt Document operations with the Azure Form Recognizer client library. 
The async versions of the samples require Python 3.6 or later.

To learn more, please visit the documentation - Quickstart: Form Recognizer Python client library SDKs
https://learn.microsoft.com/azure/applied-ai-services/form-recognizer/quickstarts/get-started-v3-sdk-rest-api?view=doc-intel-3.1.0&pivots=programming-language-python
"""

from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient

"""
Remember to remove the key from your code when you're done, and never post it publicly. For production, use
secure methods to store and access your credentials. For more information, see 
https://docs.microsoft.com/en-us/azure/cognitive-services/cognitive-services-security?tabs=command-line%2Ccsharp#environment-variables-and-application-configuration
"""
# Endpoint and key are read from the settings object instead of being hard-coded.
endpoint = config.AZURE_DOCS_INT_ENDPOINT
key = config.AZURE_DOCS_INT_API_KEY

# sample document
form_url = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-REST-api-samples/master/curl/form-recognizer/sample-layout.pdf"
file_path = "/Users/donnymirzaadhitama/workspace/bithealth/training_data/FPKS/2022082011446651_MRCCC_FPKS_VALE INDONESIA PT_OPA2208190466_KAMRULLAH ILYAS-1.pdf"
# Read the file inside a context manager so the handle is closed right away,
# then keep the bytes in an in-memory buffer for the upload.
with open(file_path, 'rb') as pdf_file:
    in_file = io.BytesIO(pdf_file.read())

document_analysis_client = DocumentAnalysisClient(
    endpoint=endpoint, credential=AzureKeyCredential(key)
)

# poller = document_analysis_client.begin_analyze_document_from_url(
#     model_id="prebuilt-document",
#     document_url=form_url
# )
poller = document_analysis_client.begin_analyze_document(
    model_id="prebuilt-document",
    document=in_file
)

# Blocks until the analysis operation has finished.
result = poller.result()

print("----Key-value pairs found in document----")
# `or []` keeps the loop safe when the result carries no key-value pairs at all.
for kv_pair in result.key_value_pairs or []:
    # The else branch is also reached when the key itself is missing, so the
    # key text must be resolved defensively before it is printed.
    key_text = kv_pair.key.content if kv_pair.key else None
    if kv_pair.key and kv_pair.value:
        print("Masuk if: Key '{}': Value: '{}'".format(key_text, kv_pair.value.content))
    else:
        print("Masuk else: Key '{}': Value:".format(key_text))

print("----------------------------------------")


----Key-value pairs found in document----
Masuk if: Key 'Sex:': Value: 'Male /'
Masuk if: Key 'Ph:': Value: '085399799077'
Masuk if: Key 'MR No:': Value: 'MRCCC.00-42-94-47'
Masuk if: Key 'DOB:': Value: '31-Dec-1971'
Masuk if: Key 'Kepada TS :': Value: 'dr.'
Masuk if: Key 'Internal': Value: ':selected:'
Masuk if: Key 'Eksternal': Value: ':unselected:'
Masuk if: Key 'Mohon konsultasi dan tindak lanjut untuk pasien dengan :': Value: 'Keluhan utama
Bah- Kin'
Masuk if: Key 'Hasil pemeriksaan yang ditemukan
:': Value: 'difama
Km .'
Masuk if: Key 'Diagnosa
:': Value: 'Saf Smilde'
Masuk if: Key 'Konsultasi satu kali': Value: ':unselected:'
Masuk if: Key 'Rawat bersama': Value: ':selected:'
Masuk if: Key 'Alih rawat': Value: ':unselected:'
Masuk if: Key 'Tanggal :': Value: '1120'
----------------------------------------


## Create a document loader object

In [67]:
from abc import ABC, abstractmethod
from azure.ai.formrecognizer._models import AnalyzeResult
from langchain_text_splitters.character import RecursiveCharacterTextSplitter, TextSplitter
from typing import Any, List, Optional, Iterable, Iterator
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
from langchain_core.documents import Document
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders import PyPDFLoader

class DocumentAnalyzer(ABC):
    """Abstract interface for document analyzers.

    Concrete subclasses must implement `analyze` (run the analysis for one
    document) and `split_documents` (turn analysis results into LangChain
    `Document` chunks).
    """

    @abstractmethod
    def analyze(
        self,
        file_path: Optional[str] = None,
        file_url: Optional[str] = None,
        **kwargs,
    ) -> AnalyzeResult:
        """Analyze a single document given either a local path or a URL.

        Args:
            file_path: Path of a local file to analyze, if any.
            file_url: URL of a document to analyze, if any.
            **kwargs: Implementation-specific options.

        Returns:
            The analysis result for the document.
        """

    @abstractmethod
    def split_documents(
        self,
        documents: Iterable[AnalyzeResult]
    ) -> List[Document]:
        """Convert analysis results into a list of `Document` chunks.

        Args:
            documents: Analysis results to split.

        Returns:
            The resulting `Document` objects.
        """

class UnstructuredDocumentAnalyzer(DocumentAnalyzer):
    """Document analyzer backed by the Azure `DocumentAnalysisClient`.

    `analyze` sends one document (local file or URL) to the service,
    `split_documents` turns analysis results into chunked LangChain
    `Document` objects whose metadata carries the extracted key-value pairs,
    and `load` is a separate path that reads a PDF with `PyPDFLoader`.
    """

    def __init__(
        self,
        endpoint: str,
        key: str,
        model_id: str = "prebuilt-document",
        text_splitter: Optional[TextSplitter] = None,
        embedding: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize an unstructured document analyzer.

        Args:
            endpoint: Endpoint URL handed to `DocumentAnalysisClient`.
            key: API key wrapped in `AzureKeyCredential`.
            model_id: Model id sent with every analysis request.
            text_splitter: Splitter used by `split_documents` and `load`.
            embedding: Stored on the instance; not used by this class itself.
            **kwargs: Accepted for forward compatibility and ignored.
        """
        super().__init__()
        self.text_splitter = text_splitter
        self.model_id = model_id
        self.embedding = embedding
        self.endpoint = endpoint
        self.api_key = key
        # Updated by analyze(); initialised here so split_documents() does not
        # raise AttributeError when it is called before any analyze() call.
        self.source: Optional[str] = None

    def analyze(
        self,
        file_path: Optional[str] = None,
        file_url: Optional[str] = None,
        **kwargs,
    ) -> AnalyzeResult:
        """Analyze one document and return the service result.

        `file_url` takes precedence when both arguments are given. The chosen
        location is remembered in `self.source` for later use as metadata.

        Args:
            file_path: Path of a local file to upload.
            file_url: URL of a document the service should fetch.
            **kwargs: Ignored.

        Returns:
            The `AnalyzeResult` produced by the finished analysis operation.

        Raises:
            AssertionError: If neither `file_path` nor `file_url` is provided.
        """
        # Validate first (truthiness, not just `is not None`) so that empty
        # strings are rejected here instead of leaving `poller` unbound below.
        assert (
            file_path or file_url
        ), "file_path or file_url is required!"
        document_analysis_client = DocumentAnalysisClient(
            endpoint=self.endpoint,
            credential=AzureKeyCredential(self.api_key)
        )
        if file_url:
            self.source = file_url
            poller = document_analysis_client.begin_analyze_document_from_url(
                model_id=self.model_id,
                document_url=file_url
            )
        else:
            self.source = file_path
            blob = Blob.from_path(file_path)
            with blob.as_bytes_io() as in_file:
                poller = document_analysis_client.begin_analyze_document(
                    model_id=self.model_id,
                    document=in_file,
                )
        return poller.result()

    def _get_dict_from_key_value_poller(
        self,
        poller_result: AnalyzeResult,
        source: Optional[str] = None,
    ) -> dict:
        """Collect the key-value pairs of an analysis result into a metadata dict.

        Each distinct key text maps to ``{"value": [...], "confidence": [...]}``;
        a key that occurs several times accumulates one list entry per
        occurrence. Pairs without a value contribute ``None`` to both lists.
        Pairs without a key are skipped because there is nothing to index them by.

        Args:
            poller_result: Analysis result whose `key_value_pairs` are read.
            source: Value stored under the ``"source"`` entry of the result.

        Returns:
            The metadata dict, always containing a ``"source"`` entry.
        """
        dict_poller: dict = {}
        for kv_pair in poller_result.key_value_pairs or []:
            if not kv_pair.key:
                continue
            # Look entries up by the key *text*; testing the key object itself
            # never matches and would reset the entry on every occurrence.
            entry = dict_poller.setdefault(
                kv_pair.key.content, {"value": [], "confidence": []}
            )
            if kv_pair.value:
                entry["value"].append(kv_pair.value.content)
                entry["confidence"].append(kv_pair.confidence)
            else:
                entry["value"].append(None)
                entry["confidence"].append(None)

        # Written last, so it wins over a document key literally named "source".
        dict_poller["source"] = source
        return dict_poller

    def split_documents(
        self,
        documents: Iterable[AnalyzeResult]
    ) -> List[Document]:
        """Split analysis results into chunked `Document` objects.

        The text of every result is chunked with the configured text splitter
        and each chunk receives the result's key-value metadata plus the
        source recorded by the most recent `analyze` call.

        Args:
            documents: Analysis results to split.

        Returns:
            The chunked `Document` objects.
        """
        texts, metadatas = [], []
        for doc in documents:
            texts.append(doc.content)
            metadatas.append(self._get_dict_from_key_value_poller(doc, self.source))
        # Fall back to a default splitter when none was configured, mirroring
        # what load() gets from the loader when it is handed None.
        splitter = self.text_splitter
        if splitter is None:
            splitter = RecursiveCharacterTextSplitter()
        return splitter.create_documents(texts, metadatas=metadatas)

    def load(
        self,
        file_path: str,
    ) -> List[Document]:
        """Load a large PDF document (such as a paper or journal) and split it.

        This path uses `PyPDFLoader` locally rather than the Azure client.

        Args:
            file_path: Path of the PDF file.

        Returns:
            The chunks produced by `PyPDFLoader.load_and_split` with the
            configured text splitter.
        """
        loader = PyPDFLoader(
            file_path,
            extract_images=True
        )
        return loader.load_and_split(self.text_splitter)

In [68]:
# file_path = "/Users/donnymirzaadhitama/workspace/bithealth/training_data/FPPK/2022110213021016_MRCCC_FPPK_PERTAMINA INTERNASIONAL EP PT ADMEDIKA_OPA2210270146_RIA NOVERIA.pdf"
file_path = "/Users/donnymirzaadhitama/Documents/Lung Cancer 2020- Epidemiology, Etiology, and Prevention.pdf"

# Chunking parameters, named so they can be tuned in one place.
CHUNK_SIZE = 1000    # maximum chunk length as measured by length_function (len)
CHUNK_OVERLAP = 20   # overlap between consecutive chunks, same unit
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
    is_separator_regex=False,
)
uda = UnstructuredDocumentAnalyzer(
    endpoint=config.AZURE_DOCS_INT_ENDPOINT,
    key=config.AZURE_DOCS_INT_API_KEY,
    text_splitter=text_splitter,
)

# Form case:
# result = uda.analyze(file_path=file_path)
# doc_splitted = uda.split_documents([result])
# print(doc_splitted[1].page_content)

# Paper case:
doc_splitted = uda.load(file_path=file_path)
# Preview only the first few chunks; displaying the whole list floods the
# notebook output and bloats the saved file.
doc_splitted[:3]

[Document(page_content='Lung Cancer 2020\nEpidemiology, Etiology, and\nPrevention\nBrett C. Bade, MD, Charles S. Dela Cruz, MD, PhD *\nNotable changes in lung cancer epidemiology and\nprevention have occurred over the past decadeowing to changes in smoking patterns, ground-\nbreaking advances in our understanding of the ge-\nnetics of lung cancer, the immune system’s role inlung cancer control, and lung cancer treatment op-\ntions. Despite these advances, lung cancer re-\nmains the leading cause of cancer death.\n1\nWorldwide, there are more lung cancer casesand deaths since 2011, the number of smokers\nincreased between 1980 and 2012,\n2,3and lung\ncancer rates are climbing in developing countriesin conjunction with tobacco smoking. In the United\nStates, lower tobacco smoking rates have led to\nreductions in lung cancer incidence and mortality,altered the demographics of patients developing\nlung cancer, and heightened the importance of', metadata={'source': '/Users/donnymirzaadhitam

In [69]:
# result.key_value_pairs[0]
# Inspect which top-level fields the first chunk exposes when converted to a dict.
doc_splitted[0].dict().keys()

dict_keys(['page_content', 'metadata', 'type'])

In [70]:
# Show the first chunk's text, a separator line, then the chunk's metadata.
print(doc_splitted[0].page_content)
print("="*50)
print(doc_splitted[0].metadata)

Lung Cancer 2020
Epidemiology, Etiology, and
Prevention
Brett C. Bade, MD, Charles S. Dela Cruz, MD, PhD *
Notable changes in lung cancer epidemiology and
prevention have occurred over the past decadeowing to changes in smoking patterns, ground-
breaking advances in our understanding of the ge-
netics of lung cancer, the immune system’s role inlung cancer control, and lung cancer treatment op-
tions. Despite these advances, lung cancer re-
mains the leading cause of cancer death.
1
Worldwide, there are more lung cancer casesand deaths since 2011, the number of smokers
increased between 1980 and 2012,
2,3and lung
cancer rates are climbing in developing countriesin conjunction with tobacco smoking. In the United
States, lower tobacco smoking rates have led to
reductions in lung cancer incidence and mortality,altered the demographics of patients developing
lung cancer, and heightened the importance of
{'source': '/Users/donnymirzaadhitama/Documents/Lung Cancer 2020- Epidemiology, Etiology

## Using the LangChain data loader

In [71]:
# from langchain_community.document_loaders.doc_intelligence import AzureAIDocumentIntelligenceLoader

# file_path = "/Users/donnymirzaadhitama/workspace/bithealth/training_data/FPKK/2022082011446651_MRCCC_FPKS_VALE INDONESIA PT_OPA2208190466_KAMRULLAH ILYAS-1.pdf"
# analysis_features = [
#     # "ocrHighResolution", 
#     "keyValuePairs",
# ]

# loader = AzureAIDocumentIntelligenceLoader(
#     file_path=file_path,
#     api_endpoint=config.AZURE_DOCS_INT_ENDPOINT,
#     api_key=config.AZURE_DOCS_INT_API_KEY,
#     api_version="2024-02-29-preview",
#     api_model="prebuilt-layout",
#     mode="page",
#     analysis_features=analysis_features,
# )

In [72]:
# from langchain_text_splitters.character import RecursiveCharacterTextSplitter, TextSplitter
# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=1000,
#     chunk_overlap=20,
#     length_function=len,
#     is_separator_regex=False,
# )

In [73]:
# # result = loader.load()
# result = loader.load_and_split(text_splitter=text_splitter)
# result

In [74]:
# print(result[0].page_content)
# print(result[0].page_content)
# result[0].dict()

In [18]:
# from azure.ai.documentintelligence import DocumentIntelligenceClient
# from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
# from azure.core.credentials import AzureKeyCredential
# from langchain_community.document_loaders.blob_loaders import Blob
# from langchain_community.document_loaders.parsers import AzureAIDocumentIntelligenceParser

# document_intelligence_client = DocumentIntelligenceClient(
#         endpoint=config.AZURE_DOCS_INT_ENDPOINT, 
#         credential=AzureKeyCredential(config.AZURE_DOCS_INT_API_KEY),
#         # api_version="2023-07-31"
# )
# parser = AzureAIDocumentIntelligenceParser(  # type: ignore[misc]
#             api_endpoint=config.AZURE_DOCS_INT_ENDPOINT,
#             api_key=config.AZURE_DOCS_INT_API_KEY,
#             api_version="2024-02-29-preview",
#             api_model="prebuilt-layout",
#             mode="markdown",
#             analysis_features=analysis_features,
# )
# formUrl = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-REST-api-samples/master/curl/form-recognizer/sample-layout.pdf"
# file_path = "/Users/donnymirzaadhitama/workspace/bithealth/training_data/FPKS/2022082011446651_MRCCC_FPKS_VALE INDONESIA PT_OPA2208190466_KAMRULLAH ILYAS-1.pdf"
# in_file = io.BytesIO(
#     open(file_path, 'rb').read()
# )
# blob = Blob.from_path(file_path)
# blob = parser.parse(blob)
# # with blob.as_bytes_io() as file_obj:
# #     poller = document_intelligence_client.begin_analyze_document(
# #             "prebuilt-layout",
# #             AnalyzeDocumentRequest(
# #                 bytes_source=file_obj,
# #             #     url_source=formUrl
# #             )
# #     ) 

## Test PyPDFLoader

In [31]:
from langchain_community.document_loaders import PyPDFLoader

# Build a PyPDFLoader for the sample paper with the loader's image-extraction
# option switched on, then read the whole file into `pages`.
loader = PyPDFLoader(
    file_path="/Users/donnymirzaadhitama/Documents/Lung Cancer 2020- Epidemiology, Etiology, and Prevention.pdf",
    extract_images=True,
)
pages = loader.load()

In [42]:
# Number of Document objects returned by loader.load().
len(pages)

24

In [44]:
# Load the PDF again and chunk it with `text_splitter`, which is defined in an
# earlier cell (that cell must have been run first).
pages_split = loader.load_and_split(text_splitter=text_splitter)

In [45]:
# Number of chunks produced by load_and_split.
len(pages_split)

118

In [52]:
# Inspect the fifth chunk as a plain dict.
pages_split[4].dict()

{'page_content': 'with the recent increase in smoking prevalence in\nChina, Indonesia, Eastern Europe, and the North-\nern and Southern parts of Africa.5,7Up to 80% of\ncurrent smokers now live in low- or middle-income countries, and more than one-half of lung\ncancer deaths occur in less developed regions.\n6–8\nBy contrast, lung cancer incidence is decreasing\nor expected to decrease in countries that “took\nup” smoking the earliest and are now successfullyimplementing smoking cessation and avoidance\ncampaigns.5These countries are generally high in-\ncome and include the United States, the UnitedKingdom, the Nordic countries, Australia, NewZealand, Singapore, Germany, and Uruguay.\n5,7\nAlthough the increasing lung cancer burden\nglobally is driven by lung cancer cases in men,most countries are also observing an increasing\nincidence in women.\n5Although breast cancer is\nFig. 1. Distribution of cases and deaths\nfor the 10 most common cancers in\n2018 for ( A) both sexes, ( B) male

In [32]:
# Print the text of the first Document returned by loader.load().
print(pages[0].page_content)

Lung Cancer 2020
Epidemiology, Etiology, and
Prevention
Brett C. Bade, MD, Charles S. Dela Cruz, MD, PhD *
Notable changes in lung cancer epidemiology and
prevention have occurred over the past decadeowing to changes in smoking patterns, ground-
breaking advances in our understanding of the ge-
netics of lung cancer, the immune system’s role inlung cancer control, and lung cancer treatment op-
tions. Despite these advances, lung cancer re-
mains the leading cause of cancer death.
1
Worldwide, there are more lung cancer casesand deaths since 2011, the number of smokers
increased between 1980 and 2012,
2,3and lung
cancer rates are climbing in developing countriesin conjunction with tobacco smoking. In the United
States, lower tobacco smoking rates have led to
reductions in lung cancer incidence and mortality,altered the demographics of patients developing
lung cancer, and heightened the importance of
nontobacco risk factors. Although disease under-standing, treatment options, and outcome

In [41]:
# Inspect the fifth loaded Document as a plain dict.
pages[4].dict()

{'page_content': 'uptake has been slow with only 4% of eligible\nAmericans undergoing low-dose computed to-mography screening in 2015.\n20Continued imple-\nmentation of LCS combined with therapeutic\nadvances for early and advanced stage diseasemay help reverse our current trends of late-stagediagnosis and low overall survival.\nPerhaps the greatest change in our understand-\ning of lung cancer epidemiology in the UnitedStates is the recognition of the disease’s “diver-\nsity.” That is, lung cancer can no longer be stereo-\ntyped as a disease of older male smokers. Fig. 4\ndemonstrates the meaningful change in lung can-\ncer development and outcomes by gender in the\nlast 50 years. Although smoking history and olderage remain the predominant risk factors for lung\ncancer development, current estimates are that10% to 20% of patients who develop lung cancerare never smokers,\n21and lung cancer incidence in\nwomen is approaching that in men. Also, although\nthe overall trend in the United