In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from unstructured.partition.pdf import partition_pdf
import time, os, shutil

In [19]:
from langchain_community.document_loaders import UnstructuredPDFLoader

# Source PDF to ingest. Swap the active `filename` / `filepath` lines to
# process the other document or the "blocks" layout variant.
# filename = 'nutrisi_pada_anak_dengan_penyakit_ginjal_clean_poster.pdf'
filename = 'Tatalaksana_Hemodialisis_pada_Anak_dan_Bayi_clean_poster.pdf'

# Build the path with os.path.join instead of a backslash-laden f-string:
# '\d' and '\p' are invalid escape sequences (a SyntaxWarning on recent
# Python versions) and hard-coded backslashes only resolve on Windows.
# filepath = os.path.join('..', 'data', 'pdfs-blocks', filename)
filepath = os.path.join('..', 'data', 'pdfs-poster', filename)

# mode="elements" yields one Document per detected element; the following
# cell reads each element's metadata['category'].
loader = UnstructuredPDFLoader(
    filepath,
    mode="elements",
    strategy="hi_res",
    infer_table_structure=True,
)
documents = loader.load()

This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name


In [20]:
# Dump every loaded element: its text, its category, then a divider line.
separator = "-----------------------------------"
print(separator)
for document in documents:
    print(
        document.page_content,
        document.metadata['category'],
        separator,
        sep="\n",
    )

-----------------------------------
ABSTRAK Penyakit ginjal kronik (PGK) merupakan masalah kesehatan serius pada anak dengan morbiditas dan mortalitas yang makin meningkat serta menimbulkan masalah sosial ekonomi yang signifikan. Penyebab utama PGK pada anak adalah anomali kongenital ginjal dan saluran kemih, diikuti nefropati herediter dan glomerulonefritis. Indikasi absolut untuk memulai dialisis pada anak meliputi anuria, gangguan elektrolit berat, gangguan neurologis pada gagal ginjal, perikarditis, diatesis perdarahan, mual berulang, gejala uremia, volume berlebihan, atau kegagalan pertumbuhan meskipun terapi medis sudah tepat, dan hipertensi. Mengoptimalkan status gizi dengan dukungan ahli diet adalah elemen dasar perawatan anak dengan penyakit ginjal kronik stadium 5. Peran ahli diet adalah meminimalkan gejala uremia, mencegah komplikasi penyakit tulang akibat gangguan ginjal (renal bone disease), dan mengoptimalkan pertumbuhan anak. Kata kunci: Anak, hemodialisis, indikasi, pen

In [21]:
import re
from unstructured.cleaners.core import clean_non_ascii_chars, clean_extra_whitespace
# Name the text dump after the PDF (extension removed).
filename_clean = filename.replace(".pdf", "")
output_path = f"../multi_column/unstructured_{filename_clean}.txt"

clean_documents = []
# Opening with "w" truncates any previous dump, so a separate
# "write empty string, close, reopen in append mode" step is unnecessary.
# The context manager closes the file even if cleaning raises part-way.
with open(output_path, "w", encoding="utf-8") as file:
    for document in documents:
        # Replace selected punctuation (en/em dash, underscore, tilde,
        # asterisk, curly apostrophe) with spaces, then drop non-ASCII
        # characters and collapse repeated whitespace.
        cleaned_text = clean_extra_whitespace(
            clean_non_ascii_chars(
                re.sub(r"[\u2013_~*’—]", " ", document.page_content).strip()
            )
        )

        # Work on a copy so the original `documents` keep their raw text.
        cleaned_document = document.copy()
        cleaned_document.page_content = cleaned_text

        # One cleaned element per line in the dump file.
        file.write(cleaned_document.page_content)
        file.write("\n")
        clean_documents.append(cleaned_document)

In [None]:
def find_unique_characters(file_path):
    """Collect the distinct non-alphabetic characters found in a text file.

    Each line is stripped of leading/trailing whitespace before inspection,
    so only whitespace occurring inside a line can end up in the result.

    Args:
        file_path: Path of the text file to scan.

    Returns:
        set[str]: Every character for which ``str.isalpha()`` is False.
        An empty set is returned (after printing a message) when the file
        is missing or cannot be read, so callers can always iterate or
        sort the result without a ``None`` check.
    """
    unique_characters = set()

    try:
        with open(file_path, 'r') as file:
            for line in file:
                # Keep digits, punctuation, interior spaces, etc.; skip letters.
                unique_characters.update(
                    char for char in line.strip() if not char.isalpha()
                )
    except FileNotFoundError:
        print("File not found.")
        return set()
    except Exception as e:
        # Best-effort helper: report the problem and discard partial results.
        print("An error occurred:", e)
        return set()

    return unique_characters

# Replace with the path to your text file.
# os.path.join avoids mixing '/' and '\\' separators, which only resolves on Windows.
file_path = os.path.join(
    '..',
    'multi_column',
    'unstructured_nutrisi_pada_anak_dengan_penyakit_ginjal_clean_blocks.txt',
)

# `or ()` keeps sorted() from raising TypeError if the helper yields None
# (its behaviour when the file is missing or unreadable).
unique_chars = sorted(find_unique_characters(file_path) or ())

if unique_chars:
    print("Unique characters in the file:")
    for char in unique_chars:
        print(char)
else:
    print("No unique characters found.")


In [7]:
def insert_newline(text, line_length=180):
    """Append a newline after every run of up to `line_length` characters.

    Runs never span an existing newline ('.' does not match it), so line
    breaks already present in `text` are kept and each run on either side
    still receives its own trailing newline.
    """
    run_of_chars = re.compile(f'.{{1,{line_length}}}')
    return run_of_chars.sub(lambda match: match.group(0) + '\n', text)

In [None]:
# Show each cleaned element wrapped at 140 characters, with a divider after it.
divider = "-----------------------------------"
print(divider)
for document in clean_documents:
    wrapped = insert_newline(document.page_content, line_length=140)
    print(wrapped, divider, sep="\n")

BORDER

In [8]:
# Relative directory passed as `persist_directory` when the Chroma store is
# created; save_to_chroma() deletes this directory before rebuilding.
CHROMA_PATH = "loader_test/chroma_test"

In [10]:
def load_embedding_model(model_path="intfloat/multilingual-e5-large",
                         cache_folder="../models"):
    """Instantiate the HuggingFace embedding model and report the load time.

    Args:
        model_path: Model identifier passed to HuggingFaceEmbeddings as
            `model_name`. Defaults to the model this notebook was built with.
        cache_folder: Directory passed as `cache_folder` for the model files.

    Returns:
        The HuggingFaceEmbeddings instance, configured to request
        normalized embeddings via `encode_kwargs`.
    """
    # perf_counter is a monotonic clock intended for measuring durations.
    start_time = time.perf_counter()
    encode_kwargs = {"normalize_embeddings": True}
    local_embedding = HuggingFaceEmbeddings(
        model_name=model_path,
        cache_folder=cache_folder,
        encode_kwargs=encode_kwargs,
    )
    elapsed = time.perf_counter() - start_time
    print(f'model load time {round(elapsed, 0)} second')
    return local_embedding

# Module-level embedding model shared by the cells that build the Chroma store.
embedding = load_embedding_model()

model load time 103.0 second


In [11]:
def save_to_chroma(chunks: list[Document]):
    """Rebuild the Chroma store at CHROMA_PATH from `chunks`.

    Any existing directory at CHROMA_PATH is deleted first, so the store
    only ever contains the chunks passed to this call. Chunks are embedded
    with the module-level `embedding` model and the collection is created
    with cosine distance ("hnsw:space": "cosine").

    Args:
        chunks: Documents to embed and store.

    Returns:
        The Chroma store that was created, so callers can query it or add
        more documents without re-opening the directory.
    """
    print("Starting Embedding")
    # Clear out the database first.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Create a new DB from the documents.
    # perf_counter is a monotonic clock intended for measuring durations.
    start_time = time.perf_counter()
    db = Chroma.from_documents(
        chunks,
        embedding,
        persist_directory=CHROMA_PATH,
        collection_metadata={"hnsw:space": "cosine"}
    )
    # NOTE(review): newer Chroma releases may persist automatically and
    # deprecate this call — confirm against the installed version.
    db.persist()
    elapsed = time.perf_counter() - start_time
    print(f'embedding time {round(elapsed, 0)} second')
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")
    return db

In [14]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_text(documents: list[Document], preview_index: int = 10):
    """Split documents into overlapping chunks and print a sample chunk.

    Uses RecursiveCharacterTextSplitter with 1500-character chunks and a
    300-character overlap; `add_start_index=True` is passed so the splitter
    records each chunk's start offset.

    Args:
        documents: Documents to split.
        preview_index: Index of the chunk to print as a sanity check. If
            fewer chunks exist, the last chunk is shown instead; if there
            are no chunks, nothing is previewed.

    Returns:
        The list of chunk Documents produced by the splitter.
    """
    print("Starting chunking")

    # Recursive splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500,
        chunk_overlap=300,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
    print(f"len docs {len(chunks)}")

    # A fixed chunks[10] raised IndexError for short inputs; clamp instead.
    if chunks:
        document = chunks[min(preview_index, len(chunks) - 1)]
        print("page content \n", document.page_content)
        print("doc metadata \n", document.metadata)
    return chunks

In [17]:
# Inspect the metadata dict of the first cleaned element.
# (A dict has no `.str` attribute, so `.metadata.str` raises AttributeError.)
clean_documents[0].metadata

{'source': '..\\data\\pdfs-poster\\nutrisi_pada_anak_dengan_penyakit_ginjal_clean_poster.pdf',
 'coordinates': {'points': ((0.0, -1385.7833333333328),
   (0.0, 452.77777777777777),
   (1594.4444444444443, 452.77777777777777),
   (1594.4444444444443, -1385.7833333333328)),
  'system': 'PixelSpace',
  'layout_width': 1442,
  'layout_height': 453},
 'last_modified': '2024-04-19T20:23:55',
 'filetype': 'application/pdf',
 'languages': ['eng'],
 'page_number': 1,
 'file_directory': '..\\data\\pdfs-poster',
 'filename': 'nutrisi_pada_anak_dengan_penyakit_ginjal_clean_poster.pdf',
 'category': 'Image'}

In [18]:
from unstructured.chunking.basic import chunk_elements
from unstructured.chunking.title import chunk_by_title
from langchain_community.vectorstores.utils import filter_complex_metadata
# chunks = chunk_elements(clean_documents, 
#                         max_characters=1500, 
#                         new_after_n_chars=1000, 
#                         overlap=300, 
#                         overlap_all=True)

# chunks = chunk_by_title(clean_documents,
#                         combine_text_under_n_chars = 500,
#                         overlap=300, 
#                         overlap_all=True)

# clean_elements_str_lst = [element.page_content for element in clean_documents]

# if os.path.exists(CHROMA_PATH):
#         shutil.rmtree(CHROMA_PATH)
# Drop metadata values the vector store cannot hold, then chunk the result.
clean_metadata = filter_complex_metadata(clean_documents)
chunks = split_text(documents=clean_metadata)

# Build the store directly rather than through save_to_chroma(), so an
# existing CHROMA_PATH directory is not deleted beforehand.
# NOTE(review): presumably this adds to whatever is already persisted
# there — confirm before re-running.
db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding,
    persist_directory=CHROMA_PATH,
    collection_metadata={"hnsw:space": "cosine"},
)
db.persist()

Starting chunking
Split 19 documents into 24 chunks.
len docs 24
page content 
 4. Kortikosteroid Penggunaan kortikosteroid dapat menekan pertumbuhan pada anak dengan PGK karena efeknya menekan hormon pertumbuhan dengan cara menstimulasi somatostatin hipotalamus, penurunan regulasi reseptor hepatic hormon pertumbuhan, inhibisi aktivitas biologis insulin-like growth factor (IGF), gangguan keseimbangan profil IGF binding protein serum, dan efek supresi langsung terhadap faktor pertumbuhan secara lokal dan hasil produksi jaringan matriks. Penghentian kortikosteroid atau modifikasi dosis kortikosteroid penting untuk tercapainya pertumbuhan yang optimal.' a. Pada anak dengan PGK derajat 2-5 dan 5D, bikarbonat serum perlu dikoreksi hingga normal. Penyakit ginjal kronik menyebabkan asidosis metabolik yang berdampak terhadap gangguan pertumbuhan. Pada keadaan asidosis, hormon tiroid akan turun dan respons IGF terhadap recombinant human growth hormone (rhGH) juga menumpul, sehingga terjadi resi

In [22]:
# Filter, chunk and append the current `clean_documents` to the existing store.
# NOTE(review): appears to rely on the load/clean cells having been re-run
# for a different PDF since `db` was created — confirm the execution order.
clean_metadata = filter_complex_metadata(clean_documents)
chunks = split_text(documents=clean_metadata)
db.add_documents(documents=chunks)


Starting chunking
Split 16 documents into 28 chunks.
len docs 28
page content 
 beratnya kurang dari 10 kg. Sebuah studi Eropa multisenter telah menemukan bahwa prioritas pertama PD adalah usia anak (30%), pilihan orang tua (27%), jarak dari unit (14%), pilihan pasien (11%), kondisi sosial (7%), dan tidak dapat melakukan satu mode (6%). Memilih mode dialisis, baik HD maupun PD, untuk anak membutuhkan pertimbangan, di antaranya faktor-faktor lain dari kemungkinan dampak dari salah satu mode dialisis pada pemeliharaan sisa fungsi ginjal (RRF), karena dampaknya yang spesifik pada hasil pasien. Meskipun tidak ada konsensus umum, peritoneal dialisis dikaitkan dengan kurangnya risiko kehilangan RRF.' Indikasi absolut memulai dialisis pada anak meliputi anuria, gangguan elektrolit berat, gangguan neurologis pada gagal ginjal (misalnya ensefalopati, kejang, foot drop), perikarditis, diatesis perdarahan, mual berulang, gejala uremia, volume berlebihan, atau kegagalan pertumbuhan meskipun terapi

['55a6f805-fe56-11ee-99bf-b6b024e70617',
 '55a6f806-fe56-11ee-972f-b6b024e70617',
 '55a6f807-fe56-11ee-9703-b6b024e70617',
 '55a6f808-fe56-11ee-bb4d-b6b024e70617',
 '55a6f809-fe56-11ee-93d7-b6b024e70617',
 '55a6f80a-fe56-11ee-80be-b6b024e70617',
 '55a6f80b-fe56-11ee-a49f-b6b024e70617',
 '55a6f80c-fe56-11ee-a967-b6b024e70617',
 '55a6f80d-fe56-11ee-925b-b6b024e70617',
 '55a6f80e-fe56-11ee-9ae3-b6b024e70617',
 '55a6f80f-fe56-11ee-8634-b6b024e70617',
 '55a6f810-fe56-11ee-ab12-b6b024e70617',
 '55a6f811-fe56-11ee-b9ce-b6b024e70617',
 '55a6f812-fe56-11ee-b30f-b6b024e70617',
 '55a6f813-fe56-11ee-93a5-b6b024e70617',
 '55a6f814-fe56-11ee-b2e0-b6b024e70617',
 '55a6f815-fe56-11ee-ac16-b6b024e70617',
 '55a6f816-fe56-11ee-a229-b6b024e70617',
 '55a6f817-fe56-11ee-b658-b6b024e70617',
 '55a6f818-fe56-11ee-a1e7-b6b024e70617',
 '55a6f819-fe56-11ee-bdb7-b6b024e70617',
 '55a6f81a-fe56-11ee-a05a-b6b024e70617',
 '55a6f81b-fe56-11ee-9c08-b6b024e70617',
 '55a6f81c-fe56-11ee-b3ae-b6b024e70617',
 '55a6f81d-fe56-