In [3]:
def data_loading(
    input_dir: str,
    output_langchain_document: bool = False,
):
    """Load data from the given directory."""
    from pdf2image import convert_from_path
    from llama_index.readers.file import ImageReader
    from llama_index.core import SimpleDirectoryReader
    import os
    import glob

    # Incase there are multiple pdfs in the directory, convert them to images then store under folder images.
    images_root = os.path.join(input_dir, "images")
    os.makedirs(images_root, exist_ok=True)

    pdf_paths = glob.glob(os.path.join(input_dir, "*.pdf"))
    for pdf_path in pdf_paths:
        file_name = os.path.splitext(os.path.basename(pdf_path))[0]
        print(f"Processing {pdf_path}")
        images = convert_from_path(pdf_path=pdf_path)
        for idx, img in enumerate(images):
            img.resize((850, 1100))
            img.save(os.path.join(images_root, f"{file_name}_{idx}.png"))

    # Load all data.
    # Configure the ImageReader to keep images as base64
    image_reader = ImageReader(keep_image=True, parse_text=False)

    # Define a custom file extractor for image files
    file_extractor = {
        ".jpg": image_reader,
        ".png": image_reader,
        ".jpeg": image_reader,
    }
    # Create the MultiModal index
    documents = SimpleDirectoryReader(
        input_dir=input_dir, recursive=True, file_extractor=file_extractor
    ).load_data()

    if output_langchain_document:
        documents = [doc.to_langchain_format() for doc in documents]

    return documents

In [4]:
tenant_name = "wdm_55647e3f100b46dd9c21ea0a67d20458"
documents = data_loading(input_dir="/datadrive/man.pham/data/pdfs")

Processing /datadrive/man.pham/data/pdfs/images_pdf.pdf


In [10]:
import requests
import httpx
import time
import json
from typing import List
from pydantic import BaseModel
from llama_index.core import (
    Document,
)
from tasks import ETL_pipeline
from llama_index.core.schema import (
    ImageNode,
    TextNode,
)


class IngestionRequest(BaseModel):
    documents: List[Document]
    collection_name: str
    tenant: str


url = "http://10.10.193.73:5000/etl_task/"

headers = {"content-type": "application/json"}
data = IngestionRequest.model_validate(
    {
        "documents": documents,
        "collection_name": "LlamaIndex_da9b7bb158e64c93bea491df09894psd",
        "tenant": tenant_name,
    }
).model_dump_json()


def send_request(_):
    try:
        response = requests.request("POST", url, headers=headers, data=data)
        return response.status_code
    except Exception as e:
        return f"Error: {e}"


from multiprocessing import Pool

n_time = 1
start_time = time.time()
with Pool(processes=10) as pool:  # you can adjust the number of processes
    results = pool.map(send_request, range(n_time))


end_time = time.time()

total_time = end_time - start_time
throughput = n_time / total_time  # requests per second

print(f"Total Time: {total_time:.2f} seconds")
print(f"Throughput: {throughput:.2f} requests per second")

Total Time: 3.17 seconds
Throughput: 0.32 requests per second


In [7]:
documents[0].metadata

{'file_path': '/datadrive/man.pham/data/pdfs/images/images_pdf_0.png',
 'file_name': 'images_pdf_0.png',
 'file_type': 'image/png',
 'file_size': 933446,
 'creation_date': '2025-04-21',
 'last_modified_date': '2025-04-21'}

In [None]:
# from llama_index.vector_stores.weaviate import WeaviateVectorStore
# import weaviate
# import os

# client = weaviate.connect_to_custom(
#     http_host=os.environ.get(
#         "WEAVIATE_HOST"
#     ),  # "10.100.224.34",  # URL only, no http prefix
#     http_port=os.environ.get("WEAVIATE_HOST_PORT"),  # "8080",
#     http_secure=False,  # Set to True if https
#     grpc_host=os.environ.get("WEAVIATE_GPC_URL"),  #  "10.100.224.34",
#     grpc_port=os.environ.get(
#         "WEAVIATE_GPC_URL_PORT"
#     ),  # "50051",      # Default is 50051, WCD uses 443
#     grpc_secure=False,  # Edit as needed
#     skip_init_checks=True,
# )
# vector_store = WeaviateVectorStore(
#     weaviate_client=client, index_name="LlamaIndex_da9b7bb158e64c93bea491df09894psd"
# )

            Please make sure to close the connection using `client.close()`.


In [None]:
# vector_store.delete_index()