In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.readers.docling import DoclingReader
from llama_index.node_parser.docling import DoclingNodeParser
import os
from dotenv import load_dotenv
from pathlib import Path
from tempfile import mkdtemp

load_dotenv()

# tenant_name = "wdm_55647e3f100b46dd9c21ea0a67d20458"
# tmp_dir_path = Path(mkdtemp())
# EMBED_MODEL = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5", device="cuda")
# GEN_MODEL = OpenAI(model="gpt-4o-mini", temperature=0.0)
# Settings.embed_model = EMBED_MODEL
# Settings.llm = GEN_MODEL
# SOURCE = "/datadrive/man.pham/ownllm/data/doc_1.pdf"  # Docling Technical Report
# QUERY = "what is the objective?"

  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'llama_index.readers.docling'

In [2]:
# %pip install -q --progress-bar off --no-warn-conflicts llama-index-core llama-index-readers-docling llama-index-node-parser-docling llama-index-embeddings-huggingface llama-index-llms-huggingface-api llama-index-readers-file python-dotenv

## Read documents

In [3]:
reader = DoclingReader(export_type=DoclingReader.ExportType.JSON)
node_parser = DoclingNodeParser()

dir_reader = SimpleDirectoryReader(input_dir="/datadrive/man.pham/ownllm/data")
docs = dir_reader.load_data(show_progress=True)
for doc in docs:
    doc.metadata["tenant_id"] = tenant_name
docs

Loading files: 100%|██████████| 1/1 [00:00<00:00, 10.55file/s]


[Document(id_='97ce50a5-c83f-45eb-940c-542e3d9b9c0c', embedding=None, metadata={'page_label': '1', 'file_name': 'doc_1.pdf', 'file_path': '/datadrive/man.pham/ownllm/data/doc_1.pdf', 'file_type': 'application/pdf', 'file_size': 533405, 'creation_date': '2025-04-09', 'last_modified_date': '2025-04-09', 'tenant_id': 'wdm_55647e3f100b46dd9c21ea0a67d20458'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='kyanon.digital\nx \n kyanon.digital\nAgenda\n', path=None, url=None, mimetype=None), image_resource=None, audio_resource=None, video_resource=None, text_template='{metadata_str}\n\n{content}'),
 Document(id_='7c0cd99d-7653-4ca7-8f3d-eb8f52d26

## Ingestion pipeline

In [None]:
import weaviate
client = weaviate.connect_to_custom(
    http_host= os.environ.get("WEAVIATE_HOST"), #"10.100.224.34",  # URL only, no http prefix
    http_port= os.environ.get("WEAVIATE_HOST_PORT"), #"8080",
    http_secure=False ,   # Set to True if https
    grpc_host= os.environ.get("WEAVIATE_GPC_URL"),#  "10.100.224.34",
    grpc_port= os.environ.get("WEAVIATE_GPC_URL_PORT"),#"50051",      # Default is 50051, WCD uses 443
    grpc_secure=False,   # Edit as needed
    skip_init_checks=True,
)

In [27]:
multi_collection = client.collections.get("LlamaIndex_da9b7bb158e64c93bea491df09894psd")
list(multi_collection.iterator())

[Object(uuid=_WeaviateUUIDInt('0c50a396-69b6-4a54-aa7a-f754e3710f38'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'file_size': 533405.0, 'last_modified_date': '2025-04-09', '_node_type': 'TextNode', 'page_label': '6', 'document_id': UUID('2e7a9b38-2427-4d1b-96a2-e2e2dc2a0b1c'), 'relationships': None, 'ref_doc_id': '2e7a9b38-2427-4d1b-96a2-e2e2dc2a0b1c', 'doc_id': UUID('2e7a9b38-2427-4d1b-96a2-e2e2dc2a0b1c'), 'text': 'kyanon.digital\nx \n kyanon.digital\n2. Key Terminology\nSLA (Service Level Agreement) Introduction\n● Definition: An SLA (Service Level Agreement) is a formal agreement between a service provider and a client that outlines the \nexpected level of service, including response and resolution times for incidents.\n● Importance: SLAs ensure accountability and set clear expectations, which help in maintaining client trust.\n● Implications: T

In [6]:
from llama_index.core.ingestion import IngestionPipeline, DocstoreStrategy
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core.node_parser import SentenceSplitter
import weaviate
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.weaviate import WeaviateVectorStore
from llama_index.core.response.notebook_utils import display_response

vector_store = WeaviateVectorStore(
    weaviate_client=client, index_name="LlamaIndex_da9b7bb158e64c93bea491df09894psd"
)

chunk_size = 512
pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=chunk_size),
        Settings.embed_model,
    ],
    # docstore=docstore,
    vector_store=vector_store,
    # UPSERTS: This strategy is used to handle both the insertion of new documents and the updating of existing ones. When a document is added to the document store, the UPSERTS strategy checks if the document already exists based on its ID. If the document does not exist, it is inserted. If it does exist and the hash of the document has changed, the document is updated in the document store. This strategy ensures that the document store always contains the most recent version of each document.
    # DUPLICATES_ONLY: This strategy focuses solely on handling duplicates. It checks if a document with the same hash already exists in the document store. If a duplicate is found, the document is not added again. This strategy is useful when you want to avoid storing multiple copies of the same document without updating existing ones.
    # docstore_strategy=DocstoreStrategy.UPSERTS,
)

In [7]:
nodes = pipeline.run(documents=docs, show_progress=True)
nodes

Parsing nodes:   0%|          | 0/14 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/14 [00:00<?, ?it/s]

[TextNode(id_='67ac5bfe-efb9-4df4-bf56-7eb40605e88e', embedding=[-0.0029776329174637794, -0.050927113741636276, 0.03266538307070732, 0.001820327597670257, 0.008053546771407127, 0.012163792736828327, -0.07446815818548203, 0.052395641803741455, -0.05341937020421028, -0.013888059183955193, 0.044329386204481125, -0.019611764699220657, 0.0033959837164729834, -0.0033237668685615063, 0.02782275155186653, -0.02197062410414219, 0.01803778111934662, 0.013190139085054398, 0.031204091385006905, 0.02856704406440258, 0.0703495591878891, -0.01759287714958191, -0.025800203904509544, -0.039017632603645325, -0.0005172728560864925, 0.035232312977313995, 0.015844084322452545, -0.042305298149585724, -0.04908576235175133, -0.13578476011753082, 0.034741681069135666, -0.04161180183291435, 0.04697808250784874, -0.022638343274593353, 0.08193653076887131, -0.010274510830640793, -0.00925475638359785, 0.040264926850795746, 0.03894476220011711, 0.007483027875423431, -0.01021498255431652, -0.01649228110909462, 0.009

## Create Index

In [41]:
# Load data into the vector store
from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterOperator,
)


filters = MetadataFilters(
    filters=[
        MetadataFilter(key="tenant_id", operator=FilterOperator.EQ, value="1"),
    ]
)

query = "What is the agenda?"

index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store, embed_model=Settings.embed_model
)
retriever = index.as_retriever(filters=filters)
retrived_nodes = retriever.retrieve(query)
# Query with tenant-specific metadata filter
retrived_nodes

[]