In [3]:
from llmsherpa.readers import LayoutPDFReader
import os
from transformers import AutoTokenizer
from tqdm.auto import tqdm
import hashlib
from pymilvus.model.hybrid import BGEM3EmbeddingFunction
from transformers import AutoTokenizer
import os
import torch
import pandas as pd
import math

model_id = "mistralai/Mixtral-8x7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Initialize the embedding function to run on GPU
bge_m3_ef = BGEM3EmbeddingFunction(
    model_name='BAAI/bge-m3',  # Specify the model name
    device='cuda:0',  # Specify the device to use (GPU in this case)
    use_fp16=False  # Specify whether to use fp16. Set to `False` if `device` is `cpu`.
)

llmsherpa_api_url = "http://localhost:5010/api/parseDocument?renderFormat=all"
pdf_reader = LayoutPDFReader(llmsherpa_api_url)

  from .autonotebook import tqdm as notebook_tqdm
Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 145635.56it/s]
  colbert_state_dict = torch.load(os.path.join(model_dir, 'colbert_linear.pt'), map_location='cpu')
  sparse_state_dict = torch.load(os.path.join(model_dir, 'sparse_linear.pt'), map_location='cpu')


In [4]:
def iterate_files(folder_path):
    # Walk through the directory and its subdirectories
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            # Full path to the file
            file_path = os.path.join(root, file)
            yield file_path  # Yield file path for further use
            
# create the length function
def tiktoken_len(text):
    tokens = tokenizer(
        text,
        return_tensors="pt"
    )["input_ids"][0]
    return len(tokens)

# def add_chunk_to_list(self, chunk_list, uid, idx, main_header, chunk_text, file_name, file_path, metadata, docs_embeddings):
def add_chunk_to_list(chunk_list, uid, main_header, chunk_text, file_name, file_path, metadata, docs_embeddings):
    """Helper function to add the current chunk to the chunk list."""
    if main_header is not None and chunk_text:
        chunk_list.append({
            'document_id': uid,
            # 'chunk_id': f'{idx}',
            'chunk_text': chunk_text,
            'chunk_token_length': tiktoken_len(chunk_text),
            'file_name': file_name,
            'file_path': file_path,
            'chunk_name': main_header,
            'dense_vector': docs_embeddings['dense'][0],
            'sparse_vector': docs_embeddings["sparse"],
            'metadata': metadata
        })

def extract_filename(file_path):
    return os.path.basename(file_path)

def divide_text_into_pieces(text, num_pieces):
    # Calculate the length of each piece
    piece_length = len(text) // num_pieces
    pieces = [text[i:i+piece_length] for i in range(0, len(text), piece_length)]
    
    # Ensure the pieces list contains exactly 20 elements
    if len(pieces) > num_pieces:
        # Merge the last few smaller pieces if necessary
        last_piece = ''.join(pieces[num_pieces-1:])
        pieces = pieces[:num_pieces-1] + [last_piece]
    
    return pieces

In [38]:
# Initialize hash object for unique ID generation
m = hashlib.md5()

# folder_path = "../artifacts/fls/"
folder_path = "../artifacts/SAP_files/"
files = iterate_files(folder_path)

link = ""
final_list = []
for file_path in tqdm(files, desc="Getting File path"):
    print(f"Processing file: {file_path}")    
    # Get File Name
    file_name = extract_filename(file_path)    
    # Read PDF
    doc = pdf_reader.read_pdf(file_path)
        
    if file_path != link:
        m.update(file_path.encode('utf-8'))
        uid = m.hexdigest()
        link = file_path
        # print(uid)
        
    main_header = None
    last_header = None
    chunk_text = ""
    metadata = {}
    chunk_list = []
    for idx, chunk in enumerate(tqdm(doc.chunks(), desc="Chunks Processing")):
        lines = chunk.to_context_text().split("\n", 1)  # Split at the first newline
        header = lines[0].strip()  # The first line is the header
        split_headers = [part.strip() for part in header.split('>')]
        chunk_bbox = chunk.bbox
        chunk_page_idx = chunk.page_idx
        
        if tiktoken_len(chunk.to_context_text())>1000:
            # print("tiktoken_len(chunk.to_context_text())>1000")
            pieces = math.ceil(tiktoken_len(chunk.to_context_text()) / 1000)
            metadata[header] = {"page_index": chunk_page_idx, "bbox": chunk_bbox}                
            main_header = split_headers[0]                
            last_header = split_headers[-1]                                
            divided_chunk = divide_text_into_pieces(chunk.to_context_text(), pieces)
            for x in divided_chunk:
                chunk_text = x
                docs_embeddings = bge_m3_ef([chunk_text])
                add_chunk_to_list(chunk_list,uid, main_header, chunk_text, file_name, file_path, metadata, docs_embeddings)
                # self.add_chunk_to_list(chunk_list,uid, chunk_idx, main_header, chunk_text, file_name, file_path, metadata, docs_embeddings)
                # chunk_idx+= 1
            metadata = {}
        else:
            if main_header == split_headers[0] and tiktoken_len(chunk_text + chunk.to_context_text())>1000 : # or len(chunk_text + chunk.to_context_text())>65000:
                # print("tiktoken_len(chunk_text + chunk.to_context_text())>1000")
                # print(tiktoken_len(chunk_text + chunk.to_context_text()))
                # print(tiktoken_len(chunk.to_context_text()))
                
                # print(split_headers[-1])
                docs_embeddings = bge_m3_ef([chunk_text])
                # self.add_chunk_to_list(chunk_list,uid, chunk_idx, main_header, chunk_text, file_name, file_path, metadata, docs_embeddings)
                add_chunk_to_list(chunk_list,uid, main_header, chunk_text, file_name, file_path, metadata, docs_embeddings)
                main_header, last_header, chunk_text = None, None, ""

            if main_header != split_headers[0] :
                if chunk_text:
                    docs_embeddings = bge_m3_ef([chunk_text])
                    # self.add_chunk_to_list(chunk_list,uid, chunk_idx, main_header, chunk_text, file_name, file_path, metadata, docs_embeddings)
                    add_chunk_to_list(chunk_list,uid, main_header, chunk_text, file_name, file_path, metadata, docs_embeddings)
                metadata = {}
                main_header = split_headers[0]
                chunk_text = chunk.to_context_text()
                last_header = split_headers[-1]
            elif last_header != split_headers[-1]:
                # When the last header changes but the main header remains the same
                last_header = split_headers[-1]
                chunk_text += "\n\n" + split_headers[-1] + "\n" + lines[1] + "\n"
            else:
                # If it's the same main header and last header, append the content
                chunk_text += lines[1]
            metadata[header] = {"page_index":chunk_page_idx,"bbox":chunk_bbox}
        
    # # Add the last chunk after the loop ends
    # docs_embeddings = bge_m3_ef([chunk_text])
    # add_chunk_to_list(chunk_list,uid, main_header, chunk_text, file_name, file_path, metadata, docs_embeddings)
    if chunk_text:
        docs_embeddings = bge_m3_ef([chunk_text])
        add_chunk_to_list(chunk_list, uid, main_header, chunk_text, file_name, file_path, metadata, docs_embeddings)
        # self.add_chunk_to_list(chunk_list, uid, chunk_idx, main_header, chunk_text, file_name, file_path, metadata, docs_embeddings)
        
    # Convert the chunk_list to a Pandas DataFrame
    chunk_df = pd.DataFrame(chunk_list)
    # chunk_df.sort_values(by='chunk_id', ascending=False)
    chunk_df['chunk_id'] = chunk_df.index
    # Convert DataFrame to a list of dictionaries
    chunk_list = chunk_df.to_dict(orient='records')
    final_list.extend(chunk_list)
    # chunk_list =[]
    

Getting File path: 0it [00:00, ?it/s]

Processing file: ../artifacts/SAP_files/01.Basic_Function_SD.pdf


Chunks Processing: 100%|██████████| 1469/1469 [00:19<00:00, 74.84it/s]
Getting File path: 1it [00:31, 31.45s/it]

Processing file: ../artifacts/SAP_files/02.Sales.pdf


Chunks Processing: 100%|██████████| 2280/2280 [00:33<00:00, 69.00it/s]
Getting File path: 2it [01:24, 44.19s/it]

Processing file: ../artifacts/SAP_files/03.Pricing_Condition.pdf


Chunks Processing: 100%|██████████| 1182/1182 [00:15<00:00, 75.98it/s]
Getting File path: 3it [01:47, 34.62s/it]

Processing file: ../artifacts/SAP_files/04.Availability_Check.pdf


Chunks Processing: 100%|██████████| 344/344 [00:06<00:00, 54.32it/s]
Getting File path: 4it [01:59, 25.45s/it]

Processing file: ../artifacts/SAP_files/04.McGrawHill-SD.pdf


Chunks Processing: 100%|██████████| 2045/2045 [01:01<00:00, 33.23it/s]
Getting File path: 5it [03:38, 52.22s/it]

Processing file: ../artifacts/SAP_files/05.Scheduling_Agreement.pdf


Chunks Processing: 100%|██████████| 1033/1033 [00:14<00:00, 73.51it/s]
Getting File path: 6it [04:01, 42.23s/it]

Processing file: ../artifacts/SAP_files/06.Shipping.pdf


Chunks Processing: 100%|██████████| 1917/1917 [00:28<00:00, 68.45it/s]
Getting File path: 7it [04:51, 44.71s/it]

Processing file: ../artifacts/SAP_files/07.Transportation.pdf


Chunks Processing: 100%|██████████| 2338/2338 [00:38<00:00, 61.28it/s]
Getting File path: 8it [05:56, 51.29s/it]

Processing file: ../artifacts/SAP_files/07.Trasportation_46C.pdf


Chunks Processing: 100%|██████████| 2995/2995 [00:50<00:00, 59.14it/s]
Getting File path: 9it [07:16, 60.15s/it]

Processing file: ../artifacts/SAP_files/08.Billing_process.pdf


Chunks Processing: 100%|██████████| 1446/1446 [00:17<00:00, 81.80it/s]
Getting File path: 10it [07:43, 49.99s/it]

Processing file: ../artifacts/SAP_files/09.Billing_Plan.pdf


Chunks Processing: 100%|██████████| 176/176 [00:02<00:00, 76.08it/s]
Getting File path: 11it [07:47, 35.70s/it]

Processing file: ../artifacts/SAP_files/11.Output_Determination.pdf


Chunks Processing: 100%|██████████| 138/138 [00:02<00:00, 66.04it/s]
Getting File path: 12it [07:50, 25.76s/it]

Processing file: ../artifacts/SAP_files/Credit_Management.pdf


Chunks Processing: 100%|██████████| 394/394 [00:06<00:00, 65.36it/s]
Getting File path: 13it [08:00, 20.95s/it]

Processing file: ../artifacts/SAP_files/Customer_Service.pdf


Chunks Processing: 100%|██████████| 405/405 [00:06<00:00, 64.47it/s]
Getting File path: 14it [08:09, 17.55s/it]

Processing file: ../artifacts/SAP_files/Documentary_Payments.pdf


Chunks Processing: 100%|██████████| 225/225 [00:03<00:00, 66.53it/s]
Getting File path: 15it [08:15, 13.90s/it]

Processing file: ../artifacts/SAP_files/EDI-IDOC_SD.pdf


Chunks Processing: 100%|██████████| 39/39 [00:00<00:00, 50.75it/s]
Getting File path: 16it [08:16, 10.09s/it]

Processing file: ../artifacts/SAP_files/Foreign Trade.pdf


Chunks Processing: 100%|██████████| 46/46 [00:00<00:00, 76.60it/s]
Getting File path: 17it [08:18,  7.64s/it]

Processing file: ../artifacts/SAP_files/IACs_Foreign_Trade.pdf


Chunks Processing: 100%|██████████| 39/39 [00:00<00:00, 82.51it/s]
Getting File path: 18it [08:19,  5.57s/it]

Processing file: ../artifacts/SAP_files/Legal_Control.pdf


Chunks Processing: 100%|██████████| 271/271 [00:03<00:00, 81.62it/s]
Getting File path: 19it [08:23,  5.36s/it]

Processing file: ../artifacts/SAP_files/Periodic_Declarations_SD-FT-GOV.pdf


Chunks Processing: 100%|██████████| 420/420 [00:05<00:00, 72.61it/s]
Getting File path: 20it [08:37,  7.87s/it]

Processing file: ../artifacts/SAP_files/Periodic_Declarations.pdf


Chunks Processing: 100%|██████████| 359/359 [00:05<00:00, 61.84it/s]
Getting File path: 21it [08:46,  8.17s/it]

Processing file: ../artifacts/SAP_files/Preferences.pdf


Chunks Processing: 100%|██████████| 309/309 [00:04<00:00, 68.39it/s] 
Getting File path: 22it [08:56,  8.64s/it]

Processing file: ../artifacts/SAP_files/SCM610_EN_Delivery_Processes.pdf


Chunks Processing: 100%|██████████| 1563/1563 [00:20<00:00, 75.42it/s]
Getting File path: 23it [09:49, 22.07s/it]

Processing file: ../artifacts/SAP_files/SD-FT-PRO.pdf


Chunks Processing: 100%|██████████| 335/335 [00:04<00:00, 73.98it/s]
Getting File path: 24it [09:57, 17.66s/it]

Processing file: ../artifacts/SAP_files/Import.pdf


Chunks Processing: 100%|██████████| 56/56 [00:00<00:00, 78.80it/s]
Getting File path: 25it [09:58, 23.92s/it]


In [41]:
len(final_list_df)

2210

In [None]:
from pymilvus import MilvusClient

# 1. Set up a Milvus client
client = MilvusClient(
    uri="http://localhost:19530/dolphinai_db"
)
client.list_collections() 

# Load a collection
client.load_collection(
    # collection_name="dolphinai_collection",
    collection_name="hybrid_sap_collection",
    replica_number=1 # Number of replicas to create on query nodes. Max value is 1 for Milvus Standalone, and no greater than `queryNode.replicas` for Milvus Cluster.
)

res = client.get_load_state(
    collection_name="hybrid_sap_collection"
    # collection_name="dolphinai_collection"
)

print(res)

res = client.insert(
    collection_name="hybrid_sap_collection",
    # collection_name="dolphinai_collection",
    data=final_list
)

print(res)