In [1]:
import os
import sys
from pathlib import Path
import logging
import numpy as np
from transformers import AutoTokenizer
from tqdm.notebook import tqdm

In [2]:
notebook_dir = Path.cwd()
project_root = notebook_dir.parent  # If notebooks are in langechain/notebooks

# Add project root to Python path for imports
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

print(f"Project root: {project_root}")

Project root: /app


In [3]:
from src.data_pipline.json_loader.json_loader_epo import EPOPatentLoader
from config.json_loader_config import PatentLoaderConfig
from src.embeddings.text_Chunker import TextChunker, ChunkerConfig


In [4]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.schema import Document

In [5]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("patent_pipeline")

In [6]:
data_dir = Path(project_root) / "data" / "parsed" / "EPO"
model_name = "sentence-transformers/all-mpnet-base-v2"
vector_db_dir = Path(project_root) / "data" / "vector_db"
vector_db_dir.mkdir(parents=True, exist_ok=True)

In [7]:
loader_config = PatentLoaderConfig(
    max_files=200,
    
    verbose=False,
    epo_dir=data_dir,
    include_abstract=True,
    include_claims=True,
    include_description=True
)
patent_loader = EPOPatentLoader(config=loader_config)
print("‚úÖ Patent loader initialized")

‚úÖ Patent loader initialized


In [17]:
json_files = patent_loader.load_raw_json_files()

2025-06-19 22:34:34,057 - src.data_pipline.json_loader.json_loader_patent - INFO - Limited to 200 files out of 200 total files
2025-06-19 22:34:34,057 - src.data_pipline.json_loader.json_loader_patent - INFO - Limited to 200 files out of 200 total files
2025-06-19 22:34:34,058 - src.data_pipline.json_loader.json_loader_patent - INFO - Found 200 JSON files in /app/data/parsed/EPO
2025-06-19 22:34:34,058 - src.data_pipline.json_loader.json_loader_patent - INFO - Found 200 JSON files in /app/data/parsed/EPO
2025-06-19 22:34:34,098 - src.data_pipline.json_loader.json_loader_patent - INFO - Successfully loaded 200 raw JSON files
2025-06-19 22:34:34,098 - src.data_pipline.json_loader.json_loader_patent - INFO - Successfully loaded 200 raw JSON files


In [18]:
documents  = patent_loader.extract_document_components()

2025-06-19 22:34:35,034 - src.data_pipline.json_loader.json_loader_patent - INFO - Limited to 200 files out of 200 total files
2025-06-19 22:34:35,034 - src.data_pipline.json_loader.json_loader_patent - INFO - Limited to 200 files out of 200 total files
2025-06-19 22:34:35,035 - src.data_pipline.json_loader.json_loader_patent - INFO - Found 200 JSON files in /app/data/parsed/EPO
2025-06-19 22:34:35,035 - src.data_pipline.json_loader.json_loader_patent - INFO - Found 200 JSON files in /app/data/parsed/EPO
2025-06-19 22:34:35,169 - src.data_pipline.json_loader.json_loader_patent - INFO - Successfully loaded 199 documents from 200 files
2025-06-19 22:34:35,169 - src.data_pipline.json_loader.json_loader_patent - INFO - Successfully loaded 199 documents from 200 files


In [19]:
json_files = []

In [20]:
print(f"‚úÖ Loaded {len(documents)} document segments from {len(json_files)} patent files")


‚úÖ Loaded 199 document segments from 0 patent files


In [21]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
chunker_config = ChunkerConfig(
    base_chunk_size=300,  # Adjust based on your needs
    chunk_overlap=30,
    min_chunk_size=100
)
chunker = TextChunker(tokenizer, config=chunker_config)
print("‚úÖ TextChunker initialized")

‚úÖ TextChunker initialized


In [22]:
print("‚úÇÔ∏è Chunking documents...")
chunked_docs = []

for docs in tqdm(documents, desc="Chunking"):
    for doc in tqdm(docs, desc="Chunking"):
        try:
            text = doc.page_content
            metadata = doc.metadata
            
            # Use your TextChunker to split the document
            chunks = chunker.chunk_text(text, use_semantic=True)
            
            # Create LangChain Document objects from chunks
            for i, chunk_text in enumerate(chunks):
                # Copy metadata and add chunk information
                chunk_metadata = metadata.copy()
                chunk_metadata.update({
                    "chunk_index": i,
                    "total_chunks": len(chunks)
                })
                
                chunked_docs.append(Document(
                    page_content=chunk_text,
                    metadata=chunk_metadata
                ))
        except Exception as e:
            logger.error(f"Error chunking document: {str(e)}")

‚úÇÔ∏è Chunking documents...


Chunking:   0%|          | 0/199 [00:00<?, ?it/s]

Chunking:   0%|          | 0/180 [00:00<?, ?it/s]

2025-06-19 22:34:42,333 - src.embeddings.text_Chunker - INFO - Input text has 5 tokens
2025-06-19 22:34:42,334 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,335 - src.embeddings.text_Chunker - INFO - Input text has 33 tokens
2025-06-19 22:34:42,336 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,336 - src.embeddings.text_Chunker - INFO - Input text has 33 tokens
2025-06-19 22:34:42,339 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,340 - src.embeddings.text_Chunker - INFO - Input text has 116 tokens
2025-06-19 22:34:42,341 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,342 - src.embeddings.text_Chunker - INFO - Input text has 37 tokens
2025-06-19 22:34:42,342 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,343 - src.embeddings.text_Chunker - INFO - Input text has 88 tokens
2025-06-19 22:34:42,344 - s

Chunking:   0%|          | 0/16 [00:00<?, ?it/s]

2025-06-19 22:34:42,508 - src.embeddings.text_Chunker - INFO - Input text has 5 tokens
2025-06-19 22:34:42,508 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,509 - src.embeddings.text_Chunker - INFO - Input text has 479 tokens
2025-06-19 22:34:42,510 - src.embeddings.text_Chunker - INFO - Using chunk size: 240 tokens, overlap: 30
2025-06-19 22:34:42,513 - src.embeddings.text_Chunker - INFO - Input text has 149 tokens
2025-06-19 22:34:42,513 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,514 - src.embeddings.text_Chunker - INFO - Input text has 98 tokens
2025-06-19 22:34:42,514 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,514 - src.embeddings.text_Chunker - INFO - Input text has 142 tokens
2025-06-19 22:34:42,514 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,515 - src.embeddings.text_Chunker - INFO - Input text has 148 tokens
2025-06-

Chunking:   0%|          | 0/2 [00:00<?, ?it/s]

2025-06-19 22:34:42,530 - src.embeddings.text_Chunker - INFO - Input text has 15 tokens
2025-06-19 22:34:42,531 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,531 - src.embeddings.text_Chunker - INFO - Input text has 72 tokens
2025-06-19 22:34:42,531 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/2 [00:00<?, ?it/s]

2025-06-19 22:34:42,538 - src.embeddings.text_Chunker - INFO - Input text has 12 tokens
2025-06-19 22:34:42,538 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,538 - src.embeddings.text_Chunker - INFO - Input text has 255 tokens
2025-06-19 22:34:42,539 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/16 [00:00<?, ?it/s]

2025-06-19 22:34:42,546 - src.embeddings.text_Chunker - INFO - Input text has 13 tokens
2025-06-19 22:34:42,547 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,547 - src.embeddings.text_Chunker - INFO - Input text has 219 tokens
2025-06-19 22:34:42,548 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,548 - src.embeddings.text_Chunker - INFO - Input text has 172 tokens
2025-06-19 22:34:42,548 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,549 - src.embeddings.text_Chunker - INFO - Input text has 18 tokens
2025-06-19 22:34:42,549 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,550 - src.embeddings.text_Chunker - INFO - Input text has 19 tokens
2025-06-19 22:34:42,550 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,550 - src.embeddings.text_Chunker - INFO - Input text has 194 tokens
2025-06-19 22:34:42,551 

Chunking:   0%|          | 0/111 [00:00<?, ?it/s]

2025-06-19 22:34:42,565 - src.embeddings.text_Chunker - INFO - Input text has 20 tokens
2025-06-19 22:34:42,566 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,566 - src.embeddings.text_Chunker - INFO - Input text has 56 tokens
2025-06-19 22:34:42,567 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,567 - src.embeddings.text_Chunker - INFO - Input text has 70 tokens
2025-06-19 22:34:42,568 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,568 - src.embeddings.text_Chunker - INFO - Input text has 103 tokens
2025-06-19 22:34:42,569 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,569 - src.embeddings.text_Chunker - INFO - Input text has 134 tokens
2025-06-19 22:34:42,569 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,570 - src.embeddings.text_Chunker - INFO - Input text has 105 tokens
2025-06-19 22:34:42,570 

Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-19 22:34:42,675 - src.embeddings.text_Chunker - INFO - Input text has 7 tokens
2025-06-19 22:34:42,675 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-19 22:34:42,681 - src.embeddings.text_Chunker - INFO - Input text has 2 tokens
2025-06-19 22:34:42,681 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-19 22:34:42,688 - src.embeddings.text_Chunker - INFO - Input text has 5 tokens
2025-06-19 22:34:42,689 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-19 22:34:42,693 - src.embeddings.text_Chunker - INFO - Input text has 12 tokens
2025-06-19 22:34:42,693 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-19 22:34:42,699 - src.embeddings.text_Chunker - INFO - Input text has 10 tokens
2025-06-19 22:34:42,699 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-19 22:34:42,704 - src.embeddings.text_Chunker - INFO - Input text has 6 tokens
2025-06-19 22:34:42,704 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-19 22:34:42,709 - src.embeddings.text_Chunker - INFO - Input text has 6 tokens
2025-06-19 22:34:42,709 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-19 22:34:42,714 - src.embeddings.text_Chunker - INFO - Input text has 5 tokens
2025-06-19 22:34:42,714 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-19 22:34:42,719 - src.embeddings.text_Chunker - INFO - Input text has 13 tokens
2025-06-19 22:34:42,720 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-19 22:34:42,725 - src.embeddings.text_Chunker - INFO - Input text has 16 tokens
2025-06-19 22:34:42,726 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-19 22:34:42,729 - src.embeddings.text_Chunker - INFO - Input text has 19 tokens
2025-06-19 22:34:42,729 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-19 22:34:42,736 - src.embeddings.text_Chunker - INFO - Input text has 7 tokens
2025-06-19 22:34:42,736 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-19 22:34:42,742 - src.embeddings.text_Chunker - INFO - Input text has 11 tokens
2025-06-19 22:34:42,742 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-19 22:34:42,747 - src.embeddings.text_Chunker - INFO - Input text has 33 tokens
2025-06-19 22:34:42,747 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-19 22:34:42,753 - src.embeddings.text_Chunker - INFO - Input text has 13 tokens
2025-06-19 22:34:42,754 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-19 22:34:42,759 - src.embeddings.text_Chunker - INFO - Input text has 7 tokens
2025-06-19 22:34:42,759 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-19 22:34:42,764 - src.embeddings.text_Chunker - INFO - Input text has 9 tokens
2025-06-19 22:34:42,764 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-19 22:34:42,770 - src.embeddings.text_Chunker - INFO - Input text has 2 tokens
2025-06-19 22:34:42,771 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-19 22:34:42,778 - src.embeddings.text_Chunker - INFO - Input text has 28 tokens
2025-06-19 22:34:42,778 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-19 22:34:42,783 - src.embeddings.text_Chunker - INFO - Input text has 16 tokens
2025-06-19 22:34:42,783 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-19 22:34:42,788 - src.embeddings.text_Chunker - INFO - Input text has 12 tokens
2025-06-19 22:34:42,789 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-19 22:34:42,795 - src.embeddings.text_Chunker - INFO - Input text has 10 tokens
2025-06-19 22:34:42,795 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-19 22:34:42,801 - src.embeddings.text_Chunker - INFO - Input text has 11 tokens
2025-06-19 22:34:42,802 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-19 22:34:42,807 - src.embeddings.text_Chunker - INFO - Input text has 10 tokens
2025-06-19 22:34:42,808 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-19 22:34:42,813 - src.embeddings.text_Chunker - INFO - Input text has 7 tokens
2025-06-19 22:34:42,814 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-19 22:34:42,819 - src.embeddings.text_Chunker - INFO - Input text has 8 tokens
2025-06-19 22:34:42,819 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-19 22:34:42,824 - src.embeddings.text_Chunker - INFO - Input text has 9 tokens
2025-06-19 22:34:42,824 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-19 22:34:42,829 - src.embeddings.text_Chunker - INFO - Input text has 7 tokens
2025-06-19 22:34:42,830 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-19 22:34:42,835 - src.embeddings.text_Chunker - INFO - Input text has 19 tokens
2025-06-19 22:34:42,835 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/73 [00:00<?, ?it/s]

2025-06-19 22:34:42,841 - src.embeddings.text_Chunker - INFO - Input text has 20 tokens
2025-06-19 22:34:42,841 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,841 - src.embeddings.text_Chunker - INFO - Input text has 40 tokens
2025-06-19 22:34:42,842 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,842 - src.embeddings.text_Chunker - INFO - Input text has 341 tokens
2025-06-19 22:34:42,843 - src.embeddings.text_Chunker - INFO - Using chunk size: 171 tokens, overlap: 30
2025-06-19 22:34:42,845 - src.embeddings.text_Chunker - INFO - Input text has 382 tokens
2025-06-19 22:34:42,845 - src.embeddings.text_Chunker - INFO - Using chunk size: 191 tokens, overlap: 30
2025-06-19 22:34:42,847 - src.embeddings.text_Chunker - INFO - Input text has 122 tokens
2025-06-19 22:34:42,848 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,848 - src.embeddings.text_Chunker - INFO - Input text has 30

Chunking:   0%|          | 0/15 [00:00<?, ?it/s]

2025-06-19 22:34:42,927 - src.embeddings.text_Chunker - INFO - Input text has 7 tokens
2025-06-19 22:34:42,927 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,927 - src.embeddings.text_Chunker - INFO - Input text has 321 tokens
2025-06-19 22:34:42,928 - src.embeddings.text_Chunker - INFO - Using chunk size: 161 tokens, overlap: 30
2025-06-19 22:34:42,929 - src.embeddings.text_Chunker - INFO - Input text has 16 tokens
2025-06-19 22:34:42,929 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,929 - src.embeddings.text_Chunker - INFO - Input text has 18 tokens
2025-06-19 22:34:42,930 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,930 - src.embeddings.text_Chunker - INFO - Input text has 24 tokens
2025-06-19 22:34:42,931 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,931 - src.embeddings.text_Chunker - INFO - Input text has 24 tokens
2025-06-19 

Chunking:   0%|          | 0/17 [00:00<?, ?it/s]

2025-06-19 22:34:42,944 - src.embeddings.text_Chunker - INFO - Input text has 8 tokens
2025-06-19 22:34:42,944 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,945 - src.embeddings.text_Chunker - INFO - Input text has 188 tokens
2025-06-19 22:34:42,946 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,946 - src.embeddings.text_Chunker - INFO - Input text has 71 tokens
2025-06-19 22:34:42,946 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,947 - src.embeddings.text_Chunker - INFO - Input text has 83 tokens
2025-06-19 22:34:42,947 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,947 - src.embeddings.text_Chunker - INFO - Input text has 90 tokens
2025-06-19 22:34:42,948 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,948 - src.embeddings.text_Chunker - INFO - Input text has 50 tokens
2025-06-19 22:34:42,949 - s

Chunking:   0%|          | 0/92 [00:00<?, ?it/s]

2025-06-19 22:34:42,963 - src.embeddings.text_Chunker - INFO - Input text has 8 tokens
2025-06-19 22:34:42,963 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,964 - src.embeddings.text_Chunker - INFO - Input text has 46 tokens
2025-06-19 22:34:42,965 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,965 - src.embeddings.text_Chunker - INFO - Input text has 86 tokens
2025-06-19 22:34:42,965 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,965 - src.embeddings.text_Chunker - INFO - Input text has 33 tokens
2025-06-19 22:34:42,966 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,966 - src.embeddings.text_Chunker - INFO - Input text has 3 tokens
2025-06-19 22:34:42,967 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:42,967 - src.embeddings.text_Chunker - INFO - Input text has 67 tokens
2025-06-19 22:34:42,967 - src

Chunking:   0%|          | 0/79 [00:00<?, ?it/s]

2025-06-19 22:34:43,131 - src.embeddings.text_Chunker - INFO - Input text has 15 tokens
2025-06-19 22:34:43,131 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,133 - src.embeddings.text_Chunker - INFO - Input text has 258 tokens
2025-06-19 22:34:43,133 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,133 - src.embeddings.text_Chunker - INFO - Input text has 51 tokens
2025-06-19 22:34:43,133 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,134 - src.embeddings.text_Chunker - INFO - Input text has 74 tokens
2025-06-19 22:34:43,134 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,135 - src.embeddings.text_Chunker - INFO - Input text has 57 tokens
2025-06-19 22:34:43,135 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,136 - src.embeddings.text_Chunker - INFO - Input text has 39 tokens
2025-06-19 22:34:43,136 - 

Chunking:   0%|          | 0/74 [00:00<?, ?it/s]

2025-06-19 22:34:43,192 - src.embeddings.text_Chunker - INFO - Input text has 6 tokens
2025-06-19 22:34:43,193 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,193 - src.embeddings.text_Chunker - INFO - Input text has 58 tokens
2025-06-19 22:34:43,194 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,195 - src.embeddings.text_Chunker - INFO - Input text has 77 tokens
2025-06-19 22:34:43,195 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,195 - src.embeddings.text_Chunker - INFO - Input text has 92 tokens
2025-06-19 22:34:43,195 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,196 - src.embeddings.text_Chunker - INFO - Input text has 140 tokens
2025-06-19 22:34:43,196 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,197 - src.embeddings.text_Chunker - INFO - Input text has 130 tokens
2025-06-19 22:34:43,197 - 

Chunking:   0%|          | 0/96 [00:00<?, ?it/s]

2025-06-19 22:34:43,254 - src.embeddings.text_Chunker - INFO - Input text has 6 tokens
2025-06-19 22:34:43,254 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,254 - src.embeddings.text_Chunker - INFO - Input text has 24 tokens
2025-06-19 22:34:43,255 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,255 - src.embeddings.text_Chunker - INFO - Input text has 68 tokens
2025-06-19 22:34:43,255 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,256 - src.embeddings.text_Chunker - INFO - Input text has 86 tokens
2025-06-19 22:34:43,256 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,257 - src.embeddings.text_Chunker - INFO - Input text has 67 tokens
2025-06-19 22:34:43,257 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,257 - src.embeddings.text_Chunker - INFO - Input text has 45 tokens
2025-06-19 22:34:43,257 - sr

Chunking:   0%|          | 0/94 [00:00<?, ?it/s]

2025-06-19 22:34:43,331 - src.embeddings.text_Chunker - INFO - Input text has 3 tokens
2025-06-19 22:34:43,331 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,333 - src.embeddings.text_Chunker - INFO - Input text has 117 tokens
2025-06-19 22:34:43,333 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,334 - src.embeddings.text_Chunker - INFO - Input text has 122 tokens
2025-06-19 22:34:43,334 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,335 - src.embeddings.text_Chunker - INFO - Input text has 59 tokens
2025-06-19 22:34:43,335 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,336 - src.embeddings.text_Chunker - INFO - Input text has 46 tokens
2025-06-19 22:34:43,336 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,337 - src.embeddings.text_Chunker - INFO - Input text has 7 tokens
2025-06-19 22:34:43,337 - s

Chunking:   0%|          | 0/124 [00:00<?, ?it/s]

2025-06-19 22:34:43,421 - src.embeddings.text_Chunker - INFO - Input text has 16 tokens
2025-06-19 22:34:43,421 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,422 - src.embeddings.text_Chunker - INFO - Input text has 102 tokens
2025-06-19 22:34:43,422 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,423 - src.embeddings.text_Chunker - INFO - Input text has 203 tokens
2025-06-19 22:34:43,423 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,424 - src.embeddings.text_Chunker - INFO - Input text has 52 tokens
2025-06-19 22:34:43,424 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,424 - src.embeddings.text_Chunker - INFO - Input text has 31 tokens
2025-06-19 22:34:43,425 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,425 - src.embeddings.text_Chunker - INFO - Input text has 51 tokens
2025-06-19 22:34:43,426 -

Chunking:   0%|          | 0/133 [00:00<?, ?it/s]

2025-06-19 22:34:43,529 - src.embeddings.text_Chunker - INFO - Input text has 7 tokens
2025-06-19 22:34:43,529 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,530 - src.embeddings.text_Chunker - INFO - Input text has 205 tokens
2025-06-19 22:34:43,530 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,531 - src.embeddings.text_Chunker - INFO - Input text has 162 tokens
2025-06-19 22:34:43,531 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,532 - src.embeddings.text_Chunker - INFO - Input text has 251 tokens
2025-06-19 22:34:43,532 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,533 - src.embeddings.text_Chunker - INFO - Input text has 39 tokens
2025-06-19 22:34:43,533 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,534 - src.embeddings.text_Chunker - INFO - Input text has 17 tokens
2025-06-19 22:34:43,534 -

Chunking:   0%|          | 0/74 [00:00<?, ?it/s]

2025-06-19 22:34:43,658 - src.embeddings.text_Chunker - INFO - Input text has 10 tokens
2025-06-19 22:34:43,658 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,659 - src.embeddings.text_Chunker - INFO - Input text has 38 tokens
2025-06-19 22:34:43,659 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,661 - src.embeddings.text_Chunker - INFO - Input text has 128 tokens
2025-06-19 22:34:43,661 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,662 - src.embeddings.text_Chunker - INFO - Input text has 165 tokens
2025-06-19 22:34:43,662 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,663 - src.embeddings.text_Chunker - INFO - Input text has 72 tokens
2025-06-19 22:34:43,664 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,664 - src.embeddings.text_Chunker - INFO - Input text has 41 tokens
2025-06-19 22:34:43,664 -

Chunking:   0%|          | 0/6 [00:00<?, ?it/s]

2025-06-19 22:34:43,727 - src.embeddings.text_Chunker - INFO - Input text has 18 tokens
2025-06-19 22:34:43,727 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,728 - src.embeddings.text_Chunker - INFO - Input text has 495 tokens
2025-06-19 22:34:43,728 - src.embeddings.text_Chunker - INFO - Using chunk size: 248 tokens, overlap: 30
2025-06-19 22:34:43,731 - src.embeddings.text_Chunker - INFO - Input text has 10 tokens
2025-06-19 22:34:43,731 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,731 - src.embeddings.text_Chunker - INFO - Input text has 10 tokens
2025-06-19 22:34:43,732 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,732 - src.embeddings.text_Chunker - INFO - Input text has 10 tokens
2025-06-19 22:34:43,732 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,733 - src.embeddings.text_Chunker - INFO - Input text has 8 tokens
2025-06-19 

Chunking:   0%|          | 0/62 [00:00<?, ?it/s]

2025-06-19 22:34:43,739 - src.embeddings.text_Chunker - INFO - Input text has 4 tokens
2025-06-19 22:34:43,739 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,740 - src.embeddings.text_Chunker - INFO - Input text has 40 tokens
2025-06-19 22:34:43,740 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,741 - src.embeddings.text_Chunker - INFO - Input text has 96 tokens
2025-06-19 22:34:43,741 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,742 - src.embeddings.text_Chunker - INFO - Input text has 99 tokens
2025-06-19 22:34:43,742 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,743 - src.embeddings.text_Chunker - INFO - Input text has 57 tokens
2025-06-19 22:34:43,743 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:43,744 - src.embeddings.text_Chunker - INFO - Input text has 65 tokens
2025-06-19 22:34:43,744 - sr

Chunking:   0%|          | 0/118 [00:00<?, ?it/s]

2025-06-19 22:34:45,736 - src.embeddings.text_Chunker - INFO - Input text has 9 tokens
2025-06-19 22:34:45,737 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:45,737 - src.embeddings.text_Chunker - INFO - Input text has 255 tokens
2025-06-19 22:34:45,737 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:45,738 - src.embeddings.text_Chunker - INFO - Input text has 384 tokens
2025-06-19 22:34:45,739 - src.embeddings.text_Chunker - INFO - Using chunk size: 192 tokens, overlap: 30
2025-06-19 22:34:45,741 - src.embeddings.text_Chunker - INFO - Input text has 156 tokens
2025-06-19 22:34:45,742 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:45,743 - src.embeddings.text_Chunker - INFO - Input text has 235 tokens
2025-06-19 22:34:45,743 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:45,744 - src.embeddings.text_Chunker - INFO - Input text has 273 tokens
2025-06

Chunking:   0%|          | 0/57 [00:00<?, ?it/s]

2025-06-19 22:34:45,853 - src.embeddings.text_Chunker - INFO - Input text has 12 tokens
2025-06-19 22:34:45,854 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:45,854 - src.embeddings.text_Chunker - INFO - Input text has 32 tokens
2025-06-19 22:34:45,854 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:45,854 - src.embeddings.text_Chunker - INFO - Input text has 59 tokens
2025-06-19 22:34:45,855 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:45,855 - src.embeddings.text_Chunker - INFO - Input text has 58 tokens
2025-06-19 22:34:45,855 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:45,855 - src.embeddings.text_Chunker - INFO - Input text has 14 tokens
2025-06-19 22:34:45,855 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:45,856 - src.embeddings.text_Chunker - INFO - Input text has 22 tokens
2025-06-19 22:34:45,856 - s

Chunking:   0%|          | 0/198 [00:00<?, ?it/s]

2025-06-19 22:34:45,902 - src.embeddings.text_Chunker - INFO - Input text has 11 tokens
2025-06-19 22:34:45,902 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:45,903 - src.embeddings.text_Chunker - INFO - Input text has 23 tokens
2025-06-19 22:34:45,903 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:45,903 - src.embeddings.text_Chunker - INFO - Input text has 106 tokens
2025-06-19 22:34:45,904 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:45,904 - src.embeddings.text_Chunker - INFO - Input text has 58 tokens
2025-06-19 22:34:45,904 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:45,904 - src.embeddings.text_Chunker - INFO - Input text has 69 tokens
2025-06-19 22:34:45,904 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:45,906 - src.embeddings.text_Chunker - INFO - Input text has 34 tokens
2025-06-19 22:34:45,907 - 

Chunking:   0%|          | 0/58 [00:00<?, ?it/s]

2025-06-19 22:34:46,052 - src.embeddings.text_Chunker - INFO - Input text has 8 tokens
2025-06-19 22:34:46,053 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,053 - src.embeddings.text_Chunker - INFO - Input text has 81 tokens
2025-06-19 22:34:46,054 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,054 - src.embeddings.text_Chunker - INFO - Input text has 145 tokens
2025-06-19 22:34:46,055 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,055 - src.embeddings.text_Chunker - INFO - Input text has 45 tokens
2025-06-19 22:34:46,055 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,056 - src.embeddings.text_Chunker - INFO - Input text has 43 tokens
2025-06-19 22:34:46,056 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,057 - src.embeddings.text_Chunker - INFO - Input text has 28 tokens
2025-06-19 22:34:46,057 - s

Chunking:   0%|          | 0/63 [00:00<?, ?it/s]

2025-06-19 22:34:46,105 - src.embeddings.text_Chunker - INFO - Input text has 22 tokens
2025-06-19 22:34:46,105 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,105 - src.embeddings.text_Chunker - INFO - Input text has 62 tokens
2025-06-19 22:34:46,105 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,106 - src.embeddings.text_Chunker - INFO - Input text has 62 tokens
2025-06-19 22:34:46,106 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,106 - src.embeddings.text_Chunker - INFO - Input text has 45 tokens
2025-06-19 22:34:46,107 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,107 - src.embeddings.text_Chunker - INFO - Input text has 37 tokens
2025-06-19 22:34:46,107 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,107 - src.embeddings.text_Chunker - INFO - Input text has 56 tokens
2025-06-19 22:34:46,107 - s

Chunking:   0%|          | 0/138 [00:00<?, ?it/s]

2025-06-19 22:34:46,157 - src.embeddings.text_Chunker - INFO - Input text has 8 tokens
2025-06-19 22:34:46,158 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,158 - src.embeddings.text_Chunker - INFO - Input text has 22 tokens
2025-06-19 22:34:46,158 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,158 - src.embeddings.text_Chunker - INFO - Input text has 24 tokens
2025-06-19 22:34:46,159 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,159 - src.embeddings.text_Chunker - INFO - Input text has 45 tokens
2025-06-19 22:34:46,159 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,160 - src.embeddings.text_Chunker - INFO - Input text has 118 tokens
2025-06-19 22:34:46,160 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,161 - src.embeddings.text_Chunker - INFO - Input text has 89 tokens
2025-06-19 22:34:46,161 - s

Chunking:   0%|          | 0/327 [00:00<?, ?it/s]

2025-06-19 22:34:46,282 - src.embeddings.text_Chunker - INFO - Input text has 4 tokens
2025-06-19 22:34:46,282 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,283 - src.embeddings.text_Chunker - INFO - Input text has 20 tokens
2025-06-19 22:34:46,283 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,284 - src.embeddings.text_Chunker - INFO - Input text has 172 tokens
2025-06-19 22:34:46,285 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,286 - src.embeddings.text_Chunker - INFO - Input text has 124 tokens
2025-06-19 22:34:46,287 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,289 - src.embeddings.text_Chunker - INFO - Input text has 33 tokens
2025-06-19 22:34:46,290 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,291 - src.embeddings.text_Chunker - INFO - Input text has 54 tokens
2025-06-19 22:34:46,292 - 

Chunking:   0%|          | 0/14 [00:00<?, ?it/s]

2025-06-19 22:34:46,568 - src.embeddings.text_Chunker - INFO - Input text has 8 tokens
2025-06-19 22:34:46,568 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,569 - src.embeddings.text_Chunker - INFO - Input text has 211 tokens
2025-06-19 22:34:46,569 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,569 - src.embeddings.text_Chunker - INFO - Input text has 16 tokens
2025-06-19 22:34:46,570 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,570 - src.embeddings.text_Chunker - INFO - Input text has 21 tokens
2025-06-19 22:34:46,570 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,571 - src.embeddings.text_Chunker - INFO - Input text has 21 tokens
2025-06-19 22:34:46,571 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,572 - src.embeddings.text_Chunker - INFO - Input text has 21 tokens
2025-06-19 22:34:46,572 - s

Chunking:   0%|          | 0/33 [00:00<?, ?it/s]

2025-06-19 22:34:46,584 - src.embeddings.text_Chunker - INFO - Input text has 8 tokens
2025-06-19 22:34:46,585 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,585 - src.embeddings.text_Chunker - INFO - Input text has 56 tokens
2025-06-19 22:34:46,585 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,586 - src.embeddings.text_Chunker - INFO - Input text has 199 tokens
2025-06-19 22:34:46,586 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,586 - src.embeddings.text_Chunker - INFO - Input text has 139 tokens
2025-06-19 22:34:46,587 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,587 - src.embeddings.text_Chunker - INFO - Input text has 43 tokens
2025-06-19 22:34:46,587 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,588 - src.embeddings.text_Chunker - INFO - Input text has 299 tokens
2025-06-19 22:34:46,590 -

Chunking:   0%|          | 0/177 [00:00<?, ?it/s]

2025-06-19 22:34:46,695 - src.embeddings.text_Chunker - INFO - Input text has 15 tokens
2025-06-19 22:34:46,696 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,696 - src.embeddings.text_Chunker - INFO - Input text has 45 tokens
2025-06-19 22:34:46,696 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,697 - src.embeddings.text_Chunker - INFO - Input text has 63 tokens
2025-06-19 22:34:46,697 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,698 - src.embeddings.text_Chunker - INFO - Input text has 109 tokens
2025-06-19 22:34:46,698 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,698 - src.embeddings.text_Chunker - INFO - Input text has 2 tokens
2025-06-19 22:34:46,699 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,699 - src.embeddings.text_Chunker - INFO - Input text has 82 tokens
2025-06-19 22:34:46,699 - s

Chunking:   0%|          | 0/70 [00:00<?, ?it/s]

2025-06-19 22:34:46,865 - src.embeddings.text_Chunker - INFO - Input text has 14 tokens
2025-06-19 22:34:46,865 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,867 - src.embeddings.text_Chunker - INFO - Input text has 32 tokens
2025-06-19 22:34:46,867 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,868 - src.embeddings.text_Chunker - INFO - Input text has 122 tokens
2025-06-19 22:34:46,868 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,869 - src.embeddings.text_Chunker - INFO - Input text has 114 tokens
2025-06-19 22:34:46,869 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,869 - src.embeddings.text_Chunker - INFO - Input text has 59 tokens
2025-06-19 22:34:46,870 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,870 - src.embeddings.text_Chunker - INFO - Input text has 96 tokens
2025-06-19 22:34:46,870 -

Chunking:   0%|          | 0/175 [00:00<?, ?it/s]

2025-06-19 22:34:46,924 - src.embeddings.text_Chunker - INFO - Input text has 5 tokens
2025-06-19 22:34:46,925 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,926 - src.embeddings.text_Chunker - INFO - Input text has 18 tokens
2025-06-19 22:34:46,926 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,927 - src.embeddings.text_Chunker - INFO - Input text has 131 tokens
2025-06-19 22:34:46,927 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,927 - src.embeddings.text_Chunker - INFO - Input text has 154 tokens
2025-06-19 22:34:46,928 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,929 - src.embeddings.text_Chunker - INFO - Input text has 124 tokens
2025-06-19 22:34:46,929 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:46,930 - src.embeddings.text_Chunker - INFO - Input text has 29 tokens
2025-06-19 22:34:46,930 -

Chunking:   0%|          | 0/112 [00:00<?, ?it/s]

2025-06-19 22:34:48,897 - src.embeddings.text_Chunker - INFO - Input text has 10 tokens
2025-06-19 22:34:48,898 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:48,898 - src.embeddings.text_Chunker - INFO - Input text has 81 tokens
2025-06-19 22:34:48,898 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:48,898 - src.embeddings.text_Chunker - INFO - Input text has 95 tokens
2025-06-19 22:34:48,899 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:48,899 - src.embeddings.text_Chunker - INFO - Input text has 37 tokens
2025-06-19 22:34:48,899 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:48,899 - src.embeddings.text_Chunker - INFO - Input text has 49 tokens
2025-06-19 22:34:48,899 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:48,900 - src.embeddings.text_Chunker - INFO - Input text has 82 tokens
2025-06-19 22:34:48,900 - s

Chunking:   0%|          | 0/77 [00:00<?, ?it/s]

2025-06-19 22:34:48,990 - src.embeddings.text_Chunker - INFO - Input text has 14 tokens
2025-06-19 22:34:48,990 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:48,991 - src.embeddings.text_Chunker - INFO - Input text has 75 tokens
2025-06-19 22:34:48,991 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:48,992 - src.embeddings.text_Chunker - INFO - Input text has 199 tokens
2025-06-19 22:34:48,992 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:48,993 - src.embeddings.text_Chunker - INFO - Input text has 187 tokens
2025-06-19 22:34:48,993 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:48,993 - src.embeddings.text_Chunker - INFO - Input text has 29 tokens
2025-06-19 22:34:48,994 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:48,995 - src.embeddings.text_Chunker - INFO - Input text has 109 tokens
2025-06-19 22:34:48,995 

Chunking:   0%|          | 0/10 [00:00<?, ?it/s]

2025-06-19 22:34:49,060 - src.embeddings.text_Chunker - INFO - Input text has 8 tokens
2025-06-19 22:34:49,060 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,061 - src.embeddings.text_Chunker - INFO - Input text has 687 tokens
2025-06-19 22:34:49,063 - src.embeddings.text_Chunker - INFO - Using chunk size: 229 tokens, overlap: 30
2025-06-19 22:34:49,065 - src.embeddings.text_Chunker - INFO - Input text has 17 tokens
2025-06-19 22:34:49,065 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,066 - src.embeddings.text_Chunker - INFO - Input text has 25 tokens
2025-06-19 22:34:49,066 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,067 - src.embeddings.text_Chunker - INFO - Input text has 25 tokens
2025-06-19 22:34:49,067 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,068 - src.embeddings.text_Chunker - INFO - Input text has 25 tokens
2025-06-19 

Chunking:   0%|          | 0/10 [00:00<?, ?it/s]

2025-06-19 22:34:49,077 - src.embeddings.text_Chunker - INFO - Input text has 6 tokens
2025-06-19 22:34:49,077 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,078 - src.embeddings.text_Chunker - INFO - Input text has 96 tokens
2025-06-19 22:34:49,078 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,079 - src.embeddings.text_Chunker - INFO - Input text has 9 tokens
2025-06-19 22:34:49,079 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,079 - src.embeddings.text_Chunker - INFO - Input text has 9 tokens
2025-06-19 22:34:49,080 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,080 - src.embeddings.text_Chunker - INFO - Input text has 9 tokens
2025-06-19 22:34:49,080 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,081 - src.embeddings.text_Chunker - INFO - Input text has 9 tokens
2025-06-19 22:34:49,081 - src.em

Chunking:   0%|          | 0/71 [00:00<?, ?it/s]

2025-06-19 22:34:49,090 - src.embeddings.text_Chunker - INFO - Input text has 23 tokens
2025-06-19 22:34:49,090 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,090 - src.embeddings.text_Chunker - INFO - Input text has 209 tokens
2025-06-19 22:34:49,092 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,092 - src.embeddings.text_Chunker - INFO - Input text has 134 tokens
2025-06-19 22:34:49,092 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,093 - src.embeddings.text_Chunker - INFO - Input text has 157 tokens
2025-06-19 22:34:49,093 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,093 - src.embeddings.text_Chunker - INFO - Input text has 155 tokens
2025-06-19 22:34:49,094 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,095 - src.embeddings.text_Chunker - INFO - Input text has 137 tokens
2025-06-19 22:34:49,09

Chunking:   0%|          | 0/113 [00:00<?, ?it/s]

2025-06-19 22:34:49,151 - src.embeddings.text_Chunker - INFO - Input text has 20 tokens
2025-06-19 22:34:49,152 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,152 - src.embeddings.text_Chunker - INFO - Input text has 28 tokens
2025-06-19 22:34:49,152 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,153 - src.embeddings.text_Chunker - INFO - Input text has 159 tokens
2025-06-19 22:34:49,153 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,153 - src.embeddings.text_Chunker - INFO - Input text has 116 tokens
2025-06-19 22:34:49,154 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,154 - src.embeddings.text_Chunker - INFO - Input text has 91 tokens
2025-06-19 22:34:49,155 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,155 - src.embeddings.text_Chunker - INFO - Input text has 52 tokens
2025-06-19 22:34:49,155 -

Chunking:   0%|          | 0/29 [00:00<?, ?it/s]

2025-06-19 22:34:49,248 - src.embeddings.text_Chunker - INFO - Input text has 19 tokens
2025-06-19 22:34:49,250 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,251 - src.embeddings.text_Chunker - INFO - Input text has 104 tokens
2025-06-19 22:34:49,251 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,251 - src.embeddings.text_Chunker - INFO - Input text has 88 tokens
2025-06-19 22:34:49,251 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,252 - src.embeddings.text_Chunker - INFO - Input text has 49 tokens
2025-06-19 22:34:49,252 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,252 - src.embeddings.text_Chunker - INFO - Input text has 43 tokens
2025-06-19 22:34:49,254 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,254 - src.embeddings.text_Chunker - INFO - Input text has 50 tokens
2025-06-19 22:34:49,254 - 

Chunking:   0%|          | 0/102 [00:00<?, ?it/s]

2025-06-19 22:34:49,277 - src.embeddings.text_Chunker - INFO - Input text has 3 tokens
2025-06-19 22:34:49,278 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,278 - src.embeddings.text_Chunker - INFO - Input text has 29 tokens
2025-06-19 22:34:49,279 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,279 - src.embeddings.text_Chunker - INFO - Input text has 37 tokens
2025-06-19 22:34:49,279 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,280 - src.embeddings.text_Chunker - INFO - Input text has 90 tokens
2025-06-19 22:34:49,280 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,281 - src.embeddings.text_Chunker - INFO - Input text has 39 tokens
2025-06-19 22:34:49,281 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,282 - src.embeddings.text_Chunker - INFO - Input text has 61 tokens
2025-06-19 22:34:49,282 - sr

Chunking:   0%|          | 0/16 [00:00<?, ?it/s]

2025-06-19 22:34:49,356 - src.embeddings.text_Chunker - INFO - Input text has 16 tokens
2025-06-19 22:34:49,357 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,357 - src.embeddings.text_Chunker - INFO - Input text has 225 tokens
2025-06-19 22:34:49,359 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,359 - src.embeddings.text_Chunker - INFO - Input text has 9 tokens
2025-06-19 22:34:49,359 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,360 - src.embeddings.text_Chunker - INFO - Input text has 11 tokens
2025-06-19 22:34:49,360 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,360 - src.embeddings.text_Chunker - INFO - Input text has 15 tokens
2025-06-19 22:34:49,360 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,361 - src.embeddings.text_Chunker - INFO - Input text has 15 tokens
2025-06-19 22:34:49,361 - s

Chunking:   0%|          | 0/125 [00:00<?, ?it/s]

2025-06-19 22:34:49,374 - src.embeddings.text_Chunker - INFO - Input text has 7 tokens
2025-06-19 22:34:49,375 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,375 - src.embeddings.text_Chunker - INFO - Input text has 25 tokens
2025-06-19 22:34:49,375 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,376 - src.embeddings.text_Chunker - INFO - Input text has 41 tokens
2025-06-19 22:34:49,376 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,376 - src.embeddings.text_Chunker - INFO - Input text has 31 tokens
2025-06-19 22:34:49,376 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,377 - src.embeddings.text_Chunker - INFO - Input text has 56 tokens
2025-06-19 22:34:49,377 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,378 - src.embeddings.text_Chunker - INFO - Input text has 134 tokens
2025-06-19 22:34:49,378 - s

Chunking:   0%|          | 0/58 [00:00<?, ?it/s]

2025-06-19 22:34:49,475 - src.embeddings.text_Chunker - INFO - Input text has 5 tokens
2025-06-19 22:34:49,475 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,476 - src.embeddings.text_Chunker - INFO - Input text has 69 tokens
2025-06-19 22:34:49,476 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,476 - src.embeddings.text_Chunker - INFO - Input text has 27 tokens
2025-06-19 22:34:49,476 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,477 - src.embeddings.text_Chunker - INFO - Input text has 34 tokens
2025-06-19 22:34:49,477 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,477 - src.embeddings.text_Chunker - INFO - Input text has 68 tokens
2025-06-19 22:34:49,479 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,479 - src.embeddings.text_Chunker - INFO - Input text has 21 tokens
2025-06-19 22:34:49,479 - sr

Chunking:   0%|          | 0/39 [00:00<?, ?it/s]

2025-06-19 22:34:49,526 - src.embeddings.text_Chunker - INFO - Input text has 5 tokens
2025-06-19 22:34:49,526 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,527 - src.embeddings.text_Chunker - INFO - Input text has 17 tokens
2025-06-19 22:34:49,527 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,527 - src.embeddings.text_Chunker - INFO - Input text has 55 tokens
2025-06-19 22:34:49,528 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,528 - src.embeddings.text_Chunker - INFO - Input text has 138 tokens
2025-06-19 22:34:49,529 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,529 - src.embeddings.text_Chunker - INFO - Input text has 125 tokens
2025-06-19 22:34:49,530 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,530 - src.embeddings.text_Chunker - INFO - Input text has 78 tokens
2025-06-19 22:34:49,530 - 

Chunking:   0%|          | 0/152 [00:00<?, ?it/s]

2025-06-19 22:34:49,564 - src.embeddings.text_Chunker - INFO - Input text has 14 tokens
2025-06-19 22:34:49,564 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,564 - src.embeddings.text_Chunker - INFO - Input text has 13 tokens
2025-06-19 22:34:49,565 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,565 - src.embeddings.text_Chunker - INFO - Input text has 62 tokens
2025-06-19 22:34:49,565 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,566 - src.embeddings.text_Chunker - INFO - Input text has 110 tokens
2025-06-19 22:34:49,567 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,567 - src.embeddings.text_Chunker - INFO - Input text has 85 tokens
2025-06-19 22:34:49,567 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,568 - src.embeddings.text_Chunker - INFO - Input text has 86 tokens
2025-06-19 22:34:49,568 - 

Chunking:   0%|          | 0/37 [00:00<?, ?it/s]

2025-06-19 22:34:49,689 - src.embeddings.text_Chunker - INFO - Input text has 3 tokens
2025-06-19 22:34:49,689 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,690 - src.embeddings.text_Chunker - INFO - Input text has 31 tokens
2025-06-19 22:34:49,690 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,691 - src.embeddings.text_Chunker - INFO - Input text has 105 tokens
2025-06-19 22:34:49,691 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,692 - src.embeddings.text_Chunker - INFO - Input text has 34 tokens
2025-06-19 22:34:49,692 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,692 - src.embeddings.text_Chunker - INFO - Input text has 29 tokens
2025-06-19 22:34:49,692 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,693 - src.embeddings.text_Chunker - INFO - Input text has 168 tokens
2025-06-19 22:34:49,694 - 

Chunking:   0%|          | 0/104 [00:00<?, ?it/s]

2025-06-19 22:34:49,721 - src.embeddings.text_Chunker - INFO - Input text has 9 tokens
2025-06-19 22:34:49,722 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,722 - src.embeddings.text_Chunker - INFO - Input text has 46 tokens
2025-06-19 22:34:49,722 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,723 - src.embeddings.text_Chunker - INFO - Input text has 51 tokens
2025-06-19 22:34:49,723 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,723 - src.embeddings.text_Chunker - INFO - Input text has 51 tokens
2025-06-19 22:34:49,723 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,724 - src.embeddings.text_Chunker - INFO - Input text has 55 tokens
2025-06-19 22:34:49,724 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,725 - src.embeddings.text_Chunker - INFO - Input text has 114 tokens
2025-06-19 22:34:49,725 - s

Chunking:   0%|          | 0/126 [00:00<?, ?it/s]

2025-06-19 22:34:49,802 - src.embeddings.text_Chunker - INFO - Input text has 8 tokens
2025-06-19 22:34:49,803 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,805 - src.embeddings.text_Chunker - INFO - Input text has 33 tokens
2025-06-19 22:34:49,805 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,806 - src.embeddings.text_Chunker - INFO - Input text has 142 tokens
2025-06-19 22:34:49,806 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,807 - src.embeddings.text_Chunker - INFO - Input text has 103 tokens
2025-06-19 22:34:49,807 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,807 - src.embeddings.text_Chunker - INFO - Input text has 27 tokens
2025-06-19 22:34:49,808 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,808 - src.embeddings.text_Chunker - INFO - Input text has 48 tokens
2025-06-19 22:34:49,808 - 

Chunking:   0%|          | 0/37 [00:00<?, ?it/s]

2025-06-19 22:34:49,909 - src.embeddings.text_Chunker - INFO - Input text has 16 tokens
2025-06-19 22:34:49,909 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,909 - src.embeddings.text_Chunker - INFO - Input text has 91 tokens
2025-06-19 22:34:49,910 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,910 - src.embeddings.text_Chunker - INFO - Input text has 171 tokens
2025-06-19 22:34:49,911 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,911 - src.embeddings.text_Chunker - INFO - Input text has 122 tokens
2025-06-19 22:34:49,911 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,912 - src.embeddings.text_Chunker - INFO - Input text has 195 tokens
2025-06-19 22:34:49,913 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,913 - src.embeddings.text_Chunker - INFO - Input text has 357 tokens
2025-06-19 22:34:49,914

Chunking:   0%|          | 0/113 [00:00<?, ?it/s]

2025-06-19 22:34:49,951 - src.embeddings.text_Chunker - INFO - Input text has 10 tokens
2025-06-19 22:34:49,951 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,952 - src.embeddings.text_Chunker - INFO - Input text has 27 tokens
2025-06-19 22:34:49,952 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,952 - src.embeddings.text_Chunker - INFO - Input text has 2 tokens
2025-06-19 22:34:49,953 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,953 - src.embeddings.text_Chunker - INFO - Input text has 30 tokens
2025-06-19 22:34:49,953 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,953 - src.embeddings.text_Chunker - INFO - Input text has 19 tokens
2025-06-19 22:34:49,954 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:49,954 - src.embeddings.text_Chunker - INFO - Input text has 3 tokens
2025-06-19 22:34:49,954 - src

Chunking:   0%|          | 0/42 [00:00<?, ?it/s]

2025-06-19 22:34:50,040 - src.embeddings.text_Chunker - INFO - Input text has 12 tokens
2025-06-19 22:34:50,041 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:50,041 - src.embeddings.text_Chunker - INFO - Input text has 13 tokens
2025-06-19 22:34:50,042 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:50,042 - src.embeddings.text_Chunker - INFO - Input text has 102 tokens
2025-06-19 22:34:50,042 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:50,043 - src.embeddings.text_Chunker - INFO - Input text has 192 tokens
2025-06-19 22:34:50,043 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:50,044 - src.embeddings.text_Chunker - INFO - Input text has 89 tokens
2025-06-19 22:34:50,044 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:50,044 - src.embeddings.text_Chunker - INFO - Input text has 84 tokens
2025-06-19 22:34:50,045 -

Chunking:   0%|          | 0/89 [00:00<?, ?it/s]

2025-06-19 22:34:50,081 - src.embeddings.text_Chunker - INFO - Input text has 12 tokens
2025-06-19 22:34:50,081 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:50,082 - src.embeddings.text_Chunker - INFO - Input text has 43 tokens
2025-06-19 22:34:50,082 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:50,084 - src.embeddings.text_Chunker - INFO - Input text has 65 tokens
2025-06-19 22:34:50,084 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:50,085 - src.embeddings.text_Chunker - INFO - Input text has 31 tokens
2025-06-19 22:34:50,085 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:50,085 - src.embeddings.text_Chunker - INFO - Input text has 55 tokens
2025-06-19 22:34:50,085 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:50,086 - src.embeddings.text_Chunker - INFO - Input text has 76 tokens
2025-06-19 22:34:50,086 - s

Chunking:   0%|          | 0/70 [00:00<?, ?it/s]

2025-06-19 22:34:52,268 - src.embeddings.text_Chunker - INFO - Input text has 3 tokens
2025-06-19 22:34:52,268 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,269 - src.embeddings.text_Chunker - INFO - Input text has 48 tokens
2025-06-19 22:34:52,269 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,270 - src.embeddings.text_Chunker - INFO - Input text has 34 tokens
2025-06-19 22:34:52,270 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,270 - src.embeddings.text_Chunker - INFO - Input text has 12 tokens
2025-06-19 22:34:52,271 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,271 - src.embeddings.text_Chunker - INFO - Input text has 88 tokens
2025-06-19 22:34:52,271 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,272 - src.embeddings.text_Chunker - INFO - Input text has 30 tokens
2025-06-19 22:34:52,272 - sr

Chunking:   0%|          | 0/3 [00:00<?, ?it/s]

2025-06-19 22:34:52,344 - src.embeddings.text_Chunker - INFO - Input text has 10 tokens
2025-06-19 22:34:52,344 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,345 - src.embeddings.text_Chunker - INFO - Input text has 150 tokens
2025-06-19 22:34:52,345 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,346 - src.embeddings.text_Chunker - INFO - Input text has 176 tokens
2025-06-19 22:34:52,347 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/40 [00:00<?, ?it/s]

2025-06-19 22:34:52,352 - src.embeddings.text_Chunker - INFO - Input text has 7 tokens
2025-06-19 22:34:52,352 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,353 - src.embeddings.text_Chunker - INFO - Input text has 31 tokens
2025-06-19 22:34:52,354 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,354 - src.embeddings.text_Chunker - INFO - Input text has 283 tokens
2025-06-19 22:34:52,355 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,356 - src.embeddings.text_Chunker - INFO - Input text has 130 tokens
2025-06-19 22:34:52,356 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,357 - src.embeddings.text_Chunker - INFO - Input text has 21 tokens
2025-06-19 22:34:52,358 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,358 - src.embeddings.text_Chunker - INFO - Input text has 16 tokens
2025-06-19 22:34:52,358 - 

Chunking:   0%|          | 0/14 [00:00<?, ?it/s]

2025-06-19 22:34:52,392 - src.embeddings.text_Chunker - INFO - Input text has 2 tokens
2025-06-19 22:34:52,392 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,393 - src.embeddings.text_Chunker - INFO - Input text has 309 tokens
2025-06-19 22:34:52,393 - src.embeddings.text_Chunker - INFO - Using chunk size: 155 tokens, overlap: 30
2025-06-19 22:34:52,394 - src.embeddings.text_Chunker - INFO - Input text has 12 tokens
2025-06-19 22:34:52,394 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,394 - src.embeddings.text_Chunker - INFO - Input text has 14 tokens
2025-06-19 22:34:52,395 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,395 - src.embeddings.text_Chunker - INFO - Input text has 17 tokens
2025-06-19 22:34:52,397 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,397 - src.embeddings.text_Chunker - INFO - Input text has 12 tokens
2025-06-19 

Chunking:   0%|          | 0/135 [00:00<?, ?it/s]

2025-06-19 22:34:52,408 - src.embeddings.text_Chunker - INFO - Input text has 5 tokens
2025-06-19 22:34:52,408 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,409 - src.embeddings.text_Chunker - INFO - Input text has 25 tokens
2025-06-19 22:34:52,409 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,410 - src.embeddings.text_Chunker - INFO - Input text has 212 tokens
2025-06-19 22:34:52,410 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,411 - src.embeddings.text_Chunker - INFO - Input text has 337 tokens
2025-06-19 22:34:52,411 - src.embeddings.text_Chunker - INFO - Using chunk size: 169 tokens, overlap: 30
2025-06-19 22:34:52,413 - src.embeddings.text_Chunker - INFO - Input text has 63 tokens
2025-06-19 22:34:52,413 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,414 - src.embeddings.text_Chunker - INFO - Input text has 22 tokens
2025-06-19

Chunking:   0%|          | 0/16 [00:00<?, ?it/s]

2025-06-19 22:34:52,522 - src.embeddings.text_Chunker - INFO - Input text has 8 tokens
2025-06-19 22:34:52,522 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,523 - src.embeddings.text_Chunker - INFO - Input text has 127 tokens
2025-06-19 22:34:52,523 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,525 - src.embeddings.text_Chunker - INFO - Input text has 13 tokens
2025-06-19 22:34:52,525 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,525 - src.embeddings.text_Chunker - INFO - Input text has 15 tokens
2025-06-19 22:34:52,525 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,526 - src.embeddings.text_Chunker - INFO - Input text has 21 tokens
2025-06-19 22:34:52,526 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,528 - src.embeddings.text_Chunker - INFO - Input text has 21 tokens
2025-06-19 22:34:52,528 - s

Chunking:   0%|          | 0/215 [00:00<?, ?it/s]

2025-06-19 22:34:52,541 - src.embeddings.text_Chunker - INFO - Input text has 5 tokens
2025-06-19 22:34:52,542 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,543 - src.embeddings.text_Chunker - INFO - Input text has 14 tokens
2025-06-19 22:34:52,543 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,543 - src.embeddings.text_Chunker - INFO - Input text has 25 tokens
2025-06-19 22:34:52,543 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,544 - src.embeddings.text_Chunker - INFO - Input text has 57 tokens
2025-06-19 22:34:52,544 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,545 - src.embeddings.text_Chunker - INFO - Input text has 40 tokens
2025-06-19 22:34:52,545 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,546 - src.embeddings.text_Chunker - INFO - Input text has 16 tokens
2025-06-19 22:34:52,546 - sr

Chunking:   0%|          | 0/56 [00:00<?, ?it/s]

2025-06-19 22:34:52,709 - src.embeddings.text_Chunker - INFO - Input text has 14 tokens
2025-06-19 22:34:52,709 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,710 - src.embeddings.text_Chunker - INFO - Input text has 57 tokens
2025-06-19 22:34:52,710 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,711 - src.embeddings.text_Chunker - INFO - Input text has 183 tokens
2025-06-19 22:34:52,711 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,712 - src.embeddings.text_Chunker - INFO - Input text has 207 tokens
2025-06-19 22:34:52,713 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,713 - src.embeddings.text_Chunker - INFO - Input text has 67 tokens
2025-06-19 22:34:52,713 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,713 - src.embeddings.text_Chunker - INFO - Input text has 68 tokens
2025-06-19 22:34:52,714 -

Chunking:   0%|          | 0/55 [00:00<?, ?it/s]

2025-06-19 22:34:52,759 - src.embeddings.text_Chunker - INFO - Input text has 17 tokens
2025-06-19 22:34:52,759 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,760 - src.embeddings.text_Chunker - INFO - Input text has 17 tokens
2025-06-19 22:34:52,761 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,761 - src.embeddings.text_Chunker - INFO - Input text has 106 tokens
2025-06-19 22:34:52,761 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,762 - src.embeddings.text_Chunker - INFO - Input text has 18 tokens
2025-06-19 22:34:52,762 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,762 - src.embeddings.text_Chunker - INFO - Input text has 221 tokens
2025-06-19 22:34:52,763 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,764 - src.embeddings.text_Chunker - INFO - Input text has 121 tokens
2025-06-19 22:34:52,764 

Chunking:   0%|          | 0/87 [00:00<?, ?it/s]

2025-06-19 22:34:52,814 - src.embeddings.text_Chunker - INFO - Input text has 5 tokens
2025-06-19 22:34:52,814 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,814 - src.embeddings.text_Chunker - INFO - Input text has 42 tokens
2025-06-19 22:34:52,815 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,815 - src.embeddings.text_Chunker - INFO - Input text has 77 tokens
2025-06-19 22:34:52,816 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,816 - src.embeddings.text_Chunker - INFO - Input text has 76 tokens
2025-06-19 22:34:52,816 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,817 - src.embeddings.text_Chunker - INFO - Input text has 59 tokens
2025-06-19 22:34:52,817 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,817 - src.embeddings.text_Chunker - INFO - Input text has 219 tokens
2025-06-19 22:34:52,818 - s

Chunking:   0%|          | 0/42 [00:00<?, ?it/s]

2025-06-19 22:34:52,884 - src.embeddings.text_Chunker - INFO - Input text has 13 tokens
2025-06-19 22:34:52,885 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,887 - src.embeddings.text_Chunker - INFO - Input text has 44 tokens
2025-06-19 22:34:52,887 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,888 - src.embeddings.text_Chunker - INFO - Input text has 98 tokens
2025-06-19 22:34:52,888 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,888 - src.embeddings.text_Chunker - INFO - Input text has 164 tokens
2025-06-19 22:34:52,889 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,889 - src.embeddings.text_Chunker - INFO - Input text has 16 tokens
2025-06-19 22:34:52,889 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,890 - src.embeddings.text_Chunker - INFO - Input text has 190 tokens
2025-06-19 22:34:52,890 -

Chunking:   0%|          | 0/87 [00:00<?, ?it/s]

2025-06-19 22:34:52,925 - src.embeddings.text_Chunker - INFO - Input text has 9 tokens
2025-06-19 22:34:52,926 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,926 - src.embeddings.text_Chunker - INFO - Input text has 27 tokens
2025-06-19 22:34:52,928 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,928 - src.embeddings.text_Chunker - INFO - Input text has 107 tokens
2025-06-19 22:34:52,928 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,929 - src.embeddings.text_Chunker - INFO - Input text has 91 tokens
2025-06-19 22:34:52,929 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,929 - src.embeddings.text_Chunker - INFO - Input text has 102 tokens
2025-06-19 22:34:52,930 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:52,930 - src.embeddings.text_Chunker - INFO - Input text has 28 tokens
2025-06-19 22:34:52,930 - 

Chunking:   0%|          | 0/16 [00:00<?, ?it/s]

2025-06-19 22:34:53,004 - src.embeddings.text_Chunker - INFO - Input text has 6 tokens
2025-06-19 22:34:53,005 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:53,006 - src.embeddings.text_Chunker - INFO - Input text has 355 tokens
2025-06-19 22:34:53,006 - src.embeddings.text_Chunker - INFO - Using chunk size: 178 tokens, overlap: 30
2025-06-19 22:34:53,008 - src.embeddings.text_Chunker - INFO - Input text has 22 tokens
2025-06-19 22:34:53,009 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:53,009 - src.embeddings.text_Chunker - INFO - Input text has 30 tokens
2025-06-19 22:34:53,009 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:53,010 - src.embeddings.text_Chunker - INFO - Input text has 30 tokens
2025-06-19 22:34:53,010 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:53,011 - src.embeddings.text_Chunker - INFO - Input text has 30 tokens
2025-06-19 

Chunking:   0%|          | 0/80 [00:00<?, ?it/s]

2025-06-19 22:34:53,023 - src.embeddings.text_Chunker - INFO - Input text has 10 tokens
2025-06-19 22:34:53,023 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:53,024 - src.embeddings.text_Chunker - INFO - Input text has 84 tokens
2025-06-19 22:34:53,024 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:53,026 - src.embeddings.text_Chunker - INFO - Input text has 258 tokens
2025-06-19 22:34:53,026 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:53,026 - src.embeddings.text_Chunker - INFO - Input text has 82 tokens
2025-06-19 22:34:53,027 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:53,027 - src.embeddings.text_Chunker - INFO - Input text has 149 tokens
2025-06-19 22:34:53,028 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:53,028 - src.embeddings.text_Chunker - INFO - Input text has 17 tokens
2025-06-19 22:34:53,028 -

Chunking:   0%|          | 0/88 [00:00<?, ?it/s]

2025-06-19 22:34:53,090 - src.embeddings.text_Chunker - INFO - Input text has 3 tokens
2025-06-19 22:34:53,090 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:53,091 - src.embeddings.text_Chunker - INFO - Input text has 22 tokens
2025-06-19 22:34:53,091 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:53,091 - src.embeddings.text_Chunker - INFO - Input text has 25 tokens
2025-06-19 22:34:53,091 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:53,092 - src.embeddings.text_Chunker - INFO - Input text has 17 tokens
2025-06-19 22:34:53,092 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:53,092 - src.embeddings.text_Chunker - INFO - Input text has 6 tokens
2025-06-19 22:34:53,092 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:53,093 - src.embeddings.text_Chunker - INFO - Input text has 60 tokens
2025-06-19 22:34:53,093 - src

Chunking:   0%|          | 0/41 [00:00<?, ?it/s]

2025-06-19 22:34:53,160 - src.embeddings.text_Chunker - INFO - Input text has 15 tokens
2025-06-19 22:34:53,160 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:53,161 - src.embeddings.text_Chunker - INFO - Input text has 273 tokens
2025-06-19 22:34:53,161 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:53,162 - src.embeddings.text_Chunker - INFO - Input text has 31 tokens
2025-06-19 22:34:53,163 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:53,163 - src.embeddings.text_Chunker - INFO - Input text has 16 tokens
2025-06-19 22:34:53,163 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:53,164 - src.embeddings.text_Chunker - INFO - Input text has 37 tokens
2025-06-19 22:34:53,164 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:53,164 - src.embeddings.text_Chunker - INFO - Input text has 81 tokens
2025-06-19 22:34:53,165 - 

Chunking:   0%|          | 0/44 [00:00<?, ?it/s]

2025-06-19 22:34:53,204 - src.embeddings.text_Chunker - INFO - Input text has 12 tokens
2025-06-19 22:34:53,204 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:53,204 - src.embeddings.text_Chunker - INFO - Input text has 33 tokens
2025-06-19 22:34:53,205 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:53,205 - src.embeddings.text_Chunker - INFO - Input text has 122 tokens
2025-06-19 22:34:53,206 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:53,206 - src.embeddings.text_Chunker - INFO - Input text has 113 tokens
2025-06-19 22:34:53,207 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:53,207 - src.embeddings.text_Chunker - INFO - Input text has 131 tokens
2025-06-19 22:34:53,207 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:53,208 - src.embeddings.text_Chunker - INFO - Input text has 116 tokens
2025-06-19 22:34:53,208

Chunking:   0%|          | 0/350 [00:00<?, ?it/s]

2025-06-19 22:34:53,248 - src.embeddings.text_Chunker - INFO - Input text has 18 tokens
2025-06-19 22:34:53,248 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:53,248 - src.embeddings.text_Chunker - INFO - Input text has 34 tokens
2025-06-19 22:34:53,249 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:53,250 - src.embeddings.text_Chunker - INFO - Input text has 73 tokens
2025-06-19 22:34:53,250 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:53,250 - src.embeddings.text_Chunker - INFO - Input text has 105 tokens
2025-06-19 22:34:53,252 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:53,252 - src.embeddings.text_Chunker - INFO - Input text has 35 tokens
2025-06-19 22:34:53,252 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 22:34:53,253 - src.embeddings.text_Chunker - INFO - Input text has 56 tokens
2025-06-19 22:34:53,253 - 

In [23]:
documents = []
chunked_docs[0]

Document(metadata={'doc_id': 'EP13899497B9W1', 'language': 'en', 'country': 'EP', 'doc_number': '3084761', 'application_number': '13899497.5', 'publication_date': '20250611', 'ipc_classes': 'G10L  19/038       20130101AFI20170426BHEP, G10L  19/07        20130101ALI20170426BHEP', 'file': 'EP13899497W1B9.xml', 'filePath': '/app/data/parsed/EPO/EPRTBJV2025000024001001/EPW1B9/EP13899497W1B9/EP13899497W1B9.json', 'title': 'AUDIO SIGNAL ENCODER', 'section': 'title', 'chunk_index': 0, 'total_chunks': 1}, page_content='AUDIO SIGNAL ENCODER')

In [24]:
print(f"‚úÖ Created {len(chunked_docs)} chunks from {len(documents)} document segments")


‚úÖ Created 17428 chunks from 0 document segments


In [25]:

embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={"device": "cpu"},
    encode_kwargs={"batch_size": 32, "normalize_embeddings": True}
)
print("‚úÖ LangChain embeddings initialized")


  embeddings = HuggingFaceEmbeddings(
2025-06-19 22:34:55,595 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2


‚úÖ LangChain embeddings initialized


In [26]:
# 7. Create vector store with chunked documents - connect to Docker ChromaDB
print("üíæ Creating vector store using dockerized ChromaDB...")

# Import ChromaDB client directly (needed for Docker connection)
import chromadb
from chromadb.config import Settings

# When running inside Docker
chroma_client = chromadb.HttpClient(
    host="vector_db",  # Docker service name
    port=8000,         # Internal container port
    settings=Settings(anonymized_telemetry=False)
)

üíæ Creating vector store using dockerized ChromaDB...


2025-06-19 22:34:57,467 - httpx - INFO - HTTP Request: GET http://vector_db:8000/api/v2/auth/identity "HTTP/1.1 200 OK"
2025-06-19 22:34:57,495 - httpx - INFO - HTTP Request: GET http://vector_db:8000/api/v2/tenants/default_tenant "HTTP/1.1 200 OK"
2025-06-19 22:34:57,498 - httpx - INFO - HTTP Request: GET http://vector_db:8000/api/v2/tenants/default_tenant/databases/default_database "HTTP/1.1 200 OK"


In [27]:
collection_name = "patents"

vectorstore = Chroma(
    client=chroma_client,
    collection_name=collection_name,
    embedding_function=embeddings
)

  vectorstore = Chroma(
2025-06-19 22:35:01,431 - httpx - INFO - HTTP Request: POST http://vector_db:8000/api/v2/tenants/default_tenant/databases/default_database/collections "HTTP/1.1 200 OK"


In [29]:
# Add documents to the vectorstore (if we have any)
if chunked_docs:
    # Add in smaller batches to avoid memory issues
    batch_size = 100
    for i in tqdm(range(0, len(chunked_docs), batch_size), desc="Adding to vector store"):
        batch = chunked_docs[i:i+batch_size]
        vectorstore.add_documents(documents=batch)
    
    print(f"‚úÖ Added {len(chunked_docs)} document chunks to ChromaDB collection '{collection_name}'")
else:
    print("‚ö†Ô∏è No document chunks to add to the vector store!")

Adding to vector store:   0%|          | 0/175 [00:00<?, ?it/s]

2025-06-19 22:35:50,142 - httpx - INFO - HTTP Request: GET http://vector_db:8000/api/v2/pre-flight-checks "HTTP/1.1 200 OK"
2025-06-19 22:35:50,250 - httpx - INFO - HTTP Request: POST http://vector_db:8000/api/v2/tenants/default_tenant/databases/default_database/collections/ed371e4e-de10-48d9-84f5-ceb6d5b769cd/upsert "HTTP/1.1 200 OK"
2025-06-19 22:35:56,690 - httpx - INFO - HTTP Request: POST http://vector_db:8000/api/v2/tenants/default_tenant/databases/default_database/collections/ed371e4e-de10-48d9-84f5-ceb6d5b769cd/upsert "HTTP/1.1 200 OK"
2025-06-19 22:36:05,187 - httpx - INFO - HTTP Request: POST http://vector_db:8000/api/v2/tenants/default_tenant/databases/default_database/collections/ed371e4e-de10-48d9-84f5-ceb6d5b769cd/upsert "HTTP/1.1 200 OK"
2025-06-19 22:36:13,551 - httpx - INFO - HTTP Request: POST http://vector_db:8000/api/v2/tenants/default_tenant/databases/default_database/collections/ed371e4e-de10-48d9-84f5-ceb6d5b769cd/upsert "HTTP/1.1 200 OK"
2025-06-19 22:36:22,098 

‚úÖ Added 17428 document chunks to ChromaDB collection 'patents'


In [None]:
# 8. Test retrieval (only if we have documents)
if chunked_docs:
    print("\nüîç Testing vector store with sample queries...")

    test_queries = [
        "Audio signal encoding methods",
        "Image compression techniques",
        "Wireless communication protocols"
    ]

    for query in test_queries:
        print(f"\nQuery: '{query}'")
        # Get top 3 results
        results = vectorstore.similarity_search_with_score(query, k=3)
        
        print(f"Top {len(results)} results:")
        for i, (doc, score) in enumerate(results):
            print(f"\nResult {i+1} (similarity: {score:.4f})")
            print(f"Title: {doc.metadata.get('title', 'N/A')}")
            print(f"Section: {doc.metadata.get('section', 'N/A')}")
            print(f"Patent ID: {doc.metadata.get('doc_id', 'N/A')}")
            print(f"Content preview: {doc.page_content[:150]}...")
else:
    print("\n‚ö†Ô∏è No documents in vector store to search!")

In [30]:
vectorstore.delete_collection()

2025-06-19 22:04:58,996 - httpx - INFO - HTTP Request: DELETE http://vector_db:8000/api/v2/tenants/default_tenant/databases/default_database/collections/patents "HTTP/1.1 200 OK"
