In [13]:
import os
import sys
from pathlib import Path
import logging
import numpy as np
from transformers import AutoTokenizer
from tqdm.notebook import tqdm

In [14]:
notebook_dir = Path.cwd()
project_root = notebook_dir.parent  # If notebooks are in langechain/notebooks

# Add project root to Python path for imports
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

print(f"Project root: {project_root}")

Project root: /app


In [15]:
from src.data_pipline.json_loader.json_loader_epo import EPOPatentLoader
from config.json_loader_config import PatentLoaderConfig
from src.embeddings.text_Chunker import TextChunker, ChunkerConfig


In [16]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.schema import Document

In [17]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("patent_pipeline")

In [19]:
data_dir = Path(project_root) / "data" / "parsed" / "EPO"
model_name = "sentence-transformers/all-mpnet-base-v2"
vector_db_dir = Path(project_root) / "data" / "vector_db"
vector_db_dir.mkdir(parents=True, exist_ok=True)

In [20]:
loader_config = PatentLoaderConfig(
    max_files=50,
    
    verbose=False,
    epo_dir=data_dir,
    include_abstract=True,
    include_claims=True,
    include_description=True
)
patent_loader = EPOPatentLoader(config=loader_config)
print("✅ Patent loader initialized")

✅ Patent loader initialized


In [21]:
json_files = patent_loader.load_raw_json_files()

2025-06-20 08:25:20,774 - src.data_pipline.json_loader.json_loader_patent - INFO - Limited to 50 files out of 50 total files
2025-06-20 08:25:20,774 - src.data_pipline.json_loader.json_loader_patent - INFO - Limited to 50 files out of 50 total files
2025-06-20 08:25:20,774 - src.data_pipline.json_loader.json_loader_patent - INFO - Found 50 JSON files in /app/data/parsed/EPO
2025-06-20 08:25:20,774 - src.data_pipline.json_loader.json_loader_patent - INFO - Found 50 JSON files in /app/data/parsed/EPO
2025-06-20 08:25:20,787 - src.data_pipline.json_loader.json_loader_patent - INFO - Successfully loaded 50 raw JSON files
2025-06-20 08:25:20,787 - src.data_pipline.json_loader.json_loader_patent - INFO - Successfully loaded 50 raw JSON files


In [22]:
documents  = patent_loader.extract_document_components()

2025-06-20 08:25:22,392 - src.data_pipline.json_loader.json_loader_patent - INFO - Limited to 50 files out of 50 total files
2025-06-20 08:25:22,392 - src.data_pipline.json_loader.json_loader_patent - INFO - Limited to 50 files out of 50 total files
2025-06-20 08:25:22,393 - src.data_pipline.json_loader.json_loader_patent - INFO - Found 50 JSON files in /app/data/parsed/EPO
2025-06-20 08:25:22,393 - src.data_pipline.json_loader.json_loader_patent - INFO - Found 50 JSON files in /app/data/parsed/EPO
2025-06-20 08:25:22,405 - src.data_pipline.json_loader.json_loader_patent - INFO - Successfully loaded 50 documents from 50 files
2025-06-20 08:25:22,405 - src.data_pipline.json_loader.json_loader_patent - INFO - Successfully loaded 50 documents from 50 files


In [23]:
json_files = []

In [25]:
documents[0][0]

Document(metadata={'doc_id': 'EP13899497B9W1', 'language': 'en', 'country': 'EP', 'doc_number': '3084761', 'application_number': '13899497.5', 'publication_date': '20250611', 'ipc_classes': 'G10L  19/038       20130101AFI20170426BHEP, G10L  19/07        20130101ALI20170426BHEP', 'file': 'EP13899497W1B9.xml', 'filePath': '/app/data/parsed/EPO/EPRTBJV2025000024001001/EPW1B9/EP13899497W1B9/EP13899497W1B9.json', 'title': 'AUDIO SIGNAL ENCODER', 'section': 'title'}, page_content='AUDIO SIGNAL ENCODER')

In [24]:
print(f"✅ Loaded {len(documents)} document segments from {len(json_files)} patent files")


✅ Loaded 50 document segments from 0 patent files


In [26]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
chunker_config = ChunkerConfig(
    base_chunk_size=512,  # Adjust based on your needs
    chunk_overlap=50,
    min_chunk_size=100
)
chunker = TextChunker(tokenizer, config=chunker_config)
print("✅ TextChunker initialized")

✅ TextChunker initialized


In [27]:
print("✂️ Chunking documents...")
chunked_docs = []

for docs in tqdm(documents, desc="Chunking"):
    for doc in tqdm(docs, desc="Chunking"):
        try:
            text = doc.page_content
            metadata = doc.metadata
            
            # Use your TextChunker to split the document
            chunks = chunker.chunk_text(text, use_semantic=True)
            
            # Create LangChain Document objects from chunks
            for i, chunk_text in enumerate(chunks):
                # Copy metadata and add chunk information
                chunk_metadata = metadata.copy()
                chunk_metadata.update({
                    "chunk_index": i,
                    "total_chunks": len(chunks)
                })
                
                chunked_docs.append(Document(
                    page_content=chunk_text,
                    metadata=chunk_metadata
                ))
        except Exception as e:
            logger.error(f"Error chunking document: {str(e)}")

✂️ Chunking documents...


Chunking:   0%|          | 0/50 [00:00<?, ?it/s]

Chunking:   0%|          | 0/180 [00:00<?, ?it/s]

2025-06-20 08:26:30,094 - src.embeddings.text_Chunker - INFO - Input text has 5 tokens
2025-06-20 08:26:30,095 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,095 - src.embeddings.text_Chunker - INFO - Input text has 33 tokens
2025-06-20 08:26:30,096 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,096 - src.embeddings.text_Chunker - INFO - Input text has 33 tokens
2025-06-20 08:26:30,096 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,097 - src.embeddings.text_Chunker - INFO - Input text has 116 tokens
2025-06-20 08:26:30,097 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,098 - src.embeddings.text_Chunker - INFO - Input text has 37 tokens
2025-06-20 08:26:30,099 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,099 - src.embeddings.text_Chunker - INFO - Input text has 88 tokens
2025-06-20 08:26:30,099 - s

Chunking:   0%|          | 0/16 [00:00<?, ?it/s]

2025-06-20 08:26:30,200 - src.embeddings.text_Chunker - INFO - Input text has 5 tokens
2025-06-20 08:26:30,200 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,201 - src.embeddings.text_Chunker - INFO - Input text has 479 tokens
2025-06-20 08:26:30,201 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,201 - src.embeddings.text_Chunker - INFO - Input text has 149 tokens
2025-06-20 08:26:30,201 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,202 - src.embeddings.text_Chunker - INFO - Input text has 98 tokens
2025-06-20 08:26:30,202 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,202 - src.embeddings.text_Chunker - INFO - Input text has 142 tokens
2025-06-20 08:26:30,202 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,203 - src.embeddings.text_Chunker - INFO - Input text has 148 tokens
2025-06-20 08:26:30,203 

Chunking:   0%|          | 0/2 [00:00<?, ?it/s]

2025-06-20 08:26:30,211 - src.embeddings.text_Chunker - INFO - Input text has 15 tokens
2025-06-20 08:26:30,211 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,212 - src.embeddings.text_Chunker - INFO - Input text has 72 tokens
2025-06-20 08:26:30,212 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/2 [00:00<?, ?it/s]

2025-06-20 08:26:30,215 - src.embeddings.text_Chunker - INFO - Input text has 12 tokens
2025-06-20 08:26:30,215 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,215 - src.embeddings.text_Chunker - INFO - Input text has 255 tokens
2025-06-20 08:26:30,215 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/16 [00:00<?, ?it/s]

2025-06-20 08:26:30,218 - src.embeddings.text_Chunker - INFO - Input text has 13 tokens
2025-06-20 08:26:30,218 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,219 - src.embeddings.text_Chunker - INFO - Input text has 219 tokens
2025-06-20 08:26:30,219 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,219 - src.embeddings.text_Chunker - INFO - Input text has 172 tokens
2025-06-20 08:26:30,219 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,220 - src.embeddings.text_Chunker - INFO - Input text has 18 tokens
2025-06-20 08:26:30,220 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,220 - src.embeddings.text_Chunker - INFO - Input text has 19 tokens
2025-06-20 08:26:30,220 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,221 - src.embeddings.text_Chunker - INFO - Input text has 194 tokens
2025-06-20 08:26:30,221 

Chunking:   0%|          | 0/111 [00:00<?, ?it/s]

2025-06-20 08:26:30,229 - src.embeddings.text_Chunker - INFO - Input text has 20 tokens
2025-06-20 08:26:30,230 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,230 - src.embeddings.text_Chunker - INFO - Input text has 56 tokens
2025-06-20 08:26:30,230 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,230 - src.embeddings.text_Chunker - INFO - Input text has 70 tokens
2025-06-20 08:26:30,231 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,231 - src.embeddings.text_Chunker - INFO - Input text has 103 tokens
2025-06-20 08:26:30,231 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,231 - src.embeddings.text_Chunker - INFO - Input text has 134 tokens
2025-06-20 08:26:30,232 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,232 - src.embeddings.text_Chunker - INFO - Input text has 105 tokens
2025-06-20 08:26:30,232 

Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-20 08:26:30,307 - src.embeddings.text_Chunker - INFO - Input text has 7 tokens
2025-06-20 08:26:30,307 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-20 08:26:30,311 - src.embeddings.text_Chunker - INFO - Input text has 2 tokens
2025-06-20 08:26:30,311 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-20 08:26:30,314 - src.embeddings.text_Chunker - INFO - Input text has 5 tokens
2025-06-20 08:26:30,315 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-20 08:26:30,318 - src.embeddings.text_Chunker - INFO - Input text has 12 tokens
2025-06-20 08:26:30,318 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-20 08:26:30,321 - src.embeddings.text_Chunker - INFO - Input text has 10 tokens
2025-06-20 08:26:30,322 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-20 08:26:30,325 - src.embeddings.text_Chunker - INFO - Input text has 6 tokens
2025-06-20 08:26:30,325 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-20 08:26:30,328 - src.embeddings.text_Chunker - INFO - Input text has 6 tokens
2025-06-20 08:26:30,328 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-20 08:26:30,332 - src.embeddings.text_Chunker - INFO - Input text has 5 tokens
2025-06-20 08:26:30,332 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-20 08:26:30,336 - src.embeddings.text_Chunker - INFO - Input text has 13 tokens
2025-06-20 08:26:30,337 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-20 08:26:30,341 - src.embeddings.text_Chunker - INFO - Input text has 16 tokens
2025-06-20 08:26:30,341 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-20 08:26:30,345 - src.embeddings.text_Chunker - INFO - Input text has 19 tokens
2025-06-20 08:26:30,345 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-20 08:26:30,349 - src.embeddings.text_Chunker - INFO - Input text has 7 tokens
2025-06-20 08:26:30,349 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-20 08:26:30,354 - src.embeddings.text_Chunker - INFO - Input text has 11 tokens
2025-06-20 08:26:30,354 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-20 08:26:30,358 - src.embeddings.text_Chunker - INFO - Input text has 33 tokens
2025-06-20 08:26:30,358 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-20 08:26:30,362 - src.embeddings.text_Chunker - INFO - Input text has 13 tokens
2025-06-20 08:26:30,362 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-20 08:26:30,365 - src.embeddings.text_Chunker - INFO - Input text has 7 tokens
2025-06-20 08:26:30,365 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-20 08:26:30,368 - src.embeddings.text_Chunker - INFO - Input text has 9 tokens
2025-06-20 08:26:30,368 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-20 08:26:30,371 - src.embeddings.text_Chunker - INFO - Input text has 2 tokens
2025-06-20 08:26:30,371 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-20 08:26:30,374 - src.embeddings.text_Chunker - INFO - Input text has 28 tokens
2025-06-20 08:26:30,374 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-20 08:26:30,377 - src.embeddings.text_Chunker - INFO - Input text has 16 tokens
2025-06-20 08:26:30,377 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-20 08:26:30,381 - src.embeddings.text_Chunker - INFO - Input text has 12 tokens
2025-06-20 08:26:30,382 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-20 08:26:30,385 - src.embeddings.text_Chunker - INFO - Input text has 10 tokens
2025-06-20 08:26:30,385 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-20 08:26:30,387 - src.embeddings.text_Chunker - INFO - Input text has 11 tokens
2025-06-20 08:26:30,388 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-20 08:26:30,390 - src.embeddings.text_Chunker - INFO - Input text has 10 tokens
2025-06-20 08:26:30,390 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-20 08:26:30,393 - src.embeddings.text_Chunker - INFO - Input text has 7 tokens
2025-06-20 08:26:30,393 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-20 08:26:30,547 - src.embeddings.text_Chunker - INFO - Input text has 8 tokens
2025-06-20 08:26:30,547 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-20 08:26:30,551 - src.embeddings.text_Chunker - INFO - Input text has 9 tokens
2025-06-20 08:26:30,551 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-20 08:26:30,554 - src.embeddings.text_Chunker - INFO - Input text has 7 tokens
2025-06-20 08:26:30,554 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-20 08:26:30,557 - src.embeddings.text_Chunker - INFO - Input text has 19 tokens
2025-06-20 08:26:30,558 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Chunking:   0%|          | 0/73 [00:00<?, ?it/s]

2025-06-20 08:26:30,561 - src.embeddings.text_Chunker - INFO - Input text has 20 tokens
2025-06-20 08:26:30,561 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,561 - src.embeddings.text_Chunker - INFO - Input text has 40 tokens
2025-06-20 08:26:30,562 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,562 - src.embeddings.text_Chunker - INFO - Input text has 341 tokens
2025-06-20 08:26:30,563 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,563 - src.embeddings.text_Chunker - INFO - Input text has 382 tokens
2025-06-20 08:26:30,563 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,564 - src.embeddings.text_Chunker - INFO - Input text has 122 tokens
2025-06-20 08:26:30,564 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,564 - src.embeddings.text_Chunker - INFO - Input text has 30 tokens
2025-06-20 08:26:30,564 

Chunking:   0%|          | 0/15 [00:00<?, ?it/s]

2025-06-20 08:26:30,609 - src.embeddings.text_Chunker - INFO - Input text has 7 tokens
2025-06-20 08:26:30,609 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,609 - src.embeddings.text_Chunker - INFO - Input text has 321 tokens
2025-06-20 08:26:30,610 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,610 - src.embeddings.text_Chunker - INFO - Input text has 16 tokens
2025-06-20 08:26:30,610 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,610 - src.embeddings.text_Chunker - INFO - Input text has 18 tokens
2025-06-20 08:26:30,611 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,611 - src.embeddings.text_Chunker - INFO - Input text has 24 tokens
2025-06-20 08:26:30,611 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,611 - src.embeddings.text_Chunker - INFO - Input text has 24 tokens
2025-06-20 08:26:30,611 - s

Chunking:   0%|          | 0/17 [00:00<?, ?it/s]

2025-06-20 08:26:30,619 - src.embeddings.text_Chunker - INFO - Input text has 8 tokens
2025-06-20 08:26:30,619 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,619 - src.embeddings.text_Chunker - INFO - Input text has 188 tokens
2025-06-20 08:26:30,620 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,620 - src.embeddings.text_Chunker - INFO - Input text has 71 tokens
2025-06-20 08:26:30,620 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,620 - src.embeddings.text_Chunker - INFO - Input text has 83 tokens
2025-06-20 08:26:30,621 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,621 - src.embeddings.text_Chunker - INFO - Input text has 90 tokens
2025-06-20 08:26:30,621 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,621 - src.embeddings.text_Chunker - INFO - Input text has 50 tokens
2025-06-20 08:26:30,622 - s

Chunking:   0%|          | 0/92 [00:00<?, ?it/s]

2025-06-20 08:26:30,630 - src.embeddings.text_Chunker - INFO - Input text has 8 tokens
2025-06-20 08:26:30,630 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,630 - src.embeddings.text_Chunker - INFO - Input text has 46 tokens
2025-06-20 08:26:30,631 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,631 - src.embeddings.text_Chunker - INFO - Input text has 86 tokens
2025-06-20 08:26:30,631 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,631 - src.embeddings.text_Chunker - INFO - Input text has 33 tokens
2025-06-20 08:26:30,631 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,632 - src.embeddings.text_Chunker - INFO - Input text has 3 tokens
2025-06-20 08:26:30,632 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,632 - src.embeddings.text_Chunker - INFO - Input text has 67 tokens
2025-06-20 08:26:30,632 - src

Chunking:   0%|          | 0/79 [00:00<?, ?it/s]

2025-06-20 08:26:30,683 - src.embeddings.text_Chunker - INFO - Input text has 15 tokens
2025-06-20 08:26:30,683 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,684 - src.embeddings.text_Chunker - INFO - Input text has 258 tokens
2025-06-20 08:26:30,684 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,684 - src.embeddings.text_Chunker - INFO - Input text has 51 tokens
2025-06-20 08:26:30,684 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,685 - src.embeddings.text_Chunker - INFO - Input text has 74 tokens
2025-06-20 08:26:30,685 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,685 - src.embeddings.text_Chunker - INFO - Input text has 57 tokens
2025-06-20 08:26:30,686 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,686 - src.embeddings.text_Chunker - INFO - Input text has 39 tokens
2025-06-20 08:26:30,686 - 

Chunking:   0%|          | 0/74 [00:00<?, ?it/s]

2025-06-20 08:26:30,727 - src.embeddings.text_Chunker - INFO - Input text has 6 tokens
2025-06-20 08:26:30,727 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,728 - src.embeddings.text_Chunker - INFO - Input text has 58 tokens
2025-06-20 08:26:30,728 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,728 - src.embeddings.text_Chunker - INFO - Input text has 77 tokens
2025-06-20 08:26:30,728 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,729 - src.embeddings.text_Chunker - INFO - Input text has 92 tokens
2025-06-20 08:26:30,729 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,729 - src.embeddings.text_Chunker - INFO - Input text has 140 tokens
2025-06-20 08:26:30,730 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,730 - src.embeddings.text_Chunker - INFO - Input text has 130 tokens
2025-06-20 08:26:30,730 - 

Chunking:   0%|          | 0/96 [00:00<?, ?it/s]

2025-06-20 08:26:30,764 - src.embeddings.text_Chunker - INFO - Input text has 6 tokens
2025-06-20 08:26:30,764 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,764 - src.embeddings.text_Chunker - INFO - Input text has 24 tokens
2025-06-20 08:26:30,764 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,765 - src.embeddings.text_Chunker - INFO - Input text has 68 tokens
2025-06-20 08:26:30,765 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,765 - src.embeddings.text_Chunker - INFO - Input text has 86 tokens
2025-06-20 08:26:30,765 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,766 - src.embeddings.text_Chunker - INFO - Input text has 67 tokens
2025-06-20 08:26:30,766 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,766 - src.embeddings.text_Chunker - INFO - Input text has 45 tokens
2025-06-20 08:26:30,766 - sr

Chunking:   0%|          | 0/94 [00:00<?, ?it/s]

2025-06-20 08:26:30,819 - src.embeddings.text_Chunker - INFO - Input text has 3 tokens
2025-06-20 08:26:30,820 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,820 - src.embeddings.text_Chunker - INFO - Input text has 117 tokens
2025-06-20 08:26:30,820 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,821 - src.embeddings.text_Chunker - INFO - Input text has 122 tokens
2025-06-20 08:26:30,821 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,821 - src.embeddings.text_Chunker - INFO - Input text has 59 tokens
2025-06-20 08:26:30,822 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,822 - src.embeddings.text_Chunker - INFO - Input text has 46 tokens
2025-06-20 08:26:30,822 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,822 - src.embeddings.text_Chunker - INFO - Input text has 7 tokens
2025-06-20 08:26:30,823 - s

Chunking:   0%|          | 0/124 [00:00<?, ?it/s]

2025-06-20 08:26:30,874 - src.embeddings.text_Chunker - INFO - Input text has 16 tokens
2025-06-20 08:26:30,874 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,875 - src.embeddings.text_Chunker - INFO - Input text has 102 tokens
2025-06-20 08:26:30,875 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,875 - src.embeddings.text_Chunker - INFO - Input text has 203 tokens
2025-06-20 08:26:30,875 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,876 - src.embeddings.text_Chunker - INFO - Input text has 52 tokens
2025-06-20 08:26:30,876 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,876 - src.embeddings.text_Chunker - INFO - Input text has 31 tokens
2025-06-20 08:26:30,876 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,877 - src.embeddings.text_Chunker - INFO - Input text has 51 tokens
2025-06-20 08:26:30,877 -

Chunking:   0%|          | 0/133 [00:00<?, ?it/s]

2025-06-20 08:26:30,937 - src.embeddings.text_Chunker - INFO - Input text has 7 tokens
2025-06-20 08:26:30,937 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,938 - src.embeddings.text_Chunker - INFO - Input text has 205 tokens
2025-06-20 08:26:30,938 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,939 - src.embeddings.text_Chunker - INFO - Input text has 162 tokens
2025-06-20 08:26:30,939 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,939 - src.embeddings.text_Chunker - INFO - Input text has 251 tokens
2025-06-20 08:26:30,940 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,940 - src.embeddings.text_Chunker - INFO - Input text has 39 tokens
2025-06-20 08:26:30,940 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:30,941 - src.embeddings.text_Chunker - INFO - Input text has 17 tokens
2025-06-20 08:26:30,941 -

Chunking:   0%|          | 0/74 [00:00<?, ?it/s]

2025-06-20 08:26:31,016 - src.embeddings.text_Chunker - INFO - Input text has 10 tokens
2025-06-20 08:26:31,016 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:31,017 - src.embeddings.text_Chunker - INFO - Input text has 38 tokens
2025-06-20 08:26:31,017 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:31,017 - src.embeddings.text_Chunker - INFO - Input text has 128 tokens
2025-06-20 08:26:31,017 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:31,018 - src.embeddings.text_Chunker - INFO - Input text has 165 tokens
2025-06-20 08:26:31,018 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:31,018 - src.embeddings.text_Chunker - INFO - Input text has 72 tokens
2025-06-20 08:26:31,018 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:31,019 - src.embeddings.text_Chunker - INFO - Input text has 41 tokens
2025-06-20 08:26:31,019 -

Chunking:   0%|          | 0/6 [00:00<?, ?it/s]

2025-06-20 08:26:31,062 - src.embeddings.text_Chunker - INFO - Input text has 18 tokens
2025-06-20 08:26:31,063 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:31,064 - src.embeddings.text_Chunker - INFO - Input text has 495 tokens
2025-06-20 08:26:31,064 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:31,064 - src.embeddings.text_Chunker - INFO - Input text has 10 tokens
2025-06-20 08:26:31,065 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:31,065 - src.embeddings.text_Chunker - INFO - Input text has 10 tokens
2025-06-20 08:26:31,065 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:31,065 - src.embeddings.text_Chunker - INFO - Input text has 10 tokens
2025-06-20 08:26:31,065 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:31,066 - src.embeddings.text_Chunker - INFO - Input text has 8 tokens
2025-06-20 08:26:31,066 - s

Chunking:   0%|          | 0/62 [00:00<?, ?it/s]

2025-06-20 08:26:31,069 - src.embeddings.text_Chunker - INFO - Input text has 4 tokens
2025-06-20 08:26:31,069 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:31,070 - src.embeddings.text_Chunker - INFO - Input text has 40 tokens
2025-06-20 08:26:31,070 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:31,070 - src.embeddings.text_Chunker - INFO - Input text has 96 tokens
2025-06-20 08:26:31,071 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:31,071 - src.embeddings.text_Chunker - INFO - Input text has 99 tokens
2025-06-20 08:26:31,071 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:31,072 - src.embeddings.text_Chunker - INFO - Input text has 57 tokens
2025-06-20 08:26:31,072 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-20 08:26:31,072 - src.embeddings.text_Chunker - INFO - Input text has 65 tokens
2025-06-20 08:26:31,072 - sr

Chunking:   0%|          | 0/77 [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [28]:
documents = []
chunked_docs[0]

Document(metadata={'doc_id': 'EP13899497B9W1', 'language': 'en', 'country': 'EP', 'doc_number': '3084761', 'application_number': '13899497.5', 'publication_date': '20250611', 'ipc_classes': 'G10L  19/038       20130101AFI20170426BHEP, G10L  19/07        20130101ALI20170426BHEP', 'file': 'EP13899497W1B9.xml', 'filePath': '/app/data/parsed/EPO/EPRTBJV2025000024001001/EPW1B9/EP13899497W1B9/EP13899497W1B9.json', 'title': 'AUDIO SIGNAL ENCODER', 'section': 'title', 'chunk_index': 0, 'total_chunks': 1}, page_content='AUDIO SIGNAL ENCODER')

In [29]:
print(f"✅ Created {len(chunked_docs)} chunks from {len(documents)} document segments")


✅ Created 1409 chunks from 0 document segments


In [30]:

embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={"device": "cpu"},
    encode_kwargs={"batch_size": 32, "normalize_embeddings": True}
)
print("✅ LangChain embeddings initialized")


  embeddings = HuggingFaceEmbeddings(
2025-06-20 08:30:07,970 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2


✅ LangChain embeddings initialized


In [4]:
# 7. Create vector store with chunked documents - connect to Docker ChromaDB
print("💾 Creating vector store using dockerized ChromaDB...")

# Import ChromaDB client directly (needed for Docker connection)
import chromadb
from chromadb.config import Settings

# When running inside Docker
chroma_client = chromadb.HttpClient(
    host="vector_db",  # Docker service name
    port=8000,         # Internal container port
    settings=Settings(anonymized_telemetry=False)
)

💾 Creating vector store using dockerized ChromaDB...


In [33]:
collection_name = "patents"

vectorstore = Chroma(
    client=chroma_client,
    collection_name=collection_name,
    embedding_function=embeddings
)

2025-06-20 08:30:50,248 - httpx - INFO - HTTP Request: POST http://vector_db:8000/api/v2/tenants/default_tenant/databases/default_database/collections "HTTP/1.1 200 OK"


In [34]:
# Add documents to the vectorstore (if we have any)
if chunked_docs:
    # Add in smaller batches to avoid memory issues
    batch_size = 100
    for i in tqdm(range(0, len(chunked_docs), batch_size), desc="Adding to vector store"):
        batch = chunked_docs[i:i+batch_size]
        vectorstore.add_documents(documents=batch)
    
    print(f"✅ Added {len(chunked_docs)} document chunks to ChromaDB collection '{collection_name}'")
else:
    print("⚠️ No document chunks to add to the vector store!")

Adding to vector store:   0%|          | 0/15 [00:00<?, ?it/s]

2025-06-20 08:31:05,094 - httpx - INFO - HTTP Request: GET http://vector_db:8000/api/v2/pre-flight-checks "HTTP/1.1 200 OK"
2025-06-20 08:31:05,250 - httpx - INFO - HTTP Request: POST http://vector_db:8000/api/v2/tenants/default_tenant/databases/default_database/collections/34b877df-69c0-4489-8a41-83afcc841670/upsert "HTTP/1.1 200 OK"
2025-06-20 08:31:14,596 - httpx - INFO - HTTP Request: POST http://vector_db:8000/api/v2/tenants/default_tenant/databases/default_database/collections/34b877df-69c0-4489-8a41-83afcc841670/upsert "HTTP/1.1 200 OK"
2025-06-20 08:31:25,186 - httpx - INFO - HTTP Request: POST http://vector_db:8000/api/v2/tenants/default_tenant/databases/default_database/collections/34b877df-69c0-4489-8a41-83afcc841670/upsert "HTTP/1.1 200 OK"
2025-06-20 08:31:35,209 - httpx - INFO - HTTP Request: POST http://vector_db:8000/api/v2/tenants/default_tenant/databases/default_database/collections/34b877df-69c0-4489-8a41-83afcc841670/upsert "HTTP/1.1 200 OK"
2025-06-20 08:31:46,008 

✅ Added 1409 document chunks to ChromaDB collection 'patents'


In [38]:
# 8. Test retrieval (only if we have documents)
if chunked_docs:
    print("\n🔍 Testing vector store with sample queries...")

    test_queries = [
        "Audio signal encoding methods",
        # "Image compression techniques",
        # "Wireless communication protocols"
    ]

    for query in test_queries:
        print(f"\nQuery: '{query}'")
        # Get top 3 results
        results = vectorstore.similarity_search_with_score(query, k=3)
        
        print(f"Top {len(results)} results:")
        for i, (doc, score) in enumerate(results):
            print(f"\nResult {i+1} (similarity: {score:.4f})")
            print(f"Title: {doc.metadata.get('title', 'N/A')}")
            print(f"Section: {doc.metadata.get('section', 'N/A')}")
            print(f"Patent ID: {doc.metadata.get('doc_id', 'N/A')}")
            print(f"file Path: {doc.metadata.get('filePath', 'N/A')}")
            print(f"Content preview: {doc.page_content[:150]}...")
else:
    print("\n⚠️ No documents in vector store to search!")

2025-06-20 08:36:15,588 - httpx - INFO - HTTP Request: POST http://vector_db:8000/api/v2/tenants/default_tenant/databases/default_database/collections/34b877df-69c0-4489-8a41-83afcc841670/query "HTTP/1.1 200 OK"



🔍 Testing vector store with sample queries...

Query: 'Audio signal encoding methods'
Top 3 results:

Result 1 (similarity: 0.4964)
Title: AUDIO SIGNAL ENCODER
Section: Description of Some Embodiments of the Application
Patent ID: EP13899497B9W1
file Path: /app/data/parsed/EPO/EPRTBJV2025000024001001/EPW1B9/EP13899497W1B9/EP13899497W1B9.json
Content preview: Description of Some Embodiments of the Application
The concept for the embodiments as described herein is to determine and apply encoding to audio sig...

Result 2 (similarity: 0.5347)
Title: AUDIO SIGNAL ENCODER
Section: Background
Patent ID: EP13899497B9W1
file Path: /app/data/parsed/EPO/EPRTBJV2025000024001001/EPW1B9/EP13899497W1B9/EP13899497W1B9.json
Content preview: Background
Audio signals, like speech or music, are encoded for example to enable efficient transmission or storage of the audio signals. Examples of ...

Result 3 (similarity: 0.5472)
Title: AUDIO SIGNAL ENCODER
Section: Summary
Patent ID: EP13899497B9W1
file Pat

In [32]:
# vectorstore.delete_collection()

2025-06-20 08:30:44,712 - httpx - INFO - HTTP Request: DELETE http://vector_db:8000/api/v2/tenants/default_tenant/databases/default_database/collections/patents "HTTP/1.1 200 OK"
