In [1]:
import sys
import os
from pathlib import Path

# Determine the project root (adjust as needed depending on notebook location)
notebook_dir = Path.cwd()
project_root = notebook_dir.parent  # If notebooks are in langechain/notebooks

# Add project root to Python path for imports
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

print(f"Project root: {project_root}")

Project root: /app


In [2]:
# Now you can import your modules
from src.data_pipline.json_loader.json_loader_epo import EPOPatentLoader
from config.json_loader_config import PatentLoaderConfig
from config.chunker_config import  ChunkerConfig
from src.embeddings.text_Chunker import TextChunker
from transformers import AutoTokenizer
print("✅ Successfully imported loader modules")

✅ Successfully imported loader modules


In [3]:

# Create a custom configuration
config = PatentLoaderConfig(
    # Override settings as needed
    max_files=10,
    include_claims=True,
    verbose=True
)

# Create the loader
loader = EPOPatentLoader(config)

# Get all documents



In [4]:
jsonfiles = loader.load_raw_json_files()
jsonfiles

2025-06-19 19:16:13,435 - src.data_pipline.json_loader.json_loader_patent - INFO - Limited to 10 files out of 10 total files
2025-06-19 19:16:13,435 - src.data_pipline.json_loader.json_loader_patent - INFO - Found 10 JSON files in /app/data/parsed/EPO
2025-06-19 19:16:13,435 - src.data_pipline.json_loader.json_loader_patent - INFO - Loading raw file 1/10: EP13899497W1B9.json
2025-06-19 19:16:13,437 - src.data_pipline.json_loader.json_loader_patent - INFO - Successfully loaded 10 raw JSON files


[{'file_path': '/app/data/parsed/EPO/EPRTBJV2025000024001001/EPW1B9/EP13899497W1B9/EP13899497W1B9.json',
  'data': {'bibliographic_data': {'doc_id': 'EP13899497B9W1',
    'file': 'EP13899497W1B9.xml',
    'language': 'en',
    'country': 'EP',
    'doc_number': '3084761',
    'kind_code': 'B9',
    'correction_code': 'W1',
    'publication_date': '20250611',
    'status': 'c',
    'dtd_version': 'ep-patent-document-v1-7',
    'document_type': 'CORRECTED NEW EUROPEAN PATENT SPECIFICATION',
    'kind': 'B9',
    'publication_date_full': '20250611',
    'correction_information': {'correction_code': 'W1',
     'correction_details_b155': [{'language': 'de', 'text': 'Ansprüche EN'},
      {'language': 'en', 'text': 'Claims EN'},
      {'language': 'fr', 'text': 'Revendications EN'}]},
    'application_number': '13899497.5',
    'application_date': '20131217',
    'corrigendum_bulletin': {'date': '20250611', 'bulletin_number': '202524'},
    'publication_bulletin': {'date': '20161026', 'bulle

2025-06-19 19:16:13,795 - src.data_pipline.json_loader.json_loader_patent - INFO - Limited to 10 files out of 10 total files
2025-06-19 19:16:13,795 - src.data_pipline.json_loader.json_loader_patent - INFO - Found 10 JSON files in /app/data/parsed/EPO
2025-06-19 19:16:13,795 - src.data_pipline.json_loader.json_loader_patent - INFO - Processing file 1/10: EP13899497W1B9.json
2025-06-19 19:16:13,798 - src.data_pipline.json_loader.json_loader_patent - INFO - Successfully loaded 10 documents from 10 files


In [6]:
documents[0]

[Document(metadata={'doc_id': 'EP13899497B9W1', 'language': 'en', 'country': 'EP', 'doc_number': '3084761', 'application_number': '13899497.5', 'publication_date': '20250611', 'ipc_classes': ['G10L  19/038       20130101AFI20170426BHEP', 'G10L  19/07        20130101ALI20170426BHEP'], 'file': 'EP13899497W1B9.xml', 'filePath': PosixPath('/app/data/parsed/EPO/EPRTBJV2025000024001001/EPW1B9/EP13899497W1B9/EP13899497W1B9.json'), 'title': 'AUDIO SIGNAL ENCODER', 'section': 'title'}, page_content='AUDIO SIGNAL ENCODER'),
 Document(metadata={'doc_id': 'EP13899497B9W1', 'language': 'en', 'country': 'EP', 'doc_number': '3084761', 'application_number': '13899497.5', 'publication_date': '20250611', 'ipc_classes': ['G10L  19/038       20130101AFI20170426BHEP', 'G10L  19/07        20130101ALI20170426BHEP'], 'file': 'EP13899497W1B9.xml', 'filePath': PosixPath('/app/data/parsed/EPO/EPRTBJV2025000024001001/EPW1B9/EP13899497W1B9/EP13899497W1B9.json'), 'title': 'AUDIO SIGNAL ENCODER', 'section': 'Field',

In [14]:
# Initialize tokenizer and chunker

config = ChunkerConfig(
    base_chunk_size=320,
    chunk_overlap=30,
    max_chunks=500
)

text = "This is a sample text that demonstrates chunking functionality. " * 32
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")


chunker = TextChunker(tokenizer, config)

# Use semantic chunking (recommended)
# semantic_chunks = chunker.chunk_text(text, use_semantic=True)

# Or use improved token-based chunking
token_chunks = chunker.ch(text, use_semantic=True)

# Analyze results
analysis = chunker.analyze_chunks(token_chunks)
print(f"Created {analysis['num_chunks']} chunks")
print(f"Average tokens per chunk: {analysis['token_stats']['avg']:.1f}")
# print(f"Average overlap ratio: {analysis['avg_overlap_ratio']:.2f}")

🔍 DEBUG: Input text has 352 tokens
🔍 DEBUG: Base chunk size: 320
🔍 DEBUG: Chunk overlap: 30
🔍 DEBUG: Calculated chunk size: 176 tokens
🔍 DEBUG: Using semantic chunking
🔍 DEBUG: Created 3 chunks
🔍 DEBUG: Chunk 1: 176 tokens, first 50 chars: 'This is a sample text that demonstrates chunking f...'
🔍 DEBUG: Chunk 2: 173 tokens, first 50 chars: 'ext that demonstrates chunking functionality. This...'
🔍 DEBUG: Chunk 3: 63 tokens, first 50 chars: 'ext that demonstrates chunking functionality. This...'
Created 3 chunks
Average tokens per chunk: 137.3


In [17]:
token_chunks[1]

'ext that demonstrates chunking functionality. This is a sample text that demonstrates chunking functionality. This is a sample text that demonstrates chunking functionality. This is a sample text that demonstrates chunking functionality. This is a sample text that demonstrates chunking functionality. This is a sample text that demonstrates chunking functionality. This is a sample text that demonstrates chunking functionality. This is a sample text that demonstrates chunking functionality. This is a sample text that demonstrates chunking functionality. This is a sample text that demonstrates chunking functionality. This is a sample text that demonstrates chunking functionality. This is a sample text that demonstrates chunking functionality. This is a sample text that demonstrates chunking functionality. This is a sample text that demonstrates chunking functionality. This is a sample text that demonstrates chunking functionality. This is a sample text that demonstrates chunking function

In [19]:
# More realistic test text with variety
realistic_text = """
Patent Application: Advanced Text Processing System

Background of the Invention:
The present invention relates to systems and methods for processing large documents, particularly patent applications and technical documentation. Current systems face challenges in maintaining semantic coherence when splitting documents into smaller segments.

Technical Problem:
Existing text chunking systems often split documents at arbitrary boundaries, leading to loss of context and reduced effectiveness in downstream processing tasks such as embedding generation and similarity matching.

Proposed Solution:
The present invention provides a novel approach to text segmentation that considers semantic boundaries, token distributions, and overlap strategies to maintain contextual integrity while optimizing chunk sizes for machine learning applications.

Detailed Description:
The system comprises a tokenizer component, a boundary detection module, and an optimization engine. The tokenizer component processes input text using transformer-based models. The boundary detection module identifies natural split points such as sentence endings, paragraph breaks, and section boundaries.

Implementation Details:
The optimization engine calculates optimal chunk sizes based on total document length and target parameters. It employs a balanced splitting algorithm that minimizes variance in chunk sizes while respecting semantic boundaries.

Experimental Results:
Testing on various document types shows improved performance compared to naive splitting approaches. The system maintains semantic coherence while achieving target chunk sizes within acceptable tolerance ranges.

Conclusion:
This invention provides a practical solution for intelligent document segmentation in natural language processing applications.
""" * 3  # Repeat to make it longer

# Test with this more realistic text
chunks = chunker.debug_chunking(realistic_text, use_semantic=True)
analysis = chunker.analyze_chunks(chunks)
print(f"Analysis: {analysis}")

🔍 DEBUG: Input text has 819 tokens
🔍 DEBUG: Base chunk size: 320
🔍 DEBUG: Chunk overlap: 30
🔍 DEBUG: Calculated chunk size: 273 tokens
🔍 DEBUG: Using semantic chunking
🔍 DEBUG: Created 4 chunks
🔍 DEBUG: Chunk 1: 273 tokens, first 50 chars: 'Patent Application: Advanced Text Processing Syste...'
🔍 DEBUG: Chunk 2: 281 tokens, first 50 chars: 'target chunk sizes within acceptable tolerance ran...'
🔍 DEBUG: Chunk 3: 269 tokens, first 50 chars: 'ous document types shows improved performance comp...'
🔍 DEBUG: Chunk 4: 85 tokens, first 50 chars: 'tes optimal chunk sizes based on total document le...'
Analysis: {'num_chunks': 4, 'has_duplicates': False, 'unique_chunks': 4, 'token_stats': {'min': 85, 'max': 281, 'avg': 227.0, 'total': 908, 'target_chunk_size': 320}, 'char_stats': {'min': 572, 'max': 1860, 'avg': 1500.8}, 'overlap_stats': {'avg_overlap_ratio': 0.034, 'min_overlap': 0.0, 'max_overlap': 0.102, 'target_overlap': 30}, 'chunks_preview': [{'chunk_num': 1, 'tokens': 273, 'chars': 1804,

2025-06-19 19:51:30,603 - src.embeddings.embeddings - INFO - Initializing embedding model: sentence-transformers/all-mpnet-base-v2
2025-06-19 19:51:30,610 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: cpu
2025-06-19 19:51:30,611 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2


Initializing Text Chunker...
Initializing Document Embedder...


2025-06-19 19:51:32,326 - src.embeddings.embeddings - INFO - Model initialized successfully with vector size 768
2025-06-19 19:51:32,328 - src.embeddings.embeddings - INFO - DocumentEmbedder initialized with model sentence-transformers/all-mpnet-base-v2
2025-06-19 19:51:32,329 - src.embeddings.embeddings - INFO - Added 3 documents. Total documents: 3
2025-06-19 19:51:32,330 - src.embeddings.embeddings - INFO - Processing document 1/3 (33.3%)
2025-06-19 19:51:32,332 - src.embeddings.text_Chunker - INFO - Input text has 22 tokens
2025-06-19 19:51:32,332 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 19:51:32,487 - src.embeddings.text_Chunker - INFO - Input text has 124 tokens
2025-06-19 19:51:32,488 - src.embeddings.text_Chunker - INFO - Text fits in single chunk


Creating sample documents...
Added 3 documents to embedder
Processing documents and generating embeddings...


2025-06-19 19:51:32,596 - src.embeddings.text_Chunker - INFO - Input text has 116 tokens
2025-06-19 19:51:32,596 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 19:51:32,689 - src.embeddings.embeddings - INFO - Processing completed in 0.36s
2025-06-19 19:51:32,689 - src.embeddings.embeddings - INFO - Created 3 chunks from 3 documents
2025-06-19 19:51:32,690 - src.embeddings.text_Chunker - INFO - Input text has 5 tokens
2025-06-19 19:51:32,690 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 19:51:32,724 - src.embeddings.embeddings - INFO - Found 3 results above threshold 0.1
2025-06-19 19:51:32,725 - src.embeddings.text_Chunker - INFO - Input text has 26 tokens
2025-06-19 19:51:32,725 - src.embeddings.text_Chunker - INFO - Text fits in single chunk
2025-06-19 19:51:32,769 - src.embeddings.embeddings - INFO - Found 3 results above threshold 0.1
2025-06-19 19:51:32,773 - src.embeddings.embeddings - INFO - Saved 3 document embeddings 

Generated 3 document chunks with embeddings

Performing simple search...

Top 2 results for query: 'What is machine learning?'

Result 1 (similarity: 0.8190):
Text: Machine learning (ML) is a field of inquiry devoted to understanding and building methods that 'learn', 
        that is, methods that leverage data t...
Metadata: {'source': 'sample2', 'type': 'explanation', 'chunk_index': 0, 'total_chunks': 1, 'source_doc_idx': 1}

Result 2 (similarity: 0.5705):
Text: Deep learning is part of a broader family of machine learning methods based on artificial neural networks with representation learning. 
        Learn...
Metadata: {'source': 'sample3', 'type': 'explanation', 'chunk_index': 0, 'total_chunks': 1, 'source_doc_idx': 2}

Performing search with longer query...

Top 2 results for complex query:

Result 1 (similarity: 0.7157):
Text: Deep learning is part of a broader family of machine learning methods based on artificial neural networks with representation learning. 
        Learn.