1. Extract semantic chunks from your JSON (e.g., abstract, claim, paragraph).

2. Use HuggingFaceEmbeddings or OpenAIEmbeddings.

3. Store structured info as metadata.

4. Use metadata filtering + vector similarity for retrieval.


In [86]:
import sys
import os
from pathlib import Path
import time
import json
import math

# Add source directories to Python path
current_dir = Path.cwd()
project_root = current_dir.parent  # Go up one level from notebooks to project root
src_dir = project_root / "src"

# Add paths
sys.path.append(str(src_dir / "data_pipline"))
sys.path.append(str(src_dir / "EU_XML_data_loader"))


from data_pipline import DataPipeline
import get_raw_data_paths_EPO  
from xml_loader_EPO import process_xml_files_list


print(f"📁 Current directory: {current_dir}")
print(f"📁 Project root: {project_root}")
print(f"📁 Source directory: {src_dir}")
print(f"✅ Python paths configured")

📁 Current directory: /app/notebooks
📁 Project root: /app
📁 Source directory: /app/src
✅ Python paths configured


In [2]:
import json
import glob
import os
from langchain.docstore.document import Document
import sys
# Prepare .json file for embedding
def extract_documents(json_data):
    bibliographic = json_data.get("bibliographic_data", {})
    doc_id = bibliographic.get("doc_id", "UNKNOWN")
    documents = []

    # Common metadata to propagate
    common_meta = {
        "doc_id": doc_id,
        "language": bibliographic.get("language"),
        "country": bibliographic.get("country"),
        "doc_number": bibliographic.get("doc_number"),
        "application_number": bibliographic.get("application_number"),
        "publication_date": bibliographic.get("publication_date"),
        "ipc_classes": bibliographic.get("ipc_classes", []),
        "file":bibliographic.get("file")
    }

    # Title (en preferred)
    title_dict = bibliographic.get("title", {})
    title = title_dict.get("en") or next(iter(title_dict.values()), "")
    if title:
        documents.append(Document(
            page_content=title,
            metadata={**common_meta, "section": "title"}
        ))

    # Abstract
    abstract = bibliographic.get("abstract")
    if abstract:
        documents.append(Document(
            page_content=abstract,
            metadata={**common_meta, "section": "abstract"}
        ))

    # Claims
    for claim in json_data.get("claims", []):
        documents.append(Document(
            page_content=claim["text"],
            metadata={**common_meta, "section": "claim", "claim_number": claim.get("claim_number")}
        ))

    # Main sections
    for section in json_data.get("main_sections", []):
        section_name = section.get("heading_text", "UNKNOWN_SECTION")
        for p in section.get("paragraphs", []):
            documents.append(Document(
                page_content=f"{section_name}\n{p['text']}",
                metadata={**common_meta, "section": section_name, "p_id": p.get("p_id")}
            ))

    return documents


✅ Successfully imported from data_config.py
🔍 Testing JSON File Loading Functions


In [71]:

# def get_chunk_size(text: str, total_tokens: int, base_chunk_size: int = 350, min_chunk_size: int = 180) -> int:
#     # total_tokens = count_tokens(text)

#     # If it's small enough, return as one chunk
#     if total_tokens <= base_chunk_size:
#         return total_tokens

#     # Try to divide the tokens into balanced parts
#     num_splits = total_tokens // base_chunk_size
#     if total_tokens % base_chunk_size != 0:
#         num_splits += 1

#     # Compute new balanced chunk size
#     balanced_chunk_size = total_tokens // num_splits

#     # Make sure it doesn't go below a minimum
#     balanced_chunk_size = int(max(min_chunk_size, balanced_chunk_size))


#     return balanced_chunk_size 

In [72]:
from transformers import AutoTokenizer
from langchain.text_splitter import TokenTextSplitter
from langchain.schema.document import Document  # Required if your docs are raw strings

def chunk_documents(documents):
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

    # Define a helper to count tokens
    def count_tokens(text):
        return len(tokenizer.encode(text, truncation=False, add_special_tokens=False))

    processed_docs = []
    for doc in documents:
        # If doc is a raw string, wrap it in a Document
        if isinstance(doc, str):
            doc = Document(page_content=doc)

        token_count = count_tokens(doc.page_content)
        balanced_chunk_size = get_chunk_size(doc, token_count)  # You must define this function
        print("Token Count:", token_count)
        print("Balanced Chunk Size:", balanced_chunk_size)

        splitter = TokenTextSplitter(
            chunk_size=balanced_chunk_size,
            chunk_overlap=3,
                tokenizer=tokenizer  # ✅ important
        )

        chunks = splitter.split_documents([doc])
        print("Before:", len(processed_docs))
        processed_docs.extend(chunks)
        print("After:", len(processed_docs))

    return processed_docs


In [None]:

import os
os.environ["TRANSFORMERS_HTTP_TIMEOUT"] = "60"
# load the .json documents
file_list = get_epo_json_file_paths()[:100]
# file_list = glob.glob(json_files_path)

all_documents = []

for file_path in file_list:
    with open(file_path, "r") as f:
        data = json.load(f)
        docs = extract_documents(data)
        chunked_docs = chunk_documents(docs)
        all_documents.extend(chunked_docs)



In [None]:
for i, doc in enumerate(all_documents, start=1):
    if i > 2:
        break
    print(f"Document {i}")
    print("Page Content:", doc.page_content)
    print("Metadata: ", doc.metadata)
print("Wtf")

In [87]:
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer
import numpy as np

# Load tokenizer and embedding model
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def count_tokens(text):
    return len(tokenizer.encode(text, add_special_tokens=False))

def get_chunk_size(text: str, total_tokens: int, base_chunk_size: int = 350, min_chunk_size: int = 180) -> int:
    if total_tokens <= base_chunk_size:
        return total_tokens

    num_splits = total_tokens // base_chunk_size
    if total_tokens % base_chunk_size != 0:
        num_splits += 1


    balanced_chunk_size = math.ceil(total_tokens / num_splits)

    balanced_chunk_size = int(max(min_chunk_size, balanced_chunk_size))
    return balanced_chunk_size

def chunk_text_by_tokens(text: str, chunk_size: int, overlap: int = 30):
    print("Starting tokenization...")
    input_ids = tokenizer.encode(text, add_special_tokens=False)
    print(f"Tokenized text length: {len(input_ids)} tokens")
    
    # Safety check to prevent infinite loops or extremely slow processing
    if chunk_size <= overlap:
        overlap = 0
    
    chunks = []
    start = 0
    
    # Process in batches for large texts
    while start < len(input_ids):
        print(f"Processing chunk at position {start}/{len(input_ids)}")
        end = min(start + chunk_size, len(input_ids))
        chunk_ids = input_ids[start:end]
        
        try:
            chunk_text = tokenizer.decode(chunk_ids)
            chunks.append(chunk_text)
        except Exception as e:
            print(f"Error decoding chunk: {e}")
            # Skip this chunk or use a fallback approach
        
        # Advance position
        start += chunk_size - overlap
        
        # Safety check to prevent memory issues with too many chunks
        if len(chunks) > 1000:  # Arbitrary limit
            print("⚠️ Warning: Reached maximum number of chunks")
            break
    
    print(f"Created {len(chunks)} chunks")
    return chunks

def embed_document(text: str):
    total_tokens = count_tokens(text)
    chunk_size = get_chunk_size(text, total_tokens)

    chunks = chunk_text_by_tokens(text, chunk_size)
    embeddings = embedding_model.encode(chunks)

    return list(zip(chunks, embeddings))  # list of (chunk, embedding_vector)


2025-06-18 11:47:19,914 - INFO - Use pytorch device_name: cpu
2025-06-18 11:47:19,915 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2


In [28]:

file_list = get_epo_json_file_paths()[:100]
# file_list = glob.glob(json_files_path)

all_documents = []

for file_path in file_list:
    with open(file_path, "r") as f:
        data = json.load(f)
        docs = extract_documents(data)
        # If doc is a raw string, wrap it in a Document
        if isinstance(doc, str):
            doc = Document(page_content=doc)
        chunked_docs = embed_document(doc.page_content)
        all_documents.extend(chunked_docs)


📁 Found 1286 EPO JSON files


NameError: name 'doc' is not defined

In [25]:
first_chunk, first_embedding = all_documents[0]

In [26]:
first_chunk

"[ document ( metadata = { ' doc _ id ' : ' ep13899497b9w1 ', ' language ' : ' en ', ' country ' : ' ep ', ' doc _ number ' : ' 3084761 ', ' application _ number ' : ' 13899497. 5 ', ' publication _ date ' : ' 20250611 ', ' ipc _ classes ' : [ ' g10l 19 / 038 20130101afi20170426bhep ', ' g10l 19 / 07 20130101ali20170426bhep ' ], ' file ' : ' ep13899497w1b9. xml ', ' section ' : ' title ' }, page _ content = ' audio signal encoder ' ), document ( metadata = { ' doc _ id ' : ' ep13899497b9w1 ', ' language ' : ' en ', ' country ' : ' ep ', ' doc _ number ' : ' 3084761 ', ' application _ number ' : ' 13899497. 5 ', ' publication _ date ' : ' 20250611 ', ' ipc _ classes ' : [ ' g10l 19 / 038 20130101afi20170426bhep ', ' g10l 19 / 07 20130101ali20170426bhep ' ], ' file ' : ' ep13899497w1b9. xml ', ' section ' : ' claim ', ' claim _ number ' : ' 0001 '"

In [88]:
import time
from typing import List, Dict, Any
import logging
import numpy as np
from langchain.schema import Document

def embed_documents_with_metadata(documents, debug=True):
    """Process a list of Document objects, preserving metadata"""
    if debug:
        print(f"📋 Starting document embedding process for {len(documents)} documents")
        start_time = time.time()
    
    processed_docs = []
    
    # Track statistics for debugging
    total_tokens = 0
    total_chunks = 0
    errors = 0
    
    for doc_idx, doc in enumerate(documents):
        if debug and doc_idx % 10 == 0:  # Progress update every 10 docs
            print(f"⏳ Processing document {doc_idx+1}/{len(documents)} ({(doc_idx+1)/len(documents)*100:.1f}%)")
        
        try:
            # Get text content and metadata
            text = doc.page_content
            metadata = doc.metadata
            
            if debug and doc_idx < 3:  # Show sample of first few documents
                print(f"\n📄 Document {doc_idx+1} Preview:")
                print(f"  - Content: {text[:100]}..." if len(text) > 100 else f"  - Content: {text}")
                print(f"  - Metadata: {metadata}")

            
            # Calculate chunk size
            doc_tokens = count_tokens(text)
            total_tokens += doc_tokens
            chunk_size = get_chunk_size(text, doc_tokens)
            
            if debug and doc_idx < 3:
                print(f"  - Token count: {doc_tokens}")
                print(f"  - Calculated chunk size: {chunk_size}")
            
            # Create chunks
            chunk_start_time = time.time()
            print("x")
            chunks = chunk_text_by_tokens(text, chunk_size)
            print("y")
            total_chunks += len(chunks)
            
            if debug and doc_idx < 3:
                chunk_time = time.time() - chunk_start_time
                print(f"  - Chunks created: {len(chunks)} (in {chunk_time:.2f}s)")
                print(f"  - First chunk: {chunks[0][:50]}...")
            
            # Generate embeddings for all chunks
            embedding_start_time = time.time()
            try:
                
                batch_size = 8 
                all_embeddings = []
                
                for batch_idx in range(0, len(chunks), batch_size):
                    batch = chunks[batch_idx:batch_idx+batch_size]
                    print(f"  - Processing batch {batch_idx//batch_size + 1}/{(len(chunks)+batch_size-1)//batch_size}")
                    batch_embeddings = embedding_model.encode(batch, show_progress_bar=False)
                    all_embeddings.extend(batch_embeddings)
                
                embeddings = all_embeddings
                print("✓ Embedding completed")
            except Exception as e:
                print(f"❌ Error during embedding: {str(e)}")
                raise
            print("x")
            if debug and doc_idx < 3:
                embedding_time = time.time() - embedding_start_time
                print(f"  - Embeddings generated: {len(embeddings)} vectors of shape {embeddings[0].shape}")
                print(f"  - Embedding time: {embedding_time:.2f}s ({embedding_time/len(chunks):.4f}s per chunk)")
            
            # Create new documents with chunks and metadata
            for i, (chunk_text, embedding) in enumerate(zip(chunks, embeddings)):
                # Create a copy of metadata and add chunk information
                chunk_metadata = metadata.copy()
                chunk_metadata["chunk_index"] = i
                chunk_metadata["total_chunks"] = len(chunks)
                chunk_metadata["source_doc_idx"] = doc_idx
                
                # Check for NaN values in embedding
                if np.isnan(embedding).any():
                    if debug:
                        print(f"⚠️ Warning: NaN values detected in embedding for document {doc_idx}, chunk {i}")
                    # Replace NaN with zeros
                    embedding = np.nan_to_num(embedding)
                
                # Create a new Document object with chunk text and original metadata
                processed_docs.append({
                    "text": chunk_text,
                    "embedding": embedding,
                    "metadata": chunk_metadata
                })
                
        except Exception as e:
            errors += 1
            if debug:
                print(f"❌ Error processing document {doc_idx}: {str(e)}")
    
    if debug:
        end_time = time.time()
        total_time = end_time - start_time
        print(f"\n✅ Embedding completed in {total_time:.2f}s")
        print(f"📊 Statistics:")
        print(f"  - Documents processed: {len(documents)}")
        print(f"  - Total tokens: {total_tokens}")
        print(f"  - Total chunks created: {total_chunks}")
        print(f"  - Average chunks per document: {total_chunks/len(documents):.2f}")
        print(f"  - Processing speed: {total_tokens/total_time:.2f} tokens/second")
        print(f"  - Documents with errors: {errors}")
        print(f"  - Success rate: {(len(documents)-errors)/len(documents)*100:.2f}%")
        
        # Memory usage of embeddings
        embedding_size = sum(emb["embedding"].nbytes for emb in processed_docs)
        print(f"  - Embedding memory usage: {embedding_size/1024/1024:.2f} MB")
    
    return processed_docs



In [89]:
# Example usage:
if __name__ == "__main__":
    # Create some test documents
    test_docs = [
        Document(page_content="This is a test document with some content that will be embedded.",
                metadata={"source": "test", "id": 1}),
        Document(page_content="This is another document with different content for embedding.",
                metadata={"source": "test", "id": 2}),
    ]
    
    # Process with debug output
    result = embed_documents_with_metadata(test_docs, debug=True)
    
    # Show first result
    print("\n🔍 First Result Sample:")
    print(f"Text: {result[0]['text']}")
    print(f"Embedding shape: {result[0]['embedding'].shape}")
    print(f"Metadata: {result[0]['metadata']}")

📋 Starting document embedding process for 2 documents
⏳ Processing document 1/2 (50.0%)

📄 Document 1 Preview:
  - Content: This is a test document with some content that will be embedded.
  - Metadata: {'source': 'test', 'id': 1}
  - Token count: 13
  - Calculated chunk size: 13
x
Starting tokenization...
Tokenized text length: 13 tokens
Processing chunk at position 0/13
Created 1 chunks
y
  - Chunks created: 1 (in 0.00s)
  - First chunk: this is a test document with some content that wil...
  - Processing batch 1/1
✓ Embedding completed
x
  - Embeddings generated: 1 vectors of shape (384,)
  - Embedding time: 0.05s (0.0461s per chunk)

📄 Document 2 Preview:
  - Content: This is another document with different content for embedding.
  - Metadata: {'source': 'test', 'id': 2}
  - Token count: 12
  - Calculated chunk size: 12
x
Starting tokenization...
Tokenized text length: 12 tokens
Processing chunk at position 0/12
Created 1 chunks
y
  - Chunks created: 1 (in 0.00s)
  - First chunk: t

In [80]:

# Process all files
file_list = get_epo_json_file_paths()[:10]
all_documents = []

len(file_list)


📁 Found 1286 EPO JSON files


10

In [92]:
for file_path in file_list:
    print(file_path)
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
            # Extract documents (returns list of Document objects)
            docs = extract_documents(data)
            
            # Process each document with metadata preservation
            chunked_docs = embed_documents_with_metadata(docs)
            all_documents.extend(chunked_docs)
            
        print(f"Processed {file_path}, total documents: {len(all_documents)}")
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")

print(f"Total embedded chunks: {len(all_documents)}")

# Example of examining the first few documents
for i, doc in enumerate(all_documents[:3]):
    print(f"\nDocument {i+1}:")
    print(f"Text: {doc['text'][:100]}...")
    print(f"Embedding shape: {doc['embedding'].shape}")
    print(f"Metadata: {doc['metadata']}")

Token indices sequence length is longer than the specified maximum sequence length for this model (691 > 512). Running this sequence through the model will result in indexing errors


/app/data/parsed/EPO/EPRTBJV2025000024001001/EPW1B9/EP13899497W1B9/EP13899497W1B9.json
📋 Starting document embedding process for 180 documents
⏳ Processing document 1/180 (0.6%)

📄 Document 1 Preview:
  - Content: AUDIO SIGNAL ENCODER
  - Metadata: {'doc_id': 'EP13899497B9W1', 'language': 'en', 'country': 'EP', 'doc_number': '3084761', 'application_number': '13899497.5', 'publication_date': '20250611', 'ipc_classes': ['G10L  19/038       20130101AFI20170426BHEP', 'G10L  19/07        20130101ALI20170426BHEP'], 'file': 'EP13899497W1B9.xml', 'section': 'title'}
  - Token count: 5
  - Calculated chunk size: 5
x
Starting tokenization...
Tokenized text length: 5 tokens
Processing chunk at position 0/5
Created 1 chunks
y
  - Chunks created: 1 (in 0.00s)
  - First chunk: audio signal encoder...
  - Processing batch 1/1
✓ Embedding completed
x
  - Embeddings generated: 1 vectors of shape (384,)
  - Embedding time: 0.03s (0.0272s per chunk)

📄 Document 2 Preview:
  - Content: A processor-impleme

In [91]:
all_documents[3]

{'text': 'scale factor squared ; determining the best leader class associated with the single potential code vector which generates the smallest associated distance ; and sorting components of the best leader class by the reverse ordering of the descending order of absolute values of the components of the single vector of parameters to generate an output lattice - quantized vector.',
 'embedding': array([-1.39612034e-02,  3.35320830e-02, -6.83307201e-02, -3.19998823e-02,
        -2.53480002e-02,  5.13324663e-02, -4.18933146e-02, -6.43100543e-03,
        -2.48833969e-02,  4.77418443e-03,  4.21190932e-02,  3.78175043e-02,
         2.91607752e-02,  3.00353784e-02, -8.81722793e-02, -1.84398983e-02,
         1.23946741e-02,  1.27153769e-01, -8.33532736e-02, -6.15738258e-02,
         4.38563526e-02, -1.01426795e-01, -2.92504746e-02, -4.27244231e-03,
         7.42027014e-02, -2.93293186e-02,  3.31224091e-02,  3.88909392e-02,
         9.69608501e-02, -7.51155764e-02,  2.25826353e-02,  6.060751