1. Extract semantic chunks from your JSON (e.g., abstract, claim, paragraph).

2. Use HuggingFaceEmbeddings or OpenAIEmbeddings.

3. Store structured info as metadata.

4. Use metadata filtering + vector similarity for retrieval.


In [1]:
import sys
import os
from pathlib import Path
import time
import json

# Resolve the project layout relative to the notebook's working directory.
# NOTE: this assumes the kernel was started from inside the notebooks folder,
# so that its parent is the project root.
current_dir = Path.cwd()
project_root = current_dir.parent  # Go up one level from notebooks to project root
src_dir = project_root / "src"

# Make the project's source folders importable. The membership check keeps
# sys.path free of duplicate entries when this cell is executed again.
for source_folder in ("data_pipline", "EU_XML_data_loader"):
    source_path = str(src_dir / source_folder)
    if source_path not in sys.path:
        sys.path.append(source_path)


from data_pipline import DataPipeline
import get_raw_data_paths_EPO  
from xml_loader_EPO import process_xml_files_list


# Echo the resolved locations in one call (one item per line) so a wrong
# working directory is obvious at a glance.
print(
    f"📁 Current directory: {current_dir}",
    f"📁 Project root: {project_root}",
    f"📁 Source directory: {src_dir}",
    f"✅ Python paths configured",
    sep="\n",
)

📁 Current directory: /app/notebooks
📁 Project root: /app
📁 Source directory: /app/src
✅ Python paths configured


In [2]:
import json
import glob
import os
from langchain.docstore.document import Document
import sys
# Prepare .json file for embedding
def extract_documents(json_data):
    """Turn one parsed patent JSON record into a list of ``Document`` objects.

    One document is emitted for the title, one for the abstract, one per claim
    and one per paragraph of every main section. Every document carries the
    record's bibliographic fields as metadata plus a ``section`` label, and
    ``claim_number`` / ``p_id`` for claims and paragraphs respectively.

    Entries without usable text are skipped, and containers that are missing
    or ``null`` in the JSON are treated as empty instead of raising.

    Args:
        json_data: dict that may contain the keys ``bibliographic_data``,
            ``claims`` and ``main_sections``.

    Returns:
        list of ``Document`` in the order title, abstract, claims, paragraphs.
    """
    # "or {}" also covers a key that is present but set to null.
    bibliographic = json_data.get("bibliographic_data") or {}
    doc_id = bibliographic.get("doc_id", "UNKNOWN")
    documents = []

    # Common metadata to propagate
    common_meta = {
        "doc_id": doc_id,
        "language": bibliographic.get("language"),
        "country": bibliographic.get("country"),
        "doc_number": bibliographic.get("doc_number"),
        "application_number": bibliographic.get("application_number"),
        "publication_date": bibliographic.get("publication_date"),
        "ipc_classes": bibliographic.get("ipc_classes", []),
    }

    # Title (en preferred, otherwise the first non-empty translation)
    title_field = bibliographic.get("title") or {}
    if isinstance(title_field, dict):
        title = title_field.get("en") or next(
            (candidate for candidate in title_field.values() if candidate), ""
        )
    elif isinstance(title_field, str):
        # Tolerate records that store the title as a plain string.
        title = title_field
    else:
        title = ""
    if title:
        documents.append(Document(
            page_content=title,
            metadata={**common_meta, "section": "title"}
        ))

    # Abstract
    abstract = bibliographic.get("abstract")
    if abstract:
        documents.append(Document(
            page_content=abstract,
            metadata={**common_meta, "section": "abstract"}
        ))

    # Claims
    for claim in json_data.get("claims") or []:
        claim_text = claim.get("text")
        if not claim_text:
            # Nothing to embed; skipping avoids a KeyError on incomplete entries.
            continue
        documents.append(Document(
            page_content=claim_text,
            metadata={**common_meta, "section": "claim", "claim_number": claim.get("claim_number")}
        ))

    # Main sections
    for section in json_data.get("main_sections") or []:
        # "or" (rather than a .get default) also replaces a null/empty heading.
        section_name = section.get("heading_text") or "UNKNOWN_SECTION"
        for paragraph in section.get("paragraphs") or []:
            paragraph_text = paragraph.get("text")
            if not paragraph_text:
                continue
            documents.append(Document(
                # The heading is prepended so each paragraph keeps its context.
                page_content=f"{section_name}\n{paragraph_text}",
                metadata={**common_meta, "section": section_name, "p_id": paragraph.get("p_id")}
            ))

    return documents


In [3]:
# Test the new JSON loading functionality
json_loader_dir = str(src_dir / "data_pipline" / "json_loader")
if json_loader_dir not in sys.path:  # avoid stacking duplicates when the cell is re-run
    sys.path.append(json_loader_dir)

# Import the JSON loader functions
from json_loader_epo import get_epo_json_file_paths, get_all_json_file_paths, load_json_documents

print("🔍 Testing JSON File Loading Functions")
print("=" * 50)

✅ Successfully imported from data_config.py
🔍 Testing JSON File Loading Functions


In [22]:

# NOTE(review): confirm that TRANSFORMERS_HTTP_TIMEOUT is actually read by the
# installed Hugging Face libraries; it is set here before the model is loaded.
os.environ["TRANSFORMERS_HTTP_TIMEOUT"] = "60"
# load the .json documents (only the first 50 files, to keep the run short)
file_list = get_epo_json_file_paths()[:50]
# file_list = glob.glob(json_files_path)

all_documents = []

# Preprocessing documents
for file_path in file_list:
    # Decode explicitly as UTF-8 so non-ASCII patent text does not depend on
    # the platform's default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    # The file is already closed here; extraction only needs the parsed dict.
    docs = extract_documents(data)
    all_documents.extend(docs)

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Cut every document into overlapping pieces; add_start_index makes each
# piece record its offset within the parent text under "start_index".
splitter_settings = {
    "chunk_size": 512,
    "chunk_overlap": 50,
    "add_start_index": True,
}
splitter = RecursiveCharacterTextSplitter(**splitter_settings)
all_chunks = splitter.split_documents(all_documents)

# Quick sanity check: overall count plus the first chunk as a sample.
print("Total chunks: ", len(all_chunks))
print("Chunks: ", all_chunks[0])


📁 Found 1286 EPO JSON files
Total chunks:  2160
Chunks:  page_content='AUDIO SIGNAL ENCODER' metadata={'doc_id': 'EP13899497B9W1', 'language': 'en', 'country': 'EP', 'doc_number': '3084761', 'application_number': '13899497.5', 'publication_date': '20250611', 'ipc_classes': ['G10L  19/038       20130101AFI20170426BHEP', 'G10L  19/07        20130101ALI20170426BHEP'], 'section': 'title', 'start_index': 0}


In [23]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Embed all chunks with one batched call. embed_documents is the interface
# intended for corpus text (embed_query is for search queries) and avoids one
# separate model invocation per chunk.
vectors = embeddings.embed_documents([chunk.page_content for chunk in all_chunks])

# for i, vector in enumerate(vectors[:5]):
#     print(f"Vector {i}: {vector}")

2025-06-18 13:58:28,105 - INFO - Use pytorch device_name: cpu
2025-06-18 13:58:28,107 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


In [24]:
from langchain_chroma import Chroma
from langchain_community.vectorstores.utils import filter_complex_metadata

# Chunk metadata contains the list-valued "ipc_classes" field, which Chroma
# does not store; strip such values once and reuse the cleaned chunks for
# every store below.
filtered_chunks = filter_complex_metadata(all_chunks)

vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

# NOTE(review): `vectorstore` (no underscore) is a second collection built
# without a persist directory and is not queried in this cell — confirm it is
# still needed, since it embeds every chunk again.
vectorstore = Chroma.from_documents(filtered_chunks, embeddings)

# Add the *filtered* chunks: passing the raw chunks would hand list metadata
# to Chroma.
# NOTE(review): no explicit ids are passed, so re-running this cell presumably
# adds the same chunks to the persisted collection again — confirm.
ids = vector_store.add_documents(documents=filtered_chunks)

# Plain similarity search: show only the best-matching chunk.
keyword_query = "How many paper has written about APPARATUS?"
results = vector_store.similarity_search(keyword_query)
print(results[0])

# Same kind of lookup, but also returning the score of each hit.
# NOTE(review): with Chroma the score is presumably a distance (lower means
# closer) — confirm before interpreting it as a similarity.
abstract_query = "Which paper writes this: The invention relates to a substance determining apparatus and substance determining method for determining a substance within a fluid. The invention relates further to a binding device and an analyzing device for cooperating with each other for determining a substance within a fluid, an analyzing method for determining a substance within a fluid and an analyzing computer program for determining a substance within a fluid."
results = vector_store.similarity_search_with_score(abstract_query)
doc, score = results[0]
print(f"Score: {score}\n")
print(doc)

page_content='Summary
There is provided according to a first aspect an apparatus according to claim 1.' metadata={'section': 'Summary', 'country': 'EP', 'start_index': 0, 'application_number': '17876867.7', 'language': 'en', 'publication_date': '20250611', 'doc_id': 'EP17876867B1', 'p_id': 'p0009', 'doc_number': '3549354'}
Score: 0.893546998500824

page_content='Definition of the invention
The composition is preferably fluid.' metadata={'publication_date': '20250611', 'p_id': 'p0046', 'country': 'EP', 'doc_number': '4440377', 'doc_id': 'EP22818205B1', 'application_number': '22818205.1', 'section': 'Definition of the invention', 'start_index': 0, 'language': 'en'}
