1. Extract semantic chunks from your JSON (e.g., abstract, claim, paragraph).

2. Use HuggingFaceEmbeddings or OpenAIEmbeddings.

3. Store structured info as metadata.

4. Use metadata filtering + vector similarity for retrieval.


In [2]:
import sys
import os
from pathlib import Path
import time
import json

# Resolve the project layout relative to the notebook's working directory.
# NOTE(review): this assumes the kernel's working directory is `<project>/notebooks`;
# started from anywhere else, `project_root` points at the wrong directory.
current_dir = Path.cwd()
project_root = current_dir.parent  # Go up one level from notebooks to project root
src_dir = project_root / "src"


def add_to_sys_path(directory):
    """Append ``directory`` to ``sys.path`` unless it is already listed.

    Re-running the cell would otherwise add the same entry again on every
    execution.

    Args:
        directory: path-like object or string naming the directory to add.

    Returns:
        True when the entry was appended, False when it was already present.
    """
    entry = str(directory)
    if entry in sys.path:
        return False
    sys.path.append(entry)
    return True


# The "data_pipline" spelling is kept as-is: it has to match the directory
# (and module) name used by the imports that follow.
for package_dir in ("data_pipline", "EU_XML_data_loader"):
    add_to_sys_path(src_dir / package_dir)


from data_pipline import DataPipeline
import get_raw_data_paths_EPO  
from xml_loader_EPO import process_xml_files_list


# Echo the resolved locations so a wrong working directory is easy to spot.
print(
    f"📁 Current directory: {current_dir}",
    f"📁 Project root: {project_root}",
    f"📁 Source directory: {src_dir}",
    f"✅ Python paths configured",
    sep="\n",
)

📁 Current directory: /app/notebooks
📁 Project root: /app
📁 Source directory: /app/src
✅ Python paths configured


In [3]:
import json
import glob
import os
from langchain.docstore.document import Document
import sys
# Prepare .json file for embedding
def extract_documents(json_data):
    """Turn one parsed patent JSON record into a list of ``Document`` objects.

    One document is produced for the title, one for the abstract, one per
    claim and one per paragraph of every main section. Each document carries
    the record's bibliographic fields as metadata plus a ``section`` label,
    and ``claim_number`` or ``p_id`` where applicable.

    Entries that are missing, ``None`` or have empty text are skipped instead
    of raising, so one incomplete record does not abort a batch load.

    Args:
        json_data: dict with the optional keys ``bibliographic_data``,
            ``claims`` and ``main_sections``.

    Returns:
        A list of ``Document``; empty when the record holds no usable text.
    """
    # `or {}` / `or []` also cover keys that are present but explicitly null.
    bibliographic = json_data.get("bibliographic_data") or {}
    doc_id = bibliographic.get("doc_id", "UNKNOWN")
    documents = []

    # Common metadata to propagate to every document of this record.
    # Note: `ipc_classes` is a list; stores that only accept scalar metadata
    # values need it filtered out (or flattened) by the caller before indexing.
    common_meta = {
        "doc_id": doc_id,
        "language": bibliographic.get("language"),
        "country": bibliographic.get("country"),
        "doc_number": bibliographic.get("doc_number"),
        "application_number": bibliographic.get("application_number"),
        "publication_date": bibliographic.get("publication_date"),
        "ipc_classes": bibliographic.get("ipc_classes", []),
    }

    # Title: English preferred, otherwise the first non-empty translation.
    title_field = bibliographic.get("title") or {}
    if isinstance(title_field, str):
        # Tolerate a plain-string title as well as a {language: text} mapping.
        title = title_field
    else:
        title = title_field.get("en") or next(
            (text for text in title_field.values() if text), ""
        )
    if title:
        documents.append(Document(
            page_content=title,
            metadata={**common_meta, "section": "title"}
        ))

    # Abstract
    abstract = bibliographic.get("abstract")
    if abstract:
        documents.append(Document(
            page_content=abstract,
            metadata={**common_meta, "section": "abstract"}
        ))

    # Claims: skip entries without text rather than raising KeyError.
    for claim in json_data.get("claims") or []:
        claim_text = claim.get("text")
        if not claim_text:
            continue
        documents.append(Document(
            page_content=claim_text,
            metadata={**common_meta, "section": "claim", "claim_number": claim.get("claim_number")}
        ))

    # Main sections: the heading is prefixed to every paragraph so each chunk
    # keeps its context when embedded on its own.
    for section in json_data.get("main_sections") or []:
        section_name = section.get("heading_text")
        if section_name is None:
            section_name = "UNKNOWN_SECTION"
        for paragraph in section.get("paragraphs") or []:
            paragraph_text = paragraph.get("text")
            if not paragraph_text:
                continue
            documents.append(Document(
                page_content=f"{section_name}\n{paragraph_text}",
                metadata={**common_meta, "section": section_name, "p_id": paragraph.get("p_id")}
            ))

    return documents


In [4]:
# Test the new JSON loading functionality.
# Make the JSON loader directory importable; the membership check keeps
# sys.path free of duplicate entries when this cell is executed again.
json_loader_dir = str(src_dir / "data_pipline" / "json_loader")
if json_loader_dir not in sys.path:
    sys.path.append(json_loader_dir)

# Import the JSON loader functions
from json_loader_epo import get_epo_json_file_paths, get_all_json_file_paths, load_json_documents

print("🔍 Testing JSON File Loading Functions")
print("=" * 50)

✅ Successfully imported from data_config.py
🔍 Testing JSON File Loading Functions


In [5]:

# NOTE(review): confirm the installed Hugging Face libraries actually read this
# variable; huggingface_hub documents HF_HUB_DOWNLOAD_TIMEOUT / HF_HUB_ETAG_TIMEOUT
# for download timeouts.
os.environ["TRANSFORMERS_HTTP_TIMEOUT"] = "60"
# load the .json documents (only the first 50 files, to keep the run short)
file_list = get_epo_json_file_paths()[:50]
# file_list = glob.glob(json_files_path)

all_documents = []

# Preprocessing documents: parse each file and flatten it into per-section Documents.
for file_path in file_list:
    # JSON is UTF-8 by specification; do not rely on the platform's default
    # encoding, which can fail on non-ASCII text.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    # The file can be closed before the (pure in-memory) extraction runs.
    docs = extract_documents(data)
    all_documents.extend(docs)

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split long texts into overlapping chunks. `add_start_index=True` records each
# chunk's offset within its source text as `start_index` in the metadata.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=50,
    add_start_index=True,
)
all_chunks = splitter.split_documents(all_documents)
print("Total chunks: ", len(all_chunks))
# Preview only when there is something to show: indexing an empty list would
# raise IndexError if no input file yielded any text.
if all_chunks:
    print("Chunks: ", all_chunks[0])


📁 Found 1286 EPO JSON files
Total chunks:  2160
Chunks:  page_content='AUDIO SIGNAL ENCODER' metadata={'doc_id': 'EP13899497B9W1', 'language': 'en', 'country': 'EP', 'doc_number': '3084761', 'application_number': '13899497.5', 'publication_date': '20250611', 'ipc_classes': ['G10L  19/038       20130101AFI20170426BHEP', 'G10L  19/07        20130101ALI20170426BHEP'], 'section': 'title', 'start_index': 0}


In [6]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Embed every chunk text in one batched call instead of invoking the model once
# per chunk; the result is still one vector per chunk, in chunk order.
# NOTE(review): nothing in the visible cells consumes `vectors` afterwards; the
# Chroma store is given `embeddings` and embeds the chunks itself.
vectors = embeddings.embed_documents([chunk.page_content for chunk in all_chunks])

# for i, vector in enumerate(vectors[:5]):
#     print(f"Vector {i}: {vector}")

2025-06-18 15:48:03,712 - INFO - Use pytorch device_name: cpu
2025-06-18 15:48:03,713 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [17]:
from langchain_chroma import Chroma
from langchain_community.vectorstores.utils import filter_complex_metadata

vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

# Drop metadata values that are not plain scalars (e.g. the `ipc_classes` list)
# explicitly, instead of relying on a side effect of building a second store.
indexable_chunks = filter_complex_metadata(all_chunks)

# Index the chunks once, into the persisted collection that is queried below.
# The cell used to also build a separate store via Chroma.from_documents that
# was never queried, which embedded every chunk a second time.
# NOTE(review): ids are auto-generated, so re-running this cell adds the same
# chunks to the persisted collection again; clear the collection or pass stable
# ids if duplicates matter.
ids = vector_store.add_documents(documents=indexable_chunks)

# Alias kept so any code that refers to `vectorstore` continues to work.
vectorstore = vector_store

# results = vector_store.similarity_search("How many paper has written about APPARATUS?")
# print(results[0])

# Each result is a (Document, score) pair.
# NOTE(review): the score looks like a distance (lower = closer match); confirm
# against the Chroma documentation before interpreting it.
results = vector_store.similarity_search_with_score(
    "Which paper mentions about electronics")
doc, score = results[0]
print(f"Score: {score}\n")
print(doc)


Score: 1.0281710624694824

page_content='Description of Some Embodiments of the Application
As used in this application, the term 'circuitry' refers to all of the following:' metadata={'country': 'EP', 'language': 'en', 'doc_id': 'EP13899497B9W1', 'section': 'Description of Some Embodiments of the Application', 'p_id': 'p0171', 'start_index': 0, 'application_number': '13899497.5', 'doc_number': '3084761', 'publication_date': '20250611'}


In [10]:
# Show the second-ranked hit as a (Document, score) tuple.
runner_up = results[1]
print(runner_up)

(Document(id='516650fa-da53-46fe-b63d-ec08d845d7c4', metadata={'doc_id': 'EP13899497B9W1', 'application_number': '13899497.5', 'section': 'Description of Some Embodiments of the Application', 'p_id': 'p0015', 'doc_number': '3084761', 'country': 'EP', 'language': 'en', 'publication_date': '20250611', 'start_index': 0}, page_content='Description of Some Embodiments of the Application\nThe electronic device or apparatus 10 in some embodiments comprises a microphone 11, which is linked via an analogue-to-digital converter (ADC) 14 to a processor 21. The processor 21 is further linked via a digital-to-analogue (DAC) converter 32 to loudspeakers 33. The processor 21 is further linked to a transceiver (RX/TX) 13, to a user interface (UI) 15 and to a memory 22.'), 1.0854408740997314)
