1. Extract semantic chunks from your JSON (e.g., abstract, claim, paragraph).

2. Use HuggingFaceEmbeddings or OpenAIEmbeddings.

3. Store structured info as metadata.

4. Use metadata filtering + vector similarity for retrieval.


In [8]:
import sys
import os
from pathlib import Path
import time
import json

# Add source directories to Python path
current_dir = Path.cwd()
project_root = current_dir.parent  # Go up one level from notebooks to project root
src_dir = project_root / "src"

# Add paths
sys.path.append(str(src_dir / "data_pipline"))
sys.path.append(str(src_dir / "EU_XML_data_loader"))


from data_pipline import DataPipeline
import get_raw_data_paths_EPO  
from xml_loader_EPO import process_xml_files_list


print(f"📁 Current directory: {current_dir}")
print(f"📁 Project root: {project_root}")
print(f"📁 Source directory: {src_dir}")
print(f"✅ Python paths configured")

📁 Current directory: /app/notebooks
📁 Project root: /app
📁 Source directory: /app/src
✅ Python paths configured


In [9]:
import json
import glob
import os
from langchain.docstore.document import Document
import sys
# Prepare .json file for embedding
def extract_documents(json_data):
    bibliographic = json_data.get("bibliographic_data", {})
    doc_id = bibliographic.get("doc_id", "UNKNOWN")
    documents = []

    # Common metadata to propagate
    common_meta = {
        "doc_id": doc_id,
        "language": bibliographic.get("language"),
        "country": bibliographic.get("country"),
        "doc_number": bibliographic.get("doc_number"),
        "application_number": bibliographic.get("application_number"),
        "publication_date": bibliographic.get("publication_date"),
        "ipc_classes": bibliographic.get("ipc_classes", []),
        "file":bibliographic.get("file")
    }

    # Title (en preferred)
    title_dict = bibliographic.get("title", {})
    title = title_dict.get("en") or next(iter(title_dict.values()), "")
    if title:
        documents.append(Document(
            page_content=title,
            metadata={**common_meta, "section": "title"}
        ))

    # Abstract
    abstract = bibliographic.get("abstract")
    if abstract:
        documents.append(Document(
            page_content=abstract,
            metadata={**common_meta, "section": "abstract"}
        ))

    # Claims
    for claim in json_data.get("claims", []):
        documents.append(Document(
            page_content=claim["text"],
            metadata={**common_meta, "section": "claim", "claim_number": claim.get("claim_number")}
        ))

    # Main sections
    for section in json_data.get("main_sections", []):
        section_name = section.get("heading_text", "UNKNOWN_SECTION")
        for p in section.get("paragraphs", []):
            documents.append(Document(
                page_content=f"{section_name}\n{p['text']}",
                metadata={**common_meta, "section": section_name, "p_id": p.get("p_id")}
            ))

    return documents


In [10]:
# Test the new JSON loading functionality
sys.path.append(str(src_dir / "data_pipline" / "json_loader"))

# Import the JSON loader functions
from json_loader_epo import get_epo_json_file_paths, get_all_json_file_paths, load_json_documents

print("🔍 Testing JSON File Loading Functions")
print("=" * 50)

🔍 Testing JSON File Loading Functions


In [11]:

def get_chunk_size(text: str, total_tokens: int, base_chunk_size: int = 1000, min_chunk_size: int = 300) -> int:
    # total_tokens = count_tokens(text)

    # If it's small enough, return as one chunk
    if total_tokens <= base_chunk_size:
        return total_tokens

    # Try to divide the tokens into balanced parts
    num_splits = total_tokens // base_chunk_size
    if total_tokens % base_chunk_size != 0:
        num_splits += 1

    # Compute new balanced chunk size
    balanced_chunk_size = total_tokens // num_splits

    # Make sure it doesn't go below a minimum
    balanced_chunk_size = int(max(min_chunk_size, balanced_chunk_size))


    return balanced_chunk_size 

In [12]:
# chunk documents
from transformers import AutoTokenizer
from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_documents(documents):
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

    processed_docs = []
    for doc in documents:
        token_count =len(str(doc))
        balanced_chunk_size = get_chunk_size(doc, token_count)
        print("Token Count: ", token_count)
        print("balanced_chunk_size: ", balanced_chunk_size)
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=balanced_chunk_size,
            # chunk_size=500,
            chunk_overlap=50
            )

        chunks = splitter.split_documents([doc])
        # print("Chunks: ", chunks)
        print(len(processed_docs))
        processed_docs.extend(chunks)
        print(len(processed_docs))
        # processed_docs.append(doc)
    return processed_docs


In [None]:

import os
os.environ["TRANSFORMERS_HTTP_TIMEOUT"] = "60"
# load the .json documents
file_list = get_epo_json_file_paths()[:100]
# file_list = glob.glob(json_files_path)

all_documents = []

for file_path in file_list:
    with open(file_path, "r") as f:
        data = json.load(f)
        docs = extract_documents(data)
        chunked_docs = chunk_documents(docs)
        all_documents.extend(chunked_docs)



📁 Found 1286 EPO JSON files
Token Count:  360
balanced_chunk_size:  360
0
1
Token Count:  4152
balanced_chunk_size:  830
1
6
Token Count:  553
balanced_chunk_size:  553
6
7
Token Count:  687
balanced_chunk_size:  687
7
8
Token Count:  833
balanced_chunk_size:  833
8
9
Token Count:  4756
balanced_chunk_size:  951
9
14
Token Count:  1055
balanced_chunk_size:  527
14
16
Token Count:  733
balanced_chunk_size:  733
16
17
Token Count:  832
balanced_chunk_size:  832
17
18
Token Count:  519
balanced_chunk_size:  519
18
19
Token Count:  558
balanced_chunk_size:  558
19
20
Token Count:  908
balanced_chunk_size:  908
20
21
Token Count:  566
balanced_chunk_size:  566
21
22
Token Count:  867
balanced_chunk_size:  867
22
23
Token Count:  520
balanced_chunk_size:  520
23
24
Token Count:  544
balanced_chunk_size:  544
24
25
Token Count:  752
balanced_chunk_size:  752
25
26
Token Count:  501
balanced_chunk_size:  501
26
27
Token Count:  618
balanced_chunk_size:  618
27
28
Token Count:  704
balanced_chu

In [None]:
for i, doc in enumerate(all_documents, start=1):
    if i > 2:
        break
    print(f"Document {i}")
    print("Page Content:", doc.page_content)
    print("Metadata: ", doc.metadata)
print("Wtf")