In [12]:
import sys
import os
from pathlib import Path
import time
import json
import math

# Add source directories to Python path
current_dir = Path.cwd()
project_root = current_dir.parent  # Go up one level from notebooks to project root
src_dir = project_root / "src"

# Add paths - FIXED to include embeddings directory
sys.path.append(str(src_dir / "data_pipline"))
sys.path.append(str(src_dir / "EU_XML_data_loader"))
sys.path.append(str(src_dir / "embeddings"))  # Add this line to include embeddings

# Other imports
from data_pipline import DataPipeline
import get_raw_data_paths_EPO  
from xml_loader_EPO import process_xml_files_list

# Import the embeddings module
from embeddings import DocumentEmbedder, batch_process_json_files, extract_documents_epo

# Import the JSON loader functions
sys.path.append(str(src_dir / "data_pipline" / "json_loader"))
from json_loader_epo import get_epo_json_file_paths, get_all_json_file_paths, load_json_documents
from embeddings import DocumentEmbedder, EmbeddingConfig

print("🔍 Testing JSON File Loading Functions")
print("=" * 50)

print(f"📁 Current directory: {current_dir}")
print(f"📁 Project root: {project_root}")
print(f"📁 Source directory: {src_dir}")
print(f"✅ Python paths configured")

🔍 Testing JSON File Loading Functions
📁 Current directory: /app/notebooks
📁 Project root: /app
📁 Source directory: /app/src
✅ Python paths configured


In [13]:

import sys
import os
from pathlib import Path
import time
import json
import math

# Add source directories to Python path
current_dir = Path.cwd()
project_root = current_dir.parent  # Go up one level from notebooks to project root
src_dir = project_root / "src"




In [14]:

# Create custom configuration
config = EmbeddingConfig(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    base_chunk_size=300,  # Larger chunks
    min_chunk_size=200,   # Higher minimum
    chunk_overlap=50,     # More overlap
    batch_size=16,        # Larger batches
    enable_debug=False    # Less verbose output
)

file_list = get_epo_json_file_paths()[:40]
# Initialize with custom config
embedder = DocumentEmbedder(config=config)

# Rest of your workflow remains the same
embedder.add_file_paths(file_list)
embedder.load_json_files(extract_fn=extract_documents_epo)
processed_docs = embedder.process_all_documents()

# for i, doc in enumerate(processed_docs[:3]):
#     print(f"\nDoc {i}:\n", doc)

from langchain_core.documents import Document

processed_docs = [
    Document(page_content=doc["text"], metadata=doc.get("metadata", {}))
    for doc in processed_docs
]

2025-06-20 07:17:48,226 - INFO - Initializing models with: sentence-transformers/all-MiniLM-L6-v2


📁 Found 1286 EPO JSON files


2025-06-20 07:17:48,549 - INFO - Use pytorch device_name: cpu
2025-06-20 07:17:48,549 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
2025-06-20 07:17:49,867 - INFO - Models initialized successfully
2025-06-20 07:17:49,888 - INFO - Added 40 valid files. Total files to process: 40
2025-06-20 07:17:49,904 - INFO - Loaded 632 documents from 40 files (0 errors)
2025-06-20 07:17:49,904 - INFO - Total documents in memory: 632
Token indices sequence length is longer than the specified maximum sequence length for this model (691 > 512). Running this sequence through the model will result in indexing errors
2025-06-20 07:17:58,100 - INFO - ✅ Processed 683 document chunks from 632 documents


In [19]:
from langchain_chroma import Chroma
from langchain_community.vectorstores.utils import filter_complex_metadata

from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

vectors = []

for chunk in processed_docs:
    vector = embeddings.embed_query(chunk.page_content)
    vectors.append(vector)

# #Create the vector store from documents
vector_store = Chroma.from_documents(
    documents=filter_complex_metadata(processed_docs),
    embedding=embeddings,
    collection_name="example_collection",
    # persist_directory="./chroma_langchain_db"
)


2025-06-20 07:35:43,819 - INFO - Use pytorch device_name: cpu
2025-06-20 07:35:43,820 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


In [20]:
# vector_store = Chroma(
#     collection_name="example_collection",
#     embedding_function=embeddings,
#     persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
# )

ids = vector_store.add_documents(documents=processed_docs)

query = "I want to patent a system that synchronizes audio signals from different speakers in a room."
embedded_query = embeddings.embed_query(query)

results = vector_store.similarity_search_with_score(query, k=3)


for i, (doc, score) in enumerate(results):
    print(f"\nResult #{i+1}")
    print("-" * 40)
    print(f"🔹 Score: {score:.4f}")
    print(f"Title: {doc.metadata.get('title')}")
    print(f"File: {doc.metadata.get('file')}")
    print(f"📘 Section: {doc.metadata.get('section')}")
    print(f"📄 Application No.: {doc.metadata.get('application_number')}")
    print(f"📅 Publication Date: {doc.metadata.get('publication_date')}")
    print(f"🌍 Country: {doc.metadata.get('country')}")
    print(f"🆔 Patent ID: {doc.metadata.get('doc_id')}")
    print(f"📑 Match:\n\"{doc.page_content}\"\n")


Result #1
----------------------------------------
🔹 Score: 1.0109
Title: BEAMFORMING USING AN IN-EAR AUDIO DEVICE
File: EP19762032W1B8.xml
📘 Section: title
📄 Application No.: 19762032.1
📅 Publication Date: 20250611
🌍 Country: EP
🆔 Patent ID: EP19762032B8W1
📑 Match:
"beamforming using an in - ear audio device"


Result #2
----------------------------------------
🔹 Score: 1.0109
Title: BEAMFORMING USING AN IN-EAR AUDIO DEVICE
File: EP19762032W1B8.xml
📘 Section: title
📄 Application No.: 19762032.1
📅 Publication Date: 20250611
🌍 Country: EP
🆔 Patent ID: EP19762032B8W1
📑 Match:
"beamforming using an in - ear audio device"


Result #3
----------------------------------------
🔹 Score: 1.0109
Title: BEAMFORMING USING AN IN-EAR AUDIO DEVICE
File: EP19762032W1B8.xml
📘 Section: title
📄 Application No.: 19762032.1
📅 Publication Date: 20250611
🌍 Country: EP
🆔 Patent ID: EP19762032B8W1
📑 Match:
"beamforming using an in - ear audio device"

