In [1]:
import sys
import os
from pathlib import Path
import time
import json
import math


def _add_to_sys_path(directory: Path) -> None:
    """Append ``directory`` to ``sys.path`` unless it is already listed.

    Keeps the cell idempotent: re-running it no longer grows ``sys.path``
    with duplicate entries.
    """
    entry = str(directory)
    if entry not in sys.path:
        sys.path.append(entry)


# Resolve the project layout from the kernel's working directory.
# project_root is simply the parent of the working directory, so the kernel
# is expected to be started from the notebooks folder.
current_dir = Path.cwd()
project_root = current_dir.parent
src_dir = project_root / "src"

# The project modules are imported by bare module name, so their folders
# must be on sys.path before the imports below can resolve.
# NOTE: "data_pipline" is the directory name used by this cell's paths and
# imports; do not "correct" the spelling here.
for module_dir in (
    src_dir / "data_pipline",
    src_dir / "EU_XML_data_loader",
    src_dir / "embeddings",
):
    _add_to_sys_path(module_dir)

# Project-local imports
from data_pipline import DataPipeline
import get_raw_data_paths_EPO
from xml_loader_EPO import process_xml_files_list
from embeddings import (
    DocumentEmbedder,
    EmbeddingConfig,
    batch_process_json_files,
    extract_documents_epo,
)

# The json_loader folder is added only after the imports above, exactly as in
# the original cell, so module resolution for those imports is unchanged.
_add_to_sys_path(src_dir / "data_pipline" / "json_loader")
from json_loader_epo import get_epo_json_file_paths, get_all_json_file_paths, load_json_documents

print("🔍 Testing JSON File Loading Functions")
print("=" * 50)

print(f"📁 Current directory: {current_dir}")
print(f"📁 Project root: {project_root}")
print(f"📁 Source directory: {src_dir}")
print("✅ Python paths configured")

✅ Successfully imported from data_config.py
🔍 Testing JSON File Loading Functions
📁 Current directory: /app/notebooks
📁 Project root: /app
📁 Source directory: /app/src
✅ Python paths configured


In [2]:

import json
import math
import os
import sys
import time
from pathlib import Path

# Derive the project locations from the kernel's working directory:
# the project root is its parent, and the sources live in "<root>/src".
current_dir = Path.cwd()
project_root = current_dir.parent
src_dir = project_root.joinpath("src")




In [3]:

# Number of EPO JSON files to embed in this run. Kept small so the cell
# finishes quickly; raise it to index more of the corpus.
N_SAMPLE_FILES = 2

# Embedding / chunking settings handed to DocumentEmbedder.
# NOTE(review): the units of the chunk-size and overlap values (tokens vs.
# characters) are defined by EmbeddingConfig - confirm there before tuning.
config = EmbeddingConfig(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    base_chunk_size=300,
    min_chunk_size=200,
    chunk_overlap=50,
    batch_size=16,
    enable_debug=False,
)

# Take only the first N_SAMPLE_FILES paths returned by the loader.
file_list = get_epo_json_file_paths()[:N_SAMPLE_FILES]

# Fail early with a clear message instead of building an empty index that
# only shows up later as an empty or confusing search result.
if len(file_list) == 0:
    raise FileNotFoundError(
        "No EPO JSON files were found; nothing to embed."
    )

# Register the files, load their documents, then process them.
embedder = DocumentEmbedder(config=config)
embedder.add_file_paths(file_list)
embedder.load_json_files(extract_fn=extract_documents_epo)
processed_docs = embedder.process_all_documents()

2025-06-18 17:12:31,600 - INFO - Initializing models with: sentence-transformers/all-MiniLM-L6-v2


📁 Found 1286 EPO JSON files


2025-06-18 17:12:31,949 - INFO - Use pytorch device_name: cpu
2025-06-18 17:12:31,950 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
2025-06-18 17:12:33,679 - INFO - Models initialized successfully
2025-06-18 17:12:33,680 - INFO - Added 2 valid files. Total files to process: 2
2025-06-18 17:12:33,685 - INFO - Loaded 196 documents from 2 files (0 errors)
2025-06-18 17:12:33,685 - INFO - Total documents in memory: 196
Token indices sequence length is longer than the specified maximum sequence length for this model (691 > 512). Running this sequence through the model will result in indexing errors
2025-06-18 17:12:36,293 - INFO - ✅ Processed 206 document chunks from 196 documents


In [8]:

# Query text for the long-query similarity search in this cell.
# It is longer than a single embedding pass handles: the recorded log for
# this cell reports it as 690 tokens and shows it being chunked before
# embedding.
# NOTE(review): judging by the recorded top hit (similarity 0.9999, section
# "claim"), this appears to be claim text of a document already in the index,
# i.e. a self-retrieval sanity check - confirm.
# The literal is kept verbatim, including its leading/trailing newlines and
# the line break in the middle of the sentence.
long_patent_description = """
A processor-implemented method for encoding at least one audio signal, wherein the method comprises: generating at least one vector of parameters defining the at least one audio signal; sorting absolute-valued components of the at least one vector of parameters according to a descending order of the absolute values of the components of the at least one vector of parameters to generate an associated at least one ordered vector of parameters; selecting from a list of leader classes at least one potential code vector; performing, for each single of the selected at least one potential code vector individually and for each single of the at least one ordered vector of parameters individually, a step of determining a distance between the single potential code vector and the single ordered vector of parameters, wherein the step of determining comprises: (a) generating a first and a second intermediary distance value, respectively, wherein the first intermediary distance value is given by the sum of the products of the corresponding components of the single potential code vector and the single ordered vector of parameters and the second intermediary distance value is given by the sum of the squares of the components of the single potential code vector; (b1) updating the first intermediary distance value by subtracting the product of a last component of the single potential code vector and a last component of the single ordered vector of parameters from the first intermediary distance value and updating the second intermediary distance value by adding the square of the last component of the single potential code vector to the second intermediary distance value dependent on conditions of when the single potential code vector is of non-zero parity and when the number of minus signs of the components of the single vector of parameters differs from the constraint of the leader class parity associated with the single potential code vector; (b2) updating the first intermediary 
distance value by adding the product of a last component of the single potential code vector and a last component of the single ordered vector of parameters to the first intermediary distance value and updating the second intermediary distance value by adding the square of the last component of the single potential code vector to the second intermediary distance value dependent on conditions of when the single potential code vector is of non-zero parity and when the number of minus signs of the components of the single vector of parameters does not differ from the constraint of the leader class parity associated with the single potential code vector; (b3) updating the first intermediary distance value by adding the product of the last component of the single potential code vector and the last component of the single ordered vector of parameters to the first intermediary distance value and updating the second intermediary distance value by adding the square of the last component of the single potential code vector to the second intermediary distance value dependent on a condition of when the single potential code vector is not of non-zero parity; (c) determining the distance between the single potential code vector and the single ordered vector of parameters by subtracting the first intermediary distance value multiplied by a scale factor from the second intermediary distance value multiplied by the scale factor squared; determining the best leader class associated with the single potential code vector which generates the smallest associated distance; and sorting components of the best leader class by the reverse ordering of the descending order of absolute values of the components of the single vector of parameters to generate an output lattice-quantized vector
"""

# Maximum number of characters of each hit's text shown in the listing.
PREVIEW_CHARS = 200

# Search the index with the long query.
# NOTE(review): min_similarity=0.005 barely filters anything - in the recorded
# run all 206 indexed chunks passed it - so the listing is effectively bounded
# by top_k alone. Raise the threshold if real filtering is intended.
# NOTE(review): the recorded log says "max" aggregation is applied across the
# query's chunks; the exact scoring is defined by the embedder - confirm there.
results = embedder.similarity_search_with_long_query(
    query=long_patent_description,
    top_k=10,
    min_similarity=0.005,
    aggregation_method="max",
)

# Print a ranked listing: similarity, section, document id and a text preview.
shown = 0
for shown, hit in enumerate(results, 1):
    metadata = hit['metadata']
    text = hit['text']
    # Append the ellipsis only when the text was actually cut off.
    preview = text[:PREVIEW_CHARS] + ("..." if len(text) > PREVIEW_CHARS else "")
    print(f"\n{shown}. Similarity: {hit['similarity']:.4f}")
    print(f"   Section: {metadata.get('section', 'Unknown')}")
    print(f"   doc_id: {metadata.get('doc_id', 'Unknown')}")
    print(f"   Text: {preview}")

# Make an empty result set visible instead of silently printing nothing.
if shown == 0:
    print("No results above the similarity threshold.")

2025-06-18 17:15:25,956 - INFO - Query is long (690 tokens), chunking before embedding
2025-06-18 17:15:26,047 - INFO - Found 206 results above threshold 0.005
2025-06-18 17:15:26,047 - INFO - Used max aggregation across 4 query chunks



1. Similarity: 0.9999
   Section: claim
   doc_id: EP13899497B9W1
   Text: a processor - implemented method for encoding at least one audio signal, wherein the method comprises : generating at least one vector of parameters defining the at least one audio signal ; sorting ab...

2. Similarity: 0.9770
   Section: claim
   doc_id: EP13899497B9W1
   Text: vector of parameters and the second intermediary distance value is given by the sum of the squares of the components of the single potential code vector ; ( b1 ) update the first intermediary distance...

3. Similarity: 0.9759
   Section: claim
   doc_id: EP13899497B9W1
   Text: and the second intermediary distance value is given by the sum of the squares of the components of the single potential code vector ; ( b1 ) updating the first intermediary distance value by subtracti...

4. Similarity: 0.9673
   Section: claim
   doc_id: EP13899497B9W1
   Text: ##ry distance value and updating the second intermediary distance value by adding th