In [1]:
import sys
import os
from pathlib import Path
import time
import json
import math

# Locate the project layout relative to the notebook's working directory.
current_dir = Path.cwd()
project_root = current_dir.parent  # notebooks/ sits one level below the project root
src_dir = project_root / "src"


def add_source_path(directory):
    """Append ``directory`` to ``sys.path`` unless it is already listed.

    Guarding the append keeps this cell idempotent: re-running it does not
    stack duplicate entries onto ``sys.path``.

    Args:
        directory: Path (or string) of the directory to make importable.
    """
    entry = str(directory)
    if entry not in sys.path:
        sys.path.append(entry)


# Make the project's source directories importable from this notebook.
# "data_pipline" is spelled the way the imports below expect it; keep it as is.
for _package_dir in ("data_pipline", "EU_XML_data_loader", "embeddings"):
    add_source_path(src_dir / _package_dir)

# Project modules; these resolve through the source directories added to sys.path above.
from data_pipline import DataPipeline
import get_raw_data_paths_EPO
from xml_loader_EPO import process_xml_files_list

# Everything needed from the embeddings module, imported in a single statement.
from embeddings import (
    DocumentEmbedder,
    EmbeddingConfig,
    batch_process_json_files,
    extract_documents_epo,
)

# The JSON loader lives in a sub-directory of data_pipline, so it needs its own
# sys.path entry. The membership check keeps a re-run from appending a duplicate.
json_loader_dir = str(src_dir / "data_pipline" / "json_loader")
if json_loader_dir not in sys.path:
    sys.path.append(json_loader_dir)
from json_loader_epo import get_epo_json_file_paths, get_all_json_file_paths, load_json_documents

# Banner plus the resolved locations, so a wrong working directory is obvious at a glance.
print("🔍 Testing JSON File Loading Functions")
print("=" * 50)

print(f"📁 Current directory: {current_dir}")
print(f"📁 Project root: {project_root}")
print(f"📁 Source directory: {src_dir}")
print("✅ Python paths configured")

✅ Successfully imported from data_config.py
🔍 Testing JSON File Loading Functions
📁 Current directory: /app/notebooks
📁 Project root: /app
📁 Source directory: /app/src
✅ Python paths configured


In [2]:

import sys
import os
from pathlib import Path
import time
import json
import math

# Resolve the project layout again from the notebook's working directory:
# the project root is the parent of the current directory, and the sources
# live in its "src" sub-directory.
current_dir = Path.cwd()
project_root = current_dir.parent
src_dir = project_root.joinpath("src")




In [3]:

# Settings for this embedding run, gathered in one place before building the config.
# NOTE(review): the units of the chunk sizes (tokens vs. characters) are not visible
# from this notebook — confirm against EmbeddingConfig.
embedding_settings = {
    "model_name": "sentence-transformers/all-MiniLM-L6-v2",
    "base_chunk_size": 300,
    "min_chunk_size": 200,
    "chunk_overlap": 50,
    "batch_size": 16,
    "enable_debug": False,  # debug flag off for this run
}
config = EmbeddingConfig(**embedding_settings)

# Only the first two EPO JSON files are used here.
epo_json_paths = get_epo_json_file_paths()
file_list = epo_json_paths[:2]

# Build the embedder from the custom config, then register the files,
# load their documents, and process them.
embedder = DocumentEmbedder(config=config)
embedder.add_file_paths(file_list)
embedder.load_json_files(extract_fn=extract_documents_epo)
processed_docs = embedder.process_all_documents()

2025-06-19 14:15:39,105 - INFO - Initializing models with: sentence-transformers/all-MiniLM-L6-v2


📁 Found 1286 EPO JSON files


2025-06-19 14:15:39,447 - INFO - Use pytorch device_name: cpu
2025-06-19 14:15:39,448 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
2025-06-19 14:15:41,012 - INFO - Models initialized successfully
2025-06-19 14:15:41,014 - INFO - Added 2 valid files. Total files to process: 2
2025-06-19 14:15:41,022 - INFO - Loaded 196 documents from 2 files (0 errors)
2025-06-19 14:15:41,023 - INFO - Total documents in memory: 196
Token indices sequence length is longer than the specified maximum sequence length for this model (691 > 512). Running this sequence through the model will result in indexing errors
2025-06-19 14:15:43,818 - INFO - ✅ Processed 206 document chunks from 196 documents


In [4]:

# Query text for the similarity search in this cell. It concatenates two unrelated
# snippets: an English heading plus claim text about encoding an audio signal, and
# a German claim about a method executed by a server. Both snippets stop
# mid-sentence, and the literal is passed to the search as-is.
long_patent_description = """
AUDIO SIGNAL ENCODER
A processor-implemented method for encoding at least one audio signal, wherein the method comprises: generating at least one vector of parameters defining the at least one audio signal; sorting absolute-valued components of the at least one vector of parameters according to a descending order of the absolute values of the components of the at least one vector of parameters to generate an associated at least one ordered vector of parameters; selecting from a list of leader classes at least one potential code vector; performing, for each single of the selected at least one potential code vector individually and for each single of the at least one ordered vector of parameters individually,
Verfahren ausgeführt durch einen Server (2), wobei das Verfahren umfasst: - Bereithalten (300) einer Vielzahl von historischen Funksignalinformationen, die von einer Vielzahl von Nutzfahrzeugen (101-103) stammen, wobei jede der historischen Funksignalinformationen von einem jeweiligen der Nutzfahrzeuge an einer jeweiligen Nutzfahrzeugposition erfasst wurde, und wobei jede der historischen Funksignalinformationen die jeweilige Nutzfahrzeugposition und eine durch das jeweilige Nutzfahrzeug an der jeweiligen Nutzfahrzeugposition erfasste jeweilige Funksignalqualität eines Funkkommunikationssystems repräsentiert; - Überwachen eines Nutzfahrzeugs (103), das sich entlang einer vorgegebenen Route (112) bewegt, durch; - Empfangen (301) von von dem überwachten Nutzfahrzeug (103) über das Funkkommunikationssystem gesendeten Überwachungsinformationen, wobei 

"""

# Search options kept together so they are easy to tweak between runs.
search_options = {
    "top_k": 15,
    "min_similarity": 0.4,
    "aggregation_method": "max",
}
results = embedder.similarity_search_with_long_query(
    query=long_patent_description,
    **search_options,
)

# (printed label, metadata key) pairs, in the order they are reported per hit.
metadata_fields = (
    ("Section", "section"),
    ("doc_id", "doc_id"),
    ("file", "file"),
)

# Report each hit: 1-based rank with similarity, the metadata fields
# (falling back to 'Unknown' when a key is absent), then the first 200
# characters of the matched text.
for i, result in enumerate(results, start=1):
    print(f"\n{i}. Similarity: {result['similarity']:.4f}")
    metadata = result['metadata']
    for label, key in metadata_fields:
        print(f"   {label}: {metadata.get(key, 'Unknown')}")
    print(f"   Text: {result['text'][:200]}...")

2025-06-19 14:15:46,854 - INFO - Query is long (426 tokens), chunking before embedding
2025-06-19 14:15:46,948 - INFO - Found 112 results above threshold 0.4
2025-06-19 14:15:46,948 - INFO - Used max aggregation across 3 query chunks



1. Similarity: 0.8551
   Section: claim
   doc_id: EP13899497B9W1
   file: EP13899497W1B9.xml
   Text: a processor - implemented method for encoding at least one audio signal, wherein the method comprises : generating at least one vector of parameters defining the at least one audio signal ; sorting ab...

2. Similarity: 0.8405
   Section: claim
   doc_id: EP22169662B9W1
   file: EP22169662W1B9.xml
   Text: verfahren ausgefuhrt durch einen server ( 2 ), wobei das verfahren umfasst : - bereithalten ( 300 ) einer vielzahl von historischen funksignalinformationen, die von einer vielzahl von nutzfahrzeugen (...

3. Similarity: 0.7831
   Section: claim
   doc_id: EP22169662B9W1
   file: EP22169662W1B9.xml
   Text: verfahren nach einem der anspruche 2 und 3, wobei fur das bestimmen der zu erwartenden funksignalqualitat alle historischen funksignalinformationen der historischen funksignalinformationen berucksicht...

4. Similarity: 0.7767
   Section: claim
   doc_id: EP13899497B9W1
   file: 

In [5]:
# Bare expression: the notebook displays the embedder's repr as the cell output,
# a quick check of the model name and the document / processed counts.
embedder

DocumentEmbedder(model=sentence-transformers/all-MiniLM-L6-v2, documents=196, processed=206)