In [1]:
import sys
import os
from pathlib import Path
import time
import json
import math

# Locate the project's source tree relative to the working directory.
# Assumes the kernel was started inside the `notebooks/` folder, so the
# project root is one level up.
current_dir = Path.cwd()
project_root = current_dir.parent
src_dir = project_root / "src"

# Make the project-local modules importable. Each directory is appended only
# when it is not already on sys.path, so re-running this cell does not pile up
# duplicate entries. Order is preserved (append, not insert) so module
# resolution priority stays the same as before.
for package_dir in ("data_pipline", "EU_XML_data_loader", "embeddings"):
    package_path = str(src_dir / package_dir)
    if package_path not in sys.path:
        sys.path.append(package_path)

# Other imports
from data_pipline import DataPipeline
import get_raw_data_paths_EPO  
from xml_loader_EPO import process_xml_files_list

# Import the embeddings module
from embeddings import DocumentEmbedder, batch_process_json_files, extract_documents_epo

# The JSON loader lives in a nested directory; put it on sys.path (only once,
# so the cell can be re-run without adding duplicate entries) before importing
# from it.
json_loader_dir = str(src_dir / "data_pipline" / "json_loader")
if json_loader_dir not in sys.path:
    sys.path.append(json_loader_dir)
from json_loader_epo import get_epo_json_file_paths, get_all_json_file_paths, load_json_documents
# DocumentEmbedder is already imported earlier in this cell; EmbeddingConfig is
# the name this line newly brings into scope.
from embeddings import DocumentEmbedder, EmbeddingConfig

# Banner first, then the resolved project locations, so the reader can confirm
# the path setup at a glance. Output is emitted line by line via sep="\n".
print(
    "🔍 Testing JSON File Loading Functions",
    "=" * 50,
    sep="\n",
)
print(
    f"📁 Current directory: {current_dir}",
    f"📁 Project root: {project_root}",
    f"📁 Source directory: {src_dir}",
    "✅ Python paths configured",
    sep="\n",
)

✅ Successfully imported from data_config.py
🔍 Testing JSON File Loading Functions
📁 Current directory: /app/notebooks
📁 Project root: /app
📁 Source directory: /app/src
✅ Python paths configured


In [3]:

import sys
import os
from pathlib import Path
import time
import json
import math

# Resolve the project layout from the working directory: the notebook is
# expected to run from `notebooks/`, whose parent is the project root.
# (These are the same three names the first cell defines; nothing is added
# to sys.path here.)
current_dir = Path.cwd()
project_root, src_dir = current_dir.parent, current_dir.parent / "src"




In [4]:

# Number of EPO JSON files to embed in this trial run. Increase it, or set it
# to None to take every path returned by get_epo_json_file_paths().
MAX_FILES = 10

# Custom embedding/chunking settings handed to DocumentEmbedder.
# NOTE(review): the units of the chunk sizes (tokens vs. words vs. characters)
# are defined inside EmbeddingConfig and are not visible here — confirm.
config = EmbeddingConfig(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    base_chunk_size=300,  # Larger chunks
    min_chunk_size=200,   # Higher minimum
    chunk_overlap=50,     # More overlap
    batch_size=16,        # Larger batches
    enable_debug=False,   # Less verbose output
)

# Keep only the first MAX_FILES paths so the run stays small.
file_list = get_epo_json_file_paths()[:MAX_FILES]

# Initialize the embedder with the custom config
embedder = DocumentEmbedder(config=config)

# Register the files, load them with the EPO extractor, then process everything.
embedder.add_file_paths(file_list)
embedder.load_json_files(extract_fn=extract_documents_epo)
# NOTE(review): the recorded run of this cell logged a tokenizer warning about a
# 691-token sequence exceeding a 512-token maximum. Presumably it is raised
# during processing — confirm that chunking keeps inputs within the model limit.
processed_docs = embedder.process_all_documents()

2025-06-18 15:22:24,376 - INFO - Initializing models with: sentence-transformers/all-MiniLM-L6-v2


📁 Found 1286 EPO JSON files


2025-06-18 15:22:24,684 - INFO - Use pytorch device_name: cpu
2025-06-18 15:22:24,685 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
2025-06-18 15:22:26,168 - INFO - Models initialized successfully
2025-06-18 15:22:26,186 - INFO - Added 10 valid files. Total files to process: 10
2025-06-18 15:22:26,196 - INFO - Loaded 331 documents from 10 files (0 errors)
2025-06-18 15:22:26,197 - INFO - Total documents in memory: 331
Token indices sequence length is longer than the specified maximum sequence length for this model (691 > 512). Running this sequence through the model will result in indexing errors
2025-06-18 15:22:32,975 - INFO - ✅ Processed 360 document chunks from 331 documents
