In [None]:
import sys
import os
from pathlib import Path
import time
import json

# Make the project's source directories importable from this notebook.
# NOTE(review): this assumes the kernel's working directory is a direct child of the
# project root (e.g. `notebooks/`). Started from anywhere else, `src_dir` points at the
# wrong place and the imports below fail with ModuleNotFoundError.
current_dir = Path.cwd()
project_root = current_dir.parent  # Go up one level from notebooks to project root
src_dir = project_root / "src"

# Register each source directory only once: a plain `sys.path.append` would add a
# duplicate entry every time this cell is re-run in the same kernel.
for source_path in (src_dir / "data_pipline", src_dir / "EU_XML_data_loader"):
    if str(source_path) not in sys.path:
        sys.path.append(str(source_path))


# Project-local modules; resolvable only after the path entries above are in place.
from data_pipline import DataPipeline
import get_raw_data_paths_EPO
from xml_loader_EPO import process_xml_files_list


print(f"📁 Current directory: {current_dir}")
print(f"📁 Project root: {project_root}")
print(f"📁 Source directory: {src_dir}")
print("✅ Python paths configured")

In [None]:
# Create the pipeline object that the extraction and conversion cells below call into.
# DataPipeline is already imported by the setup cell; repeating the import here is
# harmless (Python reuses the already-loaded module), but it still relies on the
# setup cell having added the source directory to sys.path.
from data_pipline import DataPipeline

# Initialize pipeline with its default configuration (no arguments are passed).
pipeline = DataPipeline()

In [None]:
# Run the extraction step. The return value is kept because the summary cell at the
# end of the notebook reads its 'success' entry to report "Archive extraction".
result_extraction = pipeline.extract_epo_data()

In [None]:
# Convert the EPO XML files to JSON with debug and verbose output switched off.
# The summary cell later reads the 'success' entry of `results`.
results = pipeline.process_epo_xml_to_json(debug=False, verbose=False)

In [None]:
# Test the new JSON loading functionality.
# The loader module is imported from src/data_pipline/loader. That directory is added
# to the import path only if it is not already there, so re-running this cell does not
# pile up duplicate sys.path entries.
loader_dir = str(src_dir / "data_pipline" / "loader")
if loader_dir not in sys.path:
    sys.path.append(loader_dir)

# Import the JSON loader functions
from json_loader_epo import get_epo_json_file_paths, get_all_json_file_paths, load_json_documents

print("🔍 Testing JSON File Loading Functions")
print("=" * 50)

In [None]:
# Collect every JSON file path reported by the loader and print a short sample.
print("📁 Getting all JSON files from parced directory...")
all_json_files = get_all_json_file_paths()

print("\n📊 Results:")
print(f"  • Total JSON files found: {len(all_json_files)}")

# List at most five file names; only the first entry also shows its full path.
if all_json_files:
    print("\n📄 Sample file paths:")
    for position, file_path in enumerate(all_json_files[:5], start=1):
        print(f"  {position}. {Path(file_path).name}")
        if position == 1:
            print(f"     Full path: {file_path}")

In [None]:
# Fetch the EPO-specific JSON file paths and compare their count with the full listing.
print("📁 Getting EPO JSON files specifically...")
epo_json_files = get_epo_json_file_paths()

print("\n📊 EPO Results:")
print(f"  • EPO JSON files found: {len(epo_json_files)}")

# Only the two counts are compared (not the individual paths); equal counts are
# taken to mean that every JSON file is an EPO file.
total_count, epo_count = len(all_json_files), len(epo_json_files)
if total_count != epo_count:
    print(f"  ⚠️ Mismatch: {total_count} total vs {epo_count} EPO files")
else:
    print("  ✅ All JSON files are EPO files (as expected)")

In [None]:
# Load every parsed JSON document (no subset is taken) and inspect the first one
# as a sanity check.
print("📄 Testing document loading...")


def preview_text(text, limit):
    """Return `text` shortened for display.

    Text of at most `limit` characters is returned unchanged; longer text is cut to
    its first `limit` characters and suffixed with "..." so truncation is visible.
    """
    return text if len(text) <= limit else text[:limit] + "..."


# Load all documents
documents = load_json_documents()

print("\n📊 Document Loading Results:")
print(f"  • Documents loaded: {len(documents)}")

if documents:
    # Show info about first document.
    # NOTE(review): each document is expected to expose `page_content` (text) and
    # `metadata` (dict-like) - confirm against load_json_documents.
    first_doc = documents[0]
    title = first_doc.metadata.get('title')

    print("\n📋 Sample Document Info:")
    print(f"  • Content length: {len(first_doc.page_content)} characters")
    print(f"  • Metadata keys: {list(first_doc.metadata.keys())}")
    print(f"  • Patent ID: {first_doc.metadata.get('patent_id', 'N/A')}")
    # The ellipsis is appended only when the title really was cut at 100 characters;
    # previously it was added to every title, including short ones.
    print(f"  • Title: {preview_text(str(title), 100) if title else 'N/A'}")

    # Show content preview (first 300 characters)
    content_preview = preview_text(first_doc.page_content, 300)
    print("\n📝 Content Preview:")
    print(content_preview)

In [None]:
# Summary of all operations.
# Every value is looked up by name so the cell still produces a report when an earlier
# cell was skipped or failed, instead of stopping on a NameError.
def step_status(var_name, namespace=None):
    """Return a display label for the pipeline step result stored under `var_name`.

    Args:
        var_name: Name of the variable holding the step's result.
        namespace: Mapping to look the variable up in; defaults to this module's
            (i.e. the notebook's) global namespace.

    Returns:
        'N/A' when the variable does not exist or its value has no `.get` method,
        '❌ Failed' when its 'success' entry is falsy, otherwise '✅ Complete'.
    """
    scope = globals() if namespace is None else namespace
    if var_name not in scope:
        return 'N/A'
    result = scope[var_name]
    if not hasattr(result, 'get'):
        # e.g. the step returned None: success cannot be determined from the value.
        return 'N/A'
    # NOTE(review): a result without a 'success' key is still reported as complete
    # (unchanged default) - confirm that the pipeline always sets this key.
    return '✅ Complete' if result.get('success', True) else '❌ Failed'


def count_or_na(var_name, namespace=None):
    """Return len() of the collection stored under `var_name`, or 'N/A' if it is not defined."""
    scope = globals() if namespace is None else namespace
    return len(scope[var_name]) if var_name in scope else 'N/A'


extraction_status = step_status('result_extraction')
conversion_status = step_status('results')

print("\n🎉 Summary of Operations:")
print("=" * 40)
print(f"📦 Archive extraction: {extraction_status}")
print(f"🔄 XML to JSON processing: {conversion_status}")
print(f"📁 JSON files found: {count_or_na('all_json_files')}")
print(f"📄 Documents loaded: {count_or_na('documents')}")

# Announce readiness only when both pipeline steps actually reported completion.
if extraction_status == '✅ Complete' and conversion_status == '✅ Complete':
    print("\n📂 Data Pipeline Status: Ready for Vector Store Creation!")
else:
    print("\n⚠️ Data Pipeline Status: Not ready - re-run the steps above that are not marked complete.")