In [2]:
# Data Pipeline - Extract EPO Archives
import sys
import os
from pathlib import Path

# Add the data_pipeline to path
sys.path.append('../src/data_pipline')
sys.path.append('../src/EU_XML_data_loader')

print("📁 Setting up paths...")
print(f"Current working directory: {os.getcwd()}")
print(f"Python path additions: {sys.path[-2:]}")

📁 Setting up paths...
Current working directory: /app/notebooks
Python path additions: ['../src/data_pipline', '../src/EU_XML_data_loader']


In [3]:
# Import required modules
try:
    from data_pipline import DataPipeline
    from load_raw_data import extract_epo_archives
    print("✅ Successfully imported pipeline modules")
except ImportError as e:
    print(f"❌ Import error: {e}")
    print("🔧 Please ensure the following files exist:")
    print("   - src/data_pipline/data_pipline.py")
    print("   - src/EU_XML_data_loader/load_raw_data.py")
    raise

✅ Successfully imported pipeline modules


In [4]:
def run_epo_data_pipeline_updated():
    """
    Updated EPO data pipeline function that handles the correct archive structure
    """
    print("🚀 Starting Updated EPO Data Pipeline...")
    
    try:
        # Initialize the data pipeline
        pipeline = DataPipeline()
        print("✅ Data pipeline initialized")
        
        # Check what archives are available
        print("\n🔍 Checking EPO archive structure...")
        archive_info = pipeline.check_epo_archive_structure()
        
        if archive_info["total"] == 0:
            print("❌ No EPO archives found in data/archive/EPO")
            print("📁 Expected structure: data/archive/EPO/EPRTBJV*/DOC/*/")
            return None
        
        print(f"✅ Found {archive_info['total']} EPO archive folders:")
        for archive in archive_info["archives"]:
            print(f"  📦 {archive['name']}")
            print(f"    📂 DOC folder: {archive['doc_folder']}")
            print(f"    📁 Subfolders: {len(archive['subfolders'])}")
            
            # Show subfolder details
            for subfolder in archive['subfolders']:
                zip_count = len(list(subfolder.glob("*.zip")))
                print(f"      📋 {subfolder.name}: {zip_count} ZIP files")
        
        # Show total files to process
        total_zips = sum(archive['zip_count'] for archive in archive_info['archives'])
        print(f"\n📊 Total ZIP files to process: {total_zips}")
        
        # Run the extraction
        print("\n🔄 Starting extraction process...")
        result = pipeline.extract_epo_data(verbose=True)
        
        print("\n✅ EPO Data Pipeline completed!")
        print(f"📊 Final Results:")
        print(f"   Archives processed: {result.get('archives_processed', 0)}")
        print(f"   Files extracted: {result.get('files_extracted', 0)}")
        print(f"   XML files: {result.get('xml_files', 0)}")
        print(f"   PDF files: {result.get('pdf_files', 0)}")
        print(f"   Errors: {result.get('errors', 0)}")
        
        return result
        
    except Exception as e:
        print(f"❌ Error in EPO Data Pipeline: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

In [5]:
# Run the EPO data pipeline
result = run_epo_data_pipeline_updated()

if result:
    print("\n🎉 Pipeline execution completed successfully!")
else:
    print("\n❌ Pipeline execution failed or was cancelled")

2025-06-16 13:44:33,835 - INFO - Data pipeline initialized with base directory: /app/notebooks/..
2025-06-16 13:44:33,837 - INFO - Checking EPO archive structure...
2025-06-16 13:44:33,897 - INFO - Found archive: EPRTBJV2025000024001001 with 5461 ZIP files
2025-06-16 13:44:33,909 - INFO - Starting EPO data extraction...
2025-06-16 13:44:33,910 - INFO - Checking EPO archive structure...
2025-06-16 13:44:33,923 - INFO - Found archive: EPRTBJV2025000024001001 with 5461 ZIP files
2025-06-16 13:44:33,925 - INFO - Found 0 ZIP files in /app/notebooks/../data/archive/EPO/EPRTBJV2025000024001001


🚀 Starting Updated EPO Data Pipeline...
✅ Data pipeline initialized

🔍 Checking EPO archive structure...
✅ Found 1 EPO archive folders:
  📦 EPRTBJV2025000024001001
    📂 DOC folder: /app/notebooks/../data/archive/EPO/EPRTBJV2025000024001001/DOC
    📁 Subfolders: 11
      📋 EPW1B9: 2 ZIP files
      📋 EPW1A8: 2 ZIP files
      📋 EPW1A9: 2 ZIP files
      📋 EPW1B8: 29 ZIP files
      📋 EPNWB1: 1673 ZIP files
      📋 EPNWA1: 2975 ZIP files
      📋 EPW2B9: 1 ZIP files
      📋 EPW2A8: 1 ZIP files
      📋 EPNWB2: 32 ZIP files
      📋 EPNWA3: 310 ZIP files
      📋 EPNWA2: 434 ZIP files

📊 Total ZIP files to process: 5461

🔄 Starting extraction process...
🔍 Found 1 EPO archive folders
  📦 EPRTBJV2025000024001001: 5461 ZIP files

🚀 Processing archive: EPRTBJV2025000024001001
✅ Completed archive: EPRTBJV2025000024001001

📊 EPO Extraction Summary:
  📁 Archives processed: 0
  📄 Files extracted: 0
  📋 XML files: 0
  📑 PDF files: 0
  ❌ Errors: 0

✅ EPO Data Pipeline completed!
📊 Final Results:
   Ar

In [7]:
# Add this debugging cell to understand what's happening
def debug_archive_structure():
    """Debug the archive structure to see what's inside the ZIP files"""
    import zipfile
    from pathlib import Path
    
    archive_path = Path("../data/archive/EPO/EPRTBJV2025000024001001")
    
    print("🔍 Debugging archive structure...")
    
    # Check one subfolder in detail
    epnwa1_folder = archive_path / "DOC" / "EPNWA1"
    if epnwa1_folder.exists():
        zip_files = list(epnwa1_folder.glob("*.zip"))
        print(f"📁 Checking folder: {epnwa1_folder}")
        print(f"📦 Found {len(zip_files)} ZIP files")
        
        if zip_files:
            # Examine the first ZIP file
            first_zip = zip_files[0]
            print(f"\n🔍 Examining: {first_zip.name}")
            
            try:
                with zipfile.ZipFile(first_zip, 'r') as zip_ref:
                    file_list = zip_ref.namelist()
                    print(f"📋 Files inside ZIP ({len(file_list)} total):")
                    
                    xml_files = [f for f in file_list if f.endswith('.xml')]
                    pdf_files = [f for f in file_list if f.endswith('.pdf')]
                    
                    print(f"   📄 XML files: {len(xml_files)}")
                    print(f"   📑 PDF files: {len(pdf_files)}")
                    
                    # Show first few files
                    for i, file_name in enumerate(file_list[:10]):
                        print(f"     {i+1}. {file_name}")
                    
                    if len(file_list) > 10:
                        print(f"     ... and {len(file_list) - 10} more files")
                        
            except Exception as e:
                print(f"❌ Error reading ZIP file: {e}")
    
    else:
        print(f"❌ EPNWA1 folder not found at: {epnwa1_folder}")

# Run the debug function
debug_archive_structure()

🔍 Debugging archive structure...
📁 Checking folder: ../data/archive/EPO/EPRTBJV2025000024001001/DOC/EPNWA1
📦 Found 2975 ZIP files

🔍 Examining: EP24217419NWA1.zip
📋 Files inside ZIP (8 total):
   📄 XML files: 2
   📑 PDF files: 1
     1. EP24217419NWA1.xml
     2. TOC.xml
     3. EP24217419NWA1.pdf
     4. imgaf001.tif
     5. imgf0001.tif
     6. imgf0002.tif
     7. srep0001.tif
     8. srep0002.tif


In [8]:
# Also add this cell to check if the extraction is working at a lower level
def test_single_zip_extraction():
    """Test extracting a single ZIP file manually"""
    import zipfile
    import tempfile
    from pathlib import Path
    
    archive_path = Path("../data/archive/EPO/EPRTBJV2025000024001001")
    epnwa1_folder = archive_path / "DOC" / "EPNWA1"
    
    if not epnwa1_folder.exists():
        print(f"❌ Folder not found: {epnwa1_folder}")
        return
    
    zip_files = list(epnwa1_folder.glob("*.zip"))
    if not zip_files:
        print("❌ No ZIP files found")
        return
    
    # Try to extract the first ZIP file
    test_zip = zip_files[0]
    print(f"🧪 Testing extraction of: {test_zip.name}")
    
    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)
            
            with zipfile.ZipFile(test_zip, 'r') as zip_ref:
                zip_ref.extractall(temp_path)
            
            # Check what was extracted
            extracted_files = list(temp_path.rglob("*"))
            xml_files = [f for f in extracted_files if f.suffix == '.xml']
            pdf_files = [f for f in extracted_files if f.suffix == '.pdf']
            
            print(f"✅ Extraction successful!")
            print(f"📁 Total extracted files: {len(extracted_files)}")
            print(f"📄 XML files: {len(xml_files)}")
            print(f"📑 PDF files: {len(pdf_files)}")
            
            # Show file structure
            print("\n📂 Extracted structure:")
            for file_path in extracted_files[:15]:  # Show first 15
                if file_path.is_file():
                    print(f"   📄 {file_path.relative_to(temp_path)}")
                elif file_path.is_dir():
                    print(f"   📁 {file_path.relative_to(temp_path)}/")
            
            if len(extracted_files) > 15:
                print(f"   ... and {len(extracted_files) - 15} more")
                
    except Exception as e:
        print(f"❌ Extraction failed: {e}")
        import traceback
        traceback.print_exc()

# Run the test
test_single_zip_extraction()

🧪 Testing extraction of: EP24217419NWA1.zip
✅ Extraction successful!
📁 Total extracted files: 8
📄 XML files: 2
📑 PDF files: 1

📂 Extracted structure:
   📄 srep0001.tif
   📄 EP24217419NWA1.xml
   📄 EP24217419NWA1.pdf
   📄 imgaf001.tif
   📄 imgf0001.tif
   📄 imgf0002.tif
   📄 TOC.xml
   📄 srep0002.tif


In [9]:
# Add this new cell to fix the extraction issue
def working_epo_extraction():
    """
    Working EPO extraction function that properly handles the ZIP structure
    """
    import zipfile
    import shutil
    from pathlib import Path
    
    # Paths
    archive_base = Path("../data/archive/EPO/EPRTBJV2025000024001001")
    output_base = Path("../data/raw/EPO/EPRTBJV2025000024001001")
    
    # Create output directory
    output_base.mkdir(parents=True, exist_ok=True)
    
    print(f"🚀 Starting Working EPO Extraction")
    print(f"📂 Source: {archive_base}")
    print(f"📁 Output: {output_base}")
    
    doc_folder = archive_base / "DOC"
    if not doc_folder.exists():
        print(f"❌ DOC folder not found: {doc_folder}")
        return
    
    stats = {
        'zip_files_processed': 0,
        'xml_files': 0,
        'pdf_files': 0,
        'tif_files': 0,
        'other_files': 0,
        'errors': 0,
        'subfolders_processed': 0
    }
    
    # Process each subfolder
    for subfolder in doc_folder.iterdir():
        if not subfolder.is_dir():
            continue
        
        print(f"\n📁 Processing subfolder: {subfolder.name}")
        subfolder_output = output_base / subfolder.name
        subfolder_output.mkdir(parents=True, exist_ok=True)
        
        zip_files = list(subfolder.glob("*.zip"))
        print(f"📦 Found {len(zip_files)} ZIP files")
        
        # Process all ZIP files (or limit for testing)
        process_count = min(len(zip_files), 10)  # Process first 10 per folder for testing
        
        for i, zip_file in enumerate(zip_files[:process_count], 1):
            try:
                print(f"  🔄 Processing {i}/{process_count}: {zip_file.name}")
                
                # Create individual folder for each ZIP file
                zip_output_dir = subfolder_output / zip_file.stem
                zip_output_dir.mkdir(parents=True, exist_ok=True)
                
                with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                    # Extract all files
                    zip_ref.extractall(zip_output_dir)
                    
                    # Count different file types
                    extracted_files = list(zip_output_dir.rglob("*"))
                    
                    zip_xml = len([f for f in extracted_files if f.suffix.lower() == '.xml' and f.is_file()])
                    zip_pdf = len([f for f in extracted_files if f.suffix.lower() == '.pdf' and f.is_file()])
                    zip_tif = len([f for f in extracted_files if f.suffix.lower() == '.tif' and f.is_file()])
                    zip_other = len([f for f in extracted_files if f.is_file() and f.suffix.lower() not in ['.xml', '.pdf', '.tif']])
                    
                    stats['xml_files'] += zip_xml
                    stats['pdf_files'] += zip_pdf
                    stats['tif_files'] += zip_tif
                    stats['other_files'] += zip_other
                    stats['zip_files_processed'] += 1
                    
                    print(f"    ✅ Extracted: {zip_xml} XML, {zip_pdf} PDF, {zip_tif} TIF, {zip_other} other")
                
            except Exception as e:
                print(f"    ❌ Error with {zip_file.name}: {e}")
                stats['errors'] += 1
        
        stats['subfolders_processed'] += 1
        
        if len(zip_files) > process_count:
            print(f"  ⏩ Processed {process_count} of {len(zip_files)} files (limited for testing)")
    
    print(f"\n📊 Extraction Results:")
    print(f"   Subfolders processed: {stats['subfolders_processed']}")
    print(f"   ZIP files processed: {stats['zip_files_processed']}")
    print(f"   📄 XML files: {stats['xml_files']}")
    print(f"   📑 PDF files: {stats['pdf_files']}")
    print(f"   🖼️  TIF files: {stats['tif_files']}")
    print(f"   📋 Other files: {stats['other_files']}")
    print(f"   ❌ Errors: {stats['errors']}")
    
    # Show output structure
    print(f"\n📂 Output structure created at: {output_base}")
    
    return stats

# Run the working extraction
extraction_stats = working_epo_extraction()

🚀 Starting Working EPO Extraction
📂 Source: ../data/archive/EPO/EPRTBJV2025000024001001
📁 Output: ../data/raw/EPO/EPRTBJV2025000024001001

📁 Processing subfolder: EPW1B9
📦 Found 2 ZIP files
  🔄 Processing 1/2: EP22169662W1B9.zip
    ✅ Extracted: 2 XML, 1 PDF, 3 TIF, 0 other
  🔄 Processing 2/2: EP13899497W1B9.zip
    ✅ Extracted: 2 XML, 1 PDF, 25 TIF, 0 other

📁 Processing subfolder: EPW1A8
📦 Found 2 ZIP files
  🔄 Processing 1/2: EP23166881W1A8.zip
    ✅ Extracted: 2 XML, 1 PDF, 1 TIF, 0 other
  🔄 Processing 2/2: EP23205649W1A8.zip
    ✅ Extracted: 2 XML, 1 PDF, 1 TIF, 0 other

📁 Processing subfolder: EPW1A9
📦 Found 2 ZIP files
  🔄 Processing 1/2: EP23204356W1A9.zip
    ✅ Extracted: 2 XML, 1 PDF, 11 TIF, 0 other
  🔄 Processing 2/2: EP24185369W1A9.zip
    ✅ Extracted: 2 XML, 1 PDF, 3 TIF, 0 other

📁 Processing subfolder: EPW1B8
📦 Found 29 ZIP files
  🔄 Processing 1/10: EP22732225W1B8.zip
    ✅ Extracted: 2 XML, 1 PDF, 0 TIF, 0 other
  🔄 Processing 2/10: EP21207842W1B8.zip
    ✅ Extracted

In [11]:
def optimized_epo_extraction():
    """
    Optimized EPO extraction function that:
    - Processes ALL ZIP files (not just first 10)
    - Extracts only PDF and XML files directly
    - No temporary folders or intermediate storage
    - Clean, efficient output structure
    """
    import zipfile
    import shutil
    from pathlib import Path
    import time
    
    # Paths
    archive_base = Path("../data/archive/EPO/EPRTBJV2025000024001001")
    output_base = Path("../data/raw/EPO/EPRTBJV2025000024001001")
    
    # Create output directory
    output_base.mkdir(parents=True, exist_ok=True)
    
    start_time = time.time()
    print(f"🚀 Starting Optimized EPO Extraction (PDF & XML only)")
    print(f"📂 Source: {archive_base}")
    print(f"📁 Output: {output_base}")
    print(f"📅 Start time: {time.strftime('%H:%M:%S')}")
    
    doc_folder = archive_base / "DOC"
    if not doc_folder.exists():
        print(f"❌ DOC folder not found: {doc_folder}")
        return None
    
    stats = {
        'zip_files_processed': 0,
        'xml_files_extracted': 0,
        'pdf_files_extracted': 0,
        'files_skipped': 0,
        'errors': 0,
        'subfolders_processed': 0,
        'total_zip_files': 0
    }
    
    # Count total ZIP files first
    for subfolder in doc_folder.iterdir():
        if subfolder.is_dir():
            zip_count = len(list(subfolder.glob("*.zip")))
            stats['total_zip_files'] += zip_count
    
    print(f"📊 Total ZIP files to process: {stats['total_zip_files']}")
    
    # Process each subfolder
    for subfolder in doc_folder.iterdir():
        if not subfolder.is_dir():
            continue
        
        print(f"\n📁 Processing subfolder: {subfolder.name}")
        subfolder_output = output_base / subfolder.name
        subfolder_output.mkdir(parents=True, exist_ok=True)
        
        zip_files = list(subfolder.glob("*.zip"))
        print(f"📦 Processing {len(zip_files)} ZIP files...")
        
        # Process ALL ZIP files
        for i, zip_file in enumerate(zip_files, 1):
            try:
                if i % 100 == 0 or i == 1:  # Progress update every 100 files
                    print(f"  🔄 Processing {i}/{len(zip_files)}: {zip_file.name}")
                
                # Create individual folder for each ZIP file
                zip_output_dir = subfolder_output / zip_file.stem
                zip_output_dir.mkdir(parents=True, exist_ok=True)
                
                zip_xml_count = 0
                zip_pdf_count = 0
                zip_skipped = 0
                
                with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                    # Get all file names in the ZIP
                    all_files = zip_ref.namelist()
                    
                    # Filter for only PDF and XML files
                    target_files = [f for f in all_files if f.lower().endswith(('.pdf', '.xml'))]
                    zip_skipped = len(all_files) - len(target_files)
                    
                    # Extract only PDF and XML files
                    for file_name in target_files:
                        try:
                            # Extract individual file
                            zip_ref.extract(file_name, zip_output_dir)
                            
                            # Count by type
                            if file_name.lower().endswith('.xml'):
                                zip_xml_count += 1
                            elif file_name.lower().endswith('.pdf'):
                                zip_pdf_count += 1
                                
                        except Exception as file_error:
                            print(f"    ⚠️ Error extracting {file_name}: {file_error}")
                            continue
                
                # Update global stats
                stats['xml_files_extracted'] += zip_xml_count
                stats['pdf_files_extracted'] += zip_pdf_count
                stats['files_skipped'] += zip_skipped
                stats['zip_files_processed'] += 1
                
                # Show progress for first few files or every 100th
                if i <= 5 or i % 100 == 0:
                    print(f"    ✅ {zip_file.name}: {zip_xml_count} XML, {zip_pdf_count} PDF, {zip_skipped} skipped")
                
            except Exception as e:
                print(f"    ❌ Error with {zip_file.name}: {e}")
                stats['errors'] += 1
        
        stats['subfolders_processed'] += 1
        print(f"  ✅ Completed {subfolder.name}: {len(zip_files)} ZIP files processed")
    
    # Final results
    end_time = time.time()
    total_time = end_time - start_time
    
    print(f"\n" + "=" * 60)
    print(f"🎉 Optimized EPO Extraction Completed!")
    print(f"📅 End time: {time.strftime('%H:%M:%S')}")
    print(f"⏱️ Total time: {total_time:.2f} seconds ({total_time/60:.1f} minutes)")
    print(f"\n📊 Final Statistics:")
    print(f"   Subfolders processed: {stats['subfolders_processed']}")
    print(f"   ZIP files processed: {stats['zip_files_processed']}/{stats['total_zip_files']}")
    print(f"   📄 XML files extracted: {stats['xml_files_extracted']}")
    print(f"   📑 PDF files extracted: {stats['pdf_files_extracted']}")
    print(f"   📋 Total files extracted: {stats['xml_files_extracted'] + stats['pdf_files_extracted']}")
    print(f"   ⏩ Files skipped (non-PDF/XML): {stats['files_skipped']}")
    print(f"   ❌ Errors: {stats['errors']}")
    print(f"   📈 Processing rate: {stats['zip_files_processed']/(total_time/60):.1f} ZIP files/minute")
    
    print(f"\n📂 Clean output structure created at: {output_base}")
    print(f"   📁 Contains only PDF and XML files")
    print(f"   🗂️ Organized by: Archive/Subfolder/ZipName/files")
    
    return stats

# Run the optimized extraction
print("⚠️ This will process ALL 5461 ZIP files and may take 30-60 minutes")
print("🎯 Only PDF and XML files will be kept")
proceed = input("Do you want to proceed? (y/N): ")

if proceed.lower() == 'y':
    final_stats = optimized_epo_extraction()
else:
    print("❌ Extraction cancelled")

⚠️ This will process ALL 5461 ZIP files and may take 30-60 minutes
🎯 Only PDF and XML files will be kept


Do you want to proceed? (y/N):  y


🚀 Starting Optimized EPO Extraction (PDF & XML only)
📂 Source: ../data/archive/EPO/EPRTBJV2025000024001001
📁 Output: ../data/raw/EPO/EPRTBJV2025000024001001
📅 Start time: 13:57:47
❌ DOC folder not found: ../data/archive/EPO/EPRTBJV2025000024001001/DOC


In [12]:
def process_all_epo_archives():
    """
    Process ALL EPO archives in the archive/EPO directory
    - Handles both ZIP files and already extracted folders
    - Extracts only PDF and XML files
    - Processes all archives, not just one
    """
    import zipfile
    import shutil
    from pathlib import Path
    import time
    
    # Base paths
    epo_archive_dir = Path("../data/archive/EPO")
    epo_raw_dir = Path("../data/raw/EPO")
    
    # Create output directory
    epo_raw_dir.mkdir(parents=True, exist_ok=True)
    
    start_time = time.time()
    print(f"🚀 Processing ALL EPO Archives")
    print(f"📂 Source directory: {epo_archive_dir}")
    print(f"📁 Output directory: {epo_raw_dir}")
    print(f"📅 Start time: {time.strftime('%H:%M:%S')}")
    print("=" * 60)
    
    if not epo_archive_dir.exists():
        print(f"❌ EPO archive directory not found: {epo_archive_dir}")
        return None
    
    # Initialize stats
    global_stats = {
        'archives_found': 0,
        'archives_processed': 0,
        'zip_files_processed': 0,
        'xml_files_extracted': 0,
        'pdf_files_extracted': 0,
        'files_skipped': 0,
        'errors': 0,
        'total_zip_files': 0
    }
    
    # Find all EPO archives (both ZIP files and directories)
    archives_to_process = []
    
    for item in epo_archive_dir.iterdir():
        if item.name.startswith('.'):  # Skip hidden files like .DS_Store
            continue
            
        if item.is_file() and item.suffix.lower() == '.zip':
            # Main archive ZIP file
            archives_to_process.append({
                'name': item.stem,
                'path': item,
                'type': 'main_zip'
            })
        elif item.is_dir() and item.name.startswith('EPRTBJV'):
            # Already extracted archive directory
            archives_to_process.append({
                'name': item.name,
                'path': item,
                'type': 'directory'
            })
    
    global_stats['archives_found'] = len(archives_to_process)
    print(f"🔍 Found {len(archives_to_process)} EPO archives to process:")
    
    for archive in archives_to_process:
        print(f"  📦 {archive['name']} ({archive['type']})")
    
    print("\n" + "=" * 60)
    
    # Process each archive
    for i, archive in enumerate(archives_to_process, 1):
        print(f"\n[{i}/{len(archives_to_process)}] Processing: {archive['name']}")
        
        try:
            if archive['type'] == 'main_zip':
                # Extract main ZIP file first
                result = process_main_archive_zip(archive, epo_raw_dir)
            else:
                # Process already extracted directory
                result = process_extracted_archive_directory(archive, epo_raw_dir)
            
            if result:
                # Aggregate stats
                for key in ['zip_files_processed', 'xml_files_extracted', 'pdf_files_extracted', 'files_skipped', 'errors']:
                    global_stats[key] += result.get(key, 0)
                global_stats['archives_processed'] += 1
                
                print(f"✅ Completed {archive['name']}: {result.get('zip_files_processed', 0)} ZIPs, "
                      f"{result.get('xml_files_extracted', 0)} XML, {result.get('pdf_files_extracted', 0)} PDF")
            else:
                print(f"❌ Failed to process {archive['name']}")
                global_stats['errors'] += 1
                
        except Exception as e:
            print(f"❌ Error processing {archive['name']}: {e}")
            global_stats['errors'] += 1
    
    # Final results
    end_time = time.time()
    total_time = end_time - start_time
    
    print("\n" + "=" * 60)
    print(f"🎉 ALL EPO Archives Processing Completed!")
    print(f"📅 End time: {time.strftime('%H:%M:%S')}")
    print(f"⏱️ Total time: {total_time:.2f} seconds ({total_time/60:.1f} minutes)")
    print(f"\n📊 Global Statistics:")
    print(f"   Archives found: {global_stats['archives_found']}")
    print(f"   Archives processed: {global_stats['archives_processed']}")
    print(f"   ZIP files processed: {global_stats['zip_files_processed']}")
    print(f"   📄 XML files extracted: {global_stats['xml_files_extracted']}")
    print(f"   📑 PDF files extracted: {global_stats['pdf_files_extracted']}")
    print(f"   📋 Total files extracted: {global_stats['xml_files_extracted'] + global_stats['pdf_files_extracted']}")
    print(f"   ⏩ Files skipped: {global_stats['files_skipped']}")
    print(f"   ❌ Errors: {global_stats['errors']}")
    
    if total_time > 60:
        print(f"   📈 Processing rate: {global_stats['zip_files_processed']/(total_time/60):.1f} ZIP files/minute")
    
    return global_stats

def process_main_archive_zip(archive_info, output_base):
    """Process a main archive ZIP file (like EPRTBJV2025000024001001.zip)"""
    import zipfile
    import tempfile
    
    archive_zip = archive_info['path']
    archive_name = archive_info['name']
    
    print(f"  🔄 Extracting main ZIP: {archive_zip.name}")
    
    try:
        # Create temporary directory for main ZIP extraction
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)
            
            # Extract main ZIP
            with zipfile.ZipFile(archive_zip, 'r') as zip_ref:
                zip_ref.extractall(temp_path)
            
            # Look for DOC folder in extracted content
            doc_folder = None
            for item in temp_path.rglob('DOC'):
                if item.is_dir():
                    doc_folder = item
                    break
            
            if not doc_folder:
                print(f"    ❌ No DOC folder found in {archive_zip.name}")
                return None
            
            # Process the DOC folder
            archive_output = output_base / archive_name
            return process_doc_folder(doc_folder, archive_output, archive_name)
            
    except Exception as e:
        print(f"    ❌ Error extracting main ZIP {archive_zip.name}: {e}")
        return None

def process_extracted_archive_directory(archive_info, output_base):
    """Process an already extracted archive directory"""
    archive_dir = archive_info['path']
    archive_name = archive_info['name']
    
    print(f"  🔄 Processing extracted directory: {archive_name}")
    
    # Look for DOC folder
    doc_folder = archive_dir / "DOC"
    if not doc_folder.exists():
        print(f"    ❌ No DOC folder found in {archive_name}")
        return None
    
    # Process the DOC folder
    archive_output = output_base / archive_name
    return process_doc_folder(doc_folder, archive_output, archive_name)

def process_doc_folder(doc_folder, output_base, archive_name):
    """Process the DOC folder containing ZIP files"""
    import zipfile
    
    output_base.mkdir(parents=True, exist_ok=True)
    
    stats = {
        'zip_files_processed': 0,
        'xml_files_extracted': 0,
        'pdf_files_extracted': 0,
        'files_skipped': 0,
        'errors': 0
    }
    
    print(f"    📂 Processing DOC folder: {doc_folder}")
    
    # Process each subfolder in DOC
    subfolders = [sf for sf in doc_folder.iterdir() if sf.is_dir()]
    print(f"    📁 Found {len(subfolders)} subfolders")
    
    for subfolder in subfolders:
        subfolder_output = output_base / subfolder.name
        subfolder_output.mkdir(parents=True, exist_ok=True)
        
        zip_files = list(subfolder.glob("*.zip"))
        if not zip_files:
            continue
            
        print(f"      📦 {subfolder.name}: Processing {len(zip_files)} ZIP files")
        
        # Process ZIP files in this subfolder
        for j, zip_file in enumerate(zip_files, 1):
            try:
                # Progress update for large folders
                if len(zip_files) > 100 and j % 100 == 0:
                    print(f"        🔄 Progress: {j}/{len(zip_files)}")
                
                zip_output_dir = subfolder_output / zip_file.stem
                zip_output_dir.mkdir(parents=True, exist_ok=True)
                
                with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                    all_files = zip_ref.namelist()
                    target_files = [f for f in all_files if f.lower().endswith(('.pdf', '.xml'))]
                    
                    zip_xml = 0
                    zip_pdf = 0
                    
                    for file_name in target_files:
                        try:
                            zip_ref.extract(file_name, zip_output_dir)
                            if file_name.lower().endswith('.xml'):
                                zip_xml += 1
                            elif file_name.lower().endswith('.pdf'):
                                zip_pdf += 1
                        except Exception:
                            continue
                    
                    stats['xml_files_extracted'] += zip_xml
                    stats['pdf_files_extracted'] += zip_pdf
                    stats['files_skipped'] += len(all_files) - len(target_files)
                    stats['zip_files_processed'] += 1
                    
            except Exception as e:
                print(f"        ❌ Error with {zip_file.name}: {e}")
                stats['errors'] += 1
        
        print(f"      ✅ Completed {subfolder.name}: {len(zip_files)} ZIP files")
    
    return stats

# Run the comprehensive extraction
print("🎯 This will process ALL EPO archives in your archive/EPO directory:")
print("  - EPRTBJV2025000021001001.zip")
print("  - EPRTBJV2025000022001001.zip") 
print("  - EPRTBJV2025000023001001.zip")
print("  - EPRTBJV2025000024001001.zip")
print("  - EPRTBJV2025000024001001/ (already extracted)")
print("\n⚠️ This may take several hours depending on the total number of ZIP files")
print("🎯 Only PDF and XML files will be kept")

proceed = input("\nDo you want to process ALL EPO archives? (y/N): ")

if proceed.lower() == 'y':
    comprehensive_stats = process_all_epo_archives()
    
    if comprehensive_stats:
        print(f"\n🎉 All done! Check your results in: ../data/raw/EPO/")
        print(f"📊 Total files extracted: {comprehensive_stats['xml_files_extracted'] + comprehensive_stats['pdf_files_extracted']}")
else:
    print("❌ Processing cancelled")
    

🎯 This will process ALL EPO archives in your archive/EPO directory:
  - EPRTBJV2025000021001001.zip
  - EPRTBJV2025000022001001.zip
  - EPRTBJV2025000023001001.zip
  - EPRTBJV2025000024001001.zip
  - EPRTBJV2025000024001001/ (already extracted)

⚠️ This may take several hours depending on the total number of ZIP files
🎯 Only PDF and XML files will be kept



Do you want to process ALL EPO archives? (y/N):  y


🚀 Processing ALL EPO Archives
📂 Source directory: ../data/archive/EPO
📁 Output directory: ../data/raw/EPO
📅 Start time: 14:04:10
🔍 Found 5 EPO archives to process:
  📦 EPRTBJV2025000021001001 (main_zip)
  📦 EPRTBJV2025000024001001 (directory)
  📦 EPRTBJV2025000023001001 (main_zip)
  📦 EPRTBJV2025000022001001 (main_zip)
  📦 EPRTBJV2025000024001001 (main_zip)


[1/5] Processing: EPRTBJV2025000021001001
  🔄 Extracting main ZIP: EPRTBJV2025000021001001.zip
    📂 Processing DOC folder: /tmp/tmpabl5zryz/DOC
    📁 Found 9 subfolders
      📦 EPW1B8: Processing 24 ZIP files
      ✅ Completed EPW1B8: 24 ZIP files
      📦 EPW1B9: Processing 4 ZIP files
      ✅ Completed EPW1B9: 4 ZIP files
      📦 EPW1A9: Processing 1 ZIP files
      ✅ Completed EPW1A9: 1 ZIP files
      📦 EPNWA2: Processing 430 ZIP files
        🔄 Progress: 100/430
        🔄 Progress: 200/430
        🔄 Progress: 300/430
        🔄 Progress: 400/430
      ✅ Completed EPNWA2: 430 ZIP files
      📦 EPNWB1: Processing 2128 ZIP files
