In [None]:
import sys
import os
import time
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from multiprocessing import cpu_count
import psutil

# Add the project's extraction package directory to the import path.
# The membership check keeps this cell idempotent: re-running it in a live
# kernel no longer appends the same directory to sys.path again.
# NOTE(review): "preperation" presumably mirrors the on-disk directory name;
# verify before correcting the spelling.
PROJECT_ROOT = Path("/mnt/home/dchhantyal/3d-cnn-classification")
_extractor_src_dir = str(PROJECT_ROOT / "preperation" / "python")
if _extractor_src_dir not in sys.path:
    sys.path.append(_extractor_src_dir)

# Import parallel extraction modules
from parallel_nucleus_extractor import (
    ParallelNucleusExtractor, 
    ParallelConfig, 
    create_optimized_config,
    ProgressTracker
)

# Report the host's CPU and memory headroom before any extraction work starts.
# Readings are gathered in the same order the report lists them; the CPU
# sample blocks for one second (interval=1).
host_cpu_cores = cpu_count()
host_available_gb = psutil.virtual_memory().available / (1024**3)
host_total_gb = psutil.virtual_memory().total / (1024**3)
host_cpu_percent = psutil.cpu_percent(interval=1)

print(
    f"üñ•Ô∏è  System Information:",
    f"   ‚Ä¢ CPU cores: {host_cpu_cores}",
    f"   ‚Ä¢ Available memory: {host_available_gb:.1f} GB",
    f"   ‚Ä¢ Total memory: {host_total_gb:.1f} GB",
    f"   ‚Ä¢ CPU usage: {host_cpu_percent:.1f}%",
    sep="\n",
)

In [None]:
# Configuration
# Root directory handed to the extractor, and the dataset to load from it.
DATA_PATH = "/mnt/home/dchhantyal/3d-cnn-classification"
DATASET_NAME = "230212_stack6"  # Change this to your target dataset

# Sizing hint passed to the optimizer; adjust based on your dataset.
estimated_dataset_size = 500

# Memory currently free on this host, converted from bytes to GB.
available_memory = psutil.virtual_memory().available / 1024**3

print("üîß Creating optimized configuration...")
# Offer the optimizer only 80% of the free memory, leaving the rest as headroom.
config = create_optimized_config(
    available_memory_gb=0.8 * available_memory,
    dataset_size=estimated_dataset_size,
)

# Alternative: build the configuration by hand instead of auto-optimizing.
# custom_config = ParallelConfig(
#     max_workers_batch=4,
#     max_workers_frames=8,
#     max_workers_io=16,
#     chunk_size=50,
#     enable_detailed_logging=True,
#     save_intermediate_results=True
# )

In [None]:
# Build the parallel extractor from the paths and config chosen above.
print("üöÄ Initializing Parallel Nucleus Extractor...")
extractor = ParallelNucleusExtractor(DATA_PATH, config)

# Load the dataset; `success` gates the extraction cells that follow.
print(f"üìä Loading dataset: {DATASET_NAME}")
success = extractor.load_dataset(DATASET_NAME)

if not success:
    print(f"‚ùå Failed to load dataset: {DATASET_NAME}")
    print("Please check the dataset path and name.")
else:
    print("‚úÖ Dataset loaded successfully!")

    # Per-nucleus classification table; later cells read `df` as well.
    df = extractor.metadata["classes"]

    # Row masks for nuclei flagged with both events and with neither event.
    both_events_mask = (df['mitotic'] == 1) & (df['death'] == 1)
    no_event_mask = (df['mitotic'] == 0) & (df['death'] == 0)

    print("\nüìà Dataset Statistics:")
    print(
        f"   ‚Ä¢ Total nuclei: {len(df)}",
        f"   ‚Ä¢ Mitotic events: {df['mitotic'].sum()}",
        f"   ‚Ä¢ Death events: {df['death'].sum()}",
        f"   ‚Ä¢ Both events: {both_events_mask.sum()}",
        f"   ‚Ä¢ Normal nuclei: {no_event_mask.sum()}",
        sep="\n",
    )

    # Number of nuclei recorded per frame, ordered by frame index.
    print("\nüé¨ Frame Distribution:")
    frame_counts = df['frame'].value_counts().sort_index()
    first_frame = frame_counts.index.min()
    last_frame = frame_counts.index.max()
    print(f"   ‚Ä¢ Frame range: {first_frame} - {last_frame}")
    print(f"   ‚Ä¢ Avg nuclei per frame: {frame_counts.mean():.1f}")

In [None]:
# Test single nucleus extraction (performance comparison).
# Uses the first row of the classification table as the sample. The result is
# read as a mapping with 'extraction_success', 'successful_frames',
# 'total_frames' and a 'time_series' mapping of per-frame entries.
if success:
    if len(df) == 0:
        # Guard: df.iloc[0] would raise IndexError on an empty metadata table.
        print("No nuclei found in the dataset metadata; skipping single nucleus test.")
    else:
        # Get a sample nucleus for testing
        sample_nucleus = df.iloc[0]
        nucleus_id = int(sample_nucleus['nucleus_id'])
        event_frame = int(sample_nucleus['frame'])

        print(f"üß™ Testing single nucleus extraction...")
        print(f"   ‚Ä¢ Nucleus ID: {nucleus_id}")
        print(f"   ‚Ä¢ Event frame: {event_frame}")
        print(f"   ‚Ä¢ Classification: Mitotic={sample_nucleus['mitotic']}, Death={sample_nucleus['death']}")

        # Time the parallel extraction with a monotonic, high-resolution clock;
        # time.time() can jump when the system clock is adjusted.
        start_time = time.perf_counter()
        result = extractor.extract_nucleus_time_series_parallel(nucleus_id, event_frame)
        parallel_time = time.perf_counter() - start_time

        if result and result['extraction_success']:
            print(f"‚úÖ Parallel extraction successful!")
            print(f"   ‚Ä¢ Processing time: {parallel_time:.2f} seconds")
            print(f"   ‚Ä¢ Successful frames: {result['successful_frames']}/{result['total_frames']}")
            print(f"   ‚Ä¢ Extracted frames: {list(result['time_series'].keys())}")

            # Display frame-level results: bounding box on success, error text otherwise.
            for frame_label, frame_data in result['time_series'].items():
                if frame_data.get('success', False):
                    bbox = frame_data['bbox']
                    print(f"     {frame_label}: ‚úÖ Cropped to {bbox}")
                else:
                    print(f"     {frame_label}: ‚ùå {frame_data.get('error', 'Unknown error')}")
        else:
            print(f"‚ùå Parallel extraction failed")

    # Memory usage check (reported whether or not the extraction ran).
    memory_info = psutil.virtual_memory()
    print(f"\nüíæ Memory Usage:")
    print(f"   ‚Ä¢ Available: {memory_info.available / (1024**3):.1f} GB")
    print(f"   ‚Ä¢ Used: {memory_info.percent:.1f}%")

In [None]:
# Small batch test: a quick trial run before the full extraction.
if success:
    # Single source of truth for the trial size (was repeated as a literal 10).
    small_batch_size = 10
    print(f"üß™ Testing small batch extraction ({small_batch_size} nuclei)...")

    # Time the batch with a monotonic clock; time.time() can jump when the
    # system clock is adjusted.
    start_time = time.perf_counter()
    successful = extractor.batch_extract_nuclei_parallel(
        max_samples=small_batch_size,
        event_types=["death", "mitotic"],  # Focus on interesting events
        dataset_name=DATASET_NAME
    )
    batch_time = time.perf_counter() - start_time

    # Derive per-nucleus figures from the number actually extracted, matching
    # how the full-batch cell computes its rate. Fall back to NaN instead of
    # raising when nothing was extracted or no time elapsed.
    seconds_per_nucleus = batch_time / successful if successful else float("nan")
    nuclei_per_second = successful / batch_time if batch_time > 0 else float("nan")

    print(f"\nüìä Small Batch Results:")
    print(f"   ‚Ä¢ Processing time: {batch_time:.2f} seconds")
    print(f"   ‚Ä¢ Successful extractions: {successful}")
    print(f"   ‚Ä¢ Average time per nucleus: {seconds_per_nucleus:.2f} seconds")
    print(f"   ‚Ä¢ Estimated rate: {nuclei_per_second:.2f} nuclei/second")

    # Memory usage after batch
    memory_info = psutil.virtual_memory()
    print(f"   ‚Ä¢ Memory usage: {memory_info.percent:.1f}%")

In [None]:
# Full batch extraction by event type.
# Runs one extraction pass per entry in `event_configs` and accumulates the
# success count and wall time across passes.
if success:
    print(f"üöÄ Running full parallel batch extraction...")

    # Extract different event types separately for better organization
    event_configs = [
        {"types": ["death"], "max_samples": 100, "description": "Death Events"},
        {"types": ["mitotic"], "max_samples": 100, "description": "Mitotic Events"},
        {"types": ["normal"], "max_samples": 50, "description": "Normal Nuclei (Controls)"},
    ]

    total_successful = 0
    total_time = 0.0

    for config_item in event_configs:
        print(f"\nüéØ Processing {config_item['description']}...")

        # Monotonic clock for durations; time.time() can jump when the system
        # clock is adjusted.
        start_time = time.perf_counter()
        successful = extractor.batch_extract_nuclei_parallel(
            max_samples=config_item['max_samples'],
            event_types=config_item['types'],
            dataset_name=DATASET_NAME
        )
        processing_time = time.perf_counter() - start_time

        total_successful += successful
        total_time += processing_time

        # NaN rather than ZeroDivisionError if no measurable time elapsed.
        pass_rate = successful / processing_time if processing_time > 0 else float("nan")

        print(f"   ‚úÖ Completed: {successful}/{config_item['max_samples']} nuclei")
        print(f"   ‚è±Ô∏è  Time: {processing_time:.2f} seconds")
        print(f"   üöÄ Rate: {pass_rate:.2f} nuclei/second")

    # total_time stays 0 when event_configs is empty; report NaN instead of raising.
    overall_rate = total_successful / total_time if total_time > 0 else float("nan")

    print(f"\nüéâ Complete Extraction Summary:")
    print(f"   ‚Ä¢ Total successful: {total_successful}")
    print(f"   ‚Ä¢ Total time: {total_time/60:.2f} minutes")
    print(f"   ‚Ä¢ Overall rate: {overall_rate:.2f} nuclei/second")

    # Final memory check
    memory_info = psutil.virtual_memory()
    print(f"   ‚Ä¢ Final memory usage: {memory_info.percent:.1f}%")

In [None]:
# Performance Analysis and Optimization Tips
# Prints the active worker/chunk settings, a list of tuning hints, sample
# ParallelConfig snippets (as text only), and a snapshot of system load.

print("üîß Performance Optimization Guide:")

# Worker and chunk settings read from the active `config`, next to the core count.
print("\nüìä Current Configuration Analysis:")
print(
    f"   ‚Ä¢ CPU cores available: {cpu_count()}",
    f"   ‚Ä¢ Batch workers: {config.max_workers_batch}",
    f"   ‚Ä¢ Frame workers: {config.max_workers_frames}",
    f"   ‚Ä¢ I/O workers: {config.max_workers_io}",
    f"   ‚Ä¢ Chunk size: {config.chunk_size}",
    sep="\n",
)

optimization_tips = (
    "   1. **Memory**: If you have more RAM, increase chunk_size for fewer disk writes",
    "   2. **CPU**: If CPU usage is low, increase max_workers_batch",
    "   3. **Storage**: If using SSD, increase max_workers_io for faster file operations",
    "   4. **Network Storage**: If using network storage, decrease I/O workers to avoid saturation",
    "   5. **Large Datasets**: Use save_intermediate_results=True to avoid memory buildup",
)
print("\nüí° Optimization Tips:")
print("\n".join(optimization_tips))

# Example configurations shown to the reader; this text is printed, not executed.
tuning_examples = """
# For high-memory systems (32+ GB RAM):
high_memory_config = ParallelConfig(
    max_workers_batch=8,
    chunk_size=200,
    max_memory_gb=24.0
)

# For many-core systems (16+ cores):
many_core_config = ParallelConfig(
    max_workers_batch=12,
    max_workers_frames=16,
    max_workers_io=32
)

# For network storage systems:
network_config = ParallelConfig(
    max_workers_io=4,  # Reduce I/O workers
    chunk_size=20,     # Smaller chunks
    save_intermediate_results=True
)
"""
print("\n‚ö° Performance Tuning Examples:")
print(tuning_examples)

# System resource monitoring; the CPU sample blocks for one second (interval=1).
# NOTE(review): disk usage is read for '/', which may not be the filesystem
# that holds DATA_PATH — confirm this is the intended mount.
cpu_percent = psutil.cpu_percent(interval=1)
memory_info = psutil.virtual_memory()
disk_info = psutil.disk_usage('/')

print(
    "\nüñ•Ô∏è  Current System Status:",
    f"   ‚Ä¢ CPU usage: {cpu_percent:.1f}%",
    f"   ‚Ä¢ Memory usage: {memory_info.percent:.1f}%",
    f"   ‚Ä¢ Disk usage: {disk_info.percent:.1f}%",
    f"   ‚Ä¢ Available memory: {memory_info.available / (1024**3):.1f} GB",
    sep="\n",
)

In [None]:
# Advanced Configuration Examples
# Each example constructs a ParallelConfig and prints a short description.
# None of these configs is passed to an extractor in this cell; the closing
# snippet (printed as text) shows how one would be used.

print("üî¨ Advanced Usage Examples:\n")

# Example 1: contiguous window of frame offsets around the event frame.
print("1. Custom Time Windows:")
custom_time_config = ParallelConfig(
    max_workers_batch=4,
    max_workers_frames=10,
    frame_offsets=[-2, -1, 0, 1, 2],  # 5-frame window
)
print(
    "   ‚Ä¢ 5-frame window: t-2, t-1, t, t+1, t+2",
    "   ‚Ä¢ Good for studying longer temporal dynamics",
    sep="\n",
)

# Example 2: widely spaced frame offsets.
print("\n2. Sparse Temporal Sampling:")
sparse_config = ParallelConfig(
    max_workers_batch=6,
    max_workers_frames=8,
    frame_offsets=[-10, -5, 0, 5, 10],  # Sparse sampling
)
print(
    "   ‚Ä¢ Sparse sampling: t-10, t-5, t, t+5, t+10",
    "   ‚Ä¢ Good for long-term behavior analysis",
    sep="\n",
)

# Example 3: only the event frame itself (for 2D CNN).
print("\n3. Single Frame Extraction (2D CNN):")
single_frame_config = ParallelConfig(
    max_workers_batch=12,
    max_workers_frames=1,
    frame_offsets=[0],  # Only event frame
)
print(
    "   ‚Ä¢ Single frame: t only",
    "   ‚Ä¢ Good for 2D CNN training",
    sep="\n",
)

# Example 4: settings for a memory-constrained system.
print("\n4. Memory-Constrained Configuration:")
low_memory_config = ParallelConfig(
    save_intermediate_results=True,
    max_memory_gb=4.0,
    chunk_size=10,
    max_workers_batch=2,
)
print(
    "   ‚Ä¢ Small batch sizes and chunks",
    "   ‚Ä¢ Immediate result saving",
    sep="\n",
)

# Usage snippet shown to the reader; this text is printed, not executed.
custom_usage_snippet = """
# Create extractor with custom config
custom_extractor = ParallelNucleusExtractor(DATA_PATH, custom_time_config)
custom_extractor.load_dataset(DATASET_NAME)

# Run extraction with custom parameters
results = custom_extractor.batch_extract_nuclei_parallel(
    max_samples=50,
    event_types=["death"],
    dataset_name=DATASET_NAME
)
"""
print("\nüéØ To use custom configuration:")
print(custom_usage_snippet)