# 00 - Environment Check

This notebook validates the environment for GPU storage ML experiments across different platforms:
- **Local development**: Single node with GPU support
- **Google Colab**: Free GPU instances
- **AWS SageMaker**: Unified Studio integration
- **EMR Spark**: Distributed processing

## Key Checks
1. Python environment and core libraries
2. GPU availability and specifications
3. Storage and memory characteristics
4. Cloud platform detection
5. Spark configuration readiness

In [None]:
import sys
import platform
import os
import psutil
import subprocess
from pathlib import Path
import json

# Banner for the validation run.
banner_rule = "=" * 60
print(banner_rule)
print("üîß ENVIRONMENT VALIDATION FOR GPU STORAGE ML PROJECT")
print(banner_rule)

# Basic system info: collect each (label, value) pair, then print them in order.
print("\nüìã System Information:")
logical_cores = psutil.cpu_count()
physical_cores = psutil.cpu_count(logical=False)
total_memory_gb = psutil.virtual_memory().total / (1024**3)
system_facts = (
    ("Python", sys.version.split()[0]),
    ("Platform", platform.platform()),
    ("Architecture", platform.machine()),
    ("CPU cores", f"{logical_cores} ({physical_cores} physical)"),
    ("Memory", f"{total_memory_gb:.1f} GB"),
)
for fact_label, fact_value in system_facts:
    print(f"   {fact_label}: {fact_value}")

# Cloud platform detection
# Each flag is a cheap heuristic: an already-imported module for Colab, and
# the presence of a well-known directory for SageMaker and EMR.
print("\n‚òÅÔ∏è Cloud Platform Detection:")
is_colab = 'google.colab' in sys.modules
is_sagemaker = os.path.exists('/opt/ml')
is_emr = os.path.exists('/etc/hadoop')

# "Local" simply means none of the hosted platforms matched.
platform_report = (
    ("Google Colab", is_colab),
    ("AWS SageMaker", is_sagemaker),
    ("EMR Cluster", is_emr),
    ("Local Environment", not any((is_colab, is_sagemaker, is_emr))),
)
for platform_label, detected in platform_report:
    verdict = '‚úÖ Yes' if detected else '‚ùå No'
    print(f"   {platform_label}: {verdict}")

In [None]:
# GPU detection and specifications
print(f"\nüéÆ GPU Configuration:")

# Default to False up front so `cuda_available` is always defined afterwards,
# even when PyTorch cannot be imported.
cuda_available = False
try:
    import torch
except ImportError:
    print(f"   ‚ùå PyTorch not available")
else:
    print(f"   PyTorch: {torch.__version__}")
    cuda_available = torch.cuda.is_available()
    print(f"   CUDA available: {'‚úÖ Yes' if cuda_available else '‚ùå No'}")

    if cuda_available:
        device_count = torch.cuda.device_count()
        print(f"   GPU count: {device_count}")
        for i in range(device_count):
            props = torch.cuda.get_device_properties(i)
            memory_gb = props.total_memory / (1024**3)
            print(f"   GPU {i}: {props.name} ({memory_gb:.1f} GB)")
            print(f"           Compute capability: {props.major}.{props.minor}")

        # Test basic GPU operation
        try:
            x = torch.rand(1000, 1000, device='cuda')
            y = torch.matmul(x, x.T)
            # CUDA work is queued asynchronously; wait for it to finish so a
            # failing kernel is reported here instead of in a later call.
            torch.cuda.synchronize()
            print(f"   ‚úÖ Basic GPU operations working")
        except Exception as e:
            print(f"   ‚ö†Ô∏è GPU operation test failed: {e}")
    else:
        print(f"   ‚ÑπÔ∏è Running in CPU-only mode")

# NVIDIA tools check
def parse_driver_version(smi_output):
    """Return the driver version found in ``nvidia-smi`` output, or None.

    Scans the text line by line for a ``Driver Version:`` label and returns
    the first whitespace-delimited token that follows it. Returns None when
    the label is absent or nothing follows it.
    """
    for line in smi_output.splitlines():
        _, marker, tail = line.partition('Driver Version:')
        fields = tail.split()
        if marker and fields:
            return fields[0]
    return None


try:
    # Bound the call so an unresponsive driver cannot hang the notebook.
    result = subprocess.run(
        ['nvidia-smi'], capture_output=True, text=True, timeout=30
    )
except FileNotFoundError:
    print(f"   ‚ùå nvidia-smi not found")
except subprocess.TimeoutExpired:
    print(f"   ‚ùå nvidia-smi not working (timed out)")
except OSError as e:
    # e.g. the binary exists but is not executable by this user.
    print(f"   ‚ùå nvidia-smi not working ({e})")
else:
    if result.returncode == 0:
        print(f"   ‚úÖ nvidia-smi available")
        # Extract basic info from nvidia-smi
        driver_version = parse_driver_version(result.stdout)
        if driver_version is not None:
            print(f"   Driver: {driver_version}")
    else:
        print(f"   ‚ùå nvidia-smi not working")

In [None]:
# Storage characteristics
import glob

print(f"\nüíæ Storage Analysis:")
cwd = Path.cwd()
disk_usage = psutil.disk_usage(cwd)
bytes_per_gb = 1024**3
print(f"   Working directory: {cwd}")
print(f"   Available space: {disk_usage.free / bytes_per_gb:.1f} GB")
print(f"   Total space: {disk_usage.total / bytes_per_gb:.1f} GB")

# Check for different storage types
# Each category maps to glob patterns; a category is reported only when at
# least one pattern matches something on this machine.
storage_paths = {
    'Local NVMe/SSD': ['/dev/nvme*', '/dev/ssd*'],
    'Network mounts': ['/mnt/*', '/net/*'],
    'Cloud storage': ['/gcs/*', '/s3/*', '/efs/*']
}

for storage_type, patterns in storage_paths.items():
    found = [hit for pattern in patterns for hit in glob.glob(pattern)]
    if not found:
        continue
    # Show at most three matches, with a marker when more were found.
    more_marker = '...' if len(found) > 3 else ''
    print(f"   {storage_type}: {found[:3]}{more_marker}")

# Test write performance for quick assessment
# Writes a 10 MB probe file into the current working directory, forces it to
# disk with fsync, and reports the throughput. The probe file is always
# removed afterwards.
import time

test_file = Path.cwd() / 'temp_write_test.dat'
write_speed = None  # MB/s; stays None when the probe could not be measured
try:
    data = b'x' * (10 * 1024 * 1024)  # 10MB
    # perf_counter is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments the way time.time() can.
    start = time.perf_counter()
    with open(test_file, 'wb') as f:
        f.write(data)
        f.flush()
        os.fsync(f.fileno())
    write_time = time.perf_counter() - start
    if write_time > 0:
        write_speed = len(data) / (1024**2) / write_time
        print(f"   Sequential write speed: {write_speed:.1f} MB/s")
    else:
        # Avoid dividing by zero on a timer too coarse to see the write.
        print(f"   ‚ö†Ô∏è Write test failed: elapsed time too small to measure")
except Exception as e:
    print(f"   ‚ö†Ô∏è Write test failed: {e}")
finally:
    # Single cleanup path for both success and failure.
    try:
        test_file.unlink()
    except FileNotFoundError:
        pass  # the probe file was never created
    except OSError as e:
        print(f"   ‚ö†Ô∏è Write test failed: could not remove {test_file} ({e})")

In [None]:
# Core library availability
# Tries to import every listed package and prints its version, grouped by
# category. A failed import is reported and never aborts the check.
import importlib

print(f"\nüìö Library Availability:")
libraries = {
    'Data Science': ['pandas', 'numpy', 'matplotlib', 'scipy', 'scikit-learn'],
    'Big Data': ['pyspark', 'pyarrow'],
    'ML Acceleration': ['torch', 'tensorflow'],
    'Data Loading': ['webdataset', 'ffcv'],
    'Networking': ['ucx-py']
}

# Packages whose importable module name is not just the package name with
# '-' replaced by '_'. Without this mapping they would always be reported
# as "not installed".
import_names = {
    'scikit-learn': 'sklearn',
    'ucx-py': 'ucp',
}

for category, libs in libraries.items():
    print(f"   {category}:")
    for lib in libs:
        module_name = import_names.get(lib, lib.replace('-', '_'))
        try:
            module = importlib.import_module(module_name)
            version = getattr(module, '__version__', 'unknown')
            print(f"     ‚úÖ {lib}: {version}")
        except ImportError:
            print(f"     ‚ùå {lib}: not installed")
        except Exception as e:
            # Some packages fail at import time for non-import reasons
            # (e.g. a missing native runtime); report and keep going.
            print(f"     ‚ö†Ô∏è {lib}: error ({e})")

In [None]:
# Spark configuration check
# Starts (or reuses) a Spark session, runs a trivial count as a smoke test,
# and then checks separately whether RAPIDS cuDF can be imported.
print(f"\n‚ö° Spark Configuration:")
try:
    from pyspark.sql import SparkSession
    import pyspark
    print(f"   PySpark: {pyspark.__version__}")

    # Note whether a session was already active before this test, so that a
    # session this cell did not start is not stopped below.
    # getActiveSession only exists on PySpark >= 3.0, hence the getattr.
    get_active_session = getattr(SparkSession, "getActiveSession", None)
    preexisting_session = get_active_session() if get_active_session else None

    # Test basic Spark session
    spark = SparkSession.builder \
        .appName("EnvironmentTest") \
        .master("local[*]") \
        .config("spark.sql.adaptive.enabled", "true") \
        .getOrCreate()

    try:
        print(f"   ‚úÖ Spark session created successfully")
        print(f"   Spark version: {spark.version}")
        print(f"   Default parallelism: {spark.sparkContext.defaultParallelism}")

        # Test basic operation
        df = spark.range(1000).toDF("number")
        count = df.count()
        print(f"   ‚úÖ Basic Spark operation successful (count: {count})")
    finally:
        # Release the session even when the smoke test raises.
        if preexisting_session is None:
            spark.stop()

except Exception as e:
    print(f"   ‚ùå Spark test failed: {e}")

# Check for RAPIDS/GPU support
# Kept outside the Spark try-block so a Spark failure does not hide it.
try:
    import cudf
    print(f"   ‚úÖ RAPIDS cuDF available: {cudf.__version__}")
except ImportError:
    print(f"   ‚ùå RAPIDS cuDF not available")
except Exception as e:
    # Importing cuDF may fail for reasons other than it being absent.
    print(f"   ‚ùå RAPIDS cuDF not available ({e})")

In [None]:
# Environment summary and recommendations
print(f"\nüìä ENVIRONMENT SUMMARY:")
print(f"=" * 40)

import importlib.util


def is_installed(module_name):
    """Return True when ``module_name`` can be located without importing it.

    ``find_spec`` raises ValueError for a module already in ``sys.modules``
    whose ``__spec__`` is None; that case is treated as not installed.
    """
    try:
        return importlib.util.find_spec(module_name) is not None
    except (ImportError, ValueError):
        return False


# `cuda_available` is only present if the GPU cell ran; fall back to False.
gpu_detected = bool(globals().get('cuda_available', False))

# Create environment profile
# The capability flags reflect what is installed in this environment rather
# than fixed assumptions. "Installed" means locatable, not verified working.
env_profile = {
    'platform': {
        'local': not (is_colab or is_sagemaker or is_emr),
        'colab': is_colab,
        'sagemaker': is_sagemaker,
        'emr': is_emr
    },
    'compute': {
        'cpu_cores': psutil.cpu_count(),
        'memory_gb': round(psutil.virtual_memory().total / (1024**3), 1),
        'gpu_available': gpu_detected
    },
    'capabilities': {
        'basic_ml': is_installed('pandas') and is_installed('numpy'),
        'gpu_acceleration': gpu_detected,
        'distributed_spark': is_installed('pyspark'),
        'advanced_loaders': is_installed('ffcv') or is_installed('webdataset')
    }
}

print("\nüéØ Recommended Experiment Configuration:")

# Advice for each hosted platform, listed in the order the platforms take
# precedence. Each entry is (heading, tips).
hosted_advice = {
    'colab': (
        "   üì± Google Colab detected:",
        (
            "     - Use free GPU for training experiments",
            "     - Limited to ~12GB memory, optimize batch sizes",
            "     - Mount Google Drive for large datasets",
        ),
    ),
    'sagemaker': (
        "   üî¨ AWS SageMaker detected:",
        (
            "     - Use SageMaker Studio for integrated experience",
            "     - S3 integration for large-scale storage experiments",
            "     - Consider SageMaker Processing for distributed jobs",
        ),
    ),
    'emr': (
        "   üî• EMR Cluster detected:",
        (
            "     - Focus on distributed Spark experiments",
            "     - HDFS and S3 storage comparisons",
            "     - Multi-node GPU coordination if available",
        ),
    ),
}
# Used when none of the hosted platforms was detected.
local_advice = (
    "   üñ•Ô∏è Local Environment:",
    (
        "     - Full control over storage configurations",
        "     - Test different storage backends (NVMe, NFS, object)",
        "     - Single-node optimizations",
    ),
)

# Take the first platform whose flag is set, otherwise the local advice.
platform_flags = env_profile['platform']
advice_heading, advice_tips = next(
    (advice for key, advice in hosted_advice.items() if platform_flags[key]),
    local_advice,
)
print(advice_heading)
for tip in advice_tips:
    print(tip)

compute_note = (
    "   üéÆ GPU acceleration enabled - prioritize GPU-bound experiments"
    if env_profile['compute']['gpu_available']
    else "   üíª CPU-only mode - focus on I/O and data pipeline experiments"
)
print(compute_note)

print("\n‚úÖ Environment check complete! Ready for GPU storage ML experiments.")

# Save environment profile for later use
profile_path = Path('../results/environment_profile.json')
# Create the results directory if it is missing; opening the file for
# writing would otherwise raise FileNotFoundError.
profile_path.parent.mkdir(parents=True, exist_ok=True)
with open(profile_path, 'w') as f:
    json.dump(env_profile, f, indent=2)
print(f"üìÑ Environment profile saved to results/environment_profile.json")