# Improved Data Engineering Package Validation

This notebook validates packages with robust error handling for complex packages like apache-airflow.

In [None]:
import sys
import importlib
import subprocess
import pandas as pd
from IPython.display import display, HTML

def _read_module_version(module):
    """Return the first usable version string found on *module*, else 'Unknown'."""
    from types import ModuleType  # local import keeps this cell self-contained

    for attr in ('__version__', 'VERSION', 'version', '_version'):
        value = getattr(module, attr, None)
        # A submodule (e.g. ``pkg.version``) or a function is not a version
        # value; stringifying it would report "<module ...>" as the version.
        if not value or isinstance(value, ModuleType) or callable(value):
            continue
        return str(value)
    return 'Unknown'


def check_package_robust(package_name, import_name=None, fallback_imports=None):
    """Check whether a package is usable and report its version.

    Three strategies are tried in order:

    1. Import ``import_name`` and read a version attribute from the module.
    2. If that raises ``ImportError``, import each entry of
       ``fallback_imports`` in turn and use the first one that loads.
    3. If no fallback loads, ask ``pip show`` whether the distribution is
       installed even though it cannot be imported.

    Args:
        package_name: Distribution name, as passed to ``pip show``.
        import_name: Module name to import; defaults to ``package_name``.
        fallback_imports: Optional iterable of alternative module names to
            try when the primary import raises ``ImportError``.

    Returns:
        A dict with the keys ``'status'`` (marker string), ``'version'`` and
        ``'method'`` (the strategy that produced the answer).
    """
    if import_name is None:
        import_name = package_name

    # Strategy 1: primary import. Version probing stays inside the ``try`` so
    # that lazily-loading modules which raise on attribute access are still
    # routed to the handlers below.
    try:
        module = importlib.import_module(import_name)
        return {'status': '‚úÖ', 'version': _read_module_version(module), 'method': 'import'}

    except ImportError:
        # Strategy 2: fallback imports for complex packages.
        for fallback in fallback_imports or ():
            try:
                module = importlib.import_module(fallback)
                version = _read_module_version(module)
            except Exception:
                # Any failure just means "try the next candidate"; unlike a
                # bare ``except`` this lets KeyboardInterrupt/SystemExit through.
                continue
            return {'status': '‚úÖ', 'version': version, 'method': f'fallback: {fallback}'}

        # Strategy 3: package installed but not importable. Run pip through the
        # current interpreter so the answer describes this environment rather
        # than whichever ``pip`` happens to be first on PATH.
        pip_cmd = [sys.executable, '-m', 'pip'] if sys.executable else ['pip']
        try:
            result = subprocess.run(pip_cmd + ['show', package_name],
                                    capture_output=True, text=True, timeout=10)
            if result.returncode == 0:
                for line in result.stdout.splitlines():
                    if line.startswith('Version:'):
                        version = line.split(':', 1)[1].strip()
                        return {'status': '‚ö†Ô∏è', 'version': version, 'method': 'pip (import failed)'}
        except Exception:
            # Best effort only: a missing pip or a timeout is treated the same
            # as "not installed".
            pass

        return {'status': '‚ùå', 'version': 'Not Found', 'method': 'not installed'}

    except Exception as e:
        # The module exists but blew up while importing or being inspected.
        return {'status': '‚ö†Ô∏è', 'version': f'Error: {str(e)[:50]}...', 'method': 'exception'}

# Banner printed once when this cell runs, marking the start of validation output.
print("üîç Robust Package Validation Starting...\n")

## Package Validation with Fallback Strategies

In [None]:
# Each entry names a distribution ('name'), the module used to import it
# ('import') and, optionally, alternative modules to try ('fallbacks').
packages_config = [
    # Base packages (already in jupyter/pyspark-notebook)
    {'name': 'pyarrow', 'import': 'pyarrow'},
    {'name': 'pyspark', 'import': 'pyspark'},

    # Core data processing
    {'name': 'boto3', 'import': 'boto3'},

    # Data lake and streaming
    {'name': 'deltalake', 'import': 'deltalake'},
    {'name': 'kafka-python', 'import': 'kafka'},
    {'name': 'minio', 'import': 'minio'},
    {'name': 'lakefs-client', 'import': 'lakefs_client'},

    # Data quality and orchestration (complex packages)
    {'name': 'great-expectations', 'import': 'great_expectations'},
    {'name': 'apache-airflow', 'import': 'airflow', 'fallbacks': ['airflow.version', 'airflow.configuration']},

    # Model serving and deployment
    {'name': 'bentoml', 'import': 'bentoml'},
    {'name': 'kubeflow-training', 'import': 'kubeflowtraining', 'fallbacks': ['kubeflow.training']},
    {'name': 'kubernetes', 'import': 'kubernetes', 'fallbacks': ['kubernetes.client']},

    # Jupyter environment
    {'name': 'ipywidgets', 'import': 'ipywidgets'},
]

print(f"Checking {len(packages_config)} packages with robust validation...\n")

# Check every configured package, echoing progress on one line per package
# and collecting one table row per package.
results = []
for entry in packages_config:
    dist_name, module_name = entry['name'], entry['import']
    alternatives = entry.get('fallbacks')

    print(f"Checking {dist_name}...", end=" ")
    outcome = check_package_robust(dist_name, module_name, alternatives)

    row = {
        'Package': dist_name,
        'Status': outcome['status'],
        'Version': outcome['version'],
        'Method': outcome['method'],
    }
    results.append(row)

    print(f"{outcome['status']} ({outcome['method']})")

# Render the collected rows as an HTML table below a separator line.
print("\n" + "=" * 60)
df = pd.DataFrame(results)
display(HTML(df.to_html(index=False, escape=False)))

## Detailed Analysis

In [None]:
# Summarise the validation table: how many packages fall under each status
# marker and which detection method produced each result.
status_counts = df['Status'].value_counts()
method_counts = df['Method'].value_counts()

# Looked up once and reused for both the printed count and the success rate.
working_count = status_counts.get('‚úÖ', 0)

print("üìä Installation Analysis:")
print(f"‚úÖ Working: {working_count}")
print(f"‚ö†Ô∏è Issues: {status_counts.get('‚ö†Ô∏è', 0)}")
print(f"‚ùå Missing: {status_counts.get('‚ùå', 0)}")

total = len(results)
# Guard against an empty result list so the summary never divides by zero.
success_rate = (working_count / total) * 100 if total else 0.0
print(f"\nüìà Success Rate: {success_rate:.1f}%")

# Show problematic packages (anything whose status is not the "working" marker)
problematic = df[df['Status'] != '‚úÖ']
if not problematic.empty:
    print("\n‚ö†Ô∏è Packages needing attention:")
    for _, row in problematic.iterrows():
        print(f"  ‚Ä¢ {row['Package']}: {row['Version']} ({row['Method']})")
else:
    print("\nüéâ All packages are working correctly!")

print("\nüîß Detection Methods Used:")
for method, count in method_counts.items():
    print(f"  ‚Ä¢ {method}: {count} packages")

## Functionality Tests for Complex Packages

In [None]:
print("üß™ Testing complex package functionality...\n")

# Each entry is a (component, status marker, human-readable result) tuple.
functionality_tests = []

# Test Apache Airflow (complex package)
print("Testing Apache Airflow...")
try:
    import airflow
    from airflow import __version__ as airflow_version
    functionality_tests.append(('Apache Airflow', '‚úÖ', f'Core import successful (v{airflow_version})'))

    # Configuration access is reported separately so a config problem does not
    # mask a successful core import.
    try:
        from airflow.configuration import conf
        functionality_tests.append(('Airflow Config', '‚úÖ', 'Configuration accessible'))
    except Exception as e:
        functionality_tests.append(('Airflow Config', '‚ö†Ô∏è', f'Config issue: {str(e)[:40]}...'))

except Exception as e:
    functionality_tests.append(('Apache Airflow', '‚ùå', f'Import failed: {str(e)[:40]}...'))

# Test Great Expectations
print("Testing Great Expectations...")
try:
    import great_expectations as gx
    functionality_tests.append(('Great Expectations', '‚úÖ', f'Import successful (v{gx.__version__})'))
except Exception as e:
    functionality_tests.append(('Great Expectations', '‚ùå', f'Failed: {str(e)[:40]}...'))

# Test Kubeflow Training
print("Testing Kubeflow Training...")
# Try the same module names the package list uses for kubeflow-training
# ('kubeflowtraining', then 'kubeflow.training'). Gating on the first name
# alone reports a failure even when 'kubeflow.training' is importable.
kubeflow_error = None
for kubeflow_module in ('kubeflowtraining', 'kubeflow.training'):
    try:
        importlib.import_module(kubeflow_module)
    except Exception as e:
        # Keep the error: the name bound by ``as e`` is cleared after the handler.
        kubeflow_error = e
        continue

    functionality_tests.append(('Kubeflow Training', '‚úÖ', 'Import successful'))

    # Test training module
    try:
        from kubeflow import training
        functionality_tests.append(('Training Module', '‚úÖ', 'Training module accessible'))
    except Exception as e:
        functionality_tests.append(('Training Module', '‚ö†Ô∏è', f'Module issue: {str(e)[:40]}...'))
    break
else:
    # No candidate module could be imported.
    functionality_tests.append(('Kubeflow Training', '‚ùå', f'Failed: {str(kubeflow_error)[:40]}...'))

# Test BentoML
print("Testing BentoML...")
try:
    import bentoml
    functionality_tests.append(('BentoML', '‚úÖ', f'Import successful (v{bentoml.__version__})'))
except Exception as e:
    functionality_tests.append(('BentoML', '‚ùå', f'Failed: {str(e)[:40]}...'))

# Display results
print("\n" + "="*50)
func_df = pd.DataFrame(functionality_tests, columns=['Component', 'Status', 'Result'])
display(HTML(func_df.to_html(index=False, escape=False)))

print("\n‚ú® Functionality testing complete!")

## Environment Summary

In [None]:
import platform
import os

# Interpreter and host details.
python_version = sys.version.split()[0]
for info_line in (
    "üñ•Ô∏è Environment Information:",
    f"Python: {python_version}",
    f"Platform: {platform.system()} {platform.release()}",
    f"Architecture: {platform.machine()}",
):
    print(info_line)

# A non-empty CODER_AGENT_TOKEN environment variable selects the Coder branch.
if not os.getenv('CODER_AGENT_TOKEN'):
    print("Environment: Standard Jupyter")
else:
    print("Environment: Coder Workspace")
    print("Workspace: Data Engineering Template")

# Recap of the validation run performed in the earlier cells.
print("\nüì¶ Package Summary:")
print(f"Total packages checked: {len(results)}")
print(f"Validation strategies used: {len(method_counts)}")
print("Complex packages handled: apache-airflow, kubeflow-training, kubernetes")

print("\nüéØ This robust validation handles complex packages with multiple import strategies.")