# Data Engineering Workspace Package Validation

This notebook checks that the packages expected in your data engineering workspace are installed, displays their versions, and runs a few quick functionality tests.

In [None]:
import sys
import importlib
import subprocess
from IPython.display import display, HTML
import pandas as pd

def check_package(package_name, import_name=None, version_attr='__version__'):
    """Check whether a package can be imported and report its version.

    Args:
        package_name: Distribution name (e.g. ``'kafka-python'``). Used as the
            import name when ``import_name`` is omitted, and as the key for the
            installed-distribution metadata lookup when the module does not
            expose ``version_attr``.
        import_name: Module name handed to ``importlib.import_module``.
            Defaults to ``package_name``.
        version_attr: Name of the module attribute holding the version.

    Returns:
        A dict with two keys:
        ``'status'`` is '✅ Installed', '❌ Not Found' (import raised
        ``ImportError``) or '⚠️ Error' (import raised anything else);
        ``'version'`` is the detected version, 'Unknown' when the module
        imported but no version could be determined, 'N/A' when not found,
        or the exception text on error.
    """
    if import_name is None:
        import_name = package_name

    try:
        module = importlib.import_module(import_name)
        version = getattr(module, version_attr, None)
    except ImportError:
        return {'status': '❌ Not Found', 'version': 'N/A'}
    except Exception as e:
        # Importing can execute arbitrary package code; report rather than crash.
        return {'status': '⚠️ Error', 'version': str(e)}

    if version is None:
        # Not every module exposes a version attribute; fall back to the
        # metadata of the installed distribution named ``package_name``.
        try:
            # Imported here because a plain ``import importlib`` does not
            # guarantee that the ``metadata`` submodule is loaded.
            from importlib import metadata
            version = metadata.version(package_name) or 'Unknown'
        except (ImportError, ValueError):
            # ImportError covers both a missing importlib.metadata and
            # PackageNotFoundError (a ModuleNotFoundError subclass);
            # ValueError covers an empty/invalid distribution name.
            version = 'Unknown'

    return {'status': '✅ Installed', 'version': version}

def check_package_pip(package_name):
    """Return the installed version of a distribution as reported by ``pip show``.

    pip is invoked as ``<current interpreter> -m pip`` so the answer reflects
    the environment this code is running in, not whichever ``pip`` executable
    happens to be first on PATH.

    Args:
        package_name: Distribution name to pass to ``pip show``.

    Returns:
        The version string from pip's ``Version:`` line, 'Not Found' when pip
        exits non-zero or prints no version line, or 'Error' when pip could
        not be run (including the 10 second timeout).
    """
    # sys.executable can be empty/None in embedded interpreters; fall back to PATH.
    pip_command = [sys.executable, '-m', 'pip'] if sys.executable else ['pip']

    try:
        result = subprocess.run(
            pip_command + ['show', package_name],
            capture_output=True, text=True, timeout=10,
        )
    except Exception:
        # Best-effort helper: any failure to launch or complete pip is
        # reported as 'Error', but KeyboardInterrupt/SystemExit propagate.
        return 'Error'

    if result.returncode != 0:
        return 'Not Found'

    for line in result.stdout.splitlines():
        if line.startswith('Version:'):
            return line.split(':', 1)[1].strip()
    return 'Not Found'

# Announce the start of validation; the trailing "\n" leaves a blank line before later output.
print("🔍 Validating Data Engineering Workspace Packages...\n")

## Core Data Processing Packages

In [None]:
# (distribution name, import name) pairs for the core data processing stack
core_packages = [
    ('boto3', 'boto3'),
    ('duckdb', 'duckdb'),
    ('polars', 'polars'),
    ('pyarrow', 'pyarrow'),
]

# One row per package; kept as a list of dicts so the summary cell can reuse it
core_results = []
for pkg_name, import_name in core_packages:
    result = check_package(pkg_name, import_name)
    core_results.append({
        'Package': pkg_name,
        'Status': result['status'],
        'Version': result['version']
    })

core_df = pd.DataFrame(core_results)
# escape=True: on an import error the Version column holds the exception text,
# which must be shown literally rather than interpreted as HTML markup.
display(HTML(core_df.to_html(index=False, escape=True)))

## Data Lake and Streaming Packages

In [None]:
# (distribution name, import name) pairs for data lake and streaming clients;
# the two differ where the PyPI name is not the importable module name
data_lake_packages = [
    ('deltalake', 'deltalake'),
    ('kafka-python', 'kafka'),
    ('minio', 'minio'),
    ('lakefs-client', 'lakefs_client'),
]

# One row per package; kept as a list of dicts so the summary cell can reuse it
data_lake_results = []
for pkg_name, import_name in data_lake_packages:
    result = check_package(pkg_name, import_name)
    data_lake_results.append({
        'Package': pkg_name,
        'Status': result['status'],
        'Version': result['version']
    })

data_lake_df = pd.DataFrame(data_lake_results)
# escape=True: on an import error the Version column holds the exception text,
# which must be shown literally rather than interpreted as HTML markup.
display(HTML(data_lake_df.to_html(index=False, escape=True)))

## ML and AI Frameworks

In [None]:
# (distribution name, import name) pairs for the ML and AI frameworks
ml_packages = [
    ('torch', 'torch'),
    ('transformers', 'transformers'),
    ('datasets', 'datasets'),
    ('accelerate', 'accelerate'),
    ('mlflow', 'mlflow'),
]

# One row per package; kept as a list of dicts so the summary cell can reuse it
ml_results = []
for pkg_name, import_name in ml_packages:
    result = check_package(pkg_name, import_name)
    ml_results.append({
        'Package': pkg_name,
        'Status': result['status'],
        'Version': result['version']
    })

ml_df = pd.DataFrame(ml_results)
# escape=True: on an import error the Version column holds the exception text,
# which must be shown literally rather than interpreted as HTML markup.
display(HTML(ml_df.to_html(index=False, escape=True)))

## Distributed Computing and Orchestration

In [None]:
# (distribution name, import name) pairs for distributed computing and
# orchestration; the two differ where the PyPI name is not the module name
distributed_packages = [
    ('ray', 'ray'),
    ('apache-airflow', 'airflow'),
    ('great-expectations', 'great_expectations'),
]

# One row per package; kept as a list of dicts so the summary cell can reuse it
distributed_results = []
for pkg_name, import_name in distributed_packages:
    result = check_package(pkg_name, import_name)
    distributed_results.append({
        'Package': pkg_name,
        'Status': result['status'],
        'Version': result['version']
    })

distributed_df = pd.DataFrame(distributed_results)
# escape=True: on an import error the Version column holds the exception text,
# which must be shown literally rather than interpreted as HTML markup.
display(HTML(distributed_df.to_html(index=False, escape=True)))

## Model Serving and Vector Database

In [None]:
# (distribution name, import name) pairs checked under the
# "Model Serving and Vector Database" heading
serving_packages = [
    ('pymilvus', 'pymilvus'),
    ('bentoml', 'bentoml'),
    ('kfp', 'kfp'),
    ('kubernetes', 'kubernetes'),
]

# One row per package; kept as a list of dicts so the summary cell can reuse it
serving_results = []
for pkg_name, import_name in serving_packages:
    result = check_package(pkg_name, import_name)
    serving_results.append({
        'Package': pkg_name,
        'Status': result['status'],
        'Version': result['version']
    })

serving_df = pd.DataFrame(serving_results)
# escape=True: on an import error the Version column holds the exception text,
# which must be shown literally rather than interpreted as HTML markup.
display(HTML(serving_df.to_html(index=False, escape=True)))

## Jupyter Environment

In [None]:
# (distribution name, import name) pairs for the Jupyter environment
jupyter_packages = [
    ('ipywidgets', 'ipywidgets'),
    ('jupyterlab', 'jupyterlab'),
]

# One row per package; kept as a list of dicts so the summary cell can reuse it
jupyter_results = []
for pkg_name, import_name in jupyter_packages:
    result = check_package(pkg_name, import_name)
    jupyter_results.append({
        'Package': pkg_name,
        'Status': result['status'],
        'Version': result['version']
    })

jupyter_df = pd.DataFrame(jupyter_results)
# escape=True: on an import error the Version column holds the exception text,
# which must be shown literally rather than interpreted as HTML markup.
display(HTML(jupyter_df.to_html(index=False, escape=True)))

## Summary Report

In [None]:
# Flatten the per-category result lists into a single list of rows
all_results = [
    *core_results,
    *data_lake_results,
    *ml_results,
    *distributed_results,
    *serving_results,
    *jupyter_results,
]

# Tally outcomes by the emoji marker embedded in each row's status text
total_count = len(all_results)
installed_count = len([row for row in all_results if '✅' in row['Status']])
not_found_count = len([row for row in all_results if '❌' in row['Status']])
error_count = len([row for row in all_results if '⚠️' in row['Status']])

print("📊 Package Installation Summary:")
print(f"✅ Successfully Installed: {installed_count}/{total_count}")
print(f"❌ Not Found: {not_found_count}/{total_count}")
print(f"⚠️ Errors: {error_count}/{total_count}")
print(f"\n📈 Success Rate: {(installed_count/total_count)*100:.1f}%")

# Closing verdict: celebrate only when nothing is missing and nothing errored
if not_found_count == 0 and error_count == 0:
    print("\n🎉 All packages are successfully installed!")
else:
    print("\n⚠️ Issues found with some packages. Check the tables above for details.")

## System Information

In [None]:
import platform

# Basic facts about the interpreter and host this notebook is running on
print("🖥️ System Information:")
print(f"Python Version: {sys.version}")
print(f"Platform: {platform.platform()}")
print(f"Architecture: {platform.architecture()}")
print(f"Processor: {platform.processor()}")

# Detect whether an IPython shell is active. get_ipython() returns None when
# IPython is installed but no shell is running, so that case must be reported
# as standard Python too, not only the case where IPython cannot be imported.
try:
    from IPython import get_ipython
    environment = "Jupyter Notebook/Lab" if get_ipython() is not None else "Standard Python"
except ImportError:
    environment = "Standard Python"
print(f"Environment: {environment}")

## Quick Package Tests

In [None]:
print("🧪 Running quick functionality tests...\n")


def _failure_text(exc, limit=50):
    """Format an exception as a short '❌ ...' table cell.

    The message is cut to ``limit`` characters, and the ellipsis is added only
    when something was actually cut. An exception with an empty message is
    shown by its class name so the cell is never blank.
    """
    message = str(exc) or type(exc).__name__
    if len(message) > limit:
        message = message[:limit] + '...'
    return f'❌ {message}'


# (package label, outcome text) rows, one per smoke test
tests = []

# Test DuckDB: run a trivial in-process SQL query
try:
    import duckdb
    duckdb.sql("SELECT 'DuckDB working!' as test").fetchone()
    tests.append(('DuckDB', '✅ SQL query successful'))
except Exception as e:
    tests.append(('DuckDB', _failure_text(e)))

# Test Polars: build a small DataFrame
try:
    import polars as pl
    pl.DataFrame({'test': [1, 2, 3]})
    tests.append(('Polars', '✅ DataFrame creation successful'))
except Exception as e:
    tests.append(('Polars', _failure_text(e)))

# Test PyTorch: build a small tensor
try:
    import torch
    torch.tensor([1, 2, 3])
    tests.append(('PyTorch', '✅ Tensor creation successful'))
except Exception as e:
    tests.append(('PyTorch', _failure_text(e)))

# Test Ray: import only, no runtime is started
try:
    import ray
    tests.append(('Ray', '✅ Import successful'))
except Exception as e:
    tests.append(('Ray', _failure_text(e)))

# Display test results. escape=True so exception text (which may contain
# characters such as '<' and '>') is shown literally instead of parsed as HTML.
test_df = pd.DataFrame(tests, columns=['Package', 'Test Result'])
display(HTML(test_df.to_html(index=False, escape=True)))

print("\n✨ Package validation complete!")