In [None]:
#!/usr/bin/env python3
"""
Kubernetes Cluster Monitor with Graphing - For JupyterHub Stress Testing
Monitors CPU and memory usage and generates before/during/after graphs
"""

import subprocess
import time
from datetime import datetime
import sys
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from collections import defaultdict

# ============================================================================
# CONFIGURATION
# ============================================================================
# Seconds between updates; also echoed in the startup banner.
REFRESH_INTERVAL = 5
# Namespace to report on when SHOW_ALL_NAMESPACES is False.
# Change to your JupyterHub namespace (e.g., "jupyterhub").
NAMESPACE = "default"
# True: the banner reports "ALL"; set to False to only show NAMESPACE above.
# NOTE(review): presumably this also selects which pods the monitoring loop
# queries - the loop is not fully visible here, confirm against main().
SHOW_ALL_NAMESPACES = True
# Path the generated report figure is written to (passed to plt.savefig).
OUTPUT_FILE = "k8s_stress_test_report.png"

# In-memory sample history, filled in while monitoring and read back when graphing
def _new_series():
    """Return a fresh record with empty 'timestamps', 'cpu' and 'memory' lists."""
    return {field: [] for field in ('timestamps', 'cpu', 'memory')}


# Keyed by node name; a missing key gets its own empty series on first access
node_history = defaultdict(_new_series)
# Keyed by namespace; a missing key gets its own empty series on first access
pod_history = defaultdict(_new_series)
# Cluster-wide series, including the number of pods seen per sample
cluster_totals = {field: [] for field in ('timestamps', 'cpu', 'memory', 'pod_count')}

def run_kubectl(cmd):
    """Run a kubectl command and return its standard output.

    Args:
        cmd: The command line to execute. A string is split on whitespace
            (no shell is involved, so quoting is not interpreted); a
            list/tuple of arguments is used as given, which allows
            arguments that contain spaces.

    Returns:
        The captured stdout text on success, or None when the command is
        empty, the executable cannot be started (for example kubectl is not
        installed or not on PATH), or it exits with a non-zero status.
    """
    argv = cmd.split() if isinstance(cmd, str) else list(cmd)
    if not argv:
        print("Error running kubectl: empty command")
        return None

    try:
        result = subprocess.run(
            argv,
            capture_output=True,
            text=True,
            check=True
        )
    except subprocess.CalledProcessError as e:
        # kubectl explains failures on stderr; surface it instead of dropping it
        detail = (e.stderr or "").strip()
        print(f"Error running kubectl: {e}")
        if detail:
            print(detail)
        return None
    except OSError as e:
        # Covers FileNotFoundError / PermissionError raised when the binary
        # itself cannot be launched, which would otherwise crash the monitor
        print(f"Error running kubectl: {e}")
        return None
    return result.stdout

def parse_resource(value):
    """Parse a Kubernetes resource string (e.g., '100m', '1Gi', '500Mi').

    Conversions performed:
        * '<n>m'  -> n / 1000 (millicores to cores)
        * '<n>Ki' -> n / 1024 / 1024
        * '<n>Mi' -> n / 1024
        * '<n>Gi' -> n
        * '<n>Ti' -> n * 1024
        * a bare number is returned as a float unchanged

    Args:
        value: The text to parse; may be None or empty.

    Returns:
        The converted number, or 0 when the value is missing, is the
        '<unknown>' placeholder, or does not contain a valid number.
    """
    if not value:
        return 0

    text = value.strip()
    if not text or text == '<unknown>':
        return 0

    try:
        # Handle CPU (millicores)
        if text.endswith('m'):
            return float(text[:-1]) / 1000

        # Handle memory (binary suffixes, normalised to the Gi scale)
        if text.endswith('Ki'):
            return float(text[:-2]) / 1024 / 1024
        if text.endswith('Mi'):
            return float(text[:-2]) / 1024
        if text.endswith('Gi'):
            return float(text[:-2])
        if text.endswith('Ti'):
            return float(text[:-2]) * 1024

        # Assume it's already in base unit
        return float(text)
    except ValueError:
        # A malformed numeric part (with or without a suffix) counts as
        # "no reading" rather than aborting the caller
        return 0

def get_node_metrics():
    """Collect per-node usage from `kubectl top nodes --no-headers`.

    Returns:
        A list with one dict per output row that has at least five
        whitespace-separated columns. Keys: 'name', 'cpu_cores' and
        'memory_gb' (columns 2 and 4 run through parse_resource), plus
        'cpu_percent' and 'memory_percent' (columns 3 and 5 kept as text).
        Returns an empty list when the command yields no output.
    """
    raw = run_kubectl("kubectl top nodes --no-headers")
    if not raw:
        return []

    rows = (text.split() for text in raw.strip().split('\n') if text)
    return [
        {
            'name': cols[0],
            'cpu_cores': parse_resource(cols[1]),
            'cpu_percent': cols[2],
            'memory_gb': parse_resource(cols[3]),
            'memory_percent': cols[4],
        }
        for cols in rows
        if len(cols) >= 5  # rows with fewer columns are ignored
    ]

def get_pod_metrics(namespace=None):
    """Collect per-pod usage from `kubectl top pods --no-headers`.

    Args:
        namespace: Namespace to query with `-n`. When falsy, every namespace
            is queried with `--all-namespaces`, in which case each output
            row carries the namespace as an extra leading column.

    Returns:
        A list of dicts with keys 'namespace', 'name', 'cpu_cores' and
        'memory_gb' (the last two run through parse_resource). Rows with too
        few columns are ignored; an empty list is returned when the command
        yields no output.
    """
    if namespace:
        command = f"kubectl top pods -n {namespace} --no-headers"
        shift = 0  # columns: name, cpu, memory
    else:
        command = "kubectl top pods --all-namespaces --no-headers"
        shift = 1  # columns: namespace, name, cpu, memory

    raw = run_kubectl(command)
    if not raw:
        return []

    collected = []
    for row in raw.strip().split('\n'):
        if not row:
            continue
        cols = row.split()
        if len(cols) < 3 + shift:
            continue
        collected.append({
            'namespace': cols[0] if shift else namespace,
            'name': cols[shift],
            'cpu_cores': parse_resource(cols[shift + 1]),
            'memory_gb': parse_resource(cols[shift + 2]),
        })
    return collected

def record_metrics(timestamp, nodes, pods):
    """Append one sample to the module-level history used for graphing.

    Args:
        timestamp: Value stored alongside every series point of this sample.
        nodes: Node dicts providing 'name', 'cpu_cores' and 'memory_gb'.
        pods: Pod dicts providing 'namespace', 'cpu_cores' and 'memory_gb'.

    Side effects:
        Extends node_history (per node), pod_history (pod usage summed per
        namespace) and cluster_totals (node sums plus the pod count).
    """
    # One point per node
    for entry in nodes:
        series = node_history[entry['name']]
        series['timestamps'].append(timestamp)
        series['cpu'].append(entry['cpu_cores'])
        series['memory'].append(entry['memory_gb'])

    # Add up pod usage per namespace, in first-seen namespace order
    usage_by_ns = {}
    for pod in pods:
        bucket = usage_by_ns.setdefault(pod['namespace'], {'cpu': 0, 'memory': 0})
        bucket['cpu'] += pod['cpu_cores']
        bucket['memory'] += pod['memory_gb']

    for ns_name, usage in usage_by_ns.items():
        series = pod_history[ns_name]
        series['timestamps'].append(timestamp)
        series['cpu'].append(usage['cpu'])
        series['memory'].append(usage['memory'])

    # Cluster-wide figures come from the node readings, not the pod readings
    cpu_sum = sum([entry['cpu_cores'] for entry in nodes])
    memory_sum = sum([entry['memory_gb'] for entry in nodes])
    for field, sample in (
        ('timestamps', timestamp),
        ('cpu', cpu_sum),
        ('memory', memory_sum),
        ('pod_count', len(pods)),
    ):
        cluster_totals[field].append(sample)

def _style_time_axis(ax, title, ylabel, xlabel=None):
    """Apply the shared labels, title, grid and HH:MM:SS x-tick styling to a panel."""
    if xlabel is not None:
        ax.set_xlabel(xlabel, fontsize=11, fontweight='bold')
    ax.set_ylabel(ylabel, fontsize=11, fontweight='bold')
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.grid(True, alpha=0.3)
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M:%S'))
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=45, ha='right')


def _plot_filled_series(ax, timestamps, values, color):
    """Draw one series as a marked line with a translucent fill underneath."""
    ax.plot(timestamps, values, linewidth=2, color=color, marker='o', markersize=3)
    ax.fill_between(timestamps, values, alpha=0.3, color=color)


def _annotate_stats(ax, values):
    """Add a dashed average line (listed in the legend) and a Max/Min text box."""
    average = sum(values) / len(values)
    ax.axhline(y=average, color='red', linestyle='--', alpha=0.7,
               label=f'Avg: {average:.2f}')
    ax.legend(loc='upper left', fontsize=9)
    # The legend occupies the top-left corner, so the box is anchored to the
    # top-right corner to keep the two from overlapping
    ax.text(0.98, 0.98, f'Max: {max(values):.2f}\nMin: {min(values):.2f}',
            transform=ax.transAxes, verticalalignment='top',
            horizontalalignment='right',
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5),
            fontsize=9)


def _plot_labelled_series(ax, labelled_series, metric):
    """Plot one line per (label, data) pair; add a legend only if any line was drawn."""
    drew_any = False
    for label, data in labelled_series:
        ax.plot(data['timestamps'], data[metric],
                linewidth=2, marker='o', markersize=2, label=label, alpha=0.8)
        drew_any = True
    if drew_any:
        # Calling legend() with no labelled artists only produces a warning
        ax.legend(loc='upper left', fontsize=9)


def generate_graphs():
    """Render the collected history as a 3x2 grid of charts and save it.

    Panels: total cluster CPU, total cluster memory, CPU per node, memory
    per node, the five namespaces with the highest average CPU, and the pod
    count over time. The figure is written to OUTPUT_FILE at 300 dpi.

    Prints a notice and returns without creating a file when cluster_totals
    holds no samples. The figure is closed even if rendering or saving
    raises.
    """
    timestamps = cluster_totals['timestamps']
    if not timestamps:
        print("No data collected for graphing")
        return

    fig = plt.figure(figsize=(16, 12))
    try:
        # Title with test duration
        start_time = timestamps[0]
        end_time = timestamps[-1]
        duration = (end_time - start_time).total_seconds() / 60
        fig.suptitle(f'Kubernetes Cluster Stress Test Report\n'
                     f'Duration: {duration:.1f} minutes | '
                     f'Start: {start_time.strftime("%H:%M:%S")} | '
                     f'End: {end_time.strftime("%H:%M:%S")}',
                     fontsize=16, fontweight='bold')

        # 1. Cluster Total CPU Usage
        ax1 = plt.subplot(3, 2, 1)
        _plot_filled_series(ax1, timestamps, cluster_totals['cpu'], '#2E86AB')
        _style_time_axis(ax1, 'Total Cluster CPU Usage', 'CPU Cores')
        _annotate_stats(ax1, cluster_totals['cpu'])

        # 2. Cluster Total Memory Usage
        ax2 = plt.subplot(3, 2, 2)
        _plot_filled_series(ax2, timestamps, cluster_totals['memory'], '#A23B72')
        _style_time_axis(ax2, 'Total Cluster Memory Usage', 'Memory (GB)')
        _annotate_stats(ax2, cluster_totals['memory'])

        # 3. Per-Node CPU Usage
        ax3 = plt.subplot(3, 2, 3)
        _plot_labelled_series(ax3, node_history.items(), 'cpu')
        _style_time_axis(ax3, 'CPU Usage by Node', 'CPU Cores')

        # 4. Per-Node Memory Usage
        ax4 = plt.subplot(3, 2, 4)
        _plot_labelled_series(ax4, node_history.items(), 'memory')
        _style_time_axis(ax4, 'Memory Usage by Node', 'Memory (GB)')

        # 5. Top Namespace CPU Usage (top 5 namespaces by average CPU)
        ax5 = plt.subplot(3, 2, 5)
        ns_avg_cpu = {ns: sum(data['cpu']) / len(data['cpu'])
                      for ns, data in pod_history.items() if data['cpu']}
        top_ns = sorted(ns_avg_cpu.items(), key=lambda item: item[1], reverse=True)[:5]
        _plot_labelled_series(ax5, [(ns, pod_history[ns]) for ns, _ in top_ns], 'cpu')
        _style_time_axis(ax5, 'Top 5 Namespaces by CPU', 'CPU Cores', xlabel='Time')

        # 6. Pod Count Over Time
        ax6 = plt.subplot(3, 2, 6)
        _plot_filled_series(ax6, timestamps, cluster_totals['pod_count'], '#F18F01')
        _style_time_axis(ax6, 'Active Pods Over Time', 'Pod Count', xlabel='Time')

        plt.tight_layout()
        plt.savefig(OUTPUT_FILE, dpi=300, bbox_inches='tight')
        print(f"\n✅ Graphs saved to: {OUTPUT_FILE}")
    finally:
        # Release the figure even when rendering or saving fails
        plt.close(fig)

def print_header():
    """Print the startup banner: title, refresh interval, namespace scope and output file."""
    rule = "=" * 80
    scope = 'ALL' if SHOW_ALL_NAMESPACES else NAMESPACE
    banner = (
        "\n" + rule,
        "KUBERNETES CLUSTER MONITOR - JUPYTERHUB STRESS TEST",
        rule,
        f"Refresh interval: {REFRESH_INTERVAL} seconds",
        f"Monitoring namespace: {scope}",
        f"Graph output: {OUTPUT_FILE}",
        "Press Ctrl+C to stop monitoring and generate graphs",
        rule + "\n",
    )
    for text in banner:
        print(text)

def print_nodes(nodes):
    """Print a table of node metrics followed by a TOTAL row.

    Args:
        nodes: Node dicts providing 'name', 'cpu_cores', 'cpu_percent',
            'memory_gb' and 'memory_percent'. When empty or None, a hint
            about metrics-server is printed instead of the table.
    """
    if not nodes:
        print("⚠ No node metrics available. Is metrics-server installed?")
        return

    divider = "-" * 80

    print("\n📊 NODE METRICS")
    print(divider)
    print(f"{'NODE':<30} {'CPU (cores)':<15} {'CPU %':<10} {'MEMORY (GB)':<15} {'MEM %':<10}")
    print(divider)

    for entry in nodes:
        print(f"{entry['name']:<30} {entry['cpu_cores']:<15.2f} {entry['cpu_percent']:<10} "
              f"{entry['memory_gb']:<15.2f} {entry['memory_percent']:<10}")

    # Summary row: the percentage columns have no meaningful total
    cpu_sum = sum([entry['cpu_cores'] for entry in nodes])
    mem_sum = sum([entry['memory_gb'] for entry in nodes])
    print(divider)
    print(f"{'TOTAL':<30} {cpu_sum:<15.2f} {'':<10} {mem_sum:<15.2f}")

def print_pods(pods, top_n=10):
    """Print a table of the pods with the highest CPU usage.

    Args:
        pods: Pod dicts providing 'namespace', 'name', 'cpu_cores' and
            'memory_gb'. When empty or None, a notice is printed instead.
        top_n: Maximum number of rows to print (also shown in the heading).
    """
    if not pods:
        print("\n⚠ No pod metrics available")
        return

    divider = "-" * 80

    # Highest CPU first; pods with equal CPU keep their input order
    ranked = sorted(pods, key=lambda item: item['cpu_cores'], reverse=True)

    print(f"\n🔥 TOP {top_n} PODS BY CPU")
    print(divider)
    print(f"{'NAMESPACE':<20} {'POD NAME':<35} {'CPU (cores)':<15} {'MEMORY (GB)':<15}")
    print(divider)

    for item in ranked[:top_n]:
        # Names are cut to 34 characters so they fit the 35-wide column
        print(f"{item['namespace']:<20} {item['name'][:34]:<35} "
              f"{item['cpu_cores']:<15.3f} {item['memory_gb']:<15.2f}")

def check_metrics_server():
    """Report whether the metrics-server deployment can be fetched from kube-system.

    Returns:
        True when the `kubectl get deployment` call returns any output value
        (anything other than None); False, after printing installation
        hints, when run_kubectl returns None.
    """
    found = run_kubectl("kubectl get deployment metrics-server -n kube-system")
    if found is not None:
        return True

    hints = (
        "\n⚠️  WARNING: metrics-server not found!",
        "To install metrics-server, run:",
        "kubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml",
        "\nFor Minikube/local clusters, you may need to enable it:",
        "minikube addons enable metrics-server",
    )
    for text in hints:
        print(text)
    return False

def main():
    """Main monitoring loop"""
    print_header()
    
    # Check for metrics-server
    if not check_metrics_server():
        print("\nExiting...")
        sys.exit(1)
    
    print("Waiting for metrics to be available...")
    time.sleep(3)
    
    try:
        iteration = 0
        while True:
            iteration += 1
            timestamp = datetime.now()
            
            # Clear screen (optional - comment out if you want to keep history)
            print("\033[2J\033[H")  # ANSI escape codes to clear screen
            
            print