# Database Metrics Analysis

This notebook demonstrates how to connect to Druid and analyze basic database metrics collected by Project Obsidian Core.

## Setup Connection to Druid

In [None]:
from datetime import datetime, timedelta, timezone

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sqlalchemy import create_engine, text

# Connect to Druid SQL. `engine` is reused by every query cell below.
druid_url = "druid://druid-broker:8082/druid/v2/sql/"
engine = create_engine(druid_url)

# Test connection with a trivial query.
# The statement is wrapped in text() because SQLAlchemy 2.x no longer accepts
# a plain string in Connection.execute(); with a bare string the check would
# report a connection error even when Druid itself is reachable.
try:
    with engine.connect() as conn:
        result = conn.execute(text("SELECT 1 AS test")).fetchone()
    print(f"Connected to Druid successfully: {result}")
except Exception as e:
    # Best-effort check: report the problem instead of aborting the notebook.
    print(f"Error connecting to Druid: {e}")

## List Available Metrics

In [None]:
# Enumerate every (metric name, database system) pair found in metrics_db,
# sorted by system first and metric name second.
query = """
SELECT DISTINCT "metric.name", "db.system"
FROM metrics_db
ORDER BY "db.system", "metric.name"
"""

try:
    # Load the result straight into a DataFrame and render it as a rich table.
    metrics_df = pd.read_sql(sql=query, con=engine)
    display(metrics_df)
except Exception as err:
    # Report the failure and keep the notebook running.
    print(f"Error fetching metrics: {err}")

## Analyze Time Series Metrics

### MySQL Connections Over Time

In [None]:
# Time range for the query (last hour).
# The bounds are taken in UTC: they are sent as TIMESTAMP literals, which Druid
# interprets in the SQL time zone (UTC unless sqlTimeZone is overridden), so a
# naive local-time value would shift the window on any non-UTC host.
end_time = datetime.now(timezone.utc)
start_time = end_time - timedelta(hours=1)
ts_format = '%Y-%m-%d %H:%M:%S'
start_ts = start_time.strftime(ts_format)
end_ts = end_time.strftime(ts_format)

# Query for MySQL connections: one row per sample, per instance, oldest first.
query = f"""
SELECT
  __time,
  "resource.instance.id",
  "metric.value"
FROM metrics_db
WHERE
  "db.system" = 'mysql' AND
  "metric.name" = 'mysql.connections' AND
  __time BETWEEN TIMESTAMP '{start_ts}' AND TIMESTAMP '{end_ts}'
ORDER BY __time
"""

try:
    connections_df = pd.read_sql(query, engine)

    if not connections_df.empty:
        # Create time series plot, one line per database instance.
        fig = px.line(
            connections_df,
            x="__time",
            y="metric.value",
            color="resource.instance.id",
            title="MySQL Active Connections",
            labels={
                "__time": "Time",
                "metric.value": "Connection Count",
                "resource.instance.id": "Instance"
            }
        )
        fig.update_layout(height=500)
        fig.show()
    else:
        print("No connection metrics found in the selected time range.")
except Exception as e:
    # Report the failure and keep the notebook running.
    print(f"Error fetching connection metrics: {e}")

### PostgreSQL Metrics: Buffer Hit Ratio

In [None]:
# Time range for the query (last 4 hours).
# The bounds are taken in UTC: they are sent as TIMESTAMP literals, which Druid
# interprets in the SQL time zone (UTC unless sqlTimeZone is overridden), so a
# naive local-time value would shift the window on any non-UTC host.
end_time = datetime.now(timezone.utc)
start_time = end_time - timedelta(hours=4)
ts_format = '%Y-%m-%d %H:%M:%S'
start_ts = start_time.strftime(ts_format)
end_ts = end_time.strftime(ts_format)

# Query for PostgreSQL buffer metrics.
# Both metrics are folded into one row per (minute, instance) by summing each
# metric's values into its own column.
query = f"""
SELECT
  time_floor(__time, 'PT1M') AS minute,
  "resource.instance.id",
  SUM(CASE WHEN "metric.name" = 'postgresql.blocks_hit' THEN "metric.value" ELSE 0 END) AS blocks_hit,
  SUM(CASE WHEN "metric.name" = 'postgresql.blocks_read' THEN "metric.value" ELSE 0 END) AS blocks_read
FROM metrics_db
WHERE
  "db.system" = 'postgresql' AND
  ("metric.name" = 'postgresql.blocks_hit' OR "metric.name" = 'postgresql.blocks_read') AND
  __time BETWEEN TIMESTAMP '{start_ts}' AND TIMESTAMP '{end_ts}'
GROUP BY 1, 2
ORDER BY 1, 2
"""

try:
    buffer_df = pd.read_sql(query, engine)

    if not buffer_df.empty:
        # Calculate buffer hit ratio = hits / (hits + reads).
        # Buckets where both sums are zero evaluate to NaN rather than raising.
        buffer_df['hit_ratio'] = buffer_df['blocks_hit'] / (buffer_df['blocks_hit'] + buffer_df['blocks_read'])

        # Create time series plot, one line per database instance.
        fig = px.line(
            buffer_df,
            x="minute",
            y="hit_ratio",
            color="resource.instance.id",
            title="PostgreSQL Buffer Hit Ratio",
            labels={
                "minute": "Time",
                "hit_ratio": "Buffer Hit Ratio",
                "resource.instance.id": "Instance"
            }
        )
        # The ratio is a fraction, so pin the y axis to [0, 1].
        fig.update_layout(height=500, yaxis_range=[0, 1])
        fig.show()
    else:
        print("No buffer metrics found in the selected time range.")
except Exception as e:
    # Report the failure and keep the notebook running.
    print(f"Error fetching buffer metrics: {e}")

## Compare Multiple Metrics

In [None]:
# Time range for the query (last 24 hours).
# The bounds are taken in UTC: they are sent as TIMESTAMP literals, which Druid
# interprets in the SQL time zone (UTC unless sqlTimeZone is overridden), so a
# naive local-time value would shift the window on any non-UTC host.
end_time = datetime.now(timezone.utc)
start_time = end_time - timedelta(hours=24)
ts_format = '%Y-%m-%d %H:%M:%S'
start_ts = start_time.strftime(ts_format)
end_ts = end_time.strftime(ts_format)

# Overview of instances that reported within the window.
# ORDER BY makes the LIMIT-ed sample deterministic between runs.
query = f"""
SELECT DISTINCT "resource.instance.id", "db.system"
FROM metrics_db
WHERE __time BETWEEN TIMESTAMP '{start_ts}' AND TIMESTAMP '{end_ts}'
ORDER BY "db.system", "resource.instance.id"
LIMIT 10
"""

try:
    instances_df = pd.read_sql(query, engine)
    display(instances_df)

    # Select the first MySQL instance for demonstration.
    # A dedicated query is used instead of filtering the 10-row overview above,
    # because that sample may contain no MySQL rows even when MySQL data exists.
    query = f"""
    SELECT DISTINCT "resource.instance.id"
    FROM metrics_db
    WHERE
      "db.system" = 'mysql' AND
      __time BETWEEN TIMESTAMP '{start_ts}' AND TIMESTAMP '{end_ts}'
    ORDER BY "resource.instance.id"
    LIMIT 1
    """
    mysql_instances = pd.read_sql(query, engine)['resource.instance.id'].tolist()

    if mysql_instances:
        instance_id = mysql_instances[0]
        # The id is embedded in a SQL string literal below; double any single
        # quote so an unusual id cannot break (or alter) the statement.
        instance_literal = str(instance_id).replace("'", "''")

        # Query for multiple MySQL metrics, averaged over 5-minute buckets.
        query = f"""
        SELECT
          time_floor(__time, 'PT5M') AS time_bucket,
          "metric.name",
          AVG("metric.value") AS avg_value
        FROM metrics_db
        WHERE
          "db.system" = 'mysql' AND
          "resource.instance.id" = '{instance_literal}' AND
          "metric.name" IN ('mysql.connections', 'mysql.threads_running', 'mysql.questions') AND
          __time BETWEEN TIMESTAMP '{start_ts}' AND TIMESTAMP '{end_ts}'
        GROUP BY 1, 2
        ORDER BY 1, 2
        """

        metrics_df = pd.read_sql(query, engine)

        if not metrics_df.empty:
            # Pivot data for plotting: one row per time bucket, one column per metric.
            pivot_df = metrics_df.pivot(index='time_bucket', columns='metric.name', values='avg_value')
            pivot_df = pivot_df.reset_index()

            # Draw all metrics on one shared figure, one trace per metric.
            fig = go.Figure()

            # Column 0 is time_bucket after reset_index(); the rest are metrics.
            for column in pivot_df.columns[1:]:
                fig.add_trace(go.Scatter(
                    x=pivot_df['time_bucket'],
                    y=pivot_df[column],
                    mode='lines',
                    name=column
                ))

            fig.update_layout(
                title=f"MySQL Metrics for {instance_id}",
                xaxis_title="Time",
                yaxis_title="Value",
                height=600,
                legend_title="Metric"
            )
            fig.show()
        else:
            print("No metrics found for the selected instance in the time range.")
    else:
        print("No MySQL instances found in the metrics data.")
except Exception as e:
    # Report the failure and keep the notebook running.
    print(f"Error fetching or visualizing metrics: {e}")