# Query Analytics (QAN) Analysis Example

This notebook demonstrates how to analyze database query performance data collected by Project Obsidian Core.

In [None]:
# Import required libraries
import json
from datetime import datetime, timedelta, timezone

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydruid
import requests
from pydruid.client import *
from pydruid.utils.aggregators import *
from pydruid.utils.filters import Dimension

# Configure matplotlib
# Render figures inline in the notebook output (IPython magic, not Python syntax).
%matplotlib inline
# Apply the built-in 'ggplot' style sheet to all subsequent figures.
plt.style.use('ggplot')
# Default figure size as (width, height) in inches; assigned after style.use()
# so this value is the one in effect regardless of what the style sheet sets.
plt.rcParams['figure.figsize'] = (14, 8)

## Connect to Druid

In [None]:
# Configuration
DRUID_HOST = 'druid-router'
DRUID_PORT = 8888
DRUID_URL = f'http://{DRUID_HOST}:{DRUID_PORT}'
DRUID_ENDPOINT = f'{DRUID_URL}/druid/v2/'
# Druid serves its SQL API one level below the native query endpoint.
DRUID_SQL_ENDPOINT = f'{DRUID_ENDPOINT}sql'
# Upper bound (seconds) for a single HTTP request so a stuck query cannot hang the kernel.
DRUID_TIMEOUT_SEC = 60


class DruidSQLClient:
    """Minimal client for Druid's SQL HTTP API.

    pydruid's ``PyDruid`` class issues native JSON queries (topn, groupby, ...)
    and does not provide a ``sql()`` method, so the SQL cells in this notebook
    go through this wrapper instead.

    Parameters
    ----------
    sql_endpoint : str
        Full URL of the SQL endpoint, e.g. ``http://host:8888/druid/v2/sql``.
    timeout : float
        Per-request timeout in seconds.
    """

    def __init__(self, sql_endpoint, timeout=DRUID_TIMEOUT_SEC):
        self.sql_endpoint = sql_endpoint
        self.timeout = timeout

    def sql(self, query):
        """Run ``query`` and return the rows as a list of dicts (one per row).

        Raises
        ------
        RuntimeError
            If Druid answers with a non-2xx status; the response body is
            included because it carries Druid's own error message.
        requests.RequestException
            On connection failures or timeouts.
        """
        response = requests.post(
            self.sql_endpoint,
            json={'query': query},
            timeout=self.timeout,
        )
        if not response.ok:
            raise RuntimeError(
                f"Druid SQL query failed (HTTP {response.status_code}): {response.text}"
            )
        return response.json()


# Initialize Druid client
client = DruidSQLClient(DRUID_SQL_ENDPOINT)

# Test connection
# Druid SQL has no SHOW TABLES statement; datasources are listed through
# INFORMATION_SCHEMA, where they live in the 'druid' schema.
try:
    table_rows = client.sql(
        "SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = 'druid'"
    )
    tables = [row['TABLE_NAME'] for row in table_rows]
    print("Connected to Druid successfully!")
    print(f"Available tables: {tables}")
except Exception as e:
    print(f"Failed to connect to Druid: {e}")

## Define Time Range

In [None]:
# Define time range for analysis (last hour by default).
# The window is anchored on the current time in UTC: Druid interprets SQL
# TIMESTAMP literals in the session time zone, which is UTC unless overridden,
# so a naive local-time "now" would shift the window on non-UTC kernels.
end_time = datetime.now(timezone.utc)
start_time = end_time - timedelta(hours=1)

# Format for Druid SQL (second precision, no zone suffix)
DRUID_TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S"
start_time_str = start_time.strftime(DRUID_TIMESTAMP_FORMAT)
end_time_str = end_time.strftime(DRUID_TIMESTAMP_FORMAT)

print(f"Analyzing data from {start_time_str} to {end_time_str} (UTC)")

## Top 10 Queries by Execution Time (MySQL)

In [None]:
# Query MySQL top 10 queries by execution time.
#
# Notes on the SQL:
# * Column names contain dots, so each one is double-quoted; unquoted,
#   `db.system` would be parsed as column `system` of a table named `db`.
# * Decimal literals (1000000.0) force floating-point division so sub-unit
#   values are not truncated when the delta columns are integers.
# * NULLIF(..., 0) turns a zero denominator into NULL instead of an error.
# * The divisors assume total_timer_wait is reported in nanoseconds
#   (ns / 1e9 = s, ns / 1e6 = ms) — TODO confirm against the collector.
mysql_query = f"""
SELECT
  "db.statement.sample" AS query,
  SUM("db.query.calls.delta") AS execution_count,
  SUM("db.query.total_timer_wait.delta") / 1000000000.0 AS total_time_sec,
  SUM("db.query.total_timer_wait.delta") / 1000000.0 / NULLIF(SUM("db.query.calls.delta"), 0) AS avg_time_ms,
  SUM("db.query.rows_examined.delta") AS rows_examined,
  SUM("db.query.rows_sent.delta") AS rows_sent,
  SUM("db.query.rows_examined.delta") * 1.0 / NULLIF(SUM("db.query.rows_sent.delta"), 0) AS rows_examined_sent_ratio
FROM qan_db
WHERE "__time" BETWEEN TIMESTAMP '{start_time_str}' AND TIMESTAMP '{end_time_str}'
  AND "db.system" = 'mysql'
GROUP BY "db.statement.sample"
ORDER BY total_time_sec DESC
LIMIT 10
"""

try:
    result = client.sql(mysql_query)
    mysql_top_queries = pd.DataFrame(result)

    if not mysql_top_queries.empty:
        # Display results
        print("Top 10 MySQL Queries by Execution Time:")
        display(mysql_top_queries)

        # Plot results.
        # Bars are positioned by row index rather than by label: two queries
        # sharing the same first 50 characters would otherwise collapse into
        # a single categorical bar.
        bar_positions = list(range(len(mysql_top_queries)))
        bar_labels = mysql_top_queries['query'].fillna('').astype(str).str[:50]
        # NULL averages (no executions) are drawn as zero-length bars.
        bar_values = pd.to_numeric(mysql_top_queries['avg_time_ms'], errors='coerce').fillna(0)

        fig, ax = plt.subplots(figsize=(14, 8))
        ax.barh(bar_positions, bar_values)
        ax.set_yticks(bar_positions)
        ax.set_yticklabels(bar_labels)
        ax.set_xlabel('Average Execution Time (ms)')
        ax.set_ylabel('Query')
        ax.set_title('Top 10 MySQL Queries by Avg. Execution Time')
        fig.tight_layout()
        plt.show()
    else:
        print("No MySQL data found for the specified time range")
except Exception as e:
    # Best-effort cell: report the failure and let the rest of the notebook run.
    print(f"Error querying MySQL data: {e}")

## Top 10 Queries by Execution Time (PostgreSQL)

In [None]:
# Query PostgreSQL top 10 queries by execution time.
#
# Notes on the SQL:
# * Column names contain dots, so each one is double-quoted; unquoted,
#   `db.system` would be parsed as column `system` of a table named `db`.
# * `* 1.0` / decimal literals force floating-point division.
# * NULLIF(..., 0) turns a zero call count into NULL instead of an error.
# * The divisors assume total_exec_time / total_plan_time are reported in
#   milliseconds — TODO confirm against the collector.
pg_query = f"""
SELECT
  "db.statement.sample" AS query,
  SUM("db.query.calls.delta") AS execution_count,
  SUM("db.query.total_exec_time.delta") / 1000.0 AS total_time_sec,
  SUM("db.query.total_exec_time.delta") * 1.0 / NULLIF(SUM("db.query.calls.delta"), 0) AS avg_time_ms,
  SUM("db.query.total_plan_time.delta") * 1.0 / NULLIF(SUM("db.query.calls.delta"), 0) AS avg_plan_time_ms,
  SUM("db.query.rows.delta") AS rows_processed
FROM qan_db
WHERE "__time" BETWEEN TIMESTAMP '{start_time_str}' AND TIMESTAMP '{end_time_str}'
  AND "db.system" = 'postgresql'
GROUP BY "db.statement.sample"
ORDER BY total_time_sec DESC
LIMIT 10
"""

try:
    result = client.sql(pg_query)
    pg_top_queries = pd.DataFrame(result)

    if not pg_top_queries.empty:
        # Display results
        print("Top 10 PostgreSQL Queries by Execution Time:")
        display(pg_top_queries)

        # Plot results.
        # Bars are positioned by row index rather than by label: two queries
        # sharing the same first 50 characters would otherwise collapse into
        # a single categorical bar.
        bar_positions = list(range(len(pg_top_queries)))
        bar_labels = pg_top_queries['query'].fillna('').astype(str).str[:50]
        # NULL averages (no executions) are drawn as zero-length bars.
        bar_values = pd.to_numeric(pg_top_queries['avg_time_ms'], errors='coerce').fillna(0)

        fig, ax = plt.subplots(figsize=(14, 8))
        ax.barh(bar_positions, bar_values)
        ax.set_yticks(bar_positions)
        ax.set_yticklabels(bar_labels)
        ax.set_xlabel('Average Execution Time (ms)')
        ax.set_ylabel('Query')
        ax.set_title('Top 10 PostgreSQL Queries by Avg. Execution Time')
        fig.tight_layout()
        plt.show()
    else:
        print("No PostgreSQL data found for the specified time range")
except Exception as e:
    # Best-effort cell: report the failure and let the rest of the notebook run.
    print(f"Error querying PostgreSQL data: {e}")

## Query Execution Counts Over Time

In [None]:
# Query execution count trends over time, bucketed into 5-minute intervals
# per database system.
# Dotted column names are double-quoted; unquoted, `db.system` would be parsed
# as column `system` of a table named `db`.
time_series_query = f"""
SELECT
  TIME_FLOOR("__time", 'PT5M') AS time_bucket,
  "db.system" AS database_system,
  SUM("db.query.calls.delta") AS query_count
FROM qan_db
WHERE "__time" BETWEEN TIMESTAMP '{start_time_str}' AND TIMESTAMP '{end_time_str}'
GROUP BY TIME_FLOOR("__time", 'PT5M'), "db.system"
ORDER BY time_bucket ASC
"""

try:
    result = client.sql(time_series_query)
    execution_trends = pd.DataFrame(result)

    if not execution_trends.empty:
        # Convert timestamp to datetime for better plotting
        execution_trends['time_bucket'] = pd.to_datetime(execution_trends['time_bucket'])

        # Plot time series: one line per database system. Rows arrive ordered
        # by time_bucket, so each per-system slice is already chronological.
        fig, ax = plt.subplots(figsize=(14, 8))
        for db_system in execution_trends['database_system'].unique():
            db_data = execution_trends[execution_trends['database_system'] == db_system]
            ax.plot(db_data['time_bucket'], db_data['query_count'], marker='o', linestyle='-', label=db_system)

        ax.set_xlabel('Time')
        ax.set_ylabel('Query Execution Count')
        ax.set_title('Query Execution Counts Over Time (5-minute intervals)')
        ax.legend()
        ax.grid(True)
        ax.tick_params(axis='x', labelrotation=45)
        fig.tight_layout()
        plt.show()
    else:
        print("No time series data found for the specified time range")
except Exception as e:
    # Best-effort cell: report the failure and let the rest of the notebook run.
    print(f"Error querying time series data: {e}")

## Database Performance Metrics Comparison

In [None]:
# Compare database performance metrics across database systems.
#
# Notes on the SQL:
# * Dotted column names are double-quoted; unquoted, `db.system` would be
#   parsed as column `system` of a table named `db`.
# * avg_execution_time_ms is the unweighted mean of each row's per-call time,
#   so NULLIF guards the per-row division against rows with zero calls (such
#   rows yield NULL and are ignored by AVG in SQL-compatible null mode).
# * Druid's COUNT(DISTINCT ...) is approximate by default.
# * The MySQL divisor assumes total_timer_wait is in nanoseconds and the
#   PostgreSQL column is already in milliseconds — TODO confirm.
metrics_query = f"""
SELECT
  "db.system" AS database_system,
  COUNT(DISTINCT "db.statement.digest") AS unique_queries,
  SUM("db.query.calls.delta") AS total_executions,
  AVG(CASE WHEN "db.system" = 'mysql' THEN "db.query.total_timer_wait.delta" / 1000000.0 / NULLIF("db.query.calls.delta", 0)
           WHEN "db.system" = 'postgresql' THEN "db.query.total_exec_time.delta" * 1.0 / NULLIF("db.query.calls.delta", 0) END) AS avg_execution_time_ms
FROM qan_db
WHERE "__time" BETWEEN TIMESTAMP '{start_time_str}' AND TIMESTAMP '{end_time_str}'
GROUP BY "db.system"
"""

try:
    result = client.sql(metrics_query)
    metrics_comparison = pd.DataFrame(result)

    if not metrics_comparison.empty:
        # Display results
        print("Database Performance Metrics Comparison:")
        display(metrics_comparison)

        # One bar chart per metric: (column, panel title, y-axis label).
        panels = [
            ('unique_queries', 'Unique Queries', 'Count'),
            ('total_executions', 'Total Query Executions', 'Count'),
            ('avg_execution_time_ms', 'Average Execution Time', 'Time (ms)'),
        ]
        # Bar labels must be strings; a NULL system is shown as 'unknown'.
        system_labels = metrics_comparison['database_system'].fillna('unknown').astype(str)

        fig, axes = plt.subplots(1, 3, figsize=(18, 6))
        for ax, (column, title, ylabel) in zip(axes, panels):
            # NULL metrics (e.g. a system other than mysql/postgresql has no
            # CASE branch) are drawn as zero-height bars.
            heights = pd.to_numeric(metrics_comparison[column], errors='coerce').fillna(0)
            ax.bar(system_labels, heights)
            ax.set_title(title)
            ax.set_ylabel(ylabel)

        fig.tight_layout()
        plt.show()
    else:
        print("No metrics comparison data found for the specified time range")
except Exception as e:
    # Best-effort cell: report the failure and let the rest of the notebook run.
    print(f"Error querying metrics comparison data: {e}")

## Interactive Query Filter Tool

In [None]:
# This cell creates interactive widgets to filter and explore data
from ipywidgets import interact, widgets

# Get unique database systems to populate the dropdown below.
# Used when Druid has no data yet or the lookup query fails.
DEFAULT_DB_SYSTEMS = ['mysql', 'postgresql']

# The dotted column name is double-quoted; unquoted, `db.system` would be
# parsed as column `system` of a table named `db`.
systems_query = """
SELECT DISTINCT
  "db.system" AS database_system
FROM qan_db
"""

try:
    result = client.sql(systems_query)
    # Skip NULL/empty values so they never become dropdown options.
    db_systems = [row['database_system'] for row in result if row['database_system']]
    if not db_systems:
        db_systems = list(DEFAULT_DB_SYSTEMS)  # Default if no data yet
except Exception as e:
    print(f"Error fetching database systems: {e}")
    db_systems = list(DEFAULT_DB_SYSTEMS)  # Default if query fails

# Function to filter and display queries
def explore_queries(db_system, min_exec_time=0, limit=10, include_pattern=''):
    """Query, tabulate and plot the slowest statements for one database system.

    Reads the notebook-level ``client``, ``start_time_str`` and
    ``end_time_str``; displays a DataFrame and a bar chart, returns nothing.

    Parameters
    ----------
    db_system : str
        Value of the ``db.system`` column to filter on. ``'mysql'`` selects the
        timer-wait column (scaled to milliseconds, assuming nanosecond input —
        TODO confirm); any other value uses ``total_exec_time`` as-is.
    min_exec_time : int or float
        Keep only statements whose average execution time (ms) is at least this.
    limit : int
        Maximum number of statements returned.
    include_pattern : str
        If non-empty, keep only statements whose sample text contains this
        substring. It is matched literally: ``%`` and ``_`` are not wildcards.

    Raises
    ------
    ValueError, TypeError
        If ``min_exec_time`` or ``limit`` cannot be converted to a number.
    """
    # User-supplied values end up inside SQL text, so numbers are coerced to
    # numeric types and strings have their single quotes doubled.
    min_avg_ms = float(min_exec_time)
    row_limit = int(limit)
    system_literal = str(db_system).replace("'", "''")

    # Dotted column names must be double-quoted in Druid SQL.
    if db_system == 'mysql':
        time_col = '"db.query.total_timer_wait.delta" / 1000000.0'
    else:
        time_col = '"db.query.total_exec_time.delta"'

    # NULLIF avoids a division-by-zero error for statements with no calls.
    avg_expr = f'SUM({time_col}) * 1.0 / NULLIF(SUM("db.query.calls.delta"), 0)'

    filter_clause = ""
    if include_pattern:
        # Escape the escape character first, then the LIKE wildcards, then
        # the quote that would otherwise terminate the SQL string literal.
        like_body = (
            str(include_pattern)
            .replace("\\", "\\\\")
            .replace("%", "\\%")
            .replace("_", "\\_")
            .replace("'", "''")
        )
        filter_clause = (
            "AND \"db.statement.sample\" LIKE '%" + like_body + "%' ESCAPE '\\'"
        )

    # HAVING repeats the aggregate expression instead of referring to the
    # avg_time_ms alias, so it does not depend on alias resolution in HAVING.
    query = f"""
    SELECT
      "db.statement.sample" AS query,
      SUM("db.query.calls.delta") AS execution_count,
      SUM({time_col}) / 1000.0 AS total_time_sec,
      {avg_expr} AS avg_time_ms
    FROM qan_db
    WHERE "__time" BETWEEN TIMESTAMP '{start_time_str}' AND TIMESTAMP '{end_time_str}'
      AND "db.system" = '{system_literal}'
      {filter_clause}
    GROUP BY "db.statement.sample"
    HAVING {avg_expr} >= {min_avg_ms}
    ORDER BY total_time_sec DESC
    LIMIT {row_limit}
    """

    try:
        result = client.sql(query)
        df = pd.DataFrame(result)

        if not df.empty:
            # Display results
            print(f"Queries for {db_system} with avg. execution time >= {min_exec_time}ms:")
            display(df)

            # Plot results. Bars are positioned by row index so that queries
            # sharing the same first 50 characters stay as separate bars.
            bar_positions = list(range(len(df)))
            bar_labels = df['query'].fillna('').astype(str).str[:50]
            bar_values = pd.to_numeric(df['avg_time_ms'], errors='coerce').fillna(0)

            fig, ax = plt.subplots(figsize=(14, 8))
            ax.barh(bar_positions, bar_values)
            ax.set_yticks(bar_positions)
            ax.set_yticklabels(bar_labels)
            ax.set_xlabel('Average Execution Time (ms)')
            ax.set_ylabel('Query')
            ax.set_title(f'Top Queries for {db_system} by Avg. Execution Time')
            fig.tight_layout()
            plt.show()
        else:
            print(f"No queries found for {db_system} with the specified filters")
    except Exception as e:
        # Best-effort: report the failure inside the widget output.
        print(f"Error querying data: {e}")

# Create interactive widgets.
# Each control is built up front and then bound to the matching keyword
# argument of explore_queries; interact() re-runs the function on every change.
system_picker = widgets.Dropdown(options=db_systems, description='Database:')
min_time_slider = widgets.IntSlider(
    min=0, max=1000, step=10, value=0, description='Min Exec Time (ms):'
)
limit_slider = widgets.IntSlider(
    min=5, max=50, step=5, value=10, description='Result Limit:'
)
pattern_box = widgets.Text(
    value='', description='Query Contains:', placeholder='SQL keyword'
)

# Trailing semicolon suppresses the repr of interact()'s return value.
interact(
    explore_queries,
    db_system=system_picker,
    min_exec_time=min_time_slider,
    limit=limit_slider,
    include_pattern=pattern_box,
);