In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import matplotlib.ticker as ticker

# Configure plotting
plt.style.use("default")
sns.set_palette("husl")
pd.set_option("display.max_columns", 20)
pd.set_option("display.width", 120)

print("Kraken Analyzer - Overview Dashboard")
print(f"Pandas: {pd.__version__}")

# Load curated parquet file
data_path = Path("../data/curated/experiments.parquet")

if not data_path.exists():
    print(f"Curated dataset not found: {data_path}")
    print("   Run 'make analyze' from the root directory first.")
    raise FileNotFoundError("Please run data ingestion first")

df = pd.read_parquet(data_path)
print(f"Loaded dataset: {len(df):,} rows × {len(df.columns)} columns")
print(
    f"Data range: {df['synced_at'].min() if 'synced_at' in df.columns else 'Unknown'} to {df['synced_at'].max() if 'synced_at' in df.columns else 'Unknown'}"
)

print("=== Dataset Summary ===")
print(f"Total rows: {len(df):,}")
print(f"Total columns: {len(df.columns)}")

# Show experiments if available
if "experiment_label" in df.columns:
    exp_counts = df["experiment_label"].value_counts()
    print(f"\nExperiments ({len(exp_counts)}):")
    for exp, count in exp_counts.items():
        print(f"  • {exp}: {count:,} rows")

# Show schema versions if available
if "schema_version" in df.columns:
    schema_counts = df["schema_version"].value_counts()
    print("\nSchema versions:")
    for version, count in schema_counts.items():
        print(f"  • {version}: {count:,} rows")

print("\n=== Column Overview ===")
print(df.dtypes)

Kraken Analyzer - Overview Dashboard
Pandas: 2.2.3
Loaded dataset: 2,300 rows × 30 columns
Data range: 2025:09:10T06:18:09Z to 2025:09:10T12:41:55+0200
=== Dataset Summary ===
Total rows: 2,300
Total columns: 30

Experiments (2):
  • kraken1.0_vs_INES: 1,200 rows
  • kraken1.1_vs_INES: 1,100 rows

Schema versions:
  • v1: 2,300 rows

=== Column Overview ===
ines_simulation_id                    float64
kraken_simulation_id                  float64
network_size                            int64
event_skew                            float64
node_event_ratio                      float64
num_event_types                         int64
max_parents                             int64
workload_size                           int64
query_length                            int64
simulation_mode                        object
median_selectivity                    float64
total_projections_placed                int64
placement_difference_to_ines_count      int64
combigen_time_seconds                 floa

## Filter 
Let's focus only on kraken1.1_vs_INES experiments for this analysis.

In [9]:
# Show first few rows
print("=== Sample Data ===")
display(df.head(5))

# Filter by experiment_label if available, and only keep kraken1.1_vs_INES experiments
if "experiment_label" in df.columns:
    df = df[df["experiment_label"] == "kraken1.1_vs_INES"]
    print(f"Filtered to 'kraken1.1_vs_INES' experiments: {len(df):,} rows")

=== Sample Data ===


Unnamed: 0,ines_simulation_id,kraken_simulation_id,network_size,event_skew,node_event_ratio,num_event_types,max_parents,workload_size,query_length,simulation_mode,...,ines_cost,kraken_cost,all_push_central_latency,ines_latency,kraken_latency,experiment_label,schema_version,source,snapshot_file,synced_at
0,,,10,2.0,0.5,6,5,5,3,random,...,49350.09,8439.40769,3,1,16.0,kraken1.0_vs_INES,v1,cloud-11,run_results.2025-09-10T06-18-09Z.csv,2025:09:10T06:18:09Z
1,,,10,2.0,0.5,6,5,5,3,random,...,3902411.0,20667.00553,3,1,16.0,kraken1.0_vs_INES,v1,cloud-11,run_results.2025-09-10T06-18-09Z.csv,2025:09:10T06:18:09Z
2,,,10,2.0,0.5,6,5,5,3,random,...,299245.2,23850.761721,3,1,11.0,kraken1.0_vs_INES,v1,cloud-11,run_results.2025-09-10T06-18-09Z.csv,2025:09:10T06:18:09Z
3,,,10,2.0,0.5,6,5,5,3,random,...,463178.2,9871.551568,3,1,15.0,kraken1.0_vs_INES,v1,cloud-11,run_results.2025-09-10T06-18-09Z.csv,2025:09:10T06:18:09Z
4,,,10,2.0,0.5,6,5,5,3,random,...,932549.7,15461.717421,3,1,15.0,kraken1.0_vs_INES,v1,cloud-11,run_results.2025-09-10T06-18-09Z.csv,2025:09:10T06:18:09Z


Filtered to 'kraken1.1_vs_INES' experiments: 1,100 rows
