In [8]:
# 📦 Imports
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from umap import UMAP

# 📂 Load Data
# Reads the raw benchmark results into `df`.
# NOTE(review): the recorded preview of this file shows no `cluster` column,
# so the cluster-based cells further down cannot run against this frame; they
# need `df` to be re-bound to the clustered dataset first.
df = pd.read_parquet("../data/benchmark_results.parquet")
print(f"Loaded {len(df)} rows")

# 🔎 Preview Data
# Left as the cell's last expression so the notebook renders the first rows.
df.head()

Loaded 1000 rows


Unnamed: 0,rows,columns,null_rate,cardinality,engine,output_format,runtime_ms
0,6424388,58,0.34,3129,polars,json,917.5
1,6551634,28,0.36,1873,polars,json,960.35
2,4305572,29,0.52,125,polars,json,683.24
3,2235489,75,0.48,7094,polars,csv,512.2
4,9959614,56,0.55,8026,pandas,json,1348.12


In [None]:
# 📂 Load Data
# Re-binds `df` to the clustered benchmark runs; every cell below reads from it.
df = pd.read_parquet("../data/clustered_benchmark_data.parquet")

# Fail here, with a readable message, rather than with a bare KeyError in a
# later plotting/aggregation cell if the file does not have the expected schema.
required_columns = {
    "cluster", "rows", "columns", "null_rate", "cardinality",
    "runtime_ms", "engine", "output_format",
}
missing_columns = required_columns - set(df.columns)
assert not missing_columns, f"clustered data is missing columns: {sorted(missing_columns)}"

print(f"Loaded {len(df)} rows")

# 🔎 Preview Data
df.head()

In [9]:
# 📊 Cluster Distribution
# Bar chart of how many rows fall into each cluster ID.
# Precondition: `df` must carry a `cluster` column. If this cell is run while
# `df` still holds a frame without it, raise a message that says what to do
# instead of a bare KeyError: 'cluster'.
if 'cluster' not in df.columns:
    raise KeyError(
        "'cluster' column not found in df; run the cell that loads "
        "../data/clustered_benchmark_data.parquet before this one"
    )

# sort_index() orders the bars by cluster ID rather than by frequency.
cluster_counts = df['cluster'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 4))
cluster_counts.plot(kind='bar', ax=ax, title='Cluster Size Distribution', xlabel='Cluster ID', ylabel='Number of Runs')
plt.show()

KeyError: 'cluster'

In [None]:
# 🔍 Feature Distributions by Cluster
# One boxplot panel per metric, each grouped by cluster ID. The panels are
# described as (column, title) pairs so adding a metric is a one-line change.
panels = [
    ("runtime_ms", "Runtime by Cluster"),
    ("null_rate", "Null Rate by Cluster"),
    ("cardinality", "Cardinality by Cluster"),
]

fig, axs = plt.subplots(1, len(panels), figsize=(15, 4))
for ax, (metric, title) in zip(axs, panels):
    sns.boxplot(x='cluster', y=metric, data=df, ax=ax)
    ax.set_title(title)

plt.tight_layout()
plt.show()

In [None]:
# 🧭 UMAP Projection to 2D
# Standardise the numeric run features, then embed them with UMAP.
features = df[["rows", "columns", "null_rate", "cardinality", "runtime_ms"]]
scaled = StandardScaler().fit_transform(features)

# random_state is fixed so repeated runs produce the same layout.
umap = UMAP(n_neighbors=15, min_dist=0.1, random_state=42)
embedding = umap.fit_transform(scaled)

# Scatter of the two embedding axes, coloured by each row's cluster ID.
fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(embedding[:, 0], embedding[:, 1], c=df['cluster'], cmap='Spectral', s=10)
ax.set_title("UMAP Projection of Benchmark Clusters")
ax.set_xlabel("UMAP-1")
ax.set_ylabel("UMAP-2")
fig.colorbar(points, ax=ax, label='Cluster ID')
plt.show()

In [None]:
# 📈 Cluster Summary Table
def mode(series: pd.Series):
    """Return the most frequent value of ``series``, or ``"N/A"`` if it has none.

    Used as a groupby aggregator for the categorical columns.

    Args:
        series: Values to summarise.

    Returns:
        The first entry of ``series.mode()`` (on ties pandas returns the modes
        sorted, so this is the smallest one), or the string ``"N/A"`` when the
        mode is empty, e.g. for an empty or all-missing series.
    """
    # Compute the mode once; the previous version evaluated it twice per call.
    modes = series.mode()
    return "N/A" if modes.empty else modes.iloc[0]

# Numeric columns are averaged per cluster and later shown with two decimals.
numeric_cols = ["rows", "columns", "null_rate", "cardinality", "runtime_ms"]

# Per-column aggregation: mean for the numeric columns, most frequent value for
# the categorical ones, and a count of the grouping key for the group size.
aggregations = {col: "mean" for col in numeric_cols}
aggregations["engine"] = mode
aggregations["output_format"] = mode
aggregations["cluster"] = "count"

per_cluster = df.groupby("cluster").agg(aggregations)
# The counted key column is renamed first so reset_index() can restore
# `cluster` as an ordinary column without a name clash.
summary = per_cluster.rename(columns={"cluster": "n_runs"}).reset_index()

summary.style.format("{:.2f}", subset=numeric_cols)