In [None]:
#@title Setup + upload (merges any number of CSVs, optional outlier filtering)
!pip -q install pandas matplotlib numpy

import io, os, zipfile, math, json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display, Image
from google.colab import files

# -------------------- user-config knobs --------------------
metric_col           = "rollout_total_s"   # primary metric
phase_cols           = ["repo_sync_s","manifest_apply_s","resource_creation_s","pod_rollout_s"]
remove_outliers      = True                # set False to keep raw only
outlier_method       = "iqr"               # 'iqr' or 'winsor'
iqr_k                = 1.5                 # Tukey fence: Q1 - k*IQR, Q3 + k*IQR
winsor_pct           = 0.01                # 1% winsorization each tail if method='winsor'
# -----------------------------------------------------------

print("▶ Select your rollout CSVs (one or more):")
uploaded = files.upload()

# Parse every uploaded *.csv into its own DataFrame. This is best effort:
# a file that cannot be parsed is reported and skipped, not fatal.
dfs = []
for name, blob in uploaded.items():
    if not name.lower().endswith(".csv"):
        # Say so explicitly instead of silently ignoring the upload.
        print(f"  - Skipped {name}: not a .csv file")
        continue
    try:
        df = pd.read_csv(io.BytesIO(blob))
    except Exception as e:
        print(f"  ✗ Failed to read {name}: {e}")
        continue
    # Normalize column names per file, BEFORE merging. If headers that differ
    # only by padding (e.g. "status" vs "status ") reached pd.concat, they
    # would become separate columns and collapse into duplicate labels once
    # stripped, breaking every later df["status"] / df[metric_col] lookup.
    df.columns = [str(c).strip() for c in df.columns]
    print(f"  ✓ {name}: {len(df)} rows")
    dfs.append(df)

if not dfs:
    raise SystemExit("No CSVs were loaded.")

# Stack all files into one frame with a fresh 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)

# The rest of the cell needs the status flag and the primary metric.
required = {"status", metric_col}
missing = required - set(df.columns)
if missing:
    raise SystemExit(f"Missing required columns: {missing}\nColumns present: {list(df.columns)}")

# Coerce the metric and any phase columns that are present to numeric;
# unparseable cells become NaN instead of raising.
for c in [metric_col, *phase_cols]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

# Keep successful rows only. The status match ignores case AND surrounding
# whitespace, so padded values such as " ok" or "OK " are not silently dropped.
ok = df[df["status"].astype(str).str.strip().str.lower().eq("ok")].copy()
# Rows without a usable metric value cannot contribute to the statistics.
ok = ok[pd.notnull(ok[metric_col])].reset_index(drop=True)
# 1-based running index across all merged files.
ok["iteration_global"] = np.arange(1, len(ok)+1)

print(f"\nCombined successful rows: n={len(ok)}")
display(ok.head())

# ---------- outlier handling ----------
def iqr_filter(series, k=1.5):
    """Flag the values of `series` that lie inside the Tukey fences.

    The fences are ``Q1 - k*IQR`` and ``Q3 + k*IQR``, where the quartiles are
    computed with NaNs ignored. NaN entries compare False against both fences
    and are therefore flagged as outside.

    Returns a pair ``(mask, (lo, hi, q1, q3, iqr))``: `mask` is True where the
    value is within ``[lo, hi]`` (inclusive on both ends).
    """
    quartiles = np.nanpercentile(series, [25, 75])
    q1, q3 = quartiles[0], quartiles[1]
    iqr = q3 - q1
    margin = k * iqr
    lo = q1 - margin
    hi = q3 + margin
    inside = (series <= hi) & (series >= lo)
    return inside, (lo, hi, q1, q3, iqr)

def winsorize(series, p=0.01):
    """Clip `series` to its p-th and (1-p)-th percentiles.

    Percentiles are computed with NaNs ignored. The input is not modified.

    Returns a pair ``(clipped, (lo, hi))``: the clipped copy and the two
    percentile bounds that were used.
    """
    bounds = np.nanpercentile(series, [100*p, 100*(1-p)])
    lo = bounds[0]
    hi = bounds[1]
    return series.clip(lower=lo, upper=hi), (lo, hi)

# Start from an untouched copy so the raw `ok` frame stays available for
# side-by-side comparison; `outlier_info` records what was applied (if anything).
ok_clean = ok.copy()
outlier_info = {}

if remove_outliers and outlier_method == "iqr":
    # Drop rows whose metric falls outside the Tukey fences.
    mask, info = iqr_filter(ok_clean[metric_col], k=iqr_k)
    outlier_info.update(
        method="iqr",
        params={"k": iqr_k, "lo_hi_q1_q3_iqr": info},
    )
    outliers = (~mask).sum()
    ok_clean = ok_clean.loc[mask].reset_index(drop=True)
    print(f"[Outliers] IQR method (k={iqr_k}): removed {outliers} rows; kept n={len(ok_clean)}")
elif remove_outliers and outlier_method == "winsor":
    # Keep every row, but clamp the metric to the percentile bounds.
    clipped, (lo, hi) = winsorize(ok_clean[metric_col], p=winsor_pct)
    outlier_info.update(
        method="winsor",
        params={"pct": winsor_pct, "lo": float(lo), "hi": float(hi)},
    )
    ok_clean[metric_col] = clipped
    print(f"[Outliers] Winsorized at {100*winsor_pct:.1f}% each tail; kept n={len(ok_clean)}")
elif remove_outliers:
    # Unrecognized method name: leave the data as-is.
    print("[Outliers] Unknown method; skipping.")

# Utility: summary stats + 95% CI
def summary_stats(s):
    """Summarize a numeric series as count, mean, sample std and a 95% CI.

    Values are coerced to numeric and NaNs dropped first. The interval is the
    normal approximation ``mean ± 1.96 * std / sqrt(n)``. With fewer than two
    usable values, `std` and both CI bounds are NaN.

    Returns a dict with keys ``n``, ``mean``, ``std``, ``ci_lo``, ``ci_hi``
    holding plain Python int/float values.
    """
    values = pd.to_numeric(s, errors="coerce").dropna()
    count = values.shape[0]
    center = values.mean()
    if count > 1:
        spread = values.std(ddof=1)
        half_width = 1.96 * spread / np.sqrt(count)
    else:
        # Sample std (and hence the interval) is undefined for n <= 1.
        spread = np.nan
        half_width = np.nan
    return {
        "n": int(count),
        "mean": float(center),
        "std": float(spread),
        "ci_lo": float(center - half_width),
        "ci_hi": float(center + half_width),
    }

# Summarize the metric before and after outlier handling so the effect of the
# cleaning step can be compared directly.
raw_stats, clean_stats = (
    summary_stats(frame[metric_col]) for frame in (ok, ok_clean)
)

print("\nRaw stats:", json.dumps(raw_stats, indent=2))
print("Clean stats:", json.dumps(clean_stats, indent=2))
