In [None]:
# %% [markdown]
# # 04 — Learning Curves and Power Analysis
# 
# *Obiectiv:* se va arăta cum evoluează **puterea statistică** și **lățimea intervalului
# de încredere (CI)** pe măsură ce crește dimensiunea eșantionului.
# 
# Outputs:
# * `results/figures/power/power_curve.png` – putere vs. n (pentru efectul Cohen’s d observat)
# * `results/figures/power/ci_learning_curve.png` – lățimea CI bootstrap vs. n
# * `results/tables/learning_curve_ci.csv` – tabel numeric (n, CI-width)
# 
# > Folosește `statistical_summary.csv` (generat de notebook‑ul 02) dacă există; dacă fișierul lipsește, coeficientul Cohen's d este recalculat intern din datele brute.
# 
# ---
# %%
# 1. Imports & path setup
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.stats.power import TTestPower

# Default project root. Set the DB_PERF_BASE_DIR environment variable to run
# the notebook on a machine that does not have this drive layout.
DEFAULT_BASE_DIR = "D:/Disertatie/1.database_performance_analysis"


def resolve_base_dir(default=DEFAULT_BASE_DIR):
    """Return the absolute project root directory.

    The ``DB_PERF_BASE_DIR`` environment variable takes precedence; when it is
    unset or empty, ``default`` is used instead.
    """
    return Path(os.environ.get("DB_PERF_BASE_DIR") or default).resolve()


BASE_DIR = resolve_base_dir()
FIG_POWER_DIR = BASE_DIR / "results" / "figures" / "power"
TABLES_DIR = BASE_DIR / "results" / "tables"

# Create the output folders up front so later savefig / to_csv calls cannot
# fail on a missing directory.
for out_dir in (FIG_POWER_DIR, TABLES_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

# %%
# 2. Load effect size (Cohen's d) from statistical_summary.csv or compute fallback
# Prefer the effect size already stored in the summary table; recompute it
# from the raw timing files only when that table does not exist.
summary_file = TABLES_DIR / "statistical_summary.csv"
if summary_file.exists():
    summary_df = pd.read_csv(summary_file)
    index_rows = summary_df.loc[summary_df["group"] == "index", "cohen_d"]
    if index_rows.empty:
        # Fail with an actionable message instead of a bare IndexError.
        raise ValueError(
            f"No row with group == 'index' found in {summary_file}"
        )
    d_obs = index_rows.iloc[0]
    print(f"Using Cohen's d from summary: {d_obs:.3f}")
else:
    # Fallback – compute from the paired differences of the index dataset.
    RAW_DIR = BASE_DIR / "data" / "raw"
    idx = pd.read_csv(RAW_DIR / "queries_with_index.csv")
    no = pd.read_csv(RAW_DIR / "queries_no_index.csv")
    # Pair the two runs by query_id (same pairing as the CI learning-curve
    # cell) rather than by row position, so a different row order or length
    # in the two files cannot produce mismatched pairs.
    idx["query_id"] = idx["query_id"].astype(str)
    no["query_id"] = no["query_id"].astype(str)
    merged = idx.merge(no, on="query_id", suffixes=("_idx", "_no"))
    diff = merged["execution_time_ms_no"] - merged["execution_time_ms_idx"]
    # Effect size of the paired differences: mean / sample standard deviation.
    d_obs = diff.mean() / diff.std(ddof=1)
    print(f"Computed Cohen's d ad‑hoc: {d_obs:.3f}")

# Ensure effect size positive (we care about magnitude)
d_obs = abs(d_obs)


Using Cohen's d from summary: 0.031


In [None]:
# %%
# 3. Power curve for paired t‑test
# Sample sizes at which the power is evaluated: 10, 60, 110, ... (step 50).
ns = np.arange(10, 4001, 50)
power_calc = TTestPower()
# Power at each sample size for the observed effect size, at alpha = 0.05.
powers = power_calc.power(effect_size=d_obs, nobs=ns, alpha=0.05)

fig, ax = plt.subplots(figsize=(5, 3.5))
ax.plot(ns, powers, "o-")
# Dashed reference line at 0.8 power.
ax.axhline(0.8, color="red", ls="--", label="0.8 power")
ax.set_title(f"Power Curve (d={d_obs:.2f})")
ax.set_xlabel("Sample size (paired)")
ax.set_ylabel("Power (1-β)")
ax.legend()
fig.tight_layout()
fig.savefig(FIG_POWER_DIR / "power_curve.png")
# The figure is only written to disk; close it so it is not kept in memory.
plt.close(fig)

In [None]:
# %%
# 4. CI learning curve (bootstrap) on the index dataset
# Build the paired differences (no-index time minus indexed time) per query.
RAW_DIR = BASE_DIR / "data" / "raw"
idx = pd.read_csv(RAW_DIR / "queries_with_index.csv")
no = pd.read_csv(RAW_DIR / "queries_no_index.csv")
# Cast the join key to str in both frames before merging
# (presumably to avoid a dtype mismatch between the two CSVs — TODO confirm).
idx["query_id"] = idx["query_id"].astype(str)
no["query_id"] = no["query_id"].astype(str)
merged = idx.merge(no, on="query_id", suffixes=("_idx", "_no"))
diff = merged["execution_time_ms_no"] - merged["execution_time_ms_idx"]

N_BOOT = 3000    # bootstrap resamples drawn per sample size
N_START = 20     # smallest sample size on the curve
N_STEP = 50      # increment between successive sample sizes

if len(diff) < N_START:
    # Without this guard the loop below is empty, an empty CSV is written and
    # the later column lookup on curve_df fails with an opaque KeyError.
    raise ValueError(
        f"Need at least {N_START} paired observations for the CI learning "
        f"curve, got {len(diff)}"
    )

rng = np.random.default_rng(0)  # fixed seed so the curve is reproducible
curve = []
for n in range(N_START, len(diff) + 1, N_STEP):
    # N_BOOT resamples of size n (with replacement); keep each resample's mean.
    boots = rng.choice(diff.values, size=(N_BOOT, n), replace=True).mean(axis=1)
    # Width of the 2.5th–97.5th percentile interval of the bootstrap means.
    ci_low, ci_high = np.percentile(boots, [2.5, 97.5])
    curve.append({"n": n, "ci_width": ci_high - ci_low})
curve_df = pd.DataFrame(curve, columns=["n", "ci_width"])
curve_df.to_csv(TABLES_DIR / "learning_curve_ci.csv", index=False)

# Plot bootstrap CI width against sample size and save the figure to disk.
fig, ax = plt.subplots(figsize=(5, 3.5))
ax.plot(curve_df["n"], curve_df["ci_width"], "o-")
ax.set_title("CI Width vs Sample Size (Index diff)")
ax.set_xlabel("Sample size")
ax.set_ylabel("CI width (ms)")
fig.tight_layout()
fig.savefig(FIG_POWER_DIR / "ci_learning_curve.png")
# The figure is only written to disk; close it so it is not kept in memory.
plt.close(fig)

print("Power & CI learning curves generated →", FIG_POWER_DIR)


Power & CI learning curves generated → D:\Disertatie\1.database_performance_analysis\results\figures\power
