In [1]:
# %% [markdown]
# # 03 — Model Diagnostics & Robust Regression
# 
# *Diagnostic plots for the OLS model* (from notebook 02) + **robust regression**
# (Huber's T norm). Outputs are written to:
# 
# * `results/figures/diagnostics/` – PNGs with residuals vs fitted, Q-Q, Cook's distance, leverage
# * `results/tables/` – OLS vs RLM coefficients
# 
# ---
# %%
# 1. Imports & path setup
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Statsmodels mandatory for this notebook
try:
    import statsmodels.formula.api as smf
    import statsmodels.api as sm
    from statsmodels.graphics.gofplots import qqplot
    from statsmodels.stats.outliers_influence import OLSInfluence
except ImportError as e:
    raise SystemExit(" statsmodels required.  →  pip install statsmodels") from e

# Project root. The original absolute path stays as the default so existing
# runs behave exactly as before; set the DB_PERF_BASE_DIR environment variable
# to point the notebook at a checkout located elsewhere.
BASE_DIR = Path(
    os.environ.get("DB_PERF_BASE_DIR", "D:/Disertatie/1.database_performance_analysis")
).resolve()

# Output locations (created up front so the later savefig / to_csv calls cannot
# fail on a missing directory).
FIG_DIAG_DIR = BASE_DIR / "results" / "figures" / "diagnostics"
TABLES_DIR   = BASE_DIR / "results" / "tables"
for p in (FIG_DIAG_DIR, TABLES_DIR):
    p.mkdir(parents=True, exist_ok=True)

# Input CSVs, keyed by dataset name.
RAW_DIR  = BASE_DIR / "data" / "raw"
RAW_FILES = {
    "no_index"    : RAW_DIR / "queries_no_index.csv",
    "with_index"  : RAW_DIR / "queries_with_index.csv",
}

# %%
# 2. Load data + build long format
#    Both datasets are loaded and stacked; the `opt` column records the origin
#    ("after" = with_index file, "before" = no_index file).
idx = pd.read_csv(RAW_FILES["with_index"], low_memory=False)
no  = pd.read_csv(RAW_FILES["no_index"],    low_memory=False)
for df in (idx, no):
    # Cast the id to str in both frames (presumably so the two files agree on
    # dtype - confirm against the raw CSVs).
    df["query_id"] = df["query_id"].astype(str)

base_cols = ["query_id", "documents_returned"]
long = pd.concat(
    [
        idx[base_cols + ["execution_time_ms"]].assign(opt="after").rename(columns={"execution_time_ms": "exec_time"}),
        no [base_cols + ["execution_time_ms"]].assign(opt="before").rename(columns={"execution_time_ms": "exec_time"}),
    ],
    # Each CSV is read with its own default 0..n-1 index, so a plain concat
    # would repeat row labels; those duplicates would carry over into the
    # fitted model's residual / fitted-value Series. Renumber instead.
    ignore_index=True,
)

In [2]:
# %%
# 3. Fit OLS & robust RLM (HuberT norm) on the same formula
#    exec_time is modelled from the before/after indicator and
#    log1p(documents_returned).
formula = "exec_time ~ C(opt) + np.log1p(documents_returned)"
ols_model = smf.ols(formula, data=long).fit()
rlm_model = smf.rlm(formula, data=long, M=sm.robust.norms.HuberT()).fit()

# A bare expression in the middle of a cell is evaluated and discarded, so the
# summary has to be printed explicitly to appear in the cell output.
print(ols_model.summary().as_text())

# Save coefficient table: one row per model term, one column per estimator.
pd.DataFrame({"OLS": ols_model.params, "RLM_Huber": rlm_model.params}).to_csv(TABLES_DIR/"coef_ols_vs_rlm.csv")


In [3]:
# %%
# 4. Diagnostic plots for OLS
#    Every figure is written to FIG_DIAG_DIR and closed right away, so nothing
#    is rendered inline and no figure stays open between plots.
infl = OLSInfluence(ols_model)
resid = ols_model.resid
fitted = ols_model.fittedvalues

# a) Residuals vs Fitted, with a dashed zero reference line
fig, ax = plt.subplots(figsize=(5, 4))
ax.scatter(fitted, resid, alpha=0.6)
ax.axhline(0, color="red", ls="--")
ax.set(xlabel="Fitted values", ylabel="Residuals", title="Residuals vs Fitted")
fig.tight_layout()
fig.savefig(FIG_DIAG_DIR / "resid_vs_fitted.png")
plt.close(fig)

# b) Q-Q plot of the residuals (line='s' reference line)
fig, ax = plt.subplots(figsize=(4, 4))
qqplot(resid, line="s", ax=ax)
ax.set_title("QQ‑plot residuals")
fig.tight_layout()
fig.savefig(FIG_DIAG_DIR / "qq_residuals.png")
plt.close(fig)

# c) Cook's distance per observation (first element of the returned tuple)
cooks = infl.cooks_distance[0]
fig, ax = plt.subplots(figsize=(6, 3))
ax.stem(np.arange(len(cooks)), cooks, markerfmt=",", linefmt="-")
ax.set(title="Cook's Distance", xlabel="Observation", ylabel="Cook D")
fig.tight_layout()
fig.savefig(FIG_DIAG_DIR / "cooks_distance.png")
plt.close(fig)

# d) Leverage vs squared residuals; residuals are scaled by their overall
#    standard deviation (np.std) before squaring
leverage = infl.hat_matrix_diag
fig, ax = plt.subplots(figsize=(5, 4))
ax.scatter(leverage, (resid / np.std(resid)) ** 2, alpha=0.6)
ax.set(xlabel="Leverage", ylabel="Standardized Residual²", title="Leverage vs Residuals²")
fig.tight_layout()
fig.savefig(FIG_DIAG_DIR / "lev_vs_resid2.png")
plt.close(fig)

print(
    "Diagnostics & robust regression complete. Outputs saved in: \n  •",
    FIG_DIAG_DIR,
    "\n  •",
    TABLES_DIR / "coef_ols_vs_rlm.csv",
)


Diagnostics & robust regression complete. Outputs saved in: 
  • D:\Disertatie\1.database_performance_analysis\results\figures\diagnostics 
  • D:\Disertatie\1.database_performance_analysis\results\tables\coef_ols_vs_rlm.csv
