# Noise Characterization - Real Prtecan Data

**Goal:** Analyze residual patterns in real prtecan data to understand noise structure.

This notebook addresses three key issues:
1. **Systematic bias:** the residual for label y1 at the lowest pH is always negative (sometimes beyond 3σ)
2. **Adjacent correlation:** Residuals alternate positive/negative at adjacent points
3. **X-value uncertainty:** pH values may be systematically wrong (per-well or plate-wide)

**Outputs:**
- Covariance matrices by label (saved to `dev/`)
- Bias parameters (saved to `dev/`)
- Correlation statistics (saved to `dev/`)
- Diagnostic plots

**Next:** Use outputs in `02_synthetic_data_generator.ipynb`


### Tips for development vs tutorial hygiene:
---
- Keep a scratch notebook (e.g., `prtecan_devel.ipynb`) for experiments.
- Avoid `os.chdir`; use Path objects relative to repository root as in this notebook.
- When a feature stabilizes, port minimal, clear examples into the main tutorial and keep heavy testing in `tests/`.

## Setup

In [None]:
# Magic commands for development
%load_ext autoreload
%autoreload 2

from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

from clophfit import prtecan
from clophfit.fitting.bayes import (
    fit_binding_pymc_multi,
    fit_binding_pymc_multi2,
)
from clophfit.fitting.core import (
    fit_binding_glob,
)

# Configure notebook
%matplotlib inline
plt.style.use("seaborn-v0_8")

data_root = Path("tests/Tecan")
l0_dir = data_root / "140220"
l1_dir = data_root / "L1"
l2_dir = data_root / "L2"
l4_dir = data_root / "L4"

In [None]:
def tit(folder, bg_mth="meansd"):
    """Load the pH titration stored in *folder* and configure its background.

    Reads ``list.pH.csv``, ``additions.pH`` and ``scheme.txt`` from *folder*,
    then sets ``params.bg_mth`` to *bg_mth* and switches ``params.bg_adj`` on.
    Returns the configured titration object.
    """
    titration = prtecan.Titration.fromlistfile(folder / "list.pH.csv", is_ph=1)
    titration.load_additions(folder / "additions.pH")
    titration.load_scheme(folder / "scheme.txt")
    # Background handling is configured after the files have been loaded.
    titration.params.bg_mth = bg_mth
    titration.params.bg_adj = True
    return titration


# NOTE: this rebinds `tit` from the loader function defined above to the
# loaded titration; the loader is only callable again after its `def` is re-run.
tit = tit(l2_dir)
tit.bg_err

In [None]:
# Import noise model utilities
import sys

sys.path.insert(0, ".")

from clophfit.fitting.residuals import (
    collect_multi_residuals,
    residual_statistics,
    validate_residuals,
)
from dev.noise_models import (
    analyze_label_bias,
    compute_correlation_matrices,
    compute_residual_covariance,
    detect_adjacent_correlation,
    estimate_x_shift_statistics,
)

## Residual covariance

In [None]:
tit.result_global.compute_all()

In [None]:
all_res = collect_multi_residuals(tit.result_global.results)

In [None]:
# Compute the per-label residual covariance once and reuse it; the correlation
# cell further down reads the full mapping as `cov_by_label`.
cov_by_label = compute_residual_covariance(all_res)
cov_y1 = cov_by_label["y1"]
cov_y2 = cov_by_label["y2"]
print(cov_y2.round(3))
sns.heatmap(cov_y2.round(3), cmap="coolwarm")

In [None]:
compute_correlation_matrices(cov_y1)

In [None]:
def _cov_to_corr(cov):
    """Scale a covariance matrix by the outer product of its standard deviations."""
    sd = np.sqrt(np.diag(cov))
    return cov / np.outer(sd, sd)


# One correlation matrix (plain ndarray) per label.
corr_by_label = {
    lbl: _cov_to_corr(frame.to_numpy()) for lbl, frame in cov_by_label.items()
}

# Re-attach the row/column labels of the y2 covariance for display.
corr_y2 = pd.DataFrame(
    corr_by_label["y2"],
    index=cov_by_label["y2"].index,
    columns=cov_by_label["y2"].columns,
)
corr_y2

In [None]:
sns.heatmap(corr_y2, cmap="vlag")

In [None]:
sns.lineplot(all_res, x="x", y="resid_weighted", hue="label")

In [None]:
trace_multi2 = fit_binding_pymc_multi2(
    tit.result_global.results, tit.scheme, tit.bg_err
)

In [None]:
result_multi = fit_binding_pymc_multi(tit.result_global.results, tit.scheme)

In [None]:
validate_residuals(result_multi["A02"])

In [None]:
analyze_label_bias(all_res)

In [None]:
detect_adjacent_correlation(all_res)

In [None]:
# `trace_multi` is never assigned in this notebook. Pass the fit results that
# `all_res` was collected from, mirroring the later
# `estimate_x_shift_statistics(all_res, fit_results)` call.
# NOTE(review): confirm the expected second argument against dev.noise_models.
estimate_x_shift_statistics(all_res, tit.result_global.results)

In [None]:
residual_statistics(all_res)

In [None]:
# NOTE: `fr` is only assigned further down (`fr = fit_binding_glob(dsg)`);
# run that cell before this one.
fr.figure

In [None]:
validate_residuals(fr)

In [None]:
a, b = analyze_label_bias(all_res, 7)

In [None]:
a.loc["y1"]

In [None]:
a, b = detect_adjacent_correlation(all_res)

In [None]:
a[a.lag1_corr < -0.5]

In [None]:
tit.result_multi_trace

In [None]:
# NOTE(review): the first three fits are identical calls to
# `fit_binding_pymc_multi`; confirm which fitter each name was meant to hold.
results_pymc = fit_binding_pymc_multi(tit.result_global.results, tit.scheme)
results_pymc2 = fit_binding_pymc_multi(tit.result_global.results, tit.scheme)
results_multi = fit_binding_pymc_multi(tit.result_global.results, tit.scheme)
results_multi2 = fit_binding_pymc_multi2(
    tit.result_global.results, tit.scheme, tit.bg_err
)
# Collect residuals from the variables assigned above (the cell previously
# referenced undefined `lm_`/`pymc_`/`gls_`-prefixed names) and tag each frame
# with its method before stacking them.
all_methods = pd.concat([
    collect_multi_residuals(results_pymc).assign(method="PyMC"),
    collect_multi_residuals(results_pymc2).assign(method="PyMC2"),
    collect_multi_residuals(results_multi).assign(method="Multi"),
    collect_multi_residuals(results_multi2).assign(method="Multi2"),
])
all_methods

In [None]:
# `fit_results` was never assigned; use the global-fit results that the first
# `collect_multi_residuals` call in this notebook was given.
# NOTE(review): swap in another result set if a different fit was intended.
fit_results = tit.result_global.results
all_res = collect_multi_residuals(fit_results)

# Analyze the three issues listed in the introduction
bias_summary, label_bias = analyze_label_bias(all_res, n_bins=3)
corr_stats, corr_by_label = detect_adjacent_correlation(all_res)
shift_stats = estimate_x_shift_statistics(all_res, fit_results)

## Residual distribution

In [None]:
k = "G12"

fr1 = tit.results[1][k]
fr1.figure

In [None]:
fr2 = tit.results[2][k]
print(fr1.result.redchi, fr2.result.redchi)
fr2.figure

In [None]:
fr2.result.residual * fr2.dataset["2"].y_err / fr2.dataset["2"].y

In [None]:
frg = tit.result_global[k]
print(frg.result.redchi)
frg.figure

In [None]:
# Rescale the error bars of each single-label fit by sqrt(redchi) / 2.
# NOTE: `*=` updates the datasets attached to fr1/fr2, so re-running this cell
# applies the scaling again on top of the previous one.
fr1.dataset["1"].y_err *= np.sqrt(fr1.result.redchi) / 2
fr2.dataset["2"].y_err *= np.sqrt(fr2.result.redchi) / 2

In [None]:
fr1.dataset["1"].y_err, fr2.dataset["2"].y_err

In [None]:
frg.dataset["y1"].y_err, frg.dataset["y2"].y_err

In [None]:
from copy import deepcopy

In [None]:
# Work on a deep copy so the dataset attached to `frg` is left untouched.
dsg = deepcopy(frg.dataset)

# Overwrite `y_errc` with constant arrays. np.sqrt(9) / 3 equals 1, so the
# constants reduce to 44 * 3.1 for y1 and 14 * 3 for y2.
dsg["y1"].y_errc = np.ones_like(dsg["y1"].y_errc) * 44 * np.sqrt(9) / 3 * 3.1
dsg["y2"].y_errc = np.ones_like(dsg["y2"].y_errc) * 14 * np.sqrt(9) / 3 * 3

# weight_multi_ds_titration(dsg)
dsg

In [None]:
fr = fit_binding_glob(dsg)
print(fr.result.redchi)
fr.figure

In [None]:
fr.dataset

In [None]:
fr.result.chisqr

In [None]:
np.mean(np.abs(fr.result.residual[7:]))

In [None]:
from clophfit.fitting.core import weight_da, weight_multi_ds_titration

In [None]:
weight_da(fr1.dataset["1"], is_ph=1)

In [None]:
weight_multi_ds_titration(fr1.dataset)

In [None]:
fr_mcmc = tit.result_mcmc[k]

In [None]:
fr_mcmc.figure

In [None]:
fr_odr = tit.result_odr[k]
fr_odr.figure

In [None]:
plt.plot(fr2.result.residual, "o")
plt.plot(frg.result.residual, "o")
plt.plot(fr_mcmc.result.residual, "s")
plt.plot(fr_odr.result.residual, "*")

In [None]:
fr_odr.result.residual

In [None]:
tr = tit.result_global
tr[k].result.residual

In [None]:
def residual_df_all(tr) -> pd.DataFrame:
    rows = []
    for k in tr.fit_keys:
        ds = tr[k].dataset
        res = np.asarray(tr[k].result.residual, dtype=float)

        start = 0
        for label, da in ds.items():
            x = np.asarray(da.x, dtype=float)  # masked x used in fit
            n = x.size
            r = res[start : start + n]
            start += n
            rows += [
                {"k": k, "label": label, "x": float(xi), "residue": float(ri)}
                for xi, ri in zip(x, r, strict=True)
            ]

    return pd.DataFrame(rows)


df = residual_df_all(tr)
# Standardise with the pooled mean and sample standard deviation (ddof=1) of
# all residuals, i.e. across every well and label at once.
df["std_res"] = (df.residue - np.nanmean(df.residue)) / np.nanstd(df.residue, ddof=1)

In [None]:
# Per-label density histograms of the standardized residuals.
g = sns.displot(
    data=df,
    x="std_res",
    col="label",
    kind="hist",
    bins=60,
    stat="density",
    common_norm=False,
    height=4,
    aspect=1.2,
)
# `axes_dict` maps each facet's label to its Axes, so the KDE overlay does not
# depend on parsing the default "label = <name>" facet titles.
for label, ax in g.axes_dict.items():
    sns.kdeplot(
        data=df[df["label"] == label],
        x="std_res",
        ax=ax,
        lw=2,
    )
    # ±2σ guides
    ax.axvline(-2, ls="--", c="crimson", lw=1)
    ax.axvline(2, ls="--", c="crimson", lw=1)
    ax.set_xlim(-6, 6)
g.set_titles(col_template="{col_name}")
# `FacetGrid.figure` is the preferred spelling of the deprecated `.fig`.
g.figure.suptitle("Standardized residuals (with ±2σ)", y=1.05)
plt.show()

In [None]:
# Distribution of standardized residuals at each x value, one facet per label.
g = sns.catplot(
    data=df,
    x="x",
    y="std_res",
    col="label",
    kind="boxen",  # nicer than boxplot for big n
    showfliers=False,
    height=4,
    aspect=1.4,
    sharey=True,
)
for ax in g.axes.flat:
    # ±2σ guides
    ax.axhline(-2, ls="--", c="crimson", lw=1)
    ax.axhline(2, ls="--", c="crimson", lw=1)
    ax.set_xlabel("x")
    ax.set_ylabel("std_res")
    ax.tick_params(axis="x", rotation=45)
# `FacetGrid.figure` is the preferred spelling of the deprecated `.fig`.
g.figure.suptitle("Std residuals vs x (per label)", y=1.05)
plt.show()

In [None]:
# Single source of truth for the cut-off, so the axis label cannot drift away
# from the value actually applied.
outlier_threshold = 2.5

# Fraction of points per well and label whose |std_res| exceeds the threshold.
out = (
    df
    .assign(out=np.abs(df["std_res"]) > outlier_threshold)
    .groupby(["k", "label"], as_index=False)["out"]
    .mean()
    .rename(columns={"out": "outlier_rate"})
)
# plot top offenders per label
top = out.sort_values("outlier_rate", ascending=False).groupby("label").head(25)

plt.figure(figsize=(12, 6))
sns.barplot(data=top, y="k", x="outlier_rate", hue="label", dodge=False)
plt.xlabel(f"Outlier rate (|std_res| > {outlier_threshold})")
plt.ylabel("k (top 25 per label)")
plt.title("Worst wells by standardized-residual outlier rate")
plt.legend(title="label")
plt.tight_layout()
plt.show()

In [None]:
# df[np.abs(df.std_res) > 2.5]

In [None]:
df[np.abs(df.residue) > 3]

In [None]:
df[np.abs(df.residue) > 2.5]

In [None]:
df[df.std_res < -2.5]

In [None]:
# Upper tail, complementing the `std_res < -2.5` selection in the previous cell
# (this cell used to repeat that selection verbatim).
df[df.std_res > 2.5]

In [None]:
tr = tit.result_global

# Pool the residuals of every fit. A dedicated name is used so the `all_res`
# residual table built earlier in the notebook is not overwritten.
residuals = np.concatenate([tr[k].result.residual.ravel() for k in tr.fit_keys])
# Drop non-finite entries: they would turn the normality statistics into NaN
# and prevent the histogram from determining its range.
residuals = residuals[np.isfinite(residuals)]

# The residuals are analysed as returned by the fits, without re-centring or
# re-scaling (the earlier standardisation was immediately overwritten).
std_res = residuals

# stats
k2, pval = stats.normaltest(std_res)
skew = stats.skew(std_res, bias=False)
kurt = stats.kurtosis(std_res, fisher=True, bias=False)

# plot
fig, ax = plt.subplots(figsize=(7, 5))
ax.hist(std_res, bins=40, density=True, color="#4c72b0", alpha=0.7)
# Reference curve: gives the legend a labelled artist to show.
x = np.linspace(-4, 4, 300)
ax.plot(x, stats.norm.pdf(x, 0, 1), "r-", lw=2, label="N(0,1) PDF")
ax.set_xlabel("Standardized residual")
ax.set_ylabel("Density")
ax.set_title("Residual distribution (all fits in tit.result_global)")
props = {"boxstyle": "round", "facecolor": "white", "alpha": 0.8}
txt = f"n={len(std_res)}\n p={pval:.3g}\n skew={skew:.3f}\n kurt={kurt:.3f}"
ax.text(
    0.98,
    0.95,
    txt,
    transform=ax.transAxes,
    fontsize=10,
    va="top",
    ha="right",
    bbox=props,
)
ax.legend()
plt.show()

In [None]:
sns.histplot(std_res, kde=True)

In [None]:
stats.shapiro(std_res)

In [None]:
stats.kstest(std_res, "norm")

In [None]:
stats.probplot(std_res, plot=plt, rvalue=True)

In [None]:
# Seaborn has no Q-Q plot, so scipy.stats.probplot draws it next to the
# seaborn histogram.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Histogram with KDE
sns.histplot(std_res, kde=True, ax=ax1)

# Q-Q plot (using scipy)
stats.probplot(std_res, dist="norm", plot=ax2)
plt.show()

In [None]:
tit.result_global["D03"].figure

In [None]:
plt.plot(tit.result_odr["D03"].result.residual, "o")

## Discard detection

Wells to test:

- E10
- F10
- G09


In [None]:
# Per well: NaN-aware mean of tit.data[1][k] divided by the mean of
# tit.data[2][k]. Computed once and reused for the plot and the listing.
ratios = {
    k: np.nanmean(tit.data[1][k] / tit.data[2][k].mean()) for k in tit.fit_keys
}
plt.plot(list(ratios.values()))

# Wells whose ratio is above 2 or below 1.
print([(well, ratio) for well, ratio in ratios.items() if ratio > 2 or ratio < 1])

## Export Noise Parameters

Save the characterized noise parameters for use in synthetic data generation.

In [None]:
# Export noise parameters
# export_noise_parameters(cov_by_label, label_bias_stats, correlation_stats)
print("Export noise parameters after computing them above")