# Synthetic Data Generator with Realistic Noise

**Goal:** Build improved synthetic data generation with realistic noise structure.

**Inputs:**
- Noise parameters from `01_noise_characterization.ipynb`
- Covariance matrices and the label-bias table (CSV files) from `dev/`

**Noise Model Components:**
1. Base heteroscedastic noise (existing)
2. Label-dependent bias (y1 at low pH)
3. Correlated noise structure (adjacent points)
4. X-value uncertainty/shift simulation

**Outputs:**
- Updated `make_dataset()` function
- Validation plots comparing synthetic vs real residual patterns

**Next:** Test fitting methods in `03_fitting_method_comparison.ipynb`


### Tips for development vs tutorial hygiene:
---
- Keep a scratch notebook (e.g., `prtecan_devel.ipynb`) for experiments.
- Avoid `os.chdir`; use Path objects relative to repository root as in this notebook.
- When a feature stabilizes, port minimal, clear examples into the main tutorial and keep heavy testing in `tests/`.

## Setup

In [None]:
# Magic commands for development: load IPython's autoreload extension and use
# mode 2, which reloads all modules (except those excluded via %aimport)
# before each cell runs, so edits to the package source are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy import stats

from clophfit import prtecan
from clophfit.fitting.core import (
    fit_binding_glob_recursive_outlier,
    fit_binding_glob_reweighted,
    outlier2,
)
from clophfit.fitting.odr import (
    fit_binding_odr,
)

# Configure notebook: draw matplotlib figures inline in the cell output and
# apply matplotlib's "seaborn-v0_8" style sheet to the plots created below.
%matplotlib inline
plt.style.use("seaborn-v0_8")

# Test-data folders, kept as relative Path objects (no os.chdir needed).
data_root = Path("tests") / "Tecan"
l0_dir, l1_dir, l2_dir, l4_dir = (
    data_root / subdir for subdir in ("140220", "L1", "L2", "L4")
)

In [None]:
def load_titration(folder, bg_mth="meansd"):
    """Build a pH titration from the files in *folder*.

    Parameters
    ----------
    folder : pathlib.Path
        Directory providing ``list.pH.csv``, ``additions.pH`` and
        ``scheme.txt``.
    bg_mth : str
        Value stored in ``params.bg_mth`` (default ``"meansd"``).

    Returns
    -------
    prtecan.Titration
        The titration with additions and scheme loaded and
        ``params.bg_adj`` set to True.
    """
    titration = prtecan.Titration.fromlistfile(folder / "list.pH.csv", is_ph=1)
    titration.load_additions(folder / "additions.pH")
    titration.load_scheme(folder / "scheme.txt")
    titration.params.bg_mth = bg_mth
    titration.params.bg_adj = True
    return titration


# The loader has its own name so that binding its result to `tit` no longer
# overwrites the function; this line can be re-run on its own.
tit = load_titration(l2_dir)
tit.bg_err

In [None]:
# Import noise model utilities
import sys

# Put the current working directory first on the import path.
# NOTE(review): presumably this is what lets the later
# `benchmarks.compare_fitting_methods` import resolve when the notebook is
# run from the repository root — confirm.
sys.path.insert(0, ".")


# pandas reads the noise-parameter CSV files in the next code cell.
import pandas as pd

## Load Noise Parameters

Load characterized noise parameters from notebook 01.

In [None]:
# Load noise parameters
from pathlib import Path

noise_dir = Path("dev")


def load_optional_csv(path):
    """Read a noise-parameter table, tolerating a missing file.

    Parameters
    ----------
    path : pathlib.Path
        CSV file whose first column is used as the row index.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table, or ``None`` when *path* does not exist.
    """
    return pd.read_csv(path, index_col=0) if path.exists() else None


# Load covariance matrices
cov_y1 = load_optional_csv(noise_dir / "cov_matrix_y1.csv")
cov_y2 = load_optional_csv(noise_dir / "cov_matrix_y2.csv")

# Load bias parameters
label_bias = load_optional_csv(noise_dir / "label_bias.csv")

print("Loaded noise parameters:")
for description, table in (
    ("Covariance y1", cov_y1),
    ("Covariance y2", cov_y2),
    ("Label bias", label_bias),
):
    if table is None:
        # Report absent inputs explicitly instead of silently skipping them,
        # so a missing file is noticed before later cells depend on it.
        print(f"  - {description}: not found in {noise_dir}")
    else:
        print(f"  - {description}: {table.shape}")
if label_bias is not None:
    print(label_bias)

## Synthetic dataset

In [None]:
from clophfit.testing.synthetic import (
    _sample_correlated_signals,
    _sample_from_real,
    make_dataset,
)

In [None]:
from benchmarks.compare_fitting_methods import generate_synthetic_data

In [None]:
# Generate one synthetic dataset with pKa=6.7 and add_outliers=True, then
# plot it.
ds = generate_synthetic_data(pKa=6.7, add_outliers=True)
g = ds.plot()

In [None]:
# Ratio y_err / y for the "y1" entry of the dataset (displayed as cell output).
ds["y1"].y_err / ds["y1"].y

In [None]:
# Generate one dataset with the "realistic" error model and per-label
# rel_error, plot it, print the second return value (`truth`), and show the
# figure of the outlier2 fit.
ds, truth = make_dataset(
    7, randomize_signals=1, error_model="realistic", rel_error={"y1": 0.100, "y2": 0.2}
)  # NOTE(review): error_model presumably accepts uniform / simple / realistic / physics — confirm
g = ds.plot()
print(truth)
fr = outlier2(ds)
fr.figure

In [None]:
# Same generator call as above, this time fitted by outlier2 with
# error_model="uniform"; the fit figure is the cell output.
ds = generate_synthetic_data(pKa=6.7, add_outliers=True)
fr = outlier2(ds, error_model="uniform")
fr.figure

In [None]:
# Single-label (n_labels=1) dataset with the "realistic" error model. The
# second return value is bound to `truth`, the name the other cells use
# (it was previously misspelled, which left `truth` stale).
ds, truth = make_dataset(
    7.0, 2000, 200, is_ph=True, n_labels=1, error_model="realistic"
)
fr = outlier2(ds)
# fr.figure
# Fit residual divided by the y0 signal, plotted against x.
plt.plot(ds["y0"].x, fr.result.residual / ds["y0"].y, "o-")

In [None]:
# Displays two arrays side by side:
#   1. y_err / y of the "y0" entry in `ds`;
#   2. the fit residual multiplied by the same y_err / y ratio, taken from the
#      dataset stored on the fit result (`fr.dataset`).
(
    ds["y0"].y_err / ds["y0"].y,
    fr.result.residual * fr.dataset["y0"].y_err / fr.dataset["y0"].y,
)

In [None]:
# Repeat generate-then-fit 99 times, collecting the fitted "K" value and its
# `stderr` from each run, then show the histogram (with KDE) of the K values.
Ks = []
sKs = []
for _i in range(99):
    ds = generate_synthetic_data(7.1, add_outliers=True)
    fr = outlier2(ds, error_model="uniform")
    Ks.append(fr.result.params["K"].value)
    sKs.append(fr.result.params["K"].stderr)
sns.histplot(Ks, kde=True)

In [None]:
import pandas as pd

# Tabulate the fitted K values together with their stderr.
df = pd.DataFrame({"K": Ks, "K_err": sKs})
# Keep only fits whose K_err is below 0.5; a NaN K_err compares False and is
# dropped as well.
df = df[df.K_err < 0.5]

# K with error bars, indexed by position in the filtered table.
plt.errorbar(x=range(len(df.K)), y=df.K, yerr=df.K_err)

In [None]:
import matplotlib.pyplot as plt

# Two panels sharing the K axis (y): the per-fit estimates with error bars on
# the left, and their distribution as a horizontal histogram with a KDE
# overlay on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6), sharey=True, width_ratios=[3, 1])

# Left panel
ax1.errorbar(
    x=range(len(df.K)),
    y=df.K,
    yerr=df.K_err,
    fmt="o",
    capsize=3,
    alpha=0.7,
    # Plus-minus sign written as an escape so the label survives encoding
    # round trips (it had been corrupted into mojibake).
    label="K \u00b1 error",
)
ax1.axhline(
    y=df.K.mean(),
    color="r",
    linestyle="--",
    alpha=0.7,
    label=f"Mean: {df.K.mean():.2f}",
)
ax1.set_xlabel("Index")
ax1.set_ylabel("K")
ax1.set_title(f"K values (n={len(df.K)})")
ax1.legend()
ax1.grid(alpha=0.3)

# Right panel with both histogram and KDE
ax2_hist = ax2
# The histogram is horizontal, so K runs along y and counts along x. twiny()
# keeps that K axis shared and adds an independent x-axis (on top) for the
# scaled density; twinx() would instead give the KDE its own y-scale and an
# invisible x-axis, misaligning the curve and hiding its label.
ax2_kde = ax2.twiny()

# Histogram
n, bins, patches = ax2_hist.hist(
    df.K, bins=30, orientation="horizontal", alpha=0.3, edgecolor="black", density=False
)

# KDE
kde = stats.gaussian_kde(df.K)
x_kde = np.linspace(df.K.min(), df.K.max(), 200)
y_kde = kde(x_kde)
# Scale KDE to match histogram visually: its peak equals the tallest bin count
scale_factor = n.max() / y_kde.max()
ax2_kde.plot(y_kde * scale_factor, x_kde, "r-", linewidth=2, alpha=0.7, label="KDE")

ax2_hist.set_xlabel("Frequency")
ax2_kde.set_xlabel("Density (scaled)", color="r")
ax2_hist.set_title("Distribution")
ax2_hist.tick_params(axis="x")
ax2_kde.tick_params(axis="x", labelcolor="r")
ax2_hist.grid(alpha=0.3)

# Add statistics text box
stats_text = f"Mean: {df.K.mean():.3f}\nStd: {df.K.std():.3f}\nMin: {df.K.min():.3f}\nMax: {df.K.max():.3f}"
ax2_hist.text(
    0.7,
    0.95,
    stats_text,
    transform=ax2_hist.transAxes,
    fontsize=10,
    verticalalignment="top",
    bbox={"boxstyle": "round", "facecolor": "wheat", "alpha": 0.8},
)

plt.tight_layout()

In [None]:
# Mean and median over every fitted K (the raw `Ks` list, not the
# K_err-filtered `df`).
np.mean(Ks), np.median(Ks)

In [None]:
# Seed None: the generator is seeded from fresh OS entropy, so the draws
# below differ from run to run.
rng = np.random.default_rng(None)

# NOTE(review): presumably draws a "K" value based on real data — confirm in
# clophfit.testing.synthetic.
_sample_from_real(rng, "K")

In [None]:
# Call the private sampler with the generator created above and display what
# it returns.
# NOTE(review): presumably a set of correlated signal values — confirm in
# clophfit.testing.synthetic.
_sample_correlated_signals(rng)

In [None]:
from functools import partial

# Per-label relative errors handed to every generated dataset.
rel_error = dict(y1=0.04, y2=0.01)

# Keyword defaults frozen into the dataset factory; an individual call can
# still override any of them by passing the keyword again.
ds_defaults = dict(
    randomize_signals=True,
    rel_error=rel_error,
    min_error=1,
    low_ph_drop=False,
    x_error_large=0.0,
)
make_ds = partial(make_dataset, **ds_defaults)

In [None]:
from collections import defaultdict

# Per-method lists of fitted K values, filled by the comparison loop below.
# The accumulator is reset only when this cell runs, so re-running the loop
# cell alone appends to the existing lists.
values = defaultdict(list)

In [None]:
# Alternative generator settings kept for reference:
# ds, truth = make_dataset(6.8, randomize_signals=True, error_model="physics", noise=.01, rel_error=rel_error, outlier_prob=.1, outlier_sigma=4)
# ds, truth = make_dataset(6.8, randomize_signals=True, rel_error=rel_error, min_error=1, low_ph_drop=True, low_ph_drop_magnitude=.25, low_ph_drop6_prob=.0, x_error_large=0.0, seed=1)
# Active choice: the `make_ds` partial with its frozen keyword defaults.
ds, truth = make_ds(6.8)
g = ds.plot()

# Fit with outlier2 under error_model="uniform" and show the fit figure.
fr = outlier2(ds, error_model="uniform")
fr.figure

In [None]:
# Fitting routines under comparison, in the order they are applied to each
# dataset; the label is the key used in `values`.
# (fit_binding_pymc2 is commented out of the comparison.)
fitters = (
    ("reweighted", lambda dataset: fit_binding_glob_reweighted(dataset, "")),
    ("recursive_outlier", fit_binding_glob_recursive_outlier),
    ("outlier", outlier2),
    ("odr", fit_binding_odr),
)

# 33 fresh datasets (min_error overridden to 0.1), each fitted by every
# method; only the fitted K value is recorded.
for _i in range(33):
    ds, truth = make_ds(7.2, min_error=0.1)
    for method, fit in fitters:
        fr = fit(ds)
        values[method].append(fr.result.params["K"].value)

# Median and mean of K per method.
for key, estimates in values.items():
    print(key, np.median(estimates), np.mean(estimates))

sns.histplot(values, kde=True)

In [None]:
# Overlay the individual K estimates (strip plot) and a strongly desaturated
# box plot per method on the same axes.
sns.stripplot(values)
sns.boxplot(values, saturation=0.01)