# Mondrian CP for WCP-L2D

Class-conditional (Mondrian) conformal prediction to address the
prevalence shift problem in binary CP deferral.

**Key idea**: Instead of one quantile threshold for all classes,
compute separate thresholds for class 0 (negative) and class 1 (positive).
This naturally handles the massive prevalence shift between CheXpert and NIH,
where standard CP produces overly conservative prediction sets (~95% deferral)
and WCP has inconsistent behavior across pathologies.

In [None]:
import math

import numpy as np
import torch
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from torchcp.classification.score import RAPS

from wcp_l2d.features import ExtractedFeatures
from wcp_l2d.pathologies import COMMON_PATHOLOGIES
from wcp_l2d.label_utils import extract_binary_labels
from wcp_l2d.dre import AdaptiveDRE
from wcp_l2d.conformal import ConformalPredictor, WeightedConformalPredictor
from wcp_l2d.evaluation import (
    compute_coverage,
    compute_system_accuracy,
    _predictions_from_sets,
    DeferralResult,
)

# Experiment-wide configuration.
SEED = 42  # reused as the random_state / seed of every stochastic step below
EXPERT_ACCURACY = 0.85  # forwarded as `expert_accuracy` to compute_system_accuracy
TARGET_PATHOLOGY = "Effusion"  # pathology studied in the single-pathology sections
FEATURE_DIR = Path("..") / "data" / "features"  # location of the extracted-feature files

# Seed both global RNGs so that reruns of the notebook are reproducible.
torch.manual_seed(SEED)
np.random.seed(SEED)
print("Setup complete.")

## 1. Data Preparation

Load features, extract binary labels, split data, train classifier, fit DRE.
Same pipeline as wcp_experiment.ipynb.

In [None]:
# Load the pre-extracted features for the source (CheXpert) and target (NIH) datasets.
chexpert_path = FEATURE_DIR / "chexpert_densenet121-res224-chex_features.npz"
nih_path = FEATURE_DIR / "nih_densenet121-res224-chex_features.npz"
chexpert = ExtractedFeatures.load(chexpert_path)
nih = ExtractedFeatures.load(nih_path)

# Reduce each dataset to a binary task for TARGET_PATHOLOGY.
# The third return value of extract_binary_labels is not needed here.
chex_feats, chex_labels, _ = extract_binary_labels(
    chexpert.features, chexpert.labels, COMMON_PATHOLOGIES, TARGET_PATHOLOGY
)
nih_feats, nih_labels, _ = extract_binary_labels(
    nih.features, nih.labels, COMMON_PATHOLOGIES, TARGET_PATHOLOGY
)

# CheXpert: 60% train; the held-out 40% is then halved into calibration and
# test. Both splits are stratified on the binary label.
chex_train_feats, chex_temp_feats, chex_train_labels, chex_temp_labels = (
    train_test_split(
        chex_feats,
        chex_labels,
        test_size=0.4,
        stratify=chex_labels,
        random_state=SEED,
    )
)
chex_cal_feats, chex_test_feats, chex_cal_labels, chex_test_labels = train_test_split(
    chex_temp_feats,
    chex_temp_labels,
    test_size=0.5,
    stratify=chex_temp_labels,
    random_state=SEED,
)

# NIH pool: a random half of ALL NIH feature rows, used without labels for DRE.
# NOTE(review): the pool is sampled from `nih.features`, whereas the test half
# below is sampled from `nih_feats`; the two selections are independent, so the
# pool may contain test samples -- confirm this is intended.
rng = np.random.RandomState(SEED)
nih_all_perm = rng.permutation(len(nih.features))
n_pool = len(nih.features) // 2
nih_pool_feats_all = nih.features[nih_all_perm[:n_pool]]

# NIH test: the second half of a stratified 50/50 split; the first half is discarded.
_, nih_test_feats, _, nih_test_labels = train_test_split(
    nih_feats, nih_labels, test_size=0.5, stratify=nih_labels, random_state=SEED
)

print(f"CheXpert cal: {len(chex_cal_labels)} (prev={chex_cal_labels.mean():.3f})")
print(f"NIH test:     {len(nih_test_labels)} (prev={nih_test_labels.mean():.3f})")
print(f"Prevalence ratio: {chex_cal_labels.mean() / nih_test_labels.mean():.1f}x")

In [None]:
# Standardise with statistics from the CheXpert training split only, then
# apply that same transform to every split that is scored later.
scaler = StandardScaler().fit(chex_train_feats)
X_train, X_cal, X_test_chex, X_test_nih = (
    scaler.transform(split)
    for split in (chex_train_feats, chex_cal_feats, chex_test_feats, nih_test_feats)
)

# Binary logistic-regression classifier trained on the CheXpert training split.
clf = LogisticRegression(
    C=1.0, solver="lbfgs", max_iter=1000, random_state=SEED
).fit(X_train, chex_train_labels)


def get_binary_logits(clf, X):
    """Return two-column pseudo-logits ``[-d, d]`` for a binary classifier.

    ``d`` is ``clf.decision_function(X)``; column 0 holds ``-d`` (negative
    class) and column 1 holds ``d`` (positive class).

    NOTE(review): a softmax over ``[-d, d]`` equals ``sigmoid(2 * d)`` for the
    positive class, which is sharper than ``sigmoid(d)``. If the downstream
    score function applies a softmax to these logits, its probabilities will
    not match ``clf.predict_proba`` -- confirm this is intended.
    """
    margin = clf.decision_function(X)
    return np.c_[-margin, margin]


# Two-column logits for every split that is scored by the conformal predictors.
cal_logits, test_chex_logits, test_nih_logits = (
    get_binary_logits(clf, split) for split in (X_cal, X_test_chex, X_test_nih)
)

# Density-ratio estimation between CheXpert calibration features and the NIH
# pool. Note that the DRE is given the raw feature arrays, not the
# standardised ones used by the classifier.
dre = AdaptiveDRE(n_components=4, weight_clip=20.0, random_state=SEED)
dre.fit(chex_cal_feats, nih_pool_feats_all)
cal_weights, test_nih_weights = (
    dre.compute_weights(feats) for feats in (chex_cal_feats, nih_test_feats)
)

# Sanity checks: classifier AUC on the NIH test split and the DRE's ESS diagnostics.
diag = dre.diagnostics(chex_cal_feats)
print(f"Classifier NIH AUC: {roc_auc_score(nih_test_labels, clf.predict_proba(X_test_nih)[:, 1]):.4f}")
print(f"DRE ESS: {diag.ess:.0f}/{len(cal_weights)} = {diag.ess_fraction:.3f}")

## 2. Mondrian Conformal Prediction

Standard CP uses a single quantile threshold for all classes:
$$C(x) = \{y : s(x,y) \leq \hat{q}\}$$

Mondrian CP computes **class-conditional thresholds**:
$$C(x) = \{y : s(x,y) \leq \hat{q}_y\}$$

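where $\hat{q}_y$ is the conformal quantile computed only from calibration samples with true label $y$.
This gives **class-conditional coverage**: $P(Y \in C(X) \mid Y=y) \geq 1-\alpha$ for each $y$, provided that test samples of class $y$ are exchangeable with the calibration samples of that class. Because each guarantee is conditional on the label, it does not depend on the class prevalence, which is why a pure prevalence (label) shift between CheXpert and NIH leaves it intact.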

In [None]:
class MondrianConformalPredictor:
    """Class-conditional (Mondrian) conformal prediction using RAPS.

    Calibrates a separate quantile threshold for every class that occurs in
    the calibration labels, providing class-conditional coverage guarantees.

    Attributes:
        score_fn: RAPS non-conformity score. It is called as
            ``score_fn(logits, labels)`` for true-label scores and as
            ``score_fn(logits)`` for the scores of every class.
        q_hats: ``{class_id: threshold}`` from the most recent calibration.
    """

    def __init__(self, penalty=0.1, kreg=1, randomized=False):
        self.score_fn = RAPS(penalty=penalty, kreg=kreg, randomized=randomized)
        self.q_hats = {}  # {class_id: q_hat}

    def calibrate(self, logits, labels, alpha=0.1):
        """Calibrate with class-conditional quantiles.

        For each class ``c`` with ``n`` calibration samples, the threshold is
        the ``ceil((n + 1) * (1 - alpha))``-th smallest true-label score of
        that class, or ``inf`` when that rank exceeds ``n``.

        Args:
            logits: Calibration logits, one row per sample.
            labels: Integer class labels aligned with ``logits``.
            alpha: Target miscoverage level in ``[0, 1)``.

        Returns:
            The freshly built ``{class_id: threshold}`` dict, which is also
            stored as ``self.q_hats``.

        Raises:
            ValueError: If ``alpha`` is outside ``[0, 1)``. For such values
                the rank ``k`` would be <= 0 and ``class_scores[k - 1]`` would
                silently wrap around to the largest score.
        """
        if not 0.0 <= alpha < 1.0:
            raise ValueError(f"alpha must lie in [0, 1), got {alpha!r}")

        labels = np.asarray(labels)
        logits_t = torch.tensor(logits, dtype=torch.float32)
        labels_t = torch.tensor(labels, dtype=torch.long)
        scores = self.score_fn(logits_t, labels_t).numpy()

        # Build a fresh dict so that thresholds of classes that are absent from
        # this calibration set do not survive from an earlier calibrate() call.
        q_hats = {}
        for c in np.unique(labels):
            class_scores = np.sort(scores[labels == c])
            n = len(class_scores)
            k = math.ceil((n + 1) * (1 - alpha))
            q_hats[int(c)] = float(class_scores[k - 1]) if k <= n else float("inf")

        self.q_hats = q_hats
        return self.q_hats

    def predict(self, logits):
        """Generate prediction sets with per-class thresholds.

        Args:
            logits: Test logits, one row per sample.

        Returns:
            An ``int32`` array with the same shape as the score matrix, where
            entry ``[i, c]`` is 1 iff the score of class ``c`` for sample ``i``
            is at most ``q_hats[c]``. Classes without a calibrated threshold
            are never included.
        """
        logits_t = torch.tensor(logits, dtype=torch.float32)
        all_scores = self.score_fn(logits_t).numpy()  # [N, K]

        N, K = all_scores.shape
        prediction_sets = np.zeros((N, K), dtype=np.int32)
        for c in range(K):
            if c in self.q_hats:
                prediction_sets[:, c] = (all_scores[:, c] <= self.q_hats[c]).astype(
                    np.int32
                )
        return prediction_sets


class MondrianWeightedConformalPredictor:
    """Class-conditional weighted conformal prediction.

    Combines Mondrian (per-class) calibration with importance-weighted
    quantiles from DRE, addressing both covariate and prevalence shift.

    Attributes:
        score_fn: RAPS non-conformity score. It is called as
            ``score_fn(logits, labels)`` for true-label scores and as
            ``score_fn(logits)`` for the scores of every class.
        cal_data: ``{class_id: {"scores": ..., "weights": ...}}`` from the most
            recent calibration, both arrays sorted by ascending score.
    """

    def __init__(self, penalty=0.1, kreg=1, randomized=False):
        self.score_fn = RAPS(penalty=penalty, kreg=kreg, randomized=randomized)
        self.cal_data = {}  # {class_id: {scores, weights}}

    def calibrate(self, logits, labels, weights):
        """Calibrate with per-class weighted scores.

        Args:
            logits: Calibration logits, one row per sample.
            labels: Integer class labels aligned with ``logits``.
            weights: Non-negative importance weights aligned with ``labels``.

        Raises:
            ValueError: If ``weights`` and ``labels`` differ in shape, or if
                any weight is negative.
        """
        labels = np.asarray(labels)
        weights = np.asarray(weights, dtype=np.float64)
        if weights.shape != labels.shape:
            raise ValueError(
                f"weights shape {weights.shape} does not match labels shape {labels.shape}"
            )
        if np.any(weights < 0):
            raise ValueError("importance weights must be non-negative")

        logits_t = torch.tensor(logits, dtype=torch.float32)
        labels_t = torch.tensor(labels, dtype=torch.long)
        scores = self.score_fn(logits_t, labels_t).numpy()

        # Build a fresh dict so that classes missing from this calibration set
        # do not keep data from an earlier calibrate() call.
        cal_data = {}
        for c in np.unique(labels):
            mask = labels == c
            class_scores = scores[mask]
            class_weights = weights[mask]
            sort_idx = np.argsort(class_scores)
            cal_data[int(c)] = {
                "scores": class_scores[sort_idx],
                "weights": class_weights[sort_idx],
            }
        self.cal_data = cal_data

    def predict(self, logits, test_weights, alpha=0.1):
        """Generate prediction sets with per-class weighted quantiles.

        For class ``c`` and test point ``i`` the threshold is the smallest
        calibration score whose cumulative calibration weight reaches
        ``(1 - alpha) * (sum of class-c calibration weights + test_weights[i])``,
        or ``inf`` if no calibration score reaches it. The test point's own
        weight is the mass placed at ``+inf``.

        The thresholds come from one cumulative sum per class plus a binary
        search per test point, instead of a dense ``[N, n_cal + 1]`` matrix of
        normalised weights. The outcome is the same up to floating-point
        rounding at exact ties.

        Args:
            logits: Test logits, one row per sample.
            test_weights: Importance weights of the test samples, shape ``[N]``.
            alpha: Target miscoverage level in ``[0, 1)``.

        Returns:
            An ``int32`` array with the same shape as the score matrix;
            columns of classes without calibration data stay 0.

        Raises:
            ValueError: If ``alpha`` is outside ``[0, 1)``.
        """
        if not 0.0 <= alpha < 1.0:
            raise ValueError(f"alpha must lie in [0, 1), got {alpha!r}")

        logits_t = torch.tensor(logits, dtype=torch.float32)
        all_scores = self.score_fn(logits_t).numpy()  # [N, K]
        test_w = np.asarray(test_weights, dtype=np.float64)  # [N]

        N, K = all_scores.shape
        prediction_sets = np.zeros((N, K), dtype=np.int32)
        target = 1 - alpha

        for c in range(K):
            if c not in self.cal_data:
                continue

            cal_scores = self.cal_data[c]["scores"]
            n_cal = len(cal_scores)
            # Non-decreasing because calibrate() rejects negative weights.
            cum_w = np.cumsum(self.cal_data[c]["weights"], dtype=np.float64)

            total_w = cum_w[-1] + test_w  # calibration mass + mass at +inf
            needed = target * total_w
            first_idx = np.searchsorted(cum_w, needed, side="left")

            # No finite quantile when the target mass is never reached, or when
            # the total weight is zero or NaN so the distribution is undefined.
            has_any = (first_idx < n_cal) & (total_w > 0)
            safe_idx = np.minimum(first_idx, n_cal - 1)
            q_hat = np.where(has_any, cal_scores[safe_idx], np.inf)
            prediction_sets[:, c] = (all_scores[:, c] <= q_hat).astype(np.int32)

        return prediction_sets


print("Mondrian predictors defined.")

## 3. Evaluate All Methods on Target Pathology

In [None]:
def evaluate_cp_method(method_name, predictor_factory, alphas, expert_accuracy=EXPERT_ACCURACY):
    """Sweep one conformal method over ``alphas`` on the NIH test split.

    The NIH test labels and logits are read from the notebook globals
    ``nih_test_labels`` and ``test_nih_logits``.

    Args:
        method_name: Label stored in every returned ``DeferralResult``.
        predictor_factory: Callable mapping ``alpha`` to the prediction sets
            for the NIH test split.
        alphas: Iterable of miscoverage levels to evaluate.
        expert_accuracy: Forwarded to ``compute_system_accuracy``.

    Returns:
        A list with one ``DeferralResult`` per entry of ``alphas``, in order.
    """

    def _evaluate_at(alpha):
        pred_sets = predictor_factory(alpha)
        coverage = compute_coverage(pred_sets, nih_test_labels)
        preds, defer_mask = _predictions_from_sets(pred_sets, test_nih_logits)
        outcome = compute_system_accuracy(
            preds, nih_test_labels, defer_mask, expert_accuracy=expert_accuracy
        )
        return DeferralResult(
            method=method_name,
            alpha_or_threshold=float(alpha),
            system_accuracy=outcome["system_accuracy"],
            deferral_rate=outcome["deferral_rate"],
            coverage_rate=coverage["coverage_rate"],
            average_set_size=coverage["average_set_size"],
            model_accuracy_on_kept=outcome["model_accuracy_on_kept"],
            n_total=len(nih_test_labels),
            n_deferred=outcome["n_deferred"],
        )

    return [_evaluate_at(alpha) for alpha in alphas]


# Miscoverage levels swept for every method.
alphas = np.linspace(0.01, 0.5, 50)

# RAPS settings shared by all four predictors so the comparison is like-for-like.
_RAPS_KWARGS = {"penalty": 0.1, "kreg": 1, "randomized": False}


def std_cp_factory(alpha):
    """Return NIH prediction sets from standard CP calibrated at ``alpha``."""
    predictor = ConformalPredictor(**_RAPS_KWARGS)
    predictor.calibrate(cal_logits, chex_cal_labels, alpha=alpha)
    return predictor.predict(test_nih_logits)


def wcp_factory(alpha):
    """Return NIH prediction sets from weighted CP evaluated at ``alpha``."""
    predictor = WeightedConformalPredictor(**_RAPS_KWARGS)
    predictor.calibrate(cal_logits, chex_cal_labels, cal_weights)
    return predictor.predict(test_nih_logits, test_nih_weights, alpha=alpha)


def mondrian_cp_factory(alpha):
    """Return NIH prediction sets from Mondrian CP calibrated at ``alpha``."""
    predictor = MondrianConformalPredictor(**_RAPS_KWARGS)
    predictor.calibrate(cal_logits, chex_cal_labels, alpha=alpha)
    return predictor.predict(test_nih_logits)


def mondrian_wcp_factory(alpha):
    """Return NIH prediction sets from Mondrian weighted CP evaluated at ``alpha``."""
    predictor = MondrianWeightedConformalPredictor(**_RAPS_KWARGS)
    predictor.calibrate(cal_logits, chex_cal_labels, cal_weights)
    return predictor.predict(test_nih_logits, test_nih_weights, alpha=alpha)


# Run the full alpha sweep for each method.
std_results = evaluate_cp_method("Standard CP", std_cp_factory, alphas)
wcp_results = evaluate_cp_method("WCP", wcp_factory, alphas)
mcp_results = evaluate_cp_method("Mondrian CP", mondrian_cp_factory, alphas)
mwcp_results = evaluate_cp_method("Mondrian WCP", mondrian_wcp_factory, alphas)

print("All methods evaluated.")

In [None]:
# Summary table at the sweep point closest to alpha=0.1
alpha_target = 0.1


def _summary_row(name, res_list):
    """Format the result whose alpha is nearest to ``alpha_target`` (first one on ties)."""
    r = min(res_list, key=lambda item: abs(item.alpha_or_threshold - alpha_target))
    return {
        "Method": name,
        "Coverage": f"{r.coverage_rate:.4f}",
        "Avg Set Size": f"{r.average_set_size:.3f}",
        "Deferral Rate": f"{r.deferral_rate:.4f}",
        "System Acc": f"{r.system_accuracy:.4f}",
        "Model Acc (kept)": f"{r.model_accuracy_on_kept:.4f}",
    }


rows = [
    _summary_row(name, res_list)
    for name, res_list in (
        ("Standard CP", std_results),
        ("WCP", wcp_results),
        ("Mondrian CP", mcp_results),
        ("Mondrian WCP", mwcp_results),
    )
]

df = pd.DataFrame(rows)
print(f"\n{TARGET_PATHOLOGY} — Results at alpha={alpha_target} (target coverage >= {1-alpha_target:.0%})")
print("=" * 90)
print(df.to_string(index=False))

## 4. Class-Conditional Coverage Analysis

Examine coverage separately for positive and negative samples to understand
how each method handles the prevalence shift.

In [None]:
def class_conditional_analysis(pred_sets, logits, labels, method_name, alpha=0.1):
    """Summarise coverage, deferral and set size per true class and overall.

    A sample counts as covered when its true label is in its prediction set,
    and as deferred when its prediction set does not hold exactly one label.

    Args:
        pred_sets: 0/1 array of shape ``[N, K]``; ``pred_sets[i, c]`` marks
            class ``c`` as a member of sample ``i``'s prediction set.
        logits: Unused; kept for call-site compatibility.
        labels: Integer true labels of shape ``[N]`` (0 = negative, 1 = positive).
        method_name: Value written to the ``"Method"`` field of every row.
        alpha: Unused; kept for call-site compatibility.

    Returns:
        Three dict rows, in the order Negative, Positive, Overall.
    """
    sizes = pred_sets.sum(axis=1)
    hit = pred_sets[np.arange(len(labels)), labels].astype(bool)
    abstain = sizes != 1

    def _summarise(group_name, count, selector):
        return {
            "Method": method_name,
            "Class": group_name,
            "N": count,
            "Coverage": f"{hit[selector].mean():.4f}",
            "Deferral": f"{abstain[selector].mean():.4f}",
            "Avg |C|": f"{sizes[selector].mean():.3f}",
        }

    table = []
    for class_id, class_name in ((0, "Negative"), (1, "Positive")):
        members = labels == class_id
        table.append(_summarise(class_name, members.sum(), members))
    table.append(_summarise("Overall", len(labels), slice(None)))
    return table


alpha_target = 0.1

# (display name, predictor class, whether it consumes importance weights)
method_specs = (
    ("Standard CP", ConformalPredictor, False),
    ("WCP", WeightedConformalPredictor, True),
    ("Mondrian CP", MondrianConformalPredictor, False),
    ("Mondrian WCP", MondrianWeightedConformalPredictor, True),
)

all_rows = []
for name, PredClass, use_weights in method_specs:
    pred = PredClass(penalty=0.1, kreg=1, randomized=False)
    if not use_weights:
        # Unweighted predictors take alpha at calibration time.
        pred.calibrate(cal_logits, chex_cal_labels, alpha=alpha_target)
        ps = pred.predict(test_nih_logits)
    else:
        # Weighted predictors take alpha at prediction time.
        pred.calibrate(cal_logits, chex_cal_labels, cal_weights)
        ps = pred.predict(test_nih_logits, test_nih_weights, alpha=alpha_target)
    all_rows += class_conditional_analysis(ps, test_nih_logits, nih_test_labels, name)

df_cc = pd.DataFrame(all_rows)
print(f"\nClass-conditional analysis for {TARGET_PATHOLOGY} at alpha={alpha_target}")
print("=" * 80)
print(df_cc.to_string(index=False))

## 5. Mondrian Quantile Inspection

Compare the quantile thresholds computed by each method to understand
why prediction sets differ.

In [None]:
alpha_target = 0.1

# Single pooled threshold from standard CP.
std_cp = ConformalPredictor(penalty=0.1, kreg=1, randomized=False)
std_q = std_cp.calibrate(cal_logits, chex_cal_labels, alpha=alpha_target)

# One threshold per class from Mondrian CP.
mon_cp = MondrianConformalPredictor(penalty=0.1, kreg=1, randomized=False)
mon_qs = mon_cp.calibrate(cal_logits, chex_cal_labels, alpha=alpha_target)

print(f"Quantile thresholds at alpha={alpha_target}:")
print(f"  Standard CP:  q_hat = {std_q:.6f} (single threshold for all classes)")
print(f"  Mondrian CP:  q_hat_0 (neg) = {mon_qs[0]:.6f}")
print(f"                q_hat_1 (pos) = {mon_qs[1]:.6f}")
print()

# True-label RAPS scores of the calibration set, recomputed so that their
# distribution can be inspected per class.
cal_logits_t = torch.tensor(cal_logits, dtype=torch.float32)
cal_labels_t = torch.tensor(chex_cal_labels, dtype=torch.long)
score_fn = RAPS(penalty=0.1, kreg=1, randomized=False)
cal_scores = score_fn(cal_logits_t, cal_labels_t).numpy()

print("Calibration score statistics:")
for c, name in ((0, "Negative"), (1, "Positive")):
    s = cal_scores[chex_cal_labels == c]
    print(
        f"  Class {c} ({name}): n={len(s)}, "
        f"mean={s.mean():.4f}, std={s.std():.4f}, "
        f"median={np.median(s):.4f}, "
        f"p10={np.percentile(s, 10):.4f}, p90={np.percentile(s, 90):.4f}"
    )

In [None]:
# One histogram of calibration scores per true class, with the pooled and the
# class-specific thresholds overlaid.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
class_names = ("Negative", "Positive")

for c, ax in enumerate(axes):
    name = class_names[c]
    s = cal_scores[chex_cal_labels == c]
    ax.hist(s, bins=50, edgecolor="black", linewidth=0.3, alpha=0.7)
    # Red dashed: standard CP threshold; green dash-dot: Mondrian threshold of this class.
    ax.axvline(std_q, color="red", linestyle="--", linewidth=1.5, label=f"Std q={std_q:.3f}")
    ax.axvline(
        mon_qs[c], color="green", linestyle="-.", linewidth=1.5, label=f"Mon q_{c}={mon_qs[c]:.3f}"
    )
    ax.set(xlabel="RAPS Score", ylabel="Count", title=f"Cal Scores — {name} (n={len(s)})")
    ax.legend(fontsize=9)

plt.tight_layout()
plt.show()

## 6. Coverage vs Alpha Comparison

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

methods = {
    "Standard CP": std_results,
    "WCP": wcp_results,
    "Mondrian CP": mcp_results,
    "Mondrian WCP": mwcp_results,
}
colors = {
    "Standard CP": "#1f77b4",
    "WCP": "#ff7f0e",
    "Mondrian CP": "#2ca02c",
    "Mondrian WCP": "#d62728",
}


def _plot_metric(ax, metric):
    """Draw one curve per method: the given result attribute against alpha."""
    for name, res in methods.items():
        xs = [r.alpha_or_threshold for r in res]
        ys = [getattr(r, metric) for r in res]
        ax.plot(xs, ys, label=name, color=colors[name], linewidth=1.5, marker="o", markersize=2)


def _decorate(ax, ylabel, title):
    """Apply the axis labels, title, legend and grid shared by both panels."""
    ax.set_xlabel(r"$\alpha$")
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3)


# Left panel: empirical coverage, with the nominal 1 - alpha line for reference.
_plot_metric(axes[0], "coverage_rate")
axes[0].plot(alphas, 1 - alphas, "k--", alpha=0.5, linewidth=1.5, label=r"Ideal $1-\alpha$")
_decorate(axes[0], "Coverage", f"Coverage vs Alpha ({TARGET_PATHOLOGY}, NIH test)")

# Right panel: deferral rate.
_plot_metric(axes[1], "deferral_rate")
_decorate(axes[1], "Deferral Rate", f"Deferral Rate vs Alpha ({TARGET_PATHOLOGY}, NIH test)")

plt.tight_layout()
plt.show()

## 7. Multi-Pathology Comparison

In [None]:
alpha_target = 0.1
all_results = []

# (column prefix, predictor class, whether it consumes importance weights)
predictor_specs = (
    ("Std", ConformalPredictor, False),
    ("WCP", WeightedConformalPredictor, True),
    ("Mon", MondrianConformalPredictor, False),
    ("MonW", MondrianWeightedConformalPredictor, True),
)

# Repeat the single-pathology pipeline (labels -> splits -> classifier -> DRE
# -> four conformal methods) for every pathology shared by the two datasets.
for pathology in COMMON_PATHOLOGIES:
    # Binary labels for this pathology.
    c_feats, c_labels, _ = extract_binary_labels(
        chexpert.features, chexpert.labels, COMMON_PATHOLOGIES, pathology
    )
    n_feats, n_labels, _ = extract_binary_labels(
        nih.features, nih.labels, COMMON_PATHOLOGIES, pathology
    )

    # CheXpert train / calibration, and the NIH test half. All splits are
    # stratified and seeded with SEED.
    c_tr_f, c_tmp_f, c_tr_l, c_tmp_l = train_test_split(
        c_feats, c_labels, test_size=0.4, stratify=c_labels, random_state=SEED
    )
    c_cal_f, _, c_cal_l, _ = train_test_split(
        c_tmp_f, c_tmp_l, test_size=0.5, stratify=c_tmp_l, random_state=SEED
    )
    _, n_te_f, _, n_te_l = train_test_split(
        n_feats, n_labels, test_size=0.5, stratify=n_labels, random_state=SEED
    )

    # Classifier on standardised CheXpert training features.
    sc = StandardScaler()
    Xtr = sc.fit_transform(c_tr_f)
    Xcal = sc.transform(c_cal_f)
    Xte = sc.transform(n_te_f)

    model = LogisticRegression(solver="lbfgs", max_iter=1000, C=1.0, random_state=SEED)
    model.fit(Xtr, c_tr_l)
    nih_auc = roc_auc_score(n_te_l, model.predict_proba(Xte)[:, 1])

    # Same [-d, d] logit construction as in the single-pathology section.
    c_lg = get_binary_logits(model, Xcal)
    t_lg = get_binary_logits(model, Xte)

    # Density-ratio weights between this pathology's calibration set and the
    # shared NIH pool.
    pathology_dre = AdaptiveDRE(n_components=4, weight_clip=20.0, random_state=SEED)
    pathology_dre.fit(c_cal_f, nih_pool_feats_all)
    cw = pathology_dre.compute_weights(c_cal_f)
    tw = pathology_dre.compute_weights(n_te_f)

    row = {
        "Pathology": pathology,
        "NIH AUC": f"{nih_auc:.3f}",
        "NIH prev": f"{n_te_l.mean():.3f}",
        "Cal prev": f"{c_cal_l.mean():.3f}",
    }

    # Coverage and deferral of each method on the NIH test half.
    sample_idx = np.arange(len(n_te_l))
    for method_name, PredClass, use_weights in predictor_specs:
        pred = PredClass(penalty=0.1, kreg=1, randomized=False)
        if not use_weights:
            pred.calibrate(c_lg, c_cal_l, alpha=alpha_target)
            ps = pred.predict(t_lg)
        else:
            pred.calibrate(c_lg, c_cal_l, cw)
            ps = pred.predict(t_lg, tw, alpha=alpha_target)

        cov = ps[sample_idx, n_te_l].mean()  # true label is in the set
        defer = (ps.sum(axis=1) != 1).mean()  # set is not a singleton
        row[f"{method_name} Cov"] = f"{cov:.3f}"
        row[f"{method_name} Def"] = f"{defer:.3f}"

    all_results.append(row)
    print(
        f"{pathology:<16} AUC={nih_auc:.3f}  "
        f"Std={row['Std Cov']}/{row['Std Def']}  "
        f"WCP={row['WCP Cov']}/{row['WCP Def']}  "
        f"Mon={row['Mon Cov']}/{row['Mon Def']}  "
        f"MonW={row['MonW Cov']}/{row['MonW Def']}"
    )

df_all = pd.DataFrame(all_results)
print(f"\n{'=' * 120}")
print(f"Multi-pathology comparison at alpha={alpha_target} (Coverage/Deferral)")
print(f"{'=' * 120}")
print(df_all.to_string(index=False))

In [None]:
# Visualize: deferral rates and coverage across pathologies, one group of four
# bars (one per method) for each pathology.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

pathology_names = [r["Pathology"] for r in all_results]
x = np.arange(len(pathology_names))
width = 0.2

# (column prefix used in `all_results`, legend label)
method_labels = [
    ("Std", "Standard CP"),
    ("WCP", "WCP"),
    ("Mon", "Mondrian CP"),
    ("MonW", "Mondrian WCP"),
]

# The reference line follows alpha_target instead of a hard-coded 0.9, so it
# stays consistent with the panel titles when alpha_target changes.
target_coverage = 1 - alpha_target

for ax, suffix, ylabel in [(axes[0], "Def", "Deferral Rate"), (axes[1], "Cov", "Coverage")]:
    for i, (prefix, label) in enumerate(method_labels):
        # The table stores formatted strings; convert them back to numbers.
        vals = [float(r[f"{prefix} {suffix}"]) for r in all_results]
        ax.bar(x + i * width, vals, width, label=label)

    if suffix == "Cov":
        ax.axhline(
            y=target_coverage,
            color="red",
            linestyle="--",
            alpha=0.7,
            label=f"Target ({target_coverage:.0%})",
        )
        # Zoom in on the upper range while keeping the target line visible.
        ax.set_ylim(min(0.7, target_coverage - 0.05), 1.05)

    # Centre the tick under each group of four bars.
    ax.set_xticks(x + 1.5 * width)
    ax.set_xticklabels(pathology_names, rotation=45, ha="right")
    ax.set_ylabel(ylabel)
    ax.set_title(f"{ylabel} by Pathology (alpha={alpha_target})")
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3, axis="y")

plt.tight_layout()
plt.show()