In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.preprocessing import StandardScaler

In [5]:
# Per-dataset settings, keyed by dataset name. Each entry provides:
#   csv_path      - CSV file read by the loader
#   feature_cols  - columns used as clustering features
#   sensitive_col - column holding the protected attribute
#   k             - number of clusters
#   tag           - suffix used in the names of saved result files
DATASET_CONFIG = dict(
    adult=dict(
        csv_path="adult_preprocessed.csv",
        feature_cols=[
            "age", "final-weight", "education-num", "marital-status",
            "occupation", "relationship", "capital-gain",
            "hours-per-week", "native-country", "income",
        ],
        sensitive_col="sex",  # protected attribute
        k=6,
        tag="adult",
    ),
    bank=dict(
        csv_path="bank_preprocessed.csv",
        # TODO: change to the exact columns you use for Bank
        feature_cols=[
            "age", "job", "marital", "education", "balance",
            "housing", "loan", "campaign", "pdays", "previous",
        ],
        sensitive_col="sex",  # or whatever you use
        k=6,
        tag="bank",
    ),
    credit=dict(
        csv_path="credit_preprocessed.csv",
        # TODO: change to the exact columns you use for Credit
        feature_cols=[
            "LIMIT_BAL", "AGE", "PAY_0", "PAY_2", "PAY_3",
            "PAY_4", "PAY_5", "PAY_6", "BILL_AMT1", "BILL_AMT2",
        ],
        sensitive_col="sex",
        k=6,
        tag="credit",
    ),
    student=dict(
        csv_path="student_preprocessed.csv",
        # TODO: change to the exact columns you use for Student
        feature_cols=[
            "age", "Medu", "Fedu", "studytime", "failures",
            "famrel", "freetime", "goout", "G1", "G2",
        ],
        sensitive_col="sex",
        k=6,
        tag="student",
    ),
)



In [7]:
def load_dataset(dataset_name: str):
    """
    Read one configured dataset and split its standardized features by group.

    The entry DATASET_CONFIG[dataset_name] supplies the CSV path, the feature
    columns and the sensitive column. Features are standardized with a
    StandardScaler fitted on all rows; rows whose sensitive value is > 0 form
    group A and rows whose sensitive value is <= 0 form group B.

    Returns:
        X_scaled : standardized feature matrix (all rows)
        XA       : rows of X_scaled belonging to group A
        XB       : rows of X_scaled belonging to group B
        cfg      : the configuration entry that was used
    """
    config = DATASET_CONFIG[dataset_name]
    frame = pd.read_csv(config["csv_path"])

    raw_features = frame[config["feature_cols"]].values
    sensitive = frame[config["sensitive_col"]].values

    X_scaled = StandardScaler().fit_transform(raw_features)

    # Evaluate each group condition once and reuse it for slicing and counting.
    is_group_a = sensitive > 0
    is_group_b = sensitive <= 0
    XA = X_scaled[is_group_a]
    XB = X_scaled[is_group_b]

    print(f"[{dataset_name}] n = {len(X_scaled)}")
    print(f"[{dataset_name}] |A| = {is_group_a.sum()}, |B| = {is_group_b.sum()}")

    return X_scaled, XA, XB, config

In [9]:
def _group_masks_from_sets(X, XA, XB):
    """
    Given full dataset X and subsets XA, XB, 
    return boolean masks inA, inB such that:
    inA[i] = True if X[i] ∈ XA
    inB[i] = True if X[i] ∈ XB
    """
    Xv  = np.ascontiguousarray(X).view([('', X.dtype)] * X.shape[1]).ravel()
    XAv = np.ascontiguousarray(XA).view([('', XA.dtype)] * XA.shape[1]).ravel()
    XBv = np.ascontiguousarray(XB).view([('', XB.dtype)] * XB.shape[1]).ravel()
    inA = np.in1d(Xv, XAv)
    inB = np.in1d(Xv, XBv)
    return inA, inB


def _assign(X, C):
    """
    Assign each data point in X to its closest cluster center in C.
    Returns:
        a   : array of cluster indices for each point
        d2  : squared distances to all centers
    """
    d2 = np.sum((X[:, None, :] - C[None, :, :]) ** 2, axis=2)
    return np.argmin(d2, axis=1), d2


def _kmeans_grad(X, C, a):
    """
    Gradient of the standard k-means objective:
      1/N Σ ||x - μ_c||²
    Returns:
        grad, counts, means
    """
    K, D = C.shape
    N = len(X)
    sums = np.zeros_like(C)
    np.add.at(sums, a, X)
    counts = np.bincount(a, minlength=K).astype(np.float64).reshape(-1, 1)
    means = np.divide(sums, np.maximum(counts, 1.0), where=counts > 0)
    grad = 2.0 * counts * (C - means) / N
    return grad, counts, means


def _group_kmeans_grad(X, C, a, group_mask):
    """
    Group-wise k-means gradient:
      1/|G| Σ_{i∈G} ||x - μ_c||²
    for a given group G.
    Returns:
        grad, group_loss
    """
    Xg = X[group_mask]
    ag = a[group_mask]
    if len(Xg) == 0:
        return np.zeros_like(C), 0.0

    K, D = C.shape
    G = len(Xg)
    sums = np.zeros_like(C)
    np.add.at(sums, ag, Xg)
    counts = np.bincount(ag, minlength=K).astype(np.float64).reshape(-1, 1)
    means = np.divide(sums, np.maximum(counts, 1.0), where=counts > 0)
    grad = 2.0 * counts * (C - means) / G
    loss = np.sum((Xg - C[ag]) ** 2) / G
    return grad, loss


def _separation_grad_for_group(Gpts, C, eps=1e-12):
    """
    Separation gradient for one group:
      1/|G| Σ s(x)^2,
    where s(x) is the signed distance to the nearest Voronoi bisector
    between the two closest centroids.
    Returns:
        grad, separation_value
    """
    if len(Gpts) == 0:
        return np.zeros_like(C), 0.0

    # Distances to all centers
    d2 = np.sum((Gpts[:, None, :] - C[None, :, :]) ** 2, axis=2)
    m1_idx = np.argmin(d2, axis=1)
    d2_mask = d2.copy()
    d2_mask[np.arange(len(Gpts)), m1_idx] = np.inf
    m2_idx = np.argmin(d2_mask, axis=1)

    m1, m2 = C[m1_idx], C[m2_idx]
    m = 0.5 * (m1 + m2)
    v = m2 - m1
    vn = np.linalg.norm(v, axis=1, keepdims=True)
    vhat = v / np.maximum(vn, eps)

    s = np.sum((Gpts - m) * vhat, axis=1)
    Px = (Gpts - m) - vhat * s[:, None]

    grad_m1 = (2.0 * s)[:, None] * (-(Px / np.maximum(vn, eps)) - 0.5 * vhat)
    grad_m2 = (2.0 * s)[:, None] * ((Px / np.maximum(vn, eps)) - 0.5 * vhat)

    K, D = C.shape
    grad = np.zeros_like(C)
    np.add.at(grad, m1_idx, grad_m1)
    np.add.at(grad, m2_idx, grad_m2)
    grad /= len(Gpts)

    val = np.mean(s ** 2)
    return grad, val



In [11]:
# ------------------------------------------------
# 3. Training combined (social + separation) k-means
# ------------------------------------------------

def train_combined_kmeans(
    X, XA, XB, k,
    alpha=0.5,            # weight for social term  (+ alpha * max(LA, LB))
    beta=0.5,             # weight for separation   (- beta  * min(cfdA, cfdB))
    seeds=range(10),
    iters=500,
    lr=0.5,
    verbose=False,
):
    """
    Train the combined Separation–Social Fair k-means:

      objective = kmeans
                  + alpha * max(L_A, L_B)      (social fairness)
                  - beta  * min(cfd_A, cfd_B)  (separation fairness)

    For every seed, k rows of X are drawn without replacement as initial
    centers and `iters` gradient steps of size `lr` are taken. At each step
    the max / min is handled by following the gradient of whichever group
    currently attains it.

    Args:
        X       : full data matrix, one row per point
        XA, XB  : rows of X belonging to group A / group B; they are used to
                  recover the group membership of the rows of X
        k       : number of clusters
        alpha   : weight of the social term
        beta    : weight of the separation term
        seeds   : iterable of seeds, one independent run per seed
        iters   : number of gradient steps per run
        lr      : step size
        verbose : print progress every 50 iterations and at the last one

    Returns:
        A list with one dict per seed holding the final centers, assignments
        and metric values (kmeans, social_max, separation_min, LA, LB, cfdA,
        cfdB, social_gap, separation_gap, objective).
    """
    inA, inB = _group_masks_from_sets(X, XA, XB)
    N, _ = X.shape

    # The group point sets never change during training, so slice them once
    # instead of re-copying them with boolean indexing at every iteration.
    X_groupA = X[inA]
    X_groupB = X[inB]

    history = []

    for seed in seeds:
        rng = np.random.default_rng(seed)
        C = X[rng.choice(N, size=k, replace=False)].copy()

        for t in range(iters):
            a, _ = _assign(X, C)

            # k-means term
            g_km, _, _ = _kmeans_grad(X, C, a)

            # social term: LA, LB (group-wise k-means loss); follow the worse group
            gA, LA = _group_kmeans_grad(X, C, a, inA)
            gB, LB = _group_kmeans_grad(X, C, a, inB)
            if LA >= LB:
                g_social, social_val = gA, LA
            else:
                g_social, social_val = gB, LB

            # separation term: cfdA, cfdB (hyperplane distances); follow the smaller one
            g_sep_A, cfdA = _separation_grad_for_group(X_groupA, C)
            g_sep_B, cfdB = _separation_grad_for_group(X_groupB, C)
            if cfdA <= cfdB:
                g_sep, sep_val = g_sep_A, cfdA
            else:
                g_sep, sep_val = g_sep_B, cfdB

            # The k-means loss is only needed for the log line, so compute it
            # on demand. This must happen before C is updated in place so the
            # printed values describe the centers the gradients were taken at.
            if verbose and (t % 50 == 0 or t == iters - 1):
                km_val = np.mean(np.sum((X - C[a]) ** 2, axis=1))
                E = km_val + alpha * social_val - beta * sep_val
                print(
                    f"[seed {seed:02d} | it {t:04d}] "
                    f"E={E:.6f}  km={km_val:.6f}  soc={social_val:.6f}  sep={sep_val:.6f}"
                )

            # gradient descent step
            C -= lr * (g_km + alpha * g_social - beta * g_sep)

        # final values for this seed
        a, _ = _assign(X, C)
        km_final = np.mean(np.sum((X - C[a]) ** 2, axis=1))
        _, LA = _group_kmeans_grad(X, C, a, inA)
        _, LB = _group_kmeans_grad(X, C, a, inB)
        _, cfdA = _separation_grad_for_group(X_groupA, C)
        _, cfdB = _separation_grad_for_group(X_groupB, C)

        history.append(dict(
            seed=seed,
            centers=C.copy(),
            assignments=a.copy(),
            kmeans=km_final,
            social_max=max(LA, LB),
            separation_min=min(cfdA, cfdB),
            LA=LA, LB=LB,
            cfdA=cfdA, cfdB=cfdB,
            social_gap=abs(LA - LB),
            separation_gap=abs(cfdA - cfdB),
            objective=km_final + alpha * max(LA, LB) - beta * min(cfdA, cfdB),
        ))

    return history

In [13]:
# ------------------------------------------------
# 4. Full pipeline for one dataset
# ------------------------------------------------

def run_separation_social_pipeline(
    dataset_name: str,
    lambda_list=None,
    seeds=range(10),
    iters=500,
    lr=0.5,
    save_dir="./plots",
):
    """
    Sweep λ for the combined Separation–Social Fair k-means on one dataset.

    Each λ is trained with alpha = beta = λ / 2 over all seeds; the mean and
    standard deviation of every metric across seeds become one row of the
    result table. The table is printed, written as a CSV file into `save_dir`
    and passed on to the plotting routine.

    Returns:
      df_lam: DataFrame of aggregated metrics across seeds, sorted by λ.
      cfg   : dataset configuration.
    """
    if lambda_list is None:
        lambda_list = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]

    X_input, XA, XB, cfg = load_dataset(dataset_name)
    n_clusters, tag = cfg["k"], cfg["tag"]

    # (key in the per-seed history, column prefix in the result table)
    metric_columns = (
        ("kmeans", "kmeans"),
        ("social_max", "social"),
        ("separation_min", "separation"),
        ("objective", "objective"),
        ("social_gap", "social_gap"),
        ("separation_gap", "sep_gap"),
    )

    records = []
    for lam in lambda_list:
        print(f"\n=== {dataset_name} | λ = {lam} ===")
        half_lam = lam / 2.0
        runs = train_combined_kmeans(
            X_input, XA, XB, n_clusters,
            alpha=half_lam,
            beta=half_lam,
            seeds=seeds,
            iters=iters,
            lr=lr,
            verbose=False,
        )

        record = {"lambda": lam}
        for hist_key, prefix in metric_columns:
            per_seed = np.asarray([run[hist_key] for run in runs], float)
            record[f"{prefix}_mean"] = per_seed.mean()
            record[f"{prefix}_std"] = per_seed.std()
        records.append(record)

    df_lam = pd.DataFrame(records).sort_values("lambda").reset_index(drop=True)
    pd.options.display.float_format = "{:,.6f}".format

    summary_columns = [
        "lambda", "kmeans_mean", "social_mean",
        "separation_mean", "objective_mean",
        "social_gap_mean", "sep_gap_mean",
    ]
    print(df_lam[summary_columns])

    # Save numeric results
    out_dir = Path(save_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    out_csv = out_dir / f"lambda_sweep_results_with_gaps_{tag}.csv"
    df_lam.to_csv(out_csv, index=False)
    print(f"[saved] {out_csv}")

    # Make plots
    plot_separation_social_metrics(df_lam, lambda_list, tag, save_dir)

    return df_lam, cfg

In [15]:
# ------------------------------------------------
# 5. Plotting
# ------------------------------------------------

def lam_label(l):
    """Return the tick label for one λ: two decimals with trailing zeros (and a bare dot) removed."""
    fixed = format(l, ".2f")
    trimmed = fixed.rstrip("0").rstrip(".")
    return rf"$\lambda$={trimmed}"


def _save_errorbar_figure(tick_pos, tick_labels, series, ylabel, fname, show_legend=False):
    """Draw the given (means, stds, style) series as one errorbar figure, save it to `fname` and show it."""
    fig = plt.figure(figsize=(6, 5))
    for means, stds, style in series:
        plt.errorbar(tick_pos, means, yerr=stds, marker="o", linewidth=3, **style)
    plt.ylabel(ylabel, fontsize=20)
    plt.xticks(tick_pos, tick_labels, rotation=45, ha="right", fontsize=20)
    plt.yticks(fontsize=20)
    if show_legend:
        plt.legend(fontsize=16)
    plt.tight_layout()
    plt.savefig(fname, dpi=300)
    plt.show()
    # Release the figure so repeated sweeps do not accumulate open figures.
    plt.close(fig)
    print(f"[saved] {fname}")


def plot_separation_social_metrics(df_lam, lambda_list, tag, save_dir="./plots"):
    """
    Plot the λ-sweep metrics with error bars and save each figure as a PDF.

    Four figures are written into `save_dir`, each suffixed with `tag`:
      1. k-means loss
      2. social term, max(LA, LB)
      3. separation term, min(cfdA, cfdB)
      4. social and separation fairness gaps (two series, with legend)

    Args:
        df_lam      : table with `<metric>_mean` / `<metric>_std` columns,
                      one row per λ, ordered like sorted(lambda_list)
        lambda_list : λ values used for the x tick labels
        tag         : suffix for the output file names
        save_dir    : output directory, created if missing
    """
    save_dir = Path(save_dir)
    save_dir.mkdir(parents=True, exist_ok=True)

    all_lams = sorted(lambda_list)
    tick_pos = np.arange(1, 1 + len(all_lams))
    tick_labels = [lam_label(l) for l in all_lams]

    def stats(prefix):
        # Mean and standard-deviation columns of one metric.
        return df_lam[f"{prefix}_mean"], df_lam[f"{prefix}_std"]

    # ----- 1. KMeans Loss vs lambda -----
    # The former fmt="-o" only repeated marker="o" plus the default solid
    # line and made matplotlib warn about a redundantly defined marker.
    _save_errorbar_figure(
        tick_pos, tick_labels,
        [(*stats("kmeans"), dict(capsize=4, color="green"))],
        "KMeans Loss",
        save_dir / f"plot_kmeans_vs_lambda_{tag}.pdf",
    )

    # ----- 2. Social term (max(LA, LB)) -----
    _save_errorbar_figure(
        tick_pos, tick_labels,
        [(*stats("social"), dict())],
        "Max Fair (Social)",
        save_dir / f"plot_social_vs_lambda_{tag}.pdf",
    )

    # ----- 3. Separation term (min(cfdA, cfdB)) -----
    _save_errorbar_figure(
        tick_pos, tick_labels,
        [(*stats("separation"), dict(color="red"))],
        "Min Fair (Separation)",
        save_dir / f"plot_separation_vs_lambda_{tag}.pdf",
    )

    # ----- 4. Fairness gaps (social vs separation) -----
    _save_errorbar_figure(
        tick_pos, tick_labels,
        [
            (*stats("social_gap"), dict(label="Social Fair", color="purple")),
            (*stats("sep_gap"), dict(label="Separation Fair", color="orange")),
        ],
        "Fairness Difference",
        save_dir / f"plot_fairness_gaps_vs_lambda_{tag}.pdf",
        show_legend=True,
    )

In [None]:
# ------------------------------------------------
# 6. Example: run for one dataset
# ------------------------------------------------

# Choose one of: "adult", "credit", "bank", "student"
# Key into DATASET_CONFIG selecting which dataset this cell processes.
dataset_name = "adult"

# Run the full λ sweep: one training per (λ, seed) pair with alpha = beta = λ / 2.
# The aggregated metrics come back as df_lam; the results CSV and the PDF plots
# are written to a dataset-specific sub-directory of ./plots.
df_lam, cfg = run_separation_social_pipeline(
    dataset_name=dataset_name,
    lambda_list=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
    seeds=range(10),
    iters=500,
    lr=0.5,
    save_dir=f"./plots/combined_{dataset_name}",
)

[adult] n = 1000
[adult] |A| = 656, |B| = 344

=== adult | λ = 0.0 ===

=== adult | λ = 0.2 ===

=== adult | λ = 0.4 ===

=== adult | λ = 0.6 ===

=== adult | λ = 0.8 ===
