In [6]:
import os
import re
import math
import pandas as pd
import numpy as np
from pathlib import Path
from collections import defaultdict

# Matches one numeric literal: optional sign, optional integer digits, optional
# decimal point, at least one digit, and an optional exponent (e.g. "18",
# "-1.5", ".5", "1e-3"). Tokens such as "inf" or "nan" are not matched.
num_pattern = re.compile(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?")

def canon_float(x, prec=None):
    """Round ``x`` to a fixed number of decimals so near-equal cuts compare equal.

    Args:
        x: Anything ``float()`` accepts (a number or a numeric string).
        prec: Number of decimal places handed to ``np.round``. When ``None``
            (the default), the module-level ``DEDUP_PREC`` is looked up at
            call time.

    Returns:
        The rounded value as a built-in ``float``.
    """
    # Resolve the default lazily. A ``prec=DEDUP_PREC`` default is evaluated
    # when the ``def`` statement runs, which raises NameError if the config is
    # assigned later and otherwise freezes the value the setting had then.
    if prec is None:
        prec = DEDUP_PREC
    return float(np.round(float(x), prec))

def fmt_num(x):
    """Format a number for use in a column header.

    Values within 1e-6 of a whole number are rendered as that integer
    (``1.0`` -> ``"1"``); everything else goes through ``FLOAT_FMT``.
    """
    value = float(x)
    nearest = int(round(value))
    if abs(value - nearest) < 10**-6:
        return str(nearest)
    # FLOAT_FMT is only consulted for genuinely fractional values.
    return FLOAT_FMT.format(value)

def parse_bins(text):
    """Extract the cut points from a 'bins' cell.

    Example: ``'[  0  18  35  45  65 100]'`` becomes
    ``[0.0, 18.0, 35.0, 45.0, 65.0, 100.0]``. Every numeric token found in
    ``str(text)`` is canonicalised with ``canon_float``; duplicates are dropped
    and the result is returned in ascending order. Text without any numeric
    token yields an empty list.
    """
    distinct = {canon_float(token) for token in num_pattern.findall(str(text))}
    return sorted(distinct)

def load_all_edges_for_attr(attr_name: str):
    """
    Read ``<SPECS_DIR>/<attr_name>.csv`` and collect one edge list per row.

    Each row's "bins" cell is parsed with ``parse_bins`` (so every list is
    already sorted and free of duplicates). Rows that yield fewer than two
    edges cannot form an interval and are left out. A missing spec file
    results in an empty list.
    """
    spec_file = Path(SPECS_DIR) / f"{attr_name}.csv"
    if not spec_file.exists():
        return []

    table = pd.read_csv(spec_file)
    parsed_rows = (
        parse_bins(record.get("bins", "")) for _, record in table.iterrows()
    )
    # At least two edges are needed to describe a single interval.
    return [edges for edges in parsed_rows if len(edges) >= 2]

def union_intervals_from_edges(edge_lists):
    """
    Merge several edge lists into one list of distinct intervals.

    Every pair of consecutive edges contributes a tuple
    ``(lo, hi, right_closed)``; ``right_closed`` is True only for the final
    interval of the edge list it came from. Both bounds are canonicalised
    with ``canon_float`` before deduplication.

    The result is ordered by ``lo``, then ``hi``; when both match, the
    right-closed variant comes before the half-open one.
    """
    seen = set()
    for edges in edge_lists:
        last_idx = len(edges) - 2  # index of the final interval in this list
        for idx, (left, right) in enumerate(zip(edges, edges[1:])):
            seen.add((canon_float(left), canon_float(right), idx == last_idx))

    ordered = list(seen)
    ordered.sort(key=lambda iv: (iv[0], iv[1], not iv[2]))
    return ordered

def union_thresholds_from_edges(edge_lists):
    """
    Merge several edge lists into one ascending list of distinct thresholds
    for the '< thr' columns.

    The first (smallest) edge of each list is left out; every remaining edge
    is canonicalised with ``canon_float`` and duplicates are dropped.
    """
    distinct = {
        canon_float(cut)
        for edges in edge_lists
        for cut in edges[1:]
    }
    return sorted(distinct)

def make_interval_features(series: pd.Series, intervals, attr):
    """Build one 0/1 column per interval in ``intervals``.

    ``intervals`` is a list of ``(lo, hi, right_closed)`` tuples. A
    right-closed interval tests ``lo <= x <= hi`` and is named
    ``attr:[lo,hi]``; otherwise the test is ``lo <= x < hi`` and the name is
    ``attr:[lo,hi)``. ``series`` is coerced to numeric first, with
    unparseable entries becoming NaN. The returned frame shares the index
    of ``series``.
    """
    values = pd.to_numeric(series, errors="coerce")
    features = {}
    for lo, hi, right_closed in intervals:
        at_or_above = values >= lo
        if right_closed:
            below, closer = values <= hi, "]"
        else:
            below, closer = values < hi, ")"
        name = f"{attr}:[{fmt_num(lo)},{fmt_num(hi)}{closer}"
        features[name] = (at_or_above & below).astype(int)
    return pd.DataFrame(features, index=series.index)

def make_threshold_features(series: pd.Series, thresholds, attr):
    """Build one 0/1 column per threshold, each flagging ``x < thr``.

    Columns are named ``attr:<thr``. ``series`` is coerced to numeric first,
    with unparseable entries becoming NaN. The returned frame shares the
    index of ``series``.
    """
    values = pd.to_numeric(series, errors="coerce")
    features = {
        f"{attr}:<{fmt_num(cut)}": (values < cut).astype(int)
        for cut in thresholds
    }
    return pd.DataFrame(features, index=series.index)

def main(out_dir="./pima/input/", out_name="diabetes-treefarms.csv"):
    """Binarize the configured attributes and write the feature table to CSV.

    Reads ``DIABETES_CSV``. For every name in ``ATTRIBUTES`` other than
    ``LABEL_COL``, the bin specs are loaded with ``load_all_edges_for_attr``
    and turned into 0/1 columns: interval-membership columns when
    ``ENCODING_STYLE == "bins"``, cumulative ``< thr`` columns for any other
    value. Attributes without usable specs are skipped. When ``LABEL_COL`` is
    listed in ``ATTRIBUTES`` it is appended as the last column.

    Args:
        out_dir: Directory the CSV is written to; created if it is missing.
        out_name: File name of the CSV inside ``out_dir``.

    Returns:
        The feature DataFrame that was written.
    """
    df = pd.read_csv(DIABETES_CSV)
    attributes = [c for c in ATTRIBUTES if c != LABEL_COL]

    feat_parts = []

    for attr in attributes:
        edge_lists = load_all_edges_for_attr(attr)
        if not edge_lists:
            # No spec file (or no usable rows) for this attribute -> skip
            continue

        if ENCODING_STYLE == "bins":
            # Union of unique intervals across all methods, then binarize
            intervals = union_intervals_from_edges(edge_lists)
            feats = make_interval_features(df[attr], intervals, attr)
        else:
            # Union of unique thresholds across all methods, then binarize
            thresholds = union_thresholds_from_edges(edge_lists)
            feats = make_threshold_features(df[attr], thresholds, attr)

        feat_parts.append(feats)

    X = pd.concat(feat_parts, axis=1) if feat_parts else pd.DataFrame(index=df.index)

    # Keep the label column as the last column when it is listed in ATTRIBUTES
    if LABEL_COL in ATTRIBUTES:
        X[LABEL_COL] = df[LABEL_COL].values

    # Make sure the target directory exists, and report the path that was
    # actually written rather than just the bare file name.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, out_name)
    X.to_csv(out_path, index=False)
    print(f"Wrote {out_path} with shape {X.shape}")
    return X

In [7]:
# ---------------- CONFIG ----------------
# Module-level settings read by the helper functions and main() defined above.
DIABETES_CSV = "./pima/input/diabetes.csv"   # input dataset loaded by main()
SPECS_DIR    = "./pima/scored_attributes"              # directory searched for <Attr>.csv spec files (Age.csv, BMI.csv, Glucose.csv, ...)
LABEL_COL    = "Outcome"        # never binarized; appended as the last column when listed in ATTRIBUTES
ATTRIBUTES   = ["Age", "BMI", "Glucose", "Outcome"]  # columns to process; entries without a usable spec file are skipped
ENCODING_STYLE = "thresholds"         # "bins" -> interval-membership columns; any other value -> cumulative "< cut" columns
FLOAT_FMT = "{:.6g}"            # format applied by fmt_num to non-integer numbers in column names
DEDUP_PREC = 12                 # decimal places canon_float rounds to by default when comparing cuts/intervals
# ---------------------------------------

# Build the feature table, write the CSV, and keep the resulting frame in X.
X = main()

Wrote diabetes-treefarms.csv with shape (768, 2208)
