# Data-Processing

In [1]:
import os.path
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from fasterrisk.binarization_util import convert_continuous_df_to_binary_df
from fasterrisk.fasterrisk import RiskScoreOptimizer, RiskScoreClassifier
from fasterrisk.utils import download_file_from_google_drive

# -------------------------
# Config
# -------------------------
# Column holding the outcome label in both CSV files.
LABEL_COL = 'Cancer_lbl'

# Base feature list (no age/sex); the other two lists are derived from it.
FEATURES_STLM = [
    'sct_long_dia',
    'part_solid',
    'ground_glass',
    'solid',
    'Upper_Lobe',
    'Spiculation',
]
# Base list plus the age and sex columns.
FEATURES_STLMD = FEATURES_STLM + ['age', 'sex']
# Base list without the 'solid' indicator.
FEATURES_STPGLM = [name for name in FEATURES_STLM if name != 'solid']

# Both input files live in the same directory.
_DATA_DIR = '/data/usr/ft42/CVIT_XAI/LungRADS_Modeling/NLST_Statistics/ml_dataset'
CSV1 = _DATA_DIR + '/nlst_ct_nodule_df_set1.csv'
CSV2 = _DATA_DIR + '/nlst_ct_nodule_df_set2.csv'

# -------------------------
# Helpers
# -------------------------
def filter_and_report(df: pd.DataFrame) -> pd.DataFrame:
    """Keep rows with a recognised gender and nodule type; print how many were dropped.

    A row is kept when ``gender`` is 'male' or 'female' and ``Nodule_Type`` is
    'solid', 'ground-glass' or 'part-solid'.

    Args:
        df: Frame containing ``gender`` and ``Nodule_Type`` columns.

    Returns:
        An independent copy of the surviving rows. Returning a copy (not a
        boolean-index slice) lets callers add columns to the result without
        pandas raising SettingWithCopyWarning.

    Raises:
        KeyError: If either required column is missing.
    """
    valid_gender = df['gender'].isin(['male', 'female'])
    valid_type = df['Nodule_Type'].isin(['solid', 'ground-glass', 'part-solid'])
    kept = df.loc[valid_gender & valid_type].copy()
    print(f"[filter] Removed {len(df) - len(kept)} rows (invalid gender or Nodule_Type).")
    return kept

def to_fastrisk_y(y_raw, pos_label=1) -> np.ndarray:
    """Return 1-D np.ndarray[float] with labels in {-1.0, +1.0}.

    Args:
        y_raw: Array-like of raw labels (any shape; it is flattened).
        pos_label: The raw value that maps to +1.0. Every other value maps
            to -1.0.

    Returns:
        Flat float array with one entry per input label.
    """
    y_arr = np.asarray(y_raw).ravel()
    # Always compare against pos_label. The previous {0, 1} shortcut ignored
    # pos_label, so pos_label=0 returned inverted labels.
    return np.where(y_arr == pos_label, 1.0, -1.0).astype(float)

def align_like_train(train_bin_df: pd.DataFrame, other_bin_df: pd.DataFrame) -> pd.DataFrame:
    """Reindex ``other_bin_df`` onto the training columns, in training order.

    Columns absent from ``other_bin_df`` are created and filled with 0;
    columns not present in ``train_bin_df`` are dropped.
    """
    reference_columns = [*train_bin_df.columns]
    aligned = other_bin_df.reindex(columns=reference_columns, fill_value=0)
    return aligned

def prepare_data(df: pd.DataFrame, feature_cols, label_col):
    """Return ``(features, labels)`` selected from ``df``.

    ``features`` is ``df[feature_cols]`` and ``labels`` is ``df[label_col]``.
    """
    return df[feature_cols], df[label_col]

def _binarize_with_train_thresholds(raw_df: pd.DataFrame, train_bin_df: pd.DataFrame) -> pd.DataFrame:
    """Rebuild the training split's binary columns from another split's raw features.

    NOTE(review): assumes fasterrisk names threshold columns
    ``"<feature><=<threshold>"`` and passes two-valued features through under
    their original name. Confirm against the installed fasterrisk version.

    Raises:
        ValueError: If a training column cannot be mapped back to a raw feature.
    """
    rebuilt = {}
    for col in train_bin_df.columns:
        if col in raw_df.columns:
            # Feature kept its own name in training, so it was passed through.
            rebuilt[col] = np.asarray(raw_df[col], dtype=int)
            continue
        feature, sep, threshold = str(col).rpartition("<=")
        if not sep or feature not in raw_df.columns:
            raise ValueError(
                f"Cannot map binarized training column {col!r} back to a raw feature"
            )
        values = raw_df[feature].to_numpy()
        # 1 unless the value is strictly above the threshold.
        rebuilt[col] = np.where(values > float(threshold), 0, 1)
    return pd.DataFrame(rebuilt, index=range(len(raw_df)), columns=list(train_bin_df.columns))


def binarize_and_align(X_train_df: pd.DataFrame, X_val_df: pd.DataFrame, X_test_df: pd.DataFrame):
    """Binarize the training split, then encode val/test with the SAME thresholds.

    Binarizing each split independently derives thresholds from that split's
    own unique values. Training threshold columns missing from val/test would
    then be zero-filled, which gives wrong feature values. Here val/test are
    instead re-encoded against the columns produced for the training split.

    Returns:
        ``(X_train_bin, X_val_bin, X_test_bin)`` with identical columns and order.
    """
    X_train_bin = convert_continuous_df_to_binary_df(X_train_df)
    X_val_bin = _binarize_with_train_thresholds(X_val_df, X_train_bin)
    X_test_bin = _binarize_with_train_thresholds(X_test_df, X_train_bin)
    return X_train_bin, X_val_bin, X_test_bin

# -------------------------
# Load, filter, encode
# -------------------------
df1 = pd.read_csv(CSV1)
df2 = pd.read_csv(CSV2)

df1 = filter_and_report(df1)
df2 = filter_and_report(df2)

# Encode needed variables
for df in (df1, df2):
    df['sex'] = df['gender'].map({'male': 0, 'female': 1})
    # Exact comparison, not substring matching: 'solid' is a substring of
    # 'part-solid', so `'solid' in x` marked every part-solid nodule as solid too.
    df['part_solid']   = (df['Nodule_Type'] == 'part-solid').astype(int)
    df['ground_glass'] = (df['Nodule_Type'] == 'ground-glass').astype(int)
    df['solid']        = (df['Nodule_Type'] == 'solid').astype(int)

# -------------------------
# Patient-level stratified split (on df1)
# -------------------------
# One row per patient. drop_duplicates() on (pid, label) kept a patient twice
# when their nodules carried different labels, so the same pid could land in
# both train and validation. sort=False keeps first-appearance order, so the
# split is unchanged when every patient has a single label.
# NOTE(review): assumes the label is numeric and that any positive nodule makes
# the patient positive — confirm.
patients = df1.groupby('pid', sort=False)[LABEL_COL].max().reset_index()
train_patients, val_patients = train_test_split(
    patients,
    test_size=0.2,
    stratify=patients[LABEL_COL],
    random_state=42
)
# All rows of a patient follow that patient into one split.
train_df = df1[df1['pid'].isin(train_patients['pid'])]
val_df   = df1[df1['pid'].isin(val_patients['pid'])]

# ============================================================
# STLMD (includes age & sex)
# ============================================================
# Raw feature frames and label series for train / validation / df2 (test).
(
    (X_train_STLMD_df, y_train_STLMD_raw),
    (X_val_STLMD_df, y_val_STLMD_raw),
    (X_test_STLMD_df, y_test_STLMD_raw),
) = [prepare_data(split, FEATURES_STLMD, LABEL_COL) for split in (train_df, val_df, df2)]

# Binary feature frames sharing the training columns.
X_train_STLMD_bin, X_val_STLMD_bin, X_test_STLMD_bin = binarize_and_align(
    X_train_STLMD_df, X_val_STLMD_df, X_test_STLMD_df
)

# Labels converted to {-1.0, +1.0}.
y_train_STLMD, y_val_STLMD, y_test_STLMD = [
    to_fastrisk_y(raw, pos_label=1)
    for raw in (y_train_STLMD_raw, y_val_STLMD_raw, y_test_STLMD_raw)
]

# Plain float matrices for the optimizer.
X_train_STLMD, X_val_STLMD, X_test_STLMD = [
    binary.to_numpy(dtype=float)
    for binary in (X_train_STLMD_bin, X_val_STLMD_bin, X_test_STLMD_bin)
]

# ============================================================
# STLM (no age/sex)
# ============================================================
# Raw feature frames and label series for train / validation / df2 (test).
(
    (X_train_STLM_df, y_train_STLM_raw),
    (X_val_STLM_df, y_val_STLM_raw),
    (X_test_STLM_df, y_test_STLM_raw),
) = [prepare_data(split, FEATURES_STLM, LABEL_COL) for split in (train_df, val_df, df2)]

# Binary feature frames sharing the training columns.
X_train_STLM_bin, X_val_STLM_bin, X_test_STLM_bin = binarize_and_align(
    X_train_STLM_df, X_val_STLM_df, X_test_STLM_df
)

# Labels converted to {-1.0, +1.0}.
y_train_STLM, y_val_STLM, y_test_STLM = [
    to_fastrisk_y(raw, pos_label=1)
    for raw in (y_train_STLM_raw, y_val_STLM_raw, y_test_STLM_raw)
]

# Plain float matrices for the optimizer.
X_train_STLM, X_val_STLM, X_test_STLM = [
    binary.to_numpy(dtype=float)
    for binary in (X_train_STLM_bin, X_val_STLM_bin, X_test_STLM_bin)
]

# ============================================================
# STPGLM (subset features)
# ============================================================
# Raw feature frames and label series for train / validation / df2 (test).
(
    (X_train_STPGLM_df, y_train_STPGLM_raw),
    (X_val_STPGLM_df, y_val_STPGLM_raw),
    (X_test_STPGLM_df, y_test_STPGLM_raw),
) = [prepare_data(split, FEATURES_STPGLM, LABEL_COL) for split in (train_df, val_df, df2)]

# Binary feature frames sharing the training columns.
X_train_STPGLM_bin, X_val_STPGLM_bin, X_test_STPGLM_bin = binarize_and_align(
    X_train_STPGLM_df, X_val_STPGLM_df, X_test_STPGLM_df
)

# Labels converted to {-1.0, +1.0}.
y_train_STPGLM, y_val_STPGLM, y_test_STPGLM = [
    to_fastrisk_y(raw, pos_label=1)
    for raw in (y_train_STPGLM_raw, y_val_STPGLM_raw, y_test_STPGLM_raw)
]

# Plain float matrices for the optimizer.
X_train_STPGLM, X_val_STPGLM, X_test_STPGLM = [
    binary.to_numpy(dtype=float)
    for binary in (X_train_STPGLM_bin, X_val_STPGLM_bin, X_test_STPGLM_bin)
]

# -------------------------
# Quick hygiene checks
# -------------------------
def _chk(Xtr, ytr, Xv, yv, Xte, yte, name):
    """Sanity-check one feature set and print the shape of each split.

    Asserts that every X/y pair has the same number of rows and that every
    label array contains only -1.0 / 1.0. ``name`` tags the messages.
    """
    splits = ((Xtr, ytr), (Xv, yv), (Xte, yte))

    rows_agree = all(X.shape[0] == y.shape[0] for X, y in splits)
    assert rows_agree, f"row mismatch in {name}"

    allowed = {-1.0, 1.0}
    labels_ok = all(set(np.unique(y)) <= allowed for _, y in splits)
    assert labels_ok, f"bad labels in {name}"

    print(f"{name:7s} -> X_train {Xtr.shape}, X_val {Xv.shape}, X_test {Xte.shape}")

# Run the same checks on every feature set, in the original order.
for _tag, _splits in (
    ("STLMD", (X_train_STLMD, y_train_STLMD, X_val_STLMD, y_val_STLMD, X_test_STLMD, y_test_STLMD)),
    ("STLM", (X_train_STLM, y_train_STLM, X_val_STLM, y_val_STLM, X_test_STLM, y_test_STLM)),
    ("STPGLM", (X_train_STPGLM, y_train_STPGLM, X_val_STPGLM, y_val_STPGLM, X_test_STPGLM, y_test_STPGLM)),
):
    _chk(*_splits, _tag)


[filter] Removed 0 rows (invalid gender or Nodule_Type).
[filter] Removed 0 rows (invalid gender or Nodule_Type).
Converting continuous features to binary features in the dataframe......
We select thresholds for each continuous feature by sampling (without replacement) <= max_num_thresholds_per_feature values from all unique values in that feature column.
Finish converting continuous features to binary features......
Converting continuous features to binary features in the dataframe......
We select thresholds for each continuous feature by sampling (without replacement) <= max_num_thresholds_per_feature values from all unique values in that feature column.
Finish converting continuous features to binary features......
Converting continuous features to binary features in the dataframe......
We select thresholds for each continuous feature by sampling (without replacement) <= max_num_thresholds_per_feature values from all unique values in that feature column.
Finish converting continuous

In [2]:
# ============================
# Export FasterRisk models -> Riskomon JSON (memo-compatible)
# ============================
import json, math
import numpy as np
from typing import List, Dict, Any

def _sigmoid(z: float) -> float:
    """Return the logistic function 1 / (1 + exp(-z)) without overflowing.

    The naive form raises OverflowError for large negative ``z`` because
    ``exp(-z)`` exceeds the float range. Each branch below only ever
    exponentiates a non-positive number.
    """
    if z >= 0:
        return 1.0 / (1.0 + math.exp(-z))
    ez = math.exp(z)
    return ez / (1.0 + ez)

def _score_span(coefs: np.ndarray) -> "tuple[int, int]":
    """Min/max integer score contribution from sparse integer weights (no intercept).

    Args:
        coefs: Weights as an array or any array-like (a plain list works).

    Returns:
        ``(lowest, highest)``: the sum of all negative weights and the sum of
        all positive weights. Both are 0 for empty input.
    """
    weights = np.asarray(coefs)
    # Lowest score: every negative weight active and no positive one.
    lowest = int(weights[weights < 0].sum())
    # Highest score: every positive weight active and no negative one.
    highest = int(weights[weights > 0].sum())
    return lowest, highest

def _risk_scale(multiplier: float, intercept: int, coefs: np.ndarray) -> List[List[float]]:
    """
    Build the rows of the memo's 'risk_scale' field.

    Returns [[total, prob], ...] with total = intercept + s for every integer
    s in the span reported by _score_span(coefs), and
    prob = sigmoid(total / multiplier).

    The total is DIVIDED by the multiplier, matching how FasterRisk's
    RiskScoreClassifier turns integer scores into probabilities. Multiplying
    instead produced probabilities that disagree with the classifier whose
    training metrics are exported next to this table.
    NOTE(review): confirm against the installed fasterrisk version.
    """
    lowest, highest = _score_span(coefs)
    rows = []
    for s in range(lowest, highest + 1):
        total = intercept + s
        rows.append([float(total), float(_sigmoid(total / multiplier))])  # memo JSON uses floats
    return rows

def _feature_pairs(coefs: np.ndarray, feat_names: List[str]) -> List[List[Any]]:
    """
    [[coef, "FeatureName"], ...] — coef first, only nonzeros, sorted by |coef| desc.

    Weights are truncated to int before the zero test and stored as floats;
    ties in |coef| are ordered by feature name.
    """
    nonzero = [
        [float(int(weight)), str(label)]  # store as float to mirror memo example
        for weight, label in zip(coefs.tolist(), feat_names)
        if int(weight) != 0
    ]
    return sorted(nonzero, key=lambda pair: (-abs(pair[0]), pair[1]))

# Disabled earlier variant of export_riskomon_payload_memo, kept inside a bare
# string literal so it never executes. It differs from the live definition
# below only in how the intercept is adjusted: it adds 1.0 ("+1 point")
# instead of subtracting the offset derived from the training points.
'''
def export_riskomon_payload_memo(
    multipliers: List[float],
    intercepts: List[int],
    coef_matrix: List[np.ndarray],
    feature_names: List[str],
    X_train: np.ndarray, y_train: np.ndarray,
    dataset_tag: str = "CANCER_STLMD",
    export_n: int = None
) -> str:
    """
    Build JSON using memo schema:
      - feature_data: [[coef, "name"], ...]
      - risk_scale:   [[score, prob], ...]
      - training_logistic_loss: float
      - training_accuracy: float
      - training_AUC: float
      - card_label: "01", "02", ...
    """
    n_models = len(multipliers)
    use_n = min(export_n if isinstance(export_n, int) else n_models, n_models)

    payload = []
    width = max(2, len(str(use_n)))  # zero-pad like "01"
    for i in range(use_n):
        mult = float(multipliers[i])
        b0   = float(intercepts[i])
        b0_shift = b0 + 1.0              # +1 point
        betas = np.asarray(coef_matrix[i], dtype=int)

        clf = RiskScoreClassifier(mult, b0_shift, betas)
        clf.reset_featureNames(feature_names)

        train_loss = float(clf.compute_logisticLoss(X_train, y_train))
        train_acc, train_auc = clf.get_acc_and_auc(X_train, y_train)

        payload.append({
            "feature_data": _feature_pairs(betas, list(feature_names)),
            "risk_scale": _risk_scale(mult, b0_shift, betas),
            "training_logistic_loss": train_loss,
            "training_accuracy": float(train_acc),
            "training_AUC": float(train_auc),          # <-- capitalization per memo
            "card_label": f"{i+1:0{width}d}",          # <-- "01", "02", ...
        })

    out_fname = f"{dataset_tag}.json"
    with open(out_fname, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)
    print(f"[Riskomon] Wrote {out_fname} with {len(payload)} models (memo schema)")
    return out_fname
'''

def export_riskomon_payload_memo(
    multipliers: List[float],
    intercepts: List[int],
    coef_matrix: List[np.ndarray],
    feature_names: List[str],
    X_train: np.ndarray, y_train: np.ndarray,
    dataset_tag: str = "CANCER_STLMD",
    export_n: int = None
) -> str:
    """
    Write the first ``export_n`` models to ``<dataset_tag>.json`` using memo schema:
      - feature_data: [[coef, "name"], ...]
      - risk_scale:   [[score, prob], ...]
      - training_logistic_loss: float
      - training_accuracy: float
      - training_AUC: float
      - card_label: "01", "02", ...

    Args:
        multipliers, intercepts, coef_matrix: Parallel per-model sequences.
        feature_names: One name per coefficient column.
        X_train, y_train: Data the training metrics are computed on.
        dataset_tag: Stem of the output file name.
        export_n: Number of leading models to export; all when not an int.

    Returns:
        The name of the JSON file that was written.
    """
    n_models = len(multipliers)
    use_n = min(export_n if isinstance(export_n, int) else n_models, n_models)

    payload = []
    width = max(2, len(str(use_n)))  # zero-pad like "01"
    for i in range(use_n):
        mult = float(multipliers[i])
        b0   = float(intercepts[i])
        betas = np.asarray(coef_matrix[i], dtype=int)

        # Metrics come from the model exactly as optimized. Building the
        # classifier with the rebased intercept (as before) shifts every logit
        # by a constant, so the reported loss and accuracy belonged to a
        # model nobody trained.
        clf = RiskScoreClassifier(mult, b0, betas)
        clf.reset_featureNames(feature_names)

        train_loss = float(clf.compute_logisticLoss(X_train, y_train))
        train_acc, train_auc = clf.get_acc_and_auc(X_train, y_train)

        # Offset derived from the smallest training point total (0 when there
        # are no rows, where .min() would raise).
        points_train = (X_train @ betas)
        C = -points_train.min() if points_train.size else 0.0
        # NOTE(review): risk_scale is still built from the rebased intercept,
        # as in the original. The rebasing only moves the range of totals
        # listed; confirm this is the range the memo consumer expects.
        b0_rebased = b0 - C

        payload.append({
            "feature_data": _feature_pairs(betas, list(feature_names)),
            "risk_scale": _risk_scale(mult, b0_rebased, betas),
            "training_logistic_loss": train_loss,
            "training_accuracy": float(train_acc),
            "training_AUC": float(train_auc),          # <-- capitalization per memo
            "card_label": f"{i+1:0{width}d}",          # <-- "01", "02", ...
        })

    out_fname = f"{dataset_tag}.json"
    with open(out_fname, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)
    print(f"[Riskomon] Wrote {out_fname} with {len(payload)} models (memo schema)")
    return out_fname

In [3]:

#-| Parameters (identical for every feature set, so set once outside the loop)
sparsity    = 5
parent_size = 10

# Feature-set tag -> (X_train, y_train, X_test, y_test, binarized training frame).
# X_* are plain float arrays; y_* are 1-D float arrays in {-1,+1}.
_RISK_INPUT_SETS = {
    "STPGLM": (X_train_STPGLM, y_train_STPGLM, X_test_STPGLM, y_test_STPGLM, X_train_STPGLM_bin),
    "STLM":   (X_train_STLM,   y_train_STLM,   X_test_STLM,   y_test_STLM,   X_train_STLM_bin),
    "STLMD":  (X_train_STLMD,  y_train_STLMD,  X_test_STLMD,  y_test_STLMD,  X_train_STLMD_bin),
}

for RiskInputs in ["STLMD"]:  # add "STLM", "STPGLM" to process the other feature sets
    print('Processing:{}'.format(RiskInputs))

    # Fail loudly on an unknown tag. The old if/elif chain had no fallback, so
    # a typo silently reused the previous iteration's arrays.
    if RiskInputs not in _RISK_INPUT_SETS:
        raise ValueError(f"Unknown feature set: {RiskInputs!r}")
    X_train, y_train, X_test, y_test, X_train_bin = _RISK_INPUT_SETS[RiskInputs]

    #-- Optimizer
    RiskScoreOptimizer_m = RiskScoreOptimizer(X = X_train, y = y_train, k = sparsity, parent_size = parent_size)
    start_time           = time.time()
    RiskScoreOptimizer_m.optimize()
    print("Optimization takes {:.2f} seconds.".format(time.time() - start_time))
    multipliers, sparseDiversePool_beta0_integer, sparseDiversePool_betas_integer = RiskScoreOptimizer_m.get_models()
    print("We generate {} risk score models from the sparse diverse pool".format(len(multipliers)))

    # Feature names come from the binarized training frame (the arrays carry none).
    X_featureNames = X_train_bin.columns

    # Export every model in the pool to CANCER_<tag>.json.
    _ = export_riskomon_payload_memo(
        multipliers=multipliers,
        intercepts=sparseDiversePool_beta0_integer,
        coef_matrix=sparseDiversePool_betas_integer,
        feature_names=list(X_featureNames),
        X_train=X_train, y_train=y_train,
        dataset_tag=f"CANCER_{RiskInputs}",
        export_n=None  # or 50
    )

Processing:STLMD
Optimization takes 336.77 seconds.
We generate 50 risk score models from the sparse diverse pool
[Riskomon] Wrote CANCER_STLMD.json with 50 models (memo schema)
