In [1]:
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent
sys.path.insert(0, str(PROJECT_ROOT))

print("Project root:", PROJECT_ROOT)


Project root: c:\Users\sprin\OneDrive\Desktop\breast_ cancer_ml


In [2]:
from src.utils import load_dataset
from src.datasets import encode_target, select_base_features



In [3]:
df = load_dataset("data/breast_cancer_wisconsin.csv")
df = encode_target(df)
df = select_base_features(df)

df.head()


Unnamed: 0,concavity_mean,perimeter_mean,radius_mean,area_mean,target
0,0.3001,122.8,17.99,1001.0,1
1,0.0869,132.9,20.57,1326.0,1
2,0.1974,130.0,19.69,1203.0,1
3,0.2414,77.58,11.42,386.1,1
4,0.198,135.1,20.29,1297.0,1


In [4]:
from src.features import add_irregularity_index

df_feat = add_irregularity_index(df)
df_feat.head()


Unnamed: 0,concavity_mean,perimeter_mean,radius_mean,area_mean,target,irregularity_index
0,0.3001,122.8,17.99,1001.0,1,2e-05
1,0.0869,132.9,20.57,1326.0,1,5e-06
2,0.1974,130.0,19.69,1203.0,1,1.2e-05
3,0.2414,77.58,11.42,386.1,1,4e-05
4,0.198,135.1,20.29,1297.0,1,1.1e-05


In [5]:
df_feat["irregularity_index"].describe()


count    569.000000
mean       0.000010
std        0.000009
min        0.000000
25%        0.000005
50%        0.000008
75%        0.000012
max        0.000101
Name: irregularity_index, dtype: float64

In [6]:
from src.h1_feature_validation import run_h1_experiment

experiments = {
    "concavity_only": ["concavity_mean"],
    "perimeter_only": ["perimeter_mean"],
    "concavity_perimeter": ["concavity_mean", "perimeter_mean"],
    "irregularity_only": ["irregularity_index"],
    "all_plus_index": [
        "concavity_mean",
        "perimeter_mean",
        "radius_mean",
        "area_mean",
        "irregularity_index",
    ],
}

results = {}

for name, cols in experiments.items():
    auc = run_h1_experiment(df_feat, cols)
    results[name] = auc
    print(f"{name}: ROC-AUC = {auc:.4f}")


concavity_only: ROC-AUC = 0.9696
perimeter_only: ROC-AUC = 0.9568
concavity_perimeter: ROC-AUC = 0.9679
irregularity_only: ROC-AUC = 0.8213
all_plus_index: ROC-AUC = 0.9747


In [7]:
from src.evaluation import delong_test


In [9]:
from sklearn.model_selection import train_test_split
from src.models import get_logistic_model
from src.evaluation import delong_test

# Prepare features and target
X = df_feat[
    [
        "concavity_mean",
        "perimeter_mean",
        "radius_mean",
        "area_mean",
        "irregularity_index",
    ]
]
y = df_feat["target"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Model 1: concavity only
model_c = get_logistic_model()
model_c.fit(X_train[["concavity_mean"]], y_train)
pred_c = model_c.predict_proba(X_test[["concavity_mean"]])[:, 1]

# Model 2: all features + irregularity index
model_all = get_logistic_model()
model_all.fit(X_train, y_train)
pred_all = model_all.predict_proba(X_test)[:, 1]

# DeLong test
auc_c, auc_all, p_value = delong_test(y_test, pred_all, pred_c)

print(f"AUC (Concavity only): {auc_c:.4f}")
print(f"AUC (All + Index):   {auc_all:.4f}")
print(f"DeLong p-value:      {p_value:.6f}")


AUC (Concavity only): 0.9747
AUC (All + Index):   0.9696
DeLong p-value:      0.986467
