In [1]:
!pip install -U mutagen ucimlrepo ace-tools

Collecting mutagen
  Downloading mutagen-1.47.0-py3-none-any.whl.metadata (1.7 kB)
Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Collecting ace-tools
  Downloading ace_tools-0.0-py3-none-any.whl.metadata (300 bytes)
Downloading mutagen-1.47.0-py3-none-any.whl (194 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.4/194.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Downloading ace_tools-0.0-py3-none-any.whl (1.1 kB)
Installing collected packages: ace-tools, mutagen, ucimlrepo
Successfully installed ace-tools-0.0 mutagen-1.47.0 ucimlrepo-0.0.7


#Testing all algorithms

##1. Adult Dataset

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score, average_precision_score, log_loss, precision_recall_curve
)
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Load ADULT Dataset
def load_adult():
    from ucimlrepo import fetch_ucirepo
    adult = fetch_ucirepo(id=2)
    X = adult.data.features
    y = adult.data.targets
    df = pd.concat([X, y], axis=1)
    df = df.fillna("Unknown")
    df['income'] = df['income'].replace({'<=50K.': '<=50K', '>50K.': '>50K'})
    X = pd.get_dummies(df.drop(columns='income'), drop_first=True)
    y = df['income'].map({'<=50K': 0, '>50K': 1}).to_numpy()
    return X, y

# Load the dataset
X, y = load_adult()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize models
models = {
    "Random Forest": RandomForestClassifier(n_estimators=50, max_depth=10),
    "Decision Tree": DecisionTreeClassifier(max_depth=10),
    "Logistic Regression": LogisticRegression(max_iter=200),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "SVM": SVC(probability=True, kernel="rbf", C=1),
    "Neural Network": MLPClassifier(hidden_layer_sizes=(32,), max_iter=200),
    "Bagged Trees": BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=10), n_estimators=50),
    "Boosted Trees": GradientBoostingClassifier(n_estimators=50, learning_rate=0.1),
    "Boosted Stumps": GradientBoostingClassifier(max_depth=1, n_estimators=50, learning_rate=0.1)
}

# Evaluation Metrics
def evaluate_metrics(y_true, y_pred, y_prob):
    acc = accuracy_score(y_true, y_pred)
    fsc = f1_score(y_true, y_pred)
    roc = roc_auc_score(y_true, y_prob)
    apr = average_precision_score(y_true, y_prob)
    mxe = log_loss(y_true, y_prob)
    precision, recall, _ = precision_recall_curve(y_true, y_prob)
    lift = np.sum(precision[:int(len(precision) * 0.25)]) / len(precision)  # Approx Lift
    return acc, fsc, lift, roc, apr, mxe

# Results storage
results = []

# Train and evaluate models
for model_name, model in models.items():
    print(f"Training {model_name}")
    # Train model
    model.fit(X_train, y_train)

    # Predictions and probabilities
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else y_pred

    # Evaluate metrics before calibration
    metrics_uncalibrated = evaluate_metrics(y_test, y_pred, y_prob)

    # Calibration
    calibrated_model = CalibratedClassifierCV(model, method="isotonic", cv="prefit")
    calibrated_model.fit(X_train, y_train)
    y_prob_calibrated = calibrated_model.predict_proba(X_test)[:, 1]

    # Evaluate metrics after calibration
    metrics_calibrated = evaluate_metrics(y_test, y_pred, y_prob_calibrated)

    # Store results
    results.append([model_name, "Uncalibrated", *metrics_uncalibrated])
    results.append([model_name, "Calibrated", *metrics_calibrated])

# Create results DataFrame
columns = ["Model", "Calibration", "Accuracy", "F-Score", "Lift", "ROC AUC", "Average Precision", "Cross-Entropy"]
results_df = pd.DataFrame(results, columns=columns)



Training Random Forest
Training Decision Tree
Training Logistic Regression
Training KNN
Training Naive Bayes
Training SVM
Training Neural Network




Training Bagged Trees
Training Boosted Trees
Training Boosted Stumps


ModuleNotFoundError: No module named 'ace_tools'

In [4]:
results_df

Unnamed: 0,Model,Calibration,Accuracy,F-Score,Lift,ROC AUC,Average Precision,Cross-Entropy
0,Random Forest,Uncalibrated,0.856889,0.635621,0.069771,0.912933,0.791988,0.32644
1,Random Forest,Calibrated,0.856889,0.635621,0.08692,0.912622,0.780729,0.32251
2,Decision Tree,Uncalibrated,0.861325,0.674359,0.096658,0.900428,0.755827,0.519413
3,Decision Tree,Calibrated,0.861325,0.674359,0.096658,0.900428,0.755827,0.519413
4,Logistic Regression,Uncalibrated,0.851703,0.65828,0.068666,0.904443,0.761362,0.323415
5,Logistic Regression,Calibrated,0.851703,0.65828,0.075944,0.904226,0.752141,0.325845
6,KNN,Uncalibrated,0.821538,0.599111,0.034181,0.833531,0.604114,1.565342
7,KNN,Calibrated,0.821538,0.599111,0.034181,0.833531,0.604114,1.566353
8,Naive Bayes,Uncalibrated,0.434314,0.447068,0.062769,0.699211,0.350419,17.154501
9,Naive Bayes,Calibrated,0.434314,0.447068,0.034181,0.700834,0.34884,0.473598


### Exact replica of paper

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score, average_precision_score, log_loss, precision_recall_curve
)
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.calibration import CalibratedClassifierCV

# Load ADULT Dataset
def load_adult():
    from ucimlrepo import fetch_ucirepo
    adult = fetch_ucirepo(id=2)
    X = adult.data.features
    y = adult.data.targets
    df = pd.concat([X, y], axis=1)
    df = df.fillna("Unknown")
    df['income'] = df['income'].replace({'<=50K.': '<=50K', '>50K.': '>50K'})
    X = pd.get_dummies(df.drop(columns='income'), drop_first=True)
    y = df['income'].map({'<=50K': 0, '>50K': 1}).to_numpy()
    return X, y

# Load Dataset
X, y = load_adult()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Preprocessing
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Evaluation Metrics
def evaluate_metrics(y_true, y_pred, y_prob):
    acc = accuracy_score(y_true, y_pred)
    fsc = f1_score(y_true, y_pred)
    roc = roc_auc_score(y_true, y_prob)
    apr = average_precision_score(y_true, y_prob)
    mxe = log_loss(y_true, y_prob)
    precision, recall, _ = precision_recall_curve(y_true, y_prob)
    lift = np.sum(precision[:int(len(precision) * 0.25)]) / len(precision)  # Approx Lift
    return acc, fsc, lift, roc, apr, mxe

# Results storage
results = []

models = {
    "SVM": {
        "model": SVC(probability=True),
        "params": {
            "kernel": ["linear", "poly", "rbf"],
            "C": [10**i for i in range(-7, 4)],
            "degree": [2, 3],
            "gamma": [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
        }
    },
    "Neural Network": {
        "model": MLPClassifier(max_iter=200),
        "params": {
            "hidden_layer_sizes": [(i,) for i in [1, 2, 4, 8, 32, 128]],
            "momentum": [0, 0.2, 0.5, 0.9]
        }
    },
    "Logistic Regression": {
        "model": LogisticRegression(max_iter=500),
        "params": {
            "C": [10**i for i in range(-8, 5)]
        }
    },
    "KNN": {
        "model": KNeighborsClassifier(),
        "params": {
            "n_neighbors": list(range(1, 27)),
            "weights": ["uniform", "distance"]
        }
    },
    "Random Forest": {
        "model": RandomForestClassifier(),
        "params": {
            "n_estimators": [1024],
            "max_features": [1, 2, 4, 6, 8, 12, 16, 20]
        }
    },
    "Decision Tree": {
        "model": DecisionTreeClassifier(),
        "params": {
            "max_depth": [None, 10, 20, 30],
            "criterion": ["gini", "entropy"]
        }
    },
    "Bagged Trees": {
        "model": BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=100),
        "params": {}
    },
    "Boosted Trees": {
        "model": GradientBoostingClassifier(),
        "params": {
            "n_estimators": [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024],
            "learning_rate": [0.1]
        }
    },
    "Boosted Stumps": {
        "model": GradientBoostingClassifier(max_depth=1),
        "params": {
            "n_estimators": [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
        }
    },
    "Naive Bayes": {
        "model": GaussianNB(),
        "params": {}
    }
}

# Train and Evaluate Each Model
for model_name, config in models.items():
    print(f"Training {model_name}")
    model = config["model"]
    param_grid = config["params"]

    # Perform Grid Search if Parameters Exist
    if param_grid:
        grid = GridSearchCV(model, param_grid, cv=3, scoring="accuracy")
        grid.fit(X_train_scaled, y_train)
        best_model = grid.best_estimator_
    else:
        best_model = model.fit(X_train_scaled, y_train)

    # Predictions
    y_pred = best_model.predict(X_test_scaled)
    y_prob = best_model.predict_proba(X_test_scaled)[:, 1] if hasattr(best_model, "predict_proba") else y_pred

    # Metrics Before Calibration
    metrics_uncalibrated = evaluate_metrics(y_test, y_pred, y_prob)

    # Platt Scaling Calibration
    platt_model = CalibratedClassifierCV(best_model, method="sigmoid", cv="prefit")
    platt_model.fit(X_train_scaled, y_train)
    y_prob_platt = platt_model.predict_proba(X_test_scaled)[:, 1]
    metrics_platt = evaluate_metrics(y_test, y_pred, y_prob_platt)

    # Isotonic Regression Calibration
    isotonic_model = CalibratedClassifierCV(best_model, method="isotonic", cv="prefit")
    isotonic_model.fit(X_train_scaled, y_train)
    y_prob_isotonic = isotonic_model.predict_proba(X_test_scaled)[:, 1]
    metrics_isotonic = evaluate_metrics(y_test, y_pred, y_prob_isotonic)

    # Save Results
    results.append([model_name, "Uncalibrated", *metrics_uncalibrated])
    results.append([model_name, "Platt Scaling", *metrics_platt])
    results.append([model_name, "Isotonic Regression", *metrics_isotonic])

# Create Results DataFrame
columns = ["Model", "Calibration", "Accuracy", "F-Score", "Lift", "ROC AUC", "Average Precision", "Cross-Entropy"]
results_df = pd.DataFrame(results, columns=columns)

Training SVM


In [None]:
results_df