In [2]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.calibration import CalibratedClassifierCV
import time

#### -------- Load and prepare the dataset --------

In [3]:
# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
column_names = ['ID', 'Diagnosis'] + [f'Feature_{i}' for i in range(1, 31)]
data = pd.read_csv(url, header=None, names=column_names)

# diagnosis ----> (M = 1, B = 0)
data['Diagnosis'] = data['Diagnosis'].map({'M': 1, 'B': 0})
data = data.drop('ID', axis=1)

X = data.drop('Diagnosis', axis=1).values
y = data['Diagnosis'].values
    
X.shape, y.shape

((569, 30), (569,))

In [4]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix
import time

# Start runtime measurement
start_time = time.time()

# Initialize pruning strategies
pruning_strategies = [
    {"max_depth": 5},  # Max Depth Pruning
    {"min_samples_leaf": 10}  # Min Samples Leaf Pruning
]

# Perform Stratified k-Fold Cross-Validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for strategy in pruning_strategies:
    print(f"Evaluating strategy: {strategy}")
    accuracies = []
    confusion_matrices = []
    
    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        # Train the Decision Tree with the current pruning strategy
        dt = DecisionTreeClassifier(random_state=42, **strategy)
        dt.fit(X_train, y_train)
        
        # Predict on the test set
        y_pred = dt.predict(X_test)
        
        # Evaluate performance
        acc = accuracy_score(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred)
        
        accuracies.append(acc)
        confusion_matrices.append(cm)
    
    # Report average performance
    print(f"Average Accuracy: {np.mean(accuracies):.2f}")
    print(f"Confusion Matrices:")
    for i, cm in enumerate(confusion_matrices, 1):
        print(f"Fold {i}:\n{cm}\n")

# Function to extract rules from a decision tree
def extract_rules(decision_tree, feature_names):
    tree_rules = export_text(decision_tree, feature_names=feature_names)
    return tree_rules

# Train a final Decision Tree and extract rules
final_dt = DecisionTreeClassifier(random_state=42, max_depth=5)
final_dt.fit(X, y)
feature_names = [f"Feature_{i}" for i in range(X.shape[1])]
rules = extract_rules(final_dt, feature_names)

print("Extracted Rules:")
print(rules)

# End runtime measurement
end_time = time.time()
print(f"Total Runtime: {end_time - start_time:.2f} seconds")

Evaluating strategy: {'max_depth': 5}
Average Accuracy: 0.93
Confusion Matrices:
Fold 1:
[[67  4]
 [ 3 40]]

Fold 2:
[[70  1]
 [12 31]]

Fold 3:
[[69  3]
 [ 5 37]]

Fold 4:
[[66  6]
 [ 2 40]]

Fold 5:
[[70  1]
 [ 4 38]]

Evaluating strategy: {'min_samples_leaf': 10}
Average Accuracy: 0.91
Confusion Matrices:
Fold 1:
[[65  6]
 [ 1 42]]

Fold 2:
[[65  6]
 [10 33]]

Fold 3:
[[70  2]
 [ 8 34]]

Fold 4:
[[61 11]
 [ 1 41]]

Fold 5:
[[68  3]
 [ 4 38]]

Extracted Rules:
|--- Feature_20 <= 16.80
|   |--- Feature_27 <= 0.14
|   |   |--- Feature_13 <= 91.56
|   |   |   |--- Feature_13 <= 38.60
|   |   |   |   |--- Feature_14 <= 0.00
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- Feature_14 >  0.00
|   |   |   |   |   |--- class: 0
|   |   |   |--- Feature_13 >  38.60
|   |   |   |   |--- Feature_10 <= 0.42
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- Feature_10 >  0.42
|   |   |   |   |   |--- class: 0
|   |   |--- Feature_13 >  91.56
|   |   |   |--- class: 1
|   |--- Feature_2