In [None]:
# SVM on sklearn's Breast Cancer dataset
# --------------------------------------
# This notebook:
# 1) loads the dataset
# 2) splits train/test
# 3) builds a Pipeline: StandardScaler -> SVC
# 4) runs a small GridSearchCV over a linear kernel (tuning C) and an RBF kernel (tuning C and gamma)
# 5) evaluates on the held-out test set with metrics + ROC AUC

In [15]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import numpy as np

In [16]:
# 1) Load data
# The loader returns a dict-like bundle; pull out the feature matrix, the
# integer labels and the human-readable class names (used later to label the
# classification report).
data = load_breast_cancer()
X = data["data"]
y = data["target"]
# Label order: index 0 -> 'malignant', index 1 -> 'benign'
target_names = data["target_names"]


In [17]:
# 2) Train/test split
# Hold out 30% of the samples for the final evaluation. Stratifying on y keeps
# the class proportions similar in both partitions, and the fixed seed makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
    stratify=y,
)

In [11]:
# 3) Pipeline: scale -> SVM
# The step names matter: the hyperparameter grid addresses the classifier
# through the "svc__" prefix. Scaling lives inside the pipeline so it is
# re-fitted on whatever data the pipeline itself is fitted on.
pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        # probability=True enables predict_proba, which the evaluation step calls.
        ("svc", SVC(class_weight=None, probability=True, random_state=42)),
    ]
)

In [18]:
# 4) Hyperparameter grid
#    Two independent sub-grids, one per kernel:
#      - linear: only the regularisation strength C is tuned (4 candidates)
#      - rbf:    C is crossed with gamma (4 x 3 = 12 candidates)
#    Keys use the "<step name>__<parameter>" form to reach the pipeline's "svc" step.
param_grid = [
    dict(
        svc__kernel=["linear"],
        svc__C=[0.1, 1, 10, 100],
    ),
    dict(
        svc__kernel=["rbf"],
        svc__C=[0.1, 1, 10, 100],
        svc__gamma=["scale", 0.01, 0.001],
    ),
]


In [19]:
# Model selection: 5-fold cross-validated grid search on the training split only,
# ranked by ROC AUC (a threshold-independent ranking metric).
# refit=True retrains the best configuration on the full training split, so
# grid.best_estimator_ is ready to use afterwards; n_jobs=-1 uses all cores.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    refit=True,
    verbose=0,
).fit(X_train, y_train)  # fit() returns the fitted search object itself

# Report the winning configuration and its mean cross-validated score.
print("Best params:", grid.best_params_)
print("CV best ROC AUC: {:.3f}".format(grid.best_score_))

Best params: {'svc__C': 1, 'svc__gamma': 0.01, 'svc__kernel': 'rbf'}
CV best ROC AUC: 0.994


In [20]:
# 5) Evaluate on test set
# The refitted winner of the grid search is scored once on the held-out split.
best_model = grid.best_estimator_

# Hard class predictions feed the report and the confusion matrix.
y_pred = best_model.predict(X_test)

# Column 1 is the estimated probability of label 1. With this dataset's label
# order that is 'benign', so "positive class" here means benign, not malignant.
y_proba = best_model.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_true=y_test, y_score=y_proba)

print("\nClassification report:")
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=target_names,
    )
)

# Rows and columns follow the label order: 0 = malignant, 1 = benign.
print("Confusion matrix (rows=true, cols=pred):")
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

print("Test ROC AUC: {:.3f}".format(test_auc))


Classification report:
              precision    recall  f1-score   support

   malignant       0.98      0.92      0.95        64
      benign       0.95      0.99      0.97       107

    accuracy                           0.96       171
   macro avg       0.97      0.96      0.96       171
weighted avg       0.97      0.96      0.96       171

Confusion matrix (rows=true, cols=pred):
[[ 59   5]
 [  1 106]]
Test ROC AUC: 0.996
