In [14]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [6]:
# Load scikit-learn's built-in breast cancer dataset directly as a
# (feature matrix, label vector) pair instead of a Bunch object.
X, y = load_breast_cancer(
    return_X_y=True,
)

In [8]:
# Hold out 20% of the samples for testing. stratify=y keeps the class
# proportions the same in both partitions, and the fixed random_state makes
# the split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Sanity check: (n_samples, n_features) of each partition.
print("X_train shape:", X_train.shape)
print("X_test shape :", X_test.shape)



X_train shape: (455, 30)
X_test shape : (114, 30)


In [17]:

# Baseline: logistic regression with the library defaults (L2 penalty),
# with max_iter set to 10000. fit() returns the estimator itself, so the
# model can be built and trained on the training split in one expression.
log_reg_base = LogisticRegression(max_iter=10000).fit(X_train, y_train)

# Learned weights (one per input feature) and the bias term.
print("Baseline Logistic Regression coefficients:")
print(log_reg_base.coef_)
print("Intercept:", log_reg_base.intercept_)

# Predicted labels for the training and test partitions.
y_train_pred_base, y_test_pred_base = (
    log_reg_base.predict(features) for features in (X_train, X_test)
)

# Fraction of correctly classified samples on each partition.
acc_train_base = accuracy_score(y_train, y_train_pred_base)
acc_test_base = accuracy_score(y_test, y_test_pred_base)

print("Baseline accuracy (train):", acc_train_base)
print("Baseline accuracy (test) :", acc_test_base)


Baseline Logistic Regression coefficients:
[[ 0.80708196  0.11328911 -0.28307687  0.02521483 -0.16733751 -0.20217576
  -0.45506264 -0.25243782 -0.30921281 -0.03116865 -0.05511739  1.10334223
   0.08564572 -0.09595851 -0.02231761  0.05911651 -0.02139347 -0.03540418
  -0.04039299  0.0137089   0.09521451 -0.37693116 -0.08781235 -0.01459524
  -0.32483321 -0.74767161 -1.32332634 -0.56343008 -0.78785848 -0.09156122]]
Intercept: [29.17330007]
Baseline accuracy (train): 0.9560439560439561
Baseline accuracy (test) : 0.9649122807017544


In [18]:
# C is the inverse of the regularization strength:
# small C -> strong regularization, large C -> weak regularization.

# Candidate values for C: 13 points evenly spaced on a log scale
# between 1e-3 and 1e3.
C_values = np.logspace(-3, 3, 13)

# Hyperparameter grid: every C value is tried with both penalty types.
param_grid = {
    "C": C_values,
    "penalty": ["l1", "l2"],
    "solver": ["liblinear"],   # supports both L1 and L2
}

# Base Logistic Regression model for grid search.
# The liblinear solver shuffles the data internally, so random_state is
# pinned (same seed as the train/test split) to keep the search repeatable
# after a kernel restart.
log_reg = LogisticRegression(max_iter=10000, random_state=42)

log_cv = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
)

# GridSearchCV tries every parameter combination with 5-fold
# cross-validation on the training split only; the test split stays unseen.
log_cv.fit(X_train, y_train)

print("Best hyperparameters:", log_cv.best_params_)
print("Best cross-validation accuracy:", log_cv.best_score_)


Best hyperparameters: {'C': np.float64(1000.0), 'penalty': 'l2', 'solver': 'liblinear'}
Best cross-validation accuracy: 0.9670329670329669


In [19]:
# Take the winning model from the grid search (best C and penalty) and
# measure its accuracy on the held-out test split.
best_log_model = log_cv.best_estimator_

y_test_pred_best = best_log_model.predict(X_test)
acc_test_best = accuracy_score(y_true=y_test, y_pred=y_test_pred_best)

print("Best tuned model accuracy on test:", acc_test_best)


Best tuned model accuracy on test: 0.9736842105263158


In [20]:
# Reuse the C selected by the grid search so both penalties are compared at
# the same regularization strength.
# NOTE: best_C was selected jointly with the winning penalty, so it is not
# necessarily the best C for the other penalty.
best_C = log_cv.best_params_["C"]

# Settings shared by both models. liblinear shuffles the data internally,
# so random_state is pinned (same seed as the train/test split) to make the
# fitted coefficients repeatable after a kernel restart.
liblinear_params = dict(
    C=best_C,
    solver="liblinear",
    max_iter=10000,
    random_state=42,
)

# L1 (Lasso-like) logistic regression
log_l1 = LogisticRegression(penalty="l1", **liblinear_params)

# L2 (Ridge-like) logistic regression
log_l2 = LogisticRegression(penalty="l2", **liblinear_params)

# Train both models on the training data
log_l1.fit(X_train, y_train)
log_l2.fit(X_train, y_train)

# Predict on train and test for L1 model
y_train_pred_l1 = log_l1.predict(X_train)
y_test_pred_l1 = log_l1.predict(X_test)

# Predict on train and test for L2 model
y_train_pred_l2 = log_l2.predict(X_train)
y_test_pred_l2 = log_l2.predict(X_test)

# Compute accuracies for L1 model
acc_train_l1 = accuracy_score(y_train, y_train_pred_l1)
acc_test_l1 = accuracy_score(y_test, y_test_pred_l1)

# Compute accuracies for L2 model
acc_train_l2 = accuracy_score(y_train, y_train_pred_l2)
acc_test_l2 = accuracy_score(y_test, y_test_pred_l2)

print("L1 Logistic Regression accuracy (train):", acc_train_l1)
print("L1 Logistic Regression accuracy (test) :", acc_test_l1)

print("L2 Logistic Regression accuracy (train):", acc_train_l2)
print("L2 Logistic Regression accuracy (test) :", acc_test_l2)




L1 Logistic Regression accuracy (train): 0.9956043956043956
L1 Logistic Regression accuracy (test) : 0.9649122807017544
L2 Logistic Regression accuracy (train): 0.9846153846153847
L2 Logistic Regression accuracy (test) : 0.9736842105263158


In [21]:
# For binary classification coef_ is a (1, n_features) array, so indexing
# with [0] yields the flat coefficient vector of each fitted model.
coef_l1, coef_l2 = log_l1.coef_[0], log_l2.coef_[0]

# Sparsity check: count the coefficients that are not exactly zero.
num_nonzero_l1 = np.sum(coef_l1 != 0)
num_nonzero_l2 = np.sum(coef_l2 != 0)

print("Number of non-zero L1 coefficients:", num_nonzero_l1)
print("Number of non-zero L2 coefficients:", num_nonzero_l2)

# Dump the raw coefficient vectors for side-by-side inspection.
print("L1 coefficients:")
print(coef_l1)

print("L2 coefficients:")
print(coef_l2)

Number of non-zero L1 coefficients: 27
Number of non-zero L2 coefficients: 30
L1 coefficients:
[ 4.68851957e+00 -7.50167479e-01  4.61136380e-01 -4.74364773e-02
 -1.89542797e+02  2.84392669e+02 -2.53741877e+01 -2.74555356e+02
  9.72169787e+00 -1.60476769e+02 -5.83248092e+01  1.57698604e+00
  7.51402413e+00 -2.06453966e-01  0.00000000e+00 -1.38225193e+02
  4.77580923e+02 -1.43567146e+03  1.41987421e+02  0.00000000e+00
  3.31277666e+00 -4.82313473e-01  1.39698984e-01 -9.21667731e-02
  0.00000000e+00  1.69307371e+01 -9.15965957e+01 -9.60981129e+00
 -6.52071983e+01  4.10203306e+01]
L2 coefficients:
[ 4.66732120e+00  3.99657496e-02 -4.37361558e-01 -1.06159934e-02
 -4.83151492e+00  1.73782484e+00 -3.78299694e+00 -7.76561248e+00
 -7.03123819e+00  1.06818217e+00  1.50282613e+00  2.27378587e+00
 -3.94292982e-01 -1.28332658e-01 -8.23302808e-01  7.55222156e+00
  9.74274384e+00 -6.50671727e-01  1.22700943e+00  1.31774105e+00
  4.72574116e-01 -4.41149628e-01  3.92848204e-02 -2.27574000e-02
 -9.99507