In [44]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.datasets import load_breast_cancer

In [45]:
data = load_breast_cancer()
X = data.data
y = data.target

print(f"Data shape: {X.shape}, Labels shape: {y.shape}")
print("Column names:", data.feature_names)

Data shape: (569, 30), Labels shape: (569,)
Column names: ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


In [46]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [47]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [48]:
# y predicted is sigmoid(w * x + b), y true is boolean 0 or 1
def calculate_logistic_loss(y_predicted, y_true):
    eps = 1e-10
    return -np.mean(y_true * np.log(y_predicted + eps) + (1 - y_true) * np.log(1 - y_predicted + eps))

In [49]:
def accuracy(y_true, y_pred):
    return np.mean((y_pred >= 0.5) == y_true)

In [50]:
class LogisticRegressionCustom:
    def __init__(self, lr=0.01, iterations=5000, reg_lambda=0.0):
        self.lr = lr
        self.iterations = iterations
        self.reg_lambda = reg_lambda
        self.loss_history = []

    def fit(self, X, y):
        n_samples, n_features = X.shape

        self.w = np.zeros(n_features)
        self.b = 0

        for _ in range(self.iterations):
            pred = np.dot(X, self.w) + self.b # x*w + b, prediction is a linear combination
            y_pred = sigmoid(pred) # apply sigmoid to get probabilities

            # gradients
            dw = ((1 / n_samples) * np.dot(X.T, (y_pred - y))) + (self.reg_lambda / n_samples) * self.w # L2 regularization
            db = (1 / n_samples) * np.sum(y_pred - y)

            # update
            self.w -= self.lr * dw
            self.b -= self.lr * db
            loss = calculate_logistic_loss(y_pred, y)
            self.loss_history.append(loss)

    # return estimated probs for class 1
    def predict_proba(self, X):
        return sigmoid(np.dot(X, self.w) + self.b)

    # return class labels 0 or 1
    def predict(self, X):
        return (self.predict_proba(X) >= 0.5).astype(int)

In [51]:
# polynomial features with degree 2 without reg
model = LogisticRegressionCustom(lr=0.01, iterations=3000, reg_lambda=0.0)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy w/o no regularization:", accuracy(y_test, y_pred))

poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

model_poly = LogisticRegressionCustom(lr=0.01, iterations=3000, reg_lambda=0.0)
model_poly.fit(X_train_poly, y_train)

y_pred_poly = model_poly.predict(X_test_poly)
print("Accuracy for poly degree 2 w/o regularization:", accuracy(y_test, y_pred_poly))

  return 1 / (1 + np.exp(-z))


Accuracy w/o no regularization: 0.956140350877193


  return 1 / (1 + np.exp(-z))


Accuracy for poly degree 2 w/o regularization: 0.9736842105263158


In [52]:
# polynomial features with degrees 1-4 without reg, increase degree until overfitting happens
degrees = [1, 2, 3, 4]
for d in degrees:
    poly = PolynomialFeatures(degree=d, include_bias=False)
    X_train_d = poly.fit_transform(X_train)
    X_test_d = poly.transform(X_test)

    model_d = LogisticRegressionCustom(lr=0.01, iterations=3000, reg_lambda=0.0)
    model_d.fit(X_train_d, y_train)

    train_acc = accuracy(y_train, model_d.predict(X_train_d))
    test_acc = accuracy(y_test, model_d.predict(X_test_d))

    print(f"Degree {d} - Train Acc={train_acc}, Test Acc={test_acc}")

  return 1 / (1 + np.exp(-z))


Degree 1 - Train Acc=0.9164835164835164, Test Acc=0.956140350877193


  return 1 / (1 + np.exp(-z))


Degree 2 - Train Acc=0.9142857142857143, Test Acc=0.9736842105263158


  return 1 / (1 + np.exp(-z))


Degree 3 - Train Acc=0.9120879120879121, Test Acc=0.9736842105263158


  return 1 / (1 + np.exp(-z))


Degree 4 - Train Acc=0.8923076923076924, Test Acc=0.9385964912280702


In [53]:
# use reg for degree 4 polynomial features
poly = PolynomialFeatures(degree=4, include_bias=False)
X_train_d4 = poly.fit_transform(X_train)
X_test_d4 = poly.transform(X_test)

reg_values = [0.0, 0.01, 0.1, 1.0, 10]


for reg in reg_values:
    model_reg = LogisticRegressionCustom(
        lr=0.01, iterations=3000, reg_lambda=reg
    )
    model_reg.fit(X_train_d4, y_train)

    train_acc = accuracy(y_train, model_reg.predict(X_train_d4))
    test_acc = accuracy(y_test, model_reg.predict(X_test_d4))

    print(f"Reg={reg}: Train Acc={train_acc}, Test Acc={test_acc}")

  return 1 / (1 + np.exp(-z))


Reg=0.0: Train Acc=0.8923076923076924, Test Acc=0.9385964912280702
Reg=0.01: Train Acc=0.9186813186813186, Test Acc=0.9649122807017544
Reg=0.1: Train Acc=0.8923076923076924, Test Acc=0.9385964912280702
Reg=1.0: Train Acc=0.9142857142857143, Test Acc=0.9649122807017544
Reg=10: Train Acc=0.9230769230769231, Test Acc=0.9473684210526315
