In [38]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Single seed reused for the train/validation/test splits and for the weight
# initialisation and shuffling in logistic_regression, so runs are repeatable.
RANDOM_STATE = 42

### Problem 3

In [30]:
def logistic_regression(X, y, batch_size, learning_rate=0.0003, num_epochs=1000):
    """Fit logistic-regression weights with mini-batch SGD.

    A column of ones is prepended to ``X`` so that the first returned weight
    acts as the bias. Weights start from a standard Gaussian draw. In every
    epoch the samples are reshuffled and visited in consecutive mini-batches;
    each step uses the *summed* (not averaged) gradient ``X_b^T (sigmoid - y_b)``
    of the batch, so the effective step size grows with ``batch_size``.

    Randomness comes from a local ``RandomState`` seeded with ``RANDOM_STATE``,
    which yields the same draws as seeding the global NumPy RNG with that value
    but leaves the global RNG state untouched.

    Parameters
    ----------
    X : array-like
        Feature matrix with one row per sample.
    y : array-like
        One target per row of ``X``; the gradient used here is the
        cross-entropy gradient, so targets are expected to be 0/1.
    batch_size : int
        Number of samples per gradient step; must be positive.
    learning_rate : float, default 0.0003
        Step size applied to the summed batch gradient.
    num_epochs : int, default 1000
        Number of full passes over the data.

    Returns
    -------
    numpy.ndarray
        Weight vector of length ``n_features + 1`` (bias first).

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive or ``X`` and ``y`` have different
        numbers of samples.
    """
    # A negative step makes range() empty, which would silently return the
    # random initial weights; a zero step raises a cryptic range() error.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Work on plain arrays so that X and y are always reordered positionally;
    # label-based indexing (e.g. a pandas Series) would desynchronise them
    # after the first shuffle.
    X = np.asarray(X)
    y = np.asarray(y)
    if X.shape[0] != y.shape[0]:
        raise ValueError(
            f"X and y must have the same number of samples, got {X.shape[0]} and {y.shape[0]}"
        )

    rng = np.random.RandomState(RANDOM_STATE)

    # add bias and initialize weights with standard gaussian
    X = np.c_[np.ones(X.shape[0]), X]
    w = rng.randn(X.shape[1])

    # train the model
    for epoch in range(num_epochs):
        # shuffle the data (applied on top of the previous epoch's ordering)
        indices = np.arange(len(X))
        rng.shuffle(indices)
        X = X[indices]
        y = y[indices]

        # mini-batch sgd
        for i in range(0, len(X), batch_size):
            X_batch = X[i:i + batch_size]
            y_batch = y[i:i + batch_size]

            # forward pass; when exp() overflows to inf the sigmoid correctly
            # saturates to 0, so the overflow warning carries no information
            logits = np.dot(X_batch, w)
            with np.errstate(over="ignore"):
                y_pred = 1 / (1 + np.exp(-logits))

            # gradient of the summed cross-entropy over the batch
            grad = np.dot(X_batch.T, y_pred - y_batch)
            w -= learning_rate * grad

    return w

### Problem 4

#### (a)

In [31]:
# Load the Wisconsin breast cancer dataset bundled with scikit-learn and
# pull out the feature matrix and the target vector.
data = load_breast_cancer()
X, y = data.data, data.target

print("Data shape: ", X.shape)
print("Target shape: ", y.shape)

Data shape:  (569, 30)
Target shape:  (569,)


#### (b)

In [46]:
# Target proportions of the three subsets.
train_ratio = 0.70
val_ratio = 0.15
test_ratio = 0.15

# Stage 1: carve off the training set; the remainder is a holdout pool that
# still contains both the validation and the test samples.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X,
    y,
    test_size=1 - train_ratio,
    random_state=RANDOM_STATE,
)

# Stage 2: divide the holdout pool between validation and test according to
# their share of the pool.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout,
    y_holdout,
    test_size=test_ratio / (test_ratio + val_ratio),
    random_state=RANDOM_STATE,
)

# Standardize: the scaler is fitted on the training set only and then applied
# unchanged to the validation and test sets.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

print("Train: ", X_train.shape, y_train.shape)
print("Val: ", X_val.shape, y_val.shape)
print("Test: ", X_test.shape, y_test.shape)

Train:  (398, 30) (398,)
Val:  (85, 30) (85,)
Test:  (86, 30) (86,)


#### (c)

In [47]:
# Count the samples per label over the combined training + validation targets.
# np.unique returns the sorted labels first and their counts second.
train_val_targets = np.concatenate((y_train, y_val))
class_sizes = np.unique(train_val_targets, return_counts=True)
label_counts = class_sizes[1]

print("Label 0 class size: ", label_counts[0])
print("Label 1 class size: ", label_counts[1])


Label 0 class size:  186
Label 1 class size:  297


#### (d)

In [48]:
def cross_entropy_loss(y_true, y_pred, eps=1e-15):
    """Return the mean binary cross-entropy between targets and probabilities.

    Predicted probabilities are clipped to ``[eps, 1 - eps]`` before taking
    logarithms. Without clipping, a probability of exactly 0 or 1 (which a
    saturated sigmoid produces) leads to ``log(0)`` and the loss becomes
    ``inf`` or ``nan``.

    Parameters
    ----------
    y_true : array-like
        Targets, expected to be 0/1.
    y_pred : array-like
        Predicted probabilities of the positive class, same shape as ``y_true``.
    eps : float, default 1e-15
        Clipping margin that keeps the logarithms finite.

    Returns
    -------
    float
        Average of ``-[y*log(p) + (1-y)*log(1-p)]`` over all elements.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.clip(np.asarray(y_pred, dtype=float), eps, 1 - eps)
    return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

def predict(X, w):
    """Return the sigmoid of the linear scores for every row of ``X``.

    A leading column of ones is added to ``X`` so that ``w[0]`` is used as the
    bias term; ``w`` must therefore hold one more entry than ``X`` has columns.
    """
    bias_column = np.ones(X.shape[0])
    design = np.column_stack((bias_column, X))
    scores = np.dot(design, w)
    return 1 / (1 + np.exp(-scores))

# Grid search over learning rate and batch size: train one model per
# combination and keep the weights with the lowest validation cross-entropy.
learning_rates = [0.0003, 0.001, 0.003, 0.01]
batch_sizes = [16, 32, 64, 128, 256]
num_epochs = 1000

# Reset every "best" tracker up front so that re-running this cell can never
# report a stale value left in the kernel by an earlier run.
lowest_val_loss = float('inf')
best_w = None
best_lr = None
best_batch_size = None

for lr in learning_rates:
    for batch_size in batch_sizes:
        w_current = logistic_regression(X_train, y_train, batch_size, lr, num_epochs)
        # get loss on validation set
        y_val_pred = predict(X_val, w_current)
        val_loss = cross_entropy_loss(y_val, y_val_pred)
        # update best model (a NaN or inf loss never satisfies this comparison)
        if val_loss < lowest_val_loss:
            lowest_val_loss = val_loss
            best_lr = lr
            best_batch_size = batch_size
            best_w = w_current

        print(f"Learning rate: {lr}, Batch size: {batch_size}, Validation loss: {val_loss}")

# Fail loudly instead of printing undefined or meaningless "best" values when
# no configuration produced a finite validation loss.
if best_w is None:
    raise RuntimeError("Grid search found no configuration with a finite validation loss")

print(f"Best learning rate: {best_lr}, Best batch size: {best_batch_size}, Best validation loss: {lowest_val_loss}")


Learning rate: 0.0003, Batch size: 16, Validation loss: 0.055431828761968255
Learning rate: 0.0003, Batch size: 32, Validation loss: 0.055431676564869456
Learning rate: 0.0003, Batch size: 64, Validation loss: 0.055431225393166576
Learning rate: 0.0003, Batch size: 128, Validation loss: 0.055430322510303616
Learning rate: 0.0003, Batch size: 256, Validation loss: 0.05542926196690215
Learning rate: 0.001, Batch size: 16, Validation loss: 0.060011764024911526
Learning rate: 0.001, Batch size: 32, Validation loss: 0.06001127372453017
Learning rate: 0.001, Batch size: 64, Validation loss: 0.060009568268255646
Learning rate: 0.001, Batch size: 128, Validation loss: 0.06000853531157793
Learning rate: 0.001, Batch size: 256, Validation loss: 0.06000752171909894
Learning rate: 0.003, Batch size: 16, Validation loss: 0.06932122371537591
Learning rate: 0.003, Batch size: 32, Validation loss: 0.0693159390014663
Learning rate: 0.003, Batch size: 64, Validation loss: 0.0693034097357649
Learning rat

#### (e)

In [49]:
# Score the selected weights on the held-out test set.
y_pred = predict(X_test, best_w)

# Turn probabilities into class decisions once, using a 0.5 threshold.
y_pred_label = y_pred > 0.5

accuracy = accuracy_score(y_test, y_pred_label)
precision = precision_score(y_test, y_pred_label)
recall = recall_score(y_test, y_pred_label)
f1 = f1_score(y_test, y_pred_label)

print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}")

Accuracy: 0.9883720930232558, Precision: 0.9836065573770492, Recall: 1.0, F1: 0.9917355371900827


#### (f)


Summary: Overall, the model performs well on the test set, scoring highly on all four evaluation metrics. The overall accuracy is 98.84%. The precision of 98.36% corresponds to a single false positive (60 of the 61 positive predictions are correct), and the perfect recall indicates that there are no false negatives. Through a grid search over learning rate and batch size, we found that the model achieved the lowest validation loss with lr = 0.0003 and batch_size = 256. In general, for this problem the model performs better with smaller learning rates and larger batch sizes, although the effect of batch size is very small: for a fixed learning rate, the validation losses differ only from about the fifth or sixth decimal place.