In [6]:
from typing import Tuple

import numpy as np
import pandas as pd

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [7]:
data = pd.read_csv("https://raw.githubusercontent.com/changyaochen/MECE4520/master/lectures/lecture_4/breast_cancer_data.csv")
data["label"] = data["diagnosis"].apply(lambda x: 0 if x == "B" else 1)
data.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_mean,...,texture_extreme,perimeter_extreme,area_extreme,smoothness_extreme,compactness_extreme,concavity_extreme,concave_extreme,symmetry_extreme,fractal_extreme,label
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,1
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,1
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,1
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,1
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,1


## Forward propagation of a 2-layer NN

In [8]:
features = [
    "radius_mean",
    "texture_mean",
    "perimeter_mean",
    "area_mean",
    "smoothness_mean",
    "compactness_mean",
    "concavity_mean",
    "concave_mean",
    "symmetry_mean",
    "fractal_mean",
#     "radius_se",
#     "texture_se",
#     "perimeter_se",
#     "area_se",
#     "smoothness_se",
#     "compactness_se",
#     "concavity_se",
#     "concave_se",
#     "symmetry_se",
#     "fractal_se",
#     "radius_extreme",
#     "texture_extreme",
#     "perimeter_extreme",
#     "area_extreme",
#     "smoothness_extreme",
#     "compactness_extreme",
#     "concavity_extreme",
#     "concave_extreme",
#     "symmetry_extreme",
#     "fractal_extreme",
]
label = "label"

# train test split
X, X_test, Y, Y_test = train_test_split(data[features].values, data[label].values, test_size=0.2)

# Standardize the input
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

# formatting
Y = Y.reshape((-1, 1))
Y_test = Y_test.reshape((-1, 1))

In [9]:
# forward pass for a simple 2-layer NN, with 3 hidden units
np.random.seed(10)

def sigmoid(x):
    """Calculates sigmoid function."""
    return 1. / (1 + np.exp(-x))

# parameters for the first layer
W_1 = np.random.normal(size=(3, X.shape[1]))
print(f"Shape of W_1 is {W_1.shape}")

b_1 = np.random.normal(size=(3, 1))
print(f"Shape of b_1 is {b_1.shape}")

# parameters for the second layer
W_2 = np.random.normal(size=(1, 3))
print(f"Shape of W_2 is {W_2.shape}")

b_2 = np.random.normal(size=(1, 1))
print(f"Shape of b_1 is {b_2.shape}")

# calculate the forward propagation
Z_1 = X @ W_1.T
print(f"\nShape of Z_1 is {Z_1.shape}")
print("Samples for Z_1:")
print(Z_1[:5])

A_1 = sigmoid(Z_1 + b_1.T) 
print(f"Shape of A_1 is {A_1.shape}")
print("Samples for A_1:")
print(A_1[:5])

Z_2 = A_1 @ W_2.T
print(f"\nShape of Z_2 is {Z_2.shape}")
print("Samples for Z_2:")
print(Z_1[:5])

A_2 = Y_hat = sigmoid(Z_2 + b_2.T)
print(f"Shape of A_2 is {A_2.shape}")
print("Samples for A_2:")
print(A_2[:5])

Shape of W_1 is (3, 10)
Shape of b_1 is (3, 1)
Shape of W_2 is (1, 3)
Shape of b_1 is (1, 1)

Shape of Z_1 is (455, 3)
Samples for Z_1:
[[-0.92715153 -3.04939693 -3.21565149]
 [-0.47573528 -0.24206723  9.79966072]
 [ 0.16174361  0.22528869 -7.11657099]
 [-0.13512266 -0.56932038 -1.11101173]
 [-1.15595304 -2.6071221  -0.2024807 ]]
Shape of A_1 is (455, 3)
Samples for A_1:
[[2.32460663e-01 2.66305167e-02 4.38163404e-02]
 [3.22337014e-01 3.11873239e-01 9.99951426e-01]
 [4.73631094e-01 4.19696277e-01 9.25858648e-04]
 [4.00726281e-01 2.46265606e-01 2.73228549e-01]
 [1.94149426e-01 4.08386208e-02 4.82563971e-01]]

Shape of Z_2 is (455, 1)
Samples for Z_2:
[[-0.92715153 -3.04939693 -3.21565149]
 [-0.47573528 -0.24206723  9.79966072]
 [ 0.16174361  0.22528869 -7.11657099]
 [-0.13512266 -0.56932038 -1.11101173]
 [-1.15595304 -2.6071221  -0.2024807 ]]
Shape of A_2 is (455, 1)
Samples for A_2:
[[0.58246866]
 [0.70052798]
 [0.67349519]
 [0.64219928]
 [0.61190097]]


## Training a NN with backward propagation

In [10]:
def forward_prop(
    X: np.array,
    W_1: np.array,
    b_1: np.array,
    W_2: np.array,
    b_2: np.array,
) -> Tuple:
    """Performs the forward propagation of the given NN."""
    # Note the NN structure is passed in from outside.
    Z_1 = X @ W_1.T
    A_1 = sigmoid(Z_1 + b_1.T)
    
    Z_2 = A_1 @ W_2.T
    A_2 = Y = sigmoid(Z_2 + b_2.T)
    
    return A_2, Z_2, A_1, Z_1

Y_hat, _, _, _ = forward_prop(X=X, W_1=W_1, b_1=b_1, W_2=W_2, b_2=b_2)

In [11]:
def derivatives_by_backprop(
    X: np.array,
    Y: np.array,
    W_1: np.array,
    b_1: np.array,
    W_2: np.array,
    b_2: np.array,
) -> Tuple:
    """Calculates the derivatives of the parameters by backforward propagation.
    
    Here we assume it is a binary classification problem, with sigmoid activation functions.
    """
    # forward propagation
    dW_2, db_2, dW_1, db_1 = 0, 0, 0, 0
    Y_hat, Z_2, A_1, Z_1 = forward_prop(X=X, W_1=W_1, b_1=b_1, W_2=W_2, b_2=b_2)
    n = len(Y_hat)
    
    loss = -np.mean(np.multiply(Y, np.log(Y_hat)) + np.multiply(1 - Y, np.log(1 - Y_hat)))
    
    dZ_2 = Y_hat - Y
    dW_2 = dZ_2.T @ A_1 / n
    db_2 = np.mean(dZ_2.T, axis=1, keepdims=True) 
    
    dZ_1 = np.multiply(dZ_2 @ W_2, np.multiply(A_1, 1 - A_1))
    dW_1 = (dZ_1.T @ X) / n
    db_1 = np.mean(dZ_1.T, axis=1, keepdims=True) 
    
    return dW_2, db_2, dW_1, db_1, loss

dW_2, db_2, dW_1, db_1, loss = derivatives_by_backprop(X=X, Y=Y, W_1=W_1, b_1=b_1, W_2=W_2, b_2=b_2)    

In [12]:
def gradient_descent(
    X: np.array,
    Y: np.array,
    W_1_init: np.array,
    b_1_init: np.array,
    W_2_init: np.array,
    b_2_init: np.array,
    learning_rate: float = 0.01,
    epsilon: float = 1e-6,
    verbose: bool = False,
) -> Tuple:
    """Runs gradient descent to fit the NN via backprop."""
    W_1 = W_1_init
    b_1 = b_1_init
    W_2 = W_2_init
    b_2 = b_2_init
    losses = [float("inf"), ]
    roc_auc_scores = [0.5, ]
    
    diff_in_loss = float("inf")
    iteration = 0
    while abs(diff_in_loss) > epsilon:
        iteration += 1
        dW_2, db_2, dW_1, db_1, loss = derivatives_by_backprop(
            X=X, Y=Y, W_1=W_1, b_1=b_1, W_2=W_2, b_2=b_2
        ) 
        
        W_1 -= learning_rate * dW_1
        b_1 -= learning_rate * db_1
        W_2 -= learning_rate * dW_2
        b_2 -= learning_rate * db_2
        
        losses.append(loss)
        diff_in_loss = losses[-1] - losses[-2]
        
        Y_hat, _, _, _ = forward_prop(X=X, W_1=W_1, b_1=b_1, W_2=W_2, b_2=b_2)
        roc_auc = roc_auc_score(y_true=Y, y_score=Y_hat)
        roc_auc_scores.append(roc_auc)
        
        if verbose and iteration % 10 == 0:
            print(loss, roc_auc)
    return W_1, b_1, W_2, b_2, losses

In [36]:
# parameters for the first layer
W_1_init = np.random.normal(size=(3, X.shape[1]))
b_1_init = np.random.normal(size=(3, 1))

# parameters for the second layer
W_2_init = np.random.normal(size=(1, 3))
b_2_init = np.random.normal(size=(1, 1))

W_1, b_1, W_2, b_2, losses = gradient_descent(
    X=X,
    Y=Y,
    W_1_init=W_1_init,
    b_1_init=b_1_init,
    W_2_init=W_2_init,
    b_2_init=b_2_init,
    learning_rate=0.1,
    epsilon=1e-3,
    verbose=True,
)

1.2808512372525476 0.09713659121943145
0.9842747111109987 0.11370877643066994
0.8229444385574967 0.19760417097695207
0.7373637328279206 0.343029751313775
0.6862452448467318 0.49280009930897506
0.649633293454997 0.6445773161749494
0.6187031070389205 0.761555013034303
0.589438001136619 0.8297678652708238
0.5609986106431064 0.8720776265154964
0.5348803689834151 0.8982703686845699
0.5119956567398527 0.9158356436462944
0.4920619421347611 0.9268010096412463
0.4744519982329816 0.9337526378946497
0.4586180932800434 0.9388422228658915
0.44416343908290656 0.9439111184673314
0.43081297938654994 0.9474696900732403
0.4183762172047802 0.9502834443662846
0.40671981883197506 0.952373070716266
0.3957490579304668 0.9548351057226796
0.38539502788867214 0.9567178383746431


In [29]:
Y_test_hat, _, _, _ = forward_prop(X=X_test, W_1=W_1, b_1=b_1, W_2=W_2, b_2=b_2)
roc_auc_score(y_true=Y_test, y_score=Y_test_hat)

0.9918113331149688

In [34]:
from tensorflow import keras
from tensorflow.keras import layers

def keras_model(nn_size: int, num_features: int, num_layers: int):
    """Creates a simple Keras model."""
    inputs = keras.Input(
        shape=(num_features, ), name="inputs")
    x = inputs
    for i in range(num_layers):
        x = layers.Dense(
            nn_size, activation="sigmoid", name=f"desnse_layer_{i}")(x)

    outputs = layers.Dense(
        1, activation="sigmoid", name="output")(x)

    model = keras.Model(
        inputs=inputs, outputs=outputs, name="simple_model")
    model.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=["AUC"])
    return model

model = keras_model(nn_size=3, num_features=X.shape[1], num_layers=1)
history = model.fit(
    X,
    Y,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
    shuffle=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [35]:
roc_auc_score(y_true=Y_test, y_score=model.predict(X_test))

0.9315427448411397