In [1]:
from typing import Tuple

import numpy as np
import pandas as pd

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
data = pd.read_csv("https://raw.githubusercontent.com/changyaochen/MECE4520/master/lectures/lecture_4/breast_cancer_data.csv")
data["label"] = data["diagnosis"].apply(lambda x: 0 if x == "B" else 1)
data.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_mean,...,texture_extreme,perimeter_extreme,area_extreme,smoothness_extreme,compactness_extreme,concavity_extreme,concave_extreme,symmetry_extreme,fractal_extreme,label
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,1
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,1
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,1
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,1
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,1


## Forward propagation of a 2-layer NN

In [3]:
features = [
    "radius_mean",
    "texture_mean",
    "perimeter_mean",
    "area_mean",
    "smoothness_mean",
    "compactness_mean",
    "concavity_mean",
    "concave_mean",
    "symmetry_mean",
    "fractal_mean",
]
label = "label"

# train test split
X_raw, X_raw_test, Y, Y_test = train_test_split(data[features].values, data[label].values, test_size=0.2, random_state=42)

# Standardize the input
scaler = StandardScaler()
scaler.fit(X_raw)
X = scaler.transform(X_raw)
X_test = scaler.transform(X_raw_test)

# formatting
Y = Y.reshape((-1, 1))
Y_test = Y_test.reshape((-1, 1))

In [4]:
# forward pass for a simple 2-layer NN, with 3 hidden units
np.random.seed(10)

def sigmoid(x):
    """Calculates sigmoid function."""
    return 1. / (1 + np.exp(-x))

# parameters for the first layer
W_1 = np.random.normal(size=(3, X.shape[1]))
print(f"Shape of W_1 is {W_1.shape}")

b_1 = np.random.normal(size=(3, 1))
print(f"Shape of b_1 is {b_1.shape}")

# parameters for the second layer
W_2 = np.random.normal(size=(1, 3))
print(f"Shape of W_2 is {W_2.shape}")

b_2 = np.random.normal(size=(1, 1))
print(f"Shape of b_1 is {b_2.shape}")

# calculate the forward propagation
Z_1 = X @ W_1.T
print(f"\nShape of Z_1 is {Z_1.shape}")
print("Samples for Z_1:")
print(Z_1[:5])

A_1 = sigmoid(Z_1 + b_1.T) 
print(f"Shape of A_1 is {A_1.shape}")
print("Samples for A_1:")
print(A_1[:5])

Z_2 = A_1 @ W_2.T
print(f"\nShape of Z_2 is {Z_2.shape}")
print("Samples for Z_2:")
print(Z_1[:5])

A_2 = Y_hat = sigmoid(Z_2 + b_2.T)
print(f"Shape of A_2 is {A_2.shape}")
print("Samples for A_2:")
print(A_2[:5])

Shape of W_1 is (3, 10)
Shape of b_1 is (3, 1)
Shape of W_2 is (1, 3)
Shape of b_1 is (1, 1)

Shape of Z_1 is (455, 3)
Samples for Z_1:
[[ 0.16410112 -4.76306361  3.93309998]
 [-0.46604358  4.1992739   9.5658238 ]
 [-1.60754809 -0.23753874 -1.01727238]
 [ 1.37695245  2.28649564 -5.09016965]
 [ 0.12721277  3.49293739  0.32441791]]
Shape of A_1 is (455, 3)
Samples for A_1:
[[0.47421887 0.00490603 0.98314001]
 [0.32445766 0.97466643 0.99993863]
 [0.13297977 0.31284592 0.29223288]
 [0.75206111 0.85032936 0.00698167]
 [0.46503108 0.94996148 0.61233221]]

Shape of Z_2 is (455, 1)
Samples for Z_2:
[[ 0.16410112 -4.76306361  3.93309998]
 [-0.46604358  4.1992739   9.5658238 ]
 [-1.60754809 -0.23753874 -1.01727238]
 [ 1.37695245  2.28649564 -5.09016965]
 [ 0.12721277  3.49293739  0.32441791]]
Shape of A_2 is (455, 1)
Samples for A_2:
[[0.59207723]
 [0.84761911]
 [0.69066552]
 [0.76062638]
 [0.82363926]]


## Training a NN with backward propagation

In [5]:
def forward_prop(
    X: np.array,
    W_1: np.array,
    b_1: np.array,
    W_2: np.array,
    b_2: np.array,
) -> Tuple:
    """Performs the forward propagation of the given NN."""
    # Note the NN structure is passed in from outside.
    Z_1 = X @ W_1.T
    A_1 = sigmoid(Z_1 + b_1.T)
    
    Z_2 = A_1 @ W_2.T
    A_2 = Y = sigmoid(Z_2 + b_2.T)
    
    return A_2, Z_2, A_1, Z_1

Y_hat, _, _, _ = forward_prop(X=X, W_1=W_1, b_1=b_1, W_2=W_2, b_2=b_2)

In [6]:
def derivatives_by_backprop(
    X: np.array,
    Y: np.array,
    W_1: np.array,
    b_1: np.array,
    W_2: np.array,
    b_2: np.array,
) -> Tuple:
    """Calculates the derivatives of the parameters by backforward propagation.
    
    Here we assume it is a binary classification problem, with sigmoid activation functions.
    """
    # forward propagation
    dW_2, db_2, dW_1, db_1 = 0, 0, 0, 0
    Y_hat, Z_2, A_1, Z_1 = forward_prop(X=X, W_1=W_1, b_1=b_1, W_2=W_2, b_2=b_2)
    n = len(Y_hat)
    
    loss = -np.mean(np.multiply(Y, np.log(Y_hat)) + np.multiply(1 - Y, np.log(1 - Y_hat)))
    
    dZ_2 = Y_hat - Y
    dW_2 = dZ_2.T @ A_1 / n
    db_2 = np.mean(dZ_2.T, axis=1, keepdims=True) 
    
    dZ_1 = np.multiply(dZ_2 @ W_2, np.multiply(A_1, 1 - A_1))
    dW_1 = (dZ_1.T @ X) / n
    db_1 = np.mean(dZ_1.T, axis=1, keepdims=True) 
    
    return dW_2, db_2, dW_1, db_1, loss

dW_2, db_2, dW_1, db_1, loss = derivatives_by_backprop(X=X, Y=Y, W_1=W_1, b_1=b_1, W_2=W_2, b_2=b_2)    

In [7]:
def gradient_descent(
    X: np.array,
    Y: np.array,
    W_1_init: np.array,
    b_1_init: np.array,
    W_2_init: np.array,
    b_2_init: np.array,
    learning_rate: float = 0.01,
    epsilon: float = 1e-6,
    verbose: bool = False,
) -> Tuple:
    """Runs gradient descent to fit the NN via backprop."""
    W_1 = W_1_init
    b_1 = b_1_init
    W_2 = W_2_init
    b_2 = b_2_init
    losses = [float("inf"), ]
    roc_auc_scores = [0.5, ]
    
    diff_in_loss = float("inf")
    iteration = 0
    while abs(diff_in_loss) > epsilon:
        iteration += 1
        dW_2, db_2, dW_1, db_1, loss = derivatives_by_backprop(
            X=X, Y=Y, W_1=W_1, b_1=b_1, W_2=W_2, b_2=b_2
        ) 
        
        W_1 -= learning_rate * dW_1
        b_1 -= learning_rate * db_1
        W_2 -= learning_rate * dW_2
        b_2 -= learning_rate * db_2
        
        losses.append(loss)
        diff_in_loss = losses[-1] - losses[-2]
        
        Y_hat, _, _, _ = forward_prop(X=X, W_1=W_1, b_1=b_1, W_2=W_2, b_2=b_2)
        roc_auc = roc_auc_score(y_true=Y, y_score=Y_hat)
        roc_auc_scores.append(roc_auc)
        
        if verbose and iteration % 10 == 0:
            print(loss, roc_auc)
    return W_1, b_1, W_2, b_2, losses

In [8]:
# parameters for the first layer
np.random.seed(42)
W_1_init = np.random.normal(size=(3, X.shape[1]))
b_1_init = np.random.normal(size=(3, 1))

# parameters for the second layer
W_2_init = np.random.normal(size=(1, 3))
b_2_init = np.random.normal(size=(1, 1))

W_1, b_1, W_2, b_2, losses = gradient_descent(
    X=X,
    Y=Y,
    W_1_init=W_1_init,
    b_1_init=b_1_init,
    W_2_init=W_2_init,
    b_2_init=b_2_init,
    learning_rate=0.1,
    epsilon=1e-3,
    verbose=True,
)

0.783355706236572 0.1278603053750983
0.6919297915722732 0.40731162328795467
0.6221831830749797 0.7705341995282824
0.5667720895357118 0.8880498200024827
0.5215585720622693 0.9298423470021102
0.48399626055251443 0.9451111019158357
0.4523753008520904 0.9553523399677246
0.4254695544919871 0.9608557123350022
0.4023620540738291 0.9643729052013075
0.3823494894203707 0.9665039102908926
0.3648838371694929 0.9681383705052344
0.3495334179228035 0.9694211114329457
0.3359553890933617 0.9708693673190715
0.32387555184301897 0.9714279803037199
0.3130730233613099 0.9721107295071791


In [9]:
# evaluate the model on the test set
Y_test_hat, _, _, _ = forward_prop(X=X_test, W_1=W_1, b_1=b_1, W_2=W_2, b_2=b_2)
roc_auc_score(y_true=Y_test, y_score=Y_test_hat)

0.9905011464133638

In [10]:
# train a NN with Keras
from tensorflow import keras
from tensorflow.keras import layers

def keras_model(nn_size: int, num_features: int, num_layers: int):
    """Creates a simple Keras model."""
    inputs = keras.Input(
        shape=(num_features, ), name="inputs")
    x = inputs
    for i in range(num_layers):
        x = layers.Dense(
            nn_size, activation="sigmoid", name=f"desnse_layer_{i}")(x)

    outputs = layers.Dense(
        1, activation="sigmoid", name="output")(x)

    model = keras.Model(
        inputs=inputs, outputs=outputs, name="simple_model")
    model.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=["AUC"])
    return model

model = keras_model(nn_size=3, num_features=X.shape[1], num_layers=1)
history = model.fit(
    x=X,
    y=Y,
    batch_size=32,
    epochs=20,
    validation_data=(X_test, Y_test),
    verbose=1,
    shuffle=True,
)

2021-11-08 01:04:07.424834: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-11-08 01:04:07.490654: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [11]:
# evaluate the model on the test set
roc_auc_score(y_true=Y_test, y_score=model.predict(X_test))

0.9790370127743204