In [7]:
import numpy as np
from sklearn.datasets import load_breast_cancer

# Load the scikit-learn breast-cancer dataset and wrap it in np.matrix objects
# so that `*` between them means matrix multiplication in the code below.
data = load_breast_cancer()
X = np.matrix(data.data)        # one row per sample, one column per feature
y = np.matrix(data.target).T    # targets transposed into a column vector
M, N = X.shape                  # M = number of rows of X, N = number of columns

# Normalize each input feature

def normalize(X):
    """Standardize every column of X to zero mean and unit standard deviation.

    Parameters
    ----------
    X : 2-D ndarray or np.matrix
        One sample per row, one feature per column.

    Returns
    -------
    Array of the same shape (and the same ndarray/matrix flavour) as X in
    which each column has had its mean subtracted and has been divided by
    its standard deviation.  A column that is constant (zero standard
    deviation) is returned as all zeros instead of NaN.
    """
    # Broadcasting subtracts the per-column mean from every row.
    centered = X - np.mean(X, 0)
    spread = np.std(centered, 0)
    # A constant feature has zero spread; dividing by it would give 0/0 = NaN
    # and poison everything computed from the result.  Divide by 1 instead so
    # the already-centred column simply stays at zero.
    safe_spread = np.where(spread == 0, 1.0, spread)
    return np.divide(centered, safe_spread)

XX = normalize(X)

# Let's start with a 3-layer network with sigmoid activation functions,
# 6 units in layer 1, and 5 units in layer 2.

h1 = 6
h2 = 5

# Unit counts from the input (N features) through to the single output unit.
layer_sizes = [N, h1, h2, 1]

# W[l] has shape (units in layer l-1, units in layer l) and b[l] has shape
# (units in layer l, 1); both are drawn from a normal distribution with
# mean 0 and standard deviation 0.1.  Index 0 is an empty placeholder so
# that list indices line up with layer numbers.
W = [[]] + [np.random.normal(0, 0.1, [fan_in, fan_out])
            for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:])]
b = [[]] + [np.random.normal(0, 0.1, [fan_out, 1])
            for fan_out in layer_sizes[1:]]
L = len(W) - 1  # number of weight layers

def act(z):
    """Logistic sigmoid 1 / (1 + exp(-z)), applied element-wise to z."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

def actder(z):
    """Derivative of the activation `act` evaluated element-wise at z.

    Computed as act(z) * (1 - act(z)), with the activation evaluated once
    and reused for both factors.
    """
    sig = act(z)
    return np.multiply(sig, 1 - sig)

def ff(x,W,b):
    """Feed x forward through the network and return the last layer's output.

    W and b are lists indexed by layer number starting at 1; the entry at
    index 0 is never read.  Each layer computes act(W[l].T * a + b[l]).
    NOTE(review): `*` is only a matrix product when the operands are
    np.matrix objects -- confirm callers never pass plain ndarrays.
    """
    signal = x
    for layer in range(1, len(W)):
        pre_activation = W[layer].T * signal + b[layer]
        signal = act(pre_activation)
    return signal

def loss(y, yhat, eps=1e-12):
    """Binary cross-entropy between target y and predicted probability yhat.

    Parameters
    ----------
    y : target value (0 or 1 in this file).
    yhat : predicted probability; a scalar or a 1x1 matrix in this file.
    eps : yhat is clipped to [eps, 1 - eps] before the logarithms are taken.

    Returns
    -------
    -((1 - y) * log(1 - yhat) + y * log(yhat)) evaluated on the clipped yhat.
    """
    # Without clipping, a saturated prediction of exactly 0 or 1 hits log(0):
    # the result is inf for a wrong prediction and NaN (0 * -inf) for a
    # correct one, which then corrupts the running loss total.
    p = np.clip(yhat, eps, 1 - eps)
    return -((1 - y) * np.log(1 - p) + y * np.log(p))
    
# Use mini-batch size 1: the weights are updated after every single
# training pattern.

alpha = 0.01            # learning rate (step size of each weight update)
max_iter = 1000         # number of passes over the whole training set
check_gradient = False  # set True to compare backprop with a finite difference

# The loop variable is called `epoch` rather than `iter` so that the builtin
# iter() is not shadowed for the rest of the module.
for epoch in range(max_iter):
    loss_this_iter = 0
    # Visit the patterns in a fresh random order on every pass
    order = np.random.permutation(M)
    for idx in order:

        # Grab the pattern as a column vector, together with its target

        x_this = XX[idx, :].T
        y_this = y[idx, 0]

        # Feed forward step. Every pre-activation z[l] and activation a[l]
        # is kept because the backward pass needs them; z[0] is a placeholder
        # so that list indices line up with layer numbers.

        a = [x_this]
        z = [[]]
        for l in range(1, L + 1):
            z.append(W[l].T * a[l - 1] + b[l])
            a.append(act(z[l]))

        loss_this_pattern = loss(y_this, a[L][0, 0])
        loss_this_iter = loss_this_iter + loss_this_pattern

        # Backprop step

        delta = [[] for _ in range(L + 1)]
        dW = [[] for _ in range(L + 1)]
        db = [[] for _ in range(L + 1)]

        # For a sigmoid output unit with cross-entropy loss the output error
        # term simplifies to (prediction - target).
        delta[L] = a[L] - y_this
        for l in range(L, 0, -1):
            db[l] = delta[l].copy()
            dW[l] = a[l - 1] * delta[l].T
            if l > 1:
                # Propagate the error back through the weights of layer l and
                # the activation derivative of layer l-1
                delta[l - 1] = np.multiply(actder(z[l - 1]), W[l] * delta[l])

        # Check delta calculation: compare the backprop gradient of one
        # weight against a one-sided finite difference of the loss.

        if check_gradient:
            print('Target: %f' % y_this)
            print('y_hat: %f' % a[L][0, 0])
            print(db)
            y_pred = ff(x_this, W, b)
            diff = 1e-3
            w_saved = W[1][10, 0]
            W[1][10, 0] = w_saved + diff
            y_pred_perturbed = ff(x_this, W, b)
            # Undo the perturbation so the check does not alter training
            W[1][10, 0] = w_saved
            L1 = loss(y_this, y_pred)
            L2 = loss(y_this, y_pred_perturbed)
            dW_finite_difference = (L2 - L1) / diff
            print('Original out %f, perturbed out %f' %
                  (y_pred[0, 0], y_pred_perturbed[0, 0]))
            print('Theoretical dW %f, finite-difference dW %f' %
                  (dW[1][10, 0], dW_finite_difference[0, 0]))

        # Gradient-descent update for this single pattern
        for l in range(1, L + 1):
            W[l] = W[l] - alpha * dW[l]
            b[l] = b[l] - alpha * db[l]

    # Report the loss summed over all patterns seen in this pass
    print('Iteration %d loss %f' % (epoch, loss_this_iter))


Iteration 0 loss 380.203615
Iteration 1 loss 376.481795
Iteration 2 loss 377.241251
Iteration 3 loss 376.696839
Iteration 4 loss 376.477832
Iteration 5 loss 376.613159
Iteration 6 loss 376.346457
Iteration 7 loss 375.917764
Iteration 8 loss 375.119121
Iteration 9 loss 374.910703
Iteration 10 loss 373.213126
Iteration 11 loss 370.788597
Iteration 12 loss 367.132264
Iteration 13 loss 360.149841
Iteration 14 loss 347.689953
Iteration 15 loss 327.932626
Iteration 16 loss 295.027145
Iteration 17 loss 251.245490
Iteration 18 loss 203.710591
Iteration 19 loss 161.962016
Iteration 20 loss 129.665621
Iteration 21 loss 106.088010
Iteration 22 loss 89.709142
Iteration 23 loss 77.886782
Iteration 24 loss 69.321922
Iteration 25 loss 62.941945
Iteration 26 loss 58.051575
Iteration 27 loss 54.154231
Iteration 28 loss 51.037998
Iteration 29 loss 48.444908
Iteration 30 loss 46.341934
Iteration 31 loss 44.505411
Iteration 32 loss 42.956490
Iteration 33 loss 41.544080
Iteration 34 loss 40.402639
Iteratio