In [1]:
import numpy as np
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

In [42]:
N_SAMPLES = 1000
TEST_SIZE = 0.1


X, y = make_moons(n_samples = N_SAMPLES, noise=0.2, random_state=100)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=42)

In [45]:
X_train = X_train.reshape(X_train.shape[1], X_train.shape[0])

In [46]:
def init_layers(nn_architecture, seed=99):
    np.random.seed(seed)
    number_of_layers = len(nn_architecture)
    params_values = {}

    for idx, layer in enumerate(nn_architecture):
        layer_idx = idx + 1
        layer_input_size = layer["input_dim"]
        layer_output_size = layer["output_dim"]

        params_values['W' + str(layer_idx)] = np.random.randn(
            layer_output_size, layer_input_size) * 0.1
        params_values['b' + str(layer_idx)] = np.random.randn(
            layer_output_size, 1) * 0.1

    return params_values

In [47]:
def sigmoid(Z):
    return 1/(1+np.exp(-Z))

In [48]:
def relu(Z):
    return np.maximum(0, Z)

In [49]:
def sigmoid_backwards(dA, Z):
    sig = sigmoid(Z)
    return dA * sig * (1 - sig)

In [50]:
def relu_backwards(dA, Z):
    #print("relu backwards shape")
    #print(dA.shape, Z.shape)
    dZ = np.array(dA, copy=True)
    dZ[Z <= 0] = 0
    return dZ

In [51]:
def single_layer_forward_prop(A_prev, W_curr, b_curr, activation = "relu"):
    Z_curr = np.dot(W_curr, A_prev) + b_curr

    if activation is "relu":
        activation_func = relu
    elif activation is "sigmoid":
        activation_func = sigmoid
    else:
        raise Exception('Non supported activation function')
        
    A_curr = activation_func(Z_curr)

    return A_curr, Z_curr

In [23]:
def full_forward_prop(X, params_values, nn_architecture):
    memory = {}
    A_curr = X

    for idx, layer in enumerate(nn_architecture):
        layer_idx = idx + 1
        A_prev = A_curr
        print(A_prev.shape)

        activ_func_curr = layer["activation"]
        W_curr = params_values["W" + str(layer_idx)]
        print(W_curr.shape)
        b_curr = params_values["b" + str(layer_idx)]
        print(b_curr.shape)
        A_curr, Z_curr = single_layer_forward_prop(A_prev, W_curr, b_curr, activ_func_curr)
        
        memory["A" + str(idx)] = A_prev
        memory["Z" + str(layer_idx)] = Z_curr

    return A_curr, memory

In [24]:
def convert_prob_into_class(probs):
    probs_ = np.copy(probs)
    probs_[probs_ > 0.5] = 1
    probs_[probs_ <= 0.5] = 0
    return probs_


def get_cost_value(Y_hat, Y):
    m = Y_hat.shape[1]
    cost = -1/m * (np.dot(Y, np.log(Y_hat).T) + np.dot(1-Y, np.log(1-Y_hat).T))
    #return np.squeeze(cost)

    Y_hat_ = convert_prob_into_class(Y_hat)
    return(Y_hat_ == Y).all(axis=0).mean()

In [30]:
def single_layer_back_prop(dA_curr, W_curr, b_curr, Z_curr, A_prev, activation="relu"):
    """
    Implements dZ[l] = dA[l] * g' * (Z[l])
    This is the backprop for one layer
    
    """
    
    # number of examples
    m = A_prev.shape[1]
    
    if activation is "relu":
        backward_act_func = relu_backwards
    elif activation is "sigmoid":
        backward_act_func = sigmoid_backwards
    else:
        raise Exception('Not supported activation function')
    
    # calculate derivative of activation function
    dZ_curr = backward_act_func(dA_curr, Z_curr)
    print(dZ_curr)
    
    return dZ_curr

In [35]:
def full_backward_prop(Y_hat, Y, memory, params_values, nn_architecture):
    """
    Find gradients -((Y/Y_hat) - ((1-Y)/(1-Y_hat))) = dY_hat
    """
    grads_values = {}
    
    # number of examples
    m = Y.shape
    Y = Y.reshape(Y_hat.shape)
    
    dA_prev = -(np.divide(Y, Y_hat) - np.divide(1 - Y, 1 - Y_hat))
    print("dA_prev = \n")
    print(dA_prev.shape)
    #print(dA_prev[:10])
    
    for layer_idx_prev, layer in reversed(list(enumerate(nn_architecture))):
        # We extract all the parameters for each layer
        layer_idx_curr = layer_idx_prev + 1
        print("Layer idx curr and prev = ", layer_idx_curr, layer_idx_prev)
        activation_func_curr = layer['activation']
        
        dA_curr = dA_prev
        
        A_prev = memory["A" + str(layer_idx_prev)]
        Z_curr = memory["Z" + str(layer_idx_curr)]
        
        W_curr = params_values["W" + str(layer_idx_curr)]
        b_curr = params_values["b" + str(layer_idx_curr)]
        
        # run single layer backprop and update grad_values
        
        dZ = single_layer_back_prop(dA_curr, W_curr, b_curr, Z_curr, A_prev, activation_func_curr)
        
        print("dZ = ")
        print(dZ.shape)
        #print(dZ[:10])
        
        
    
    
    return print("End")
    
    

In [36]:
nn_architecture = [
    {"input_dim": 2, "output_dim": 5, "activation": "relu"},
    #{"input_dim": 25, "output_dim": 50, "activation": "relu"},
    #{"input_dim": 50, "output_dim": 50, "activation": "relu"},
    #{"input_dim": 50, "output_dim": 25, "activation": "relu"},
    {"input_dim": 5, "output_dim": 1, "activation": "sigmoid"},
]

In [37]:
def train(X_train, y_train, nn_architecture, epochs = 10, learning_rate = 0.01):
    params_values = init_layers(nn_architecture)
    print("initial params values = \n")
    print(params_values)
    cost_history = []
    accuracy_history = []
    
    for i in range(epochs):
        # forward prop
        y_hat, memory = full_forward_prop(X_train, #X_train.reshape(X_train.shape[1], X_train.shape[0]),
                                          params_values, nn_architecture)
        print("\n y_hat shape = \n")
        print(y_hat.shape)
        
        # Calc loss
        loss = get_cost_value(y_hat, y_train)
        print("\n Loss = \n")
        print(loss)
        
        
        print("\nNew params = \n")
        print(params_values)
        
        # Back prop!
        
        full_backward_prop(y_hat, y_train, memory, params_values, nn_architecture)
        

    
    return params_values, y_hat, memory

In [52]:
y_train = np.transpose(y_train.reshape((y_train.shape[0], 1)))

In [53]:
params, y_hat , memory = train(X_train=X_train, y_train=y_train, nn_architecture=nn_architecture, epochs=1)


initial params values = 

{'W1': array([[-0.01423588,  0.20572217],
       [ 0.02832619,  0.1329812 ],
       [-0.01546219, -0.00690309],
       [ 0.07551805,  0.08256466],
       [-0.01130692, -0.23678376]]), 'b1': array([[-0.01670494],
       [ 0.0685398 ],
       [ 0.00235001],
       [ 0.04562013],
       [ 0.02704928]]), 'W2': array([[-0.14350081,  0.08828171, -0.05800817, -0.05015653,  0.05909533]]), 'b2': array([[-0.07316163]])}
(2, 900)
(5, 2)
(5, 1)
(5, 900)
(1, 5)
(1, 1)

 y_hat shape = 

(1, 900)

 Loss = 

0.4955555555555556

New params = 

{'W1': array([[-0.01423588,  0.20572217],
       [ 0.02832619,  0.1329812 ],
       [-0.01546219, -0.00690309],
       [ 0.07551805,  0.08256466],
       [-0.01130692, -0.23678376]]), 'b1': array([[-0.01670494],
       [ 0.0685398 ],
       [ 0.00235001],
       [ 0.04562013],
       [ 0.02704928]]), 'W2': array([[-0.14350081,  0.08828171, -0.05800817, -0.05015653,  0.05909533]]), 'b2': array([[-0.07316163]])}
dA_prev = 

(1, 900)
Layer 

IndexError: boolean index did not match indexed array along dimension 0; dimension is 1 but corresponding boolean dimension is 5

In [None]:
y_train.shape

In [None]:
y_hat.shape

In [None]:
m = y_hat.shape[1]
cost = -1/m * (np.dot(y_train, np.log(y_hat).T) + np.dot(1-y_train, np.log(1-y_hat).T))
np.squeeze(cost)

In [None]:
m

In [None]:
y_hat

In [None]:
sum(y_train)

In [None]:
454/900

In [None]:
for idx, i in reversed(list(enumerate(nn_architecture))):
    print(idx)
    print(i)

In [None]:
i['activation']

In [None]:
for idx, i in enumerate(memory):
    print(memory[i])

In [None]:
memory

In [None]:
relu(-3), relu(3)