In [None]:
import numpy as np
import graphviz as gv

def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)) of x.

    The exponential is evaluated with ``np.exp``, so x may be a scalar or a
    NumPy array (the result then has the same shape).
    """
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator

def relu(x):
    """Return the rectified linear unit max(0, x), element-wise.

    ``np.maximum`` is used instead of the builtin ``max`` so that the function
    accepts NumPy arrays as well as scalars; the builtin raises ``ValueError``
    ("truth value of an array ... is ambiguous") for arrays with more than one
    element.

    Args:
        x: scalar or array-like of real numbers.

    Returns:
        A NumPy scalar (for scalar input) or array (for array input) in which
        every negative entry has been replaced by zero.
    """
    return np.maximum(0, x)

def predict(w, x):
    """Run the forward pass of the two-input, two-hidden-unit network.

    Weight layout read from ``w``:
        w[0:2], w[2] -- weights and bias of the sigmoid output unit
        w[3:5], w[5] -- weights and bias of the first ReLU unit
        w[6:8], w[8] -- weights and bias of the second ReLU unit

    Args:
        w: sequence of 9 weights in the layout above.
        x: the two input values.

    Returns:
        The sigmoid output of the network for ``x``.
    """
    a = np.zeros(3)

    # Fill the hidden activations: slot 2 takes the unit stored at w[6:9],
    # slot 1 the unit stored at w[3:6].
    for slot, start in ((2, 6), (1, 3)):
        a[slot] = relu(np.dot(x, w[start:start + 2]) + w[start + 2])

    # Slot 0 holds the output unit, fed by the two hidden activations.
    a[0] = sigmoid(np.dot(a[1:], w[:2]) + w[2])
    return a[0]

def create_neural_network_graph(X, Y, w):
    """Build a Graphviz digraph tracing every sample of X through the network.

    For each row of ``X`` an independent chain of nodes is added: two input
    nodes, the two ReLU units, the sigmoid unit and an output node. Hidden and
    sigmoid nodes show their activation, bias, dot product and weights; the
    output node shows ``predict(w, x)`` next to the expected value ``Y[i]``.

    Args:
        X: iterable of two-element inputs.
        Y: expected outputs, indexed in step with ``X``.
        w: the 9 network weights, in the layout used by ``predict``.

    Returns:
        The populated ``gv.Digraph``.
    """
    nn_graph = gv.Digraph('NeuralNetwork')

    for i, x in enumerate(X):
        # Node identifiers carry the sample index so samples do not share nodes.
        in_1, in_2 = f'input1_{i}', f'input2_{i}'
        hidden_1, hidden_2 = f'relu1_{i}', f'relu2_{i}'
        sigmoid_id, output_id = f'sigmoid_{i}', f'output_{i}'

        nn_graph.node(in_1, f'Input 1: {x[0]}', shape='circle')
        nn_graph.node(in_2, f'Input 2: {x[1]}', shape='circle')

        # Hidden layer: weighted sums first, then the activations shown in the labels.
        dot_product_1 = np.dot(x, w[3:5])
        dot_product_2 = np.dot(x, w[6:8])
        relu_1 = relu(dot_product_1 + w[5])
        relu_2 = relu(dot_product_2 + w[8])

        nn_graph.node(hidden_1, f'RELU 1: {relu_1}\nBias: {w[5]} \nDot product: {dot_product_1}\nWeights: {w[3:5]}', shape='rectangle')
        nn_graph.node(hidden_2, f'RELU 2: {relu_2}\nBias: {w[8]}  \nDot product: {dot_product_2}\nWeights: {w[6:8]}', shape='rectangle')

        # Output unit, fed by the two hidden activations.
        dot_product_3 = np.dot([relu_1, relu_2], w[0:2])
        nn_graph.node(sigmoid_id, f'Sigmoid: {sigmoid(dot_product_3 + w[2])}\nBias: {w[2]}  \nDot product: {dot_product_3}\nWeights: {w[0:2]}', shape='rectangle')

        nn_graph.node(output_id, f'Output: {predict(w, x)} (actual: {Y[i]})', shape='circle')

        # Fully connect inputs to hidden units, then hidden units to the sigmoid.
        nn_graph.edges([(src, dst) for src in (in_1, in_2) for dst in (hidden_1, hidden_2)])
        nn_graph.edges([(hidden_1, sigmoid_id), (hidden_2, sigmoid_id)])
        nn_graph.edge(sigmoid_id, output_id)

    return nn_graph


# Hand-picked weights, grouped one unit per row in the order predict() slices them:
#   w[0:2], w[2] -> sigmoid output unit (weights, bias)
#   w[3:5], w[5] -> first ReLU unit     (weights, bias)
#   w[6:8], w[8] -> second ReLU unit    (weights, bias)
w = np.array([
    10, 4, -7,
    2, 2.005, -2,
    -7.999, -8, 4,
])

# All four combinations of two binary inputs; Y holds the expected output for each row.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
Y = np.array([1, 0, 0, 1])

nn_graph = create_neural_network_graph(X, Y, w)

# Uncomment to write the diagram to disk and open it in an external viewer:
#nn_graph.render('neural_network_data_flow_with_weights.gv', view=True)
display(nn_graph)

In [None]:
import graphviz as gv

def draw_network(w):
    """Display a left-to-right diagram of the network with its connection weights.

    Edge labels follow the weight layout that ``predict`` uses:
        x1 -> h1, x2 -> h1 : w[3], w[4]
        x1 -> h2, x2 -> h2 : w[6], w[7]
        h1 -> y,  h2 -> y  : w[0], w[1]
    The bias terms w[2], w[5] and w[8] are not connections and are not drawn.
    (Previously the labels used w[0], w[1], w[3], w[4], w[2], w[6], which
    mislabelled every edge and showed the output bias as a connection weight.)

    Args:
        w: indexable collection holding at least the 8 entries read above.

    Returns:
        None. The graph is rendered through IPython's ``display``.
    """
    g = gv.Digraph(format='png')

    # Add the nodes: two inputs, two hidden units, one output.
    for name in ('x1', 'x2', 'h1', 'h2', 'y'):
        g.node(name, name)

    # Add the connections as (tail, head, index of the weight in w).
    connections = (
        ('x1', 'h1', 3),
        ('x2', 'h1', 4),
        ('x1', 'h2', 6),
        ('x2', 'h2', 7),
        ('h1', 'y', 0),
        ('h2', 'y', 1),
    )
    for tail, head, index in connections:
        g.edge(tail, head, label=str(w[index]))

    # Configure the layout of the drawing.
    g.attr(rankdir='LR', size='8,5')

    # Render and show the graph.
    display(g)

# Compute the network output for every input row.
outputs = [predict(w, sample) for sample in X]

# Draw the diagram of the neural network.
draw_network(w)

### 1.
Find a weight vector such that the neural network calculates the negated XOR function:
    
$$f(x,y)=\neg(x\text{ xor }y)$$

Use the following function to test your answer:

In [None]:
def test_prediction(X, Y, w):
    epsilon = 0.001
    for i, x in enumerate(X):
        print (x, predict(w, x))
        if np.abs(predict(w, x) - Y[i]) > epsilon: 
            print(np.abs(predict(w, x) - Y[i]))
            raise Exception("Prediction error")
    return True
# Check the current weights w against the targets in Y; raises if any prediction is off by more than the tolerance.
test_prediction(X, Y, w)

### 2.

Suppose that we have a cross entropy loss function:

$$L(w, x, y) = - y \log f_w(x) - (1-y) \log (1-f_w(x))$$

where $f_w(x)$ is the prediction of the neural network from the previous question.

In [None]:
def loss(w, x, y):
    """Cross-entropy loss of the network prediction for a single sample.

    Computes ``-y * log(f_w(x)) - (1 - y) * log(1 - f_w(x))`` where
    ``f_w(x) = predict(w, x)``.

    The prediction is evaluated once and clipped to ``[eps, 1 - eps]`` (eps is
    the float64 machine epsilon). Without the clip, a sigmoid output that has
    saturated to exactly 0 or 1 produces ``0 * log(0) = nan`` even when the
    prediction is correct.

    Args:
        w: network weights passed through to ``predict``.
        x: network input passed through to ``predict``.
        y: target value for the sample.

    Returns:
        The cross-entropy loss as a float.
    """
    eps = np.finfo(float).eps
    p = np.clip(predict(w, x), eps, 1.0 - eps)
    return -y * np.log(p) - (1. - y) * np.log(1. - p)

# Per-sample loss, followed by the signed difference between prediction and target.
print([loss(w, X[k], Y[k]) for k in range(4)])
print([predict(w, X[k]) - Y[k] for k in range(4)])


Write a function that calculates the gradient of the loss with respect to the weights:

$$ \frac{\partial L}{\partial w} $$

In [None]:
def loss(w, x, y):
    """Cross-entropy loss of the network prediction for a single sample.

    Computes ``-y * log(f_w(x)) - (1 - y) * log(1 - f_w(x))`` where
    ``f_w(x) = predict(w, x)``.

    This definition previously ignored ``w`` and took the logarithm of the raw
    input ``x``, so after this cell ran, ``loss(w, x, y)`` no longer matched
    the loss formula stated in the notebook. It now evaluates the network, and
    clips the prediction to ``[eps, 1 - eps]`` so a saturated sigmoid does not
    yield ``0 * log(0) = nan``.

    Args:
        w: network weights passed through to ``predict``.
        x: network input passed through to ``predict``.
        y: target value for the sample.

    Returns:
        The cross-entropy loss as a float.
    """
    eps = np.finfo(float).eps
    p = np.clip(predict(w, x), eps, 1.0 - eps)
    return -y * np.log(p) - (1. - y) * np.log(1. - p)

In [None]:
def backpropagation(w, x, y):
    """Gradient of the cross-entropy loss with respect to the 9 network weights.

    Network (same weight layout as ``predict``):
        preac_0 = x . w[3:5] + w[5],   act_0 = relu(preac_0)      (left unit)
        preac_1 = x . w[6:8] + w[8],   act_1 = relu(preac_1)      (right unit)
        preac_2 = [act_0, act_1] . w[0:2] + w[2],   act_2 = sigmoid(preac_2)
        L = -y * log(act_2) - (1 - y) * log(1 - act_2)

    Chain rule at the output unit:
        dL/dact_2        = -y / act_2 + (1 - y) / (1 - act_2)
        dact_2/dpreac_2  = act_2 * (1 - act_2)
        dL/dpreac_2      = act_2 - y
    The product is used in its simplified form ``act_2 - y``. It is
    algebraically identical, but stays finite when the sigmoid saturates to
    exactly 0 or 1, where the two-factor form evaluates 0/0 and returns nan.

    Args:
        w: the 9 weights. Integer input is accepted; the gradient is always
            returned as floats (``np.zeros_like(w)`` used to truncate it to
            integers for integer ``w``).
        x: the two input values.
        y: the target, either a scalar or an array holding a single element.

    Returns:
        A float array shaped like ``w`` holding dL/dw.
    """
    w = np.asarray(w)
    x = np.asarray(x, dtype=float)
    # Accept both a plain scalar and a one-element array as the target.
    y = np.asarray(y, dtype=float).item()

    # Forward pass.
    # Left hidden unit
    preac_0 = np.dot(x, w[3:5]) + w[5]
    act_0 = relu(preac_0)
    # Right hidden unit
    preac_1 = np.dot(x, w[6:8]) + w[8]
    act_1 = relu(preac_1)
    # Prediction
    preac_2 = np.dot([act_0, act_1], w[0:2]) + w[2]
    act_2 = sigmoid(preac_2)

    # Gradients
    delta = np.zeros(np.shape(w), dtype=float)

    # dL/dpreac_2, shared by every weight in the network.
    dL_dpreac_2 = act_2 - y

    # Output unit: preac_2 = act_0 * w_0 + act_1 * w_1 + w_2, so
    # dpreac_2/dw_0 = act_0, dpreac_2/dw_1 = act_1, dpreac_2/dw_2 = 1.
    delta[0] = dL_dpreac_2 * act_0
    delta[1] = dL_dpreac_2 * act_1
    delta[2] = dL_dpreac_2

    # Left hidden unit (w_3, w_4, w_5):
    # dL/dw_k = dL/dpreac_2 * dpreac_2/dact_0 * dact_0/dpreac_0 * dpreac_0/dw_k
    # with dpreac_2/dact_0 = w_0 and the ReLU derivative equal to 1 only when
    # preac_0 > 0 (taken as 0 at preac_0 == 0).
    dL_dpreac_0 = dL_dpreac_2 * w[0] * float(preac_0 > 0)
    delta[3:5] = dL_dpreac_0 * x    # dpreac_0/dw_3 = x[0], dpreac_0/dw_4 = x[1]
    delta[5] = dL_dpreac_0          # dpreac_0/dw_5 = 1

    # Right hidden unit (w_6, w_7, w_8): same derivation with w_1 and preac_1.
    dL_dpreac_1 = dL_dpreac_2 * w[1] * float(preac_1 > 0)
    delta[6:8] = dL_dpreac_1 * x    # dpreac_1/dw_6 = x[0], dpreac_1/dw_7 = x[1]
    delta[8] = dL_dpreac_1          # dpreac_1/dw_8 = 1

    return delta






# Reference data for the gradient check in test_dL_dw(). Row i of each array
# forms one test case: backpropagation(tws[i], txs[i], tys[i]) is compared
# against tls[i].

# Weight vectors, 9 values per case.
tws = np.array([[-0.70032787,  0.05195189,  0.02322052,  1.4555916 ,  0.12168937,
        -0.93580307, -0.58649814, -0.25847014, -0.11531032],
       [ 1.11732048,  0.60225913,  0.05929297, -1.09018787,  2.33186956,
         0.68248461, -0.16774443, -0.12996126,  0.31700533],
       [ 0.80285183,  0.08585098,  1.62153749,  0.61251705,  0.18263732,
         2.08412764, -0.2940164 , -0.72975557, -1.33828478],
       [-0.74973286,  1.24623671,  0.63761743,  2.13714693,  0.90258674,
         1.70238408, -2.60308453,  0.03070776,  2.34519973]])
# Network inputs, 2 values per case.
txs = np.array([[-0.96460511,  0.79790901],
       [ 0.34546505,  0.92062212],
       [-0.85750439,  0.50268203],
       [ 0.69988938,  2.07328522]])
# Targets, one per case, each stored as a one-element row. The values are
# arbitrary reals rather than 0/1 labels.
tys = np.array([[ 0.66453404],
       [-1.35012527],
       [-0.7976646 ],
       [ 0.57095802]])
# Expected gradients dL/dw, 9 values per case.
tls = np.array([[ 0.        , -0.03798627, -0.1555583 ,  0.        ,  0.        ,
         0.        ,  0.0077955 , -0.00644834, -0.00808155],
       [ 5.63408329,  0.32024725,  2.29715661,  0.88669136,  2.36292409,
         2.56666012,  0.47794521,  1.27366556,  1.38348354],
       [ 2.8850554 ,  0.        ,  1.74777687, -1.20325518,  0.70536637,
         1.40320586,  0.        ,  0.        ,  0.        ],
       [-2.48486794, -0.2877231 , -0.49016324,  0.25720339,  0.76191466,
         0.36749148, -0.42753402, -1.26648581, -0.61085942]])

def test_dL_dw():
    num_tests = tws.shape[0]
    epsilon = 0.0001
    for i in range(num_tests):
        tw = tws[i]
        tx = txs[i]
        ty = tys[i]
        tl = tls[i]
        if   np.linalg.norm(backpropagation(tw, tx, ty) - tl)> epsilon:
            raise Exception("dL_dw test failed!")
    return print("OK")

# Run the gradient check; prints "OK" when every reference case matches and raises otherwise.
test_dL_dw()