## Gradient Descient

In [1]:
import random

def f(x): ## loss function
    return x**2 + 5


def df(x):
    return 2*x


old_x = float('inf')
x = random.randint(0, 10000)
learning_rate = 0.3
epochs = 0


while abs(x - old_x) > 1.0e-7:
    cost = f(x)
    gradx = df(x)
    ## grad takes into account the effect each parameter has on the cost, so 
    ## that's hwo to find the direction of steepest ascent.

    old_x = x
    x -= learning_rate * gradx

    print('EPOCH{}: Cost={:.3f}, x = {:.3f}'.format(epochs, cost, gradx))
    epochs += 1


EPOCH0: Cost=95922441.000, x = 19588.000
EPOCH1: Cost=15347594.760, x = 7835.200
EPOCH2: Cost=2455619.362, x = 3134.080
EPOCH3: Cost=392903.298, x = 1253.632
EPOCH4: Cost=62868.728, x = 501.453
EPOCH5: Cost=10063.196, x = 200.581
EPOCH6: Cost=1614.311, x = 80.232
EPOCH7: Cost=262.490, x = 32.093
EPOCH8: Cost=46.198, x = 12.837
EPOCH9: Cost=11.592, x = 5.135
EPOCH10: Cost=6.055, x = 2.054
EPOCH11: Cost=5.169, x = 0.822
EPOCH12: Cost=5.027, x = 0.329
EPOCH13: Cost=5.004, x = 0.131
EPOCH14: Cost=5.001, x = 0.053
EPOCH15: Cost=5.000, x = 0.021
EPOCH16: Cost=5.000, x = 0.008
EPOCH17: Cost=5.000, x = 0.003
EPOCH18: Cost=5.000, x = 0.001
EPOCH19: Cost=5.000, x = 0.001
EPOCH20: Cost=5.000, x = 0.000
EPOCH21: Cost=5.000, x = 0.000
EPOCH22: Cost=5.000, x = 0.000
EPOCH23: Cost=5.000, x = 0.000
EPOCH24: Cost=5.000, x = 0.000
EPOCH25: Cost=5.000, x = 0.000
EPOCH26: Cost=5.000, x = 0.000
EPOCH27: Cost=5.000, x = 0.000
EPOCH28: Cost=5.000, x = 0.000


## Mini Tensorflow For Neural Networks

### Initial Graph Node

In [3]:
import numpy as np


class Node(object):
    def __init__(self, inbound_nodes=[]):
        self.inbound_nodes = inbound_nodes
        self.outbound_nodes = []

        for n in self.inbound_nodes:
            n.outbound_nodes.append(self)
            # set 'self' node as inbound_nodes's outbound_nodes

        self.value = None

        self.gradients = {}
        # keys are the inputs to this node, and their
        # values are the partials of this node with 
        # respect to that input.
        # \partial{node}{input_i}
        

    def forward(self):
        '''
        Forward propagation. 
        Compute the output value vased on 'inbound_nodes' and store the 
        result in self.value
        '''

        raise NotImplemented
    

    def backward(self):

        raise NotImplemented

In [4]:
class Input(Node):
    def __init__(self):
        '''
        An Input node has no inbound nodes.
        So no need to pass anything to the Node instantiator.
        '''
        Node.__init__(self)

    def forward(self, value=None):
        '''
        Only input node is the node where the value may be passed
        as an argument to forward().
        All other node implementations should get the value of the 
        previous node from self.inbound_nodes
        
        Example: 
        val0: self.inbound_nodes[0].value
        '''
        if value is not None:
            self.value = value
            ## It's is input node, when need to forward, this node initiate self's value.

        # Input subclass just holds a value, such as a data feature or a model parameter(weight/bias)
        
    def backward(self):
        self.gradients = {self:0}
        for n in self.outbound_nodes:
            grad_cost = n.gradients[self]
            self.gradients[self] = grad_cost * 1
            
        
        # input N --> N1, N2
        # \partial L / \partial N 
        # ==> \partial L / \partial N1 * \ partial N1 / \partial N


class CaculatNode(Node):
    def __init__(self, f, *nodes):
        Node.__init__(self, nodes)
        self.func = f

    def forward(self):
        self.value = self.func(map(lambda n: n.value, self.inbound_nodes))


class Add(Node):
    def __init__(self, *nodes):
        Node.__init__(self, nodes)


    def forward(self):
        self.value = sum(map(lambda n: n.value, self.inbound_nodes))
        ## when execute forward, this node caculate value as defined.

class Linear(Node):
    def __init__(self, nodes, weights, bias):
        Node.__init__(self, [nodes, weights, bias])

    def forward(self):
        inbound_nodes = self.inbound_nodes[0].value
        weights = self.inbound_nodes[1].value
        bias = self.inbound_nodes[2].value

        self.value = np.dot(inbound_nodes, weights) + bias
        
    def backward(self):

        # initial a partial for each of the inbound_nodes.
        self.gradients = {n: np.zeros_like(n.value) for n in self.inbound_nodes}

        for n in self.outbound_nodes:
            # Get the partial of the cost w.r.t this node.
            grad_cost = n.gradients[self]

            self.gradients[self.inbound_nodes[0]] = np.dot(grad_cost, self.inbound_nodes[1].value.T)
            self.gradients[self.inbound_nodes[1]] = np.dot(self.inbound_nodes[0].value.T, grad_cost)
            self.gradients[self.inbound_nodes[2]] = np.sum(grad_cost, axis=0, keepdims=False)

        # WX + B / W ==> X
        # WX + B / X ==> W

class Sigmoid(Node):
    def __init__(self, node):
        Node.__init__(self, [node])


    def _sigmoid(self, x):
        return 1./(1 + np.exp(-1 * x))

    def forward(self):
        self.x = self.inbound_nodes[0].value
        self.value = self._sigmoid(self.x)

    def backward(self):
        self.partial = self._sigmoid(self.x) * (1 - self._sigmoid(self.x))
        
        # y = 1 / (1 + e^-x)
        # y' = 1 / (1 + e^-x) (1 - 1 / (1 + e^-x))
        
        self.gradients = {n: np.zeros_like(n.value) for n in self.inbound_nodes}

        for n in self.outbound_nodes:
            grad_cost = n.gradients[self]  # Get the partial of the cost with respect to this node.

            self.gradients[self.inbound_nodes[0]] = grad_cost * self.partial
            # use * to keep all the dimension same!.



class MSE(Node):
    def __init__(self, y, a):
        Node.__init__(self, [y, a])


    def forward(self):
        y = self.inbound_nodes[0].value.reshape(-1, 1)
        a = self.inbound_nodes[1].value.reshape(-1, 1)
        assert(y.shape == a.shape)

        self.m = self.inbound_nodes[0].value.shape[0]
        self.diff = y - a

        self.value = np.mean(self.diff**2)


    def backward(self):
        self.gradients[self.inbound_nodes[0]] = (2 / self.m) * self.diff
        self.gradients[self.inbound_nodes[1]] = (-2 / self.m) * self.diff


def forward_and_backward(outputnode, graph):
    # execute all the forward method of sorted_nodes.

    ## In practice, it's common to feed in mutiple data example in each forward pass rather than just 1. Because the examples can be processed in parallel. The number of examples is called batch size.
    for n in graph:
        n.forward()
        ## each node execute forward, get self.value based on the topological sort result.

    for n in  graph[::-1]:
        n.backward()

    #return outputnode.value

###   v -->  a -->  C
##    b --> C
##    b --> v -- a --> C
##    v --> v ---> a -- > C

def topological_sort(feed_dict):
    """
    Sort generic nodes in topological order using Kahn's Algorithm.
    `feed_dict`: A dictionary where the key is a `Input` node and the value is the respective value feed to that node.
    Returns a list of sorted nodes.
    """

    input_nodes = [n for n in feed_dict.keys()]

    G = {}
    nodes = [n for n in input_nodes]
    while len(nodes) > 0:
        n = nodes.pop(0)
        if n not in G:
            G[n] = {'in': set(), 'out': set()}
        for m in n.outbound_nodes:
            if m not in G:
                G[m] = {'in': set(), 'out': set()}
            G[n]['out'].add(m)
            G[m]['in'].add(n)
            nodes.append(m)

    L = []
    S = set(input_nodes)
    while len(S) > 0:
        n = S.pop()

        if isinstance(n, Input):
            n.value = feed_dict[n]
            ## if n is Input Node, set n'value as 
            ## feed_dict[n]
            ## else, n's value is caculate as its
            ## inbounds

        L.append(n)
        for m in n.outbound_nodes:
            G[n]['out'].remove(m)
            G[m]['in'].remove(n)
            # if no other incoming edges add to S
            if len(G[m]['in']) == 0:
                S.add(m)
    return L


def sgd_update(trainables, learning_rate=1e-2):
    # there are so many other update / optimization methods
    # such as Adam, Mom, 
    for t in trainables:
        t.value -= learning_rate * t.gradients[t]

In [6]:
from sklearn.datasets import load_boston

In [7]:
data = load_boston()

In [12]:
"""
Check out the new network architecture and dataset!
Notice that the weights and biases are
generated randomly.
No need to change anything, but feel free to tweak
to test your network, play around with the epochs, batch size, etc!
"""

import numpy as np
from sklearn.datasets import load_boston
from sklearn.utils import shuffle, resample
#from miniflow import *

# Load data
data = load_boston()
X_ = data['data']
y_ = data['target']

# Normalize data
X_ = (X_ - np.mean(X_, axis=0)) / np.std(X_, axis=0)

n_features = X_.shape[1]
n_hidden = 10
W1_ = np.random.randn(n_features, n_hidden)
b1_ = np.zeros(n_hidden)
W2_ = np.random.randn(n_hidden, 1)
b2_ = np.zeros(1)

# Neural network
X, y = Input(), Input()
W1, b1 = Input(), Input()
W2, b2 = Input(), Input()

l1 = Linear(X, W1, b1)
s1 = Sigmoid(l1)
l2 = Linear(s1, W2, b2)
cost = MSE(y, l2)

feed_dict = {
    X: X_,
    y: y_,
    W1: W1_,
    b1: b1_,
    W2: W2_,
    b2: b2_
}

epochs = 5000
# Total number of examples
m = X_.shape[0]
batch_size = 16
steps_per_epoch = m // batch_size

graph = topological_sort(feed_dict)
trainables = [W1, b1, W2, b2]

print("Total number of examples = {}".format(m))

# Step 4
for i in range(epochs):
    loss = 0
    for j in range(steps_per_epoch):
        # Step 1
        # Randomly sample a batch of examples
        X_batch, y_batch = resample(X_, y_, n_samples=batch_size)

        # Reset value of X and y Inputs
        X.value = X_batch
        y.value = y_batch

        # Step 2
        _ = None
        forward_and_backward(_, graph) # set output node not important.

        # Step 3
        rate = 1e-2
    
        sgd_update(trainables, rate)

        loss += graph[-1].value
    
    if i % 100 == 0: 
        print("Epoch: {}, Loss: {:.3f}".format(i+1, loss/steps_per_epoch))


Total number of examples = 506
Epoch: 1, Loss: 132.010
Epoch: 101, Loss: 8.355
Epoch: 201, Loss: 6.157
Epoch: 301, Loss: 5.707
Epoch: 401, Loss: 6.437
Epoch: 501, Loss: 4.818
Epoch: 601, Loss: 5.351
Epoch: 701, Loss: 4.621
Epoch: 801, Loss: 4.701
Epoch: 901, Loss: 4.346
Epoch: 1001, Loss: 4.122
Epoch: 1101, Loss: 3.872
Epoch: 1201, Loss: 3.513
Epoch: 1301, Loss: 4.013
Epoch: 1401, Loss: 3.456
Epoch: 1501, Loss: 4.228
Epoch: 1601, Loss: 3.637
Epoch: 1701, Loss: 3.418
Epoch: 1801, Loss: 4.491
Epoch: 1901, Loss: 4.001
Epoch: 2001, Loss: 3.115
Epoch: 2101, Loss: 3.879
Epoch: 2201, Loss: 3.413
Epoch: 2301, Loss: 3.651
Epoch: 2401, Loss: 3.555
Epoch: 2501, Loss: 3.349
Epoch: 2601, Loss: 3.825
Epoch: 2701, Loss: 3.454
Epoch: 2801, Loss: 3.170
Epoch: 2901, Loss: 3.267
Epoch: 3001, Loss: 2.845
Epoch: 3101, Loss: 3.721
Epoch: 3201, Loss: 3.479
Epoch: 3301, Loss: 3.449
Epoch: 3401, Loss: 3.640
Epoch: 3501, Loss: 3.635
Epoch: 3601, Loss: 3.602
Epoch: 3701, Loss: 3.478
Epoch: 3801, Loss: 3.412
Epoc