In [25]:
import numpy as np
import time
from sys import stdout
# lifting to get 8 x 8 weight matrix.
def Wb(X, p, p_var):
    """Draw the random projection parameters for random Fourier features.

    X     -- 2-D input array; only its column count, X.shape[1], is read.
    p     -- number of random features to generate.
    p_var -- variance of the zero-mean Gaussian the weights are sampled from.

    Returns (W, b): W has shape (p, X.shape[1]) and b has shape (p,), with b
    sampled uniformly from [0, 2*pi).
    """
    n_inputs = X.shape[1]
    # The offsets are sampled before the weights; the order matters for
    # reproducing results from a seeded global NumPy RNG.
    offsets = np.random.uniform(low=0, high=2*np.pi, size=p)
    weights = np.random.normal(loc=0, scale=np.sqrt(p_var), size=(p, n_inputs))
    return weights, offsets

def phi(X, W, b, p):
    ''' Random Fourier feature map: sqrt(2/p) * cos(X W^T + b).

    X is projected through the rows of W, shifted by the offsets b, and the
    cosine of the result is scaled by sqrt(2/p).
    '''
    # float() keeps the division exact under Python 2 integer semantics.
    scale = np.sqrt(2/float(p))
    projected = np.dot(X, W.T) + b
    return scale*np.cos(projected)

class NeuralNetwork(object):
    """Two-layer sigmoid network (input -> hidden -> output, no bias terms).

    The network is trained one sample at a time with stochastic gradient
    descent on a 0.5 * squared-error loss. Training history is recorded in
    `training_similarity` and `running_time` (one entry per epoch).
    """

    def __init__(self, X, y, epsilon=1e-3, epochs=10, reg=0.1, input_layer_size=8, output_layer_size=8, hidden_layer_size=8):
        """Store the training data and hyperparameters and create the weights.

        X, y              -- 2-D training inputs and targets (one row per sample).
        epsilon           -- initial learning rate; decayed during training.
        epochs            -- number of passes over the training set.
        reg               -- stored on the instance but not used by any method here.
        input_layer_size  -- number of columns expected in X.
        output_layer_size -- number of columns expected in y.
        hidden_layer_size -- number of hidden units.
        """
        self.X = X
        self.y = y
        self.training_similarity = []
        self.running_time = []
        # The clock starts at construction, so reported times include any
        # delay between creating the network and calling train().
        self.t = time.time()

        # Define hyperparameters:
        self.epsilon = epsilon
        self.epochs = epochs
        self.reg = reg
        self.decay = .8
        self.input_layer_size = input_layer_size
        self.output_layer_size = output_layer_size
        self.hidden_layer_size = hidden_layer_size
        # Weights (parameters), initialised to zero.
        # NOTE(review): with all-zero weights every hidden unit receives the same
        # gradient, so hidden units stay identical to one another. A random
        # alternative would be
        # np.random.randn(self.input_layer_size, self.hidden_layer_size) and
        # np.random.randn(self.hidden_layer_size, self.output_layer_size).
        self.W1 = np.zeros((self.input_layer_size, self.hidden_layer_size))
        self.W2 = np.zeros((self.hidden_layer_size, self.output_layer_size))

    def forward(self, X):
        """Propagate inputs through the network and return the output activations.

        Caches the intermediate values z2, a2 and z3 on the instance; the
        gradient computation in costFunctionPrime reads them.
        """
        self.z2 = np.dot(X, self.W1)
        self.a2 = self.sigmoid(self.z2)
        self.z3 = np.dot(self.a2, self.W2)
        yHat = self.sigmoid(self.z3)
        return yHat

    def sigmoid(self, z):
        """Apply the sigmoid activation function to a scalar, vector, or matrix."""
        return 1/(1+np.exp(-z))

    def sigmoidPrime(self, z):
        """Gradient of the sigmoid, evaluated elementwise at z.

        Uses the identity sigmoid'(z) = s * (1 - s) with s = sigmoid(z). The
        direct form exp(-z) / (1 + exp(-z))**2 evaluates to inf/inf = NaN once
        exp(-z) overflows (large negative z); this form returns 0 there.
        """
        s = self.sigmoid(z)
        return s*(1 - s)

    def costFunction(self, X, y):
        """Return 0.5 * squared error for X, y using the weights stored on the instance.

        The squared error is summed over the first axis only, so for a 2-D
        (samples, outputs) difference the result holds one cost per output
        column rather than a single scalar. Also stores the prediction in
        self.yHat.
        """
        self.yHat = self.forward(X)
        J = 0.5*np.sum((y-self.yHat)**2, axis=0)
        return J

    def costFunctionPrime(self, X, y):
        """Compute the loss gradients with respect to W1 and W2 for a given X and y.

        The gradients are stored as self.dJdW1 / self.dJdW2 and also returned
        as the tuple (dJdW1, dJdW2).
        """
        self.yHat = self.forward(X)

        # Error signal at the output layer.
        delta3 = np.multiply(-(y-self.yHat), self.sigmoidPrime(self.z3))
        self.dJdW2 = np.dot(self.a2.T, delta3)

        # Error signal propagated back to the hidden layer.
        delta2 = np.dot(delta3, self.W2.T)*self.sigmoidPrime(self.z2)
        self.dJdW1 = np.dot(X.T, delta2)

        return self.dJdW1, self.dJdW2

    def backprop(self, W1, W2, X, y, epsilon):
        """Return (W1, W2) after one gradient-descent step of size epsilon.

        The gradients are evaluated with the weights stored on the instance
        (self.W1 / self.W2); the W1 and W2 arguments are only the values the
        step is subtracted from.
        """
        self.costFunctionPrime(X, y)
        return W1-epsilon*self.dJdW1, W2-epsilon*self.dJdW2

    def train(self):
        """Train with stochastic gradient descent and return self.

        Each epoch visits every training row once in a fresh random order and
        updates the weights after each row. After the epoch, the similarity
        between predictions and targets and the elapsed whole minutes are
        appended to training_similarity / running_time. The learning rate is
        multiplied by self.decay after every even-numbered epoch (0, 2, 4, ...).
        """
        n_samples = self.X.shape[0]
        for i in range(self.epochs):
            indices = np.random.permutation(n_samples)
            for j in range(n_samples):
                y = self.y[indices[j], :]
                # Keep the sample 2-D (1, n_features) so the matrix products
                # in the gradient computation have the expected shapes.
                X = self.X[indices[j], :].reshape((1, self.X.shape[1]))
                self.W1, self.W2 = self.backprop(self.W1, self.W2, X, y, self.epsilon)
                # Cost of this sample evaluated after the weight update.
                self.J = self.costFunction(X, y)
                if j % 100 == 0:
                    stdout.write("\r[ITERATION] {} J = {}".format(j + 1, self.J))
                    stdout.flush()

            yHat = self.forward(self.X)
            # Mean over every entry of the (samples x samples) product, i.e. each
            # prediction against each target row, not only the matching pairs.
            # NOTE(review): confirm the cross-sample terms are intended.
            train_sim = np.mean(yHat.dot(self.y.T))
            self.training_similarity.append(train_sim)
            m, s = divmod(time.time() - self.t, 60)
            self.running_time.append(m)
            print("EPOCH: {}, AVG TRAIN SIMILARITY: {}".format(i, train_sim))
            print("Trial with epsilon={}. Time elapsed: {} minutes, {} seconds.".format(self.epsilon, m, s))

            if i % 2 == 0:
                self.epsilon = self.decay*self.epsilon

        print("D O N E")
        return self
# V = W1, W = W2
def linear_sweep(X, y, epsilon=1e-2, step=1e-5, trials=10, epochs=10):
    """Train one fresh NeuralNetwork per learning rate in a linear sweep.

    Trial i (for i = 0 .. trials-1) uses the learning rate epsilon - i*step.
    After each trial the shapes and values of the trained weights W1 and W2
    are printed. Nothing is returned.

    X, y    -- training inputs and targets, passed straight to NeuralNetwork.
               The network is built with its default layer sizes, so the
               column counts of X and y must match those defaults.
    epsilon -- learning rate of the first trial.
    step    -- amount the learning rate is lowered by for each later trial.
    trials  -- number of networks to train.
    epochs  -- number of training epochs per network.
    """
    for i in range(trials):
        trial_epsilon = epsilon - i*step
        print("EPSILON: {}".format(trial_epsilon))
        nn = NeuralNetwork(X, y, trial_epsilon, epochs)
        nn.train()
        print(nn.W1.shape, nn.W2.shape)
        print(nn.W1, nn.W2)

In [26]:
#pg 6. table 1: neural net with Wickelfeatures

In [27]:
# pg 14. table 2 -- pattern A.
# The network weight matrices start as 8x8 zeros.
X = np.array([[2, 4, 7]])  # a single sample, shape (1, 3); elements are <type 'numpy.int64'>
y = np.array([[1, 4, 6]])

# Random Fourier feature settings.
p = 8          # number of random features
p_var = 0.016  # variance of the gaussians. tune this.

# Lift the input and the target with the same random projection, then sweep.
W, b = Wb(X, p, p_var)
X_featurized = phi(X, W, b, p)
y_featurized = phi(y, W, b, p)
linear_sweep(X_featurized, y_featurized)
# Recorded result -- EPSILON: 0.00997, AVERAGE TRAINING_SIMILARITY: 0.08051932

EPSILON: 0.01
[ITERATION] 1 J = [  2.05523389e-01   3.68595782e-05   1.27660013e-03   2.19250011e-01
   2.56962213e-01   4.44723593e-01   7.55136975e-02   2.46448780e-01]EPOCH: 0, AVG TRAIN SIMILARITY: -0.058073013313
Trial with epsilon=0.01. Time elapsed: 0.0 minutes, 0.00359916687012 seconds.
[ITERATION] 1 J = [  2.05112550e-01   3.67858958e-05   1.27404820e-03   2.18811733e-01
   2.56448550e-01   4.43834604e-01   7.53627458e-02   2.45956133e-01]EPOCH: 1, AVG TRAIN SIMILARITY: -0.0572264828418
Trial with epsilon=0.008. Time elapsed: 0.0 minutes, 0.00635504722595 seconds.
[ITERATION] 1 J = [  2.04702536e-01   3.67123604e-05   1.27150136e-03   2.18374336e-01
   2.55935920e-01   4.42947414e-01   7.52120961e-02   2.45464476e-01]EPOCH: 2, AVG TRAIN SIMILARITY: -0.0563808149878
Trial with epsilon=0.008. Time elapsed: 0.0 minutes, 0.00741314888 seconds.
[ITERATION] 1 J = [  2.04375153e-01   3.66536435e-05   1.26946775e-03   2.18025088e-01
   2.55526602e-01   4.42239035e-01   7.50918053e-02 

In [28]:
# Pattern B.
X = np.array([[3, 4, 6]])  # a single sample, shape (1, 3)
y = np.array([[3, 4, 7]])

# Random Fourier feature settings.
p = 8          # number of random features
p_var = 0.016  # variance of the Gaussian weights

# Lift the input and the target with the same random projection, then sweep.
W, b = Wb(X, p, p_var)
X_featurized = phi(X, W, b, p)
y_featurized = phi(y, W, b, p)
linear_sweep(X_featurized, y_featurized)
# Recorded result -- EPSILON: 0.00999, AVERAGE TRAINING_SIMILARITY: 0.70666954

EPSILON: 0.01
[ITERATION] 1 J = [  1.43404025e-02   3.78405130e-02   4.56494111e-01   2.99439828e-02
   3.94145187e-01   6.01861617e-03   9.12862133e-10   4.61825264e-03]EPOCH: 0, AVG TRAIN SIMILARITY: 0.629736500531
Trial with epsilon=0.01. Time elapsed: 0.0 minutes, 0.0019268989563 seconds.
[ITERATION] 1 J = [  1.43117360e-02   3.77648698e-02   4.55581593e-01   2.98841248e-02
   3.93357302e-01   6.00658495e-03   9.11037319e-10   4.60902074e-03]EPOCH: 1, AVG TRAIN SIMILARITY: 0.630256451798
Trial with epsilon=0.008. Time elapsed: 0.0 minutes, 0.0039758682251 seconds.
[ITERATION] 1 J = [  1.42831267e-02   3.76893777e-02   4.54670923e-01   2.98243863e-02
   3.92571008e-01   5.99457773e-03   9.09216146e-10   4.59980726e-03]EPOCH: 2, AVG TRAIN SIMILARITY: 0.630775863908
Trial with epsilon=0.008. Time elapsed: 0.0 minutes, 0.00631093978882 seconds.
[ITERATION] 1 J = [  1.42602827e-02   3.76290987e-02   4.53943797e-01   2.97766861e-02
   3.91943187e-01   5.98499015e-03   9.07761967e-10   4.

In [29]:
# Patterns A and B together, one per row.
X = np.array([[2, 4, 7],
              [3, 4, 6]])
y = np.array([[1, 4, 6],
              [3, 4, 7]])

# Random Fourier feature settings.
p = 8          # number of random features
p_var = 0.016  # variance of the Gaussian weights

# Lift the inputs and the targets with the same random projection, then sweep.
W, b = Wb(X, p, p_var)
X_featurized = phi(X, W, b, p)
y_featurized = phi(y, W, b, p)
linear_sweep(X_featurized, y_featurized)
# Recorded result -- EPSILON: 0.00999, AVERAGE TRAINING_SIMILARITY: -0.292271097976

EPSILON: 0.01
[ITERATION] 1 J = [  4.83791090e-01   1.00130842e-04   8.39282395e-06   1.02964968e-02
   4.96999384e-01   4.98481284e-01   1.44783205e-01   4.07784295e-01]EPOCH: 0, AVG TRAIN SIMILARITY: -0.340030166654
Trial with epsilon=0.01. Time elapsed: 0.0 minutes, 0.00133490562439 seconds.
[ITERATION] 1 J = [  4.90839524e-01   9.78346827e-03   7.24261580e-07   9.54054867e-03
   4.60134296e-01   4.94518364e-01   1.55874529e-01   4.91891825e-01]EPOCH: 1, AVG TRAIN SIMILARITY: -0.336394132181
Trial with epsilon=0.008. Time elapsed: 0.0 minutes, 0.00295209884644 seconds.
[ITERATION] 1 J = [  4.88888549e-01   9.76197844e-03   7.17904009e-07   9.50174542e-03
   4.58262707e-01   4.92541724e-01   1.55263925e-01   4.90017364e-01]EPOCH: 2, AVG TRAIN SIMILARITY: -0.332765672474
Trial with epsilon=0.008. Time elapsed: 0.0 minutes, 0.00500297546387 seconds.
[ITERATION] 1 J = [  4.87140490e-01   9.74445465e-03   7.11871722e-07   9.46688619e-03
   4.56581561e-01   4.90769582e-01   1.54717660e-01

In [30]:
# Rule of 78: in the first three pairs a trailing 7 becomes 8 and a trailing 8
# becomes 7; the last pair, [1, 4, 7] -> [1, 4, 7], is the exception.
X = np.array([[2, 4, 7],
              [1, 6, 8],
              [3, 5, 7],
              [1, 4, 7]])
y = np.array([[2, 4, 8],
              [1, 6, 7],
              [3, 5, 8],
              [1, 4, 7]])

# Random Fourier feature settings.
p = 8          # number of random features
p_var = 0.016  # variance of the Gaussian weights

# Lift the inputs and the targets with the same random projection, then sweep.
W, b = Wb(X, p, p_var)
X_featurized = phi(X, W, b, p)
y_featurized = phi(y, W, b, p)
linear_sweep(X_featurized, y_featurized)
# Recorded result -- EPSILON: 0.01, AVERAGE TRAINING_SIMILARITY: -0.379843488671

EPSILON: 0.01
[ITERATION] 1 J = [  3.45480036e-01   4.23172112e-02   2.70395965e-01   5.65485279e-05
   9.26039770e-02   2.29377325e-01   9.64561543e-06   4.04510003e-01]EPOCH: 0, AVG TRAIN SIMILARITY: 0.0485746424061
Trial with epsilon=0.01. Time elapsed: 0.0 minutes, 0.0024631023407 seconds.
[ITERATION] 1 J = [  3.41953792e-01   4.19124198e-02   2.67867896e-01   5.45644297e-05
   9.17462973e-02   2.27137875e-01   9.48487236e-06   4.01013252e-01]EPOCH: 1, AVG TRAIN SIMILARITY: 0.0518072805626
Trial with epsilon=0.008. Time elapsed: 0.0 minutes, 0.00491213798523 seconds.
[ITERATION] 1 J = [  3.58678075e-01   3.68313763e-02   3.39123254e-01   2.25069266e-08
   6.54235653e-02   2.29342502e-01   2.54821341e-04   3.25922855e-01]EPOCH: 2, AVG TRAIN SIMILARITY: 0.0550258406405
Trial with epsilon=0.008. Time elapsed: 0.0 minutes, 0.00753712654114 seconds.
[ITERATION] 1 J = [  4.53403538e-01   4.39127720e-02   2.19634849e-01   4.71940907e-03
   1.01062362e-01   2.46961507e-01   1.92940323e-06 

In [31]:
#pg 16. table 4. A
