In [None]:
import sys
#sys.path.append('/Users/bradh/hed-dlg/')

import numpy as np
import multiprocessing as mp
from itertools import chain
import random

import blocks
from blocks.bricks import Linear, Softmax, Softplus, NDimensionalSoftmax
from blocks.bricks.recurrent import GatedRecurrent, Fork, LSTM
from blocks.initialization import Constant, IsotropicGaussian, Identity, Uniform
from blocks.bricks.cost import BinaryCrossEntropy, CategoricalCrossEntropy
from blocks.filter import VariableFilter
from blocks.roles import PARAMETER
from blocks.graph import ComputationGraph

import theano
from theano import tensor as T

# Data

In [None]:
# Capture dump to read; every line of the file yields one entry in `data`.
dataPath = 'bradspcaps.txt'
data = []
with open(dataPath, 'rb') as f:
    # For each line keep the text that follows the last "'data': " marker,
    # cut at the first comma, with single quotes removed.
    data.extend(
        record.split("'data': ")[-1].split(',')[0].replace("'", "")
        for record in f.readlines()
    )

In [None]:
#TODO Making character dictionary
# Character-level vocabulary: the 16 lowercase hex digits.
chars = ['0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f']
# Map every hex digit to its position in `chars`. This completes the unfinished
# `for` statement that previously made the whole cell a SyntaxError.
# NOTE(review): oneHot() uses 17 slots at character level, presumably leaving
# index 16 for an end-of-packet token -- confirm before relying on it.
charDict = dict((ch, idx) for idx, ch in enumerate(chars))

In [None]:
def hexOneHot(number):
    """Return a length-256 float vector holding 1.0 at `number` and 0.0 elsewhere."""
    encoded = np.zeros(256)
    encoded[number] = 1.0
    return encoded

def oneHot(index, granular = 'hex'):
    """One-hot encode `index`.

    The vector has 257 entries when `granular` is 'hex' and 17 entries for any
    other value of `granular`. Position `index` is set to 1.0, the rest stay 0.0.
    """
    vecLen = 257 if granular == 'hex' else 17
    encoded = np.zeros(vecLen)
    encoded[index] = 1.0
    return encoded

def normalizeArrays(headerListOArrays, timeSteps, reverse = True, charLevel = False):
    """Zero-pad each array to `timeSteps` rows and optionally reverse its row order.

    Arrays shorter than `timeSteps` get all-zero rows appended (17 columns when
    `charLevel` is true, otherwise 257). Arrays that already have `timeSteps`
    rows or more are left at their original length. When `reverse` is true the
    rows are flipped after padding, so the padding rows end up first.

    Returns a new list with one array per input array.
    """
    width = 17 if charLevel else 257

    def _fit(arr):
        missing = timeSteps - len(arr)
        if missing > 0:
            arr = np.vstack((arr, np.zeros((missing, width))))
        return arr[::-1] if reverse else arr

    return [_fit(arr) for arr in headerListOArrays]

In [None]:
def floatX(X):
    """Convert `X` to a NumPy array whose dtype is `theano.config.floatX`."""
    target_dtype = theano.config.floatX
    return np.asarray(X, dtype=target_dtype)

def dropout(X, p=0.):
    """Apply inverted dropout to `X`.

    Each element is kept with probability ``1 - p`` and the result is divided
    by ``1 - p``. With ``p == 0`` the input is returned untouched.

    The random stream is the module-level `srng`. The notebook never defines
    it, so one is created on first use instead of failing with a NameError.
    """
    global srng
    if p != 0:
        try:
            srng
        except NameError:
            # No stream was set up by the caller: build a default one lazily.
            from theano.tensor.shared_randomstreams import RandomStreams
            srng = RandomStreams()
        retain_prob = 1 - p
        X = X / retain_prob * srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
    return X

# Gradient clipping
def clip_norm(g, c, n):
    """Rescale gradient `g` so that a norm `n` above the threshold `c` is brought down to `c`.

    When `c` is not positive the gradient is returned unchanged. Otherwise the
    result is a symbolic switch: ``g * c / n`` where ``n >= c``, else ``g``.
    """
    if not c > 0:
        return g
    return T.switch(T.ge(n, c), g*c/n, g)

def clip_norms(gs, c):
    """Clip every gradient in `gs` against their joint L2 norm.

    The norm is the square root of the summed squares of all gradients; each
    gradient is then passed through clip_norm with threshold `c`.
    """
    squared_sums = [T.sum(grad**2) for grad in gs]
    norm = T.sqrt(sum(squared_sums))
    clipped = []
    for grad in gs:
        clipped.append(clip_norm(grad, c, norm))
    return clipped

# Regularizers
def max_norm(p, maxnorm = 0.):
    """Constrain the norms of `p`, taken along axis 0, to at most `maxnorm`.

    A non-positive `maxnorm` disables the constraint and returns `p` as is.
    """
    if not maxnorm > 0:
        return p
    col_norms = T.sqrt(T.sum(T.sqr(p), axis=0))
    capped = T.clip(col_norms, 0, maxnorm)
    # 1e-7 guards the division when a norm is zero
    return p * (capped/ (1e-7 + col_norms))

def gradient_regularize(p, g, l1 = 0., l2 = 0.):
    """Add the L2 term ``p * l2`` and the L1 term ``sgn(p) * l1`` to gradient `g`.

    The additions use augmented assignment on `g`; the updated `g` is returned.
    """
    l2_term = p * l2
    g += l2_term
    l1_term = T.sgn(p) * l1
    g += l1_term
    return g

def weight_regularize(p, maxnorm = 0.):
    """Apply the max-norm constraint to `p` by delegating to max_norm."""
    return max_norm(p, maxnorm)

def Adam(params, cost, lr=0.0002, b1=0.1, b2=0.001, e=1e-8, l1 = 0., l2 = 0., maxnorm = 0., c = 8):
    """Build the list of Theano update pairs for one Adam step.

    params  -- shared variables to optimise
    cost    -- scalar expression to minimise
    lr      -- base learning rate
    b1, b2  -- weight given to the NEW gradient / squared gradient in the
               moving averages (the averages themselves decay by 1 - b1 and
               1 - b2)
    e       -- constant added to the denominator
    l1, l2  -- regularisation strengths passed to gradient_regularize
    maxnorm -- max-norm constraint passed to weight_regularize
    c       -- threshold passed to clip_norms

    Note the argument order (params, cost) differs from RMSprop (cost, params).

    Returns a list of (shared_variable, new_value) pairs.
    """
    updates = []
    grads = T.grad(cost, params)
    grads = clip_norms(grads, c)

    # shared step counter; i_t is the 1-based index of the current step
    i = theano.shared(floatX(0.))
    i_t = i + 1.
    # m and v are updated as b*new + (1-b)*old, so after t steps starting from
    # zero they are scaled by 1 - (1-b)**t. The correction must use the decay
    # factor (1-b), not b itself.
    fix1 = 1. - (1. - b1)**(i_t)
    fix2 = 1. - (1. - b2)**(i_t)
    lr_t = lr * (T.sqrt(fix2) / fix1)

    for p, g in zip(params, grads):
        # zero-initialised first and second moment accumulators
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        g_t = gradient_regularize(p, g_t, l1=l1, l2=l2)
        p_t = p - (lr_t * g_t)
        p_t = weight_regularize(p_t, maxnorm=maxnorm)

        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))

    updates.append((i, i_t))
    return updates

def RMSprop(cost, params, lr = 0.001, l1 = 0., l2 = 0., maxnorm = 0., rho=0.9, epsilon=1e-6, c = 8):
    """Build the list of Theano update pairs for one RMSprop step.

    Gradients of `cost` with respect to `params` are clipped with threshold `c`,
    regularised with `l1`/`l2`, and divided by the square root of a running
    average of their squares (decay `rho`, offset `epsilon`). The stepped
    parameter is then passed through weight_regularize with `maxnorm`.

    Returns a list of (shared_variable, new_value) pairs.
    """
    clipped = clip_norms(T.grad(cost, params), c)
    updates = []

    for param, grad in zip(params, clipped):
        grad = gradient_regularize(param, grad, l1 = l1, l2 = l2)
        # running mean of squared gradients, starts at zero
        mean_sq = theano.shared(param.get_value() * 0.)
        mean_sq_next = rho * mean_sq + (1 - rho) * grad ** 2
        updates.append((mean_sq, mean_sq_next))

        stepped = param - lr * (grad / T.sqrt(mean_sq_next + epsilon))
        updates.append((param, weight_regularize(stepped, maxnorm = maxnorm)))
    return updates

In [None]:
#makes output by shifting inputs down in time one step and then copying the last time step to the end.
def targetModifier(targetArray):
    """Return `targetArray` shifted up by one row, with its last row repeated at the end.

    The result has the same number of rows as the input.
    """
    shifted = targetArray[1:, :]
    final_step = targetArray[-1,:]
    return np.vstack((shifted, final_step))

def targetMaker(listOinputs):
    """Apply targetModifier to every input and return the results stacked in one array."""
    #TODO: do this with arrays
    return np.asarray([targetModifier(inp) for inp in listOinputs])

In [None]:
#Making the hex dictionary
# The 256 byte values as unpadded upper-case hex, comma separated
# ('0,1,...,FF'). Generated rather than typed out by hand so that no entry can
# be mistyped or left out.
hexstring = ','.join('%X' % byte for byte in range(256))

hexList = hexstring.lower().split(',')
hexList.append('EOP') #End Of Packet token
hexDict = {}

# Keys are two-character lower-case hex pairs ('00' ... 'ff') mapped to 0..255;
# single digits are left-padded with '0'. 'EOP' maps to 256.
for key, val in enumerate(hexList):
    if len(val) == 1:
        val = '0'+val
    hexDict[val] = key

#we add the 'EOP' id (256) on the end to signify the end of the packet
# Headers are read two characters at a time; a trailing odd character is
# ignored. Lower-casing lets upper-case hex dumps tokenize as well.
tokenizedHeader = [[hexDict[header.lower()[i:i+2]] for i in range(0,len(header)-2+1,2)]+[hexDict['EOP']] for header in data]


#list of arrays that represent a header with row = time
oneHotHeaders = [np.asarray([oneHot(item) for item in header]) for header in tokenizedHeader]

# Pad every header to 253 time steps. normalizeArrays does not truncate, so a
# header with more than 253 tokens keeps its own length.
normalizedData = normalizeArrays(oneHotHeaders, 253, reverse=False)

# Encoder RNN

In [None]:
#dim = original data dimension (columns)
dim = 5
rnnType = 'gru'
X = T.tensor3('inputs') #X is batch-major: (batch, time, features), e.g. (10,15,5)
linewt_init = Uniform(width=0.08)
rnnwt_init = IsotropicGaussian(0.05)
rnnbias_init = Constant(0.0)

# dimMultiplier is the width of the gate input relative to dim (2 for GRU, 4 for LSTM)
if rnnType == 'gru':
    rnn = GatedRecurrent(dim=dim, weights_init = rnnwt_init, biases_init = rnnbias_init, name = 'gru')
    dimMultiplier = 2
else:
    rnn = LSTM(dim=dim, weights_init = rnnwt_init, biases_init = rnnbias_init, name = 'lstm')
    dimMultiplier = 4

###ICLR suggestion -> don't use bias in RNNs
###RECURRENT LAYER
fork = Fork(output_names=['linear', 'gates'],
            name='fork', input_dim=dim, output_dims=[dim, dim * dimMultiplier], 
            weights_init = linewt_init, biases_init = rnnbias_init)
# Blocks recurrent bricks iterate over the FIRST axis of their input sequences.
# The data here is batch-major, so move time to the front before applying them;
# otherwise the recurrence would run across the mini-batch instead of across time.
data1, data2 = fork.apply(X.dimshuffle(1, 0, 2))

# The recurrent output is (time, batch, dim); index -1 on the time axis gives
# the last hidden state for each observation in the mini-batch: (batch, dim).
if rnnType == 'gru':
    hEnc = rnn.apply(data1, data2)[-1]
else:
    hinit, _ = rnn.apply(data2)
    hEnc = hinit[-1]

#initialize the weights; without this step the parameters are NaN.
fork.initialize()
rnn.initialize()

###ALT RNN LAYER
def initialize(to_init):
    """Give every brick in `to_init` the default initialisers and initialise it.

    Weights use Uniform(width=0.08) and biases Constant(0). Any initialisers
    already set on a brick are overwritten.
    """
    for brick in to_init:
        # Use the names imported at the top of the notebook: a bare
        # `initialization` module is never imported and raised NameError.
        brick.weights_init = Uniform(width=0.08)
        brick.biases_init = Constant(0)
        brick.initialize()

def gru_layer(dim, h, n):
    """Build GRU layer number `n` of width `dim` on top of `h` and return its output.

    A Fork produces the state inputs (`dim` wide) and gate inputs (`2 * dim`
    wide) from `h`. Both bricks are initialised via initialize() before being
    applied. `n` is only used to make the brick names unique.
    """
    suffix = str(n)
    fork = Fork(output_names=['linear' + suffix, 'gates' + suffix],
                name='fork' + suffix, input_dim=dim, output_dims=[dim, dim * 2])
    gru = GatedRecurrent(dim=dim, name='gru' + suffix)
    initialize([fork, gru])
    state_inputs, gate_inputs = fork.apply(h)
    return gru.apply(state_inputs, gate_inputs)


def lstm_layer(dim, h, n):
    """Build LSTM layer number `n` of width `dim` on top of `h`.

    A Linear brick projects `h` to `4 * dim` features, which are fed to the
    LSTM. Both bricks are initialised via initialize() first. Returns whatever
    `lstm.apply` returns.
    """
    suffix = str(n)
    projection = Linear(input_dim=dim, output_dim=dim * 4, name='linear' + suffix)
    lstm = LSTM(dim=dim, name='lstm' + suffix)
    initialize([projection, lstm])
    gate_inputs = projection.apply(h)
    return lstm.apply(gate_inputs)


In [None]:
#test
# Compile the encoder graph into a callable that maps X to hEnc.
dataTest = theano.function([X], hEnc, allow_input_downcast=True)

In [None]:
#test
# Random (10, 15, 5) array; the last axis matches dim = 5 used by the encoder.
fakeData = np.random.rand(10,15,5)
# Run the compiled encoder on it and display the result.
dataTest(fakeData)

# Context RNN

In [None]:

if rnnType == 'gru':
    # The brick width must equal the Fork output width below, so use `dim`.
    # The previous `numMiniSessions` is not defined anywhere in the notebook.
    rnnContext = GatedRecurrent(dim=dim, weights_init = rnnwt_init, biases_init = rnnbias_init, name = 'gru')
    dimMultiplier = 2
else:
    rnnContext = LSTM(dim=dim, weights_init = rnnwt_init, biases_init = rnnbias_init, name = 'lstm')
    dimMultiplier = 4

###ICLR suggestion -> don't use bias in RNNs
###RECURRENT LAYER
forkContext = Fork(output_names=['linearContext', 'gatesContext'],
            name='forkContext', input_dim=dim, output_dims=[dim, dim * dimMultiplier], 
            weights_init = linewt_init, biases_init = rnnbias_init)
data3, data4 = forkContext.apply(hEnc)

# NOTE(review): hEnc was built by indexing away one axis of the encoder output,
# so it has one dimension fewer than the encoder input. Confirm how encoder
# states are meant to be grouped into a sequence for this brick, and which axis
# the [:,-1] below should select.
if rnnType == 'gru':
    hContext = rnnContext.apply(data3, data4)[:,-1] #the [:,-1] gets the last hidden state for each obs in minibatch
else:
    hinitContext, _ = rnnContext.apply(data4)
    # read the context states just computed, not the encoder's `hinit`
    hContext = hinitContext[:,-1]

#initialize the weights; without this step the parameters are NaN.
forkContext.initialize()
rnnContext.initialize()

# Decoder RNN

In [None]:
#input is hContext
#we need original input for softmax
#figure out how cost is calculated
#pray


In [None]:
# Decoder: predicts the next token at every time step.
# numTokens = 256 byte values + the end-of-packet token.
numTokens = 257
rnnType = 'gru'
# X and Y are batch-major: (batch, time, numTokens)
X = T.tensor3('inputs')
Y = T.tensor3('outputs')
linewt_init = Uniform(width=0.02)
rnnwt_init = IsotropicGaussian(0.08)
rnnbias_init = Constant(0.0)

if rnnType == 'gru':
    rnnDec = GatedRecurrent(dim=numTokens, weights_init = rnnwt_init, biases_init = rnnbias_init, name = 'gru')
    dimMultiplier = 2
else:
    rnnDec = LSTM(dim=numTokens, weights_init = rnnwt_init, biases_init = rnnbias_init, name = 'lstm')
    dimMultiplier = 4

###ICLR suggestion -> don't use bias in RNNs
###RECURRENT LAYER
forkDec = Fork(output_names=['linear', 'gates'],
            name='fork', input_dim=numTokens, output_dims=[numTokens, numTokens * dimMultiplier], 
            weights_init = linewt_init, biases_init = rnnbias_init)
# Blocks recurrent bricks iterate over the FIRST axis of their input sequences,
# while the training data is batch-major. Move time to the front so the
# recurrence runs along each packet rather than across the mini-batch.
data5, data6 = forkDec.apply(X.dimshuffle(1, 0, 2))

if rnnType == 'gru':
    hDec = rnnDec.apply(data5, data6) 
else:
    hinit, _ = rnnDec.apply(data6)
    hDec = hinit

# Back to (batch, time, numTokens) so the code that consumes hDec (softmax over
# the last axis, cost slicing on axis 1) keeps its batch-major layout.
hDec = hDec.dimshuffle(1, 0, 2)

In [None]:
# Logistic sigmoid of the decoder states, written out explicitly.
pYx = 1/(1+T.exp(-hDec))
# Softmax applied to the sigmoid output; extra_ndim = 1 accounts for the one
# additional leading axis of the 3-D input.
softmax = NDimensionalSoftmax()
softout = softmax.apply(pYx, extra_ndim = 1)

#get weights initialized
forkDec.initialize()
rnnDec.initialize()

#test
# Compile a function that maps X to the softmax output.
decoderTest = theano.function([X], softout, allow_input_downcast=True)

In [None]:
#cost = BinaryCrossEntropy().apply(Y, softout)
# Element-wise binary cross-entropy terms. softout is a symbolic Theano tensor,
# so the logarithm has to be the symbolic T.log; NumPy's np.log is meant for
# concrete arrays.
precost = Y*T.log(softout) + (1-Y)*T.log(1-softout)
# Drop the last step on axis 1, sum over axes 2 and 1, average over axis 0, negate.
cost = -T.mean(T.sum(T.sum(precost[:,:-1,:], axis = 2), axis = 1))
cg = ComputationGraph([cost])

In [None]:
learning_rate = 0.01
# Collect every variable with the PARAMETER role from the cost graph.
params = VariableFilter(roles = [PARAMETER])(cg.variables)
#updates = Adam(params, cost, learning_rate, c=10) #c is gradient clipping parameter
# RMSprop takes (cost, params, lr, ...); c=1 is its gradient-clipping threshold.
updates = RMSprop(cost, params, learning_rate, c=1)

In [None]:
# Clipped gradients, compiled separately so their norms can be inspected.
gradients = T.grad(cost, params)
gradients = clip_norms(gradients, 1)
gradientFun = theano.function([X,Y], gradients, allow_input_downcast=True)
# train returns the cost and applies the RMSprop updates on every call.
train = theano.function([X,Y], cost, updates = updates, allow_input_downcast=True)
# predict returns the softmax output for X without updating anything.
predict = theano.function([X], softout, allow_input_downcast=True)

#test
# Three padded headers and their shifted-by-one targets.
inputs = np.asarray(normalizedData[:3])
outputs = targetMaker(inputs)

In [None]:
#shuffle data
# The shuffle reorders normalizedData in place; the first 90% of the shuffled
# list becomes the training set and the remainder the test set.
random.shuffle(normalizedData)
trainPercent = 0.9
trainIndex = int(len(normalizedData)*trainPercent)

trainData, testData = normalizedData[:trainIndex], normalizedData[trainIndex:]

In [None]:
#TODO: make a training function
runname = 'firstRun'
epochCost = []   # mean training cost, recorded every 30th epoch
gradNorms = []

epochs = 200000
batch_size = 64
iteration = 0    # total number of mini-batch updates so far

for epoch in range(epochs):

    costCollect = []

    # Visit every mini-batch, including the final (possibly shorter) one.
    # Pairing range(0, n, b) with range(b, n, b) always dropped the last batch.
    for start in range(0, len(trainData), batch_size):
        end = start + batch_size

        inputs = trainData[start:end]
        outputs = targetMaker(inputs)
        costfun = train(inputs, outputs)

        costCollect.append(costfun)

        iteration+=1

    ####SAVE COST TO FILE
    # Report every 30th epoch; skipped when no batch ran, so the mean is never
    # taken over an empty list and `inputs`/`outputs` are never stale.
    if epoch%30 == 0 and costCollect:
        # single-argument print calls behave the same on Python 2 and 3
        print(' ')
        print('Epoch:  %s' % (epoch,))
        epochCost.append(np.mean(costCollect))
        print('Epoch cost average:  %s' % (epochCost[-1],))
        # gradient norms on the last mini-batch of this epoch
        grads = gradientFun(inputs, outputs)
        for gra in grads:
            print('  gradient norms:  %s' % (np.linalg.norm(gra),))


    np.savetxt(runname+"_COST.csv", epochCost, delimiter=",")


# Pretraining essentials

In [None]:
# Decoder: predicts the next token at every time step.
# numTokens = 256 byte values + the end-of-packet token.
numTokens = 257
rnnType = 'gru'
# X and Y are batch-major: (batch, time, numTokens)
X = T.tensor3('inputs')
Y = T.tensor3('outputs')
linewt_init = Uniform(width=0.02)
rnnwt_init = IsotropicGaussian(0.08)
rnnbias_init = Constant(0.0)

if rnnType == 'gru':
    rnnDec = GatedRecurrent(dim=numTokens, weights_init = rnnwt_init, biases_init = rnnbias_init, name = 'gru')
    dimMultiplier = 2
else:
    rnnDec = LSTM(dim=numTokens, weights_init = rnnwt_init, biases_init = rnnbias_init, name = 'lstm')
    dimMultiplier = 4

###ICLR suggestion -> don't use bias in RNNs
###RECURRENT LAYER
forkDec = Fork(output_names=['linear', 'gates'],
            name='fork', input_dim=numTokens, output_dims=[numTokens, numTokens * dimMultiplier], 
            weights_init = linewt_init, biases_init = rnnbias_init)
# Blocks recurrent bricks iterate over the FIRST axis of their input sequences,
# while the training data is batch-major. Move time to the front so the
# recurrence runs along each packet rather than across the mini-batch.
data5, data6 = forkDec.apply(X.dimshuffle(1, 0, 2))

if rnnType == 'gru':
    hDec = rnnDec.apply(data5, data6) 
else:
    hinit, _ = rnnDec.apply(data6)
    hDec = hinit

# Back to (batch, time, numTokens) so the code that consumes hDec (softmax over
# the last axis, cost slicing on axis 1) keeps its batch-major layout.
hDec = hDec.dimshuffle(1, 0, 2)

#CRITICAL: need to loop through the arrays. Do regular people update after every sequence? or minibatch of seqs?
    
# Logistic sigmoid of the decoder states, followed by a softmax over the last axis.
pYx = 1/(1+T.exp(-hDec))
softmax = NDimensionalSoftmax()
softout = softmax.apply(pYx, extra_ndim = 1)

#get weights initialized
# Initialise the DECODER bricks built in this cell. The earlier code initialised
# the encoder's `fork`/`rnn`, leaving the decoder parameters uninitialised.
forkDec.initialize()
rnnDec.initialize()

#cost = BinaryCrossEntropy().apply(Y, softout)
# Element-wise binary cross-entropy terms. softout is a symbolic Theano tensor,
# so the logarithm has to be the symbolic T.log; NumPy's np.log is meant for
# concrete arrays.
precost = Y*T.log(softout) + (1-Y)*T.log(1-softout)
# Drop the last step on axis 1, sum over axes 2 and 1, average over axis 0, negate.
cost = -T.mean(T.sum(T.sum(precost[:,:-1,:], axis = 2), axis = 1))
cg = ComputationGraph([cost])

learning_rate = 0.01
# Collect every variable with the PARAMETER role from the cost graph.
params = VariableFilter(roles = [PARAMETER])(cg.variables)
#updates = Adam(params, cost, learning_rate, c=10) #c is gradient clipping parameter
# RMSprop takes (cost, params, lr, ...); c=1 is its gradient-clipping threshold.
updates = RMSprop(cost, params, learning_rate, c=1)

# Clipped gradients, compiled separately so their norms can be inspected.
gradients = T.grad(cost, params)
gradients = clip_norms(gradients, 1)
gradientFun = theano.function([X,Y], gradients, allow_input_downcast=True)
# train returns the cost and applies the RMSprop updates on every call.
train = theano.function([X,Y], cost, updates = updates, allow_input_downcast=True)
# predict returns the softmax output for X without updating anything.
predict = theano.function([X], softout, allow_input_downcast=True)

# The shuffle reorders normalizedData in place; the first 90% of the shuffled
# list becomes the training set and the remainder the test set.
random.shuffle(normalizedData)
trainPercent = 0.9
trainIndex = int(len(normalizedData)*trainPercent)

trainData, testData = normalizedData[:trainIndex], normalizedData[trainIndex:]

runname = 'firstRun'
epochCost = []   # mean training cost, recorded every 30th epoch
gradNorms = []

epochs = 200000
batch_size = 64
iteration = 0    # total number of mini-batch updates so far

for epoch in range(epochs):

    costCollect = []

    # Visit every mini-batch, including the final (possibly shorter) one.
    # Pairing range(0, n, b) with range(b, n, b) always dropped the last batch.
    for start in range(0, len(trainData), batch_size):
        end = start + batch_size

        inputs = trainData[start:end]
        outputs = targetMaker(inputs)
        costfun = train(inputs, outputs)

        costCollect.append(costfun)

        iteration+=1

    ####SAVE COST TO FILE
    # Report every 30th epoch; skipped when no batch ran, so the mean is never
    # taken over an empty list and `inputs`/`outputs` are never stale.
    if epoch%30 == 0 and costCollect:
        # single-argument print calls behave the same on Python 2 and 3
        print(' ')
        print('Epoch:  %s' % (epoch,))
        epochCost.append(np.mean(costCollect))
        print('Epoch cost average:  %s' % (epochCost[-1],))
        # gradient norms on the last mini-batch of this epoch
        grads = gradientFun(inputs, outputs)
        for gra in grads:
            print('  gradient norms:  %s' % (np.linalg.norm(gra),))


    np.savetxt(runname+"_COST.csv", epochCost, delimiter=",")


# Putting it all together

In [None]:
#output will be same dimension as input
# NOTE(review): `h` is not assigned anywhere in the visible notebook, so this
# line raises NameError on a fresh kernel -- confirm which hidden-state
# expression was meant.
test = theano.function([X], h, allow_input_downcast=True)

In [None]:
##DATA for GRU
#data.shape = (a,b,c)
#a = number in minibatch
#b = timesteps
#c = features in one timestep (eg. size of dictionary, number chars, embedding space)
# NOTE(review): Blocks recurrent bricks presumably scan the first axis as time,
# so confirm the axis order expected by whatever consumes this array.
fakeData = np.random.rand(10, 15, 5)



In [None]:
# Run the compiled `test` function on the random array and display the result.
test(fakeData)

In [None]:
# 5 -> 5 Linear brick with zero biases; Identity(2) is presumably an identity
# matrix scaled by 2 -- confirm against the Blocks initialization docs.
linear = Linear(input_dim = 5, output_dim = 5, weights_init=Identity(2), biases_init=Constant(0.0))

In [None]:
# Apply the brick symbolically to X, then initialise its parameters.
lineout = linear.apply(X)
linear.initialize()

In [None]:
# Compile a function that maps X to the Linear brick's output.
lineTest = theano.function([X], lineout, allow_input_downcast=True)

In [None]:
# Evaluate the linear layer on the first slice of fakeData (leading axis kept).
lineTest(fakeData[0:1])

In [None]:
# Show the current value of the first shared variable used by lineTest.
lineTest.get_shared()[0].get_value()

In [None]:
# Rebinds `dim` and `fork`, replacing the encoder's Fork defined earlier.
# Outputs: 'linear' (dim wide) and 'gates' (2 * dim wide).
dim = 5
fork = Fork(output_names=['linear', 'gates'],
                name='fork', input_dim=dim, output_dims=[dim, dim * 2], 
                weights_init = Uniform(width=0.08), biases_init = Constant(0.0))

In [None]:
# Initialise the Fork's parameters with the initialisers given at construction.
fork.initialize()

In [None]:
# Symbolic outputs of the Fork applied to X (one per name in output_names).
forkOut = fork.apply(X)

In [None]:
# Compile a function that maps X to the Fork outputs.
forkTest = theano.function([X], forkOut, allow_input_downcast=True)

In [None]:
# Evaluate the Fork on the random array and display the outputs.
forkTest(fakeData)

In [None]:
# Shape of the third shared variable used by forkTest.
forkTest.get_shared()[2].get_value().shape

# Converting to CPU

In [None]:
#GPU TO CPU conversion
#Now get the weights from the test function. These weights will be numpy arrays
# w1 is the value of the first shared variable of the compiled `test` function.
w1 = test.get_shared()[0].get_value()

#Here the weights are going to be set to the numpy arrays taken from the GPU predict function
# NOTE(review): `input_linear` is not defined anywhere in the visible notebook,
# so this line raises NameError on a fresh kernel -- confirm the target brick.
input_linear.parameters[0].set_value(w1)

In [None]:
# Shape of the third shared variable used by the compiled `test` function.
test.get_shared()[2].get_value().shape

In [None]:
# Rebinds `chars` (a list of hex digits near the top of the notebook) to a
# string of digits and lower-case letters.
chars = '1234567890abcdefghijklmnopqrstuvwxyz'
# List holding a single empty string.
words = ['']

# Scratchpad