In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Conv1D, MaxPooling1D
from tensorflow.keras.datasets import imdb
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import LayerNormalization

In [2]:
# --- Model hyperparameters ---------------------------------------------------
tsteps = 20          # first axis of the Input shape (steps per sample)
seg_len = 20         # second axis of the Input shape (values per step)
lstm_cell_size = 32  # units passed to each LSTM
attention_size = 6   # units passed to Attention
n_symb = 4           # width of the final softmax layer

# --- Optimizer settings ------------------------------------------------------
sgd_opt = 'adam'
lr = 4e-3
beta1 = 0.0
beta2 = 0.9999
eps = 1e-5

# Every setting is passed through its named variable (beta_1 included), so
# editing a value above really changes the optimizer.
opt = Adam(learning_rate=lr, beta_1=beta1, beta_2=beta2, epsilon=eps)

class Attention(tf.keras.Model):
    """Additive attention that pools a sequence of features into one vector.

    Args:
        units: Output width of the two Dense projections (``W1`` and ``W2``)
            whose sum is passed through tanh before scoring.
    """

    def __init__(self, units):
        super(Attention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        """Weight ``features`` along axis 1 and sum them.

        Args:
            features: Tensor to attend over; the softmax and the sum both run
                over its axis 1 (assumed (batch, time, dim) -- TODO confirm).
            hidden: Query tensor; an axis is inserted at position 1 so that it
                broadcasts against the projected ``features``.

        Returns:
            Tuple ``(context_vector, attention_weights)``: the weighted sum of
            ``features`` over axis 1, and the softmax weights that produced it.
        """
        query = tf.expand_dims(hidden, 1)
        projected = self.W1(features) + self.W2(query)
        logits = self.V(tf.nn.tanh(projected))
        attention_weights = tf.nn.softmax(logits, axis=1)
        context_vector = tf.reduce_sum(attention_weights * features, axis=1)
        return context_vector, attention_weights

# Symbolic input: each sample is a (tsteps, seg_len) array.
sequence_input = tf.keras.layers.Input(shape=(tsteps, seg_len,))

# First bidirectional LSTM. Because return_state=True the call returns several
# tensors (the summary lists them as bi_lstm_0[0][0], [0][1], ...), so `lstm`
# is a list here rather than a single tensor.
# NOTE(review): recurrent_activation='relu' overrides Keras' documented default
# ('sigmoid') for the LSTM gates -- confirm this is intended; the notebook
# reports exploding gradients after training this configuration.
lstm = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(
        lstm_cell_size,
        return_sequences=True,
        return_state=True,
        recurrent_activation='relu',
        recurrent_initializer='glorot_uniform'),
    name="bi_lstm_0")(sequence_input)

# Second bidirectional LSTM. It is called on the whole list from above, so it
# is wired to the first layer's state outputs as well as its sequence output
# (see the "Connected to" column of the summary).
# NOTE(review): Keras treats list entries after the first as initial_state --
# confirm that handing the states over like this is intended.
lstm, forward_h, forward_c, backward_h, backward_c = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(
        lstm_cell_size,
        return_sequences=True,
        return_state=True,
        recurrent_activation='relu',
        recurrent_initializer='glorot_uniform'))(lstm)

# Join the final states of both directions. Only state_h is used below;
# state_c is built but not connected to the model output.
state_h = tf.keras.layers.Concatenate()([forward_h, backward_h])
state_c = tf.keras.layers.Concatenate()([forward_c, backward_c])

# Attend over the per-step outputs, using the joined hidden state as the query.
context_vector, attention_weights = Attention(attention_size)(lstm, state_h)

output = tf.keras.layers.Dense(n_symb, activation='softmax')(context_vector)

BDA = tf.keras.Model(inputs=sequence_input, outputs=output)

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would add a stray "None" line).
BDA.summary()

BDA.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['categorical_accuracy'])

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 20, 20)]     0                                            
__________________________________________________________________________________________________
bi_lstm_0 (Bidirectional)       [(None, 20, 64), (No 13568       input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional (Bidirectional)   [(None, 20, 64), (No 24832       bi_lstm_0[0][0]                  
                                                                 bi_lstm_0[0][1]                  
                                                                 bi_lstm_0[0][2]                  
                                                                 bi_lstm_0[0][3]              

In [3]:
# Run the nvidia-smi shell command to show the GPU, driver/CUDA versions and
# current GPU memory usage before training.
!nvidia-smi

Fri Jun 12 04:14:03 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 431.40       Driver Version: 431.40       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GTX 166... WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   46C    P0    28W /  N/A |   4977MiB /  6144MiB |     13%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|    0  

In [4]:
# Imports and random-number seeding for the compression run in this cell.
from __future__ import print_function
from numpy.random import seed
# Seed NumPy's global random generator.
seed(1)

# Caution: this binds the name `random` to tensorflow.random, not to the
# standard-library `random` module.
from tensorflow import random
# Set TensorFlow's global random seed.
# NOTE(review): BDA is constructed in an earlier cell, i.e. before this seed is
# set -- confirm whether its weight initialisation is meant to be reproducible.
random.set_seed(1)

# arithc / fqt supply the arithmetic coder and frequency tables used below
# (presumably project-local modules; their source is not part of this notebook).
# ppm, sys, filecmp, clear_output and ProbabilityList are not referenced in the
# cells visible here.
import arithc as arith
import fqt, ppm
import contextlib, sys
import filecmp
from IPython.display import clear_output
import numpy as np

from tensorflow.keras.utils import to_categorical

from plist import ProbabilityList

# Compress the E. coli sequence with an arithmetic coder.
#
# Stage 1 codes the first n + k symbols with an adaptive frequency table.
# Stage 2 walks the rest of the sequence chunk by chunk: BDA is trained on one
# block of n samples, predicts the following block, and the predicted
# probabilities drive the coder for exactly the symbols of that block.

CHUNK_SIZE = 500000     # samples materialised per outer iteration
N_CHUNKS = 10           # hard-coded for this file: 9 full chunks + 1 partial
PROB_SCALE = 100000     # turns a predicted probability into an integer count
EOF_SYMBOL = 256        # coded once after the last symbol (presumably end-of-stream)
TABLE_SIZE = 257        # size handed to fqt.FlatFrequencyTable

n = 100000              # block size of the train -> predict hand-over
k = tsteps*seg_len      # context length: symbols fed to BDA per prediction
batch_size = 250
inputfile, outputfile = 'data/ecoli/Ecoli.txt', 'data/ecoli/Ecoli_BDA.txt'

# Read the whole file (not just its last line) so that no symbol is silently
# dropped. Any character other than a/c/g/t -- a newline included -- raises
# KeyError in the mapping below.
with open(inputfile) as f:
    ecoli = list(f.read())

temp_dict = {'a':97,'g': 103,'c': 99,'t': 116}
s = [temp_dict[i] for i in ecoli]
char_list = [97, 103, 99, 116] # we can read this as we go
update_period = len(s)

# Map each byte value to its class index 0..3.
legend = {code: idx for idx, code in enumerate(char_list)}
vocab_size = len(char_list)

# Class index of every symbol, computed once. It is used only to fill x and y;
# the coder calls below keep receiving plain Python ints via `legend`.
codes = np.array([legend[code] for code in s])

e_idx = 0               # position in s of the next symbol to encode
tempdict = {}

with contextlib.closing(arith.BitOutputStream(open(outputfile, "wb"))) as bitout:

    # ---- Stage 1: adaptive coding of the first n + k symbols ---------------
    initfreqs = fqt.FlatFrequencyTable(TABLE_SIZE)
    model = fqt.SimpleFrequencyTable(initfreqs)
    enc = arith.ArithmeticCoder(32)
    enc.start_encode(bitout)
    for symbol in s[:n+k]:
        sym = legend[symbol]
        t = model.get_total()
        l = model.get_low(sym)
        h = model.get_high(sym)
        enc.storeRegion(l,h,t)
        model.increment(sym)
        e_idx += 1

    # Fresh table for stage 2; its first n_symb entries are overwritten from
    # the network output before every coded symbol.
    initfreqs = fqt.FlatFrequencyTable(TABLE_SIZE)
    model = fqt.SimpleFrequencyTable(initfreqs)

    # ---- Stage 2: network-driven coding, chunk by chunk ---------------------
    for overall in range(N_CHUNKS):
        is_first = overall == 0
        is_last = overall == N_CHUNKS - 1

        # Sample j of this chunk predicts s[start + j] from the k symbols
        # before it. The last chunk runs to the end of the sequence.
        start = CHUNK_SIZE*overall + k
        stop = len(s) if is_last else start + CHUNK_SIZE
        n_samples = stop - start

        x = np.zeros((n_samples, tsteps, seg_len))
        y = np.zeros((n_samples, n_symb))
        print(len(x))
        print(overall)
        for row, pos in enumerate(range(start, stop)):
            x[row] = codes[pos-k:pos].reshape(tsteps, seg_len)
        y[np.arange(n_samples), codes[start:stop]] = 1   # one-hot targets

        predicted_val = []
        if not is_first:
            # Block 0 is predicted by the model as trained on the previous
            # chunk. In the very first chunk block 0 was already coded in
            # stage 1, so it is skipped there.
            predicted_val += list(BDA.predict(x[0:n]))

        if not is_last:
            n_blocks = len(x)//n
            for i in range(n_blocks - 1):
                train = slice(n*i, n*(i+1))
                # The final held-out block runs to the end of the chunk.
                held_stop = n*(i+2) if i < n_blocks - 2 else None
                held = slice(n*(i+1), held_stop)
                BDA.fit(x[train], y[train],
                        batch_size=batch_size,
                        epochs=1,
                        validation_data=(x[held], y[held]))
                predicted_val += list(BDA.predict(x[held]))
            # Train on the final block too before moving to the next chunk.
            BDA.fit(x[held], y[held],
                    batch_size=batch_size,
                    epochs=1)
        else:
            BDA.fit(x[0:n], y[0:n],
                    batch_size=batch_size,
                    epochs=1)
            predicted_val += list(BDA.predict(x[n:]))

        # Release the chunk arrays before the encoding pass.
        del x, y

        for prob_list in predicted_val:
            # Load the predicted distribution into the table; the +1 keeps
            # every count strictly positive.
            for val, prob in enumerate(prob_list):
                model.set(val, int(prob*PROB_SCALE)+1)

            sym = legend[s[e_idx]]
            t = model.get_total()
            l = model.get_low(sym)
            h = model.get_high(sym)
            enc.storeRegion(l,h,t)
            e_idx += 1
        del predicted_val

    # Every symbol must have been coded exactly once at this point.
    assert e_idx == len(s), (e_idx, len(s))

    t = model.get_total()
    l = model.get_low(EOF_SYMBOL)
    h = model.get_high(EOF_SYMBOL)
    enc.storeRegion(l,h,t)
    e_idx += 1
    print(e_idx)
    enc.finish_encode(bitout)

500000
0
Train on 100000 samples, validate on 100000 samples
Train on 100000 samples, validate on 100000 samples
Train on 100000 samples, validate on 100000 samples
Train on 100000 samples, validate on 100000 samples
Train on 100000 samples
500000
1
Train on 100000 samples, validate on 100000 samples
Train on 100000 samples, validate on 100000 samples
Train on 100000 samples, validate on 100000 samples
Train on 100000 samples, validate on 100000 samples
Train on 100000 samples
500000
2
Train on 100000 samples, validate on 100000 samples
Train on 100000 samples, validate on 100000 samples
Train on 100000 samples, validate on 100000 samples

KeyboardInterrupt: 

## We can see exploding gradients: we need to apply gradient clipping

In [29]:
# --- Model hyperparameters ---------------------------------------------------
tsteps = 200         # first axis of the Input shape (steps per sample)
seg_len = 1          # second axis of the Input shape (values per step)
lstm_cell_size = 16  # units passed to each LSTM
attention_size = 6   # units passed to Attention
n_symb = 4           # width of the final softmax layer

# --- Optimizer settings ------------------------------------------------------
sgd_opt = 'adam'
lr = 4e-3
beta1 = 0.0
beta2 = 0.9999
eps = 1e-5
batch_size = 250

# Every setting is passed through its named variable (beta_1 included), so
# editing a value above really changes the optimizer. clipvalue=0.5 enables
# gradient clipping, which the heading above calls for.
opt = Adam(
    learning_rate=lr, beta_1=beta1, beta_2=beta2, epsilon=eps, clipvalue=0.5
)

class Attention(tf.keras.Model):
    """Additive attention that pools a sequence of features into one vector.

    Args:
        units: Output width of the two Dense projections (``W1`` and ``W2``)
            whose sum is passed through tanh before scoring.
    """

    def __init__(self, units):
        super(Attention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        """Weight ``features`` along axis 1 and sum them.

        Args:
            features: Tensor to attend over; the softmax and the sum both run
                over its axis 1 (assumed (batch, time, dim) -- TODO confirm).
            hidden: Query tensor; an axis is inserted at position 1 so that it
                broadcasts against the projected ``features``.

        Returns:
            Tuple ``(context_vector, attention_weights)``: the weighted sum of
            ``features`` over axis 1, and the softmax weights that produced it.
        """
        query = tf.expand_dims(hidden, 1)
        projected = self.W1(features) + self.W2(query)
        logits = self.V(tf.nn.tanh(projected))
        attention_weights = tf.nn.softmax(logits, axis=1)
        context_vector = tf.reduce_sum(attention_weights * features, axis=1)
        return context_vector, attention_weights

# Symbolic input: each sample is a (tsteps, seg_len) array.
sequence_input = tf.keras.layers.Input(shape=(tsteps, seg_len,))

# First bidirectional LSTM. Because return_state=True the call returns several
# tensors (the summary lists them as bi_lstm_0[0][0], [0][1], ...), so `lstm`
# is a list here rather than a single tensor.
# NOTE(review): recurrent_activation='tanh' overrides Keras' documented default
# ('sigmoid') for the LSTM gates -- confirm this is intended.
lstm = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(
        lstm_cell_size,
        return_sequences=True,
        return_state=True,
        recurrent_activation='tanh',
        recurrent_initializer='glorot_uniform'),
    name="bi_lstm_0")(sequence_input)

# Second bidirectional LSTM. It is called on the whole list from above, so it
# is wired to the first layer's state outputs as well as its sequence output
# (see the "Connected to" column of the summary).
# NOTE(review): Keras treats list entries after the first as initial_state --
# confirm that handing the states over like this is intended.
lstm, forward_h, forward_c, backward_h, backward_c = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(
        lstm_cell_size,
        return_sequences=True,
        return_state=True,
        recurrent_activation='tanh',
        recurrent_initializer='glorot_uniform'))(lstm)

# Join the final states of both directions. Only state_h is used below;
# state_c is built but not connected to the model output.
state_h = tf.keras.layers.Concatenate()([forward_h, backward_h])
state_c = tf.keras.layers.Concatenate()([forward_c, backward_c])

# Attend over the per-step outputs, using the joined hidden state as the query.
context_vector, attention_weights = Attention(attention_size)(lstm, state_h)
output = tf.keras.layers.Dense(n_symb, activation='softmax')(context_vector)

BDA = tf.keras.Model(inputs=sequence_input, outputs=output)

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would add a stray "None" line).
BDA.summary()

BDA.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['categorical_accuracy'])

Model: "model_10"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_23 (InputLayer)           [(None, 200, 1)]     0                                            
__________________________________________________________________________________________________
bi_lstm_0 (Bidirectional)       [(None, 200, 32), (N 2304        input_23[0][0]                   
__________________________________________________________________________________________________
bidirectional_11 (Bidirectional [(None, 200, 32), (N 6272        bi_lstm_0[0][0]                  
                                                                 bi_lstm_0[0][1]                  
                                                                 bi_lstm_0[0][2]                  
                                                                 bi_lstm_0[0][3]           

In [None]:
# --- Model hyperparameters ---------------------------------------------------
tsteps = 200         # first axis of the Input shape (steps per sample)
seg_len = 1          # second axis of the Input shape (values per step)
lstm_cell_size = 16  # units passed to each LSTM
attention_size = 6   # units passed to Attention
n_symb = 4           # width of the final softmax layer

# --- Optimizer settings ------------------------------------------------------
sgd_opt = 'adam'
lr = 4e-3
beta1 = 0.0
beta2 = 0.9999
eps = 1e-5
batch_size = 250

# Every setting is passed through its named variable (beta_1 included), so
# editing a value above really changes the optimizer. clipvalue=0.5 enables
# gradient clipping.
opt = Adam(
    learning_rate=lr, beta_1=beta1, beta_2=beta2, epsilon=eps, clipvalue=0.5
)

class Attention(tf.keras.Model):
    """Additive attention that pools a sequence of features into one vector.

    Args:
        units: Output width of the two Dense projections (``W1`` and ``W2``)
            whose sum is passed through tanh before scoring.
    """

    def __init__(self, units):
        super(Attention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        """Weight ``features`` along axis 1 and sum them.

        Args:
            features: Tensor to attend over; the softmax and the sum both run
                over its axis 1 (assumed (batch, time, dim) -- TODO confirm).
            hidden: Query tensor; an axis is inserted at position 1 so that it
                broadcasts against the projected ``features``.

        Returns:
            Tuple ``(context_vector, attention_weights)``: the weighted sum of
            ``features`` over axis 1, and the softmax weights that produced it.
        """
        query = tf.expand_dims(hidden, 1)
        projected = self.W1(features) + self.W2(query)
        logits = self.V(tf.nn.tanh(projected))
        attention_weights = tf.nn.softmax(logits, axis=1)
        context_vector = tf.reduce_sum(attention_weights * features, axis=1)
        return context_vector, attention_weights

    
    
# Plain bidirectional-LSTM model with explicit forward and backward layers.
# NOTE(review): nothing visible in this notebook trains this model, and the
# compression cell later rebinds the name `model` to a frequency table.
model = Sequential()
forward_layer = tf.keras.layers.LSTM(32, return_sequences=True)
backward_layer = tf.keras.layers.LSTM(32, activation='relu', return_sequences=True,
                                      go_backwards=True)
# Bidirectional is reached through tf.keras.layers: the bare name is never
# imported in this notebook, so using it directly raises NameError.
model.add(tf.keras.layers.Bidirectional(forward_layer, backward_layer=backward_layer,
                                        input_shape=(tsteps, seg_len,)))
model.add(tf.keras.layers.BatchNormalization())
# One output unit per symbol class (n_symb is 4 in this cell).
model.add(Dense(n_symb))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    
    
    
# Symbolic input: each sample is a (tsteps, seg_len) array.
sequence_input = tf.keras.layers.Input(shape=(tsteps, seg_len,))

# First bidirectional LSTM. Because return_state=True the call returns several
# tensors, so `lstm` is a list here rather than a single tensor.
# NOTE(review): recurrent_activation='tanh' overrides Keras' documented default
# ('sigmoid') for the LSTM gates -- confirm this is intended.
lstm = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(
        lstm_cell_size,
        return_sequences=True,
        return_state=True,
        recurrent_activation='tanh',
        recurrent_initializer='glorot_uniform'),
    name="bi_lstm_0")(sequence_input)

# Second bidirectional LSTM, called on the whole list from above.
# NOTE(review): Keras treats list entries after the first as initial_state --
# confirm that handing the states over like this is intended.
lstm, forward_h, forward_c, backward_h, backward_c = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(
        lstm_cell_size,
        return_sequences=True,
        return_state=True,
        recurrent_activation='tanh',
        recurrent_initializer='glorot_uniform'))(lstm)

# Join the final states of both directions and batch-normalise them. Only
# state_h is used below; state_c is built but not connected to the output.
state_h = tf.keras.layers.Concatenate()([forward_h, backward_h])
state_c = tf.keras.layers.Concatenate()([forward_c, backward_c])
state_h = tf.keras.layers.BatchNormalization()(state_h)
state_c = tf.keras.layers.BatchNormalization()(state_c)

# Attend over the per-step outputs with the normalised hidden state as the
# query, then normalise the pooled context before the classifier.
context_vector, attention_weights = Attention(attention_size)(lstm, state_h)
context_vector = tf.keras.layers.BatchNormalization()(context_vector)
output = tf.keras.layers.Dense(n_symb, activation='softmax')(context_vector)

BDA = tf.keras.Model(inputs=sequence_input, outputs=output)

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would add a stray "None" line).
BDA.summary()

BDA.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['categorical_accuracy'])

In [30]:
# Imports and random-number seeding for the compression run in this cell.
from __future__ import print_function
from numpy.random import seed
# Seed NumPy's global random generator.
seed(1)

# Caution: this binds the name `random` to tensorflow.random, not to the
# standard-library `random` module.
from tensorflow import random
# Set TensorFlow's global random seed.
# NOTE(review): BDA is constructed in an earlier cell, i.e. before this seed is
# set -- confirm whether its weight initialisation is meant to be reproducible.
random.set_seed(1)

# arithc / fqt supply the arithmetic coder and frequency tables used below
# (presumably project-local modules; their source is not part of this notebook).
# ppm, sys, filecmp, clear_output and ProbabilityList are not referenced in the
# cells visible here.
import arithc as arith
import fqt, ppm
import contextlib, sys
import filecmp
from IPython.display import clear_output
import numpy as np

from tensorflow.keras.utils import to_categorical

from plist import ProbabilityList

# Compress the E. coli sequence with an arithmetic coder.
#
# Stage 1 codes the first n + k symbols with an adaptive frequency table.
# Stage 2 walks the rest of the sequence chunk by chunk: BDA is trained on one
# block of n samples, predicts the following block, and the predicted
# probabilities drive the coder for exactly the symbols of that block.
#
# `batch_size` is taken from the model-configuration cell.

CHUNK_SIZE = 500000     # samples materialised per outer iteration
N_CHUNKS = 10           # hard-coded for this file: 9 full chunks + 1 partial
PROB_SCALE = 100000     # turns a predicted probability into an integer count
EOF_SYMBOL = 256        # coded once after the last symbol (presumably end-of-stream)
TABLE_SIZE = 257        # size handed to fqt.FlatFrequencyTable

n = 100000              # block size of the train -> predict hand-over
k = tsteps*seg_len      # context length: symbols fed to BDA per prediction
inputfile, outputfile = 'data/ecoli/Ecoli.txt', 'data/ecoli/Ecoli_BDA.txt'

# Read the whole file (not just its last line) so that no symbol is silently
# dropped. Any character other than a/c/g/t -- a newline included -- raises
# KeyError in the mapping below.
with open(inputfile) as f:
    ecoli = list(f.read())

temp_dict = {'a':97,'g': 103,'c': 99,'t': 116}
s = [temp_dict[i] for i in ecoli]
char_list = [97, 103, 99, 116] # we can read this as we go
update_period = len(s)

# Map each byte value to its class index 0..3.
legend = {code: idx for idx, code in enumerate(char_list)}
vocab_size = len(char_list)

# Class index of every symbol, computed once. It is used only to fill x and y;
# the coder calls below keep receiving plain Python ints via `legend`.
codes = np.array([legend[code] for code in s])

e_idx = 0               # position in s of the next symbol to encode
tempdict = {}

with contextlib.closing(arith.BitOutputStream(open(outputfile, "wb"))) as bitout:

    # ---- Stage 1: adaptive coding of the first n + k symbols ---------------
    initfreqs = fqt.FlatFrequencyTable(TABLE_SIZE)
    model = fqt.SimpleFrequencyTable(initfreqs)
    enc = arith.ArithmeticCoder(32)
    enc.start_encode(bitout)
    for symbol in s[:n+k]:
        sym = legend[symbol]
        t = model.get_total()
        l = model.get_low(sym)
        h = model.get_high(sym)
        enc.storeRegion(l,h,t)
        model.increment(sym)
        e_idx += 1

    # Fresh table for stage 2; its first n_symb entries are overwritten from
    # the network output before every coded symbol.
    initfreqs = fqt.FlatFrequencyTable(TABLE_SIZE)
    model = fqt.SimpleFrequencyTable(initfreqs)

    # ---- Stage 2: network-driven coding, chunk by chunk ---------------------
    for overall in range(N_CHUNKS):
        is_first = overall == 0
        is_last = overall == N_CHUNKS - 1

        # Sample j of this chunk predicts s[start + j] from the k symbols
        # before it. The last chunk runs to the end of the sequence.
        start = CHUNK_SIZE*overall + k
        stop = len(s) if is_last else start + CHUNK_SIZE
        n_samples = stop - start

        x = np.zeros((n_samples, tsteps, seg_len))
        y = np.zeros((n_samples, n_symb))
        print(len(x))
        print(overall)
        for row, pos in enumerate(range(start, stop)):
            x[row] = codes[pos-k:pos].reshape(tsteps, seg_len)
        y[np.arange(n_samples), codes[start:stop]] = 1   # one-hot targets

        predicted_val = []
        if not is_first:
            # Block 0 is predicted by the model as trained on the previous
            # chunk. In the very first chunk block 0 was already coded in
            # stage 1, so it is skipped there.
            predicted_val += list(BDA.predict(x[0:n]))

        if not is_last:
            n_blocks = len(x)//n
            for i in range(n_blocks - 1):
                train = slice(n*i, n*(i+1))
                # The final held-out block runs to the end of the chunk.
                held_stop = n*(i+2) if i < n_blocks - 2 else None
                held = slice(n*(i+1), held_stop)
                BDA.fit(x[train], y[train],
                        batch_size=batch_size,
                        epochs=1,
                        validation_data=(x[held], y[held]))
                predicted_val += list(BDA.predict(x[held]))
            # Train on the final block too before moving to the next chunk.
            BDA.fit(x[held], y[held],
                    batch_size=batch_size,
                    epochs=1)
        else:
            BDA.fit(x[0:n], y[0:n],
                    batch_size=batch_size,
                    epochs=1)
            predicted_val += list(BDA.predict(x[n:]))

        # Release the chunk arrays before the encoding pass.
        del x, y

        for prob_list in predicted_val:
            # Load the predicted distribution into the table; the +1 keeps
            # every count strictly positive.
            for val, prob in enumerate(prob_list):
                model.set(val, int(prob*PROB_SCALE)+1)

            sym = legend[s[e_idx]]
            t = model.get_total()
            l = model.get_low(sym)
            h = model.get_high(sym)
            enc.storeRegion(l,h,t)
            e_idx += 1
        del predicted_val

    # Every symbol must have been coded exactly once at this point.
    assert e_idx == len(s), (e_idx, len(s))

    t = model.get_total()
    l = model.get_low(EOF_SYMBOL)
    h = model.get_high(EOF_SYMBOL)
    enc.storeRegion(l,h,t)
    e_idx += 1
    print(e_idx)
    enc.finish_encode(bitout)

500000
0
Train on 100000 samples, validate on 100000 samples


KeyboardInterrupt: 