In [1]:
from __future__ import print_function

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Conv1D, MaxPooling1D
from tensorflow.keras.datasets import imdb

# --- Hyper-parameters for the IMDB convolution + LSTM example ---

# Embedding: vocabulary limit, padded sequence length, embedding width
max_features, maxlen, embedding_size = 20000, 100, 128

# Convolution and pooling
kernel_size, filters, pool_size = 5, 64, 4

# LSTM
lstm_output_size = 70

# Training
batch_size, epochs = 30, 2

# Note:
# batch_size is highly sensitive.
# Only 2 epochs are needed as the dataset is very small.

# Load the IMDB train/test splits, restricted via num_words=max_features.
print('Loading data...')
train_split, test_split = imdb.load_data(num_words=max_features)
x_train, y_train = train_split
x_test, y_test = test_split
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

# Bring every review to exactly `maxlen` entries so the splits become 2-D arrays.
print('Pad sequences (samples x time)')
x_train, x_test = (sequence.pad_sequences(reviews, maxlen=maxlen)
                   for reviews in (x_train, x_test))
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')

# Layer stack: embedding -> dropout -> 1-D convolution -> max pooling -> LSTM
# -> single dense unit -> sigmoid, compiled for binary classification.
model = Sequential([
    Embedding(max_features, embedding_size, input_length=maxlen),
    Dropout(0.25),
    Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1),
    MaxPooling1D(pool_size=pool_size),
    LSTM(lstm_output_size),
    Dense(1),
    Activation('sigmoid'),
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Fit on the training split; the test split is passed as validation data and
# is then evaluated once more to obtain the final loss and accuracy.
print('Train...')
model.fit(x_train, y_train,
          batch_size=batch_size, epochs=epochs,
          validation_data=(x_test, y_test))
test_metrics = model.evaluate(x_test, y_test, batch_size=batch_size)
score, acc = test_metrics
print('Test score:', score)
print('Test accuracy:', acc)

Loading data...
25000 train sequences
25000 test sequences
Pad sequences (samples x time)
x_train shape: (25000, 100)
x_test shape: (25000, 100)
Build model...
Train...
Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


Test score: 0.3490920394256711
Test accuracy: 0.85564


In [2]:
# Display the dimensions of the padded training matrix.
x_train.shape

(25000, 100)

In [3]:
import arithc as arith
import fqt, ppm
import contextlib, sys
import filecmp
from IPython.display import clear_output
import numpy as np

from tensorflow.keras.utils import to_categorical

In [4]:
# Read the genome file; every line becomes a list of single characters.
# Forward slashes keep the path valid on Windows and POSIX alike, whereas
# backslashes would form invalid escape sequences such as "\e".
list_of_lists = []
with open('data/ecoli/Ecoli.txt') as f:
    for line in f:
        # Drop any line terminator so that only sequence letters remain
        # (a stray newline would otherwise fail the letter lookup below).
        list_of_lists.append(list(line.rstrip('\r\n')))
print(len(list_of_lists[0])) # About 4 MB
ecoli = list_of_lists[0]  # only the first line of the file is used

char_list = [97, 103, 99, 116] # ASCII codes of 'a', 'g', 'c', 't'
print(char_list)
update_period = len(ecoli)

k = 64 #context length
n = 100000 # train every
# Map each ASCII code to its class index 0..3.  A dict comprehension with its
# own variable names avoids reusing `k`, the context length, as a loop variable.
legend = {code: index for index, code in enumerate(char_list)}
vocab_size = len(char_list)

# Letters -> ASCII codes (kept in `s`) -> class indices (kept in `codes`).
temp_dict = {'a':97,'g': 103,'c': 99,'t': 116}
s = [temp_dict[letter] for letter in ecoli]
codes = np.array([legend[code] for code in s])

# Training data: row i of x holds the k class indices preceding position i+k,
# and row i of y is the one-hot encoding of the symbol at position i+k.
num_windows = update_period - k
x = np.zeros((num_windows, k)) # 64 characters context
y = np.zeros((num_windows, vocab_size))

print(len(s))
# Fill x one column at a time: column j of the window matrix is simply the
# sequence shifted by j.  This replaces a Python loop over every window.
for offset in range(k):
    x[:, offset] = codes[offset:offset + num_windows]
# One-hot targets in a single indexed assignment.
y[np.arange(num_windows), codes[k:]] = 1

4638690
[97, 103, 99, 116]
4638690


In [5]:
from __future__ import print_function

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Conv1D, MaxPooling1D
from tensorflow.keras.datasets import imdb

# --- Hyper-parameters for the sequence model ---

# Embedding: table size, context window (taken from k), embedding width
max_features, maxlen, embedding_size = 20000, k, 128

# Convolution and pooling
kernel_size, filters, pool_size = 5, 64, 4

# LSTM size as used in the IMDB example
lstm_output_size = 70

# Training
batch_size, epochs = 30, 2

seq_length = 256  # sequence-length setting for the LSTM input
vocab_size = len(char_list)  # width of the final dense layer of the model
lstm_cells = 32  # width of the LSTM layer


# Next-symbol predictor: embedding -> dropout -> 1-D convolution -> max pooling
# -> LSTM -> dense layer of width vocab_size -> softmax, trained with
# categorical cross-entropy.
D1LSTM = Sequential([
    Embedding(max_features, embedding_size, input_length=k),
    Dropout(0.25),
    Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1),
    MaxPooling1D(pool_size=pool_size),
    LSTM(lstm_cells),
    Dense(vocab_size),
    Activation('softmax'),
])
D1LSTM.compile(optimizer='adam',
               loss='categorical_crossentropy',
               metrics=['categorical_accuracy'])

In [6]:
# Print the layer-by-layer summary of the D1LSTM model.
D1LSTM.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 64, 128)           2560000   
_________________________________________________________________
dropout_1 (Dropout)          (None, 64, 128)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 60, 64)            41024     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 15, 64)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                12416     
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 132       
_________________________________________________________________
activation_1 (Activation)    (None, 4)                

In [7]:
# Online scheme: train on chunk i (n windows), then predict chunk i+1 with the
# model as it stands at that moment.  Predictions are collected in order, so
# predicted_onehot[j] belongs to window n + j of x.
predicted_onehot = []
num_chunks = len(ecoli)//n
for i in range(num_chunks - 1):
    if i%5 == 0:
        print(i)
    train_rows = slice(n*i, n*(i+1))
    next_rows = slice(n*(i+1), n*(i+2))
    D1LSTM.fit(x[train_rows], y[train_rows],
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x[next_rows], y[next_rows]))

    # predict() returns the softmax outputs directly; predict_proba() is
    # deprecated and absent from newer TensorFlow releases.
    predicted_onehot += list(D1LSTM.predict(x[next_rows]))
# Remaining windows after the last full chunk, predicted with the final model.
predicted_onehot += list(D1LSTM.predict(x[n*num_chunks:]))

0
Train on 100000 samples, validate on 100000 samples
Epoch 1/2
Epoch 2/2
Train on 100000 samples, validate on 100000 samples
Epoch 1/2
Epoch 2/2
Train on 100000 samples, validate on 100000 samples
Epoch 1/2
Epoch 2/2
Train on 100000 samples, validate on 100000 samples
Epoch 1/2
Epoch 2/2
Train on 100000 samples, validate on 100000 samples
Epoch 1/2
Epoch 2/2
5
Train on 100000 samples, validate on 100000 samples
Epoch 1/2
Epoch 2/2
Train on 100000 samples, validate on 100000 samples
Epoch 1/2
Epoch 2/2
Train on 100000 samples, validate on 100000 samples
Epoch 1/2
Epoch 2/2
Train on 100000 samples, validate on 100000 samples
Epoch 1/2
Epoch 2/2
Train on 100000 samples, validate on 100000 samples
Epoch 1/2
Epoch 2/2
10
Train on 100000 samples, validate on 100000 samples
Epoch 1/2
Epoch 2/2
Train on 100000 samples, validate on 100000 samples
Epoch 1/2
Epoch 2/2
Train on 100000 samples, validate on 100000 samples
Epoch 1/2
Epoch 2/2
Train on 100000 samples, validate on 100000 samples
Epoch

Epoch 2/2
Train on 100000 samples, validate on 100000 samples
Epoch 1/2
Epoch 2/2
Train on 100000 samples, validate on 100000 samples
Epoch 1/2
Epoch 2/2
Train on 100000 samples, validate on 100000 samples
Epoch 1/2
Epoch 2/2
Train on 100000 samples, validate on 100000 samples
Epoch 1/2
Epoch 2/2


In [9]:
# Save the accumulated prediction vectors to disk under the name 'tempura'.
np.save('tempura',predicted_onehot)

In [10]:
char_list = [97, 103, 99, 116]
def LSTM1D_Compress(inp, bitout):
    """Arithmetic-code the symbols read from `inp` into `bitout`.

    The first n+k symbols are coded with an adaptive table: after each symbol
    is coded, its count is incremented.  At symbol n+k+1 the table is rebuilt
    from scratch, and from then on, before each symbol is coded, the entries
    for the four codes in `char_list` are overwritten with
    int(prob*100000)+1 taken from the matching row of `predicted_onehot`.
    After the input is exhausted, symbol 256 is coded (presumably the
    end-of-stream marker) and the encoder is finished.

    Reads the globals `ecoli` (progress reporting only), `n`, `k`,
    `predicted_onehot` and `char_list`.

    Args:
        inp: object whose read(1) returns one symbol, or an empty value at
            end of input.
        bitout: output stream handed to the arithmetic coder.
    """
    def fresh_table():
        # A SimpleFrequencyTable built from FlatFrequencyTable(257).
        return fqt.SimpleFrequencyTable(fqt.FlatFrequencyTable(257))

    def emit(table, sym):
        # Code `sym` using its cumulative range in `table`.
        total = table.get_total()
        low = table.get_low(sym)
        high = table.get_high(sym)
        enc.storeRegion(low, high, total)

    model = fresh_table()
    enc = arith.ArithmeticCoder(32)
    enc.start_encode(bitout)

    switch_at = n + k + 1          # first (1-based) symbol coded from predictions
    tenth = len(ecoli)//10         # progress is reported every tenth of the data
    idx = 0
    while True:
        symbol = inp.read(1)
        if not symbol:
            break
        idx += 1

        # Progress report (internal use only)
        if idx % tenth == 0:
            print(str(10*idx//tenth) + ' percent done')
            clear_output(wait = True)

        if idx == switch_at:
            model = fresh_table()  # discard the adaptive counts

        adaptive = idx < switch_at
        if not adaptive:
            for slot, p in enumerate(predicted_onehot[idx - switch_at]):
                model.set(char_list[slot], int(p*100000)+1)

        emit(model, symbol[0])

        if adaptive:
            model.increment(symbol[0])

    emit(model, 256)
    enc.finish_encode(bitout)
# Forward slashes work on Windows and POSIX alike, whereas backslashes would
# form invalid escape sequences such as "\e" inside a normal string literal.
inputfile, outputfile = 'data/ecoli/Ecoli.txt', 'data/ecoli/Ecoli_LSTM1D.txt'

#Perform file compression
# The input is opened in binary mode; the bit output stream is closed by
# contextlib.closing when the block exits.
with open(inputfile, "rb") as inp, \
        contextlib.closing(arith.BitOutputStream(open(outputfile, "wb"))) as bitout:
    LSTM1D_Compress(inp, bitout)

100 percent done


## This is very good; however, we are not sure it is worth the trade-off in running time

Let's check the prediction accuracy.

In [19]:
# Turn each prediction row into the letter with the highest score, then count
# how many of those letters equal the actual sequence from position n+k onward.
cc = ['a','g','c','t']
predicted_label = [cc[np.argmax(row)] for row in predicted_onehot]

actual_label = np.array(ecoli[n+k:])
num_correct = np.sum(np.array(predicted_label) == actual_label)
print(num_correct)

1529883


In [20]:
# Fraction of correct predictions, computed from the data rather than from a
# count copied by hand out of an earlier output (which goes stale on re-run).
float(np.mean(np.array(predicted_label) == np.array(ecoli[n+k:])))

0.33708064951815814

34% right! Let's consider what happens if we drop the 100,000-symbol delay and use the final model on the entire file.

In [21]:
# Predictions of the current model for the first n windows.  predict() returns
# the softmax outputs; predict_proba() is deprecated and absent from newer
# TensorFlow releases.
predicted_cheat = list(D1LSTM.predict(x[:n]))

In [22]:
char_list = [97, 103, 99, 116]
def LSTM1D_Cheat_Compress(inp, bitout):
    """Arithmetic-code `inp` into `bitout`, using predictions from symbol k+1 on.

    The first k symbols are coded with an adaptive table (the coded symbol's
    count is incremented afterwards).  At symbol k+1 the table is rebuilt, and
    from then on, before each symbol is coded, the entries for the four codes
    in `char_list` are overwritten with int(prob*100000)+1:
      * symbols k+1 .. n+k take their row from `predicted_cheat`,
      * symbols n+k+1 onward take their row from `predicted_onehot`.
    After the input is exhausted, symbol 256 is coded (presumably the
    end-of-stream marker) and the encoder is finished.

    Reads the globals `ecoli` (progress reporting only), `n`, `k`,
    `predicted_cheat`, `predicted_onehot` and `char_list`.

    Args:
        inp: object whose read(1) returns one symbol, or an empty value at
            end of input.
        bitout: output stream handed to the arithmetic coder.
    """
    def fresh_table():
        # A SimpleFrequencyTable built from FlatFrequencyTable(257).
        return fqt.SimpleFrequencyTable(fqt.FlatFrequencyTable(257))

    def emit(table, sym):
        # Code `sym` using its cumulative range in `table`.
        total = table.get_total()
        low = table.get_low(sym)
        high = table.get_high(sym)
        enc.storeRegion(low, high, total)

    model = fresh_table()
    enc = arith.ArithmeticCoder(32)
    enc.start_encode(bitout)

    first_cheat = k + 1            # first symbol coded from predicted_cheat
    first_online = n + k + 1       # first symbol coded from predicted_onehot
    tenth = len(ecoli)//10         # progress is reported every tenth of the data
    idx = 0
    while True:
        symbol = inp.read(1)
        if not symbol:
            break
        idx += 1

        # Progress report (internal use only)
        if idx % tenth == 0:
            print(str(10*idx//tenth) + ' percent done')
            clear_output(wait = True)

        if idx == first_cheat:
            model = fresh_table()  # discard the adaptive counts

        # Pick the prediction row for this symbol, if any.
        if idx >= first_online:
            row = predicted_onehot[idx - first_online]
        elif idx >= first_cheat:
            row = predicted_cheat[idx - first_cheat]
        else:
            row = None
        if row is not None:
            for slot, p in enumerate(row):
                model.set(char_list[slot], int(p*100000)+1)

        emit(model, symbol[0])

        if idx < first_cheat:
            model.increment(symbol[0])

    emit(model, 256)
    enc.finish_encode(bitout)
# Forward slashes work on Windows and POSIX alike, whereas backslashes would
# form invalid escape sequences such as "\e" inside a normal string literal.
inputfile, outputfile = 'data/ecoli/Ecoli.txt', 'data/ecoli/Ecoli_LSTM1D_cheat.txt'

#Perform file compression
# The input is opened in binary mode; the bit output stream is closed by
# contextlib.closing when the block exits.
with open(inputfile, "rb") as inp, \
        contextlib.closing(arith.BitOutputStream(open(outputfile, "wb"))) as bitout:
    LSTM1D_Cheat_Compress(inp, bitout)

100 percent done


In [23]:
# Predictions of the current model for every window.  predict() returns the
# softmax outputs; predict_proba() is deprecated and absent from newer
# TensorFlow releases.
predicted_full = list(D1LSTM.predict(x))

In [24]:
char_list = [97, 103, 99, 116]
# NOTE(review): this definition reuses the name LSTM1D_Cheat_Compress and
# therefore replaces the earlier function of the same name once this cell runs.
def LSTM1D_Cheat_Compress(inp, bitout):
    """Arithmetic-code `inp` into `bitout`, using `predicted_full` from symbol k+1 on.

    The first k symbols are coded with an adaptive table (the coded symbol's
    count is incremented afterwards).  At symbol k+1 the table is rebuilt, and
    from then on, before each symbol is coded, the entries for the four codes
    in `char_list` are overwritten with int(prob*100000)+1 taken from the
    matching row of `predicted_full`.  After the input is exhausted, symbol 256
    is coded (presumably the end-of-stream marker) and the encoder is finished.

    Reads the globals `ecoli` (progress reporting only), `k`, `predicted_full`
    and `char_list`.

    Args:
        inp: object whose read(1) returns one symbol, or an empty value at
            end of input.
        bitout: output stream handed to the arithmetic coder.
    """
    def fresh_table():
        # A SimpleFrequencyTable built from FlatFrequencyTable(257).
        return fqt.SimpleFrequencyTable(fqt.FlatFrequencyTable(257))

    def emit(table, sym):
        # Code `sym` using its cumulative range in `table`.
        total = table.get_total()
        low = table.get_low(sym)
        high = table.get_high(sym)
        enc.storeRegion(low, high, total)

    model = fresh_table()
    enc = arith.ArithmeticCoder(32)
    enc.start_encode(bitout)

    first_predicted = k + 1        # first (1-based) symbol coded from predictions
    tenth = len(ecoli)//10         # progress is reported every tenth of the data
    idx = 0
    while True:
        symbol = inp.read(1)
        if not symbol:
            break
        idx += 1

        # Progress report (internal use only)
        if idx % tenth == 0:
            print(str(10*idx//tenth) + ' percent done')
            clear_output(wait = True)

        if idx == first_predicted:
            model = fresh_table()  # discard the adaptive counts

        adaptive = idx < first_predicted
        if not adaptive:
            for slot, p in enumerate(predicted_full[idx - first_predicted]):
                model.set(char_list[slot], int(p*100000)+1)

        emit(model, symbol[0])

        if adaptive:
            model.increment(symbol[0])

    emit(model, 256)
    enc.finish_encode(bitout)
# Forward slashes work on Windows and POSIX alike, whereas backslashes would
# form invalid escape sequences such as "\e" inside a normal string literal.
inputfile, outputfile = 'data/ecoli/Ecoli.txt', 'data/ecoli/Ecoli_LSTM1D_full.txt'

#Perform file compression
# The input is opened in binary mode; the bit output stream is closed by
# contextlib.closing when the block exits.
with open(inputfile, "rb") as inp, \
        contextlib.closing(arith.BitOutputStream(open(outputfile, "wb"))) as bitout:
    LSTM1D_Cheat_Compress(inp, bitout)

100 percent done
