In [1]:
import arithc as arith
import fqt, ppm
import contextlib, sys
import filecmp
from IPython.display import clear_output
import numpy as np

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Conv1D, MaxPooling1D
from tensorflow.keras.datasets import imdb
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import LayerNormalization
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Bidirectional

from plist import ProbabilityList
import timeit


In [2]:
from __future__ import print_function
from numpy.random import seed
seed(1)

from tensorflow import random
random.set_seed(1)

with open('data/ecoli/Ecoli.txt') as f:
    for line in f:
        ecoli = list(line)



temp_dict = {'a':97,'g': 103,'c': 99,'t': 116}
char_list = [97, 103, 99, 116] # we can read this as we go
legend = dict([(v, k) for k, v in enumerate(char_list)]) # map character to 0,1,2,3,4, etc.
s =  [legend[temp_dict[i]] for i in ecoli]

vocab_size = len(char_list)

n = 100000 # number of samples
tsteps = 30 #time steps
seg_len = 6 #input_dim
k = tsteps*seg_len # full context for each sample
n_symb = 4

# optimizer
sgd_opt = 'adam'
lr = 4e-3
beta1 = 0
beta2 = 0.9999
eps=1e-5

# LSTM Training
hidden_size = 32
batch_size = 250

epochs = 1

n_layer = 4 #only 4 total laters? or 4 LSTM it does say 4

opt = Adam(
    learning_rate=lr , beta_1=0.0, beta_2=beta2, epsilon=eps
)

n_symb = 4

BILSTM = Sequential()
BILSTM.add(Bidirectional(LSTM(hidden_size, activation='tanh', stateful=False, batch_input_shape=(batch_size,tsteps,seg_len), return_sequences=True), input_shape=(tsteps,seg_len)))
BILSTM.add(LayerNormalization(axis=1 , center=True , scale=True))
# BILSTM.add(Bidirectional(LSTM(hidden_size, activation='tanh', stateful=False, batch_input_shape=(batch_size,tsteps,hidden_size), return_sequences=True)))
# BILSTM.add(BatchNormalization(axis=1 , center=True , scale=True))
# BILSTM.add(Bidirectional(LSTM(hidden_size, activation='tanh', stateful=False, batch_input_shape=(batch_size,tsteps,hidden_size), return_sequences=True)))
# BILSTM.add(BatchNormalization(axis=1 , center=True , scale=True))
BILSTM.add(Bidirectional(LSTM(hidden_size, activation='tanh', stateful=False, batch_input_shape=(batch_size,tsteps,hidden_size))))
BILSTM.add(LayerNormalization(axis=1 , center=True , scale=True))
BILSTM.add(Dense(n_symb))
BILSTM.add(Activation('softmax'))
BILSTM.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['categorical_accuracy'])

BILSTM.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional (Bidirectional (None, 30, 64)            9984      
_________________________________________________________________
layer_normalization (LayerNo (None, 30, 64)            60        
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                24832     
_________________________________________________________________
layer_normalization_1 (Layer (None, 64)                128       
_________________________________________________________________
dense (Dense)                (None, 4)                 260       
_________________________________________________________________
activation (Activation)      (None, 4)                 0         
Total params: 35,264
Trainable params: 35,264
Non-trainable params: 0
____________________________________________________

In [3]:
# Number of complete 200000-symbol rounds (2 * n with n = 100000) in the sequence;
# the compression loop iterates over range(len(s)//200000 + 1).
len(s)//200000

23

In [4]:
# Total number of symbols in the integer-encoded sequence `s`.
len(s)

4638690

In [None]:
# Compression: arithmetic-code the symbol sequence `s` into `outputfile`.
#
# Scheme
#   1. The first n + k symbols are coded with an adaptive frequency table.
#   2. The rest is coded with probabilities predicted by BILSTM from the k
#      preceding symbols.  The sequence is processed in rounds of 2 * n
#      targets; each half-round of n targets is first predicted (and coded)
#      and only afterwards used for one epoch of training, so a decoder can
#      replay the same fits from already-decoded symbols.
#   3. Whatever is left after the last complete round is predicted with the
#      final weights and coded without further training.
#
# Relies on `s`, `BILSTM` and the hyper-parameters from the setup cell; seeds
# are set there as well, so that cell must be re-run before this one.
#
# Changes w.r.t. the previous version of this cell:
#   * the input file is no longer opened (its handle was never read; `s` is
#     already in memory),
#   * the duplicated window-building loops are one helper,
#   * 200000 / 100000 literals are derived from `n` / named,
#   * the tail round starts at e_idx, so it stays aligned with the symbols
#     being coded even when no complete round exists (len(s) < 2 * n + k),
#   * the round count uses len(s) - k, so a round is only "complete" when all
#     of its 2 * n targets exist (previously IndexError if
#     len(s) % (2 * n) < k).

tic = timeit.default_timer()


inputfile, outputfile = 'data/ecoli/Ecoli.txt', 'data/ecoli/Ecoli.bi_complex_seed1'
epochs = 1
e_idx = 0                 # index into `s` of the next symbol to encode

PROB_SCALE = 100000       # fixed-point total passed to the coder for model-driven symbols
chunk_len = 2 * n         # targets per round: n to fit first, n to predict-then-fit
n_full_chunks = max(0, (len(s) - k) // chunk_len)


def _context_windows(seq, first, last):
    """Build model inputs and one-hot targets for target positions first..last-1.

    Row i of the returned x holds seq[first+i-k : first+i] reshaped to
    (tsteps, seg_len); row i of y is the one-hot encoding of seq[first+i].
    Returns arrays with zero rows when last <= first.
    """
    count = max(0, last - first)
    feats = np.zeros((count, tsteps, seg_len))
    labels = np.zeros((count, n_symb))
    for row, pos in enumerate(range(first, last)):
        feats[row, :] = np.array(seq[pos - k:pos]).reshape(tsteps, seg_len)
        labels[row] = to_categorical(seq[pos], num_classes=n_symb)
    return feats, labels


with contextlib.closing(arith.BitOutputStream(open(outputfile, "wb"))) as bitout:

    ## For the first n+k characters, we compress with the default method
    initfreqs = fqt.FlatFrequencyTable(257)
    model = fqt.SimpleFrequencyTable(initfreqs)
    enc = arith.ArithmeticCoder(32)
    enc.start_encode(bitout)
    for symbol in s[:n+k]:
        t = model.get_total()
        l = model.get_low(symbol)
        h = model.get_high(symbol)
        enc.storeRegion(l,h,t)
        model.increment(symbol)
        e_idx += 1

    # Switch to the probability-list model; entries 0..3 are overwritten with
    # the network's prediction before every coded symbol.
    prior = [0 for i in range(257)]
    prior[:4] = [0.25,0.25,0.25,0.25]
    prior[256] = 1-sum(prior[:256])
    model = ProbabilityList(prior)   # reset model, now e_idx = n+k

    for overall in range(n_full_chunks + 1):
        predicted_val = []

        if overall < n_full_chunks:
            # ---- complete round: 2 * n targets starting at chunk_len*overall + k
            round_start = chunk_len*overall + k
            x, y = _context_windows(s, round_start, round_start + chunk_len)
            print(overall)

            # The first half of round 0 was already coded by the frequency
            # table above, so it is only used for training.
            if overall != 0:
                predicted_val += list(BILSTM.predict(x[0:n]))

            BILSTM.fit(x[0:n], y[0:n],
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_data=(x[n:2*n], y[n:2*n]))

            predicted_val += list(BILSTM.predict(x[n:2*n]))

            # For checking: single-window forward pass on the last target of
            # this round versus the batched prediction for the same target.
            # NOTE(review): the decompression cell evaluates the network one
            # window at a time like this, while coding here uses predict();
            # the two printed rows can differ in the last float digits.
            # Confirm that cannot change the int() truncation used below.
            x_arr = np.array(s[round_start + chunk_len - k - 1:round_start + chunk_len - 1]).reshape(1,tsteps,seg_len)
            print(BILSTM(x_arr.astype(np.float32), training= False).numpy())
            print(predicted_val[-1])

            BILSTM.fit(x[n:2*n], y[n:2*n],
                  batch_size=batch_size,
                  epochs=epochs)
        else:
            # ---- tail: everything not yet coded, predicted without training.
            # After at least one complete round e_idx == chunk_len*overall + k,
            # which is where this round used to start.
            x, y = _context_windows(s, e_idx, len(s))
            print(len(x))
            print(overall)
            if len(x):
                predicted_val += list(BILSTM.predict(x))

        # Code one symbol per prediction.
        for prob_list in predicted_val:
            model.prob_list[:4] = prob_list
            model.normalize()
            t = int(PROB_SCALE)
            l = int(model.get_low(s[e_idx])*PROB_SCALE)
            h = int(model.get_high(s[e_idx])*PROB_SCALE)
            enc.storeRegion(l,h,t)
            e_idx += 1

        if overall < n_full_chunks: ## checking to confirm: both numbers must agree
            print(e_idx-1)
            print(chunk_len*(overall+1)+k-1)

        # Drop the large arrays before the next round allocates new ones.
        del x, y, predicted_val

    # Kept from the original cell: the printed value is one more than the
    # number of symbols encoded.
    e_idx += 1
    print(e_idx)
    enc.finish_encode(bitout)


toc=timeit.default_timer()
print(toc-tic)

0
Train on 100000 samples, validate on 100000 samples
[[0.16897546 0.28263983 0.33494687 0.21343787]]
[0.16897546 0.28263977 0.33494687 0.2134379 ]
Train on 100000 samples
200179
200179
1
Train on 100000 samples, validate on 100000 samples
[[0.12537988 0.24028209 0.4076923  0.22664571]]
[0.12537985 0.24028212 0.40769237 0.22664568]
Train on 100000 samples
400179
400179
2
Train on 100000 samples, validate on 100000 samples
[[0.10809107 0.24105893 0.36036244 0.2904875 ]]
[0.1080911  0.24105895 0.36036247 0.29048747]
Train on 100000 samples
600179
600179
3
Train on 100000 samples, validate on 100000 samples
[[0.2778893  0.37835196 0.21039386 0.13336486]]
[0.2778893  0.37835193 0.21039389 0.13336487]
Train on 100000 samples
800179
800179
4
Train on 100000 samples, validate on 100000 samples
[[0.3860478  0.29037386 0.1630534  0.1605249 ]]
[0.3860478 0.2903739 0.1630534 0.1605249]
Train on 100000 samples
1000179
1000179
5
Train on 100000 samples, validate on 100000 samples
[[0.32627773 0.268

## Decompression

In [None]:
from __future__ import print_function
from numpy.random import seed
seed(1)

from tensorflow import random
random.set_seed(1)

with open('data/ecoli/Ecoli.txt') as f:
    for line in f:
        ecoli = list(line)



temp_dict = {'a':97,'g': 103,'c': 99,'t': 116}
char_list = [97, 103, 99, 116] # we can read this as we go
legend = dict([(v, k) for k, v in enumerate(char_list)]) # map character to 0,1,2,3,4, etc.
s =  [legend[temp_dict[i]] for i in ecoli]

vocab_size = len(char_list)

n = 100000 # number of samples
tsteps = 30 #time steps
seg_len = 6 #input_dim
k = tsteps*seg_len # full context for each sample
n_symb = 4

# optimizer
sgd_opt = 'adam'
lr = 4e-3
beta1 = 0
beta2 = 0.9999
eps=1e-5

# LSTM Training
hidden_size = 32
batch_size = 250

epochs = 1

n_layer = 4 #only 4 total laters? or 4 LSTM it does say 4

opt = Adam(
    learning_rate=lr , beta_1=0.0, beta_2=beta2, epsilon=eps
)

n_symb = 4

BILSTM = Sequential()
BILSTM.add(Bidirectional(LSTM(hidden_size, activation='tanh', stateful=False, batch_input_shape=(batch_size,tsteps,seg_len), return_sequences=True), input_shape=(tsteps,seg_len)))
BILSTM.add(LayerNormalization(axis=1 , center=True , scale=True))
# BILSTM.add(Bidirectional(LSTM(hidden_size, activation='tanh', stateful=False, batch_input_shape=(batch_size,tsteps,hidden_size), return_sequences=True)))
# BILSTM.add(BatchNormalization(axis=1 , center=True , scale=True))
# BILSTM.add(Bidirectional(LSTM(hidden_size, activation='tanh', stateful=False, batch_input_shape=(batch_size,tsteps,hidden_size), return_sequences=True)))
# BILSTM.add(BatchNormalization(axis=1 , center=True , scale=True))
BILSTM.add(Bidirectional(LSTM(hidden_size, activation='tanh', stateful=False, batch_input_shape=(batch_size,tsteps,hidden_size))))
BILSTM.add(LayerNormalization(axis=1 , center=True , scale=True))
BILSTM.add(Dense(n_symb))
BILSTM.add(Activation('softmax'))
BILSTM.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['categorical_accuracy'])

BILSTM.summary()

In [None]:
# Decompression: rebuild the symbol sequence from `inputfile`.
#
# The decoder replays the encoder's schedule:
#   1. the first n + k symbols are decoded with an adaptive frequency table;
#   2. then, repeatedly, BILSTM is trained for one epoch on the n most
#      recently decoded targets and the next segment is decoded one symbol at
#      a time from the network's prediction;
#   3. once the encoder's last complete 2 * n round has been replayed, the
#      whole remainder is decoded with the final weights and no more training.
#
# The total length is taken from len(s); a real decoder would have to receive
# it alongside the compressed stream.
#
# Fixes w.r.t. the previous version of this cell:
#   * every non-final pass decoded a fixed 100000 symbols, so the pass that
#     reached the end of the data kept decoding past it (for len(s) = 4638690
#     that is 61490 garbage symbols appended to the output);
#   * the final segment length used a stride of 200000 per pass although the
#     decoder advances 100000 per pass, which made it negative;
#   * training is now stopped exactly where the encoder stops training, so a
#     remainder longer than n symbols is decoded in one go like it was coded;
#   * the warm-up phase is clamped to the sequence length;
#   * the end-of-pass check prints the position the pass was meant to reach
#     instead of the encoder's formula.

tic=timeit.default_timer()


inputfile, outputfile = 'data/ecoli/Ecoli.bi_complex_seed1', 'data/ecoli/Ecoli_complex_decompressed.txt'
epochs = 1
e_idx = 0                 # number of symbols decoded so far

PROB_SCALE = 100000       # fixed-point total used for model-driven symbols
total_len = len(s)        # assumed known to the decoder
warmup_len = min(n + k, total_len)
# The encoder trains on targets k .. train_limit-1 (its complete 2*n rounds)
# and on nothing after that.
n_full_chunks = max(0, (total_len - k) // (2 * n))
train_limit = 2 * n * n_full_chunks + k

# Perform file decompression
with open(inputfile, "rb") as inp, open(outputfile, "wb") as out:
    bitin = arith.BitInputStream(inp)


    ## For the first n+k characters, we decompress with the default method
    initfreqs = fqt.FlatFrequencyTable(257)
    model = fqt.SimpleFrequencyTable(initfreqs)
    dec = arith.ArithmeticCoder(32)
    dec.start_decode(bitin)
    new_s = []
    while e_idx < warmup_len:
        total = model.get_total()
        # Range/offset are not used below; the calls are kept in place because
        # whether dec.getTarget() has side effects is not visible from here.
        Range = dec.R
        offset = dec.getTarget()
        value = dec.getTarget(total)
        # Binary search for the symbol whose cumulative range contains value.
        start = 0
        end = model.get_symbol_limit()
        while end - start > 1:
            middle = (start + end) >> 1
            if model.get_low(middle) > value:
                end = middle
            else:
                start = middle
        symbol = start
        l = model.get_low(symbol)
        h = model.get_high(symbol)
        dec.loadRegion(l,h,total)

        model.increment(symbol)
        out.write(bytes((char_list[symbol],)))
        new_s.append(symbol)
        e_idx += 1

    # Switch to the probability-list model; entries 0..3 are overwritten with
    # the network's prediction before every decoded symbol.
    prior = [0 for i in range(257)]
    prior[:4] = [0.25,0.25,0.25,0.25]
    prior[256] = 1-sum(prior[:256])
    model = ProbabilityList(prior)   # reset model, now e_idx = n+k

    overall = 0
    while e_idx < total_len:
        print(overall)

        if e_idx <= train_limit:
            # Train on the n targets decoded most recently, i.e. the same
            # half-round the encoder fitted before coding what comes next.
            x = np.zeros((n, tsteps, seg_len))
            y = np.zeros((n, n_symb))
            for idx3, idx2 in enumerate(range(e_idx - n, e_idx)):
                train_seq = new_s[idx2-k:idx2]
                train_target = new_s[idx2]
                x[idx3,:] = np.array(train_seq).reshape(tsteps,seg_len)
                y[idx3] = to_categorical(train_target, num_classes=n_symb )
            BILSTM.fit(x, y,
              batch_size=batch_size,
              epochs=epochs)

        if e_idx < train_limit:
            segment_len = n                     # another trained half-round follows
        else:
            segment_len = total_len - e_idx     # untrained tail: decode all that is left
        expected_end = e_idx + segment_len

        # Diagnostics: decoded prefix vs. original, and the network output for
        # the last already-decoded target.
        print(new_s == s[:len(new_s)])
        temp_x = new_s[-1*k-1:-1]
        x_arr = np.array(temp_x).reshape(1,tsteps,seg_len)
        print(BILSTM(x_arr.astype(np.float32), training= False).numpy())

        temp_x = new_s[-1*k:]   # sliding context of the last k decoded symbols
        for i in range(segment_len):
            x_arr = np.array(temp_x).reshape(1,tsteps,seg_len)
            # NOTE(review): the encoder obtains these probabilities with
            # batched BILSTM.predict(); its printed check shows the two paths
            # can differ in the last float digits.  Confirm that cannot change
            # the int() truncation below.
            prob_list_temp = BILSTM(x_arr.astype(np.float32), training= False).numpy()
            model.prob_list[:4] = prob_list_temp[0]

            model.normalize()

            total = int(PROB_SCALE)
            # Kept in place; see the note in the warm-up loop.
            Range = dec.R
            offset = dec.getTarget()
            value = dec.getTarget(total)
            start = 0
            end = model.get_symbol_limit()
            while end - start > 1:
                middle = (start + end) >> 1
                if int(model.get_low(middle)*PROB_SCALE) > value:
                    end = middle
                else:
                    start = middle

            symbol = start
            assert symbol != 256
            out.write(bytes((char_list[symbol],)))

            l = int(model.get_low(symbol)*PROB_SCALE)
            h = int(model.get_high(symbol)*PROB_SCALE)
            dec.loadRegion(l,h,total)

            temp_x = temp_x[1:] + [symbol]
            new_s.append(symbol)
            if e_idx%20000 == 0:
                print(e_idx)
            e_idx += 1


        print(BILSTM(x_arr.astype(np.float32), training= False).numpy())
        # checking to confirm: both numbers must agree
        print(e_idx-1)
        print(expected_end-1)

        overall += 1


    # Kept from the original cell: the printed value is one more than the
    # number of symbols decoded.
    e_idx += 1
    print(e_idx)


toc=timeit.default_timer()
print(toc-tic)

In [None]:
# Save the decoded symbol list `new_s` to complex.npy.
np.save('complex.npy', new_s) 

In [19]:
# Spot-check: print the same 30-symbol window from the decoded sequence and
# from the original so the two rows can be compared by eye.
check_window = slice(100030, 100060)
for sequence in (new_s, s):
    print(sequence[check_window])

[1, 2, 1, 1, 1, 2, 3, 3, 0, 0, 2, 2, 0, 0, 3, 0, 0, 0, 3, 1, 1, 2, 3, 1, 1, 2, 1, 0, 0, 1]
[1, 2, 1, 1, 1, 2, 3, 3, 0, 0, 2, 2, 0, 0, 3, 0, 0, 0, 3, 1, 1, 2, 3, 1, 1, 2, 1, 0, 0, 1]


In [20]:
# Spot-check: print the same 30-symbol window from the decoded sequence and
# from the original so the two rows can be compared by eye.
check_window = slice(200030, 200060)
for sequence in (new_s, s):
    print(sequence[check_window])

[3, 2, 1, 1, 0, 3, 1, 0, 3, 1, 2, 3, 1, 3, 0, 1, 1, 2, 1, 1, 3, 0, 0, 2, 1, 2, 2, 0, 3, 1]
[3, 2, 1, 1, 0, 3, 1, 0, 3, 1, 2, 3, 1, 3, 0, 1, 1, 2, 1, 1, 3, 0, 0, 2, 1, 2, 2, 0, 3, 1]
