1. Read in the split sequences. 
2. Get the alphabets and add in a padding character (' '), a stop character ('.'), and a start character ('$'). 
3. Save n x L x c one-hot arrays (n sequences, L positions, c alphabet characters) as HDF5 files using h5py. X is the mature sequence. y is the signal peptide. 
4. Check that saved sequences decode correctly. 
5. Save n x L token-index arrays as HDF5 files using h5py. 
6. Check that saved sequences decode correctly.
7. Save the character tables.

**Creates a test batch of protein sequences from Zach's Excel file "initial_enzymes_1" so we can see the attention model's predictions on Zach's protein sequences.**

In [1]:
import pickle
import h5py 
import itertools

import numpy as np

from tools import CharacterTable

  from ._conv import register_converters as _register_converters


In [2]:
# Load the pickled sequence collections from disk.
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
with open('../data/filtered_datasets/train_99.pkl', 'rb') as train_file:
    train = pickle.load(train_file)

with open('../data/gen_test.pkl', 'rb') as test_file:
    test = pickle.load(test_file)

# Keep the first 1000 training entries as a smaller subset.
train_small = train[:1000]

In [3]:
# Gather every distinct character found in element 1 of each training entry,
# then put the padding (' '), stop ('.') and start ('$') symbols in front of
# the sorted characters.
observed_chars = {ch for entry in train for ch in entry[1]}
alphabet = ' .$' + ''.join(sorted(observed_chars))

In [4]:
# Display the assembled alphabet string.
alphabet

' .$ACDEFGHIKLMNPQRSTUVWXYZ'

In [5]:
# Fixed widths for the encoded arrays.
max_len_out = 72
max_len_in = 105 + 2  # longest protein sequence (105 aa) plus the start and stop tokens

# Number of symbols in the alphabet, including the three special characters.
n_chars = len(alphabet)
n_chars

26

In [6]:
# Build the character table with its default settings and round-trip a short
# example through encode/decode. The '|' marks where the decoded string ends,
# so any trailing padding is visible.
ctable = CharacterTable(alphabet)
encoded = ctable.encode('$ACZ.', 7, reverse=False)
decoded = ctable.decode(encoded, reverse=False)
print(encoded, decoded + '|', sep='\n')

[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 1.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]]
$ACZ.  |


In [7]:
def encode(seqs, max_len, ctable):
    """Encode a batch of sequences into one fixed-width numpy array.

    Each sequence is wrapped as ``'$' + seq + '.'``, right-padded with ``' '``
    to exactly ``max_len`` characters, and passed to ``ctable.encode``.

    Parameters
    ----------
    seqs : sequence of str
        Raw sequences without start/stop/padding characters.
    max_len : int
        Width of every encoded row, counting the '$' and '.' tokens.
    ctable : object
        Must expose a ``one_hot`` attribute and an ``encode(seq, max_len)``
        method (the notebook passes a ``CharacterTable``).

    Returns
    -------
    numpy.ndarray
        Shape ``(len(seqs), max_len, n_chars)`` when ``ctable.one_hot`` is
        true, otherwise ``(len(seqs), max_len)``. ``n_chars`` is the
        notebook-level alphabet size.

    Raises
    ------
    ValueError
        If any sequence, once the two tokens are added, is longer than
        ``max_len``. Without this check such a sequence would get no padding
        and fail later with an unrelated shape/index error.
    """
    wrapped = ['$' + seq + '.' for seq in seqs]
    for idx, seq in enumerate(wrapped):
        if len(seq) > max_len:
            raise ValueError(
                'sequence %d has length %d including start/stop tokens, '
                'which exceeds max_len=%d' % (idx, len(seq), max_len))

    if ctable.one_hot:
        X = np.zeros((len(wrapped), max_len, n_chars))
    else:
        X = np.zeros((len(wrapped), max_len))

    for idx, seq in enumerate(wrapped):
        # ljust pads with spaces (the padding character) up to max_len.
        X[idx] = ctable.encode(seq.ljust(max_len), max_len)
    return X

In [8]:
def to_h5py(seqs, fname, ctable, chunksize=500):
    """Encode ``seqs`` and write them to ``../data/<fname>.hdf5`` as dataset 'X'.

    The sequences are encoded with ``encode(..., max_len_in, ctable)`` and
    written ``chunksize`` rows at a time, so the full encoded array is never
    held in memory at once.

    Parameters
    ----------
    seqs : sequence of str
        Raw sequences to encode; must support ``len`` and slicing.
    fname : str
        File name stem; the file is created (overwritten) at
        ``'../data/' + fname + '.hdf5'``.
    ctable : object
        Character table passed through to ``encode``. Its ``one_hot``
        attribute selects a 3-D ``(n, max_len_in, n_chars)`` dataset or a
        2-D ``(n, max_len_in)`` dataset.
    chunksize : int, optional
        Number of sequences encoded and written per step (default 500).
    """
    n_seqs = len(seqs)
    with h5py.File('../data/' + fname + '.hdf5', 'w') as f:
        if ctable.one_hot:
            print('true')
            X = f.create_dataset('X', (n_seqs, max_len_in, n_chars))
        else:
            X = f.create_dataset('X', (n_seqs, max_len_in))
        for start in range(0, n_seqs, chunksize):
            # Clamp the end so the final, possibly shorter, chunk is written
            # by this same loop; no separate remainder pass is needed.
            stop = min(start + chunksize, n_seqs)
            X[start:stop] = encode(seqs[start:stop], max_len_in, ctable)

In [9]:
# Write the test sequences, encoded with the current character table, to
# ../data/gen_test_z.hdf5.
to_h5py(seqs=test, fname='gen_test_z', ctable=ctable)

# Keep a pickled copy of the character table used for this file.
with open('../data/ctable_copies/ctable_onehot.pkl', 'wb') as ctable_file:
    pickle.dump(ctable, ctable_file)

true


In [10]:
# Rebuild the character table with one_hot=False and round-trip the same short
# example. The '|' marks where the decoded string ends.
ctable = CharacterTable(alphabet, one_hot=False)
encoded = ctable.encode('$ACZ.', 7, reverse=False)
decoded = ctable.decode(encoded, reverse=False)
print(encoded, decoded + '|', sep='\n')

[ 1  3  4 25  2]
$ACZ.|


In [11]:
# Write the test sequences, encoded with the current (one_hot=False) character
# table, to ../data/gen_test_tokens_2.hdf5.
to_h5py(seqs=test, fname='gen_test_tokens_2', ctable=ctable)

# Keep a pickled copy of the character table used for this file.
with open('../data/ctable_copies/ctable_token.pkl', 'wb') as ctable_file:
    pickle.dump(ctable, ctable_file)

In [12]:
# Read back the first rows of the file produced by
# to_h5py(test, 'gen_test_tokens_2', ctable), so the check exercises the data
# that was just written rather than a file from an earlier run.
with h5py.File('../data/gen_test_tokens_2.hdf5', 'r') as f:
    X = np.array(f['X'][:10])

# Decode row 3; it should be test[3] wrapped in '$' ... '.' plus padding.
ctable.decode(X[3])

'$ATSRANDAPIVLLHGFTGWGREEMFGFKYWGGVRGDIEQWLNDNGYRTYTLAVGPLSSNWDRACEAYAQLVGGTVDYGAAHAAKHGHARFGRTYPGLLPE.     '

In [13]:
# Display the raw test sequence at index 3, the same index decoded from X.
test[3]

'ATSRANDAPIVLLHGFTGWGREEMFGFKYWGGVRGDIEQWLNDNGYRTYTLAVGPLSSNWDRACEAYAQLVGGTVDYGAAHAAKHGHARFGRTYPGLLPE'

In [14]:
# Decode row 3 of X again with the current character table.
ctable.decode(X[3])

'$ATSRANDAPIVLLHGFTGWGREEMFGFKYWGGVRGDIEQWLNDNGYRTYTLAVGPLSSNWDRACEAYAQLVGGTVDYGAAHAAKHGHARFGRTYPGLLPE.     '

In [15]:
# Encode the whole test set in memory at the same width used for the saved
# files (max_len_in) instead of repeating the literal 107.
encode(test, max_len_in, ctable)

array([[ 1.,  3.,  6., ...,  0.,  0.,  0.],
       [ 1., 19., 24., ...,  0.,  0.,  0.],
       [ 1.,  3.,  3., ...,  0.,  0.,  0.],
       ...,
       [ 1.,  3.,  8., ...,  0.,  0.,  0.],
       [ 1.,  3.,  3., ...,  0.,  0.,  0.],
       [ 1.,  6., 19., ...,  0.,  0.,  0.]])

In [16]:
# Display the raw test sequence at index 1.
test[1]

'TYTEIVTGSTPDDRFDNLAGYPSAPHYVDVTAGDTGPLRMHYVDEGPRDGTPVVLLHGEPTWSYLYRTMIPPLAAGGCRVLAPDLIGFGRSDKPSRIEDY'