1. Read in the split sequences (train, validate, and test).
2. Build the alphabet from the training sequences and add a padding character (' '), a stop character ('.'), and a start character ('$').
3. Save the one-hot encoded n x L x c arrays as HDF5 files (written with h5py). X is the mature sequence; y is the signal peptide.
4. Check that the saved sequences decode correctly.
5. Save the token-index n x L arrays as HDF5 files (written with h5py).
6. Check that the saved sequences decode correctly.
7. Save the character tables.

In [1]:
import pickle
import h5py 
import itertools

import numpy as np

from tools import CharacterTable

In [2]:
# Load the three pre-split datasets from their pickle files.
# NOTE: pickle can execute arbitrary code on load -- only use trusted files.
def _unpickle(path):
    """Return the object stored in the pickle file at *path*."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


train = _unpickle('../data/train.pkl')
validate = _unpickle('../data/validate.pkl')
test = _unpickle('../data/test.pkl')

# First 1000 training examples, kept as a small subset.
train_small = train[:1000]

In [3]:
# Collect every distinct character found in element 1 of the training pairs,
# sort them, and put the three special characters (padding ' ', stop '.',
# start '$') in front.
# NOTE(review): only element 1 of each pair is scanned; a character that
# occurs solely in element 0 would be absent from the alphabet -- confirm
# this cannot happen for this dataset.
observed = {char for pair in train for char in pair[1]}
alphabet = ' .$' + ''.join(sorted(observed))

In [4]:
# Fixed lengths that encoded sequences are padded to, plus the alphabet size.
max_len_in = 107  # max length of prot seq (105 aa) + 2 for the start/stop tokens
max_len_out = 72  # NOTE(review): presumably max signal-peptide length + 2 tokens -- confirm
n_chars = len(alphabet)  # size of the alphabet, special characters included

In [5]:
# Round-trip a short string through a CharacterTable built with its default
# settings, asking for 7 rows. The printed output shows a one-hot matrix.
ctable = CharacterTable(alphabet)
encoded = ctable.encode('$ABZ.', 7, reverse=False)
decoded = ctable.decode(encoded, reverse=False)
print(encoded)
# The trailing '|' makes any trailing padding in the decoded string visible.
print(decoded + '|')

[[ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.]]
$ABZ.  |


In [6]:
def encode(seqs, max_len, ctable):
    """Encode a batch of sequences into one zero-initialised NumPy array.

    Each sequence is wrapped as ``'$' + seq + '.'``, right-padded with spaces
    up to ``max_len`` characters, and passed to ``ctable.encode``.

    Args:
        seqs: Sized collection of strings to encode.
        max_len: Length each wrapped sequence is padded to.
        ctable: Object exposing a ``one_hot`` flag and an
            ``encode(string, max_len)`` method.

    Returns:
        Array of shape ``(len(seqs), max_len, n_chars)`` when
        ``ctable.one_hot`` is truthy, otherwise ``(len(seqs), max_len)``.
        ``n_chars`` is read from the enclosing module scope.
    """
    shape = (len(seqs), max_len, n_chars) if ctable.one_hot else (len(seqs), max_len)
    X = np.zeros(shape)
    for row, raw in enumerate(seqs):
        framed = '$' + raw + '.'
        # ljust pads with spaces and leaves over-long strings untouched.
        X[row] = ctable.encode(framed.ljust(max_len), max_len)
    return X

In [7]:
def to_h5py(seqs, fname, ctable):
    """Encode sequence pairs and write them to ``'../data/' + fname + '.hdf5'``.

    The file receives two datasets: ``X``, encoded from element 1 of every
    pair with length ``max_len_in``, and ``y``, encoded from element 0 with
    length ``max_len_out``. When ``ctable.one_hot`` is truthy both datasets
    get a trailing axis of size ``n_chars``. ``max_len_in``, ``max_len_out``
    and ``n_chars`` are read from the enclosing module scope. An existing
    file of the same name is overwritten (mode ``'w'``).

    Args:
        seqs: Sliceable, sized collection of pairs of strings.
        fname: Base name of the output file, without directory or extension.
        ctable: Character table passed through to ``encode``; its ``one_hot``
            flag selects the dataset shapes.
    """
    chunksize = 500  # rows encoded and written per batch
    n_seqs = len(seqs)
    with h5py.File('../data/' + fname + '.hdf5', 'w') as f:
        # One-hot tables add a per-character axis; token tables do not.
        tail = (n_chars,) if ctable.one_hot else ()
        # No dtype is given, so h5py's default dataset dtype applies.
        X = f.create_dataset('X', (n_seqs, max_len_in) + tail)
        y = f.create_dataset('y', (n_seqs, max_len_out) + tail)
        for start in range(0, n_seqs, chunksize):
            # Clamp the stop index so the final, possibly short, batch is
            # written here; no separate remainder pass is needed.
            stop = min(start + chunksize, n_seqs)
            batch = seqs[start:stop]
            X[start:stop] = encode([pair[1] for pair in batch], max_len_in, ctable)
            y[start:stop] = encode([pair[0] for pair in batch], max_len_out, ctable)

In [8]:
# Write every split with the current (one-hot) character table, then pickle
# that table next to the other outputs.
for split, name in (
    (train, 'train'),
    (test, 'test'),
    (validate, 'validate'),
    (train_small, 'train_small'),
):
    to_h5py(split, name, ctable)

with open('../outputs/ctable_onehot.pkl', 'wb') as handle:
    pickle.dump(ctable, handle)

In [9]:
# Spot-check: read the first ten rows of each dataset back from disk and
# decode one target row with the character table.
with h5py.File('../data/train.hdf5', 'r') as f:
    X, y = (np.array(f[key][:10]) for key in ('X', 'y'))

ctable.decode(y[3])

'$MWGPLIYALLGLAIVAAAFLFVRRSQA.                                           '

In [10]:
# Element 0 of the same training example, for comparison with the decoded row.
train[3][0]

'MWGPLIYALLGLAIVAAAFLFVRRSQA'

In [11]:
# Rebind ctable to a table built with one_hot=False and repeat the round trip.
# The printed output shows five integer indices and no padding out to 7.
ctable = CharacterTable(alphabet, one_hot=False)
encoded = ctable.encode('$ABZ.', 7, reverse=False)
decoded = ctable.decode(encoded, reverse=False)
print(encoded)
# The trailing '|' makes any trailing padding in the decoded string visible.
print(decoded + '|')

[ 1  3  4 26  2]
$ABZ.|


In [12]:
# Write every split again with the current (token-index) character table
# under the '*_tokens' names, then pickle that table.
for split, name in (
    (train, 'train_tokens'),
    (test, 'test_tokens'),
    (validate, 'validate_tokens'),
    (train_small, 'train_small_tokens'),
):
    to_h5py(split, name, ctable)

with open('../outputs/ctable_token.pkl', 'wb') as handle:
    pickle.dump(ctable, handle)

In [13]:
# Spot-check the token-encoded file: read the first ten rows of each dataset
# back from disk and decode one target row with the character table.
with h5py.File('../data/train_tokens.hdf5', 'r') as f:
    X, y = (np.array(f[key][:10]) for key in ('X', 'y'))

ctable.decode(y[3])

'$MWGPLIYALLGLAIVAAAFLFVRRSQA.                                           '

In [14]:
# Element 0 of the same training example, for comparison with the decoded row.
train[3][0]

'MWGPLIYALLGLAIVAAAFLFVRRSQA'