1. Read in the split sequences. 
2. Get the alphabets and add in a padding character (' '), a stop character ('.'), and a start character ('$'). 
3. Save one-hot encoded n x L x c arrays (sequences x padded length x alphabet size) as HDF5 files using h5py. X is the mature sequence. y is the signal peptide. 
4. Check that saved sequences decode correctly. 
5. Save integer-token n x L arrays (sequences x padded length) as HDF5 files using h5py. 
6. Check that saved sequences decode correctly.
7. Save the character tables

**For the dataset that removes sequences at least 95% similar to the protein sequences in Zach's Excel file "initial_enzymes_1". Rerun on 6-14-18 for just the training and validation sets.**

In [1]:
import pickle
import h5py 
import itertools

import numpy as np

from tools import CharacterTable

  from ._conv import register_converters as _register_converters


In [2]:
# Load the filtered training and validation splits from their pickle files.
# NOTE: pickle.load executes arbitrary code from the file; only use on trusted data.
with open('../data/filtered_datasets/train_95.pkl', 'rb') as train_file:
    train_95 = pickle.load(train_file)
with open('../data/filtered_datasets/validate_95.pkl', 'rb') as validate_file:
    validate_95 = pickle.load(validate_file)

# Keep the first 1000 training records as a small subset.
train_small_95 = train_95[:1000]

In [3]:
# Gather every distinct character found in element 1 of the training records.
observed_chars = {char for record in train_95 for char in record[1]}
# Special characters (' ', '.', '$') go first, then the sorted observed
# characters, then an extra 'Z' appended at the end.
alphabet = ' .$' + ''.join(sorted(observed_chars)) + 'Z'

In [4]:
alphabet

' .$ACDEFGHIKLMNPQRSTUVWXYZ'

In [5]:
# max_len_in: max length of prot seq (105 aa) + 2 for tokens.
# max_len_out: padded length used for the y arrays written by to_h5py.
max_len_in, max_len_out = 107, 72
# Size of the character axis for one-hot arrays.
n_chars = len(alphabet)

In [6]:
# Round-trip a short sequence through the default (one-hot) character table
# and print both forms; the trailing '|' makes any padding in the decoded
# string visible.
ctable = CharacterTable(alphabet)
encoded = ctable.encode('$ACE.', 7, reverse=False)
decoded = ctable.decode(encoded, reverse=False)
print(encoded, decoded + '|', sep='\n')

[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]]
$ACE.  |


In [7]:
def encode(seqs, max_len, ctable):
    """Encode a batch of sequences into a single zero-initialised array.

    Each sequence is wrapped with the start ('$') and stop ('.') characters,
    right-padded with spaces up to ``max_len``, and passed to
    ``ctable.encode``. The result has shape ``(len(seqs), max_len, n_chars)``
    when ``ctable.one_hot`` is true and ``(len(seqs), max_len)`` otherwise.

    Args:
        seqs: Sequence of strings to encode.
        max_len: Padded length of every encoded row.
        ctable: Character table providing ``one_hot`` and ``encode``.

    Returns:
        A float NumPy array holding one encoded sequence per row.
    """
    n_seqs = len(seqs)
    shape = (n_seqs, max_len, n_chars) if ctable.one_hot else (n_seqs, max_len)
    batch = np.zeros(shape)
    for row, raw in enumerate(seqs):
        # ljust pads with spaces and leaves over-long strings untouched.
        framed = ('$' + raw + '.').ljust(max_len)
        batch[row] = ctable.encode(framed, max_len)
    return batch

In [8]:
def to_h5py(seqs, fname, ctable):
    """Encode sequence records and write them to an HDF5 file chunk by chunk.

    Creates ``../data/filtered_datasets/<fname>.hdf5`` (overwriting any
    existing file) with two datasets:

    * ``X`` -- element 1 of every record, encoded and padded to ``max_len_in``
    * ``y`` -- element 0 of every record, encoded and padded to ``max_len_out``

    When ``ctable.one_hot`` is true both datasets get a trailing axis of
    size ``n_chars``; otherwise they are 2-D.

    Args:
        seqs: Sliceable sequence of records indexable as ``record[0]`` and
            ``record[1]``.
        fname: Output file name without directory or extension.
        ctable: Character table handed to ``encode``; its ``one_hot``
            attribute selects the dataset layout.
    """
    chunksize = 500  # number of records encoded and written per iteration
    n_seqs = len(seqs)
    with h5py.File('../data/filtered_datasets/' + fname + '.hdf5', 'w') as f:
        if ctable.one_hot:
            X = f.create_dataset('X', (n_seqs, max_len_in, n_chars))
            y = f.create_dataset('y', (n_seqs, max_len_out, n_chars))
        else:
            X = f.create_dataset('X', (n_seqs, max_len_in))
            y = f.create_dataset('y', (n_seqs, max_len_out))
        # range() yields the start of every chunk, including the final
        # partial one, so no separate pass over the leftover rows is needed
        # (the previous tail pass re-encoded and re-wrote those same rows).
        for start in range(0, n_seqs, chunksize):
            stop = min(start + chunksize, n_seqs)
            chunk = seqs[start:stop]
            X[start:stop, :] = encode([seq[1] for seq in chunk], max_len_in, ctable)
            y[start:stop, :] = encode([seq[0] for seq in chunk], max_len_out, ctable)

In [9]:
# Write each split to its own HDF5 file using the current character table.
for dataset, out_name in (
    (train_95, 'train_95'),
    (validate_95, 'validate_95'),
    (train_small_95, 'train_small_95'),
):
    to_h5py(dataset, out_name, ctable)

# Persist the character table used for these files.
with open('../data/filtered_datasets/outputs/ctable_onehot_95.pkl', 'wb') as ctable_file:
    pickle.dump(ctable, ctable_file)

In [10]:
# Switch to a character table built with one_hot=False and round-trip the
# same short sequence; the trailing '|' makes any padding in the decoded
# string visible.
ctable = CharacterTable(alphabet, one_hot=False)
encoded = ctable.encode('$ACE.', 7, reverse=False)
decoded = ctable.decode(encoded, reverse=False)
print(encoded, decoded + '|', sep='\n')

[1 3 4 6 2]
$ACE.|


In [None]:
# Write each split to its own HDF5 file using the current character table.
for dataset, out_name in (
    (train_95, 'train_tokens_95'),
    (validate_95, 'validate_tokens_95'),
    (train_small_95, 'train_small_tokens_95'),
):
    to_h5py(dataset, out_name, ctable)

# Persist the character table used for these files.
with open('../data/filtered_datasets/outputs/ctable_token_95.pkl', 'wb') as ctable_file:
    pickle.dump(ctable, ctable_file)