1. Read in the split sequences. 
2. Get the alphabets and add in a padding character (' '), a stop character ('.'), and a start character ('$'). 
3. Save n x L x c arrays as h5py files. X is the mature sequence. y is the signal peptide. 
4. Check that saved sequences decode correctly. 
5. Save n x L arrays as h5py files. 
6. Check that saved sequences decode correctly.
7. Save the character tables

**For dataset that removes sequences at least 99% similar to the protein sequences in Zach's excel "initial_enzymes_1." Rerun on 6-14-18 for just training and validation sets.**

In [1]:
import pickle
import h5py 
import itertools

import numpy as np

from tools import CharacterTable

In [2]:
# read in data from pickle files
with open('../data/filtered_datasets/train_augmented_99.pkl', 'rb') as f:
    train_99 = pickle.load(f)  
with open('../data/filtered_datasets/validate_99.pkl', 'rb') as f:
    validate_99 = pickle.load(f) 
    
train_small_99 = train_99[:1000]

In [3]:
alphabet = ''.join(sorted(set(itertools.chain.from_iterable([t[1] for t in train_99]))))
alphabet = ' .$' + alphabet

In [4]:
alphabet

' .$ACDEFGHIKLMNPQRSTUVWXYZ'

In [5]:
max_len_in = 107 # max length of prot seq (105 aa) + 2 for tokens
max_len_out = 72
n_chars = len(alphabet)

In [6]:
ctable = CharacterTable(alphabet)
encoded = ctable.encode('$ACZ.', 7, reverse=False)
decoded = ctable.decode(encoded, reverse=False)
print(encoded)
print(decoded + '|')

[[ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.]]
$ACZ.  |


In [7]:
def encode(seqs, max_len, ctable):
    if ctable.one_hot:
        X = np.zeros((len(seqs), max_len, n_chars))
    else:
        X = np.zeros((len(seqs), max_len))
    seqs = ['$' + seq + '.' for seq in seqs]
    seqs = [seq + ' ' * ((max_len) - len(seq))for seq in seqs]
    for i, seq in enumerate(seqs):
        X[i] = ctable.encode(seq, max_len)
    return X

In [8]:
def to_h5py(seqs, fname, ctable):
    chunksize = 500
    with h5py.File('../../6-14-18_filtered_data/' + fname + '.hdf5', 'w') as f:
        if ctable.one_hot:
            X = f.create_dataset('X', (len(seqs), max_len_in, n_chars))
            y = f.create_dataset('y', (len(seqs), max_len_out, n_chars))
        else:
            X = f.create_dataset('X', (len(seqs), max_len_in))
            y = f.create_dataset('y', (len(seqs), max_len_out))            
        for i in range(0, len(seqs), chunksize):
            X[i:i + chunksize, :] = encode([seq[1] for seq in seqs[i:i+chunksize]], max_len_in, ctable)
            y[i:i + chunksize, :] = encode([seq[0] for seq in seqs[i:i+chunksize]], max_len_out, ctable)
        left = len(seqs) % chunksize
        if left > 0:
            X[-left:, :] = encode([seq[1] for seq in seqs[-left:]], max_len_in, ctable)
            y[-left:, :] = encode([seq[0] for seq in seqs[-left:]], max_len_out, ctable)   

In [9]:
to_h5py(train_99, 'train_augmented_99', ctable)
to_h5py(validate_99, 'validate_99', ctable)
to_h5py(train_small_99, 'train_small_augmented_99', ctable)

with open('../../6-14-18_filtered_data/outputs/ctable_onehot_99.pkl', 'wb') as f:
    pickle.dump(ctable, f)

In [10]:
ctable = CharacterTable(alphabet, one_hot=False)
encoded = ctable.encode('$ACZ.', 7, reverse=False)
decoded = ctable.decode(encoded, reverse=False)
print(encoded)
print(decoded + '|')

[ 1  3  4 25  2]
$ACZ.|


In [11]:
to_h5py(train_99, 'train_tokens_augmented_99', ctable)
to_h5py(validate_99, 'validate_tokens_99', ctable)
to_h5py(train_small_99, 'train_small_tokens_augmented_99', ctable)

with open('../../6-14-18_filtered_data/outputs/ctable_token_99.pkl', 'wb') as f:
    pickle.dump(ctable, f)

In [12]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

with h5py.File('../data/validate.hdf5', 'r') as f:
    src = Variable(torch.Tensor(f['X'][:100]))
    tgt = f['y'][:100].astype(int)
    tgt = Variable(torch.LongTensor(tgt))
src = src.transpose(2, 1)
src = src.cuda()
tgt = tgt.cuda()
tgt.size()

#validate src: torch.Size([100, 26, 107]), tgt: torch.Size([100, 72, 26])

OSError: Unable to open file (Unable to open file: name = '../data/validate.hdf5', errno = 2, error message = 'no such file or directory', flags = 0, o_flags = 0)