# Convert Numpy Arrays to TF Records
Save the CullPDB protein dataset as TFRecords to make importing it into seq2seq models easier.
This notebook is a sort of visual documentation of the file *make_tfrecords.py*. I find it
easiest to understand what a data pipeline is doing when I can see what the data
looks like as it moves through that pipeline.

In [35]:
import numpy as np
import tensorflow as tf

# Helper for building the tf.train.Feature protos used in the records
def _floats_feature(value):
    """Wrap `value` in a float-list `tf.train.Feature`.

    `value` is handed unchanged to `tf.train.FloatList(value=...)`, so it
    should be a flat sequence of floats (a 1-D NumPy slice works).
    """
    float_list = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=float_list)

In [36]:
# The dataset lives under HOME/data/cpdb on my machine
from pathlib import Path
HOME = str(Path.home())

# NOTE(review): np.load does not appear to gunzip a file by itself, so this
# file is presumably stored uncompressed despite its .gz suffix -- confirm.
# The loaded array is viewed as (num_samples x max_seq_length x num_features)
dataset_shape = (6133, 700, 57)
data = np.reshape(np.load(HOME+"/data/cpdb/cpdb_6133.npy.gz"), dataset_shape)

# Show the second residue of the first sample; features and label
data[0, 1]

array([ 0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  1.        ,  1.        ,
        0.06008665,  0.03106848,  0.02275394,  0.25161827,  0.03522972,
        0.00966052,  0.01477403,  0.99159086,  0.5       ,  0.35663486,
        0.14185107,  0.0765622 ,  0.01870651,  0.32300413,  0.03260945,
        0.17653529,  0.99043465,  0.61301416,  0.00816257,  0.26894143,
        0.02436359,  0.        ])

In [37]:
# We're only interested in two groups of per-residue columns: the amino acid
# columns (0-21) and the PSI-BLAST profile matrix columns (35-55).
# np.concatenate always allocates a new array, so the slices are passed as
# views instead of being copied first.
amino_acids = data[:, :, 0:22]
profiles = data[:, :, 35:56]
per_residue = np.concatenate([amino_acids, profiles], axis=2)

# Number of features per residue (22 + 21 = 43), read off the array itself so
# it cannot drift from the slices above
num_features = per_residue.shape[2]

# Flatten each sample into one long vector, residue after residue; the sample
# count comes from the data rather than a hard-coded 6133
seqs = per_residue.reshape(data.shape[0], -1)

# Display the second residue of the first sample
seqs[0, num_features:2 * num_features]

array([ 0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.06008665,  0.03106848,  0.02275394,
        0.25161827,  0.03522972,  0.00966052,  0.01477403,  0.99159086,
        0.5       ,  0.35663486,  0.14185107,  0.0765622 ,  0.01870651,
        0.32300413,  0.03260945,  0.17653529,  0.99043465,  0.61301416,
        0.00816257,  0.26894143,  0.02436359])

In [38]:
# Pull the label columns (22-30) out into their own array. The result stays
# 3-D -- one row per residue -- so the NoSeq rows can be counted later.
label_columns = data[:, :, 22:31].copy()
labels = label_columns.reshape(6133, 700, -1)
labels[0, 1]

array([ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.])

## Save as a TF Record
We want to save each sample as a TF record. To save space and reduce training time, the
padding from the original dataset (each protein was padded to 700 with 'NoSeq' tokens) 
is removed.

In [39]:
# The protein length is the number of residues whose label is not 'NoSeq'.
# This is the label row that marks a 'NoSeq' position.
noseq = np.array([[0., 0., 0., 0., 0., 0., 0., 0., 1.]])

# Flag every position of the first sample whose whole label row matches it
noseqs = (labels[0] == noseq).all(axis=1)
print(noseqs)

# Invert the mask and count the Trues that remain
seq_length = (~noseqs).sum()
seq_length

[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  T

62

In [40]:
# With the length known, each sample's labels can be flattened to one vector
labels = labels.reshape(6133, -1)

# Each residue carries 9 label values
num_labels = 9

# Only the leading seq_length residues of the first sample are kept
seq_end = num_features * seq_length
label_end = num_labels * seq_length
feature_map = {
    'seq_data': _floats_feature(seqs[0, :seq_end]),
    'label_data': _floats_feature(labels[0, :label_end]),
}

# Build the tf.train.Example, which would eventually be written to a file
example = tf.train.Example(features=tf.train.Features(feature=feature_map))
example

features {
  feature {
    key: "label_data"
    value {
      float_list {
        value: 1.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 1.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 1.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 1.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 1.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 1.0
        value: 0.0
        value: 0.0
        valu