# Convert Numpy Arrays to TF Records
Save the CullPDB protein dataset as TF records so that it is easier to import into models.
This notebook is a walkthrough of the file *make_tfrecords.py*. I find it is
easiest to understand what a data pipeline is doing if I can visualize what the data 
looks like as it moves through that pipeline.

In [1]:
import numpy as np
import tensorflow as tf

# define tf.train.Features we need
def _int64_feature(value):
    """Wrap a single integer in a ``tf.train.Feature``.

    ``value`` is placed in a one-element list, so the resulting
    ``Int64List`` holds exactly that one entry.
    """
    int_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int_list)

def _floats_feature(value):
    """Wrap a sequence of floats in a ``tf.train.Feature``.

    Unlike ``_int64_feature``, ``value`` is handed to ``FloatList`` as-is
    (not wrapped in a list), so callers pass the whole sequence at once.
    """
    float_list = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=float_list)

In [2]:
# The dataset lives under HOME/data/cpdb
from pathlib import Path
HOME = str(Path.home())

# Load the raw array, then view it as
# (num_samples x max_seq_length x num_features) = (N x 700 x 57)
data = np.load(HOME+"/data/cpdb/cpdb+profile_6133_filtered.npy.gz")
data = data.reshape(-1, 700, 57)
num_samples = len(data)

# Show the second residue of the first sample
data[0, 1, :]

array([0.        , 0.        , 0.        , 1.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 1.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 1.        , 1.        ,
       0.05732417, 0.00669285, 0.9976241 , 0.96832377, 0.00682713,
       0.14185107, 0.09885607, 0.00919371, 0.80059224, 0.00784502,
       0.01763634, 0.55971366, 0.0541667 , 0.4378235 , 0.12786157,
       0.16798161, 0.14931294, 0.01224884, 0.00532502, 0.26894143,
       0.01551975, 0.        ])

In [3]:
# Keep only the amino acid columns (0-21) and the PSI-BLAST profile
# matrix columns (35-55), then flatten each sample into a single row
amino_acids = data[:, :, 0:22].copy()
profiles = data[:, :, 35:56].copy()
seqs = np.concatenate([amino_acids, profiles], axis=2)
seqs = seqs.reshape(num_samples, -1)

# 22 amino acid columns + 21 profile columns = 43 features per residue
num_features = 43

# Second residue of the first sample in the flattened layout
seqs[0, 43:86]

array([0.        , 0.        , 0.        , 1.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.05732417, 0.00669285, 0.9976241 ,
       0.96832377, 0.00682713, 0.14185107, 0.09885607, 0.00919371,
       0.80059224, 0.00784502, 0.01763634, 0.55971366, 0.0541667 ,
       0.4378235 , 0.12786157, 0.16798161, 0.14931294, 0.01224884,
       0.00532502, 0.26894143, 0.01551975])

In [4]:
# Take the label columns (22-30) but keep one row per residue for now;
# the NoSeq rows are counted later, before the labels are flattened
labels = np.reshape(data[:, :, 22:31].copy(), (num_samples, 700, -1))
labels[0, 1, :]

array([1., 0., 0., 0., 0., 0., 0., 0., 0.])

## Save as a TF Record
We want to save each sample as a TF record. To save space and reduce training time, the
padding from the original dataset (each protein was padded to 700 with 'NoSeq' tokens) 
is removed.

In [5]:
# The protein length is the number of positions not labelled 'NoSeq'.
# This is the 'NoSeq' label vector: only the last of the 9 slots is set
noseq = np.array([[0., 0., 0., 0., 0., 0., 0., 0., 1.]])

# Mark each position of the first sample whose label row equals 'NoSeq'
noseqs = (labels[0, :, :] == noseq).all(axis=1)

# Invert the mask and count the remaining True entries
seq_length = (~noseqs).sum()
print(seq_length)

noseqs

315


array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [6]:
# The length is known, so each sample's labels can be flattened to one row
labels = labels.reshape(num_samples, -1)

# Each residue has 9 label values
num_labels = 9

# Build a tf.train.Example for the first sample, which would eventually be
# written to a file. Only the leading seq_length residues of the flattened
# features and labels are kept.
seq_end = num_features * seq_length
label_end = num_labels * seq_length
feature_map = {
    'seq_len': _int64_feature(seq_length),
    'seq_data': _floats_feature(seqs[0, 0:seq_end]),
    'label_data': _floats_feature(labels[0, 0:label_end]),
}
example = tf.train.Example(features=tf.train.Features(feature=feature_map))
example

features {
  feature {
    key: "label_data"
    value {
      float_list {
        value: 1.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 1.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 1.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 1.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 1.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 0.0
        value: 1.0
        valu