In [1]:
import tensorflow as tf
import os
import multiprocessing
import ipdb
# Run TensorFlow ops eagerly so calls such as iterator.get_next() return
# concrete values that the notebook can display inline.
tf.enable_eager_execution()

In [78]:
# Number of values stored per sequence position in the 'tertiary' feature;
# used as that feature's inner shape when records are parsed in read_protein.
NUM_DIMENSIONS = 3

def masking_matrix(mask, name=None):
    """Expand a per-position mask into a square pairwise mask.

    Entry (i, j) of the result equals mask[i] * mask[j], so a pair survives
    only when both of its positions survive.

    Args:
        mask: Per-position mask, expected to be 1-D and float32-compatible
            (anything `tf.convert_to_tensor` accepts).
            NOTE(review): presumably 1.0 = present / 0.0 = missing -- confirm
            against the data that feeds this.
        name: Optional name scope for the ops created here. When None the
            scope defaults to 'masking_matrix'.

    Returns:
        A float32 tensor of shape [N, N], where N is the number of elements
        in `mask`.
    """
    # Honor the caller-supplied `name` instead of silently ignoring it.
    with tf.name_scope(name, default_name='masking_matrix', values=[mask]):
        mask = tf.convert_to_tensor(mask, name='mask')

        # Turn the mask into a [1, N] row; multiplying it by its [N, 1]
        # transpose broadcasts to the full [N, N] outer product.
        mask = tf.expand_dims(mask, 0)
        base = tf.ones([tf.size(mask), tf.size(mask)])
        matrix_mask = base * mask * tf.transpose(mask)

    return matrix_mask
        
def read_protein(record):
    """Parse one serialized SequenceExample into its protein fields.

    Args:
        record: Serialized `SequenceExample` (scalar string) to decode.

    Returns:
        A tuple `(id_, primary, secondary, tertiary, mask)`:
          * id_       -- first element of the 'id' context feature (string).
          * primary   -- first column of 'primary', left as int64.
          * secondary -- first column of 'secondary', cast to int32.
          * tertiary  -- the 'tertiary' feature as parsed (float32, with
                         NUM_DIMENSIONS values per position).
          * mask      -- first column of 'mask' (float32), returned as parsed.
    """
    context_spec = {'id': tf.FixedLenFeature((1,), tf.string)}
    sequence_spec = {
        'primary': tf.FixedLenSequenceFeature((1,), tf.int64),
        'secondary': tf.FixedLenSequenceFeature((1,), tf.int64, allow_missing=True),
        'tertiary': tf.FixedLenSequenceFeature((NUM_DIMENSIONS,), tf.float32, allow_missing=True),
        'mask': tf.FixedLenSequenceFeature((1,), tf.float32, allow_missing=True),
    }

    parsed_context, parsed_sequences = tf.parse_single_sequence_example(
        serialized=record,
        context_features=context_spec,
        sequence_features=sequence_spec)

    protein_id = parsed_context['id'][0]

    # Every per-position scalar feature is stored with a trailing axis of
    # size 1; slicing [:, 0] drops that axis.
    residues = parsed_sequences['primary'][:, 0]
    structure_labels = tf.to_int32(parsed_sequences['secondary'][:, 0])
    coordinates = parsed_sequences['tertiary']
    residue_mask = parsed_sequences['mask'][:, 0]

    # NOTE: no pairwise tertiary mask is built here. Earlier (disabled) logic
    # replaced an empty mask with all ones and passed it to masking_matrix;
    # the mask is now handed back exactly as parsed.

    return protein_id, residues, structure_labels, coordinates, residue_mask

In [79]:
# Notebook configuration.
# The path is assembled with os.path.join so the separators match the host OS
# (on Windows this yields the same "..\..\data\protein\structure\casp12").
data_dir = os.path.join("..", "..", "data", "protein", "structure", "casp12")
# Sub-directory of data_dir whose files are loaded.
running_mode = "testing"
# NOTE(review): not referenced by the cells visible here -- confirm where it is used.
cycle_length = 1

In [80]:
# Report which directory the record files will be read from.
print("Loading files from {}".format(data_dir))

Loading files from ..\..\data\protein\structure\casp12


In [81]:
# Collect every file under <data_dir>/<running_mode>/ and report how many matched.
filenames = tf.gfile.Glob(os.path.join(data_dir, running_mode, "*"))
print("Found {} file(s)".format(len(filenames)))

Found 1 file(s)


In [82]:
# Stream the raw serialized records out of the files found above.
dataset = tf.data.TFRecordDataset(filenames)
print("Loading process will use {} CPUs".format(multiprocessing.cpu_count()))
# Decode each record with read_protein, allowing as many parallel calls as
# there are CPUs.
dataset = dataset.map(
    read_protein,
    num_parallel_calls=multiprocessing.cpu_count(),
)

# Single-pass iterator over the parsed records.
iterator = dataset.make_one_shot_iterator()

Loading process will use 4 CPUs


In [94]:
# Pull the next parsed record; the displayed value is the
# (id, primary, secondary, tertiary, mask) tuple returned by read_protein.
iterator.get_next()

(<tf.Tensor: id=1393, shape=(), dtype=string, numpy=b'TBM#T0873'>,
 <tf.Tensor: id=1394, shape=(501,), dtype=int64, numpy=
 array([10,  5, 15, 15,  6,  6,  6,  6,  6,  6, 15, 13,  2, 12, 11, 15, 10,
         8, 14,  9,  8,  2,  9, 14,  3, 19,  9,  0, 17,  9,  3,  0,  6, 13,
         2, 17, 14,  3,  7,  2,  3, 12, 17,  2, 12,  6,  9,  3,  0,  5,  0,
         0,  0, 14, 18, 16, 19,  3, 11, 14,  5, 12,  0,  9, 10,  9, 11,  2,
         9, 16,  5, 16,  5, 14,  4,  1, 14,  7,  9,  0,  0, 12,  0,  5,  9,
        15, 16,  7, 12,  5, 15, 12,  9,  0, 14, 17,  0,  9, 15,  9,  5,  9,
         2, 17, 15,  0, 16,  0,  6,  3,  7, 17,  2, 15,  9,  0,  0,  0, 14,
        16, 14,  3, 12, 17,  0, 12, 17, 17, 17,  2, 15,  0, 12,  1, 13,  2,
        11, 17,  9,  9,  5,  2,  2,  0, 11,  9,  2, 14,  4, 12,  0, 12,  9,
         9,  6,  3,  5,  2,  5,  5, 12, 19,  9, 11, 16, 18,  5, 16,  7,  7,
        17, 15, 16, 12,  2,  5, 15,  4, 16, 11, 18,  0,  7,  0, 14, 17, 10,
         8,  7,  2,  5,  8, 14, 10, 16,  