## Loading preprocessed data

In [1]:
# Suffix that selects which feature/label .npy files are loaded and which
# weights directory is used further down in the notebook.
LEVEL="Level_1"

In [2]:
import numpy as np
# Load the preprocessed feature matrices and label vectors for the selected
# LEVEL; the file names are built with str.format and resolve to the same paths.
train_data = np.load("..//data//train_features_{}.npy".format(LEVEL))
train_label = np.load("..//data//train_labels_{}.npy".format(LEVEL))
val_data = np.load("..//data//val_features_{}.npy".format(LEVEL))
val_label = np.load("..//data//val_labels_{}.npy".format(LEVEL))

In [3]:
# Location passed to the Estimator as model_dir. Despite the ".ckpt" suffix it
# is used as a directory that holds the model.ckpt-<step> checkpoint files.
PATH = "..//weights//cnn_{}_v2//version8.ckpt".format(LEVEL)

In [4]:
# Sanity check of the loaded arrays: features are 2-D (samples, sequence
# positions) and labels are 1-D with one entry per sample.
train_data.shape, train_label.shape, val_data.shape, val_label.shape

((164674, 500), (164674,), (41169, 500), (41169,))

In [5]:
# Number of rows in the embedding table: the 20 amino-acid letters plus the
# '0' padding symbol used when encoding sequences.
NUM_OF_ACIDS = 21
# Width of each learned amino-acid embedding vector.
EMBEDDING_SIZE = 8
# Assumes labels are 0-based integer class ids, so the class count is the
# largest id + 1. Both splits are consulted so that a class which happens to be
# missing from the validation split still gets an output unit; int() turns the
# numpy scalar into a plain Python int.
NUM_CLASSES = int(max(np.amax(train_label), np.amax(val_label))) + 1

In [6]:
# Display the inferred number of output classes.
NUM_CLASSES

6

In [7]:
# NOTE(review): NUM_EPOCH is not referenced anywhere in the visible notebook;
# train_input() makes a single pass per train() call — confirm whether this
# constant is still needed.
NUM_EPOCH=7
# Mini-batch size shared by the training, evaluation and prediction input functions.
BATCH_SIZE=128

## Model

In [8]:
import tensorflow as tf
# Show the installed TensorFlow version; the cells below rely on the
# tf.layers, tf.estimator and tf.data APIs.
tf.__version__

  from ._conv import register_converters as _register_converters


'1.6.0'

# Setting up model

In [9]:
def model(features, is_training):
    """Build the CNN classifier graph and return the unnormalised class logits.

    Args:
        features: rank-2 integer tensor of amino-acid indices; each index
            selects one row of the embedding table.
        is_training: when true, dropout is applied to the dense layer.

    Returns:
        Logits tensor with NUM_CLASSES units per example.
    """
    # Learned embedding table with one row per amino-acid index.
    embedding_table = tf.get_variable(
        "acid_embeddings", [NUM_OF_ACIDS, EMBEDDING_SIZE])
    net = tf.nn.embedding_lookup(embedding_table, features)
    # Append a channel axis so the embedded sequence can be fed to conv2d.
    net = tf.expand_dims(net, 3)

    # Both convolutions share the same kernel shape.
    conv_kernel = (3, EMBEDDING_SIZE)

    # Stage 1: 32-filter convolution followed by 2x2 max pooling.
    net = tf.layers.conv2d(
        inputs=net, filters=32, kernel_size=conv_kernel,
        padding="same", activation=tf.nn.selu)
    net = tf.layers.max_pooling2d(inputs=net, pool_size=2, strides=2)

    # Stage 2: 64-filter convolution followed by 2x2 max pooling.
    net = tf.layers.conv2d(
        inputs=net, filters=64, kernel_size=conv_kernel,
        padding="same", activation=tf.nn.selu)
    net = tf.layers.max_pooling2d(inputs=net, pool_size=2, strides=2)

    # Fully connected head with dropout (active only while training).
    net = tf.layers.flatten(net)
    net = tf.layers.dense(inputs=net, units=1024, activation=tf.nn.selu)
    net = tf.layers.dropout(inputs=net, rate=0.4, training=is_training)

    # Final linear layer producing one logit per class.
    return tf.layers.dense(inputs=net, units=NUM_CLASSES)

In [10]:
def model_fn(features, labels, mode, params):
    """Estimator model_fn: build the EstimatorSpec for the requested mode.

    Args:
        features: batch of encoded sequences, forwarded to model().
        labels: batch of integer class ids (not used in PREDICT mode).
        mode: one of the tf.estimator.ModeKeys values PREDICT, TRAIN or EVAL.
        params: part of the model_fn signature; not used here.

    Returns:
        A tf.estimator.EstimatorSpec for the given mode. Any other mode value
        falls through all branches and yields None.
    """
    keys = tf.estimator.ModeKeys

    if mode == keys.PREDICT:
        logits = model(features, is_training=False)
        outputs = {
            'classes': tf.argmax(logits, axis=1),
            'probabilities': tf.nn.softmax(logits),
        }
        signatures = {'classify': tf.estimator.export.PredictOutput(outputs)}
        return tf.estimator.EstimatorSpec(
            mode=keys.PREDICT,
            predictions=outputs,
            export_outputs=signatures)

    if mode == keys.TRAIN:
        optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
        logits = model(features, is_training=True)
        loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
        predicted = tf.argmax(logits, axis=1)
        accuracy = tf.metrics.accuracy(labels=labels, predictions=predicted)
        # Expose the accuracy update op under the name 'train_accuracy' so a
        # LoggingTensorHook can look it up, and record it as a summary too.
        accuracy_update = accuracy[1]
        tf.identity(accuracy_update, name='train_accuracy')
        tf.summary.scalar('train_accuracy', accuracy_update)
        train_op = optimizer.minimize(loss, tf.train.get_or_create_global_step())
        return tf.estimator.EstimatorSpec(
            mode=keys.TRAIN,
            loss=loss,
            train_op=train_op)

    if mode == keys.EVAL:
        logits = model(features, is_training=False)
        loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
        predicted = tf.argmax(logits, axis=1)
        metrics = {
            'accuracy': tf.metrics.accuracy(labels=labels, predictions=predicted)}
        return tf.estimator.EstimatorSpec(
            mode=keys.EVAL,
            loss=loss,
            eval_metric_ops=metrics)


In [11]:
# Wrap model_fn in an Estimator; checkpoints are read from and written to PATH.
enzyme_classifier = tf.estimator.Estimator(model_fn=model_fn, model_dir=PATH)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '..//weights//cnn_Level_1_v2//version8.ckpt', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fbdc8a67860>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


# Training

In [38]:
def train_input():
    """Input function for a single shuffled pass over the training set.

    Returns:
        A tf.data.Dataset yielding (features, label) batches of BATCH_SIZE
        examples, reshuffled on every iteration.
    """
    dataset = tf.data.Dataset.from_tensor_slices((train_data, train_label))
    # NOTE(review): the shuffle buffer (10000) is smaller than the training
    # set, so examples are only mixed within a sliding window — confirm the
    # arrays are already in random order on disk.
    dataset = dataset.shuffle(buffer_size=10000, reshuffle_each_iteration=True)
    dataset = dataset.batch(BATCH_SIZE)
    return dataset.repeat(1)

In [39]:
# Log the tensor that model_fn names 'train_accuracy' every 100 training steps.
tensors_to_log = {'train_accuracy': 'train_accuracy'}
logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=100)
# Each call trains until train_input() is exhausted (one pass over the data);
# the recorded log shows training resuming from the latest checkpoint in PATH.
enzyme_classifier.train(input_fn=train_input, hooks=[logging_hook])


INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ..//weights//cnn_Level_1_v2//version8.ckpt/model.ckpt-9009
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 9010 into ..//weights//cnn_Level_1_v2//version8.ckpt/model.ckpt.
INFO:tensorflow:train_accuracy = 0.9296875
INFO:tensorflow:loss = 0.25511307, step = 9010
INFO:tensorflow:global_step/sec: 45.9907
INFO:tensorflow:train_accuracy = 0.93359375 (2.176 sec)
INFO:tensorflow:loss = 0.20976786, step = 9110 (2.176 sec)
INFO:tensorflow:global_step/sec: 47.5782
INFO:tensorflow:train_accuracy = 0.9427083 (2.101 sec)
INFO:tensorflow:loss = 0.16500229, step = 9210 (2.101 sec)
INFO:tensorflow:global_step/sec: 47.6559
INFO:tensorflow:train_accuracy = 0.9394531 (2.099 sec)
INFO:tensorflow:loss = 0.23634267, step = 9310 (2.099 sec)
IN

<tensorflow.python.estimator.estimator.Estimator at 0x7fbdc8a67748>

# Validate

In [40]:
def eval_input():
    """Input function for one ordered pass over the validation set.

    Returns:
        A tf.data.Dataset yielding (features, label) batches of BATCH_SIZE
        examples in their stored order.
    """
    dataset = tf.data.Dataset.from_tensor_slices((val_data, val_label))
    dataset = dataset.batch(BATCH_SIZE)
    return dataset.repeat(1)

In [41]:
# Evaluate on the validation split; the returned dict holds the metrics
# declared in model_fn's EVAL branch plus the loss and global step.
eval_results = enzyme_classifier.evaluate(input_fn=eval_input)
print()
print('Evaluation results: %s' % eval_results)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-04-22-10:14:25
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ..//weights//cnn_Level_1_v2//version8.ckpt/model.ckpt-10296
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-04-22-10:14:27
INFO:tensorflow:Saving dict for global step 10296: accuracy = 0.83120793, global_step = 10296, loss = 0.6537026

Evaluation results: {'accuracy': 0.83120793, 'loss': 0.6537026, 'global_step': 10296}


# Predict

In [25]:
import pandas as pd
# Read the raw test sequences (tab-separated file with a "Sequence" column).
data = pd.read_csv("..//data//test_sequences.csv", sep='\t', skipinitialspace=True)
# Left-pad every sequence with the '0' padding symbol up to 500 characters.
data["Sequence"] = data.Sequence.str.rjust(500, '0')
# str.rjust never truncates, so an over-long sequence would silently produce a
# ragged matrix further down; fail early with a clear message instead.
too_long = data.Sequence.str.len() > 500
if too_long.any():
    raise ValueError("%d test sequence(s) are longer than 500 characters" % too_long.sum())
# Character -> embedding index; '0' is the padding symbol.
letterToIndex = {'0': 0, 'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'K': 9, 'L': 10, 'M': 11, 'N': 12,
                 'P': 13, 'Q': 14, 'R': 15, 'S': 16, 'T': 17, 'V': 18, 'W': 19, 'Y': 20}
# Iterate the column directly: Series.iteritems() was removed in pandas 2.0.
data["Sequence_vector"] = [[letterToIndex[char] for char in sequence] for sequence in data.Sequence]
# Stack the equally long index lists into one 2-D integer matrix.
test_data = np.asarray(data["Sequence_vector"].tolist())
len(test_data), test_data

(5, array([[ 0,  0,  0, ...,  1,  7,  3],
        [ 0,  0,  0, ..., 16, 18, 16],
        [ 0,  0,  0, ...,  9,  2, 10],
        [ 0,  0,  0, ..., 18, 17, 13],
        [ 0,  0,  0, ..., 13, 16, 10]]))

In [30]:
def test_input():
    """Input function that feeds the encoded test sequences to the Estimator.

    The test matrix is zero-padded up to a whole number of BATCH_SIZE-row
    batches. Padding rows are appended after the real sequences, so the first
    len(test_data) predictions correspond to the real inputs.

    Returns:
        A tf.data.Dataset yielding integer feature batches of BATCH_SIZE rows.
    """
    # Rows needed to reach the next multiple of BATCH_SIZE (0 when already
    # aligned). A plain BATCH_SIZE - len(test_data) would be negative, and
    # np.zeros would raise, once there are more test sequences than BATCH_SIZE.
    pad_rows = -len(test_data) % BATCH_SIZE
    # Padding must have the same width as the encoded sequences.
    padding = np.zeros((pad_rows, test_data.shape[1]))
    # np.zeros is float, so cast the combined matrix back to integers.
    test_data_for_tensorflow = np.append(test_data, padding, axis=0).astype(int)
    return tf.data.Dataset.from_tensor_slices(test_data_for_tensorflow).batch(BATCH_SIZE)

In [42]:
# Print probabilities in plain decimal notation instead of scientific notation.
np.set_printoptions(suppress=True)
predict = enzyme_classifier.predict(input_fn=test_input)
# test_input() appends zero-padding rows after the real sequences, so only the
# first len(test_data) predictions are meaningful. Pairing with range() stops
# after exactly that many items without pulling an extra prediction.
for count, p in zip(range(len(test_data)), predict):
    if count == 0:
        # Printed inside the loop so the header appears after TensorFlow's
        # start-up log lines, which are emitted when the first item is pulled.
        print("\n\r")
        print("Oxidoreductases Transferases Hydrolases Lyases Isomerases Ligases")
    print(p["probabilities"])
    # Class ids are 0-based; +1 gives the 1-based position in the header row.
    print( p["classes"]+1)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ..//weights//cnn_Level_1_v2//version8.ckpt/model.ckpt-10296
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


Oxidoreductases Transferases Hydrolases Lyases Isomerases Ligases
[0.02524485 0.42460245 0.3180151  0.08731605 0.1447798  0.0000418 ]
2
[0.01956985 0.63964903 0.10866959 0.23204246 0.00002328 0.00004582]
2
[0.15433058 0.20506094 0.6196978  0.00744062 0.00508434 0.00838569]
3
[0.00855853 0.9912123  0.00000528 0.00011529 0.00000593 0.0001026 ]
2
[0.07928088 0.457729   0.4544403  0.0002656  0.0073913  0.0008929 ]
2
