In [None]:
# Copyright 2021 Google Inc.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This code supports the publication "Using Deep Learning to Annotate the Protein Universe".
[preprint link](https://doi.org/10.1101/626507)


**Note**: We recommend you enable a free GPU by going to:

> **Runtime**   →   **Change runtime type**   →   **Hardware Accelerator: GPU**


# Set-up

## Imports

In [None]:
import json
import numpy as np
import tensorflow.compat.v1 as tf
import tqdm

# Suppress noisy log messages.
from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False

## Library functions: convert sequence to one-hot array (input to model)

The following library functions are copied from the GitHub repo so that this Colab is dependency-free: no packages need to be installed, just the standard Colab kernel.

In [None]:
AMINO_ACID_VOCABULARY = [
    'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R',
    'S', 'T', 'V', 'W', 'Y'
]

# Gap / padding characters; each is encoded as the all-zero vector.
_GAP_CHARACTERS = ('.', '-')


def residues_to_one_hot(amino_acid_residues):
  """Given a sequence of amino acids, return one hot array.

  Each residue becomes one row with len(AMINO_ACID_VOCABULARY) columns:

  * A character in AMINO_ACID_VOCABULARY gets a 1 in its own column.
  * The ambiguous characters B (D or N) and Z (E or Q) put .5 in each of the
    two candidate columns; 'X' is spread evenly over all columns, i.e.
    [.05, .05, ... , .05].
  * The rare amino acids 'U' and 'O' are substituted before encoding:
    'U' is treated as 'C' and 'O' as 'X'.
  * The gap / pad characters '.' and '-' are mapped to the zero vector.

  Args:
    amino_acid_residues: string. consisting of the characters described above.
      May be empty.

  Returns:
    A numpy float array of shape (len(amino_acid_residues),
     len(AMINO_ACID_VOCABULARY)).

  Raises:
    ValueError: if amino_acid_residues contains any other character.
  """
  vocab_size = len(AMINO_ACID_VOCABULARY)
  normalized_residues = amino_acid_residues.replace('U', 'C').replace('O', 'X')

  # Preallocating keeps the result two-dimensional even for an empty string,
  # and leaves gap rows at zero without any further work.
  one_hot = np.zeros((len(normalized_residues), vocab_size))
  for position, char in enumerate(normalized_residues):
    if char in AMINO_ACID_VOCABULARY:
      one_hot[position, AMINO_ACID_VOCABULARY.index(char)] = 1.
    elif char == 'B':  # Asparagine or aspartic acid.
      one_hot[position, AMINO_ACID_VOCABULARY.index('D')] = .5
      one_hot[position, AMINO_ACID_VOCABULARY.index('N')] = .5
    elif char == 'Z':  # Glutamine or glutamic acid.
      one_hot[position, AMINO_ACID_VOCABULARY.index('E')] = .5
      one_hot[position, AMINO_ACID_VOCABULARY.index('Q')] = .5
    elif char == 'X':  # Unknown residue: uniform over the vocabulary.
      one_hot[position, :] = 1. / vocab_size
    elif char in _GAP_CHARACTERS:
      continue  # Row is already the zero vector.
    else:
      raise ValueError('Could not one-hot code character {}'.format(char))
  return one_hot

def _test_residues_to_one_hot():
  """Checks the encoding of two standard residues and the ambiguous 'X'."""
  identity = np.eye(20)
  expected = np.stack([
      identity[0],       # Amino acid A
      identity[1],       # Amino acid C
      np.full(20, .05),  # Amino acid X
  ])

  np.testing.assert_allclose(residues_to_one_hot('ACX'), expected)
_test_residues_to_one_hot()

In [None]:
def pad_one_hot_sequence(sequence: np.ndarray,
                         target_length: int) -> np.ndarray:
  """Appends all-zero rows to a [seq_len, num_aas] array.

  Args:
    sequence: two-dimensional array whose first axis is the sequence position.
    target_length: number of rows the result should have.

  Returns:
    An array with target_length rows: the rows of `sequence` followed by zeros.

  Raises:
    ValueError: if `sequence` already has more than target_length rows.
  """
  current_length = sequence.shape[0]
  rows_to_add = target_length - current_length
  if rows_to_add < 0:
    raise ValueError(
        'Cannot set a negative amount of padding. Sequence length was {}, target_length was {}.'
        .format(current_length, target_length))
  # Pad only at the end of the first axis; leave the second axis untouched.
  return np.pad(sequence, ((0, rows_to_add), (0, 0)), mode='constant')

def _test_pad_one_hot():
  """Checks that padding 'ACX' to length 7 appends four all-zero rows."""
  unpadded = residues_to_one_hot('ACX')
  actual = pad_one_hot_sequence(unpadded, 7)

  expected = np.concatenate([unpadded, np.zeros((4, 20))], axis=0)
  np.testing.assert_allclose(expected, actual)
_test_pad_one_hot()

In [None]:
def batch_iterable(iterable, batch_size):
  """Groups consecutive elements of an iterable into lists.

  Every yielded list holds batch_size elements except possibly the last one,
  which holds whatever is left over. Nothing is yielded for an empty iterable.

  Args:
    iterable: a potentially infinite iterable.
    batch_size: the number of elements per batch.

  Yields:
    list of at most batch_size elements, in the order they appear in iterable.

  Raises:
    ValueError: if batch_size < 1 (raised when iteration starts).
  """
  if batch_size < 1:
    raise ValueError(
        'Cannot have a batch size of less than 1. Received: {}'.format(
            batch_size))

  pending = []
  for element in iterable:
    if len(pending) == batch_size:
      # The batch filled up on the previous step: hand it out and start the
      # next batch with the element just pulled.
      yield pending
      pending = [element]
    else:
      pending.append(element)

  # Emit the trailing (possibly partial) batch, but never an empty one.
  if pending:
    yield pending

def _test_batch_iterable():
  """Checks that three elements in batches of two give a short final batch."""
  batches = list(batch_iterable([1, 2, 3], 2))
  assert batches == [[1, 2], [3]]

_test_batch_iterable()

## Download model and vocabulary

In [None]:
# Get a TensorFlow SavedModel
!wget -qN https://storage.googleapis.com/brain-genomics-public/research/proteins/pfam/models/single_domain_per_sequence_zipped_models/seed_random_32.0/5356760.tar.gz
# unzip
!tar xzf 5356760.tar.gz
# Get the vocabulary for the model, which tells you which output index means which family.
# -N (timestamping) keeps a re-run of this cell from saving a duplicate "...vocab.json.1".
!wget -N https://storage.googleapis.com/brain-genomics-public/research/proteins/pfam/models/single_domain_per_sequence_zipped_models/trained_model_pfam_32.0_vocab.json

--2021-09-22 14:58:00--  https://storage.googleapis.com/brain-genomics-public/research/proteins/pfam/models/single_domain_per_sequence_zipped_models/trained_model_pfam_32.0_vocab.json
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.31.128, 74.125.141.128, 173.194.210.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.31.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 197219 (193K) [application/octet-stream]
Saving to: ‘trained_model_pfam_32.0_vocab.json’


2021-09-22 14:58:00 (28.0 MB/s) - ‘trained_model_pfam_32.0_vocab.json’ saved [197219/197219]



In [None]:
# Find the unzipped path: lists the downloaded archive and the directory that
# was extracted from it. The directory name is what gets passed to the loader.
!ls *5356760*

5356760.tar.gz

trn-_cnn_random__random_sp_gpu-cnn_for_random_pfam-5356760:
saved_model.pb	variables


## Load the model into TensorFlow

In [None]:
# Create the graph first and bind the session to it. A bare `tf.Session()`
# attaches to the default graph, so the model would be loaded there and the
# `graph.as_default()` blocks in later cells would not refer to the graph that
# actually holds the model.
graph = tf.Graph()
sess = tf.Session(graph=graph)

In [None]:
# Load the SavedModel tagged 'serve' from the extracted directory into `sess`;
# the returned object exposes the model's signatures via `signature_def`.
with graph.as_default():
  saved_model = tf.saved_model.load(
      sess,
      ['serve'],
      'trn-_cnn_random__random_sp_gpu-cnn_for_random_pfam-5356760')

INFO:tensorflow:Restoring parameters from trn-_cnn_random__random_sp_gpu-cnn_for_random_pfam-5356760/variables/variables


## Load tensors for class prediction

In [None]:
# Name of the 'output' tensor of the 'serving_default' signature. `infer`
# fetches this tensor from the session to get the model's prediction.
top_pick_signature = saved_model.signature_def['serving_default']
top_pick_signature_tensor_name = top_pick_signature.outputs['output'].name

# Names of the two input tensors that `infer` feeds: the one-hot sequences and
# their lengths.
# NOTE(review): these names are read from the 'confidences' signature, while the
# output above comes from 'serving_default' -- presumably both signatures share
# the same input tensors; confirm against the SavedModel.
sequence_input_tensor_name = saved_model.signature_def['confidences'].inputs['sequence'].name
sequence_lengths_input_tensor_name = saved_model.signature_def['confidences'].inputs['sequence_length'].name

## Load mapping from neural network outputs to Pfam family names 

In [None]:
# Parse the vocabulary JSON straight from the open file handle.
with open('trained_model_pfam_32.0_vocab.json') as vocab_file:
  vocab = json.load(vocab_file)

# Download data for inference

In [None]:
%%shell
# Fetch the ten test shards. -N (timestamping) makes re-running this cell a
# no-op instead of saving duplicates such as data-00000-of-00010.1, which a
# later glob over 'data*' would load a second time.
for i in $(seq 0 9); do
  wget -N "https://storage.googleapis.com/brain-genomics-public/research/proteins/pfam/random_split/test/data-0000${i}-of-00010"
done

--2021-09-22 14:58:08--  https://storage.googleapis.com/brain-genomics-public/research/proteins/pfam/random_split/test/data-00000-of-00010
Resolving storage.googleapis.com (storage.googleapis.com)... 108.177.11.128, 74.125.26.128, 172.217.204.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|108.177.11.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6106511 (5.8M) [application/octet-stream]
Saving to: ‘data-00000-of-00010’


2021-09-22 14:58:08 (160 MB/s) - ‘data-00000-of-00010’ saved [6106511/6106511]

--2021-09-22 14:58:08--  https://storage.googleapis.com/brain-genomics-public/research/proteins/pfam/random_split/test/data-00001-of-00010
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.214.128, 173.194.216.128, 173.194.217.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.214.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6251203 (6.0M) [application/octet



In [None]:
import glob
import pandas as pd
# Read every downloaded test shard (CSV) and stack them into one DataFrame.
# The pattern matches only the shard names data-XXXXX-of-00010, so stray files
# such as a re-downloaded "data-00000-of-00010.1" are not loaded and counted
# twice. Sorting the paths makes the shard order, and so the row order,
# reproducible across runs.
test_dfs = [
    pd.read_csv(f_name) for f_name in sorted(glob.glob('data-*-of-00010'))
]
test_df = pd.concat(test_dfs)

In [None]:
import math
def infer(batch):
  """Runs the loaded model on one batch of amino acid sequences.

  Each sequence is one-hot encoded and zero-padded to the length of the longest
  sequence in the batch; the true lengths are fed alongside.

  Args:
    batch: non-empty list of amino acid strings accepted by
      residues_to_one_hot.

  Returns:
    The value of the 'serving_default' output tensor for this batch, as
    returned by sess.run (presumably one class index per sequence, since
    callers look the values up in `vocab` -- confirm against the model).
  """
  seq_lens = [len(seq) for seq in batch]
  # Computed once per batch rather than once per sequence.
  longest = max(seq_lens)
  padded_sequence_inputs = [
      pad_one_hot_sequence(residues_to_one_hot(seq), longest) for seq in batch
  ]
  with graph.as_default():
    return sess.run(
        top_pick_signature_tensor_name,
        {
            sequence_input_tensor_name: padded_sequence_inputs,
            sequence_lengths_input_tensor_name: seq_lens,
        })

In [None]:
# Order the rows from shortest to longest sequence, so that each batch holds
# sequences of similar length, needs little padding, and runs faster.
test_df = test_df.sort_values('sequence', key=lambda col: col.str.len())

# Predict domain Pfam labels for 126 thousand domains

In [None]:
# Split the (length-sorted) sequences into batches of 32 and collect the
# model's prediction for every sequence, in row order.
batches = list(batch_iterable(test_df.sequence, 32))
inference_results = []
for seq_batch in tqdm.tqdm(batches, position=0):
  inference_results += list(infer(seq_batch))

100%|██████████| 3943/3943 [20:20<00:00,  3.23it/s]


In [None]:
# Look up each predicted output index in the vocabulary to get its family name.
test_df['predicted_label'] = [
    vocab[class_index] for class_index in inference_results
]

In [None]:
# Drop the version suffix from each accession, e.g. PF00001.21 -> PF00001.
test_df['true_label'] = test_df.family_accession.map(
    lambda accession: accession.partition('.')[0])

# Compute accuracy

Reproduces the 5th row of Figure 1A.

In [None]:
# Error rate = 100 minus the percentage of rows whose prediction matches the
# true family.
print('family calling error rate (percentage) = {:.03f}'.format(
    100 - (test_df.true_label == test_df.predicted_label).sum() / len(test_df) * 100))

family calling error rate (percentage) = 0.495
