```
# Copyright 2021 Google Inc.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
```

# This code supports the publication "Using Deep Learning to Annotate the Protein Universe".
[preprint link](https://doi.org/10.1101/626507)


**Note**: We recommend you enable a free GPU by going to:

> **Runtime**   →   **Change runtime type**   →   **Hardware Accelerator: GPU**


# Set-up

## Imports

In [1]:
import json
import numpy as np
import tensorflow.compat.v1 as tf

# Suppress noisy log messages.
from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False

## Library functions: convert sequence to one-hot array (input to model)

In [2]:
AMINO_ACID_VOCABULARY = [
    'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R',
    'S', 'T', 'V', 'W', 'Y'
]

# Gap ('.') and pad ('-') characters; both are encoded as the zero vector.
_PFAM_GAP_CHARACTER = '.'
_GAP_CHARACTERS = (_PFAM_GAP_CHARACTER, '-')

# Ambiguity codes, mapped to the residues each one may stand for. The
# probability mass is split evenly over the listed residues.
_AMBIGUOUS_RESIDUES = {
    'B': ('D', 'N'),  # Aspartic acid or asparagine.
    'Z': ('E', 'Q'),  # Glutamic acid or glutamine.
    'X': tuple(AMINO_ACID_VOCABULARY),  # Any residue.
}


def residues_to_one_hot(amino_acid_residues):
  """Given a sequence of amino acids, return one hot array.

  Supports ambiguous amino acid characters B, Z, and X by distributing evenly
  over possible values, e.g. an 'X' gets mapped to [.05, .05, ... , .05].

  Supports rare amino acids by substituting before encoding: 'U' is encoded
  as 'C' and 'O' is encoded as 'X'.

  Supports gaps and pads with the '.' and '-' characters; which are mapped to
  the zero vector.

  Args:
    amino_acid_residues: string. consisting of characters from
      AMINO_ACID_VOCABULARY, plus optionally B, Z, X, U, O, '.' and '-'.

  Returns:
    A numpy array of shape (len(amino_acid_residues),
     len(AMINO_ACID_VOCABULARY)). An empty string gives an array of shape
     (0, len(AMINO_ACID_VOCABULARY)).

  Raises:
    ValueError: if amino_acid_residues has a character that is not covered by
      the cases above.
  """
  # Build the residue -> column lookup once instead of calling list.index for
  # every character.
  index_by_residue = {
      residue: index for index, residue in enumerate(AMINO_ACID_VOCABULARY)
  }
  normalized_residues = amino_acid_residues.replace('U', 'C').replace('O', 'X')

  # Preallocating keeps the result two-dimensional even for an empty sequence.
  one_hot = np.zeros((len(normalized_residues), len(AMINO_ACID_VOCABULARY)))
  for position, char in enumerate(normalized_residues):
    if char in index_by_residue:
      one_hot[position, index_by_residue[char]] = 1.
    elif char in _AMBIGUOUS_RESIDUES:
      candidates = _AMBIGUOUS_RESIDUES[char]
      for candidate in candidates:
        one_hot[position, index_by_residue[candidate]] = 1. / len(candidates)
    elif char in _GAP_CHARACTERS:
      continue  # Gaps and pads stay as the zero vector.
    else:
      raise ValueError('Could not one-hot code character {}'.format(char))
  return one_hot

def _test_residues_to_one_hot():
  """Checks the encoding of two standard residues and the ambiguous 'X'."""
  identity = np.eye(20)
  expected = np.array([
      identity[0],        # Amino acid A
      identity[1],        # Amino acid C
      np.full(20, .05),   # Amino acid X
  ])

  np.testing.assert_allclose(residues_to_one_hot('ACX'), expected)


_test_residues_to_one_hot()

In [3]:
def pad_one_hot_sequence(sequence: np.ndarray,
                         target_length: int) -> np.ndarray:
  """Appends all-zero rows to a one hot sequence until it has target_length rows.

  Args:
    sequence: array of shape [seq_len, num_aas].
    target_length: desired size of the seq_len dimension after padding.

  Returns:
    Array of shape [target_length, num_aas]; the original rows come first and
    the appended rows are zeros.

  Raises:
    ValueError: if sequence has more than target_length rows.
  """
  current_length = sequence.shape[0]
  rows_to_add = target_length - current_length
  if rows_to_add < 0:
    message = (
        'Cannot set a negative amount of padding. Sequence length was {}, target_length was {}.'
        .format(current_length, target_length))
    raise ValueError(message)
  # Pad only at the end of the first axis; the amino acid axis is untouched.
  return np.pad(sequence, ((0, rows_to_add), (0, 0)), mode='constant')

def _test_pad_one_hot():
  """Checks that padding 'ACX' to length 7 appends four all-zero rows."""
  unpadded = residues_to_one_hot('ACX')
  padded = pad_one_hot_sequence(unpadded, 7)

  expected = np.concatenate([unpadded, np.zeros((4, 20))], axis=0)
  np.testing.assert_allclose(expected, padded)


_test_pad_one_hot()

## Download model and vocabulary

In [4]:
# Get a TensorFlow SavedModel
!wget -qN https://storage.googleapis.com/brain-genomics-public/research/proteins/pfam/models/single_domain_per_sequence_zipped_models/seed_random_32.0/5356760.tar.gz
# unzip
!tar xzf 5356760.tar.gz
# Get the vocabulary for the model, which tells you which output index means which family
!wget https://storage.googleapis.com/brain-genomics-public/research/proteins/pfam/models/single_domain_per_sequence_zipped_models/trained_model_pfam_32.0_vocab.json

--2021-09-15 21:04:58--  https://storage.googleapis.com/brain-genomics-public/research/proteins/pfam/models/single_domain_per_sequence_zipped_models/trained_model_pfam_32.0_vocab.json
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.214.128, 173.194.216.128, 173.194.217.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.214.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 197219 (193K) [application/octet-stream]
Saving to: ‘trained_model_pfam_32.0_vocab.json’


2021-09-15 21:04:58 (108 MB/s) - ‘trained_model_pfam_32.0_vocab.json’ saved [197219/197219]



In [5]:
# Find the unzipped path: this lists the downloaded archive and the directory
# extracted from it. The directory name is the SavedModel path used when
# loading the model.
!ls *5356760*

5356760.tar.gz

trn-_cnn_random__random_sp_gpu-cnn_for_random_pfam-5356760:
saved_model.pb	variables


## Load the model into TensorFlow

In [6]:
# Create the graph first and bind the session to it, so that the SavedModel is
# loaded into `graph`. A session created without `graph=` is attached to the
# default graph instead, which would leave `graph` unused.
graph = tf.Graph()
sess = tf.Session(graph=graph)

In [7]:
# Load the SavedModel tagged 'serve' from the extracted directory using `sess`.
# The returned object exposes `signature_def`, which later cells use to look up
# input and output tensor names.
with graph.as_default():
  saved_model = tf.saved_model.load(sess, ['serve'], 'trn-_cnn_random__random_sp_gpu-cnn_for_random_pfam-5356760')

INFO:tensorflow:Restoring parameters from trn-_cnn_random__random_sp_gpu-cnn_for_random_pfam-5356760/variables/variables


## Load tensors for class confidence prediction

In [8]:
# Look up the 'confidences' signature once and read every tensor name from it.
class_confidence_signature = saved_model.signature_def['confidences']

# Name of the output tensor fetched when predicting class confidences.
class_confidence_signature_tensor_name = (
    class_confidence_signature.outputs['output'].name)

# Names of the input tensors that are fed when running the model.
sequence_input_tensor_name = (
    class_confidence_signature.inputs['sequence'].name)
sequence_lengths_input_tensor_name = (
    class_confidence_signature.inputs['sequence_length'].name)

# Predict Pfam label for domain

In [9]:
# Define the example input: a hemoglobin sequence and the globin domain within it.
hemoglobin = 'MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGKKVADALTNAVAHVDDMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTPAVHASLDKFLASVSTVLTSKYR'
# Python slices are 0-indexed and exclude the end index, so this keeps
# indices 6 through 106 (101 residues).
globin_domain = hemoglobin[6:107]

In [10]:
# If you want to put in different proteins (other than hemoglobin), you
# can run this cell multiple times. Simply replace the variable "globin_domain"
# with your desired protein domain sequence.

# The first run of this cell will be slower; the subsequent runs will be fast.
# This is because on the first run, the TensorFlow XLA graph is compiled, and
# then is reused.
with graph.as_default():
  confidences_by_class = sess.run(
      class_confidence_signature_tensor_name,
      {
          # Note that this function accepts a batch of sequences which
          # can speed up inference when running on many sequences.
          # Here the batch holds a single sequence, hence the one-element lists.
          sequence_input_tensor_name: [residues_to_one_hot(globin_domain)],
          sequence_lengths_input_tensor_name: [len(globin_domain)],
      }
  )

In [11]:
# Shape of the array fed to the model:
# (number of sequences in batch, sequence length, vocabulary size).
np.array([residues_to_one_hot(globin_domain)]).shape

(1, 101, 20)

In [12]:
# Confidences returned by the model: one row per input sequence; the column
# index is later looked up in the vocabulary to get a family accession.
confidences_by_class

array([[3.7189863e-20, 4.8992849e-21, 3.2680125e-21, ..., 1.8260855e-19,
        1.8322259e-19, 1.8438370e-19]], dtype=float32)

## Map the model's prediction to a Pfam family accession

In [13]:
# Load vocab: position i holds the family accession for output index i.
with open('trained_model_pfam_32.0_vocab.json') as f:
  vocab = json.load(f)

In [14]:
# Find what the most likely class is.
# np.argmax without an axis works on the flattened array, so the result is the
# class index only because this batch contains a single sequence.
np.argmax(confidences_by_class)

8505

In [15]:
# Look up the accession of the most likely class for the (only) sequence in
# the batch, instead of hard-coding its index. PF00042 is family Globin.
vocab[np.argmax(confidences_by_class[0])]

'PF00042'

## Predicting for many sequences: run inference on a batch instead of one-by-one to make it faster

In [16]:
hemoglobin = 'MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGKKVADALTNAVAHVDDMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTPAVHASLDKFLASVSTVLTSKYR'
# Python slices are 0-indexed and exclude the end index, so this keeps
# indices 6 through 106 (101 residues).
globin_domain = hemoglobin[6:107]

# Coronavirus spike glycoprotein S2 (PF01601)
covid_spike_protein_domain = "NSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIM"

In [17]:
# One-hot encode each domain, then zero-pad every array to the length of the
# longest sequence so that they can be fed to the model as a single batch.
one_hot_sequence_inputs = [
    residues_to_one_hot(domain)
    for domain in (globin_domain, covid_spike_protein_domain)
]

max_len_within_batch = max(len(globin_domain), len(covid_spike_protein_domain))
padded_sequence_inputs = [
    pad_one_hot_sequence(one_hot, max_len_within_batch)
    for one_hot in one_hot_sequence_inputs
]

In [18]:
# The first run of this cell will be slower; the subsequent runs will be fast.
# This is because on the first run, the TensorFlow XLA graph is compiled, and
# then is reused.
with graph.as_default():
  confidences_by_class = sess.run(
      class_confidence_signature_tensor_name,
      feed_dict={
          # Padded one-hot arrays, all of the same length.
          sequence_input_tensor_name: padded_sequence_inputs,
          # Unpadded length of each sequence, in the same order as above.
          sequence_lengths_input_tensor_name: [
              len(domain)
              for domain in (globin_domain, covid_spike_protein_domain)
          ],
      })

In [19]:
# Row 0 is the prediction for the hemoglobin globin domain; PF00042 is family Globin.
vocab[np.argmax(confidences_by_class[0])]

'PF00042'

In [20]:
# Row 1 is the prediction for the coronavirus spike domain; PF01601 is Coronavirus spike glycoprotein S2.
vocab[np.argmax(confidences_by_class[1])]

'PF01601'

# Compute embedding of domain

In [21]:
# The 'pooled_representation' signature provides the embedding output; keep
# the name of its output tensor so it can be fetched with sess.run.
embedding_signature = saved_model.signature_def['pooled_representation']
embedding_signature_tensor_name = embedding_signature.outputs['output'].name

In [22]:
# The first run of this cell will be slower; the subsequent runs will be fast.
# This is because on the first run, the TensorFlow XLA graph is compiled, and
# then is reused.
with graph.as_default():
  embedding = sess.run(
      embedding_signature_tensor_name,
      feed_dict={
          # Note that this function accepts a batch of sequences which
          # can speed up inference when running on many sequences.
          # Here the batch holds a single sequence.
          sequence_input_tensor_name: [residues_to_one_hot(globin_domain)],
          sequence_lengths_input_tensor_name: [len(globin_domain)],
      },
  )

In [23]:
# Shape of embedding is (number of sequences in batch, number of features in
# the embedding space).
embedding.shape

(1, 1100)