# License

Licensed under the Apache License, Version 2.0 (the "License")
```
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
```

# Setup

In [None]:
# Uncomment to install the covid_vhh_design package

# !pip install git+https://github.com/google-research/google-research.git#subdirectory=covid_vhh_design

# Imports

In [None]:
from IPython.display import display
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
from covid_vhh_design import covid
from covid_vhh_design import helper
from covid_vhh_design import models

In [None]:
# IPython magic (only valid inside a notebook/IPython session): render inline
# matplotlib figures using the 'retina' figure format.
%config InlineBackend.figure_format = 'retina'

# Widen pandas' text output so that full sequences and larger tables are shown:
# 200-character display width, no limit on column width (None), and up to
# 200 rows.
pd.set_option('display.width', 200)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 200)

# Load the model

In [None]:
# Placeholder for global variables: the cells below attach the notebook's shared
# state (random state, model, encoder, data and predictions) to `G` as
# attributes.
G = helper.Bunch()

In [None]:
# Random state to control randomness (seeded with 0); it is passed to the
# baseline-sequence generators defined below.
G.random_state = np.random.RandomState(0)

# The combined regressor/classifier models
G.model = models.CombinedModel.load()

# Encoder to onehot- and AAIndex-encode amino acid sequences
G.encoder = models.SequenceEncoder()

# BLI sequences used for making predictions below (loaded from 'bli_v2.csv';
# the `label` and `source_seq` columns are read later in this notebook).
G.bli = covid.load_df('bli_v2.csv')

# Input sequences
Input sequences must be 125 amino acids long (the same length as VHH-72). Only the 20 natural amino acids are allowed; gaps and special amino acids are not supported.

As an example, we make predictions for the parent sequence VHH-72, the best designed sequence (the one with the lowest BLI KD binding value), and several baseline sequences obtained by randomly mutating or shuffling the parent sequence, or by sampling amino acids at random.

In [None]:
def shuffle_sequence(sequence: str, random_state: np.random.RandomState) -> str:
  """Returns the characters of `sequence` rearranged in a random order.

  Args:
    sequence: The sequence whose characters are permuted.
    random_state: Source of randomness; a single `permutation` draw is taken
      from it.

  Returns:
    A string of the same length and composition as `sequence`.
  """
  # One draw of a random ordering of all positions.
  order = random_state.permutation(len(sequence))
  shuffled = [sequence[index] for index in order]
  return ''.join(shuffled)


def get_random_sequence(
    length: int, random_state: np.random.RandomState
) -> str:
  """Samples a sequence of `length` residues from `models.AMINO_ACIDS`.

  Args:
    length: Number of residues to sample.
    random_state: Source of randomness used for the draw.

  Returns:
    The sampled residues joined into a single string. Residues are drawn with
    replacement, so the same amino acid may occur multiple times.
  """
  residues = random_state.choice(models.AMINO_ACIDS, size=length, replace=True)
  return ''.join(residues)


def mutate_sequence(
    sequence: str, num_mutations: int, random_state: np.random.RandomState
) -> str:
  """Returns a copy of `sequence` with `num_mutations` point substitutions.

  Positions are drawn without replacement, so exactly `num_mutations` distinct
  positions are changed. The residue at each selected position is replaced by
  a different amino acid drawn from `models.AMINO_ACIDS`.

  Args:
    sequence: Amino acid sequence to mutate.
    num_mutations: Number of distinct positions to substitute.
    random_state: Source of randomness for both the positions and the
      replacement residues.

  Returns:
    The mutated sequence, which has the same length as `sequence`.

  Raises:
    ValueError: If `num_mutations` is larger than `len(sequence)`; raised by
      `random_state.choice` when sampling positions without replacement.
  """
  mutant = list(sequence)
  positions = random_state.choice(len(sequence), num_mutations, replace=False)
  for pos in positions:
    # Sort the candidates before sampling: the iteration order of a set of
    # strings depends on Python's per-process hash randomization, so sampling
    # from `list(set(...))` is not reproducible across sessions even when
    # `random_state` is seeded.
    candidates = sorted(set(models.AMINO_ACIDS) - {sequence[pos]})
    mutant[pos] = random_state.choice(candidates)
  return ''.join(mutant)

In [None]:
# Sequences to be scored, keyed by the label used in the tables and plots below.
# NOTE: the randomized entries all draw from the shared `G.random_state`, so the
# order of the entries determines which sequences are generated.
G.seqs = dict(
    # Best BLI sequence
    best=G.bli.query('label == "Seq1"')['source_seq'].iloc[0],
    # Parent sequence
    parent=covid.PARENT_SEQ,
    # Single mutant
    mutant1=mutate_sequence(covid.PARENT_SEQ, 1, G.random_state),
    # Double mutant
    mutant2=mutate_sequence(covid.PARENT_SEQ, 2, G.random_state),
    # Triple mutant
    mutant3=mutate_sequence(covid.PARENT_SEQ, 3, G.random_state),
    # Parent shuffled
    shuffled=shuffle_sequence(covid.PARENT_SEQ, G.random_state),
    # Randomly sampled amino acids
    random=get_random_sequence(len(covid.PARENT_SEQ), G.random_state),
)
# Display the label -> sequence mapping as the cell output.
G.seqs

# Making predictions
Predictions can be made with `models.score_labeled_sequences`, which calls `encoder.encode_sequences(sequences)` to encode the sequences and `model.predict` to predict binding scores, and returns a `DataFrame` with the predictions.

We use
* `G.model.regressor.predict`: to predict normalized binding scores with the regressor. A binding score is a positive float, where greater values indicate stronger binding. Specifically, predicted scores correspond to inverted AlphaSeq log KD values (score = 5.073646 - log KD), where 5.073646 is the maximum normalized log KD value in the training dataset.
* `G.model.classifier.predict`: to predict binding probabilities between 0 (no binding) and 1 (binding) with the classifier.
* `G.model.predict`: to make predictions with the combined regressor/classifier model. Output values correspond to `G.model.regressor.predict(x) * G.model.classifier.predict(x)`.


In [None]:
def plot_scores(scores: pd.DataFrame) -> None:
  """Draws a grouped bar chart of model predictions.

  Args:
    scores: Wide table of predictions. After `reset_index()` it must contain a
      `label` column; every other column is plotted as one group on the x-axis,
      with one bar per label.
  """
  # Reshape to long format: one row per (label, target_name) pair.
  long_scores = pd.melt(
      scores.reset_index(),
      id_vars='label',
      var_name='target_name',
      value_name='value',
  )

  fig, axes = plt.subplots(figsize=(15, 5))
  sns.barplot(
      data=long_scores,
      x='target_name',
      y='value',
      hue='label',
      palette='tab10',
      ax=axes,
  )
  axes.set_xlabel('')
  axes.set_ylabel('score')

  # Draw the canvas before reading the tick labels back, then re-apply them
  # rotated and right-aligned.
  fig.canvas.draw()
  tick_labels = axes.get_xticklabels()
  axes.set_xticklabels(tick_labels, rotation=30, ha='right')

  # Anchor the legend's upper-left corner at the axes' upper-right corner.
  axes.legend(bbox_to_anchor=(1.0, 1.0), loc='upper left', ncol=1, frameon=True)

In [None]:
# Predict combined regressor/classifier scores for the labeled sequences in
# `G.seqs`, then show them as a bar chart and as a table.
G.scores = models.score_labeled_sequences(G.model, G.encoder, G.seqs)
plot_scores(G.scores)
display(G.scores)

In [None]:
# Predict binding scores with the regressor only (`G.model.regressor`) for the
# labeled sequences in `G.seqs`, then show them as a bar chart and as a table.
G.reg_scores = models.score_labeled_sequences(G.model.regressor, G.encoder, G.seqs)
plot_scores(G.reg_scores)
display(G.reg_scores)

In [None]:
# Predict binding probabilities with the classifier only (`G.model.classifier`)
# for the labeled sequences in `G.seqs`, then show them as a bar chart and as a
# table.
# Output values are probabilities with `proba=True`, and 0/1 with `proba=False`.
G.cla_scores = models.score_labeled_sequences(G.model.classifier, G.encoder, G.seqs, proba=True)
plot_scores(G.cla_scores)
display(G.cla_scores)