In [1]:
# Copyright 2023 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

In [2]:
!pip install --upgrade https://github.com/google-deepmind/nuclease-design.git


In [3]:
import tempfile

import numpy as np
import pandas as pd

from nuclease_design import cnn_model
from nuclease_design import constants
from nuclease_design import utils


# Copy the models to a local directory

In [4]:
# The model-loading code requires the model files to be present on local disk,
# so create a fresh temporary directory to hold them.
# Note: tempfile.mkdtemp() makes a new directory every time this cell runs and
# never removes it automatically; delete it by hand when it is no longer needed.
LOCAL_DATA_DIR = tempfile.mkdtemp()

In [5]:
# Copy all of the models into LOCAL_DATA_DIR; the cells below load them from
# this directory via their `data_dir` argument.
cnn_model.copy_all_models_to_local_dir(LOCAL_DATA_DIR)

# Understand the model outputs

The model outputs a vector of predicted probabilities, one for each of the four activity classes listed below; each entry is the predicted probability that the input sequence's activity belongs to that class.

See the comments for `cnn_model.OUTPUT_CLASSES` for an explanation of the distinction between `cnn_model.OUTPUT_CLASSES` and `constants.LANDSCAPE_ACTIVITY_LEVELS`.

In [6]:
# Display the class labels. Later cells index this tuple with the argmax of a
# prediction row, so entry i names the class of output column i.
cnn_model.OUTPUT_CLASSES

('<WT', 'WT', '>WT', '>=A73R')

# Apply a single model to the wildtype sequence

In [7]:
# Load one trained CNN (index 0) from the local copy and wrap it in a
# prediction function.
model = cnn_model.load_cnn_model(data_dir=LOCAL_DATA_DIR, model_index=0)
predict_fn = cnn_model.get_predict_fn(model)

# The prediction function takes a batch of sequences, so the wildtype is
# passed as a one-element list. The result has one row per input sequence and
# one probability per entry of cnn_model.OUTPUT_CLASSES.
wildtype_batch = [constants.FULL_REFERENCE_SEQ]
predict_fn(wildtype_batch)



array([[1.2467912e-03, 9.9805295e-01, 9.1224230e-08, 7.0020667e-04]],
      dtype=float32)

# Apply the ensemble of 5 models to the wildtype sequence

In [8]:
# Load the full ensemble from the local copy and build a single prediction
# function on top of it.
ensemble_models = cnn_model.load_cnn_ensemble(data_dir=LOCAL_DATA_DIR)
ensemble_predict_fn = cnn_model.get_ensemble_predict_fn(ensemble_models)

# As with the single model, the input is a batch (here a one-element list
# holding the wildtype sequence) and the output has one row per sequence.
wildtype_only = [constants.FULL_REFERENCE_SEQ]
ensemble_predict_fn(wildtype_only)



array([[2.5281562e-03, 9.9700385e-01, 1.7654112e-06, 4.6627864e-04]],
      dtype=float32)

# Apply the ensemble model to its training data from G1, G2, and G3



In [9]:
# Load the landscape table. The cells below read its 'generations',
# 'sequence' and 'activity_level' columns.
df = utils.load_landscape()

In [10]:
# Keep every row whose 'generations' entry is not exactly ('g4',), i.e. drop
# the variants that are recorded only for G4.
not_g4_only = df['generations'] != ('g4',)

# reset_index() renumbers the rows from 0 and keeps the previous index as a
# column; the fresh 0..n-1 index is what lets a plain pd.Series of predictions
# line up row-for-row when it is assigned as a new column later.
g123_df = df.loc[not_g4_only].reset_index()

In [11]:
# Score every G1-G3 sequence with the ensemble: one row of class
# probabilities per sequence.
predicted_probabilities = ensemble_predict_fn(g123_df['sequence'])

# Position of the most probable class in each row.
int_predictions = np.argmax(predicted_probabilities, axis=1)

# Translate each class position into its label from cnn_model.OUTPUT_CLASSES.
# The labels are wrapped in a Series with a default 0..n-1 index, which is
# aligned against g123_df's index on assignment.
predicted_labels = [cnn_model.OUTPUT_CLASSES[i] for i in int_predictions]
g123_df['predicted_activity_level'] = pd.Series(predicted_labels)



## Compare the activity label from the experiment to the activity label predicted by the model

We performed model selection using a train-test split of this data, but then we re-trained the model on all of the data. The model has therefore already seen
every one of these sequences during training, so we expect it to have reasonably high accuracy here.

Note that many of the off-diagonal elements likely result from the slight difference between the labeling scheme used to train the model and the labeling
scheme in the final landscape file. For example, `activity_greater_than_A73R` is a higher bar than `>=A73R` (i.e., 'not significantly worse than A73R').

In [12]:
# Count sequences for every (experimental label, predicted label) pair:
# rows are the labels from the landscape file, columns are the model's
# predicted classes.
label_vs_prediction_counts = pd.crosstab(
    g123_df['activity_level'], g123_df['predicted_activity_level']
)

# Put rows and columns in the canonical order defined by the package rather
# than the alphabetical order crosstab produces, then display the table.
row_order = list(constants.LANDSCAPE_ACTIVITY_LEVELS)
column_order = list(cnn_model.OUTPUT_CLASSES)
label_vs_prediction_counts.loc[row_order, column_order]

predicted_activity_level,<WT,WT,>WT,>=A73R
activity_level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
non-functional,24596,511,53,6
activity_greater_than_0,912,6705,1256,166
activity_greater_than_WT,183,278,4408,1521
activity_greater_than_A73R,2,1,63,91
