In [1]:
# Copyright 2023 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

In [2]:
# Uncomment the line below to install the package (e.g. on a hosted runtime);
# leave it commented out when running from a local environment.
# !pip install --upgrade git+https://github.com/google-deepmind/nuclease_design.git

# Imports

In [3]:
import itertools
import pandas as pd
import pathlib

from nuclease_design import utils
from nuclease_design import constants


# Load data

In [4]:
# Location of the input data, taken from the package-level constants; passed
# to utils.load_all_data when the dataset is loaded.
DATA_DIR = constants.DATA_DIR

In [5]:
# Directory this notebook writes its outputs to (the relative path 'data').
# It is created up front, including any missing parents; an already existing
# directory is not an error.
LOCAL_OUTPUT_DIR = pathlib.Path('data')
LOCAL_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [6]:
%%time
# Load the full dataset from DATA_DIR. Later cells read its 'mutations',
# 'generation', 'sublibrary_names', 'num_mutations', 'sequence' and
# 'activity_greater_than_*' columns.
all_data_df = utils.load_all_data(DATA_DIR)

CPU times: user 623 ms, sys: 90.5 ms, total: 713 ms
Wall time: 3.94 s


# Aggregate data across generations

In [7]:
# Collapse the per-generation rows into a single row per variant (keyed by
# 'mutations'). Rows are ordered newest generation first, so the 'first'
# aggregation keeps the activity labels from the most recent generation.
# NOTE(review): pandas' groupby 'first' skips missing values by default, so a
# label that is missing in the newest generation would fall back to an older
# generation's value -- confirm this is the intended behavior.
def _concat_sublibrary_names(name_groups):
  """Flattens the per-row collections of sublibrary names into one tuple."""
  return tuple(itertools.chain.from_iterable(name_groups))


def _sorted_generations(generation_labels):
  """Returns all generation labels of a variant as a sorted tuple."""
  return tuple(sorted(generation_labels))


# 'generations' starts as a copy of 'generation' and is aggregated into a
# tuple below; 'generation' itself is only used as the sort key.
all_data_df['generations'] = all_data_df['generation']
all_data_df = all_data_df.sort_values(by='generation', ascending=False)

# Columns for which the value of the first (i.e. newest) row is kept.
first_value_cols = [
    'activity_greater_than_neg_control',
    'activity_greater_than_wt',
    'activity_greater_than_a73r',
    'num_mutations',
    'sequence',
]
agg_dict = {
    'sublibrary_names': _concat_sublibrary_names,
    'generations': _sorted_generations,
}
agg_dict.update({col: 'first' for col in first_value_cols})
df = all_data_df.groupby('mutations').agg(agg_dict).reset_index()

In [8]:
# Use the highest activity level found in the most recent generation.
def find_highest_activity_level(row, default_label='non-functional'):
  """Returns the name of the highest activity test that `row` passes.

  The tests are considered in the order negative control, WT, A73R, i.e. from
  the lowest to the highest activity level.

  Args:
    row: Mapping-like object (e.g. a DataFrame row) indexable by the three
      'activity_greater_than_{neg_control,wt,a73r}' column names. Each value
      may be True, False or missing (None / NaN / pd.NA); a missing value is
      treated as "test not passed".
    default_label: Label returned for variants considered non-functional.

  Returns:
    `default_label` if no test passed, or if the WT test is the only one that
    passed (the variant failed the negative-control test). Otherwise the
    column name of the highest test that passed. Note that a row passing the
    WT and A73R tests but not the negative-control test is still labeled with
    the A73R column.
  """
  references = ['neg_control', 'wt', 'a73r']
  cols = [f'activity_greater_than_{ref}' for ref in references]
  # pd.notna guards against NaN (which is truthy in Python) and pd.NA (whose
  # truth value raises), so a missing label can never count as a passed test.
  true_indices = [
      i for i, col in enumerate(cols)
      if pd.notna(row[col]) and bool(row[col])
  ]
  if not true_indices:
    return default_label
  if true_indices == [1]:
    # Only the WT test passed: the variant failed the negative-control test,
    # so it is labeled as non-functional.
    return default_label
  return cols[max(true_indices)]


# Label every variant, then make the label human-readable: upper-case the
# reference names and write the negative-control level as '0'
# (e.g. 'activity_greater_than_neg_control' -> 'activity_greater_than_0').
df['activity_level'] = (
    df.apply(find_highest_activity_level, axis=1)
    .str.replace('a73r', 'A73R')
    .str.replace('wt', 'WT')
    .str.replace('neg_control', '0')
)
# Every label other than the default 'non-functional' counts as functional.
df['is_functional'] = df['activity_level'] != 'non-functional'

In [9]:
# Columns written to the exported landscape CSV, in output order. The raw
# 'activity_greater_than_*' test columns are not exported; they are summarized
# by 'activity_level' and 'is_functional'.
cols_to_keep = [
    'mutations',
    'num_mutations',
    'sublibrary_names',
    'generations',
    'activity_level',
    'is_functional',
    'sequence',
]

In [10]:
# Write the per-variant landscape table as CSV under LOCAL_OUTPUT_DIR. The
# destination's parent directory is created before the file is opened.
# NOTE(review): assumes utils.open_file resolves constants.LANDSCAPE_PATH
# relative to LOCAL_OUTPUT_DIR, i.e. that it writes to `output_path` -- confirm.
output_path = LOCAL_OUTPUT_DIR.joinpath(pathlib.Path(constants.LANDSCAPE_PATH))
output_path.parent.mkdir(parents=True, exist_ok=True)

with utils.open_file(
    constants.LANDSCAPE_PATH, 'w', LOCAL_OUTPUT_DIR
) as landscape_file:
  encoded_landscape_df = utils.encode_df(df[cols_to_keep])
  encoded_landscape_df.to_csv(landscape_file, index=False)

# Conflicting labels across generations

In [11]:
# Row order used when displaying the crosstabs: activity levels from lowest
# ('non-functional') to highest (greater than A73R).
order = ['non-functional', 'activity_greater_than_0', 'activity_greater_than_WT', 'activity_greater_than_A73R']

Note that some variants have conflicting functional labels:

For the hypothesis tests corresponding to [neg_control, wt, a73r], there were 5 variants with [F, T, T] (labeled as activity > A73R), and 973 variants with [F, T, F] (labeled as non-functional).

In [12]:
# Final label vs. the negative-control test result. Any count in the 'False'
# column outside the 'non-functional' row is a conflicting label.
pd.crosstab(df['activity_level'], df['activity_greater_than_neg_control']).reindex(order)

activity_greater_than_neg_control,False,True
activity_level,Unnamed: 1_level_1,Unnamed: 2_level_1
non-functional,33890,0
activity_greater_than_0,0,11099
activity_greater_than_WT,0,10572
activity_greater_than_A73R,5,194


In [13]:
# Final label vs. the WT test result. Counts in the 'True' column of the
# 'non-functional' row are variants that passed the WT test but were still
# labeled non-functional.
pd.crosstab(df['activity_level'], df['activity_greater_than_wt']).reindex(order)

activity_greater_than_wt,False,True
activity_level,Unnamed: 1_level_1,Unnamed: 2_level_1
non-functional,32917,973
activity_greater_than_0,11099,0
activity_greater_than_WT,0,10572
activity_greater_than_A73R,0,199


In [14]:
# Final label vs. the A73R test result. Variants with a missing A73R label are
# not counted here, so the totals are smaller than in the other crosstabs.
pd.crosstab(df['activity_level'], df['activity_greater_than_a73r']).reindex(order)

activity_greater_than_a73r,False,True
activity_level,Unnamed: 1_level_1,Unnamed: 2_level_1
non-functional,15189,0
activity_greater_than_0,8442,0
activity_greater_than_WT,9941,0
activity_greater_than_A73R,0,199


In [15]:
# Count each observed combination of the three test outcomes. value_counts
# drops rows containing missing values by default, so variants without an
# A73R label are excluded.
df[['activity_greater_than_neg_control', 'activity_greater_than_wt', 'activity_greater_than_a73r']].value_counts()

activity_greater_than_neg_control  activity_greater_than_wt  activity_greater_than_a73r
False                              False                     False                         14216
True                               True                      False                          9941
                                   False                     False                          8442
False                              True                      False                           973
True                               True                      True                            194
False                              True                      True                              5
Name: count, dtype: int64

In [16]:
# ~22K variants were not assessed for activity relative to A73R;
# these correspond to variants seen only in g1/g2.
# explode() yields one row per (variant, generation), so a variant seen in
# both g1 and g2 is counted under each of them.
df[df['activity_greater_than_a73r'].isna()].generations.explode().value_counts()

generations
g2    14219
g1     8670
Name: count, dtype: int64

In [17]:
# 5 variants conflict in their a73r + wt vs neg_control label: they pass the
# A73R and WT tests but fail the negative-control test. Counts are grouped by
# the tuple of generations each variant was seen in.
df[df['activity_greater_than_a73r'] & df['activity_greater_than_wt'] & ~df['activity_greater_than_neg_control']].generations.value_counts()

generations
(g3,)    4
(g4,)    1
Name: count, dtype: int64

In [18]:
# 978 variants conflict in their wt vs neg_control label (pass the WT test
# but fail the negative-control test): the 973 variants labeled
# non-functional plus the 5 variants that also pass the A73R test.
df[df['activity_greater_than_wt'] & ~df['activity_greater_than_neg_control']].generations.value_counts()

generations
(g4,)           660
(g3,)           303
(g2, g4)          3
(g1, g2, g4)      3
(g1, g3)          3
(g3, g4)          2
(g2, g3)          2
(g1, g3, g4)      1
(g2, g3, g4)      1
Name: count, dtype: int64