In [None]:
# Copyright 2023 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

In [None]:
# Comment out if running from local environment
!pip install --upgrade git+https://github.com/google-deepmind/nuclease_design.git

# Imports

In [1]:
import functools

import pandas as pd

from nuclease_design import utils
from nuclease_design import constants


# Constants

In [2]:
# Data location, taken from the nuclease_design package constants; it is the
# argument handed to utils.load_all_data.
DATA_DIR = constants.DATA_DIR

In [3]:
# Load the dataset found under DATA_DIR.
all_data_df = utils.load_all_data(DATA_DIR)
# presumably produces one row per (variant, sublibrary) membership — TODO
# confirm against utils.expand_sublibraries.
df = utils.expand_sublibraries(all_data_df)
# Label every row with the campaign that its sublibrary name maps to.
df['campaign'] = df['sublibrary_name'].apply(utils.sublibrary_name_to_campaign_name)

In [4]:
# Number of distinct sequences within each (campaign, generation) group.
df.groupby(['campaign', 'generation'])['sequence'].nunique()

campaign   generation
HR         g2             951
           g3            1712
           g4            1540
ML         g2            9363
           g3            9108
           g4            8577
epPCR      g1            9441
other      g2              44
           g3              44
           g4            1735
unmatched  g2            4365
           g3            7796
           g4            2360
zero_shot  g4            1235
Name: sequence, dtype: int64

In [5]:

# This is cached in case we want to re-run the next cell, which is expensive.
@functools.cache
def _hamming_distance(a, b):
  return sum(a != b for a, b in zip(a, b))


In [6]:
def _get_min_distance_to_set(query_sequence, reference_sequences: pd.Series):
  """Returns the smallest Hamming distance from the query to any reference.

  Args:
    query_sequence: Sequence compared against every reference.
    reference_sequences: Series holding the sequences to search over.

  Returns:
    The minimum of `_hamming_distance(reference, b=query_sequence)` taken over
    all entries of `reference_sequences`.
  """
  distances_to_query = reference_sequences.apply(
      lambda reference: _hamming_distance(reference, b=query_sequence)
  )
  return distances_to_query.min()


# Integer form of the generation label, e.g. 'g2' -> 2. Only the second
# character is read, so labels are expected to carry a single digit.
df['generation_index'] = df['generation'].map(
    lambda generation_label: int(generation_label[1])
)


def get_num_unmatched_but_close_to_hr(df, generation_index):
  """Counts 'unmatched' rows at Hamming distance 1 from an 'HR' sequence.

  Both the 'unmatched' and the 'HR' sequences are taken from the same
  generation, the one selected by `generation_index`.

  Args:
    df: DataFrame with 'campaign', 'generation_index' and 'sequence' columns.
    generation_index: Integer generation whose rows are compared.

  Returns:
    The number of 'unmatched' rows in that generation whose minimum Hamming
    distance to the generation's 'HR' sequences equals 1.
  """
  in_generation = df['generation_index'] == generation_index
  hr_seqs = df.loc[(df['campaign'] == 'HR') & in_generation, 'sequence']
  unmatched_seqs = df.loc[
      (df['campaign'] == 'unmatched') & in_generation, 'sequence'
  ]
  min_distances = unmatched_seqs.apply(
      _get_min_distance_to_set, reference_sequences=hr_seqs
  )
  is_single_step_away = min_distances == 1
  return is_single_step_away.sum()


def get_hr_stats(df):
  """Builds a per-generation summary table for the HR campaign.

  For generation 2 the inputs are all 'epPCR' rows. For later generations the
  inputs are the previous generation's 'HR' rows plus the previous generation's
  'unmatched' rows that sit at Hamming distance 1 from an 'HR' sequence.

  Args:
    df: DataFrame with 'campaign', 'generation_index' and 'sequence' columns.

  Returns:
    DataFrame with one row per generation (2, 3, 4) and the columns
    'generation', 'total_num_variants_used_as_input',
    'num_data_points_in_current_generation' and
    'total_num_variants_considered_so_far' (running total of the inputs).
  """
  is_hr = df['campaign'] == 'HR'

  def _count_hr_rows(generation_index):
    # Number of HR rows belonging to the given generation.
    return (is_hr & (df['generation_index'] == generation_index)).sum()

  records = []
  for generation in (2, 3, 4):
    if generation > 2:
      previous_generation = generation - 1
      num_close_unmatched = get_num_unmatched_but_close_to_hr(
          df, previous_generation
      )
      num_inputs = num_close_unmatched + _count_hr_rows(previous_generation)
    else:
      # The first HR round has no earlier HR data; it starts from epPCR rows.
      num_inputs = 0 + (df['campaign'] == 'epPCR').sum()

    records.append({
        'generation': generation,
        'total_num_variants_used_as_input': num_inputs,
        'num_data_points_in_current_generation': _count_hr_rows(generation),
    })

  stats = pd.DataFrame(records)
  running_total = stats['total_num_variants_used_as_input'].cumsum()
  stats['total_num_variants_considered_so_far'] = running_total
  return stats


# Display the per-generation summary table for the HR campaign.
get_hr_stats(df)

Unnamed: 0,generation,total_num_variants_used_as_input,num_data_points_in_current_generation,total_num_variants_considered_so_far
0,2,9441,951,9441
1,3,1159,1712,10600
2,4,2652,1540,13252


In [7]:
def get_ml_stats(df):
  """Builds a per-generation summary table for the ML campaign.

  Args:
    df: DataFrame with 'campaign' and 'generation_index' columns.

  Returns:
    DataFrame with one row per generation (2, 3, 4) and the columns
    'generation', 'total_num_variants_used_as_input' (rows of `df`, from any
    campaign, whose generation index is lower than the current one) and
    'num_data_points_in_current_generation' ('ML' rows in that generation).
  """
  is_ml = df['campaign'] == 'ML'
  generation_index = df['generation_index']
  return pd.DataFrame([
      {
          'generation': generation,
          'total_num_variants_used_as_input': (
              generation_index < generation
          ).sum(),
          'num_data_points_in_current_generation': (
              is_ml & (generation_index == generation)
          ).sum(),
      }
      for generation in (2, 3, 4)
  ])


# Display the per-generation summary table for the ML campaign.
get_ml_stats(df)

Unnamed: 0,generation,total_num_variants_used_as_input,num_data_points_in_current_generation
0,2,9441,9370
1,3,24173,9557
2,4,43284,8588


In [8]:
# Row counts per sublibrary among the rows labeled with the 'other' campaign.
df[df['campaign'] == 'other']['sublibrary_name'].value_counts()

sublibrary_name
prosar+screen_g2_redux        1464
g4_g3_hit_constituents         241
g2_stratified_sample            40
g3_g1_stratified_sample         38
g4_mbo_seeds                    27
g4_g3_plate_assay_variants      14
g4_homolog_graft                11
g3_g2_plate_assay_variants       6
g2_g1_plate_assay_variants       5
g2_wt_synonyms                   1
g3_wt_synonyms                   1
g3_a73r_synonyms                 1
g4_wt_synonyms                   1
g4_a73r_synonyms                 1
g4_double_synonyms               1
g4_quad_synonyms                 1
Name: count, dtype: int64