In [None]:
# Copyright 2023 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

In [None]:
!pip install --upgrade https://github.com/google-deepmind/nuclease-design.git


In [None]:
import functools

import pandas as pd

from nuclease_design import utils
from nuclease_design import constants


In [None]:
# Data location handed to `utils.load_all_data` in the next cell; re-exported
# from the package constants so it can be overridden in one place.
DATA_DIR = constants.DATA_DIR

In [None]:
# Load the raw table, then expand it with the project helper.
# NOTE(review): presumably the expanded frame has one row per
# (variant, sublibrary) pair — confirm against `utils.expand_sublibraries`.
all_data_df = utils.load_all_data(DATA_DIR)
df = utils.expand_sublibraries(all_data_df)

# Label every row with the campaign derived from its sublibrary name.
df['campaign'] = df['sublibrary_name'].map(
    utils.sublibrary_name_to_campaign_name
)

In [None]:
# Number of distinct sequences for each (campaign, generation) pair.
df.groupby(['campaign', 'generation'])['sequence'].nunique()

In [None]:

# This is cached in case we want to re-run the next cell, which is expensive.
@functools.cache
def _hamming_distance(a, b):
  return sum(a != b for a, b in zip(a, b))


In [None]:
def _get_min_distance_to_set(query_sequence, reference_sequences: pd.Series):
  """Returns the smallest `_hamming_distance` between the query and any reference.

  Args:
    query_sequence: Sequence to compare against every reference.
    reference_sequences: Series of sequences to measure the query against.

  Returns:
    The minimum, over `reference_sequences`, of the distance to
    `query_sequence`, as computed by `Series.min`.
  """
  # The query is passed as keyword `b`, so the memoized helper sees the same
  # call signature on every invocation.
  distances_to_query = reference_sequences.apply(
      lambda reference: _hamming_distance(reference, b=query_sequence)
  )
  return distances_to_query.min()


# Integer generation index parsed from the second character of each
# `generation` label.
# NOTE(review): only a single character is read, so this assumes single-digit
# generations — confirm against the label format in the data.
df['generation_index'] = df['generation'].map(
    lambda generation_label: int(generation_label[1])
)


def get_num_unmatched_but_close_to_hr(df, generation_index):
  """Counts 'unmatched' rows that are exactly distance 1 from an 'HR' sequence.

  Both campaigns are restricted to rows whose `generation_index` equals the
  given value.

  Args:
    df: DataFrame with `campaign`, `generation_index` and `sequence` columns.
    generation_index: Generation that both campaigns are restricted to.

  Returns:
    The number of 'unmatched' rows in that generation whose minimum distance
    (per `_get_min_distance_to_set`) to that generation's 'HR' sequences is 1.
  """
  in_generation = df['generation_index'] == generation_index
  hr_sequences = df.loc[in_generation & (df['campaign'] == 'HR'), 'sequence']
  unmatched_sequences = df.loc[
      in_generation & (df['campaign'] == 'unmatched'), 'sequence'
  ]
  min_distances = unmatched_sequences.apply(
      lambda sequence: _get_min_distance_to_set(
          sequence, reference_sequences=hr_sequences
      )
  )
  return min_distances.eq(1).sum()


def get_hr_stats(df):
  """Builds a per-generation summary table for the 'HR' campaign.

  For generations 2, 3 and 4 the table records how many variants were used as
  input and how many 'HR' rows exist in that generation:

  * Generation 2: the input count is the number of 'epPCR' rows in `df`.
  * Later generations: the input count is the number of 'HR' rows in the
    previous generation plus the number of previous-generation 'unmatched'
    rows returned by `get_num_unmatched_but_close_to_hr`.

  Args:
    df: DataFrame with `campaign`, `generation_index` and `sequence` columns.

  Returns:
    DataFrame with one row per generation and the columns `generation`,
    `total_num_variants_used_as_input`,
    `num_data_points_in_current_generation` and
    `total_num_variants_considered_so_far` (running sum of the input counts).
  """
  is_hr = df['campaign'] == 'HR'

  def _num_hr_rows(generation_index):
    # Number of 'HR' rows that belong to the given generation.
    return (is_hr & (df['generation_index'] == generation_index)).sum()

  records = []
  for generation in (2, 3, 4):
    if generation == 2:
      # The first HR round has no earlier HR generation to draw on.
      num_close_unmatched = 0
      num_campaign_inputs = (df['campaign'] == 'epPCR').sum()
    else:
      previous_generation = generation - 1
      num_close_unmatched = get_num_unmatched_but_close_to_hr(
          df, previous_generation
      )
      num_campaign_inputs = _num_hr_rows(previous_generation)

    records.append({
        'generation': generation,
        'total_num_variants_used_as_input': (
            num_close_unmatched + num_campaign_inputs
        ),
        'num_data_points_in_current_generation': _num_hr_rows(generation),
    })

  stats = pd.DataFrame(records)
  stats['total_num_variants_considered_so_far'] = stats[
      'total_num_variants_used_as_input'
  ].cumsum()
  return stats


# Display the HR summary table as this cell's output.
get_hr_stats(df)

In [None]:
def get_ml_stats(df):
  """Builds a per-generation summary table for the 'ML' campaign.

  For generations 2, 3 and 4 the input count is the number of rows in `df`
  (from any campaign) whose `generation_index` is strictly smaller than the
  generation, and the data-point count is the number of 'ML' rows in that
  generation.

  Args:
    df: DataFrame with `campaign` and `generation_index` columns.

  Returns:
    DataFrame with one row per generation and the columns `generation`,
    `total_num_variants_used_as_input` and
    `num_data_points_in_current_generation`.
  """
  generation_index = df['generation_index']
  is_ml = df['campaign'] == 'ML'

  def _summarize(generation):
    # One output record for a single generation.
    return {
        'generation': generation,
        'total_num_variants_used_as_input': (
            generation_index < generation
        ).sum(),
        'num_data_points_in_current_generation': (
            is_ml & (generation_index == generation)
        ).sum(),
    }

  return pd.DataFrame([_summarize(generation) for generation in (2, 3, 4)])


# Display the ML summary table as this cell's output.
get_ml_stats(df)

In [None]:
# Row counts per sublibrary name among rows whose campaign is 'other'.
df[df['campaign'] == 'other']['sublibrary_name'].value_counts()