##### Copyright 2020 Google LLC.


Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

In [None]:
import os
import numpy
import pandas
from six.moves import zip
from sklearn import mixture
import gzip

!pip install python-Levenshtein

import Levenshtein



## Code to fit GMM

In [None]:
# Reference amino-acid sequence; edit distances further down are measured
# against it (the name suggests the wild-type tile-21 sequence — TODO confirm).
R1_TILE21_WT_SEQ = 'DEEEIRTTNPVATEQYGSVSTNLQRGNR'

# Covariance type to use in Gaussian Mixture Model.
_COVAR_TYPE = 'full'
# Number of components to use in Gaussian Mixture Model.
_NUM_COMPONENTS = 2

class BinningLabeler(object):
  """Assigns integer bin labels to values using a set of cutoffs.

  Cutoffs are held as a sorted array.  With n cutoffs there are n + 1 labels,
  0 through n.  Binning follows numpy.digitize with right=False, so a value
  equal to a cutoff lands in the bin above it: label 0 covers values below
  cutoffs[0], label i covers [cutoffs[i - 1], cutoffs[i]), and label n covers
  values at or above cutoffs[-1].
  """

  def __init__(self, cutoffs):
    """Stores the cutoffs in ascending order.

    Args:
      cutoffs: (numpy.ndarray or list or numeric) values to bin data at.  A
        bare number is treated as a single cutoff.

    Raises:
      ValueError: If no cutoff(s) (i.e. an empty list) is provided.
    """
    cutoff_array = numpy.atleast_1d(cutoffs)
    # Reject an empty cutoff set up front; there would be nothing to bin on.
    if cutoff_array.size == 0:
      raise ValueError('Invalid cutoffs. At least one cutoff value required.')
    self._cutoffs = numpy.sort(cutoff_array)

  def predict(self, values):
    """Returns the bin label for each input value.

    Args:
      values: (numpy.ndarray or numeric) Value(s) to infer a label on.

    Returns:
      Labels in [0, 1, . . ., n], where n = len(cutoffs), one per input value.
      Labels follow the cutoffs in ascending order; a value equal to a cutoff
      receives the higher of the two adjacent labels.
    """
    labels = numpy.digitize(values, bins=self._cutoffs, right=False)
    return labels


class TwoGaussianMixtureModelLabeler(object):
  """Emits class labels from a Gaussian Mixture fit to the given input data.

  Input data is encoded as 1-D arrays.  Allows for an optional ambiguous label
  between the two modelled Gaussian distributions. Without the optional
  ambiguous category, the two labels are:
     0 - For values more likely derived from the Gaussian with smaller mean
     2 - For values more likely derived from the Gaussian with larger mean

  When allowing for an ambiguous category the three labels are:
     0 - For values more likely derived from the Gaussian with smaller mean
     1 - For values which fall within an ambiguous probability cutoff.
     2 - For values more likely derived from the Gaussian with larger mean
  """

  def __init__(self, data):
    """Fits the mixture model to the data.

    Args:
      data: (numpy.ndarray or list) Input data to model with Gaussian Mixture.
        Input data is presumed to be in the form [x1, x2, ...., xn].
    """
    # Wrap and transpose so the 1-D input becomes a single-column 2-D array,
    # which is the layout passed to the mixture model.
    self._data = numpy.array([data]).T
    self._gmm = mixture.GaussianMixture(
        n_components=_NUM_COMPONENTS,
        covariance_type=_COVAR_TYPE).fit(self._data)

    # Map each fitted component index to the rank of its mean, so that the
    # Gaussian with the smaller mean gets rank 0.  argsort lists component
    # indices in order of increasing mean; enumerating it yields the explicit
    # component -> rank mapping instead of relying on a two-element
    # permutation being its own inverse.
    components_by_mean = numpy.argsort(self._gmm.means_[:, 0]).tolist()
    self._label_by_index = {
        component: rank for rank, component in enumerate(components_by_mean)}
    # otypes is given so that an empty input yields an empty integer array
    # rather than an error from numpy.vectorize.
    self._label_by_index_fn = numpy.vectorize(
        lambda x: self._label_by_index[x], otypes=[int])

  def predict(self, values=None, probability_cutoff=0.):
    """Provides model labels for input value(s) using the GMM.

    Args:
      values: (array or single float value) Value(s) to infer a label on.
        When values=None, predictions are run on the data the model was fit
        on (self._data).
      probability_cutoff: (float) Probability between 0 and 1 to identify which
        values correspond to ambiguous labels.  At probability_cutoff=0
        (default) only the original two-state predictions are returned.

    Returns:
      A numpy array with one label per input value.  Labels are 0 and 2 if
      probability_cutoff = 0, and 0, 1, 2 otherwise.  0 corresponds to the
      gaussian with smaller mean, 1 corresponds to the ambiguous label, and 2
      corresponds to the gaussian with larger mean.
    """
    if values is None:
      # Fall back to the training data, already stored as a column.
      samples = self._data
    else:
      samples = numpy.array([numpy.atleast_1d(values)]).T

    predictions = self._label_by_index_fn(self._gmm.predict(samples))
    # Re-map the initial 0,1 predictions to 0,2.
    predictions *= 2
    if probability_cutoff > 0:
      # A value is ambiguous when even its most likely component falls below
      # the requested probability.
      max_probas = numpy.max(self._gmm.predict_proba(samples), axis=1)
      ambiguous_values = max_probas < probability_cutoff

      # Set ambiguous label as 1.
      predictions[ambiguous_values] = 1
    return predictions

## Load validation experiment dataframe

In [None]:
# Read the gzipped GAS1 table and rename its columns to the names used in the
# rest of this notebook.
with gzip.open('GAS1_target_20190516.csv.gz', 'rb') as gas1_file:
    gas1 = pandas.read_csv(gas1_file, index_col=None).rename(columns={
        'aa': 'sequence',
        'mask': 'mutation_sequence',
        'mut': 'num_mutations',
        'category': 'partition',
    })

# Untouched snapshot of the renamed table, for comparison below if needed.
gas1_orig = gas1.copy()

gas1.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,sequence,partition,chip,control,is_wt_aa,is_wt_nt,mutation_sequence,num_mutations,rep_i,rep_original,rep_total,EK269_GAS1_p1_rep1a_plasmid_x,EK269_GAS1_p1_rep1b_plasmid_x,EK269_GAS1_p1_rep1c_plasmid_x,EK269_GAS1_p1_rep1d_plasmid_x,EK269_GAS1_v3_rep1a_virus,EK269_GAS1_v3_rep1b_virus,EK269_GAS1_v3_rep1c_virus,EK269_GAS1_v3_rep1d_virus,EK269_GAS1_v4_rep2a_virus,EK269_GAS1_v4_rep2b_virus,EK269_GAS1_v4_rep2c_virus,EK269_GAS1_v4_rep2d_virus,EK269_GAS1_v5_rep3a_virus,EK269_GAS1_v5_rep3b_virus,EK269_GAS1_p1_rep1a_plasmid_y,EK269_GAS1_p1_rep1b_plasmid_y,EK269_GAS1_p1_rep1c_plasmid_y,EK269_GAS1_p1_rep1d_plasmid_y,old_EK269_GAS1_v3_rep1a_virus,old_EK269_GAS1_v3_rep1b_virus,old_EK269_GAS1_v3_rep1c_virus,old_EK269_GAS1_v3_rep1d_virus,old_EK269_GAS1_v4_rep2a_virus,old_EK269_GAS1_v4_rep2b_virus,old_EK269_GAS1_v4_rep2c_virus,old_EK269_GAS1_v4_rep2d_virus,old_EK269_GAS1_v5_rep3a_virus,old_EK269_GAS1_v5_rep3b_virus,old_EK269_GAS1_v5_rep3c_virus,old_EK269_GAS1_v5_rep3d_virus,GAS1_plasmid_N,v0_GAS1_plasmid_N,v1_GAS1_plasmid_N,v0_GAS1_virus_N,v1_GAS1_virus_N,GAS1_virus_N,v0_GAS1_plasmid_F,v1_GAS1_plasmid_F,GAS1_plasmid_F,v0_GAS1_virus_F,v1_GAS1_virus_F,GAS1_virus_F,v0_GAS1_virus_S,v1_GAS1_virus_S,GAS1_virus_S
0,0,GACGAGGACGAAATCAGGACAACCAATCCCGTGGCTACGGAGCAGT...,DEDEIRTTNPVATEQYGSVSTNLQDnGnNdR,rnn_designed_plus_rand_train_walked,1,0,0,0,__D_____________________Dn_n_d_,5,1,1,1,13,89,51,47,176,47,53,211,78,188,84,29,45,42,6.0,8.0,8.0,8.0,16.0,17.0,16.0,12.0,17.0,18.0,19.0,8.0,21.0,16.0,10.0,20.0,230.0,30.0,200,190.0,953,1143.0,4e-06,3e-06,3e-06,6e-06,5e-06,5e-06,0.579811,0.561301,0.572274
1,1,GACGAGGACGAAATCAGGACAACCAATCCCGTGGCTACGGAGCAGT...,DEDEIRTTNPVATEQYGAVSTNLQGdGNdR,rnn_designed_plus_rand_train_walked,1,0,0,0,__D______________A______Gd__d_,5,1,1,1,29,129,100,63,439,131,130,563,269,513,270,98,88,85,11.0,12.0,13.0,13.0,36.0,47.0,34.0,50.0,41.0,51.0,25.0,20.0,27.0,36.0,42.0,47.0,370.0,49.0,321,456.0,2586,3042.0,7e-06,5e-06,6e-06,1.4e-05,1.4e-05,1.4e-05,1.135026,1.318902,1.298578
2,2,GACGAAGAGGAAATCGCTACAACCAATCCCGTGGCTACGGAGCAGT...,DEEEIATTNPVATEQYGSVSTNLQHdGDeR,rnn_designed_plus_rand_train_walked,1,0,0,0,_____A__________________Hd_De_,5,1,1,1,27,121,91,65,171,66,50,176,356,648,338,128,46,42,12.0,9.0,14.0,11.0,14.0,12.0,8.0,12.0,44.0,53.0,36.0,42.0,13.0,14.0,18.0,20.0,350.0,46.0,304,286.0,2021,2307.0,6e-06,5e-06,5e-06,9e-06,1.1e-05,1e-05,0.553155,1.041751,0.979746
3,3,GACGAACACGAAATCAGGACAACCAATCCCGTGGCTACGGAGCAGT...,DEHEIRTTNPVATEQYGNVSTNLQGgGdNR,rnn_designed_plus_rand_train_walked,1,0,0,0,__H______________N______Gg_d__,5,1,1,1,66,288,243,190,830,244,238,979,651,1158,588,185,241,229,12.0,11.0,37.0,32.0,76.0,64.0,63.0,50.0,88.0,89.0,86.0,52.0,73.0,81.0,87.0,91.0,879.0,92.0,787,900.0,5343,6243.0,1.2e-05,1.3e-05,1.3e-05,2.9e-05,2.8e-05,2.8e-05,1.207065,1.072039,1.087459
4,4,GACGAACATGAAATCAGGACAACCAATCCCGTGGCTACGGAGCAGT...,DEHEIRTTNPVATEQYGSVSTNLQpGGNDg,rnn_designed_plus_rand_train_walked,1,0,0,0,__H_____________________pG__Dg,5,1,1,1,36,172,122,97,552,188,157,616,487,845,457,146,75,76,15.0,17.0,10.0,10.0,50.0,59.0,22.0,29.0,63.0,65.0,77.0,57.0,18.0,26.0,37.0,24.0,479.0,52.0,427,527.0,3599,4126.0,7e-06,7e-06,7e-06,1.7e-05,1.9e-05,1.9e-05,1.258065,1.384113,1.365801


#### Validate that the N->F (count-to-frequency) columns were computed as expected

In [None]:
# Each *_F column should equal its *_N column divided by that column's total.
for _freq_col, _count_col in (
    ('GAS1_plasmid_F', 'GAS1_plasmid_N'),
    ('GAS1_virus_F', 'GAS1_virus_N'),
):
    _counts = gas1[_count_col]
    numpy.testing.assert_allclose(gas1[_freq_col], _counts / _counts.sum())

### Filter sequences with insufficient plasmids

#### Find zero-plasmid sequences

In [None]:
# Boolean mask of rows whose plasmid count is exactly zero; the cell displays
# how many there are.
zero_plasmids_mask = gas1['GAS1_plasmid_N'].eq(0)
zero_plasmids_mask.sum()

446

#### Find low-plasmid count sequences
Selection values for these sequences are noisier and therefore less reliable.

In [None]:
# Boolean mask of rows with a plasmid count below 10, excluding the rows
# already flagged as zero-count; the cell displays how many there are.
low_plasmids_mask = gas1['GAS1_plasmid_N'].lt(10) & ~zero_plasmids_mask
low_plasmids_mask.sum()

1887

#### Drop sequences that don't meet the plasmid count bars

In [None]:
# A row is dropped if it falls in either the zero-count or the low-count set.
seqs_to_remove = zero_plasmids_mask | low_plasmids_mask
seqs_to_remove.sum()

2333

In [None]:
# Record the row count before filtering so the number of dropped rows can be
# checked afterwards.
num_seqs_before_plasmid_filter = len(gas1)
num_seqs_before_plasmid_filter

243481

In [None]:
# Keep only the rows that pass the plasmid-count filter, as an independent copy.
gas1 = gas1.loc[~seqs_to_remove].copy()

In [None]:
# Number of rows removed by the plasmid-count filter.
num_seqs_before_plasmid_filter - len(gas1)

2333

In [None]:
# Number of rows remaining after the plasmid-count filter.
len(gas1)

241148

###  Add pseudocounts

In [None]:
# Constant added to every raw count before frequencies are recomputed.
PSEUDOCOUNT = 1


def counts_to_frequency(counts):
  """Normalizes counts by their total.

  Args:
    counts: Array-like of counts exposing a .sum() method (e.g. a
      pandas.Series or numpy.ndarray).

  Returns:
    The counts divided element-wise by their sum.
  """
  total = counts.sum()
  return counts / total


# Add the pseudocount to the raw virus and plasmid counts.
gas1['virus_N'] = gas1.GAS1_virus_N + PSEUDOCOUNT
gas1['plasmid_N'] = gas1.GAS1_plasmid_N + PSEUDOCOUNT

# Renormalize the adjusted counts to frequencies.
gas1['virus_F'] = counts_to_frequency(gas1.virus_N)
gas1['plasmid_F'] = counts_to_frequency(gas1.plasmid_N)

### Compute viral selection 

In [None]:
# Selection score: log2 ratio of virus frequency to plasmid frequency.
gas1['viral_selection'] = numpy.log2(gas1['virus_F'] / gas1['plasmid_F'])
# Sanity checks: no score may be missing or infinite.
assert not gas1['viral_selection'].isna().any()
assert not numpy.isinf(gas1['viral_selection']).any()
gas1['viral_selection'].describe()

count    241148.000000
mean         -2.601064
std           3.393798
min         -11.176109
25%          -5.814133
50%          -2.479348
75%           0.761311
max           5.935761
Name: viral_selection, dtype: float64

### Compute GMM threshold

In [None]:
# Fit the GMM on the selection scores of the 'stop' and 'wild_type' partitions
# only (presumably the control sequences — confirm), then label every row.
gmm_model = TwoGaussianMixtureModelLabeler(
    gas1[gas1.partition.isin(['stop', 'wild_type'])].viral_selection)
gas1['viral_selection_gmm'] = gmm_model.predict(gas1.viral_selection)

# Compute the threshold for the viable class from the GMM labels: the lowest
# selection score among rows assigned label 2.
selection_coeff_threshold = gas1.loc[gas1.viral_selection_gmm == 2, 'viral_selection'].min()
print('selection coeff cutoff = %.3f' % selection_coeff_threshold)

# Add a label column
def is_viable_mutant(mutant_data):
  """Returns whether the selection score exceeds the GMM-derived threshold.

  Args:
    mutant_data: A single row (pandas.Series) or a whole pandas.DataFrame
      with a 'viral_selection' entry/column.

  Returns:
    A bool for a single row, or a boolean pandas.Series for a DataFrame.
  """
  # NOTE(review): the comparison is strict, so the row whose score defines the
  # threshold is itself labelled non-viable — confirm this is intended.
  return mutant_data['viral_selection'] > selection_coeff_threshold

# Apply the comparison to the whole column at once instead of calling the
# function once per row via DataFrame.apply(axis=1); the result is identical.
gas1['is_viable'] = is_viable_mutant(gas1)

print(gas1.is_viable.mean())

selection coeff cutoff = -2.811
0.5094091595202946


----

### De-dupe model-designed sequences

#### Partition the sequences that should not be de-duped

Split off the partitions for which we want to retain replicates, such as controls/etc.



In [None]:
# Partition labels treated as model-generated; rows in these partitions are
# the ones de-duplicated below.
ml_generated_seqs = [
    'cnn_designed_plus_rand_train_seed',
    'cnn_designed_plus_rand_train_walked',
    'cnn_rand_doubles_plus_single_seed',
    'cnn_rand_doubles_plus_single_walked',
    'cnn_standard_seed',
    'cnn_standard_walked',
    'lr_designed_plus_rand_train_seed',
    'lr_designed_plus_rand_train_walked',
    'lr_rand_doubles_plus_single_seed',
    'lr_rand_doubles_plus_single_walked',
    'lr_standard_seed',
    'lr_standard_walked',
    'rnn_designed_plus_rand_train_seed',
    'rnn_designed_plus_rand_train_walked',
    'rnn_rand_doubles_plus_singles_seed',
    'rnn_rand_doubles_plus_singles_walked',
    'rnn_standard_seed',
    'rnn_standard_walked',
]

# Split the table into model-generated rows and all remaining rows, each as
# an independent copy.
is_ml_generated_mask = gas1.partition.isin(ml_generated_seqs)
ml_gen_df = gas1[is_ml_generated_mask].copy()
non_ml_gen_df = gas1[~is_ml_generated_mask].copy()

# Row counts per model-generated partition.
ml_gen_df.partition.value_counts()

rnn_standard_walked                     21337
cnn_designed_plus_rand_train_walked     21251
rnn_designed_plus_rand_train_walked     21218
lr_standard_walked                      20951
cnn_rand_doubles_plus_single_walked     20936
cnn_standard_walked                     20878
rnn_rand_doubles_plus_singles_walked    20623
lr_rand_doubles_plus_single_walked      20473
lr_designed_plus_rand_train_walked      20164
lr_rand_doubles_plus_single_seed         2071
rnn_designed_plus_rand_train_seed        2065
rnn_rand_doubles_plus_singles_seed       2045
lr_designed_plus_rand_train_seed         2030
cnn_rand_doubles_plus_single_seed        2022
lr_standard_seed                         1989
cnn_standard_seed                        1924
rnn_standard_seed                        1916
cnn_designed_plus_rand_train_seed        1898
Name: partition, dtype: int64

In [None]:
# Keep one row per distinct sequence: the replicate with the largest
# pseudocounted plasmid count (the first such row wins ties).  idxmax is
# computed per group in one vectorized pass instead of running a Python
# lambda for every group.  The result is re-indexed by 'sequence' while
# keeping the column, matching the layout groupby(...).apply produced.
ml_gen_deduped = (
    ml_gen_df
    .loc[ml_gen_df.groupby('sequence')['plasmid_N'].idxmax().values]
    .set_index('sequence', drop=False))

In [None]:
# Show the shape and the first rows of the de-duplicated frame.
display(ml_gen_deduped.shape)
ml_gen_deduped.head()

(201426, 65)

Unnamed: 0_level_0,Unnamed: 0,Unnamed: 0.1,sequence,partition,chip,control,is_wt_aa,is_wt_nt,mutation_sequence,num_mutations,rep_i,rep_original,rep_total,EK269_GAS1_p1_rep1a_plasmid_x,EK269_GAS1_p1_rep1b_plasmid_x,EK269_GAS1_p1_rep1c_plasmid_x,EK269_GAS1_p1_rep1d_plasmid_x,EK269_GAS1_v3_rep1a_virus,EK269_GAS1_v3_rep1b_virus,EK269_GAS1_v3_rep1c_virus,EK269_GAS1_v3_rep1d_virus,EK269_GAS1_v4_rep2a_virus,EK269_GAS1_v4_rep2b_virus,EK269_GAS1_v4_rep2c_virus,EK269_GAS1_v4_rep2d_virus,EK269_GAS1_v5_rep3a_virus,EK269_GAS1_v5_rep3b_virus,EK269_GAS1_p1_rep1a_plasmid_y,EK269_GAS1_p1_rep1b_plasmid_y,EK269_GAS1_p1_rep1c_plasmid_y,EK269_GAS1_p1_rep1d_plasmid_y,old_EK269_GAS1_v3_rep1a_virus,old_EK269_GAS1_v3_rep1b_virus,old_EK269_GAS1_v3_rep1c_virus,old_EK269_GAS1_v3_rep1d_virus,old_EK269_GAS1_v4_rep2a_virus,old_EK269_GAS1_v4_rep2b_virus,old_EK269_GAS1_v4_rep2c_virus,old_EK269_GAS1_v4_rep2d_virus,old_EK269_GAS1_v5_rep3a_virus,old_EK269_GAS1_v5_rep3b_virus,old_EK269_GAS1_v5_rep3c_virus,old_EK269_GAS1_v5_rep3d_virus,GAS1_plasmid_N,v0_GAS1_plasmid_N,v1_GAS1_plasmid_N,v0_GAS1_virus_N,v1_GAS1_virus_N,GAS1_virus_N,v0_GAS1_plasmid_F,v1_GAS1_plasmid_F,GAS1_plasmid_F,v0_GAS1_virus_F,v1_GAS1_virus_F,GAS1_virus_F,v0_GAS1_virus_S,v1_GAS1_virus_S,GAS1_virus_S,virus_N,plasmid_N,virus_F,plasmid_F,viral_selection,viral_selection_gmm,is_viable
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1
AAEEIATTNPVATEQYGSVcAaNmGEeApDaQaEgGd,82181,GCAGCCGAGGAAATCGCTACAACCAATCCCGTGGCTACGGAGCAGT...,AAEEIATTNPVATEQYGSVcAaNmGEeApDaQaEgGd,cnn_designed_plus_rand_train_walked,1,0,0,0,AA___A_____________cAaNmGEeApDaQaEgGd,21,1,1,1,36,129,94,91,322,94,94,378,277,557,277,87,61,63,14.0,15.0,15.0,12.0,19.0,35.0,23.0,26.0,43.0,35.0,31.0,11.0,21.0,29.0,32.0,18.0,406.0,56.0,350,323.0,2210,2533.0,8e-06,6e-06,6e-06,1.025716e-05,1.163842e-05,1.144194e-05,0.444881,0.967444,0.90045,2534.0,407.0,1.14347e-05,6e-06,0.900998,2,True
AAEEIATTNPVAtYAQWGCnTAgNnGLTtAeTgNlDs,85396,GCGGCTGAGGAAATCGCGACAACCAATCCCGTGGCTACCTATGCTC...,AAEEIATTNPVAtYAQWGCnTAgNnGLTtAeTgNlDs,cnn_designed_plus_rand_train_walked,1,0,0,0,AA___A______tYA_W_CnTAgNnG_TtAeTg_lDs,24,1,1,1,20,118,79,68,4,1,0,6,8,5,5,3,1,0,6.0,4.0,8.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,309.0,24.0,285,2.0,33,35.0,3e-06,5e-06,5e-06,6.35118e-08,1.737864e-07,1.581002e-07,-5.668117,-4.801599,-4.883025,36.0,310.0,1.624504e-07,5e-06,-4.843517,0,False
AAEEIFTTNPEALEQVwASCSeRnPfQhLGsPEI,172406,GCTGCGGAGGAAATCTTTACAACCAATCCCGAGGCTCTCGAGCAGG...,AAEEIFTTNPEALEQVwASCSeRnPfQhLGsPEI,lr_rand_doubles_plus_single_walked,1,0,0,0,AA___F____E_L__VwA_C_eRnPfQhLGsPEI,22,1,1,1,17,67,44,51,0,0,1,1,0,3,0,0,0,0,6.0,2.0,6.0,5.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,198.0,19.0,179,1.0,5,6.0,3e-06,3e-06,3e-06,3.17559e-08,2.633127e-08,2.71029e-08,-6.331082,-6.853063,-6.785239,7.0,199.0,3.158758e-08,3e-06,-6.566587,0,False
AAEEIHPTNPEALEQViASCSeRnPfQhLGGEaH,172429,GCCGCGGAGGAAATCCACCCGACCAATCCCGAAGCTCTTGAGCAGG...,AAEEIHPTNPEALEQViASCSeRnPfQhLGGEaH,lr_rand_doubles_plus_single_walked,1,0,0,0,AA___HP___E_L__ViA_C_eRnPfQhLG_EaH,22,1,1,1,35,245,181,150,2,1,2,2,2,3,3,2,1,0,19.0,23.0,27.0,11.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,691.0,80.0,611,4.0,18,22.0,1.1e-05,1e-05,1e-05,1.270236e-07,9.479256e-08,9.93773e-08,-6.405082,-6.776279,-6.713956,23.0,692.0,1.037878e-07,1e-05,-6.648384,0,False
AAEEIHPTNPEALEQVwASCAeRnPfQhLGGEI,172420,GCGGCTGAGGAAATCCATCCAACCAATCCCGAAGCGCTAGAGCAGG...,AAEEIHPTNPEALEQVwASCAeRnPfQhLGGEI,lr_rand_doubles_plus_single_walked,1,0,0,0,AA___HP___E_L__VwA_CAeRnPfQhLG_EI,22,1,1,1,42,204,143,130,5,1,1,3,5,5,3,2,2,1,17.0,19.0,16.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,581.0,62.0,519,2.0,28,30.0,8e-06,9e-06,9e-06,6.35118e-08,1.474551e-07,1.355145e-07,-7.037351,-5.903411,-6.016349,31.0,582.0,1.398879e-07,9e-06,-5.967996,0,False


#### Concatenate de-duped ML-generated seqs with the rest

In [None]:
# Stack the de-duplicated model-generated rows on top of the rows from all
# other partitions, then report the shape and per-partition row counts.
gas1_deduped = pandas.concat([ml_gen_deduped, non_ml_gen_df], axis=0)
print(gas1_deduped.shape)
gas1_deduped.partition.value_counts()

(236783, 65)


random_doubles                          25040
rnn_standard_walked                     20838
cnn_designed_plus_rand_train_walked     20759
rnn_designed_plus_rand_train_walked     20731
lr_standard_walked                      20456
cnn_rand_doubles_plus_single_walked     20454
cnn_standard_walked                     20395
rnn_rand_doubles_plus_singles_walked    20154
lr_rand_doubles_plus_single_walked      19999
lr_designed_plus_rand_train_walked      19680
singles                                  3952
previous_chip_viable                     2997
previous_chip_nonviable                  2997
lr_rand_doubles_plus_single_seed         2071
rnn_designed_plus_rand_train_seed        2065
rnn_rand_doubles_plus_singles_seed       2045
lr_designed_plus_rand_train_seed         2030
cnn_rand_doubles_plus_single_seed        2022
lr_standard_seed                         1989
cnn_standard_seed                        1924
rnn_standard_seed                        1916
cnn_designed_plus_rand_train_seed 

## Compute edit distance for chip

In [None]:
# Rebind gas1 to the de-duplicated frame.  This is the same object, not a
# copy, so columns added to gas1 afterwards also appear on gas1_deduped.
gas1 = gas1_deduped

In [None]:
# Levenshtein distance of each sequence from R1_TILE21_WT_SEQ.
# NOTE(review): the comparison is presumably case-sensitive, so lowercase
# residues in `sequence` would count as edits — confirm this is intended.
gas1['num_edits'] = gas1.sequence.apply(
    lambda s: Levenshtein.distance(R1_TILE21_WT_SEQ, s))
gas1.num_edits.describe()

count    236783.000000
mean         13.563174
std           7.489709
min           0.000000
25%           7.000000
50%          14.000000
75%          20.000000
max          33.000000
Name: num_edits, dtype: float64

In [None]:
# Columns kept in the output dataset; used to subset each chip's frame.
COLUMN_SCHEMA = [ 
    'sequence',
    'partition',
    'mutation_sequence',
    'num_mutations',
    'num_edits',    
    'viral_selection',
    'is_viable',
]

# Independent copy of gas1 restricted to the schema columns.
gas1a = gas1[COLUMN_SCHEMA].copy()

#### Concat with training data chip

In [None]:
# Load the training-data chip and rename its columns to the shared schema.
harvard = pandas.read_csv(
    'r0r1_with_partitions_and_labels.csv', index_col=None,
).rename(columns={
    'S': 'viral_selection',
    'aa_seq': 'sequence',
    'mask': 'mutation_sequence',
    'mut': 'num_mutations',
})

# Relabel the 'min_fit', 'thresh' and 'temp' partitions as 'designed'.
designed_mask = harvard.partition.isin(['min_fit', 'thresh', 'temp'])
harvard.loc[designed_mask, 'partition'] = 'designed'

# Levenshtein distance of each sequence from R1_TILE21_WT_SEQ.
harvard['num_edits'] = harvard.sequence.apply(
    lambda seq: Levenshtein.distance(R1_TILE21_WT_SEQ, seq))
# Not the last expression of the cell, so this summary is not displayed.
harvard.num_edits.describe()

# Independent copy restricted to the shared schema columns.
harvard1 = harvard[COLUMN_SCHEMA].copy()

harvard1.head(3)

Unnamed: 0,sequence,partition,mutation_sequence,num_mutations,num_edits,viral_selection,is_viable
0,ADEEIRATNPIATEMYGSVSTNLQLGNR,designed,AD____A___I___M_________L___,6,6,-2.027259,False
1,ADEEIRATNPVATEQYGSVSTNQQRQNR,designed,AD____A_______________Q__Q__,5,5,-0.429554,True
2,ADEEIRTTNPVATEQWGGVSTNLQIGNY,designed,AD_____________W_G______I__Y,6,6,-0.527843,True


In [None]:
# Tag each row with its source chip before stacking the two frames.
harvard1['chip'] = 'harvard'
gas1a['chip'] = 'gas1'

# Row-wise concatenation; sort=False leaves the column order unsorted.
combined = pandas.concat([
    harvard1, 
    gas1a,
], axis=0, sort=False)
print(combined.shape)
combined.partition.value_counts()

(304152, 8)


designed                                56372
random_doubles                          25040
rnn_standard_walked                     20838
cnn_designed_plus_rand_train_walked     20759
rnn_designed_plus_rand_train_walked     20731
lr_standard_walked                      20456
cnn_rand_doubles_plus_single_walked     20454
cnn_standard_walked                     20395
rnn_rand_doubles_plus_singles_walked    20154
lr_rand_doubles_plus_single_walked      19999
lr_designed_plus_rand_train_walked      19680
rand                                     9885
singles                                  3952
previous_chip_viable                     2997
previous_chip_nonviable                  2997
lr_rand_doubles_plus_single_seed         2071
rnn_designed_plus_rand_train_seed        2065
rnn_rand_doubles_plus_singles_seed       2045
lr_designed_plus_rand_train_seed         2030
cnn_rand_doubles_plus_single_seed        2022
lr_standard_seed                         1989
cnn_standard_seed                 

In [None]:
# Preview the first rows of the combined frame.
combined.head()

Unnamed: 0,sequence,partition,mutation_sequence,num_mutations,num_edits,viral_selection,is_viable,chip
0,ADEEIRATNPIATEMYGSVSTNLQLGNR,designed,AD____A___I___M_________L___,6,6,-2.027259,False,harvard
1,ADEEIRATNPVATEQYGSVSTNQQRQNR,designed,AD____A_______________Q__Q__,5,5,-0.429554,True,harvard
2,ADEEIRTTNPVATEQWGGVSTNLQIGNY,designed,AD_____________W_G______I__Y,6,6,-0.527843,True,harvard
3,ADEEIRTTNPVATEQYGEVSTNLQRGNR,designed,AD_______________E__________,3,3,2.887908,True,harvard
4,ADEEIRTTNPVATEQYGSVSTNLQRGNR,designed,AD__________________________,2,2,0.57573,True,harvard
