This is meant to be a final validation of the sequences we've aggregated and plan on submitting to Twist for Chip 1.

The only thing left to do after this point is to stack the GFP and BLAC data frames and keep only the `id` and `dna_to_order` columns, which is the interface Twist needs.

In [1]:
import os
import sys
import warnings
import random
import copy
import pickle
import glob

import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

sys.path.append('../common')
import data_io_utils
import paths
import constants
import utils

sys.path.append('../A003_policy_optimization/')
import models
import A003_common

import A006_common
from unirep import babbler1900 as babbler
import sequence_selection


%reload_ext autoreload
%autoreload 2

In [2]:
# Paths of the cloning-validated aggregate frames for chip 1 (one pickle per
# protein), both under <S3_DATA_ROOT>/chip_1.
agg_gfp_file, agg_blac_file = (
    os.path.join(data_io_utils.S3_DATA_ROOT, 'chip_1', pkl_basename)
    for pkl_basename in (
        'A051a_mlpe-gfp-pilot_Data_Efficiency_Chip_1_GFP_Cloning_Validated.pkl',
        'A051a_mlpe-gfp-pilot_Data_Efficiency_Chip_1_Beta-Lactamase_Cloning_Validated.pkl',
    )
)

# Sync each pickle (single-file mode) through data_io_utils before it is read.
data_io_utils.sync_s3_path_to_local(agg_gfp_file, is_single_file=True)
data_io_utils.sync_s3_path_to_local(agg_blac_file, is_single_file=True)

In [3]:
# Load the aggregated per-protein frames from their pickles.
gfp_df = pd.read_pickle(agg_gfp_file)
blac_df = pd.read_pickle(agg_blac_file)

# Preview the first rows of each frame.
display(gfp_df.head(), blac_df.head())

# Stack row-wise, GFP rows first and BLAC rows last; row labels are kept as-is.
master_df = pd.concat([gfp_df, blac_df], axis='index')
print(master_df.shape)

Unnamed: 0,id,seq_idx,trajectory_idx,predicted_fitness,ensemble_predicted_fitness,seq,protein,pred_score,lib_aa_seq,constant_where_expected,successfully_clones,error_message,dna_to_order,ligated_gene,ligated_gene_translated,n_mut_rel_wt,oligo_len
0,GFP_SimAnneal-ET_Global_Init_1-0024-00-3e72164...,615.0,1516.0,0.835807,"[0.8386957, 0.8833945, 0.6869724, 0.7490394, 0...",MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKMTLKF...,GFP,,SGEGEGDATYGKMTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYP...,True,True,,GGGTCACGCGTAGGGGTCTCACGTGAGCGGCGAGGGTGAAGGTGAC...,ATGAGTAAAGGCGAAGAGCTGTTCACTGGTGTAGTCCCGATTCTGG...,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKMTLKF...,2,293
1,GFP_SimAnneal-ET_Global_Init_1-0024-00-3e72164...,250.0,922.0,0.831929,"[0.83363, 0.8859961, 0.6875845, 0.74543333, 0....",MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,GFP,,SGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVKCFSRYP...,True,True,,GGGTCACGCGTAGGGGTCTCACGTGAGCGGCGAGGGTGAAGGTGAC...,ATGAGTAAAGGCGAAGAGCTGTTCACTGGTGTAGTCCCGATTCTGG...,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,2,293
2,GFP_SimAnneal-ET_Global_Init_1-0024-00-3e72164...,250.0,195.0,0.830867,"[0.8327829, 0.87653375, 0.6890327, 0.73599446,...",MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,GFP,,SGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYP...,True,True,,GGGTCACGCGTAGGGGTCTCACGTGAGCGGCGAGGGTGAAGGTGAC...,ATGAGTAAAGGCGAAGAGCTGTTCACTGGTGTAGTCCCGATTCTGG...,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,0,293
3,GFP_SimAnneal-ET_Global_Init_1-0024-00-3e72164...,354.0,1847.0,0.82864,"[0.82065207, 0.8765222, 0.6913246, 0.747247, 0...",MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATMGKLTLKF...,GFP,,SGEGEGDATMGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYP...,True,True,,GGGTCACGCGTAGGGGTCTCACGTGAGCGGCGAGGGTGAAGGTGAC...,ATGAGTAAAGGCGAAGAGCTGTTCACTGGTGTAGTCCCGATTCTGG...,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATMGKLTLKF...,3,293
4,GFP_SimAnneal-ET_Global_Init_1-0024-00-3e72164...,933.0,163.0,0.825671,"[0.83166367, 0.8713553, 0.69725585, 0.7542225,...",MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDASYGKMTIKF...,GFP,,SGEGEGDASYGKMTIKFICTTGKLPVPWPTLVTTLSYGVQCFSRYP...,True,True,,GGGTCACGCGTAGGGGTCTCACGTGAGCGGCGAGGGTGAAGGTGAC...,ATGAGTAAAGGCGAAGAGCTGTTCACTGGTGTAGTCCCGATTCTGG...,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDASYGKMTIKF...,5,293


Unnamed: 0,id,seq_idx,trajectory_idx,predicted_fitness,ensemble_predicted_fitness,seq,protein,pred_score,lib_aa_seq,constant_where_expected,successfully_clones,error_message,dna_to_order,ligated_gene,ligated_gene_translated,n_mut_rel_wt,oligo_len
0,BLAC_SimAnneal-ET_Global_Init_1-0024-00-3e7216...,2595.0,1773.0,0.847448,"[0.7188464, 0.80541486, 0.83650917, 1.0177329,...",MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,BLAC,,ANLLLTSIGGPMELTHFLHNMGDHVTRLDRWEEHLNEAIPNDERDT...,True,True,,CGCGTCGAGTAGGGAAGACAATGCGGCCAACTTACTTCTGACAAGT...,ATGAGTATTCAACATTTCCGTGTCGCCCTTATTCCCTTTTTTGCGG...,MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,7.0,293.0
1,BLAC_SimAnneal-ET_Global_Init_1-0024-00-3e7216...,855.0,2580.0,0.84327,"[0.71885014, 0.805406, 0.8365229, 0.9768362, 0...",MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,BLAC,,ANCLLFTIGGPKELVAFLKNMGDHVTRLDRWNTELNEAIPNDERDA...,True,False,NoValidCodonFoundException(),,,,,
2,BLAC_SimAnneal-ET_Global_Init_1-0024-00-3e7216...,2269.0,1955.0,0.83894,"[0.7188311, 0.8054163, 0.8365104, 1.0334604, 0...",MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,BLAC,,ANLLLTTIGGPKELTAFLHNMGDHVTRLDKWHPELARSIPNDQRDT...,True,True,,CGCGTCGAGTAGGGAAGACAATGCGGCCAACTTACTTCTGACAACG...,ATGAGTATTCAACATTTCCGTGTCGCCCTTATTCCCTTTTTTGCGG...,MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,6.0,293.0
3,BLAC_SimAnneal-ET_Global_Init_1-0024-00-3e7216...,2720.0,1554.0,0.838937,"[0.7188321, 0.8054168, 0.8365092, 1.0048122, 0...",MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,BLAC,,ANLLLTTIGGPKELTAFLHNMGDHVTRHDQYKPQLNEAIPNDERDT...,True,True,,CGCGTCGAGTAGGGAAGACAATGCGGCCAACTTACTTCTGACAACG...,ATGAGTATTCAACATTTCCGTGTCGCCCTTATTCCCTTTTTTGCGG...,MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,7.0,293.0
4,BLAC_SimAnneal-ET_Global_Init_1-0024-00-3e7216...,2742.0,1929.0,0.838183,"[0.71883655, 0.8054126, 0.8365047, 1.0402751, ...",MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,BLAC,,ANLLLTTIGGPAELTVFTHNMGDHVTRLDSWNPELNEAIPNDTRDT...,True,True,,CGCGTCGAGTAGGGAAGACAATGCGGCCAACTTACTTCTGACAACG...,ATGAGTATTCAACATTTCCGTGTCGCCCTTATTCCCTTTTTTGCGG...,MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,7.0,293.0


(24137, 17)


In [4]:
UNIREP_BATCH_SIZE = 320


def load_base_model(protein, model_name):
    """Build the representation ("base") model for a protein / model-name pair.

    Args:
        protein: 'GFP' or 'BLAC'.
        model_name: 'ET_Global_Init_1', 'ET_Random_Init_1' or 'OneHot' for
            either protein; 'ET_Global_Init_2' is additionally accepted for GFP.

    Returns:
        For the ET_* models, a ``babbler`` constructed with
        ``UNIREP_BATCH_SIZE`` and the matching weight path from ``paths``.
        For 'OneHot', an object that is only used to produce one-hot
        representations (``models.OneHotRegressionModel`` for GFP, a small
        local encoder class for BLAC).

    Raises:
        AssertionError: if ``protein`` or ``model_name`` is not supported.
            Raised explicitly (rather than via ``assert``) so the check is not
            stripped when Python runs with ``-O``. Previously an unknown
            protein silently returned ``None``.
    """
    # Names of the ``paths`` attributes holding the weights for each supported
    # (protein, model) pair. Kept as names so only the requested path is
    # looked up on ``paths``.
    weight_path_attrs = {
        ('GFP', 'ET_Global_Init_1'): 'GFP_ET_GLOBAL_INIT_1_WEIGHT_PATH',
        ('GFP', 'ET_Global_Init_2'): 'GFP_ET_GLOBAL_INIT_2_WEIGHT_PATH',
        ('GFP', 'ET_Random_Init_1'): 'GFP_ET_RANDOM_INIT_1_WEIGHT_PATH',
        ('BLAC', 'ET_Global_Init_1'): 'BLAC_ET_GLOBAL_INIT_1_WEIGHT_PATH',
        ('BLAC', 'ET_Random_Init_1'): 'BLAC_ET_RANDOM_INIT_1_WEIGHT_PATH',
    }

    if protein not in ('GFP', 'BLAC'):
        raise AssertionError('Unsupported protein')

    if model_name == 'OneHot':
        if protein == 'GFP':
            # Just need it to generate one-hot reps.
            # Top model created within OneHotRegressionModel doesn't actually get used.
            return models.OneHotRegressionModel('EnsembledRidge')

        # Just need it to generate one-hot reps.
        # Doing it this way to be consistent with the GFP pipeline
        class BetaLacOneHotEncoder(object):
            def __init__(self):
                pass

            def encode_seqs(self, seqs):
                return utils.encode_aa_seq_list_as_matrix_of_flattened_one_hots(seqs)

        return BetaLacOneHotEncoder()

    weight_path_attr = weight_path_attrs.get((protein, model_name))
    if weight_path_attr is None:
        raise AssertionError('Unsupported base model')

    weight_path = getattr(paths, weight_path_attr)
    base_model = babbler(batch_size=UNIREP_BATCH_SIZE, model_path=weight_path)
    print('Loading weights from:', weight_path)
    return base_model

# Generate representations
def generate_reps(seq_list, base_model, sess):
    """Compute one representation per sequence with the given base model.

    If ``base_model``'s class is named 'babbler1900', its per-sequence hidden
    states (``get_all_hiddens(seq_list, sess)``) are averaged over axis 0 and
    stacked into one array. Any other model is treated as a one-hot model and
    its ``encode_seqs(seq_list)`` result is returned directly (``sess`` is
    unused in that case).
    """
    is_babbler = base_model.__class__.__name__ == 'babbler1900'
    if not is_babbler:
        # one hot model
        return base_model.encode_seqs(seq_list)

    per_seq_hiddens = base_model.get_all_hiddens(seq_list, sess)
    averaged_hiddens = [np.mean(seq_hiddens, axis=0) for seq_hiddens in per_seq_hiddens]
    return np.stack(averaged_hiddens, 0)

In [5]:
def get_all_seqs_using_base_model_mask(df, protein, base_model):
    """Boolean row mask selecting one protein / base-model combination.

    A row is selected when its 'protein' value equals ``protein`` and the
    string ``base_model`` occurs as a substring of its 'id' value.
    """
    def _id_mentions_base_model(seq_id):
        return base_model in seq_id

    id_matches = df['id'].apply(_id_mentions_base_model)
    is_requested_protein = df['protein'].eq(protein)

    return np.logical_and(id_matches, is_requested_protein)

def get_associated_sa_run_file(seq_id):
    """Locate the simulated annealing run pickle a sequence id belongs to.

    The run file name is the id with its last '-'-separated field dropped,
    plus '.p'. It is looked for under
    <S3_DATA_ROOT>/chip_1/simulated_annealing/GFP first, then under
    .../beta_lactamase.

    Returns the first path that exists on disk. If neither exists, returns a
    'Simulated annealing run file doesnt exist for <id>' message string
    (not a path) instead of raising.
    """
    # Everything before the final '-' (empty when the id contains no '-').
    run_basename = seq_id.rpartition('-')[0] + '.p'
    sa_root = os.path.join(data_io_utils.S3_DATA_ROOT, 'chip_1', 'simulated_annealing')

    for protein_dir in ('GFP', 'beta_lactamase'):
        candidate = os.path.join(sa_root, protein_dir, run_basename)
        if os.path.exists(candidate):
            return candidate

    return 'Simulated annealing run file doesnt exist for %s' % seq_id

In [None]:
# Record, for every sequence, the SA run pickle it came from (or the
# "doesnt exist" message get_associated_sa_run_file returns when none is found).
master_df['sa_run_file'] = master_df['id'].apply(get_associated_sa_run_file)

# Should just be Grigory's sequences.
n_without_sa_run_file = np.sum(
    master_df['sa_run_file'].apply(lambda x: 'Simulated annealing run file doesnt exist' in x))
print(n_without_sa_run_file)

# These are an indirect index into Grigory's sequences.
n_without_trajectory_idx = np.sum(np.isnan(master_df['trajectory_idx']))
print(n_without_trajectory_idx)

# The two counts above must match. Enforce it instead of relying on a visual
# comparison of the printed numbers.
assert n_without_sa_run_file == n_without_trajectory_idx, (
    'Sequences without an SA run file (%d) != sequences without a trajectory_idx (%d)'
    % (n_without_sa_run_file, n_without_trajectory_idx))

662
662


## Validate sequence selections

This involves:

1) Regenerating reps and top model predictions for every selected sequence and asserting that these from-scratch predictions match those stored in the aggregated dataframe (kept from the time of simulated annealing).

2) Validating that the sequence and trajectory indices for each sequence are indeed correct. Here we double-check that the fitness value and sequence we've extracted match a dirty re-extraction.

In [None]:
# (protein, base model) pairs whose selected sequences are re-validated below.
base_model_protein_combos = [
    ('GFP', 'ET_Global_Init_1'),
    ('GFP', 'ET_Global_Init_2'),
    ('GFP', 'ET_Random_Init_1'),
    ('GFP', 'OneHot'),
    ('BLAC', 'ET_Global_Init_1'),
    ('BLAC', 'ET_Random_Init_1'),
    ('BLAC', 'OneHot')
]

# Number of leading rows of each fitness history that are masked to -inf
# before looking for the best entry per trajectory.
A006h_BURNIN = 250

for prot, bm in base_model_protein_combos:
    print()
    print(prot, bm)

    if prot == 'GFP':
        wt_aa_seq = constants.AVGFP_AA_SEQ
    elif prot == 'BLAC':
        wt_aa_seq = constants.BETA_LAC_AA_SEQ
    else:
        assert False

    mask = get_all_seqs_using_base_model_mask(master_df, prot, bm)

    # All seqs for a given protein & base model.
    sub_df = master_df[mask]

    # Fresh TF graph per base model so models do not pile up in one graph.
    tf.reset_default_graph()
    base_model = load_base_model(prot, bm)

    # generate reps for all of the sequences
    with tf.Session() as sess:
        print('TF init')
        sess.run(tf.global_variables_initializer())

        # Now further subdivide by the simulated annealing run.
        # load the top model within each
        print('Subdividing by SA run')
        # NOTE(review): if any row in sub_df carries the "doesnt exist" message
        # instead of a real path, the open() below fails for it - confirm such
        # rows never match a (protein, base model) mask.
        sa_run_files = np.unique(sub_df['sa_run_file'])
        for sa_run_file in sa_run_files:
            print(sa_run_file)

            # NOTE: pickle.load can execute arbitrary code from the file; only
            # acceptable here because these are our own SA run outputs.
            with open(sa_run_file, 'rb') as f:
                res = pickle.load(f)

            top_model = res['top_model']

            sa_mask = np.array(sub_df['sa_run_file'] == sa_run_file)
            sa_sub_df = sub_df[sa_mask]

            print('Generating reps')
            reps = generate_reps(list(sa_sub_df['seq']), base_model, sess)
            yhat = top_model.predict(reps)

            print('Validating sequence selections using dirty orthogonal check')
            # Assert deviations in predictions vs the values we've stored
            # are small (numerically zero)
            assert np.std(yhat - sa_sub_df['predicted_fitness'])/np.std(sa_sub_df['predicted_fitness']) < 5e-4

            # Open up the fitness history from the SA run again, and reselect what we think
            # the most func seqs should be in a dirty manner. Make sure our officially selected
            # seqs & qfunc matches this dirty extraction
            fit_mat = np.array(res['sa_results']['fitness_history'])
            fit_mat[:A006h_BURNIN,:] = -np.inf
            seq_mat = np.array(res['sa_results']['seq_history'])

            # Both histories are indexed as [seq_idx, trajectory_idx].
            for i,r in sa_sub_df.iterrows():
                seq_idx = int(r['seq_idx'])
                traj_idx = int(r['trajectory_idx'])

                assert r['seq'] == seq_mat[seq_idx, traj_idx]
                assert r['predicted_fitness'] == fit_mat[seq_idx, traj_idx]
                assert np.all(fit_mat[seq_idx, traj_idx] >= fit_mat[:, traj_idx]) # we picked the best?

            # Best post-burn-in fitness of every trajectory, sorted descending,
            # truncated to the number of sequences selected from this run.
            top_qfunc = -np.sort(-np.max(fit_mat, axis=0))[:sa_sub_df.shape[0]]
            assert np.std(top_qfunc - sa_sub_df['predicted_fitness'])/np.std(sa_sub_df['predicted_fitness']) < 1e-4

            print('Hackily storing predicted WT qfunc')
            # If we're here, we're happy and we trust the top model and the base_model.
            # NOTE: HACKY!!
            # We haven't stored the predicted WT value previously. This should be done
            # in earlier notebooks like A006h. However, for convenience generate it here
            wt_rep = generate_reps([wt_aa_seq], base_model, sess)
            wt_yhat = top_model.predict(wt_rep).reshape(-1)

            # Strip only the trailing extension. str.replace('.p', ...) would
            # also rewrite any other '.p' occurring earlier in the path.
            output_sel_seq_dir = os.path.splitext(sa_run_file)[0] + '-selected_seqs'
            assert os.path.exists(output_sel_seq_dir), output_sel_seq_dir
            wt_out_file = os.path.join(output_sel_seq_dir, 'A006j_wt_pred_qfunc.npy')
            np.save(wt_out_file, wt_yhat)
            data_io_utils.sync_local_path_to_s3(wt_out_file, is_single_file=True)
            


GFP ET_Global_Init_1


  from ._conv import register_converters as _register_converters


Loading weights from: /notebooks/analysis/common/../../data/s3/evotuning_checkpoints/gfp/unirep_global_init_1
TF init
Subdividing by SA run
/notebooks/analysis/common/../../data/s3/chip_1/simulated_annealing/GFP/GFP_SimAnneal-ET_Global_Init_1-0024-00-3e721641.p
Generating reps
Validating sequence selections using dirty orthogonal check
Hackily storing predicted WT qfunc
/notebooks/analysis/common/../../data/s3/chip_1/simulated_annealing/GFP/GFP_SimAnneal-ET_Global_Init_1-0024-01-3a0e3d4.p
Generating reps
Validating sequence selections using dirty orthogonal check
Hackily storing predicted WT qfunc
/notebooks/analysis/common/../../data/s3/chip_1/simulated_annealing/GFP/GFP_SimAnneal-ET_Global_Init_1-0024-02-31e54146.p
Generating reps
Validating sequence selections using dirty orthogonal check
Hackily storing predicted WT qfunc
/notebooks/analysis/common/../../data/s3/chip_1/simulated_annealing/GFP/GFP_SimAnneal-ET_Global_Init_1-0024-03-3764e943.p
Generating reps
Validating sequence sele

Generating reps
Validating sequence selections using dirty orthogonal check
Hackily storing predicted WT qfunc
/notebooks/analysis/common/../../data/s3/chip_1/simulated_annealing/GFP/GFP_SimAnneal-OneHot-0024-01-1723ee07.p
Generating reps
Validating sequence selections using dirty orthogonal check
Hackily storing predicted WT qfunc
/notebooks/analysis/common/../../data/s3/chip_1/simulated_annealing/GFP/GFP_SimAnneal-OneHot-0024-02-bb99846.p
Generating reps
Validating sequence selections using dirty orthogonal check
Hackily storing predicted WT qfunc
/notebooks/analysis/common/../../data/s3/chip_1/simulated_annealing/GFP/GFP_SimAnneal-OneHot-0024-03-20ad79a0.p
Generating reps
Validating sequence selections using dirty orthogonal check
Hackily storing predicted WT qfunc
/notebooks/analysis/common/../../data/s3/chip_1/simulated_annealing/GFP/GFP_SimAnneal-OneHot-0024-04-4279eeb.p
Generating reps
Validating sequence selections using dirty orthogonal check
Hackily storing predicted WT qfunc

In [11]:
# Drop a marker file in the working directory noting that the notebook got this far.
with open('A006j_validation_worked.txt', mode='w') as marker_file:
    marker_file.write('it surree didd\n')

Some example pieces of DNA to clone in Geneious

GFP

In [12]:
# Row 34 by position; GFP rows come first in master_df.
master_df['dna_to_order'].iat[34]

'GGGTCACGCGTAGGGGTCTCACGTGAGCGGCGAGGGTGAAGGTGACGCAACTTATGGTAGCCTGACGCTGAAGTTCATCTGTACTACTGGTAAAATGCCGGTACCTTGGCCGACTCTGGTAACGACGCTGAGCTATGGTGTTCAGTGCTTTAGCCGTTATCCGGACCACATGAAGCAGCATGACTTCTTCAAGTCCGCCATGCCGGAAGGCTATGTGCAGGAACGCATGATTTTTTTTCGTGATGACGGCAATTACAAAACGCGTGCGGAAGTGAGACCGTGTGGCTGCGGAA'

BLAC

In [13]:
# 34th row from the end by position; BLAC rows come last in master_df.
master_df['dna_to_order'].iat[-34]

'CGCGTCGAGTAGGGAAGACAATGCGGCCAACTTACTTGATACAGCAATCGGAGGACCGAAGCATCTAACCGCTTTTTTGCACAACATGGGGGATTTTGTAACTTGGCTTGATAATTGGGAACCGGAGCTGAATGAAGCCGGCCCAAACGACGTGCGTGACTGGAAAATGCCTCATTATATGGCAACAACGTTGCGCAAACTATTAACTGGCGAACTACTTACTCTAGCTTCCAAAATGCAATTAATAGACTGGATGGAGCAGGATAAAGTTGTTGTCTTCCCAGCTTCACACG'