In [1]:
# Install the python-Levenshtein package into the current environment.
# NOTE(review): no cell in this notebook imports it directly; presumably one of
# the project-local modules imported below needs it — TODO confirm.
!pip install python-Levenshtein

[33mDEPRECATION: Python 3.5 reached the end of its life on September 13th, 2020. Please upgrade your Python as Python 3.5 is no longer maintained. pip 21.0 will drop support for Python 3.5 in January 2021. pip 21.0 will remove support for this functionality.[0m


In [2]:
import os
import sys
import warnings
import random
import copy
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

sys.path.append('../common')
import data_io_utils
import paths
import constants
import utils

sys.path.append('../A003_policy_optimization/')
import A003_common

import A006_common

## Generate an example config to play around with

In [3]:
# Directory (relative to the notebook) that receives every generated config pickle.
output_dir = 'hyperborg'
# Create it up front so a fresh checkout survives Restart & Run All; without
# this, the open(..., 'wb') calls in the config-writing cells raise
# FileNotFoundError when the directory is missing.
os.makedirs(output_dir, exist_ok=True)

In [4]:
# Build a small example config (model 'ET_Global_Init_1', seed 1) and pickle it
# to <output_dir>/example_config.p.
N_seq = 3500  # stored as 'n_chains'; also the length of the 'T_max' vector
sa_output_dir = os.path.join(
    data_io_utils.S3_DATA_ROOT,
    'chip_1/simulated_annealing/GFP',
)

# One T_max entry per chain, all set to the same value.
example_t_max = (10**(-2)) * np.ones(N_seq)
example_min_pos = A006_common.GFP_LIB_REGION[0]
example_max_pos = A006_common.GFP_LIB_REGION[1]

example_config = {
    'seed': 1,
    'n_train_seqs': 96,
    'model': 'ET_Global_Init_1',
    'n_chains': N_seq,
    'T_max': example_t_max,
    'sa_n_iter': 500,  # reduced iteration count for debugging
    'temp_decay_rate': 1.0,
    'min_mut_pos': example_min_pos,
    'max_mut_pos': example_max_pos,
    'nmut_threshold': 15,
    'output_file': 'example_sa_hyperborg.p'
}

example_config_path = os.path.join(output_dir, 'example_config.p')
with open(example_config_path, 'wb') as fh:
    pickle.dump(example_config, fh)

In [5]:
# Second example config (model 'OneHot', seed 2), pickled to
# <output_dir>/example_config_2.p. This rebinds example_config, so later cells
# that read example_config see this version.
N_seq = 3500  # stored as 'n_chains'; also the length of the 'T_max' vector
sa_output_dir = os.path.join(
    data_io_utils.S3_DATA_ROOT,
    'chip_1/simulated_annealing/GFP',
)

# One T_max entry per chain, all set to the same value.
example_t_max = (10**(-2)) * np.ones(N_seq)
example_min_pos = A006_common.GFP_LIB_REGION[0]
example_max_pos = A006_common.GFP_LIB_REGION[1]

example_config = {
    'seed': 2,
    'n_train_seqs': 96,
    'model': 'OneHot',
    'n_chains': N_seq,
    'T_max': example_t_max,
    'sa_n_iter': 3,  # tiny iteration count for debugging
    'temp_decay_rate': 1.0,
    'min_mut_pos': example_min_pos,
    'max_mut_pos': example_max_pos,
    'nmut_threshold': 15,
    'output_file': 'example_sa_hyperborg_2.p'
}

example_config_path = os.path.join(output_dir, 'example_config_2.p')
with open(example_config_path, 'wb') as fh:
    pickle.dump(example_config, fh)

Examine the example results

In [6]:
# Locate the results pickle named by the current example_config, sync it
# locally via data_io_utils.sync_s3_path_to_local, then unpickle it into `res`.
example_output_name = example_config['output_file']
res_file = os.path.join(sa_output_dir, example_output_name)
data_io_utils.sync_s3_path_to_local(res_file, is_single_file=True)

with open(res_file, 'rb') as fh:
    res = pickle.load(fh)

In [7]:
# Show the top-level keys of the loaded results object.
res.keys()

dict_keys(['sa_results', 'train_seq_reps', 'top_model', 'train_df', 'base_model'])

In [8]:
# Show the keys stored under res['sa_results'].
res['sa_results'].keys()

dict_keys(['max_mut_pos', 'decay_rate', 'seq_history', 'k', 'mu_muts_per_seq', 'fitness_history', 'fitness_mem_pred_history', 'fitness_std_history', 'n_iter', 'T_max', 'min_mut_pos', 'init_seqs'])

Check amino acids outside of the mutation region are not mutated.

In [9]:
# Flatten the recorded sequence history and verify that every sequence equals
# constants.AVGFP_AA_SEQ before GFP_LIB_REGION[0] and from GFP_LIB_REGION[1] on.
all_seqs = list(np.array(res['sa_results']['seq_history']).reshape(-1))

region_start = A006_common.GFP_LIB_REGION[0]
region_end = A006_common.GFP_LIB_REGION[1]
ref_prefix = constants.AVGFP_AA_SEQ[:region_start]
ref_suffix = constants.AVGFP_AA_SEQ[region_end:]

for s in all_seqs:
    assert s[:region_start] == ref_prefix
    assert s[region_end:] == ref_suffix

## Create all the config dictionaries we would ever want!

In [10]:
# Grid of settings swept by the config-generation loop in the next cell:
# model names, training-set sizes, and seeds.
models = ['ET_Global_Init_1', 'ET_Global_Init_2', 'ET_Random_Init_1', 'OneHot']
n_train_seqs = [8, 24, 96]
seeds = list(np.arange(10)) # Defines training sets that will be used.

# Note that although the seed defines the training set,
# there is some additional randomness due to the ...
# NOTE(review): the sentence above was left unfinished in the original notebook — TODO complete.

In [11]:
# Main sweep: write one pickled config into output_dir for every
# (model, training-set size, seed) combination.
np.random.seed(1)
random.seed(1)

output_dir = 'hyperborg'
N_seq = 3500

# Values shared by every config in this sweep.
sweep_t_max = (10**(-2)) * np.ones(N_seq)
sweep_min_pos = A006_common.GFP_LIB_REGION[0]
sweep_max_pos = A006_common.GFP_LIB_REGION[1]

# Same iteration order as three nested loops: models, then sizes, then seeds.
sweep_grid = [
    (model_name, size, seed_val)
    for model_name in models
    for size in n_train_seqs
    for seed_val in seeds
]

for m, n_train, seed in sweep_grid:
    # One randrange() draw per config supplies the hex tag in the file names.
    output_file_base = '%s-%04d-%02d-%x.p' % (m, n_train, seed, random.randrange(2**30))
    sa_results_file = 'GFP_SimAnneal-' + output_file_base
    config_file = 'GFP_SA_config-' + output_file_base

    print(sa_results_file, config_file)

    config = {
        'seed': seed,
        'n_train_seqs': n_train,
        'model': m,
        'n_chains': N_seq,
        'T_max': sweep_t_max,
        'sa_n_iter': 3000,
        'temp_decay_rate': 1.0,
        'min_mut_pos': sweep_min_pos,
        'max_mut_pos': sweep_max_pos,
        'nmut_threshold': 15,
        'output_file': sa_results_file
    }

    with open(os.path.join(output_dir, config_file), 'wb') as fh:
        pickle.dump(config, fh)

GFP_SimAnneal-ET_Global_Init_1-0008-00-1132d8fa.p GFP_SA_config-ET_Global_Init_1-0008-00-1132d8fa.p
GFP_SimAnneal-ET_Global_Init_1-0008-01-813e268.p GFP_SA_config-ET_Global_Init_1-0008-01-813e268.p
GFP_SimAnneal-ET_Global_Init_1-0008-02-20a61a1e.p GFP_SA_config-ET_Global_Init_1-0008-02-20a61a1e.p
GFP_SimAnneal-ET_Global_Init_1-0008-03-f17f5c4.p GFP_SA_config-ET_Global_Init_1-0008-03-f17f5c4.p
GFP_SimAnneal-ET_Global_Init_1-0008-04-3f6a6abd.p GFP_SA_config-ET_Global_Init_1-0008-04-3f6a6abd.p
GFP_SimAnneal-ET_Global_Init_1-0008-05-3988ec51.p GFP_SA_config-ET_Global_Init_1-0008-05-3988ec51.p
GFP_SimAnneal-ET_Global_Init_1-0008-06-3c728830.p GFP_SA_config-ET_Global_Init_1-0008-06-3c728830.p
GFP_SimAnneal-ET_Global_Init_1-0008-07-30973b4b.p GFP_SA_config-ET_Global_Init_1-0008-07-30973b4b.p
GFP_SimAnneal-ET_Global_Init_1-0008-08-1adfcc96.p GFP_SA_config-ET_Global_Init_1-0008-08-1adfcc96.p
GFP_SimAnneal-ET_Global_Init_1-0008-09-c039746.p GFP_SA_config-ET_Global_Init_1-0008-09-c039746.p
GFP_Si

Spot check

In [12]:
# Spot check: reload one generated config pickle and display it.
spot_check_path = 'hyperborg/GFP_SA_config-ET_Random_Init_1-0096-00-2277bff5.p'
with open(spot_check_path, 'rb') as fh:
    c = pickle.load(fh)

c

{'T_max': array([0.01, 0.01, 0.01, ..., 0.01, 0.01, 0.01]),
 'max_mut_pos': 110,
 'min_mut_pos': 29,
 'model': 'ET_Random_Init_1',
 'n_chains': 3500,
 'n_train_seqs': 96,
 'nmut_threshold': 15,
 'output_file': 'GFP_SimAnneal-ET_Random_Init_1-0096-00-2277bff5.p',
 'sa_n_iter': 3000,
 'seed': 0,
 'temp_decay_rate': 1.0}

## Special configs

First let's generate a couple of configs that don't use sparse refit. We'll do 3 replicates each of ET_Global_Init_1 and OneHot, trained on 96 sequences.

In [13]:
# Special sweep without sparse refit: two models, 96 training sequences, and
# the first three seeds. Each config carries 'sparse_refit': False.
np.random.seed(352)
random.seed(352)

output_dir = 'hyperborg'
N_seq = 3500

# Values shared by every config in this sweep.
sweep_t_max = (10**(-2)) * np.ones(N_seq)
sweep_min_pos = A006_common.GFP_LIB_REGION[0]
sweep_max_pos = A006_common.GFP_LIB_REGION[1]

# Same iteration order as three nested loops: models, then sizes, then seeds.
sweep_grid = [
    (model_name, size, seed_val)
    for model_name in ['ET_Global_Init_1', 'OneHot']
    for size in [96]
    for seed_val in seeds[:3]
]

for m, n_train, seed in sweep_grid:
    # One randrange() draw per config supplies the hex tag in the file names.
    output_file_base = '%s-%04d-%02d-SparseRefit_False-%x.p' % (m, n_train, seed, random.randrange(2**30))
    sa_results_file = 'GFP_SimAnneal-' + output_file_base
    config_file = 'GFP_SA_config-' + output_file_base

    print(sa_results_file, config_file)

    config = {
        'seed': seed,
        'n_train_seqs': n_train,
        'model': m,
        'n_chains': N_seq,
        'T_max': sweep_t_max,
        'sa_n_iter': 3000,
        'temp_decay_rate': 1.0,
        'min_mut_pos': sweep_min_pos,
        'max_mut_pos': sweep_max_pos,
        'nmut_threshold': 15,
        'output_file': sa_results_file,
        'sparse_refit': False,
    }

    with open(os.path.join(output_dir, config_file), 'wb') as fh:
        pickle.dump(config, fh)

GFP_SimAnneal-ET_Global_Init_1-0096-00-SparseRefit_False-24658718.p GFP_SA_config-ET_Global_Init_1-0096-00-SparseRefit_False-24658718.p
GFP_SimAnneal-ET_Global_Init_1-0096-01-SparseRefit_False-12585ac9.p GFP_SA_config-ET_Global_Init_1-0096-01-SparseRefit_False-12585ac9.p
GFP_SimAnneal-ET_Global_Init_1-0096-02-SparseRefit_False-81db765.p GFP_SA_config-ET_Global_Init_1-0096-02-SparseRefit_False-81db765.p
GFP_SimAnneal-OneHot-0096-00-SparseRefit_False-22b53790.p GFP_SA_config-OneHot-0096-00-SparseRefit_False-22b53790.p
GFP_SimAnneal-OneHot-0096-01-SparseRefit_False-11cb2259.p GFP_SA_config-OneHot-0096-01-SparseRefit_False-11cb2259.p
GFP_SimAnneal-OneHot-0096-02-SparseRefit_False-19011e96.p GFP_SA_config-OneHot-0096-02-SparseRefit_False-19011e96.p


In [14]:
# Spot check: reload one of the SparseRefit_False config pickles and display it.
spot_check_path = 'hyperborg/GFP_SA_config-ET_Global_Init_1-0096-01-SparseRefit_False-12585ac9.p'
with open(spot_check_path, 'rb') as fh:
    c = pickle.load(fh)

c

{'T_max': array([0.01, 0.01, 0.01, ..., 0.01, 0.01, 0.01]),
 'max_mut_pos': 110,
 'min_mut_pos': 29,
 'model': 'ET_Global_Init_1',
 'n_chains': 3500,
 'n_train_seqs': 96,
 'nmut_threshold': 15,
 'output_file': 'GFP_SimAnneal-ET_Global_Init_1-0096-01-SparseRefit_False-12585ac9.p',
 'sa_n_iter': 3000,
 'seed': 1,
 'sparse_refit': False,
 'temp_decay_rate': 1.0}

### Small trust radius configs

Here we're generating run configs that use a small trust radius, i.e. instead of 15 mutations, we'll use 7. We'll compare 3 replicates of ET_Global_Init_1 to 3 replicates of OneHot, all trained on 96 sequences. Instead of doing 3K iterations we'll do 2K.

In [15]:
# Small-trust-radius sweep: two models, 96 training sequences, the first three
# seeds, 2000 iterations, and 'nmut_threshold' set to TRUST_RADIUS.
np.random.seed(1984)
random.seed(1984)

output_dir = 'hyperborg'
N_seq = 3500

TRUST_RADIUS = 7  # the main sweep uses 15

# Values shared by every config in this sweep.
sweep_t_max = (10**(-2)) * np.ones(N_seq)
sweep_min_pos = A006_common.GFP_LIB_REGION[0]
sweep_max_pos = A006_common.GFP_LIB_REGION[1]

# Same iteration order as three nested loops: models, then sizes, then seeds.
sweep_grid = [
    (model_name, size, seed_val)
    for model_name in ['ET_Global_Init_1', 'OneHot']
    for size in [96]
    for seed_val in seeds[:3]
]

for m, n_train, seed in sweep_grid:
    # One randrange() draw per config supplies the hex tag in the file names.
    output_file_base = '%s-%04d-%02d-SmallTrust-%x.p' % (m, n_train, seed, random.randrange(2**30))
    sa_results_file = 'GFP_SimAnneal-' + output_file_base
    config_file = 'GFP_SA_config-' + output_file_base

    print(sa_results_file, config_file)

    config = {
        'seed': seed,
        'n_train_seqs': n_train,
        'model': m,
        'n_chains': N_seq,
        'T_max': sweep_t_max,
        'sa_n_iter': 2000,
        'temp_decay_rate': 1.0,
        'min_mut_pos': sweep_min_pos,
        'max_mut_pos': sweep_max_pos,
        'nmut_threshold': TRUST_RADIUS,  # small trust radius
        'output_file': sa_results_file,
        'sparse_refit': True,
    }

    with open(os.path.join(output_dir, config_file), 'wb') as fh:
        pickle.dump(config, fh)

GFP_SimAnneal-ET_Global_Init_1-0096-00-SmallTrust-2e8818d2.p GFP_SA_config-ET_Global_Init_1-0096-00-SmallTrust-2e8818d2.p
GFP_SimAnneal-ET_Global_Init_1-0096-01-SmallTrust-3f4221b8.p GFP_SA_config-ET_Global_Init_1-0096-01-SmallTrust-3f4221b8.p
GFP_SimAnneal-ET_Global_Init_1-0096-02-SmallTrust-2b7857eb.p GFP_SA_config-ET_Global_Init_1-0096-02-SmallTrust-2b7857eb.p
GFP_SimAnneal-OneHot-0096-00-SmallTrust-272dab56.p GFP_SA_config-OneHot-0096-00-SmallTrust-272dab56.p
GFP_SimAnneal-OneHot-0096-01-SmallTrust-1bb5df52.p GFP_SA_config-OneHot-0096-01-SmallTrust-1bb5df52.p
GFP_SimAnneal-OneHot-0096-02-SmallTrust-31616a94.p GFP_SA_config-OneHot-0096-02-SmallTrust-31616a94.p


In [16]:
# Spot check: reload one of the SmallTrust config pickles and display it.
spot_check_path = 'hyperborg/GFP_SA_config-ET_Global_Init_1-0096-01-SmallTrust-3f4221b8.p'
with open(spot_check_path, 'rb') as fh:
    c = pickle.load(fh)

c

{'T_max': array([0.01, 0.01, 0.01, ..., 0.01, 0.01, 0.01]),
 'max_mut_pos': 110,
 'min_mut_pos': 29,
 'model': 'ET_Global_Init_1',
 'n_chains': 3500,
 'n_train_seqs': 96,
 'nmut_threshold': 7,
 'output_file': 'GFP_SimAnneal-ET_Global_Init_1-0096-01-SmallTrust-3f4221b8.p',
 'sa_n_iter': 2000,
 'seed': 1,
 'sparse_refit': True,
 'temp_decay_rate': 1.0}

Now let's also do an "upper bound run", where we give linear regression and UniRep access to lots of training data. Sarkisyan Split 1 has >17K sequences, so let's use those. We'll do one replicate since we're using most of the training data.

In [17]:
# Large-N sweep with the small trust radius: two models, 17000 training
# sequences, only the first seed, 2000 iterations.
np.random.seed(982)
random.seed(982)

output_dir = 'hyperborg'
N_seq = 3500

TRUST_RADIUS = 7  # the main sweep uses 15

# Values shared by every config in this sweep.
sweep_t_max = (10**(-2)) * np.ones(N_seq)
sweep_min_pos = A006_common.GFP_LIB_REGION[0]
sweep_max_pos = A006_common.GFP_LIB_REGION[1]

# Same iteration order as three nested loops: models, then sizes, then seeds.
sweep_grid = [
    (model_name, size, seed_val)
    for model_name in ['ET_Global_Init_1', 'OneHot']
    for size in [17000]
    for seed_val in seeds[:1]
]

for m, n_train, seed in sweep_grid:
    # One randrange() draw per config supplies the hex tag in the file names.
    output_file_base = '%s-%04d-%02d-SmallTrustLargeN-%x.p' % (m, n_train, seed, random.randrange(2**30))
    sa_results_file = 'GFP_SimAnneal-' + output_file_base
    config_file = 'GFP_SA_config-' + output_file_base

    print(sa_results_file, config_file)

    config = {
        'seed': seed,
        'n_train_seqs': n_train,
        'model': m,
        'n_chains': N_seq,
        'T_max': sweep_t_max,
        'sa_n_iter': 2000,
        'temp_decay_rate': 1.0,
        'min_mut_pos': sweep_min_pos,
        'max_mut_pos': sweep_max_pos,
        'nmut_threshold': TRUST_RADIUS,  # small trust radius
        'output_file': sa_results_file,
        'sparse_refit': True,
    }

    with open(os.path.join(output_dir, config_file), 'wb') as fh:
        pickle.dump(config, fh)

GFP_SimAnneal-ET_Global_Init_1-17000-00-SmallTrustLargeN-c9bd934.p GFP_SA_config-ET_Global_Init_1-17000-00-SmallTrustLargeN-c9bd934.p
GFP_SimAnneal-OneHot-17000-00-SmallTrustLargeN-2f7b1159.p GFP_SA_config-OneHot-17000-00-SmallTrustLargeN-2f7b1159.p


In [18]:
# Spot check: reload one of the SmallTrustLargeN config pickles and display it.
spot_check_path = 'hyperborg/GFP_SA_config-OneHot-17000-00-SmallTrustLargeN-2f7b1159.p'
with open(spot_check_path, 'rb') as fh:
    c = pickle.load(fh)

c

{'T_max': array([0.01, 0.01, 0.01, ..., 0.01, 0.01, 0.01]),
 'max_mut_pos': 110,
 'min_mut_pos': 29,
 'model': 'OneHot',
 'n_chains': 3500,
 'n_train_seqs': 17000,
 'nmut_threshold': 7,
 'output_file': 'GFP_SimAnneal-OneHot-17000-00-SmallTrustLargeN-2f7b1159.p',
 'sa_n_iter': 2000,
 'seed': 0,
 'sparse_refit': True,
 'temp_decay_rate': 1.0}