# sample sequences based on local structure
use the residue preferences learned from structural microenvironment to sample combinatorial variants with Boltzmann function

### inputs:
- the structure based residue preferences
- a set of positions and mutations to be sampled

### outputs:
- generated samples 

In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import sys
import importlib

sys.path.append('../src/')
import covesTools
importlib.reload(covesTools)
import covesTools as ct

### file paths
din_preferences = '../data/coves/preferences/'
dout_at_sample = '../data/coves/samples/at/'
dout_gfp_sample = '../data/coves/samples/gfp/'

dout_test_sample = '../data/coves/samples/at/test_to_delete/'


fin_gfp_exp = '../data/coves/scores/all_scores/df_all_gfp.csv'

# antitoxin generation

In [29]:
# read residue mutaiton preference file
df_gvp_pred_at = ct.read_res_pred(din_preferences +'gvp_100_m_at_1646945484.3030427_8_220711.csv')


In [30]:
# generate seqs for only the 10 positions: L47, D51, I52, R54, L55, F73, R77, E79, A80, R81 .
mut_pos = ['L47', 'D51', 'I52', 'R54', 'L55', 'F73', 'R77', 'E79', 'A80', 'R81'] # M0 indexed
# convert this to m1 indexed positions
mut_pos_m1 = [m[0] + str(int(m[1:])+1) for m in mut_pos]

In [32]:
# sampling mutations for antitoxin
n_sample = 500
for t in [0.1, 0.5,0.7,1,1.5,2,2.25,2.5,2.75,3,4,5]:
    sampled_mutkeys = ct.sample_coves(df_gvp_pred_at, mut_pos_m1, n_sample = n_sample, t=t)
    # write to file
    with open(dout_at_sample + f'/gvp_100_m_RES_1646945484_3030427_8_220711_samples_t{t}_n{n_sample}.csv', 'w') as fout:
        for m in sampled_mutkeys:
            fout.write(m + '\n')


# GFP generation

In [39]:
# read residue mutaiton preference file
df_gvp_pred_gfp = ct.read_res_pred(din_preferences +'gvp_100_m_gfp_230523_.csv')


In [40]:
# to limit generation to mutants that can be scored with the oracle function, read in the individual substitutions that were observed in experiment
# read experiment file to get mutations that are observed.
df_gfp = pd.read_csv(fin_gfp_exp, 
                     index_col = 0)

In [42]:
muts_seen_dms = set([ind_m for m in df_gfp.mutant for ind_m in m.split(':') ])
#adding the wt residues to each seen position
wt_pos_wt_seen = list(set([m[:-1]+m[0] for m in muts_seen_dms]))
muts_seen_dms = set(list(muts_seen_dms) + wt_pos_wt_seen)
num_pos_mutate_exp = len(set([m[1:-1] for m in muts_seen_dms]))
print(f'{num_pos_mutate_exp} positions mutated in exp')

# only get the sitewise preferences for which the mutants have been seen
df_gvp_pred_gfp['seen_exp'] = df_gvp_pred_gfp.mut.apply(lambda x: x in muts_seen_dms)
df_gvp_pred_gfp_seen = df_gvp_pred_gfp.loc[df_gvp_pred_gfp.seen_exp]

# get list of wt_aa_pos that are seen in both:
mut_pos_both_exp_res = set(df_gvp_pred_gfp_seen.mut.str[:-1])
print('generating for # of seen variants only',len(mut_pos_both_exp_res))


233 positions mutated in exp
generating for # of seen variants only 222


In [53]:
def gen_samples_gfp(df_gvp_pred_use, 
                    mut_pos_both_exp_res,
                    n_sample =1000, 
                    n_pos_mutate_range=[10], 
                    t_range = [ 0.1, 0.3,0.4, 0.5, 0.6, 0.7,1,2, 3, 4, 5, 8, 10, 20], 
                    dout = './'):
    # sample sequences for gfp by choosing a subset of positions to mutate
    
    # Args:
    #    df_gvp_pred_use (df): residue preferences from structures
    #                          filtered for mutants that should be sampled
    #    mut_pos_both_exp_res (list): list of wtaa+pos to choose sampling from
    #    n_sample (int): number of samples to generated
    #    n_pos_mutate_range (list of ints): the maximum number of positions that are 
    #                                       allowed to be designed
    #    t_range (list of floats): the sampling temperatures to use
    #    dout (str): path to sampling files
    
    
    for t in t_range: 
        for n_pos_mutate in n_pos_mutate_range:
            print(t, n_pos_mutate)
            mutkeys_sampled = []
            for i in range(n_sample):
                # pick a subset of random positions to mutate
                mut_pos_to_sample = random.sample(mut_pos_both_exp_res, n_pos_mutate)
                # sample mutkeys
                sampled_mutkeys = ct.sample_coves(
                    df_gvp_pred_use, 
                    mut_pos_to_sample, 
                    n_sample = 1, 
                    t=t)
                sampled_mutkey = sampled_mutkeys[0]
                
                # exclude 'wt mutations'
                if sampled_mutkey[0] != sampled_mutkey[-1]:
                    mutkeys_sampled.append(sampled_mutkey)

            with open(dout + f'gvp_100_m_RES_1646945484_3030427_8_230519_gfp_samples_t{t}_n{n_sample}_downselect{n_pos_mutate}_230528.csv', 'w') as fout:
                for m in mutkeys_sampled:
                    fout.write(m + '\n')

In [54]:
# sampling at 10 positions that are fixed
n_sample = 300
n_pos_mutate_range = [10]

gen_samples_gfp(
    df_gvp_pred_gfp_seen, 
    n_sample =n_sample, 
    n_pos_mutate_range=n_pos_mutate_range, 
    t_range = [ 0.1, 0.3,0.4, 0.5, 0.6, 0.7,1,2, 3, 4, 5, 8, 10, 20], 
    dout = dout_gfp_sample
)


0.1 10


since Python 3.9 and will be removed in a subsequent version.
  mut_pos_to_sample = random.sample(mut_pos_both_exp_res, n_pos_mutate)


0.3 10
0.4 10
0.5 10
0.6 10
0.7 10
1 10
2 10
3 10
4 10
5 10
8 10
10 10
20 10


In [13]:
# vanilla but with higher number of possible mutations
n_sample = 300
n_pos_mutate_range = [15]

gen_samples_gfp(
    df_gvp_pred_gfp_seen, 
    n_sample =n_sample, 
    n_pos_mutate_range=n_pos_mutate_range, 
    t_range = [ 0.1, 0.3,0.4, 0.5, 0.6, 0.7,1,2, 3, 4, 5, 8, 10, 20], 
    dout = dout_gfp_sample
)



TypeError: gen_samples() got an unexpected keyword argument 'n_pos_mutate'

In [17]:
# scanning mutations across allowed num mutations, adding 2
n_sample = 300
n_pos_mutate_range = [5,6,7,8,9,10,11,12,13,14,15, 16, 17, 18, 19, 20]

gen_samples_gfp(
    df_gvp_pred_gfp_seen, 
    n_sample =n_sample, 
    n_pos_mutate_range=n_pos_mutate_range, 
    t_range = [0.5, 1], 
    dout = dout_gfp_sample
)

0.5 5


since Python 3.9 and will be removed in a subsequent version.
  mut_pos_to_sample = random.sample(mut_pos_both_exp_res, n_pos_mutate)


0.5 6
0.5 7
0.5 8
0.5 9
0.5 10
0.5 11
0.5 12
0.5 13
0.5 14
0.5 15
0.5 16
0.5 17
0.5 18
0.5 19
0.5 20
1 5
1 6
1 7
1 8
1 9
1 10
1 11
1 12
1 13
1 14
1 15
1 16
1 17
1 18
1 19
1 20
