In [None]:
from correlations import *
from utils.various_tools import *
from utils.msa_manipulation import *
from utils.sample_MSA_from_EVE_model import *
from utils.plot_performance_over_training import *

%load_ext autoreload
%autoreload 2

# Input locations: multiple sequence alignments and their per-sequence weights.
MSA_data_folder = './data/MSA'
MSA_weights_location = './data/weights'

# Output locations. Note that the checkpoint folder carries a trailing slash
# (later cells append a relative path to it directly), whereas the other
# folders do not and are joined with an explicit separator.
VAE_checkpoint_location = './results/VAE_parameters/'
correlations_location = './results/correlations'

In this notebook we sample MSAs from EVE checkpoints saved over training and check their performance in capturing 1-site marginals and 2-site covariation of the protein family.

## Training MSA (`PABP_38k`) and its marginals:

We first compute 1- and 2-site marginals and correlations of the training set, and save the resulting `Correlations` object to file.

In [None]:
# Set up the Correlations object for the training MSA (PABP_38k).
model_name = "PABP_38k"
MSA_location = os.path.join(MSA_data_folder, model_name + ".a2m")

# The trailing separator is kept on purpose: file names are appended to this
# string by plain concatenation (here and in the cell that saves the result).
correlations_subfolder = os.path.join(correlations_location, model_name, "")

# Create the output folder through the standard library rather than by
# building a `mkdir -p` shell command: this is portable and does not break
# on paths that contain spaces or shell metacharacters.
os.makedirs(correlations_subfolder, exist_ok=True)

file_name_label_to_seq_out = correlations_subfolder + model_name + "_label_to_seq.npy"
file_name_weights_out = os.path.join(MSA_weights_location, model_name + ".npy")

PABP_38k = Correlations(
    MSA_location=MSA_location,
    MFA=False,
    file_name_label_to_seq_out=file_name_label_to_seq_out,
    file_name_weights_out=file_name_weights_out,
)

In [None]:
# Run the full computation on the training-set Correlations object, then
# persist the resulting instance next to its other output files.
PABP_38k.compute_all()
save_instance_to_file("".join([correlations_subfolder, model_name, ".Correlations"]), PABP_38k)

## Test MSA and its marginals:

The test MSA is drawn from the file `PABP_245k.a2m`, which contains 245K sequences: we select the 100K sequences with the highest average Hamming distance with respect to the training MSA.

In [None]:
# Load the full 245k MSA, from which the test sequences are drawn.
model_name = 'PABP_245k'

# Trailing separator kept on purpose: file names are appended to this string
# by plain concatenation below.
correlations_subfolder = os.path.join(correlations_location, model_name, "")

# Create the output folder through the standard library rather than by
# building a `mkdir -p` shell command (portable, safe for paths with spaces).
os.makedirs(correlations_subfolder, exist_ok=True)

MSA_location = os.path.join(MSA_data_folder, model_name + ".a2m")
file_name_label_to_seq_out = correlations_subfolder + model_name + "_label_to_seq.npy"

# NOTE(review): keep_cols=[8, 89] presumably restricts the alignment to a
# column range so that it lines up with the training MSA - confirm against
# the Correlations implementation.
PABP_245k = Correlations(
    MSA_location=MSA_location,
    preprocess_MSA=True,
    advanced_preprocess_MSA=True,
    file_name_label_to_seq_out=file_name_label_to_seq_out,
    keep_cols=[8, 89],
)

In [None]:
# Show the `L` attribute of the training and 245k Correlations objects side by side.
# NOTE(review): `L` is presumably the alignment length, in which case the two
# values should agree before sequences are compared - confirm.
[PABP_38k.L, PABP_245k.L]

In [None]:
# Inspect the training MSA's label_to_seq with the project helper
# (its exact output is defined in the imported utils, not in this notebook).
initial_and_final_label_to_seq_focus(PABP_38k.label_to_seq)

In [None]:
# Same inspection for the 245k MSA, for comparison with the training MSA.
initial_and_final_label_to_seq_focus(PABP_245k.label_to_seq)

In [None]:
# Pick the N_out sequences of the 245k MSA that are furthest from the training
# MSA; the output file name is handed to the helper and is read back as the
# test MSA in a later cell.
N_out = 100 * 1000
output_MSA_filename = os.sep.join([MSA_data_folder, "PABP_38k_test_100k.a2m"])

# Only the second return value (the indices of the selected sequences) is
# used afterwards; the first one is discarded.
_, indices_candidates = find_furthest_sequences(
    label_to_seq_whole=PABP_245k.label_to_seq,
    label_to_seq_subset=PABP_38k.label_to_seq,
    output_MSA_filename=output_MSA_filename,
    N_out=N_out,
)



Let us quickly check the position of the drawn sequences in the 245k MSA:

In [None]:
# Histogram of the indices returned by find_furthest_sequences, i.e. where the
# drawn sequences sit within the 245k MSA.
plot_histograms([indices_candidates])

We then compute the 1- and 2-site marginals and correlations of the test set as a reference.

In [None]:
# Set up the Correlations object for the 100k test MSA written by the
# furthest-sequence selection above.
model_name = "PABP_38k_test_100k"
MSA_location = os.path.join(MSA_data_folder, model_name + ".a2m")

# Trailing separator kept on purpose: file names are appended to this string
# by plain concatenation (here and in the cell that saves the result).
correlations_subfolder = os.path.join(correlations_location, model_name, "")

# Create the output folder through the standard library rather than by
# building a `mkdir -p` shell command (portable, safe for paths with spaces).
os.makedirs(correlations_subfolder, exist_ok=True)

file_name_label_to_seq_out = correlations_subfolder + model_name + "_label_to_seq.npy"
file_name_weights_out = os.path.join(MSA_weights_location, model_name + ".npy")

# advanced_preprocess_MSA is switched off explicitly for the test MSA.
PABP_test = Correlations(
    MSA_location=MSA_location,
    MFA=False,
    file_name_label_to_seq_out=file_name_label_to_seq_out,
    file_name_weights_out=file_name_weights_out,
    advanced_preprocess_MSA=False,
)

In [None]:
# Run the full computation on the test-set Correlations object, then persist
# the resulting instance next to its other output files.
PABP_test.compute_all()
save_instance_to_file("".join([correlations_subfolder, model_name, ".Correlations"]), PABP_test)

## EVE's performance over training against training and test MSAs

We now sample an MSA of 10K sequences from the EVE model checkpoints saved during training, and then visualise how well these samples reproduce the 1- and 2-site marginals of the training and test sets.

In [None]:
# Reference (training) data that the sampled MSAs are generated against.
ref_weights_file_name = "PABP_38k"
ref_Correlations_name = "PABP_38k"
ref_MSA_name = "PABP_38k.a2m"

# VAE_checkpoint_location already ends with a slash, so the relative
# checkpoint prefix is appended to it directly.
in_VAE_checkpoint_location = "".join([VAE_checkpoint_location, "PABP_38k/PABP_38k"])
out_Correlations_name = "PABP_38k_VAE"

# Checkpoint range, expressed in thousands; both bounds are multiplied by
# 1000 at the call site (here and in the performance cells further down).
initial_epoch, final_epoch = 10, 1400

generate_MSA_and_Correlations_from_EVE_models(
    ref_weights_file_name,
    ref_Correlations_name,
    ref_MSA_name,
    in_VAE_checkpoint_location,
    out_Correlations_name=out_Correlations_name,
    initial_epoch=initial_epoch * 1000,
    final_epoch=final_epoch * 1000,
    step=10_000,
    N_samples=10_000,
    threshold=0.3,
    model_parameters_location='./EVE/default_model_params.json',
)

In [None]:
# Score the EVE-sampled MSAs against the training-set Correlations.
# The epoch range and step mirror the values used when the samples were
# generated; out_Correlations_name, initial_epoch and final_epoch come from
# that earlier cell.
labels = ['f1', 'f2', 'CM2', 'MI']
ref_Correlations_name = "PABP_38k"
name_performance = "PABP_38k_training"

compute_performance_EVE_models_vs_ref(
    name_performance,
    ref_Correlations_name=ref_Correlations_name,
    EVE_Correlations_name=out_Correlations_name,
    labels=labels,
    input_results_location=None,
    initial_epoch=initial_epoch * 1000,
    final_epoch=final_epoch * 1000,
    step=10_000,
)

In [None]:
# Score the EVE-sampled MSAs against the held-out test-set Correlations.
# The epoch range and step mirror the values used when the samples were
# generated; out_Correlations_name, initial_epoch and final_epoch come from
# that earlier cell.
labels = ['f1', 'f2', 'CM2', 'MI']
ref_Correlations_name = "PABP_38k_test_100k"
name_performance = "PABP_38k_test"

compute_performance_EVE_models_vs_ref(
    name_performance,
    ref_Correlations_name=ref_Correlations_name,
    EVE_Correlations_name=out_Correlations_name,
    labels=labels,
    input_results_location=None,
    initial_epoch=initial_epoch * 1000,
    final_epoch=final_epoch * 1000,
    step=10_000,
)

In [None]:
# Performance files for the training (1) and test (2) comparisons; the file
# names match the `name_performance` values used in the two cells above.
file_name_1, file_name_2 = (
    './results/correlations/_performances/PABP_38k_training.npy',
    './results/correlations/_performances/PABP_38k_test.npy',
)

# Plot train vs test performance in a single column. Only a subset of the
# computed labels is shown ('MI' is left out of reduced_index).
visualise_performance_train_test_1column(
    file_name_1,
    file_name_2,
    reduced_index=['f1', 'f2', 'CM2'],
    filename_out='./results/plots_performance/PABP_38k_EVE.pdf',
    title='EVE_38K',
)
