# Data analysis

Experiment information:
- One million function evaluations
- **sade_remc**: is the best method from HM, but with more evals
- **sade_mc_final**: is sade + MC + ffi9 + rmsd crowding + spicker + hooke jeeves on cluster centroids
- **sade_remc_final**: is the same as above, but REMC instead of MC
- **sade_mc_ffi9_02**: is HM method + forced fragment insertion of size 2 with 0.02 chance of happening per individual per generation
- **sade_remc_ffi9_02**: same as above but with REMC instead of MC

In [1]:
import datetime
import string
import random
import pickle
import time
import sys
import os
import re

import utils
import data_utils

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
sns.set(style="whitegrid");

In [2]:
# Directory that reset_to_root_path() switches to (presumably where this notebook lives -- confirm).
root_path = '/home/h3nnn4n/progs/de_supimpa/tools/notebooks/analysis'
# Directory that reset_path() switches to; load_data() starts walking the run folders from here.
base_path = '/home/h3nnn4n/progs/de_supimpa/src'


def reset_path():
    """Change the process working directory to ``base_path``."""
    os.chdir(base_path)
    
    
def reset_to_root_path():
    """Change the process working directory to ``root_path``."""
    os.chdir(root_path)

    
# Visit root_path first, then settle in base_path: the net effect is that the
# working directory ends up at base_path (either call raises if its directory
# cannot be entered).
reset_to_root_path()
reset_path()

In [3]:
# Names of the run directories that load_data() enters, relative to the
# directory selected by reset_path(). find_experiment_folder() also uses these
# names to recognise experiment sub-folders by substring match.
runs = [
    'de_experiment_final',
    'de_sade_remc',
    'de_rosetta',
    'de_ffi',
    'de_experiment_final_8_prot',
    'de_final_1rop_1wqc_1lwy',
    'de_rosetta_all_prots',
    'de_other_experiments_all_prots',
    'de_missing_4_base_prots_runs',
    'de_mc_de-mc_de-remc_4prot',
]

In [4]:
def list_contains_substring(container, string):
    """Return True when at least one element of ``container`` occurs inside ``string``.

    Note the direction of the check: the elements of ``container`` are the
    needles and ``string`` is the haystack. An empty ``container`` yields False.
    """
    for candidate in container:
        if candidate in string:
            return True
    return False


def find_experiment_folder(run):
    """Return the single experiment folder found in the current directory.

    A sub-directory of the current working directory qualifies when its name
    contains any of the names listed in the module-level ``runs`` list.

    Args:
        run: Name of the run being processed. It is currently not used for the
            lookup; matching is done against every entry of ``runs``.

    Returns:
        The name of the one matching sub-directory.

    Raises:
        Exception: If more than one sub-directory matches.
        IndexError: If no sub-directory matches.
    """
    candidates = [
        entry for entry in os.listdir()
        if os.path.isdir(entry) and list_contains_substring(runs, entry)
    ]

    if len(candidates) > 1:
        raise Exception('Found more than one experiment folder. Aborting\n%s' % candidates)

    if not candidates:
        # Same exception type a bare ``[0]`` lookup on an empty list raises,
        # but with a message that says where the search came up empty.
        raise IndexError('No experiment folder found in %s' % os.getcwd())

    return candidates[0]


def load_repack_data():
    """Load the repack results stored under the ``repack`` sub-directory.

    Enters ``repack`` (relative to the current working directory), returns
    whatever ``utils.get_by_best_rmsd()`` produces there, and steps back to the
    parent directory afterwards.

    Returns:
        The value returned by ``utils.get_by_best_rmsd()``.
    """
    os.chdir('repack')
    try:
        return utils.get_by_best_rmsd()
    finally:
        # Step back out even when the loader raises, so a failure does not
        # leave the process stranded inside ``repack``.
        os.chdir('..')


def load_hooke_jeeves_data():
    """Collect Hooke-Jeeves statistics from every file in ``hooke-jeeves``.

    Each file in the ``hooke-jeeves`` sub-directory of the current working
    directory is scanned line by line. Runs of two or more spaces are collapsed
    and the line is split on single spaces; when the first token, minus its
    last character, is ``hooke_time`` or ``spent_evals``, the second token is
    stored as a float under that name.

    Returns:
        A list with one dict per file, in ``os.listdir()`` order. A dict only
        contains the wanted keys that were actually found in its file.

    Raises:
        ValueError: If a wanted key is followed by a non-numeric token.
        IndexError: If a wanted key has no value token on its line.
    """
    wanted_data = ('hooke_time', 'spent_evals')
    records = []

    os.chdir('hooke-jeeves')
    try:
        for datafile in os.listdir():
            record = {}
            with open(datafile, 'rt') as f:
                # Iterate the file lazily instead of materialising every line.
                for line in f:
                    tokens = re.sub(' {2,}', ' ', line.strip()).split(' ')
                    # Drop the trailing character of the label (e.g. "hooke_time:").
                    key = tokens[0][:-1]
                    if key in wanted_data:
                        record[key] = float(tokens[1])
            records.append(record)
    finally:
        # Always step back out, so a malformed file does not leave the process
        # stranded inside ``hooke-jeeves``.
        os.chdir('..')

    return records


def load_stats_data():
    """Collect the final evaluation count and time from every file in ``stats``.

    Each file in the ``stats`` sub-directory of the current working directory
    is read line by line. Runs of two or more spaces are collapsed and the line
    is split on single spaces; column 0 is stored as ``evals`` and column 8 as
    ``time``. Every line overwrites the previous values, so each dict ends up
    holding the values of the file's last line.

    Returns:
        A list with one dict per file, in ``os.listdir()`` order. The dict of
        an empty file is empty.

    Raises:
        ValueError: If column 0 or column 8 of a line is not numeric (this
            includes blank lines).
        IndexError: If a line has fewer than nine columns.
    """
    records = []

    os.chdir('stats')
    try:
        for datafile in os.listdir():
            record = {}
            with open(datafile, 'rt') as f:
                # Iterate the file lazily instead of materialising every line.
                for line in f:
                    tokens = re.sub(' {2,}', ' ', line.strip()).split(' ')
                    record['evals'] = float(tokens[0])
                    record['time'] = float(tokens[8])
            records.append(record)
    finally:
        # Always step back out, so a malformed file does not leave the process
        # stranded inside ``stats``.
        os.chdir('..')

    return records


def load_data():
    """Walk every run directory and load its repack, Hooke-Jeeves and stats data.

    Starting from the directory selected by ``reset_path()``, each entry of
    ``runs`` is entered, followed by the experiment folder reported by
    ``find_experiment_folder``. Inside it, every sub-directory with a
    four-character name is treated as a protein folder, and every entry of a
    protein folder is treated as an experiment directory.

    Returns:
        A ``(data, data_hooke, data_stats)`` tuple of dicts indexed as
        ``[protein][experiment]``, filled by ``load_repack_data``,
        ``load_hooke_jeeves_data`` and ``load_stats_data`` respectively. When
        the same protein/experiment pair shows up in several runs, the run
        processed last wins.
    """
    reset_path()

    repack_by_protein = {}
    hooke_by_protein = {}
    stats_by_protein = {}

    for run in runs:
        os.chdir(run)
        os.chdir(find_experiment_folder(run))

        protein_dirs = [
            name for name in os.listdir()
            if len(name) == 4 and os.path.isdir(name)
        ]

        for protein in protein_dirs:
            os.chdir(protein)

            # The three dicts always gain a protein key together.
            repack_runs = repack_by_protein.setdefault(protein, {})
            hooke_runs = hooke_by_protein.setdefault(protein, {})
            stats_runs = stats_by_protein.setdefault(protein, {})

            for experiment in os.listdir():
                os.chdir(experiment)

                repack_runs[experiment] = load_repack_data()
                hooke_runs[experiment] = load_hooke_jeeves_data()
                stats_runs[experiment] = load_stats_data()

                os.chdir('..')

            os.chdir('..')

        # Leave the experiment folder, then the run folder.
        os.chdir('..')
        os.chdir('..')

    return repack_by_protein, hooke_by_protein, stats_by_protein
    
    
# Load every run once when the cell executes; each of the three dicts is
# indexed as [protein][experiment].
data, data_hooke, data_stats = load_data()
print('finished')

INFO: Parsing     500 repack.dat files
INFO: Parsing     448 repack.dat files
INFO: Parsing     500 repack.dat files
INFO: Parsing     430 repack.dat files
INFO: Parsing     501 repack.dat files
INFO: Parsing     438 repack.dat files
INFO: Parsing     500 repack.dat files
INFO: Parsing     496 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing     732 repack.dat files
INFO: Parsing     663 rep

In [5]:
def get_metric(data, protein, experiment, metric):
    """Return the ``metric`` value of every record stored for a protein/experiment pair.

    ``data`` is indexed as ``data[protein][experiment]`` and must yield an
    iterable of mappings; the result keeps the order of that iterable.
    """
    values = []
    for record in data[protein][experiment]:
        values.append(record[metric])
    return values

In [6]:
# Proteins left out of the tables printed by the custom_metric_table* functions.
protein_blacklist = ['1ab1', '1dfn', '2p5k', '2pmr', '3v1a']

In [7]:
def custom_metric_table(wanted_experiments=('sade_mc_final', 'sade_remc_final'),
                        metric='tm_score_after'):
    """Print LaTeX table rows with the max / mean / std of a metric per protein.

    For each experiment, one row per protein (sorted by name, skipping the
    entries of ``protein_blacklist``) is printed, followed by a blank line.
    Values are read from the module-level ``data`` dict through ``get_metric``.

    Args:
        wanted_experiments: Experiment names to report, in output order.
        metric: Key of the per-run value to summarise. Defaults to
            ``'tm_score_after'``; ``'gdt_ts_after'`` was the alternative kept
            as a commented-out line before.

    Raises:
        KeyError: If a protein has no data for one of the experiments, or a
            record lacks ``metric``.
        ValueError: If a protein/experiment pair has no records (``max`` of an
            empty list).
    """
    # The protein list does not depend on the experiment, so build it once.
    proteins = [protein for protein in sorted(data) if protein not in protein_blacklist]

    for experiment in wanted_experiments:
        for protein in proteins:
            data_metric = get_metric(data, protein, experiment, metric)

            print("%s & $%0.4f$ & $%0.4f$ & $%0.4f$ \\\\ \\hline"
                  % (protein, max(data_metric), np.mean(data_metric), np.std(data_metric)))
        print()
    
# Print the summary rows for the default pair of experiments.
custom_metric_table()

1acw & $0.2475$ & $0.1930$ & $0.0202$ \\ \hline
1ail & $0.4468$ & $0.3039$ & $0.0457$ \\ \hline
1crn & $0.3986$ & $0.2762$ & $0.0462$ \\ \hline
1enh & $0.3327$ & $0.2643$ & $0.0267$ \\ \hline
1l2y & $0.2495$ & $0.1924$ & $0.0294$ \\ \hline
1rop & $0.6229$ & $0.4588$ & $0.0646$ \\ \hline
1utg & $0.4938$ & $0.3676$ & $0.0632$ \\ \hline
1wqc & $0.3757$ & $0.2852$ & $0.0346$ \\ \hline
1zdd & $0.3178$ & $0.2797$ & $0.0276$ \\ \hline
2mr9 & $0.7514$ & $0.5117$ & $0.0900$ \\ \hline

1acw & $0.2583$ & $0.1945$ & $0.0280$ \\ \hline
1ail & $0.5461$ & $0.3336$ & $0.0697$ \\ \hline
1crn & $0.3987$ & $0.2669$ & $0.0468$ \\ \hline
1enh & $0.3200$ & $0.2607$ & $0.0236$ \\ \hline
1l2y & $0.2304$ & $0.1771$ & $0.0362$ \\ \hline
1rop & $0.5006$ & $0.3915$ & $0.0565$ \\ \hline
1utg & $0.5154$ & $0.3756$ & $0.0506$ \\ \hline
1wqc & $0.3861$ & $0.2831$ & $0.0463$ \\ \hline
1zdd & $0.3104$ & $0.2628$ & $0.0202$ \\ \hline
2mr9 & $0.7456$ & $0.5186$ & $0.0769$ \\ \hline



In [15]:
def custom_metric_table_raw(wanted_experiments=('sade_mc_final', 'sade_remc_final'),
                            metric='tm_score_after'):
    """Print the raw per-run values of a metric as LaTeX-style rows, one column per protein.

    For each experiment a header row with the protein names is printed
    (sorted, without the entries of ``protein_blacklist``), then one row per
    run index, then a blank line. Rows stop at the shortest protein series so
    every row has a value in every column. Values are read from the
    module-level ``data`` dict through ``get_metric``.

    Args:
        wanted_experiments: Experiment names to report, in output order.
        metric: Key of the per-run value to print. Defaults to
            ``'tm_score_after'``; ``'gdt_ts_after'`` was the alternative kept
            as a commented-out line before.

    Raises:
        KeyError: If a protein has no data for one of the experiments, or a
            record lacks ``metric``.
    """
    proteins = sorted(set(data) - set(protein_blacklist))

    for experiment in wanted_experiments:
        print(' & '.join(proteins))

        data_raw = {
            protein: get_metric(data, protein, experiment, metric)
            for protein in proteins
        }

        # ``default=0`` keeps an empty protein list from raising in ``min``;
        # in that case only the (empty) header line is printed.
        min_size = min((len(values) for values in data_raw.values()), default=0)

        for index in range(min_size):
            print(' & '.join(['$%4.4f$' % data_raw[protein][index] for protein in proteins]))

        print()

# Print the raw per-run rows for the default pair of experiments.
custom_metric_table_raw()

1acw & 1ail & 1crn & 1enh & 1l2y & 1rop & 1utg & 1wqc & 1zdd & 2mr9
$0.2378$ & $0.3157$ & $0.3525$ & $0.2517$ & $0.1117$ & $0.4559$ & $0.2829$ & $0.2349$ & $0.2675$ & $0.4912$
$0.1548$ & $0.2992$ & $0.2005$ & $0.2668$ & $0.2061$ & $0.5085$ & $0.4012$ & $0.2855$ & $0.2668$ & $0.5452$
$0.1636$ & $0.2652$ & $0.2320$ & $0.2536$ & $0.2025$ & $0.4395$ & $0.3467$ & $0.3018$ & $0.2645$ & $0.5890$
$0.1774$ & $0.2707$ & $0.3264$ & $0.2806$ & $0.1875$ & $0.4203$ & $0.4556$ & $0.2686$ & $0.2638$ & $0.4450$
$0.1711$ & $0.2881$ & $0.2611$ & $0.3021$ & $0.1931$ & $0.4784$ & $0.3996$ & $0.3303$ & $0.2363$ & $0.3887$
$0.1941$ & $0.2427$ & $0.2115$ & $0.2666$ & $0.1915$ & $0.5935$ & $0.3653$ & $0.2745$ & $0.3057$ & $0.4404$
$0.1914$ & $0.3416$ & $0.2637$ & $0.2320$ & $0.2399$ & $0.4548$ & $0.2530$ & $0.3546$ & $0.2690$ & $0.4457$
$0.2027$ & $0.2528$ & $0.3009$ & $0.2542$ & $0.1858$ & $0.4089$ & $0.2927$ & $0.2251$ & $0.2615$ & $0.4740$
$0.1927$ & $0.3026$ & $0.1848$ & $0.2439$ & $0.2229$ & $0.3643$ & $0