# Data analysis

Experiment information:
- One million function evaluations
- **sade_remc**: is the best method from HM, but with more evals
- **sade_mc_final**: is sade + MC + ffi9 + rmsd crowding + spicker + hooke jeeves on cluster centroids
- **sade_remc_final**: is the same as above, but REMC instead of MC
- **sade_mc_ffi9_02**: is HM method + forced fragment insertion of size 2 with 0.02 chance of happening per individual per generation
- **sade_remc_ffi9_02**: same as above but with REMC instead of MC

In [1]:
import datetime
import string
import random
import pickle
import time
import sys
import os
import re

import utils
import data_utils

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's "whitegrid" style to the plots created afterwards.
sns.set(style="whitegrid");

In [2]:
# Absolute, machine-specific directories used by the chdir helpers below.
# root_path: target of reset_to_root_path().
root_path = '/home/h3nnn4n/progs/de_supimpa/tools/notebooks/analysis'
# base_path: target of reset_path().
base_path = '/home/h3nnn4n/progs/de_supimpa/src'


def reset_path():
    """Make ``base_path`` the current working directory of the process."""
    target_dir = base_path
    os.chdir(target_dir)
    
    
def reset_to_root_path():
    """Make ``root_path`` the current working directory of the process."""
    target_dir = root_path
    os.chdir(target_dir)

    
# Change into root_path and then straight into base_path: the working
# directory after this cell is base_path, and either call raises if its
# directory does not exist.
reset_to_root_path()
reset_path()

In [3]:
# Run names. Each one is entered as a directory with os.chdir() by the code
# that iterates this list, and the whole list is also used as the set of
# substrings that identifies an experiment folder by name.
runs = [
    'de_experiment_final',
    'de_sade_remc',
    'de_rosetta',
    'de_ffi',
    'de_experiment_final_8_prot',
    'de_final_1rop_1wqc_1lwy',
    'de_rosetta_all_prots',
    'de_other_experiments_all_prots',
]

In [4]:
def list_contains_substring(container, string):
    """Tell whether any element of ``container`` occurs inside ``string``.

    Each element is tested with ``element in string``; the scan stops at the
    first hit. An empty ``container`` gives ``False``.
    """
    for candidate in container:
        if candidate in string:
            return True
    return False


def find_experiment_folder(run):
    """Return the name of the single experiment folder in the current directory.

    A sub-directory of the current working directory qualifies when any entry
    of the module-level ``runs`` list is a substring of its name.

    Args:
        run: Name of the run being processed. It does not take part in the
            matching (that relies on ``runs``); it is only reported in the
            error raised when nothing matches.

    Returns:
        The name of the only matching sub-directory.

    Raises:
        Exception: If no sub-directory matches, or if more than one does.
    """
    dirs = [f for f in os.listdir() if os.path.isdir(f)]
    exp_folder = [f for f in dirs if list_contains_substring(runs, f)]

    # Fail with a readable message instead of an IndexError on exp_folder[0].
    if not exp_folder:
        raise Exception(
            'Found no experiment folder for run %r in %s. Aborting' % (run, os.getcwd())
        )

    if len(exp_folder) > 1:
        raise Exception('Found more than one experiment folder. Aborting\n%s' % exp_folder)

    return exp_folder[0]


def load_repack_data():
    """Load the repack results found in the ``repack`` sub-directory.

    Changes into ``repack``, calls ``utils.get_by_best_rmsd()`` there and
    changes back to the parent directory, also when the loader raises.

    Returns:
        Whatever ``utils.get_by_best_rmsd()`` returns.
        NOTE(review): the rest of the notebook iterates it as a sequence of
        mappings (e.g. ``d['repack_time']``) -- confirm against ``utils``.
    """
    os.chdir('repack')
    try:
        return utils.get_by_best_rmsd()
    finally:
        # Always step back out so a failure does not leave the process
        # stranded inside 'repack'.
        os.chdir('..')


def load_hooke_jeeves_data():
    """Parse every file in the ``hooke-jeeves`` sub-directory.

    Each line is split on spaces (runs of spaces count as one). When the
    first token, minus its last character, is ``hooke_time`` or
    ``spent_evals``, the second token is stored as a float under that name.

    The working directory is changed to ``hooke-jeeves`` while reading and is
    always changed back to the parent, even if a file cannot be parsed.

    Returns:
        A list with one dict per file, in ``os.listdir()`` order. A dict only
        holds the wanted keys that were present in its file.

    Raises:
        ValueError: If a wanted value is not a valid float.
        IndexError: If a wanted key has no value after it.
    """
    wanted_data = ('hooke_time', 'spent_evals')
    d = []

    os.chdir('hooke-jeeves')
    try:
        for datafile in os.listdir():
            new_data = {}
            with open(datafile, 'rt') as f:
                for line in f:
                    tokens = re.sub(' {2,}', ' ', line.strip()).split(' ')

                    # Drop the trailing character of the first token (the
                    # separator after the key name) before comparing.
                    key = tokens[0][:-1]
                    if key in wanted_data:
                        new_data[key] = float(tokens[1])

            d.append(new_data)
    finally:
        # Restore the caller's directory even when parsing fails.
        os.chdir('..')

    return d


def load_stats_data():
    """Parse every file in the ``stats`` sub-directory.

    Each non-blank line is split on spaces (runs of spaces count as one).
    Column 0 is stored as ``evals`` and column 8 as ``time``, both as floats.
    Every line overwrites the previous values, so each file contributes the
    values of its last non-blank line.

    Blank lines are skipped. The working directory is changed to ``stats``
    while reading and is always changed back to the parent, even on error.

    Returns:
        A list with one dict per file, in ``os.listdir()`` order. The dict is
        empty for a file without any non-blank line.

    Raises:
        ValueError: If column 0 or column 8 is not a valid float.
        IndexError: If a non-blank line has fewer than nine columns.
    """
    d = []

    os.chdir('stats')
    try:
        for datafile in os.listdir():
            new_data = {}
            with open(datafile, 'rt') as f:
                for line in f:
                    stripped = line.strip()
                    if not stripped:
                        # A blank (e.g. trailing) line carries no columns.
                        continue

                    tokens = re.sub(' {2,}', ' ', stripped).split(' ')

                    new_data['evals'] = float(tokens[0])
                    new_data['time'] = float(tokens[8])

            d.append(new_data)
    finally:
        # Restore the caller's directory even when parsing fails.
        os.chdir('..')

    return d


def load_data():
    """Walk every run directory and load repack, hooke-jeeves and stats data.

    Starting from ``base_path`` (via ``reset_path()``), the directory layout
    traversed is::

        <run>/<experiment folder>/<protein>/<experiment>/

    where ``<run>`` comes from ``runs``, ``<experiment folder>`` is found by
    ``find_experiment_folder()``, ``<protein>`` is any sub-directory whose
    name has exactly four characters, and ``<experiment>`` is any
    sub-directory of the protein folder. Non-directory entries inside a
    protein folder are ignored.

    If the same protein/experiment pair shows up in more than one run, the
    run listed later in ``runs`` overwrites the earlier entry.

    The working directory is reset to ``base_path`` when the function
    finishes, whether it succeeds or raises.

    Returns:
        A tuple ``(data, data_hooke, data_stats)`` of dicts indexed as
        ``[protein][experiment]``, holding the results of
        ``load_repack_data()``, ``load_hooke_jeeves_data()`` and
        ``load_stats_data()`` respectively.
    """
    reset_path()

    data = {}
    data_hooke = {}
    data_stats = {}

    try:
        for run in runs:
            os.chdir(run)

            exp_folder = find_experiment_folder(run)
            os.chdir(exp_folder)

            protein_folders = [f for f in os.listdir() if len(f) == 4 and os.path.isdir(f)]

            for pf in protein_folders:
                os.chdir(pf)
                data.setdefault(pf, {})
                data_hooke.setdefault(pf, {})
                data_stats.setdefault(pf, {})

                for exp in os.listdir():
                    # Stray files cannot be entered with chdir; skip them.
                    if not os.path.isdir(exp):
                        continue

                    os.chdir(exp)

                    data[pf][exp] = load_repack_data()
                    data_hooke[pf][exp] = load_hooke_jeeves_data()
                    data_stats[pf][exp] = load_stats_data()

                    os.chdir('..')

                os.chdir('..')

            # Leave the experiment folder, then the run folder.
            os.chdir('..')
            os.chdir('..')
    finally:
        # On success this is where the balanced chdir calls end up anyway; on
        # failure it keeps the process from being stranded deep in the tree.
        reset_path()

    return data, data_hooke, data_stats
    
    
# Load everything once into module-level dicts indexed as [protein][experiment];
# the table helpers further down read these globals.
data, data_hooke, data_stats = load_data()
print('finished')

INFO: Parsing     500 repack.dat files
INFO: Parsing     448 repack.dat files
INFO: Parsing     500 repack.dat files
INFO: Parsing     430 repack.dat files
INFO: Parsing     501 repack.dat files
INFO: Parsing     438 repack.dat files
INFO: Parsing     500 repack.dat files
INFO: Parsing     496 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing     732 repack.dat files
INFO: Parsing     663 rep

In [5]:
def get_metric(data_, protein, experiment, metric):
    """Collect one metric from every record of a protein/experiment pair.

    Looks up ``data_[protein][experiment]``, iterates it, and returns a list
    with ``record[metric]`` for each record, in the same order.
    """
    values = []
    for record in data_[protein][experiment]:
        values.append(record[metric])

    return values

In [6]:
# Protein folder names that the table functions below skip.
protein_blacklist = ['1ab1', '1dfn', '2p5k', '2pmr', '3v1a']

In [8]:
def spent_time_table(wanted_experiments=('sade_mc_final', 'sade_remc_final')):
    """Print LaTeX table rows with the time spent per protein.

    For every experiment in ``wanted_experiments`` one group of rows is
    printed, followed by an empty line. Each row has the form
    ``<protein> & $<mean>$ & $<std>$ \\\\ \\hline`` and covers one protein of
    the module-level ``data`` dict, in sorted order, skipping the names in
    ``protein_blacklist``.

    The reported mean is
    ``mean(repack_time) + 10 * mean(hooke_time) + mean(time)``, taken from
    ``data``, ``data_hooke`` and ``data_stats``; the reported deviation
    applies the same weighted sum to the standard deviations.
    NOTE(review): the reason for the factor 10 on the hooke-jeeves time is
    not visible in this notebook -- confirm.

    Args:
        wanted_experiments: Iterable of experiment names to tabulate.
    """
    proteins = [p for p in sorted(data) if p not in protein_blacklist]

    for experiment in wanted_experiments:
        for protein in proteins:
            d1 = get_metric(data, protein, experiment, 'repack_time')
            d2 = get_metric(data_hooke, protein, experiment, 'hooke_time')
            d3 = get_metric(data_stats, protein, experiment, 'time')

            d_mean = np.mean(d1) + np.mean(d2) * 10 + np.mean(d3)
            d_std = np.std(d1) + np.std(d2) * 10 + np.std(d3)

            print("%s & $%0.4f$ & $%0.4f$ \\\\ \\hline" % (protein, d_mean, d_std))
        print()
    
# Print the time table for the default experiments.
spent_time_table()

1acw & $381.8566$ & $76.8664$ \\ \hline
1ail & $1141.1461$ & $142.2935$ \\ \hline
1crn & $721.0492$ & $118.6126$ \\ \hline
1enh & $763.7677$ & $86.2599$ \\ \hline
1l2y & $252.8980$ & $19.1262$ \\ \hline
1rop & $963.2695$ & $124.7610$ \\ \hline
1utg & $1010.0412$ & $142.8927$ \\ \hline
1wqc & $341.4484$ & $35.1510$ \\ \hline
1zdd & $462.3010$ & $127.2581$ \\ \hline
2mr9 & $702.5817$ & $95.6492$ \\ \hline

1acw & $397.3399$ & $39.2449$ \\ \hline
1ail & $1127.3899$ & $139.9729$ \\ \hline
1crn & $696.0591$ & $78.4302$ \\ \hline
1enh & $783.4028$ & $95.5297$ \\ \hline
1l2y & $252.7463$ & $18.8516$ \\ \hline
1rop & $940.5557$ & $203.5212$ \\ \hline
1utg & $1018.6922$ & $193.5863$ \\ \hline
1wqc & $346.3862$ & $36.6248$ \\ \hline
1zdd & $483.3383$ & $50.8998$ \\ \hline
2mr9 & $695.9580$ & $189.4521$ \\ \hline



In [14]:
def spent_evals_table(wanted_experiments=('sade_mc_final', 'sade_remc_final')):
    """Print LaTeX table rows with the hooke-jeeves evaluations per protein.

    For every experiment in ``wanted_experiments`` one group of rows is
    printed, followed by an empty line. Each row has the form
    ``<protein> & $<max>$ & $<mean>$ & $<std>$ \\\\ \\hline``, computed over
    the ``spent_evals`` values in ``data_hooke``. Proteins are the keys of the
    module-level ``data`` dict in sorted order, skipping the names in
    ``protein_blacklist``.

    Args:
        wanted_experiments: Iterable of experiment names to tabulate.
    """
    proteins = [p for p in sorted(data) if p not in protein_blacklist]

    for experiment in wanted_experiments:
        for protein in proteins:
            d = get_metric(data_hooke, protein, experiment, 'spent_evals')

            print("%s & $%0.4f$ & $%0.4f$ & $%0.4f$ \\\\ \\hline" % (protein, max(d), np.mean(d), np.std(d)))
        print()
    
# Print the evaluations table for the default experiments.
spent_evals_table()

1acw & $65764.0000$ & $19894.6690$ & $9323.3251$ \\ \hline
1ail & $63898.0000$ & $23215.9890$ & $12096.6411$ \\ \hline
1crn & $85706.0000$ & $23704.9947$ & $11297.4331$ \\ \hline
1enh & $63347.0000$ & $21141.6667$ & $11922.8046$ \\ \hline
1l2y & $41279.0000$ & $16022.7659$ & $7145.3393$ \\ \hline
1rop & $74935.0000$ & $21008.9153$ & $11595.2773$ \\ \hline
1utg & $82554.0000$ & $23257.8303$ & $13708.3805$ \\ \hline
1wqc & $50201.0000$ & $18342.0987$ & $8163.8972$ \\ \hline
1zdd & $56218.0000$ & $21848.3618$ & $10229.3850$ \\ \hline
2mr9 & $67762.0000$ & $22260.8176$ & $11415.6200$ \\ \hline

1acw & $57595.0000$ & $19818.5598$ & $8795.8194$ \\ \hline
1ail & $66802.0000$ & $24927.3000$ & $11983.8881$ \\ \hline
1crn & $59047.0000$ & $22657.6200$ & $10469.2788$ \\ \hline
1enh & $83720.0000$ & $24419.4684$ & $13052.6594$ \\ \hline
1l2y & $37358.0000$ & $15968.0817$ & $6859.0816$ \\ \hline
1rop & $64987.0000$ & $23404.5686$ & $11476.0593$ \\ \hline
1utg & $446525.0000$ & $26221.2802$ & $20265