# Data analysis

Experiment information:
- One million function evaluations
- **sade_remc**: is the best method from HM, but with more evals
- **sade_mc_final**: is sade + MC + ffi9 + rmsd crowding + spicker + hooke jeeves on cluster centroids
- **sade_remc_final**: is the same as above, but REMC instead of MC
- **sade_mc_ffi9_02**: is HM method + forced fragment insertion of size 2 with 0.02 chance of happening per individual per generation
- **sade_remc_ffi9_02**: same as above but with REMC instead of MC

In [1]:
import datetime
import string
import random
import pickle
import time
import sys
import os
import re

import utils
import data_utils

import scipy
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
sns.set(style="whitegrid");

In [2]:
# Absolute, machine-specific locations; both must be adjusted on another machine.
# root_path is the directory reset_to_root_path() switches into;
# base_path is the directory reset_path() switches into (load_data() starts there).
root_path = '/home/h3nnn4n/progs/de_supimpa/tools/notebooks/analysis'
base_path = '/home/h3nnn4n/progs/de_supimpa/src'


def reset_path():
    """Make ``base_path`` the current working directory."""
    destination = base_path
    os.chdir(destination)
    
    
def reset_to_root_path():
    """Make ``root_path`` the current working directory."""
    destination = root_path
    os.chdir(destination)

    
# Visit root_path first, then base_path; the session is left in base_path.
reset_to_root_path()
reset_path()

In [3]:
# Run directories that load_data() enters one by one (relative to base_path).
# find_experiment_folder() also matches sub-directory names against these names.
runs = [
    'de_experiment_final',
    'de_sade_remc',
    'de_rosetta',
    'de_ffi',
    'de_experiment_final_8_prot',
    'de_final_1rop_1wqc_1lwy',
    'de_rosetta_all_prots',
    'de_other_experiments_all_prots',
]

In [4]:
# Protein folder names skipped by load_data() and custom_metric_table().
protein_blacklist = ['1ab1', '1dfn', '2p5k', '2pmr', '3v1a']

def remove_blacklisted_proteins(alldata, protein_blacklist):
    """Delete blacklisted proteins from ``alldata`` in place and print a tally.

    Each blacklist entry is lower-cased before the lookup; entries that are
    not keys of ``alldata`` are silently ignored.
    """
    count_before = len(alldata)

    for name in protein_blacklist:
        key = name.lower()
        if key in alldata:
            del alldata[key]

    removed = count_before - len(alldata)
    print('removed %d proteins. Blacklist had %d' % (removed, len(protein_blacklist)))


def remove_not_common_methods(alldata):
    """Keep, for every protein, only the methods that all proteins share.

    ``alldata`` is indexed as ``alldata[protein][method]`` and is modified in
    place. A ``[WARN]`` line is printed for each (method, protein) pair that
    gets removed. An empty ``alldata`` is left untouched instead of raising.
    """
    if not alldata:
        return

    # Methods present for every single protein.
    common = set.intersection(*(set(methods) for methods in alldata.values()))

    for protein, protein_methods in alldata.items():
        # Snapshot the keys: the inner dict is mutated while we walk it.
        for method in list(protein_methods):
            if method not in common:
                print('[WARN] removing %s from %s' % (method, protein))
                del protein_methods[method]

In [5]:
def list_contains_substring(container, string):
    """Return True when at least one element of ``container`` occurs in ``string``."""
    for candidate in container:
        if candidate in string:
            return True
    return False


def find_experiment_folder(run):
    """Return the one sub-directory of the cwd whose name contains a run name.

    Directory names are matched against every entry of the module-level
    ``runs`` list.

    NOTE(review): ``run`` is not used for the match; confirm whether matching
    against ``run`` alone was intended.

    Raises:
        Exception: more than one sub-directory matches.
        IndexError: no sub-directory matches.
    """
    candidates = [
        entry for entry in os.listdir()
        if os.path.isdir(entry) and list_contains_substring(runs, entry)
    ]

    if len(candidates) > 1:
        raise Exception('Found more than one experiment folder. Aborting\n%s' % candidates)

    if not candidates:
        # Same exception type as the bare `[0]` lookup produced, but with context.
        raise IndexError('No experiment folder found in %s' % os.getcwd())

    return candidates[0]


def load_repack_data():
    """Return ``utils.get_by_best_rmsd()`` evaluated inside the ``repack`` folder.

    The working directory is moved into ``repack`` for the call and moved back
    to the parent afterwards, even when the loader raises.
    """
    os.chdir('repack')
    try:
        return utils.get_by_best_rmsd()
    finally:
        # Callers navigate with relative chdir calls, so always step back out.
        os.chdir('..')


def get_best_by_score_3(data):
    """Placeholder that is not implemented yet: ignores ``data``, returns None."""
    return None


def load_full_repack_data(data):
    """Collect all repack views for the experiment in the current directory.

    Runs inside the ``repack`` folder and steps back to the parent afterwards,
    even when one of the loaders raises.

    Args:
        data: forwarded unchanged to ``get_best_by_score_3``.

    Returns:
        dict with the keys ``best_by_rmsd``, ``best_by_energy``,
        ``best_by_score3`` and ``all_repacks``.
    """
    os.chdir('repack')
    try:
        return {
            'best_by_rmsd': utils.get_by_best_rmsd(),
            'best_by_energy': utils.get_by_best_energy(),
            'best_by_score3': get_best_by_score_3(data),
            'all_repacks': utils.extract_all_repack_data(),
        }
    finally:
        # Callers navigate with relative chdir calls, so always step back out.
        os.chdir('..')


def load_hooke_jeeves_data():
    """Parse every file in ``hooke-jeeves/`` into a dict of the wanted fields.

    Each line is stripped, runs of spaces are collapsed, and the line is split
    on single spaces. When the first token without its last character is
    ``hooke_time`` or ``spent_evals``, the second token is stored as a float
    under that name.

    The working directory is moved into ``hooke-jeeves`` while parsing and is
    moved back to the parent afterwards, even when parsing raises.

    Returns:
        list with one dict per file, in ``os.listdir()`` order.
    """
    wanted_data = ('hooke_time', 'spent_evals')
    records = []

    os.chdir('hooke-jeeves')
    try:
        for datafile in os.listdir():
            new_data = {}
            with open(datafile, 'rt') as f:
                for line in f:
                    tokens = re.sub(' {2,}', ' ', line.strip()).split(' ')

                    # Drop the trailing character of the label (e.g. "name:").
                    field = tokens[0][:-1]
                    if field in wanted_data:
                        new_data[field] = float(tokens[1])

            records.append(new_data)
    finally:
        # Callers navigate with relative chdir calls, so always step back out.
        os.chdir('..')

    return records


def load_stats_data():
    """Parse every file in ``stats/`` into ``{'evals': ..., 'time': ...}``.

    Each line is stripped, runs of spaces are collapsed, and the line is split
    on single spaces; token 0 is read as ``evals`` and token 8 as ``time``.
    Every line overwrites the previous values, so each file contributes the
    values of its last line (an empty file contributes an empty dict).

    The working directory is moved into ``stats`` while parsing and is moved
    back to the parent afterwards, even when parsing raises.

    Returns:
        list with one dict per file, in ``os.listdir()`` order.
    """
    records = []

    os.chdir('stats')
    try:
        for datafile in os.listdir():
            new_data = {}
            with open(datafile, 'rt') as f:
                for line in f:
                    tokens = re.sub(' {2,}', ' ', line.strip()).split(' ')

                    new_data['evals'] = float(tokens[0])
                    new_data['time'] = float(tokens[8])

            records.append(new_data)
    finally:
        # Callers navigate with relative chdir calls, so always step back out.
        os.chdir('..')

    return records


def load_data():
    """Walk every run directory and load the per-protein, per-experiment data.

    Starting from ``base_path``, each entry of ``runs`` is entered, then its
    experiment folder, then every 4-character sub-directory that is not in
    ``protein_blacklist``, then every entry inside that protein folder. For
    each of those entries the repack, full repack, Hooke-Jeeves and stats
    loaders are called.

    Before returning, methods that are not available for every protein are
    removed from each of the four result mappings.

    Returns:
        ``(data, data_full, data_hooke, data_stats)``, each indexed as
        ``mapping[protein][experiment]``.
    """
    reset_path()

    data, data_full, data_hooke, data_stats = {}, {}, {}, {}
    stores = (data, data_full, data_hooke, data_stats)

    for run in runs:
        os.chdir(run)
        os.chdir(find_experiment_folder(run))

        # Protein folders are recognised by their 4-character names.
        protein_folders = [
            entry for entry in os.listdir()
            if len(entry) == 4 and os.path.isdir(entry)
        ]

        for pf in protein_folders:
            if pf in protein_blacklist:
                continue

            os.chdir(pf)

            # The four mappings are always keyed in lockstep.
            for store in stores:
                store.setdefault(pf, {})

            for exp in os.listdir():
                os.chdir(exp)

                data[pf][exp] = load_repack_data()
                data_full[pf][exp] = load_full_repack_data(data)
                data_hooke[pf][exp] = load_hooke_jeeves_data()
                data_stats[pf][exp] = load_stats_data()

                os.chdir('..')

            os.chdir('..')

        # Leave the experiment folder, then the run folder.
        os.chdir('..')
        os.chdir('..')

    for store in stores:
        remove_not_common_methods(store)

    return data, data_full, data_hooke, data_stats
    
    
# Load everything once; later cells read these module-level mappings.
data, data_full, data_hooke, data_stats = load_data()
print('finished')

INFO: Parsing     500 repack.dat files
INFO: Parsing     500 repack.dat files
INFO: Parsing     500 repack.dat files
INFO: Parsing     500 repack.dat files
INFO: Parsing     448 repack.dat files
INFO: Parsing     448 repack.dat files
INFO: Parsing     448 repack.dat files
INFO: Parsing     448 repack.dat files
INFO: Parsing     500 repack.dat files
INFO: Parsing     500 repack.dat files
INFO: Parsing     500 repack.dat files
INFO: Parsing     500 repack.dat files
INFO: Parsing     430 repack.dat files
INFO: Parsing     430 repack.dat files
INFO: Parsing     430 repack.dat files
INFO: Parsing     430 repack.dat files
INFO: Parsing     501 repack.dat files
INFO: Parsing     501 repack.dat files
INFO: Parsing     501 repack.dat files
INFO: Parsing     501 repack.dat files
INFO: Parsing     438 repack.dat files
INFO: Parsing     438 repack.dat files
INFO: Parsing     438 repack.dat files
INFO: Parsing     438 repack.dat files
INFO: Parsing     500 repack.dat files
INFO: Parsing     500 rep

INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 repack.dat files
INFO: Parsing      50 rep

In [6]:
def get_metric(data, protein, experiment, metric):
    """Return ``metric`` from every record stored under ``data[protein][experiment]``."""
    return [record[metric] for record in data[protein][experiment]]

In [7]:
def custom_metric_table(wanted_experiments=('classic-abinitio', 'sade_mc_final', 'sade_remc_final')):
    """Print one LaTeX table row per protein with the RMSD change per experiment.

    For every protein of the module-level ``data`` (sorted, blacklist skipped)
    and every experiment in ``wanted_experiments``, the element-wise difference
    ``rmsd_before - rmsd_after`` is computed and its mean and standard
    deviation are emitted as two ``$...$`` cells.

    Args:
        wanted_experiments: experiment names, in column order.
    """
    for protein in sorted(data):
        if protein in protein_blacklist:
            continue

        cells = ['%s' % protein]
        for experiment in wanted_experiments:
            rmsd_before = get_metric(data, protein, experiment, 'rmsd_before')
            rmsd_after = get_metric(data, protein, experiment, 'rmsd_after')

            # Positive values mean the RMSD went down between the two readings.
            diff = [before - after for before, after in zip(rmsd_before, rmsd_after)]

            cells.append("$%0.4f$ & $%0.4f$" % (np.mean(diff), np.std(diff)))

        print(' & '.join(cells) + ' \\\\ \\hline')
    
# Emit the rows for the default set of experiments.
custom_metric_table()

1acw & $-0.1656$ & $0.3372$ & $0.2616$ & $0.4944$ & $0.2589$ & $0.5765$ \\ \hline
1ail & $-0.1113$ & $0.4629$ & $0.2470$ & $0.7731$ & $0.2379$ & $0.7278$ \\ \hline
1crn & $-0.0620$ & $0.4203$ & $-0.0297$ & $0.5865$ & $-0.1128$ & $0.4368$ \\ \hline
1enh & $-0.0558$ & $0.4854$ & $-0.1277$ & $0.5577$ & $-0.1380$ & $0.6081$ \\ \hline
1l2y & $0.2505$ & $0.3773$ & $0.3856$ & $0.5036$ & $0.6653$ & $0.7176$ \\ \hline
1rop & $0.8336$ & $0.4484$ & $0.7814$ & $0.9170$ & $1.2214$ & $1.3530$ \\ \hline
1utg & $-0.3506$ & $0.4712$ & $0.1346$ & $0.5762$ & $0.1793$ & $0.7202$ \\ \hline
1wqc & $-0.2543$ & $0.4381$ & $0.1729$ & $0.5624$ & $0.2160$ & $0.6377$ \\ \hline
1zdd & $0.2283$ & $0.4144$ & $0.7429$ & $0.6098$ & $0.7470$ & $0.9897$ \\ \hline
2mr9 & $-0.2667$ & $0.6094$ & $0.2190$ & $0.5679$ & $0.1875$ & $0.6957$ \\ \hline


In [9]:
def apply_mann_whitney(data, method_a, method_b, protein):
    """Run a Mann-Whitney U test between the raw samples of two methods.

    Args:
        data: mapping read as ``data[method]['data']['raw']`` (the samples)
            and ``data[method]['data']['mean']``.
        method_a: key of the first method.
        method_b: key of the second method.
        protein: unused; kept so existing call sites keep working.

    Returns:
        ``(p, mean_a, mean_b)``: the test's p-value and the two stored means.
    """
    # A bare `import scipy` does not guarantee that the `stats` submodule is
    # loaded; import it explicitly instead of relying on another package
    # having imported it first.
    import scipy.stats

    entry_a = data[method_a]['data']
    entry_b = data[method_b]['data']

    _, p = scipy.stats.mannwhitneyu(entry_a['raw'], entry_b['raw'])

    return p, entry_a['mean'], entry_b['mean']


def mann_whitney_summary(alpha=0.05, mode='best_by_energy', metric='score', ref=(), ignore=(), skip_all_ref=False):
    """Print a LaTeX table of per-protein Mann-Whitney wins, losses and draws.

    The summary is built from the module-level ``alldata`` through
    ``data_utils.experiment_summary``. For every protein, each method listed
    in ``ref`` is compared once against each other method:

    * ``p > alpha``          -> draw
    * reference mean lower   -> win
    * otherwise              -> loss

    Args:
        alpha: significance threshold for the p-value.
        mode: forwarded to ``data_utils.experiment_summary``.
        metric: forwarded to ``data_utils.experiment_summary``; must be one of
            ``scorefxn``, ``rmsd_after`` or ``score`` for the caption.
        ref: methods treated as the reference side of each comparison.
        ignore: methods excluded from every comparison.
        skip_all_ref: when true, pairs made of two reference methods are
            skipped.

    Raises:
        KeyError: ``metric`` has no caption name (raised after the rows are
            printed, when the caption is built).
    """
    summary = data_utils.experiment_summary(alldata, mode=mode, metric=metric, with_raw=True)
    proteins = sorted(summary.keys())
    methods = sorted(summary[proteins[0]].keys())

    print('\\begin{table}')
    print('\\centering')
    print('\\begin{tabular}{r|r|r|r}')
    print('%s \\\\ \\hline \\hline' % ('Protein & Wins & Loses & Draws'))

    # Unordered (protein, method, method) pairs that were already handled.
    done = set()

    for protein in proteins:
        proposed_wins = 0
        proposed_loses = 0
        proposed_draws = 0

        for method_a in methods:
            if method_a not in ref:
                continue

            for method_b in methods:
                # A method tested against itself is never significant and
                # would be tallied as a spurious draw.
                if method_a == method_b:
                    continue

                key = ','.join(sorted([protein, method_a, method_b]))
                if key in done:
                    continue
                done.add(key)

                if method_a in ignore or method_b in ignore:
                    continue

                # method_a is a reference method at this point.
                if skip_all_ref and method_b in ref:
                    continue

                p, mean_a, mean_b = apply_mann_whitney(summary[protein], method_a, method_b, protein)

                if p > alpha:
                    proposed_draws += 1
                elif mean_a < mean_b:
                    proposed_wins += 1
                else:
                    proposed_loses += 1

        print('%5s & %2s & %2d & %2d \\\\ \\hline' % (protein, proposed_wins, proposed_loses, proposed_draws))

    print('\\end{tabular}')
    metric_dict = { 'scorefxn': 'scorefxn', 'rmsd_after': 'RMSD', 'score': 'score3' }
    metric_string = metric_dict[metric]
    print('\\caption{Summary of Mann-Whitney \\texttt{%s} using %s}' % (mode.replace('_', '-'), metric_string))
    print('\\label{tab:mann-whitney-summary-%s-%s}' % (mode.replace('_', '-'), metric_string))
    print('\\end{table}')

# mann_whitney_summary() and kruskal_wallis_table() read the module-level `alldata`.
alldata = data_full
    
# Compare the two *_final methods against the remaining, non-ignored methods.
mann_whitney_summary(
    alpha=0.05,
    mode='best_by_energy',
    metric='score',
    ref=['sade_mc_final', 'sade_remc_final'],
    ignore=['sade_remc', 'sade_remc_ffi9_02', 'sade_mc_ffi9_02'],
    skip_all_ref=True
)

\begin{table}
\centering
\begin{tabular}{r|r|r|r}
Protein & Wins & Loses & Draws \\ \hline \hline
 1acw &  2 &  0 &  0 \\ \hline
 1ail &  0 &  0 &  2 \\ \hline
 1crn &  1 &  1 &  0 \\ \hline
 1enh &  1 &  0 &  1 \\ \hline
 1l2y &  2 &  0 &  0 \\ \hline
 1rop &  2 &  0 &  0 \\ \hline
 1utg &  2 &  0 &  0 \\ \hline
 1wqc &  2 &  0 &  0 \\ \hline
 1zdd &  2 &  0 &  0 \\ \hline
 2mr9 &  2 &  0 &  0 \\ \hline
\end{tabular}
\caption{Summary of Mann-Whitney \texttt{best-by-energy} using score3}
\label{tab:mann-whitney-summary-best-by-energy-score3}
\end{table}


In [12]:
def kruskal_wallis_table(alpha=0.05, mode='best_by_rmsd', metric='rmsd_after'):
    """Print a LaTeX table with one Kruskal-Wallis p-value per protein.

    The summary is built from the module-level ``alldata`` through
    ``data_utils.experiment_summary``; for every protein the raw samples of
    all methods are passed to ``scipy.stats.kruskal`` together. P-values
    below ``alpha`` are typeset in bold.

    Args:
        alpha: significance threshold for the p-value.
        mode: forwarded to ``data_utils.experiment_summary``.
        metric: forwarded to ``data_utils.experiment_summary``; must be one of
            ``scorefxn``, ``rmsd_after`` or ``score`` for the caption.

    Raises:
        KeyError: ``metric`` has no caption name (raised after the rows are
            printed, when the caption is built).
    """
    # A bare `import scipy` does not guarantee that the `stats` submodule is
    # loaded; import it explicitly instead of relying on another package
    # having imported it first.
    import scipy.stats

    summary = data_utils.experiment_summary(alldata, mode=mode, metric=metric, with_raw=True)
    proteins = sorted(summary.keys())
    methods = sorted(summary[proteins[0]].keys())

    print('\\begin{table}')
    print('\\centering')
    print('\\begin{tabular}{r|c}')
    print('%s \\\\ \\hline \\hline' % ('Protein & $p$-value'))

    for protein in proteins:
        print('%s & ' % protein, end='')

        samples = [summary[protein][method]['data']['raw'] for method in methods]
        _, p = scipy.stats.kruskal(*samples)

        if p < alpha:
            print('$\\bm{%4.4f}$' % p, end='')
        else:
            print('    $%4.4f$ ' % p, end='')

        print(' \\\\ \\hline')

    print('\\end{tabular}')
    metric_dict = { 'scorefxn': 'scorefxn', 'rmsd_after': 'RMSD', 'score': 'score3' }
    metric_string = metric_dict[metric]
    print('\\caption{Kruskal-Wallis for \\texttt{%s} using %s}' % (mode.replace('_', '-'), metric_string))
    print('\\label{tab:kruskal-wallis-wilk-%s-%s}' % (mode.replace('_', '-'), metric_string))
    print('\\end{table}')

In [13]:
# Overrides the function's defaults (best_by_rmsd / rmsd_after) with energy-based selection.
kruskal_wallis_table(alpha=0.05, mode='best_by_energy', metric='score')

\begin{table}
\centering
\begin{tabular}{r|c}
Protein & $p$-value \\ \hline \hline
1acw & $\bm{0.0000}$ \\ \hline
1ail & $\bm{0.0000}$ \\ \hline
1crn & $\bm{0.0000}$ \\ \hline
1enh & $\bm{0.0000}$ \\ \hline
1l2y & $\bm{0.0000}$ \\ \hline
1rop & $\bm{0.0000}$ \\ \hline
1utg & $\bm{0.0000}$ \\ \hline
1wqc & $\bm{0.0000}$ \\ \hline
1zdd & $\bm{0.0000}$ \\ \hline
2mr9 & $\bm{0.0000}$ \\ \hline
\end{tabular}
\caption{Kruskal-Wallis for \texttt{best-by-energy} using score3}
\label{tab:kruskal-wallis-wilk-best-by-energy-score3}
\end{table}
