# Data analysis

Experiment information:
- One million function evaluations
- **sade_remc**: is the best method from HM, but with more evals
- **sade_mc_final**: is sade + MC + ffi9 + rmsd crowding + spicker + hooke jeeves on cluster centroids
- **sade_remc_final**: is the same as above, but REMC instead of MC
- **sade_mc_ffi9_02**: is the HM method + forced fragment insertion of size 2, with a 0.02 chance of happening per individual per generation
- **sade_remc_ffi9_02**: same as above but with REMC instead of MC

In [2]:
import datetime
import string
import random
import pickle
import time
import sys
import os
import re

import data_utils

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's "whitegrid" style to subsequent plots.
sns.set(style="whitegrid");

In [3]:
# Absolute directories this notebook switches between.
# NOTE(review): both paths are machine-specific; adjust them before running elsewhere.
root_path = '/home/h3nnn4n/progs/de_supimpa/tools/notebooks/analysis'
base_path = '/home/h3nnn4n/progs/de_supimpa/src'


def reset_path():
    """Change the process working directory to ``base_path``."""
    os.chdir(base_path)
    
def reset_to_root_path():
    """Change the process working directory to ``root_path``."""
    os.chdir(root_path)

    
# Visit root_path first, then base_path; the working directory left behind
# by this cell is base_path.
reset_to_root_path()
reset_path()

In [4]:
# Names of the experiment runs handed to data_utils.load_all_data.
runs = [
    'de_experiment_final',
    'de_sade_remc',
    'de_rosetta',
    'de_ffi',
    'de_experiment_final_8_prot',
    'de_final_1rop_1wqc_1lwy',
    'de_rosetta_all_prots',
    'de_other_experiments_all_prots',
]

In [23]:
# Protein identifiers passed to merge_data as `protein_blacklist`.
protein_blacklist = ['1ab1', '1dfn', '2P5K', '2pmr', '3V1A']

# Load every run listed in `runs`, then merge the result into `alldata`.
# NOTE(review): presumably the blacklisted proteins are dropped during the
# merge and keep_only_common_methods=False keeps methods that are missing
# from some runs -- confirm against data_utils.
dataset = data_utils.load_all_data(runs)
alldata = data_utils.merge_data(dataset, protein_blacklist=protein_blacklist, keep_only_common_methods=False)

INFO: Loaded 8 experiment runs dataset
removed 5 proteins. Blacklist had 5


In [6]:
def pretty_print_experiment_summary(experiment_summary):
    """Print the min / mean / std of every method, grouped by protein.

    ``experiment_summary`` maps a protein name to a mapping from method
    name to an entry whose ``'data'`` item provides ``'min'``, ``'mean'``
    and ``'std'``.  Proteins and methods are both visited in sorted order;
    each protein is introduced by a blank line followed by its upper-cased
    name, and each method gets one formatted row.
    """
    row_format = '%25s    min: %8.2f  mean: %8.2f  std: %8.2f'

    for protein_name in sorted(experiment_summary):
        # Blank separator line, then the protein header.
        print()
        print(protein_name.upper())

        per_method = experiment_summary[protein_name]
        for method_name in sorted(per_method):
            stats = per_method[method_name]['data']
            print(row_format % (method_name, stats['min'], stats['mean'], stats['std']))

## Experiment Summary with *Best by Energy*

### RMSD data

In [7]:
# Summarise the 'rmsd_after' metric under the 'best_by_energy' mode and print it.
experiment_summary_rmsd = data_utils.experiment_summary(alldata, mode='best_by_energy', metric='rmsd_after')
pretty_print_experiment_summary(experiment_summary_rmsd)


1AB1
            sade_mc_final    min:     5.02  mean:     8.58  std:     1.70
          sade_remc_final    min:     3.65  mean:     7.81  std:     1.82

1ACW
         classic-abinitio    min:     5.85  mean:     7.48  std:     0.57
               sade_de_mc    min:     5.18  mean:     6.84  std:     0.74
             sade_de_remc    min:     5.42  mean:     7.02  std:     0.80
                  sade_mc    min:     4.85  mean:     6.90  std:     0.66
          sade_mc_ffi9_02    min:     5.05  mean:     7.28  std:     0.79
            sade_mc_final    min:     5.27  mean:     7.30  std:     0.91
                sade_remc    min:     5.41  mean:     6.89  std:     0.64
        sade_remc_ffi9_02    min:     5.74  mean:     6.91  std:     0.68
          sade_remc_final    min:     5.53  mean:     7.36  std:     1.01

1AIL
         classic-abinitio    min:     4.75  mean:     7.95  std:     1.56
          sade_mc_ffi9_02    min:     6.37  mean:     8.54  std:     1.15
            sade_mc_

### Energy data

In [8]:
# Summarise the 'scorefxn' metric under the 'best_by_energy' mode and print it.
experiment_summary_scorefxn = data_utils.experiment_summary(alldata, mode='best_by_energy', metric='scorefxn')
pretty_print_experiment_summary(experiment_summary_scorefxn)


1AB1
            sade_mc_final    min:   -80.00  mean:   -64.31  std:     7.48
          sade_remc_final    min:   -83.50  mean:   -65.42  std:     7.70

1ACW
         classic-abinitio    min:   -52.05  mean:   -39.61  std:     5.32
               sade_de_mc    min:   -48.59  mean:   -35.52  std:     8.67
             sade_de_remc    min:   -49.61  mean:   -30.66  std:    12.08
                  sade_mc    min:   -51.12  mean:   -39.01  std:     6.15
          sade_mc_ffi9_02    min:   -48.77  mean:   -40.92  std:     4.44
            sade_mc_final    min:   -56.71  mean:   -39.81  std:     5.51
                sade_remc    min:   -52.16  mean:   -39.48  std:     5.89
        sade_remc_ffi9_02    min:   -47.83  mean:   -39.18  std:     4.80
          sade_remc_final    min:   -52.66  mean:   -36.85  std:     7.27

1AIL
         classic-abinitio    min:  -172.76  mean:  -131.39  std:    19.03
          sade_mc_ffi9_02    min:  -153.65  mean:  -121.99  std:    19.02
            sade_mc_

## Experiment Summary with *Best by RMSD* 

### RMSD data

In [9]:
# Summarise the 'rmsd_after' metric under the 'best_by_rmsd' mode and print it.
# Stored under its own name: this is RMSD data, so it should neither be called
# "scorefxn" nor overwrite the energy summary held by another cell.
experiment_summary_rmsd_best_by_rmsd = data_utils.experiment_summary(alldata, mode='best_by_rmsd', metric='rmsd_after')
pretty_print_experiment_summary(experiment_summary_rmsd_best_by_rmsd)


1AB1
            sade_mc_final    min:     3.89  mean:     6.51  std:     1.15
          sade_remc_final    min:     3.65  mean:     6.24  std:     1.04

1ACW
         classic-abinitio    min:     5.85  mean:     7.48  std:     0.57
               sade_de_mc    min:     5.18  mean:     6.84  std:     0.74
             sade_de_remc    min:     5.42  mean:     7.02  std:     0.80
                  sade_mc    min:     4.85  mean:     6.90  std:     0.66
          sade_mc_ffi9_02    min:     5.05  mean:     7.28  std:     0.79
            sade_mc_final    min:     4.45  mean:     5.86  std:     0.55
                sade_remc    min:     5.41  mean:     6.89  std:     0.64
        sade_remc_ffi9_02    min:     5.74  mean:     6.91  std:     0.68
          sade_remc_final    min:     4.40  mean:     5.79  std:     0.63

1AIL
         classic-abinitio    min:     4.75  mean:     7.95  std:     1.56
          sade_mc_ffi9_02    min:     6.37  mean:     8.54  std:     1.15
            sade_mc_

### Energy data

In [10]:
# Summarise the 'scorefxn' metric under the 'best_by_rmsd' mode and print it.
# NOTE(review): this rebinds experiment_summary_scorefxn, which an earlier cell
# assigned with mode='best_by_energy'; the value seen later depends on which
# cell ran last.
experiment_summary_scorefxn = data_utils.experiment_summary(alldata, mode='best_by_rmsd', metric='scorefxn')
pretty_print_experiment_summary(experiment_summary_scorefxn)


1AB1
            sade_mc_final    min:   -78.68  mean:   -42.66  std:    21.70
          sade_remc_final    min:   -82.96  mean:   -47.75  std:    22.29

1ACW
         classic-abinitio    min:   -52.05  mean:   -39.61  std:     5.32
               sade_de_mc    min:   -48.59  mean:   -35.52  std:     8.67
             sade_de_remc    min:   -49.61  mean:   -30.66  std:    12.08
                  sade_mc    min:   -51.12  mean:   -39.01  std:     6.15
          sade_mc_ffi9_02    min:   -48.77  mean:   -40.92  std:     4.44
            sade_mc_final    min:   -48.45  mean:   -17.51  std:    29.83
                sade_remc    min:   -52.16  mean:   -39.48  std:     5.89
        sade_remc_ffi9_02    min:   -47.83  mean:   -39.18  std:     4.80
          sade_remc_final    min:   -48.66  mean:    -8.08  std:    25.55

1AIL
         classic-abinitio    min:  -172.76  mean:  -131.39  std:    19.03
          sade_mc_ffi9_02    min:  -153.65  mean:  -121.99  std:    19.02
            sade_mc_

In [54]:
def get_entry(data):
    """Format one summary entry as ``min (mean +/- std)`` for a LaTeX math cell.

    Args:
        data: Mapping whose ``'data'`` item provides ``'min'``, ``'mean'``
            and ``'std'`` numeric values.

    Returns:
        A string with ``min`` at two decimals followed, in parentheses, by
        ``mean`` and ``std`` (each six wide, two decimals) separated by the
        LaTeX plus-minus command.
    """
    stats = data['data']

    # Raw string: the LaTeX \pm command needs a literal backslash, and "\p" in
    # a normal string literal is an invalid escape sequence that newer Python
    # versions warn about.
    return r'%.2f (%6.2f \pm %6.2f)' % (
        stats['min'], stats['mean'], stats['std']
    )
        
def tabularize(mode='best_by_rmsd'):
    """Print a LaTeX table with one row per (protein, method) pair.

    Every row holds the protein, the method name (underscores replaced by
    hyphens), the 'rmsd_after' entry taken under the 'best_by_rmsd' mode and
    the 'scorefxn' entry taken under the 'best_by_energy' mode, both
    formatted by ``get_entry``.  Proteins and methods are listed in sorted
    order, as found in the RMSD summary.

    NOTE(review): ``mode`` is accepted but never read; the two modes named
    above are fixed in the body.
    """
    rmsd_by_protein = data_utils.experiment_summary(alldata, mode='best_by_rmsd', metric='rmsd_after')
    energy_by_protein = data_utils.experiment_summary(alldata, mode='best_by_energy', metric='scorefxn')
    protein_names = sorted(rmsd_by_protein)

    row_template = '%s & %18s & $%s$ & $%s$ \\\\ \\hline'

    # Table preamble.
    for header_line in ('\\begin{table}', '\\centering', '\\begin{tabular}{r|r|c|c}'):
        print(header_line)

    for protein in protein_names:
        # Extra rule separating one protein's rows from the previous group.
        print('\\hline')

        for method in sorted(rmsd_by_protein[protein]):
            method_label = method.replace('_', '-')
            rmsd_cell = get_entry(rmsd_by_protein[protein][method])
            energy_cell = get_entry(energy_by_protein[protein][method])
            print(row_template % (protein, method_label, rmsd_cell, energy_cell))

    # Table epilogue; caption and label are placeholders.
    for footer_line in ('\\end{tabular}', '\\caption{Caption}', '\\label{label}', '\\end{table}'):
        print(footer_line)

In [55]:
# Print the LaTeX table to the cell output.
tabularize()

\begin{table}
\centering
\begin{tabular}{r|r|c|c}
\hline
1acw &   classic-abinitio & $5.85 (  7.48 \pm   0.57)$ & $-52.05 (-39.61 \pm   5.32)$ \\ \hline
1acw &         sade-de-mc & $5.18 (  6.84 \pm   0.74)$ & $-48.59 (-35.52 \pm   8.67)$ \\ \hline
1acw &       sade-de-remc & $5.42 (  7.02 \pm   0.80)$ & $-49.61 (-30.66 \pm  12.08)$ \\ \hline
1acw &            sade-mc & $4.85 (  6.90 \pm   0.66)$ & $-51.12 (-39.01 \pm   6.15)$ \\ \hline
1acw &    sade-mc-ffi9-02 & $5.05 (  7.28 \pm   0.79)$ & $-48.77 (-40.92 \pm   4.44)$ \\ \hline
1acw &      sade-mc-final & $4.45 (  5.86 \pm   0.55)$ & $-56.71 (-39.81 \pm   5.51)$ \\ \hline
1acw &          sade-remc & $5.41 (  6.89 \pm   0.64)$ & $-52.16 (-39.48 \pm   5.89)$ \\ \hline
1acw &  sade-remc-ffi9-02 & $5.74 (  6.91 \pm   0.68)$ & $-47.83 (-39.18 \pm   4.80)$ \\ \hline
1acw &    sade-remc-final & $4.40 (  5.79 \pm   0.63)$ & $-52.66 (-36.85 \pm   7.27)$ \\ \hline
\hline
1ail &   classic-abinitio & $4.75 (  7.95 \pm   1.56)$ & $-172.76 (-131.