# Data analysis

Experiment information:
- One million function evaluations
- **sade_remc**: is the best method from HM, but with more evals
- **sade_mc_final**: is sade + MC + ffi9 + rmsd crowding + spicker + hooke jeeves on cluster centroids
- **sade_remc_final**: is the same as above, but REMC instead of MC
- **sade_mc_ffi9_02**: is the HM method + forced fragment insertion of size 2, with a 0.02 chance of happening per individual per generation
- **sade_remc_ffi9_02**: same as above but with REMC instead of MC

In [1]:
import datetime
import string
import random
import pickle
import time
import sys
import os
import re

import data_utils

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
sns.set(style="whitegrid");

In [2]:
# NOTE(review): both paths are absolute and machine-specific; the notebook only
# runs unchanged on a machine that has this exact directory layout.
# Directory that reset_to_root_path() switches into.
root_path = '/home/h3nnn4n/progs/de_supimpa/tools/notebooks/analysis'
# Directory that reset_path() switches into.
base_path = '/home/h3nnn4n/progs/de_supimpa/src'


def reset_path():
    """Change the process working directory to ``base_path``."""
    os.chdir(base_path)
    
def reset_to_root_path():
    """Change the process working directory to ``root_path``."""
    os.chdir(root_path)

    
# Visit root_path first, then settle in base_path: the working directory ends
# up at base_path, and os.chdir raises if either directory does not exist.
reset_to_root_path()
reset_path()

In [3]:
# Names of the experiment runs that are handed to data_utils.load_all_data.
runs = [
    'de_experiment_final',
    'de_sade_remc',
    'de_rosetta',
    'de_ffi',
    'de_experiment_final_8_prot',
    'de_final_1rop_1wqc_1lwy',
    'de_rosetta_all_prots',
    'de_other_experiments_all_prots',
    'de_missing_4_base_prots_runs',
    'de_mc_de-mc_de-remc_4prot',
]

In [4]:
# Protein identifiers passed to merge_data as its protein_blacklist argument.
protein_blacklist = ['1ab1', '1dfn', '2P5K', '2pmr', '3V1A']

# Load every run named in `runs`, then merge the result into one structure.
# presumably alldata maps protein -> method -> results, since the helpers in
# this cell index it as data[protein][method] -- confirm against data_utils.
dataset = data_utils.load_all_data(runs)
alldata = data_utils.merge_data(dataset, protein_blacklist=protein_blacklist)


def filter_methods(data, allowed=None):
    """Remove, in place, every method of every protein that is not in ``allowed``.

    Args:
        data: mapping of protein -> mapping of method name -> results. The
            inner mappings are modified in place.
        allowed: container of method names to keep. ``None`` (the default)
            keeps nothing, exactly like the previous ``[]`` default did.

    Returns:
        The same ``data`` object that was passed in.
    """
    # None sentinel instead of a mutable [] default argument.
    if allowed is None:
        allowed = ()

    for methods in data.values():
        # Iterate over a snapshot of the keys: the dict shrinks while we loop.
        for method in list(methods):
            if method not in allowed:
                del methods[method]

    return data


def rename_methods(data, renamer=None):
    """Rename, in place, the method keys of every protein according to ``renamer``.

    All renames for one protein are applied simultaneously, so a mapping whose
    targets are themselves keys being renamed (a swap such as
    ``{'a': 'b', 'b': 'a'}``, or a chain) no longer loses entries. A rename
    onto an existing key that is not itself renamed still overwrites that key.

    Args:
        data: mapping of protein -> mapping of method name -> results. The
            inner mappings are modified in place.
        renamer: mapping of old method name -> new method name. ``None`` or an
            empty mapping leaves ``data`` untouched.

    Returns:
        The same ``data`` object that was passed in, consistent with
        ``filter_methods``.
    """
    if not renamer:
        return data

    for methods in data.values():
        # Pop every entry that has to move before re-inserting any of them;
        # otherwise an early insert could overwrite an entry still waiting to
        # be renamed. sorted() builds a list first, so popping here is safe.
        moved = [
            (renamer[method], methods.pop(method))
            for method in sorted(methods)
            if method in renamer
        ]
        for new_name, results in moved:
            methods[new_name] = results

    return data


# Keep only the four listed methods in alldata, then rename three of them.
filter_methods(alldata, allowed=['classic-abinitio', 'sade_remc_final', 'sade_mc_final', 'sade_remc'])
rename_methods(alldata, renamer={'sade_remc_final': 'rppf-remc', 'sade_mc_final': 'rppf-mc', 'sade_remc': 'sade-remc'})
print('Finished')

# filter_methods(alldata, allowed=['sade_remc_final', 'sade_mc_final'])
# rename_methods(alldata, renamer={'sade_remc_final': 'ppf-remc', 'sade_mc_final': 'ppf-mc', 'sade_remc': 'sade-remc'})
# print('Finished')

INFO: Loaded 10 experiment runs dataset
removed 5 proteins. Blacklist had 5
Finished


In [5]:
def pretty_print_experiment_summary(experiment_summary):
    """Print min/mean/std for every method of every protein.

    Proteins are visited in sorted order; each one is introduced by a blank
    line and its upper-cased name, followed by one row per method (also in
    sorted order).

    Args:
        experiment_summary: mapping of protein -> mapping of method name ->
            entry, where ``entry['data']`` holds 'min', 'mean' and 'std'.
    """
    row_template = '%25s    min: %8.2f  mean: %8.2f  std: %8.2f'

    for protein_id in sorted(experiment_summary.keys()):
        print()
        print(protein_id.upper())

        per_method = experiment_summary[protein_id]
        for method_name in sorted(per_method.keys()):
            stats = per_method[method_name]['data']
            print(row_template % (
                method_name, stats['min'], stats['mean'], stats['std']
            ))

## Experiment Summary with *Best by Energy*

### RMSD data

In [6]:
# Summarise the 'rmsd_after' metric with mode='best_by_energy' and print it.
# presumably this mode picks each run's lowest-energy result -- confirm in data_utils.
experiment_summary_rmsd = data_utils.experiment_summary(alldata, mode='best_by_energy', metric='rmsd_after')
pretty_print_experiment_summary(experiment_summary_rmsd)


1ACW
         classic-abinitio    min:     5.85  mean:     7.48  std:     0.57
                  rppf-mc    min:     5.27  mean:     7.30  std:     0.91
                rppf-remc    min:     5.53  mean:     7.36  std:     1.01
                sade-remc    min:     5.41  mean:     6.89  std:     0.64

1AIL
         classic-abinitio    min:     4.75  mean:     7.95  std:     1.56
                  rppf-mc    min:     5.69  mean:    10.04  std:     2.02
                rppf-remc    min:     5.31  mean:     9.78  std:     3.22
                sade-remc    min:     4.26  mean:     8.57  std:     1.42

1CRN
         classic-abinitio    min:     4.30  mean:     6.57  std:     1.57
                  rppf-mc    min:     5.54  mean:     8.62  std:     1.31
                rppf-remc    min:     4.14  mean:     8.34  std:     1.67
                sade-remc    min:     4.27  mean:     8.74  std:     1.55

1ENH
         classic-abinitio    min:     2.84  mean:     6.43  std:     1.28
              

### Energy data

In [7]:
# Summarise the 'scorefxn' metric with mode='best_by_energy' and print it.
experiment_summary_scorefxn = data_utils.experiment_summary(alldata, mode='best_by_energy', metric='scorefxn')
pretty_print_experiment_summary(experiment_summary_scorefxn)


1ACW
         classic-abinitio    min:   -52.05  mean:   -39.61  std:     5.32
                  rppf-mc    min:   -56.71  mean:   -39.81  std:     5.51
                rppf-remc    min:   -52.66  mean:   -36.85  std:     7.27
                sade-remc    min:   -52.16  mean:   -39.48  std:     5.89

1AIL
         classic-abinitio    min:  -172.76  mean:  -131.39  std:    19.03
                  rppf-mc    min:  -164.40  mean:  -137.77  std:    10.67
                rppf-remc    min:  -166.29  mean:  -133.64  std:    11.20
                sade-remc    min:  -167.59  mean:  -124.49  std:    16.50

1CRN
         classic-abinitio    min:   -79.64  mean:   -60.92  std:     8.73
                  rppf-mc    min:   -81.41  mean:   -61.68  std:     6.97
                rppf-remc    min:   -84.79  mean:   -61.76  std:     7.55
                sade-remc    min:   -75.38  mean:   -54.99  std:    12.29

1ENH
         classic-abinitio    min:  -138.48  mean:  -104.29  std:    20.83
              

## Experiment Summary with *Best by RMSD* 

### RMSD data

In [8]:
# Summarise the 'rmsd_after' metric with mode='best_by_rmsd' and print it.
# NOTE(review): despite its name, experiment_summary_scorefxn holds RMSD data
# here (metric='rmsd_after'); the name is reused from the energy cells.
experiment_summary_scorefxn = data_utils.experiment_summary(alldata, mode='best_by_rmsd', metric='rmsd_after')
pretty_print_experiment_summary(experiment_summary_scorefxn)


1ACW
         classic-abinitio    min:     5.85  mean:     7.48  std:     0.57
                  rppf-mc    min:     4.45  mean:     5.86  std:     0.55
                rppf-remc    min:     4.40  mean:     5.79  std:     0.63
                sade-remc    min:     5.41  mean:     6.89  std:     0.64

1AIL
         classic-abinitio    min:     4.75  mean:     7.95  std:     1.56
                  rppf-mc    min:     4.92  mean:     7.74  std:     1.20
                rppf-remc    min:     4.11  mean:     7.42  std:     1.21
                sade-remc    min:     4.26  mean:     8.57  std:     1.42

1CRN
         classic-abinitio    min:     4.30  mean:     6.57  std:     1.57
                  rppf-mc    min:     4.18  mean:     6.73  std:     0.95
                rppf-remc    min:     4.05  mean:     6.53  std:     0.91
                sade-remc    min:     4.27  mean:     8.74  std:     1.55

1ENH
         classic-abinitio    min:     2.84  mean:     6.43  std:     1.28
              

### Energy data

In [9]:
# Summarise the 'scorefxn' metric with mode='best_by_rmsd' and print it.
experiment_summary_scorefxn = data_utils.experiment_summary(alldata, mode='best_by_rmsd', metric='scorefxn')
pretty_print_experiment_summary(experiment_summary_scorefxn)


1ACW
         classic-abinitio    min:   -52.05  mean:   -39.61  std:     5.32
                  rppf-mc    min:   -48.45  mean:   -17.51  std:    29.83
                rppf-remc    min:   -48.66  mean:    -8.08  std:    25.55
                sade-remc    min:   -52.16  mean:   -39.48  std:     5.89

1AIL
         classic-abinitio    min:  -172.76  mean:  -131.39  std:    19.03
                  rppf-mc    min:  -155.02  mean:  -111.78  std:    26.82
                rppf-remc    min:  -166.29  mean:  -107.67  std:    33.95
                sade-remc    min:  -167.59  mean:  -124.49  std:    16.50

1CRN
         classic-abinitio    min:   -79.64  mean:   -60.92  std:     8.73
                  rppf-mc    min:   -70.05  mean:   -39.16  std:    22.74
                rppf-remc    min:   -84.79  mean:   -40.16  std:    29.30
                sade-remc    min:   -75.38  mean:   -54.99  std:    12.29

1ENH
         classic-abinitio    min:  -138.48  mean:  -104.29  std:    20.83
              

# LaTeX summary table

In [10]:
def get_entry(data):
    """Format one summary entry as ``min (mean +/- std)`` LaTeX math text.

    The plus-minus sign is emitted as the LaTeX ``pm`` macro, so the result is
    meant to be placed inside ``$...$``.

    Args:
        data: mapping whose 'data' key holds numeric 'min', 'mean' and 'std'.

    Returns:
        The formatted string; min uses two decimals, mean and std use two
        decimals padded to a width of six.
    """
    stats = data['data']

    # Raw string: '\p' is not a valid escape sequence, so the plain literal
    # triggers an invalid-escape warning on current Python versions. The
    # resulting text is unchanged.
    return r'%.2f (%6.2f \pm %6.2f)' % (
        stats['min'], stats['mean'], stats['std']
    )
        
def tabularize(mode='best_by_rmsd'):
    """Print a LaTeX table with one row per (protein, method) pair.

    Each row shows the protein, the method name (underscores replaced by
    dashes), the RMSD entry and the energy entry, both formatted by
    ``get_entry``. The RMSD column always uses mode='best_by_rmsd' and the
    energy column always uses mode='best_by_energy'.

    Args:
        mode: currently unused; both summaries use the hard-coded modes above.
    """
    rmsd_by_protein = data_utils.experiment_summary(alldata, mode='best_by_rmsd', metric='rmsd_after')
    energy_by_protein = data_utils.experiment_summary(alldata, mode='best_by_energy', metric='scorefxn')
    protein_ids = sorted(rmsd_by_protein.keys())

    preamble = ('\\begin{table}', '\\centering', '\\begin{tabular}{r|r|c|c}')
    closing = ('\\end{tabular}', '\\caption{Caption}', '\\label{label}', '\\end{table}')
    row_template = '%s & %18s & $%s$ & $%s$ \\\\ \\hline'

    for line in preamble:
        print(line)

    for protein_id in protein_ids:
        # Extra rule separating one protein's rows from the previous ones.
        print('\\hline')

        # Methods are taken from the RMSD summary; the energy summary is
        # indexed with the same protein/method keys.
        for method_name in sorted(rmsd_by_protein[protein_id].keys()):
            display_name = method_name.replace('_', '-')
            rmsd_cell = get_entry(rmsd_by_protein[protein_id][method_name])
            energy_cell = get_entry(energy_by_protein[protein_id][method_name])
            print(row_template % (protein_id, display_name, rmsd_cell, energy_cell))

    for line in closing:
        print(line)

In [11]:
# Print the LaTeX table for the proteins and methods currently in alldata.
tabularize()

\begin{table}
\centering
\begin{tabular}{r|r|c|c}
\hline
1acw &   classic-abinitio & $5.85 (  7.48 \pm   0.57)$ & $-52.05 (-39.61 \pm   5.32)$ \\ \hline
1acw &            rppf-mc & $4.45 (  5.86 \pm   0.55)$ & $-56.71 (-39.81 \pm   5.51)$ \\ \hline
1acw &          rppf-remc & $4.40 (  5.79 \pm   0.63)$ & $-52.66 (-36.85 \pm   7.27)$ \\ \hline
1acw &          sade-remc & $5.41 (  6.89 \pm   0.64)$ & $-52.16 (-39.48 \pm   5.89)$ \\ \hline
\hline
1ail &   classic-abinitio & $4.75 (  7.95 \pm   1.56)$ & $-172.76 (-131.39 \pm  19.03)$ \\ \hline
1ail &            rppf-mc & $4.92 (  7.74 \pm   1.20)$ & $-164.40 (-137.77 \pm  10.67)$ \\ \hline
1ail &          rppf-remc & $4.11 (  7.42 \pm   1.21)$ & $-166.29 (-133.64 \pm  11.20)$ \\ \hline
1ail &          sade-remc & $4.26 (  8.57 \pm   1.42)$ & $-167.59 (-124.49 \pm  16.50)$ \\ \hline
\hline
1crn &   classic-abinitio & $4.30 (  6.57 \pm   1.57)$ & $-79.64 (-60.92 \pm   8.73)$ \\ \hline
1crn &            rppf-mc & $4.18 (  6.73 \pm   0.95)$ & 