# Data analysis

Experiment information:
- One million function evaluations
- **sade_remc**: is the best method from HM, but with more evals
- **sade_mc_final**: is sade + MC + ffi9 + rmsd crowding + spicker + hooke jeeves on cluster centroids
- **sade_remc_final**: is the same as above, but REMC instead of MC
- **sade_mc_ffi9_02**: is HM method + forced fragment insertion of size 2 with 0.02 chance of happening per individal per generation
- **sade_remc_ffi9_02**: same as above but with REMC instead of MC

In [1]:
import datetime
import string
import random
import pickle
import time
import sys
import os
import re

import data_utils

import scipy
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
sns.set(style="whitegrid");

In [2]:
root_path = '/home/h3nnn4n/progs/de_supimpa/tools/notebooks/analysis'
base_path = '/home/h3nnn4n/progs/de_supimpa/src'


def reset_path():
    os.chdir(base_path)
    
def reset_to_root_path():
    os.chdir(root_path)

    
reset_to_root_path()
reset_path()

In [3]:
runs = [
    'de_experiment_final',
    'de_sade_remc',
    'de_rosetta',
    'de_ffi',
    'de_experiment_final_8_prot',
    'de_final_1rop_1wqc_1lwy',
    'de_rosetta_all_prots',
    'de_other_experiments_all_prots',
    'de_missing_4_base_prots_runs',
    'de_mc_de-mc_de-remc_4prot',
]

In [4]:
protein_blacklist = ['1ab1', '1dfn', '2P5K', '2pmr', '3V1A']

dataset = data_utils.load_all_data(runs)
alldata = data_utils.merge_data(dataset, protein_blacklist=protein_blacklist)


def filter_methods(data, allowed=[]):
    proteins = sorted(list(data.keys()))
    
    for protein in proteins:
        methods = sorted(list(data[protein].keys()))
        
        for method in methods:
            if method not in allowed:
                data[protein].pop(method)
#                 print('[INFO] Deleting %s' % method)
        
    return data


def rename_methods(data, renamer={}):
    proteins = sorted(list(data.keys()))

    for protein in proteins:
        methods = sorted(list(data[protein].keys()))
        
        for method in methods:
            if method in renamer.keys():
                tmp = data[protein].pop(method)
                data[protein][renamer[method]] = tmp

                
filter_methods(alldata, allowed=['sade_remc_final', 'sade_mc_final'])
rename_methods(alldata, renamer={'sade_remc_final': 'ppf-remc', 'sade_mc_final': 'ppf-mc', 'sade_remc': 'sade-remc'})
print('Finished')

INFO: Loaded 10 experiment runs dataset
removed 5 proteins. Blacklist had 5
Finished


# Shapiro-Wilk

In [8]:
def summary_to_dataframe(summary, columns=['raw']):
    proteins = sorted(list(summary.keys()))
    methods = sorted(list(summary[proteins[0]].keys()))
    base_columns = ['protein', 'experiment']
    
    d = {}
    for column in columns + base_columns:
        d[column] = []
        
    for protein in proteins:
        for method in methods:
            for raw in summary[protein][method]['data']['raw']:
                d['raw'].append(raw)

                d['protein'].append(protein)
                d['experiment'].append(method)
                                
    return pd.DataFrame(data=d)

In [9]:
mode = 'best_by_rmsd'
metric = 'scorefxn'
protein = '1zdd'
method = 'sade_remc_final'

summary = data_utils.experiment_summary(alldata, mode=mode, metric=metric, with_raw=True)

# summary[protein][method]['data']['raw']
# summary

In [10]:
def shapiro_table(alpha=0.05, mode='best_by_rmsd', metric='scorefxn'):
    summary = data_utils.experiment_summary(alldata, mode=mode, metric=metric, with_raw=True)
    proteins = sorted(summary.keys())
    methods = sorted(summary[proteins[0]].keys())
    
    print('\\begin{table}')
    print('\\centering')
    column_header = '\\begin{tabular}{r%s} ' % ('|c' * len(methods))
    print(column_header)
    
    methods_header = ' & '.join([ '\\rotatebox[origin=c]{270}{%s}' % m for m in methods]).replace('_', '-')
    
    print('     & %s \\\\ \\hline \\hline' % (methods_header))
    
    for protein in proteins:         
        print('%s & ' % protein, end='')
        
        for index, method in enumerate(methods):
            data = summary[protein][method]['data']['raw']
            
            stats, p = scipy.stats.shapiro(data)
            
            if p < alpha:
                print('$\\bm{%4.4f}$' % p, end='')
            else:
                print('    $%4.4f$ ' % p, end='')
            
            if index < len(methods) -1:
                print(' & ', end='')
            else:
                print(' \\\\ \\hline')   
            
    print('\\end{tabular}')
    metric_dict = { 'scorefxn': 'scorefxn', 'rmsd_after': 'RMSD' }
    metric_string = metric_dict[metric]
    print('\\caption{Shapiro wilk for \\texttt{%s} using %s}' % (mode.replace('_', '-'), metric_string))
    print('\\label{tab:shapiro-wilk-%s-%s}' % (mode.replace('_', '-'), metric_string))
    print('\\end{table}')

In [11]:
modes = ['best_by_rmsd', 'best_by_energy']
metrics = ['scorefxn', 'rmsd_after']

for mode in modes:
    for metric in metrics:
        shapiro_table(alpha=0.05, mode=mode, metric=metric)
        print()

\begin{table}
\centering
\begin{tabular}{r|c|c|c|c} 
     & \rotatebox[origin=c]{270}{classic-abinitio} & \rotatebox[origin=c]{270}{rppf-mc} & \rotatebox[origin=c]{270}{rppf-remc} & \rotatebox[origin=c]{270}{sade-remc} \\ \hline \hline
1acw &     $0.9402$  & $\bm{0.0000}$ & $\bm{0.0003}$ &     $0.8959$  \\ \hline
1ail & $\bm{0.0282}$ & $\bm{0.0000}$ & $\bm{0.0000}$ &     $0.4708$  \\ \hline
1crn &     $0.1516$  & $\bm{0.0000}$ & $\bm{0.0000}$ &     $0.1411$  \\ \hline
1enh & $\bm{0.0041}$ & $\bm{0.0000}$ & $\bm{0.0000}$ & $\bm{0.0062}$ \\ \hline
1l2y &     $0.4865$  & $\bm{0.0000}$ &     $0.0957$  &     $0.1332$  \\ \hline
1rop &     $0.4577$  & $\bm{0.0150}$ & $\bm{0.0001}$ &     $0.8606$  \\ \hline
1utg &     $0.6548$  & $\bm{0.0003}$ & $\bm{0.0000}$ &     $0.0726$  \\ \hline
1wqc &     $0.5606$  & $\bm{0.0236}$ & $\bm{0.0003}$ &     $0.0707$  \\ \hline
1zdd & $\bm{0.0003}$ & $\bm{0.0000}$ & $\bm{0.0000}$ & $\bm{0.0254}$ \\ \hline
2mr9 &     $0.9846$  & $\bm{0.0026}$ & $\bm{0.0000}$ 

In [5]:
def shapiro_summary(alpha=0.05, mode='best_by_rmsd', metric='scorefxn'):
    summary = data_utils.experiment_summary(alldata, mode=mode, metric=metric, with_raw=True)
    proteins = sorted(summary.keys())
    methods = sorted(summary[proteins[0]].keys())
    
    total_accept_h0 = 0
    total_reject_h0 = 0
    
    for protein in proteins:         
        accept_h0 = 0
        reject_h0 = 0
        
        for index, method in enumerate(methods):
            data = summary[protein][method]['data']['raw']
            
            stats, p = scipy.stats.shapiro(data)
            
            if p < alpha:
                reject_h0 += 1
            else:
                accept_h0 += 1
                
        total_accept_h0 += accept_h0
        total_reject_h0 += reject_h0
                
        print('%s %3d %3d' % (protein, accept_h0, reject_h0))
        
    print('     %3d %3d' % (total_accept_h0, total_reject_h0))
        
        
# modes = ['best_by_rmsd', 'best_by_energy']
# metrics = ['scorefxn', 'rmsd_after']
# for mode in modes:
#     for metric in metrics:
#         print('%s %s' % (mode, metric))
#         shapiro_summary(alpha=0.05, mode=mode, metric=metric)
#         print()
        
mode = 'best_by_rmsd'
metric = 'rmsd_after'

print('%s %s' % (mode, metric))
shapiro_summary(alpha=0.05, mode=mode, metric=metric)
print()

mode = 'best_by_energy'
metric = 'scorefxn'

print('%s %s' % (mode, metric))
shapiro_summary(alpha=0.05, mode=mode, metric=metric)
print()

best_by_rmsd rmsd_after
1acw   2   0
1ail   0   2
1crn   2   0
1enh   2   0
1l2y   2   0
1rop   2   0
1utg   1   1
1wqc   2   0
1zdd   1   1
2mr9   2   0
      16   4

best_by_energy scorefxn
1acw   2   0
1ail   2   0
1crn   2   0
1enh   2   0
1l2y   2   0
1rop   2   0
1utg   1   1
1wqc   2   0
1zdd   1   1
2mr9   2   0
      18   2



# Kruskal-Wallis

In [6]:
def kruskal_wallis_table(alpha=0.05, mode='best_by_rmsd', metric='rmsd_after'):
    summary = data_utils.experiment_summary(alldata, mode=mode, metric=metric, with_raw=True)
    proteins = sorted(summary.keys())
    methods = sorted(summary[proteins[0]].keys())
    
    print('\\begin{table}')
    print('\\centering')
    print('\\begin{tabular}{r|c}')

    methods_header = 'Protein & $p$-value'
    
    print('%s \\\\ \\hline \\hline' % (methods_header))
    
    for protein in proteins:         
        print('%s & ' % protein, end='')
        
        data = [
            summary[protein][method]['data']['raw'] for method in methods
        ]
            
        stats, p = scipy.stats.kruskal(*data)
        
        if p < alpha:
            print('$\\bm{%4.4f}$' % p, end='')
        else:
            print('    $%4.4f$ ' % p, end='')
            
        print(' \\\\ \\hline')   
            
    print('\\end{tabular}')
    metric_dict = { 'scorefxn': 'scorefxn', 'rmsd_after': 'RMSD' }
    metric_string = metric_dict[metric]
    print('\\caption{Kruskal-Wallis for \\texttt{%s} using %s}' % (mode.replace('_', '-'), metric_string))
    print('\\label{tab:kruskal-wallis-wilk-%s-%s}' % (mode.replace('_', '-'), metric_string))
    print('\\end{table}')

In [7]:
kruskal_wallis_table(alpha=0.05, mode='best_by_rmsd', metric='rmsd_after')
print()
kruskal_wallis_table(alpha=0.05, mode='best_by_energy', metric='scorefxn')

\begin{table}
\centering
\begin{tabular}{r|c}
Protein & $p$-value \\ \hline \hline
1acw &     $0.5987$  \\ \hline
1ail &     $0.0632$  \\ \hline
1crn &     $0.1051$  \\ \hline
1enh &     $0.1317$  \\ \hline
1l2y &     $0.8362$  \\ \hline
1rop & $\bm{0.0000}$ \\ \hline
1utg &     $0.7479$  \\ \hline
1wqc &     $0.4777$  \\ \hline
1zdd &     $0.3172$  \\ \hline
2mr9 &     $0.4406$  \\ \hline
\end{tabular}
\caption{Kruskal-Wallis for \texttt{best-by-rmsd} using RMSD}
\label{tab:kruskal-wallis-wilk-best-by-rmsd-RMSD}
\end{table}

\begin{table}
\centering
\begin{tabular}{r|c}
Protein & $p$-value \\ \hline \hline
1acw & $\bm{0.0099}$ \\ \hline
1ail & $\bm{0.0044}$ \\ \hline
1crn &     $0.9567$  \\ \hline
1enh & $\bm{0.0034}$ \\ \hline
1l2y &     $0.4692$  \\ \hline
1rop & $\bm{0.0000}$ \\ \hline
1utg &     $0.3111$  \\ \hline
1wqc &     $0.6591$  \\ \hline
1zdd &     $0.9021$  \\ \hline
2mr9 & $\bm{0.0004}$ \\ \hline
\end{tabular}
\caption{Kruskal-Wallis for \texttt{best-by-energy} using sco

# Mann-Whitney

In [8]:
def mann_whitney_list(alpha=0.05, mode='best_by_rmsd', metric='rmsd_after'):
    summary = data_utils.experiment_summary(alldata, mode=mode, metric=metric, with_raw=True)
    proteins = sorted(summary.keys())
    
    methods = sorted(summary[proteins[0]].keys())
    
    for protein in proteins:
        for i in range(len(methods)):
            method_a = methods[i]
            for j in range(i + 1, len(methods)):
                method_b = methods[j]
                p, mean_a, mean_b = apply_mann_whitney(summary[protein], method_a, method_b, protein)
                
                if p > alpha:
                    continue
                    
                if mean_a < mean_b:
                    print('%5s %20s %20s %f' % (protein, method_a, method_b, p))
                else:
                    print('%5s %20s %20s %f' % (protein, method_b, method_a, p))
        print()
        

def apply_mann_whitney(data, method_a, method_b, protein):
    data_a = data[method_a]['data']['raw']
    data_b = data[method_b]['data']['raw']
    _, p = scipy.stats.mannwhitneyu(data_a, data_b)
    
    mean_a = data[method_a]['data']['mean']
    mean_b = data[method_b]['data']['mean']
    
    return p, mean_a, mean_b


mann_whitney_list(alpha=0.05, mode='best_by_rmsd', metric='rmsd_after')


 1ail             ppf-remc               ppf-mc 0.031675




 1rop               ppf-mc             ppf-remc 0.000000







In [9]:
def mann_whitney_master(alpha=0.05, mode='best_by_rmsd', metric='rmsd_after'):
    summary = data_utils.experiment_summary(alldata, mode=mode, metric=metric, with_raw=True)
    proteins = sorted(summary.keys())
    methods = sorted(summary[proteins[0]].keys())
    
    for protein in proteins:
        table_header(methods)
        mann_whitney_table(summary[protein], alpha=alpha)
        table_footer(mode, metric, protein)
        
        print()
                
        
def mann_whitney_table(data, alpha=0.5):
    methods = sorted(data.keys())
    
    for i in range(len(methods)):
        method_a = methods[i]
        print('%20s & ' % method_a.replace('_', '-'), end='')
        
        results = []
        
        for j in range(0, len(methods)):
            method_b = methods[j]
            p, mean_a, mean_b = apply_mann_whitney(data, method_a, method_b, protein)

            if i == j:
                results.append(' -           ')
                continue
              
            if p > alpha:  # Draws
                results.append('$%4.4f$     ' % p)
            elif mean_a < mean_b:  # Wins
                results.append('$\\bm{%4.4f}$' % p)
            else:  # Loses
                results.append('$%4.4f$     ' % p)
            
        print('%s' % ' & '.join(results), end='')
        print(' \\\\ \\hline')
    print('\\hline')
                
                
def table_header(methods):    
    print('\\begin{table}')
    print('\\centering')
    column_header = '\\begin{tabular}{r%s} ' % ('|c' * len(methods))
    print(column_header)
    
    methods_header = ' & '.join([ '\\rotatebox[origin=c]{270}{%s}' % m for m in methods]).replace('_', '-')
    
    print('     & %s \\\\ \\hline \\hline' % (methods_header))
    
    
                
def table_footer(mode, metric, protein):
    print('\\end{tabular}')
    metric_dict = { 'scorefxn': 'scorefxn', 'rmsd_after': 'RMSD' }
    metric_string = metric_dict[metric]
    print('\\caption{Mann-Whitney for %s \\texttt{%s} using %s}' % (protein, mode.replace('_', '-'), metric_string))
    print('\\label{tab:mann-whitney-%s-%s-%s}' % (protein, mode.replace('_', '-'), metric_string))
    print('\\end{table}')


mann_whitney_master(alpha=0.05, mode='best_by_energy', metric='scorefxn')

\begin{table}
\centering
\begin{tabular}{r|c|c} 
     & \rotatebox[origin=c]{270}{ppf-mc} & \rotatebox[origin=c]{270}{ppf-remc} \\ \hline \hline
              ppf-mc & 

NameError: name 'protein' is not defined

In [10]:
def mann_whitney_summary(alpha=0.05, mode='best_by_rmsd', metric='rmsd_after', ref=[], ignore=[], skip_all_ref=False):
    summary = data_utils.experiment_summary(alldata, mode=mode, metric=metric, with_raw=True)
    proteins = sorted(summary.keys())
    
    methods = sorted(summary[proteins[0]].keys())
    
    proposed_wins = 0
    proposed_loses = 0
    proposed_draws = 0
    
    print('\\begin{table}')
    print('\\centering')
    column_header = '\\begin{tabular}{r|r|r|r}'
    print(column_header)
        
    print('%s \\\\ \\hline \\hline' % ('Protein & Wins & Loses & Draws'))
    
    
    for protein in proteins:
        proposed_wins = 0
        proposed_loses = 0
        proposed_draws = 0
        for i in range(len(methods)):
            method_a = methods[i]
            if all(map(lambda x: method_a != x, ref)):
                continue
                
            for j in range(len(methods)):
                method_b = methods[j]
                
                if any(map(lambda x: method_a == x or method_b == x, ignore)):
                    continue
                    
                if skip_all_ref:
                    method_a_match = any(map(lambda x: method_a == x, ref))
                    method_b_match = any(map(lambda x: method_b == x, ref))
                    
                    if method_a_match and method_b_match:
                        continue
                                    
                p, mean_a, mean_b = apply_mann_whitney(summary[protein], method_a, method_b, protein)
                
                if p > alpha:
                    proposed_draws += 1
                    continue
                    
                if mean_a < mean_b:
                    proposed_wins += 1
#                     print('%5s %20s %20s %f' % (protein, method_a, method_b, p))
                else:
                    proposed_loses += 1
#                     print('%5s %20s %20s %f' % (protein, method_b, method_a, p))
        
#         print('%5s  wins: %2s  loses %2d  draws: %2d' % (protein, proposed_wins, proposed_loses, proposed_draws))
        print('%5s & %2s & %2d & %2d \\\\ \\hline' % (protein, proposed_wins, proposed_loses, proposed_draws))
    
    print('\\end{tabular}')
    metric_dict = { 'scorefxn': 'scorefxn', 'rmsd_after': 'RMSD' }
    metric_string = metric_dict[metric]
    print('\\caption{Summary of Mann-Whitney \\texttt{%s} using %s}' % (mode.replace('_', '-'), metric_string))
    print('\\label{tab:mann-whitney-summary-%s-%s}' % (mode.replace('_', '-'), metric_string))
    print('\\end{table}')

mann_whitney_summary(
    alpha=0.05,
    mode='best_by_rmsd',
    metric='rmsd_after',
    ref=['rppf-remc', 'rppf-mc'],
    ignore=['classic-abinitio'],
    skip_all_ref=True
)

print()


mann_whitney_summary(
    alpha=0.05,
    mode='best_by_energy',
    metric='scorefxn',
    ref=['rppf-remc', 'rppf-mc'],
    ignore=['classic-abinitio'],
    skip_all_ref=True
)

\begin{table}
\centering
\begin{tabular}{r|r|r|r}
Protein & Wins & Loses & Draws \\ \hline \hline
 1acw &  0 &  0 &  0 \\ \hline
 1ail &  0 &  0 &  0 \\ \hline
 1crn &  0 &  0 &  0 \\ \hline
 1enh &  0 &  0 &  0 \\ \hline
 1l2y &  0 &  0 &  0 \\ \hline
 1rop &  0 &  0 &  0 \\ \hline
 1utg &  0 &  0 &  0 \\ \hline
 1wqc &  0 &  0 &  0 \\ \hline
 1zdd &  0 &  0 &  0 \\ \hline
 2mr9 &  0 &  0 &  0 \\ \hline
\end{tabular}
\caption{Summary of Mann-Whitney \texttt{best-by-rmsd} using RMSD}
\label{tab:mann-whitney-summary-best-by-rmsd-RMSD}
\end{table}

\begin{table}
\centering
\begin{tabular}{r|r|r|r}
Protein & Wins & Loses & Draws \\ \hline \hline
 1acw &  0 &  0 &  0 \\ \hline
 1ail &  0 &  0 &  0 \\ \hline
 1crn &  0 &  0 &  0 \\ \hline
 1enh &  0 &  0 &  0 \\ \hline
 1l2y &  0 &  0 &  0 \\ \hline
 1rop &  0 &  0 &  0 \\ \hline
 1utg &  0 &  0 &  0 \\ \hline
 1wqc &  0 &  0 &  0 \\ \hline
 1zdd &  0 &  0 &  0 \\ \hline
 2mr9 &  0 &  0 &  0 \\ \hline
\end{tabular}
\caption{Summary of Mann-

In [12]:
mann_whitney_summary(
    alpha=0.05,
    mode='best_by_rmsd',
    metric='rmsd_after',
    ref=['ppf-remc', 'ppf-mc'],
    ignore=['sade-remc'],
    skip_all_ref=False
)

print()

mann_whitney_summary(
    alpha=0.05,
    mode='best_by_energy',
    metric='scorefxn',
    ref=['ppf-remc', 'ppf-mc'],
    ignore=['sade-remc'],
    skip_all_ref=False
)

\begin{table}
\centering
\begin{tabular}{r|r|r|r}
Protein & Wins & Loses & Draws \\ \hline \hline
 1acw &  0 &  0 &  4 \\ \hline
 1ail &  1 &  1 &  2 \\ \hline
 1crn &  0 &  0 &  4 \\ \hline
 1enh &  0 &  0 &  4 \\ \hline
 1l2y &  0 &  0 &  4 \\ \hline
 1rop &  1 &  1 &  2 \\ \hline
 1utg &  0 &  0 &  4 \\ \hline
 1wqc &  0 &  0 &  4 \\ \hline
 1zdd &  0 &  0 &  4 \\ \hline
 2mr9 &  0 &  0 &  4 \\ \hline
\end{tabular}
\caption{Summary of Mann-Whitney \texttt{best-by-rmsd} using RMSD}
\label{tab:mann-whitney-summary-best-by-rmsd-RMSD}
\end{table}

\begin{table}
\centering
\begin{tabular}{r|r|r|r}
Protein & Wins & Loses & Draws \\ \hline \hline
 1acw &  1 &  1 &  2 \\ \hline
 1ail &  1 &  1 &  2 \\ \hline
 1crn &  0 &  0 &  4 \\ \hline
 1enh &  1 &  1 &  2 \\ \hline
 1l2y &  0 &  0 &  4 \\ \hline
 1rop &  1 &  1 &  2 \\ \hline
 1utg &  0 &  0 &  4 \\ \hline
 1wqc &  0 &  0 &  4 \\ \hline
 1zdd &  0 &  0 &  4 \\ \hline
 2mr9 &  1 &  1 &  2 \\ \hline
\end{tabular}
\caption{Summary of Mann-