In [1]:
import os
import sys
# Make the parent of the current working directory importable.
sys.path.append(os.path.join(os.getcwd(), '..'))

import yaml

import numpy as np
import pandas as pd

from arsenic import plotting, stats

from PLBenchmarks import targets, ligands, edges

from tqdm.notebook import tqdm
import pint
# Single shared pint registry; used further down to convert DDG values to kcal/mol.
unit_registry = pint.UnitRegistry()

import benchmarkpl
# First entry of the benchmarkpl package path; result files are looked up below it.
path = benchmarkpl.__path__[0]
# Point PLBenchmarks' target lookup at that same directory.
targets.set_data_dir(path)



In [2]:
# (display name, result-file identifier) pairs.
# Keeping each pair on one line makes the positional pairing explicit: later
# cells zip `names` with `identifiers` to rename the DDG_/dDDG_ columns.
_labelled_identifiers = [
    ('Exp.', 'experiment_hahn'),
    ('OpenFF-1.0', 'pmx_openff-1.0.0.offxml_hahn'),
    ('OpenFF-1.0_converged', 'pmx_repeatfilter_openff-1.0.0.offxml_hahn'),
    ('OpenFF-1.2', 'pmx_openff-1.2.0.offxml_gapsys'),
    ('OpenFF-1.2_converged', 'pmx_repeatfilter_openff-1.2.0.offxml_gapsys'),
    ('all edges off-2.0-rc1', 'pmx_openff-2.0.0-rc.1.offxml_gapsys'),
    ('converged II off-2.0-rc1', 'pmx_repeatfilter_openff-2.0.0-rc.1.offxml_gapsys'),
    ('all edges off-2.0-part', 'pmx_openff-2.0.0.offxml_gapsys'),
    ('converged II off-2.0-part', 'pmx_repeatfilter_openff-2.0.0.offxml_gapsys'),
    ('GAFF2', 'pmx_gaff_gapsys'),
    ('cGenFF', 'pmx_cgenff_gapsys'),
    ('Consensus_OpenFF_GAFF2', 'pmx_ogaff_gapsys'),
    ('Consensus_OpenFF_GAFF2_cGenFF', 'pmx_cgenogaff_gapsys'),
    ('opls3e-gap', 'fep_opls3e_5_gapsys'),
    ('opls3e-per', 'fep+_opls3e_perez'),
    ('opls3e-sch', 'fep+_opls3e_schindler'),
]

# Both must stay plain lists: combine_sets() appends to them.
names = [label for label, _ in _labelled_identifiers]
identifiers = [identifier for _, identifier in _labelled_identifiers]

In [3]:
# Load every available result file into data[target][identifier].
# Missing files are reported and skipped, so data[target] only holds the
# identifiers that actually exist on disk for that target.
data = {}
for target in tqdm(targets.target_dict.keys()):
    target_results = {}
    data[target] = target_results
    results_dir = os.path.join(path, targets.get_target_dir(target), '10_results')
    for idx in identifiers:
        file_name = os.path.join(results_dir, f'{target}_{idx}.yaml')
        if not os.path.exists(file_name):
            print(f"File {file_name} for target {target} not available")
            continue
        with open(file_name, 'r') as file:
            target_results[idx] = yaml.safe_load(file)

  0%|          | 0/22 [00:00<?, ?it/s]

File /projects/CNS/OGA/FEP_compare/openforcefield/03_benchmark_analysis/benchmarkpl/benchmarkpl/2019-09-23_jnk1/10_results/jnk1_fep+_opls3e_perez.yaml for target jnk1 not available
File /projects/CNS/OGA/FEP_compare/openforcefield/03_benchmark_analysis/benchmarkpl/benchmarkpl/2019-09-23_jnk1/10_results/jnk1_fep+_opls3e_schindler.yaml for target jnk1 not available
File /projects/CNS/OGA/FEP_compare/openforcefield/03_benchmark_analysis/benchmarkpl/benchmarkpl/2019-09-23_pde2/10_results/pde2_pmx_openff-1.2.0.offxml_gapsys.yaml for target pde2 not available
File /projects/CNS/OGA/FEP_compare/openforcefield/03_benchmark_analysis/benchmarkpl/benchmarkpl/2019-09-23_pde2/10_results/pde2_fep+_opls3e_perez.yaml for target pde2 not available
File /projects/CNS/OGA/FEP_compare/openforcefield/03_benchmark_analysis/benchmarkpl/benchmarkpl/2019-09-23_pde2/10_results/pde2_fep+_opls3e_schindler.yaml for target pde2 not available
File /projects/CNS/OGA/FEP_compare/openforcefield/03_benchmark_analysis/be

File /projects/CNS/OGA/FEP_compare/openforcefield/03_benchmark_analysis/benchmarkpl/benchmarkpl/2020-07-10_pde10/10_results/pde10_fep_opls3e_5_gapsys.yaml for target pde10 not available
File /projects/CNS/OGA/FEP_compare/openforcefield/03_benchmark_analysis/benchmarkpl/benchmarkpl/2020-07-10_pde10/10_results/pde10_fep+_opls3e_schindler.yaml for target pde10 not available
File /projects/CNS/OGA/FEP_compare/openforcefield/03_benchmark_analysis/benchmarkpl/benchmarkpl/2020-07-30_shp2/10_results/shp2_fep_opls3e_5_gapsys.yaml for target shp2 not available
File /projects/CNS/OGA/FEP_compare/openforcefield/03_benchmark_analysis/benchmarkpl/benchmarkpl/2020-07-30_shp2/10_results/shp2_fep+_opls3e_perez.yaml for target shp2 not available
File /projects/CNS/OGA/FEP_compare/openforcefield/03_benchmark_analysis/benchmarkpl/benchmarkpl/2020-08-11_syk/10_results/syk_fep_opls3e_5_gapsys.yaml for target syk not available
File /projects/CNS/OGA/FEP_compare/openforcefield/03_benchmark_analysis/benchmarkp

In [4]:
def combine_sets(sets, idx_new, name_new):
    """Merge several result sets into one new set for every target.

    For each target, the entries of every identifier in ``sets`` that is
    present in ``data[target]`` are copied into ``data[target][idx_new]``.
    Entries whose ``'DDG'`` is NaN are skipped. Sets are processed in the
    order given, so for an edge key present in more than one set the value
    from the later set wins.

    On success ``idx_new`` is appended to the module-level ``identifiers``
    list and ``name_new`` to the module-level ``names`` list.

    Parameters
    ----------
    sets : iterable of str
        Identifiers of the existing result sets to merge, in priority order
        (last one wins).
    idx_new : str
        Identifier under which the merged set is stored.
    name_new : str
        Display name paired with ``idx_new``.

    Raises
    ------
    Exception
        If ``idx_new`` is already in ``identifiers`` or ``name_new`` is
        already in ``names``. Both checks run before anything is modified,
        so a rejected call leaves ``data``, ``identifiers`` and ``names``
        untouched.
    """
    # Validate first: the merge below overwrites data[target][idx_new], and
    # the two lists must never get out of step with each other.
    if idx_new in identifiers:
        raise Exception("idx_new already in identifiers")
    if name_new in names:
        raise Exception("name_new already in names")

    for target in targets.target_dict.keys():
        merged = {}
        for idx in sets:
            # A set may be missing for this target, or may have been loaded
            # as None; treat both as "no entries".
            entries = data[target].get(idx) or {}
            for key, item in entries.items():
                if 'nan' in key:
                    # Diagnostic only: report edge keys that contain 'nan'.
                    print(idx, target, key, item)
                if not np.isnan(item['DDG']):
                    merged[key] = item
        data[target][idx_new] = merged

    identifiers.append(idx_new)
    names.append(name_new)
    print(identifiers, names)
    assert len(identifiers) == len(names)

# Build the pooled data sets used in the final table.
# Order matters inside each list: combine_sets() processes the sets in
# sequence, so for an edge present in several sets the last one listed wins.

# All three OPLS3e result sources -> one 'OPLS3e' set.
opls3e_sources = [
    'fep_opls3e_5_gapsys',
    'fep+_opls3e_perez',
    'fep+_opls3e_schindler',
]
combine_sets(opls3e_sources, 'fep+_opls3e', 'OPLS3e')

# OpenFF 2.0.0 and 2.0.0-rc.1, all edges.
# NOTE(review): rc.1 is listed last and therefore overrides 2.0.0 where both
# have an edge -- confirm this precedence is intended.
openff2_sources = [
    'pmx_openff-2.0.0.offxml_gapsys',
    'pmx_openff-2.0.0-rc.1.offxml_gapsys',
]
combine_sets(openff2_sources, 'openff-2.0', 'OpenFF-2.0')

# Same pairing for the repeat-filtered ("converged") results.
openff2_converged_sources = [
    'pmx_repeatfilter_openff-2.0.0.offxml_gapsys',
    'pmx_repeatfilter_openff-2.0.0-rc.1.offxml_gapsys',
]
combine_sets(openff2_converged_sources, 'openff-2.0_converged', 'OpenFF-2.0_converged')

['experiment_hahn', 'pmx_openff-1.0.0.offxml_hahn', 'pmx_repeatfilter_openff-1.0.0.offxml_hahn', 'pmx_openff-1.2.0.offxml_gapsys', 'pmx_repeatfilter_openff-1.2.0.offxml_gapsys', 'pmx_openff-2.0.0-rc.1.offxml_gapsys', 'pmx_repeatfilter_openff-2.0.0-rc.1.offxml_gapsys', 'pmx_openff-2.0.0.offxml_gapsys', 'pmx_repeatfilter_openff-2.0.0.offxml_gapsys', 'pmx_gaff_gapsys', 'pmx_cgenff_gapsys', 'pmx_ogaff_gapsys', 'pmx_cgenogaff_gapsys', 'fep_opls3e_5_gapsys', 'fep+_opls3e_perez', 'fep+_opls3e_schindler', 'fep+_opls3e'] ['Exp.', 'OpenFF-1.0', 'OpenFF-1.0_converged', 'OpenFF-1.2', 'OpenFF-1.2_converged', 'all edges off-2.0-rc1', 'converged II off-2.0-rc1', 'all edges off-2.0-part', 'converged II off-2.0-part', 'GAFF2', 'cGenFF', 'Consensus_OpenFF_GAFF2', 'Consensus_OpenFF_GAFF2_cGenFF', 'opls3e-gap', 'opls3e-per', 'opls3e-sch', 'OPLS3e']
['experiment_hahn', 'pmx_openff-1.0.0.offxml_hahn', 'pmx_repeatfilter_openff-1.0.0.offxml_hahn', 'pmx_openff-1.2.0.offxml_gapsys', 'pmx_repeatfilter_openff-1.2

In [5]:
def _to_kcal_per_mol(value, unit):
    """Return ``value`` (given in ``unit``) as a plain magnitude in kcal/mol."""
    return unit_registry.Quantity(value, unit).to('kilocalories/mole').magnitude


# Build one wide table: one row per edge, one DDG_/dDDG_ column pair per
# result set. Per-target frames are collected in a list and concatenated once
# at the end; DataFrame.append was removed in pandas 2.0 and re-copies the
# whole table on every call.
target_frames = []
for target, tdata in tqdm(data.items()):
    dfs = []
    for software, sdata in tdata.items():
        # Edge keys become the row index, the per-edge fields the columns.
        df = pd.DataFrame(sdata).T
        if df.empty:
            continue
        df['target'] = target
        df['edge'] = [f'edge_{lig_a}_{lig_b}'
                      for lig_a, lig_b in zip(df['ligandA'], df['ligandB'])]
        # Convert row by row because each row carries its own unit.
        # NOTE(review): the 'unit' column keeps the original unit string even
        # though the DDG_/dDDG_ columns are always kcal/mol after this step.
        df[f'DDG_{software}'] = [_to_kcal_per_mol(value, unit)
                                 for value, unit in zip(df['DDG'], df['unit'])]
        df[f'dDDG_{software}'] = [_to_kcal_per_mol(value, unit)
                                  for value, unit in zip(df['dDDG'], df['unit'])]
        df = df.drop(labels=['DDG', 'dDDG'], axis=1)
        dfs.append(df)
    if len(dfs) > 0:
        # Align all result sets of this target on the edge index.
        df = pd.concat(dfs, axis=1)
        # Every set contributes ligandA/ligandB/unit/target/edge; keep the
        # first occurrence of each repeated column.
        df = df.loc[:,~df.columns.duplicated()]
        target_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame.
all_edges = pd.concat(target_frames) if target_frames else pd.DataFrame()
all_edges.head()

  0%|          | 0/22 [00:00<?, ?it/s]

Unnamed: 0,ligandA,ligandB,unit,target,edge,DDG_experiment_hahn,dDDG_experiment_hahn,DDG_pmx_openff-1.0.0.offxml_hahn,dDDG_pmx_openff-1.0.0.offxml_hahn,DDG_pmx_repeatfilter_openff-1.0.0.offxml_hahn,...,DDG_fep+_opls3e,dDDG_fep+_opls3e,DDG_openff-2.0,dDDG_openff-2.0,DDG_openff-2.0_converged,dDDG_openff-2.0_converged,DDG_fep+_opls3e_schindler,dDDG_fep+_opls3e_schindler,DDG_fep+_opls3e_perez,dDDG_fep+_opls3e_perez
jnk1_edge_17124-1_18631-1,17124-1,18631-1,kilocalories / mole,jnk1,edge_17124-1_18631-1,0.26,0.37,1.19,0.096086,1.19,...,1.517686,0.069312,2.303333,0.171359,2.303333,0.174821,,,,
jnk1_edge_17124-1_18634-1,17124-1,18634-1,kilocalories / mole,jnk1,edge_17124-1_18634-1,-0.33,0.29,0.58,0.128639,0.58,...,0.583174,0.043021,0.54,0.112473,0.54,0.10935,,,,
jnk1_edge_18626-1_18624-1,18626-1,18624-1,kilocalories / mole,jnk1,edge_18626-1_18624-1,0.38,0.21,0.556667,0.099301,0.556667,...,1.073136,0.040631,0.993333,0.093137,0.993333,0.094997,,,,
jnk1_edge_18626-1_18625-1,18626-1,18625-1,kilocalories / mole,jnk1,edge_18626-1_18625-1,0.77,0.21,-0.03,0.107462,-0.03,...,1.445985,0.033461,1.026667,0.179128,1.026667,0.178439,,,,
jnk1_edge_18626-1_18627-1,18626-1,18627-1,kilocalories / mole,jnk1,edge_18626-1_18627-1,0.39,0.22,0.14,0.046151,0.14,...,0.39675,0.081262,0.076667,0.1725,0.076667,0.169267,,,,


In [6]:
# Sanity check: (number of edges, number of columns) of the combined table.
all_edges.shape

(1128, 43)

In [7]:
# List the column labels; the next cell selects and reorders a subset of them.
all_edges.columns

Index(['ligandA', 'ligandB', 'unit', 'target', 'edge', 'DDG_experiment_hahn',
       'dDDG_experiment_hahn', 'DDG_pmx_openff-1.0.0.offxml_hahn',
       'dDDG_pmx_openff-1.0.0.offxml_hahn',
       'DDG_pmx_repeatfilter_openff-1.0.0.offxml_hahn',
       'dDDG_pmx_repeatfilter_openff-1.0.0.offxml_hahn',
       'DDG_pmx_openff-1.2.0.offxml_gapsys',
       'dDDG_pmx_openff-1.2.0.offxml_gapsys',
       'DDG_pmx_repeatfilter_openff-1.2.0.offxml_gapsys',
       'dDDG_pmx_repeatfilter_openff-1.2.0.offxml_gapsys',
       'DDG_pmx_openff-2.0.0-rc.1.offxml_gapsys',
       'dDDG_pmx_openff-2.0.0-rc.1.offxml_gapsys',
       'DDG_pmx_repeatfilter_openff-2.0.0-rc.1.offxml_gapsys',
       'dDDG_pmx_repeatfilter_openff-2.0.0-rc.1.offxml_gapsys',
       'DDG_pmx_openff-2.0.0.offxml_gapsys',
       'dDDG_pmx_openff-2.0.0.offxml_gapsys',
       'DDG_pmx_repeatfilter_openff-2.0.0.offxml_gapsys',
       'dDDG_pmx_repeatfilter_openff-2.0.0.offxml_gapsys',
       'DDG_pmx_gaff_gapsys', 'dDDG_pmx_gaff_gapsys', 

In [8]:
# Remove irrelevant columns and reorder.
# Only the columns listed here are kept; every other DDG_/dDDG_ column is
# dropped. The explicit .copy() makes the result an independent frame: later
# cells rename its columns in place and add columns via .loc, which on a bare
# column selection can raise SettingWithCopyWarning.
all_edges = all_edges[['target', 'edge', 'ligandA', 'ligandB', 'unit', 
                       'DDG_experiment_hahn',
                       'dDDG_experiment_hahn', 
                       'DDG_pmx_openff-1.0.0.offxml_hahn',
                       'dDDG_pmx_openff-1.0.0.offxml_hahn',
                       'DDG_pmx_repeatfilter_openff-1.0.0.offxml_hahn',
                       'dDDG_pmx_repeatfilter_openff-1.0.0.offxml_hahn',
                       'DDG_pmx_openff-1.2.0.offxml_gapsys',
                       'dDDG_pmx_openff-1.2.0.offxml_gapsys',
                       'DDG_pmx_repeatfilter_openff-1.2.0.offxml_gapsys',
                       'dDDG_pmx_repeatfilter_openff-1.2.0.offxml_gapsys',
                       'DDG_openff-2.0', 
                       'dDDG_openff-2.0', 
                       'DDG_openff-2.0_converged',
                       'dDDG_openff-2.0_converged',
                       'DDG_fep+_opls3e', 
                       'dDDG_fep+_opls3e',
                       'DDG_pmx_gaff_gapsys', 
                       'dDDG_pmx_gaff_gapsys', 
                       'DDG_pmx_cgenff_gapsys',
                       'dDDG_pmx_cgenff_gapsys',
                       'DDG_pmx_cgenogaff_gapsys',
                       'dDDG_pmx_cgenogaff_gapsys',
                       'DDG_pmx_ogaff_gapsys',
                       'dDDG_pmx_ogaff_gapsys'
                      ]].copy()

In [9]:
# Swap the file identifiers in the column labels for their display names.
# `names` and `identifiers` are paired by position; identifiers whose columns
# are no longer in the table are simply ignored by rename().
column_renames = {}
for display_name, identifier in zip(names, identifiers):
    for prefix in ('DDG_', 'dDDG_'):
        column_renames[f'{prefix}{identifier}'] = f'{prefix}{display_name}'
all_edges.rename(columns=column_renames, inplace=True)

In [10]:
# Consensus over four force fields: the DDG is the plain mean of the four
# estimates, the uncertainty is sqrt(sum of squared dDDG) / N.
# Computed column-wise rather than row by row with iterrows()/.loc, adding
# the methods in the listed order. An edge missing any of the four values
# ends up NaN in both consensus columns.
consensus_idx = ['OPLS3e',
                 'OpenFF-2.0',
                 'GAFF2',
                 'cGenFF']
n_methods = float(len(consensus_idx))

ddg_sum = 0.0
squared_error_sum = 0.0
for idx in consensus_idx:
    # astype(float) guards against object-dtype columns, which np.sqrt
    # cannot handle.
    ddg_sum = ddg_sum + all_edges[f'DDG_{idx}'].astype(float)
    squared_error_sum = squared_error_sum + all_edges[f'dDDG_{idx}'].astype(float)**2

all_edges['DDG_Consensus_all'] = ddg_sum/n_methods
all_edges['dDDG_Consensus_all'] = np.sqrt(squared_error_sum)/n_methods

In [11]:
# Rebuild `identifiers` from the table itself: take every column whose label
# starts with "DDG" and strip the first four characters (the "DDG_" prefix).
# "dDDG_..." columns start with a lower-case "d" and are therefore skipped.
# From here on `identifiers` holds display names, not file identifiers.
identifiers = []
for column in all_edges.columns:
    if column.startswith("DDG"):
        identifiers.append(column[4:])
identifiers

['Exp.',
 'OpenFF-1.0',
 'OpenFF-1.0_converged',
 'OpenFF-1.2',
 'OpenFF-1.2_converged',
 'OpenFF-2.0',
 'OpenFF-2.0_converged',
 'OPLS3e',
 'GAFF2',
 'cGenFF',
 'Consensus_OpenFF_GAFF2_cGenFF',
 'Consensus_OpenFF_GAFF2',
 'Consensus_all']

In [12]:
# For every calculated data set add the signed and the absolute deviation
# from experiment, and print the mean absolute deviation over all edges.
for idx in identifiers:
    if idx == 'Exp.':
        # Experiment is the reference, not a method to score.
        continue
    signed_col = f'error_{idx}'
    absolute_col = f'abserror_{idx}'
    all_edges[signed_col] = all_edges[f'DDG_{idx}'] - all_edges['DDG_Exp.']
    all_edges[absolute_col] = all_edges[signed_col].abs()
    print(idx, all_edges[absolute_col].mean())

OpenFF-1.0 1.2591962174940898
OpenFF-1.0_converged 1.0877342549923195
OpenFF-1.2 1.417208106473079
OpenFF-1.2_converged 1.2412426532325778
OpenFF-2.0 1.21385195035461
OpenFF-2.0_converged 1.0171215686274508
OPLS3e 1.0053877700233693
GAFF2 1.1778856354727454
cGenFF 1.3511330538717798
Consensus_OpenFF_GAFF2_cGenFF 1.0764493974569338
Consensus_OpenFF_GAFF2 1.193407425111536
Consensus_all 0.9195052582755381


In [13]:
# Write the full table (index included) to a CSV file in the current working directory.
all_edges.to_csv('03a_all_edges_all_ffs.csv')