# Gather data from various source into the analysis repository

In [1]:
# imports
import os
import shutil
import numpy as np
import pandas as pd
import yaml
import git
import tempfile
import sys
sys.path.append(os.path.join(os.getcwd(), '..'))

import pint
unit_registry = pint.UnitRegistry()

from PLBenchmarks import targets, ligands, edges
from IPython.core.display import HTML

from tqdm.notebook import tqdm

import benchmarkpl
path = benchmarkpl.__path__[0]



AttributeError: 'super' object has no attribute '_ipython_display_'

_ColormakerRegistry()

# Set path of data directory

In [2]:
targets.set_data_dir(os.path.join(path))
# directory name where results for each target are stored
results_dir = '10_results'

# Number of targets, ligands and edges in the data set

In [3]:
nligs, nedgs = 0, 0
print(f'{"Target":10s} {"Num Ligs":>10s} {"Num Edges":>10s}')
print(33 * '-')
for target in tqdm(targets.target_dict):
    print(f'{target:10s} {len(ligands.LigandSet(target)):10d} {len(edges.EdgeSet(target)):10d}')
    nligs += len(ligands.LigandSet(target))
    nedgs += len(edges.EdgeSet(target))
print(33 * '-')
print(f'{"total":10s} {nligs:10d} {nedgs:10d}')

Target       Num Ligs  Num Edges
---------------------------------


  0%|          | 0/22 [00:00<?, ?it/s]

  result[:] = values


jnk1               21         31
pde2               21         34
thrombin           11         16
p38                34         56
ptp1b              23         49
galectin            8          7
cdk2               16         25
cmet               24         57
mcl1               42         71
bace               36         58
bace_hunt          32         60
bace_p2            12         26
tyk2               16         24
ros1               28         61
eg5                28         65
cdk8               33         54
hif2a              42         92
pfkfb3             40         66
pde10              35         59
shp2               26         56
syk                44        101
tnks2              27         60
---------------------------------
total             599       1128


# Experimental values stored in repository
Retrieve the experimental values stored in the dataset

In [12]:
ligs = ligands.LigandSet('pde10')
yaml.load?
#('../benchmarkpl/2020-07-10_pde10/00_data/ligands.yml')

In [4]:
# function to retrieve exp. data from PLBenchmarks 
def getExpResults(target):
    edg = edges.EdgeSet(target)
    df = edg.get_dataframe(columns=[0,1, 'exp. DeltaG [kcal/mol]', 'exp. Error [kcal/mol]'])
    df.index = pd.Series([f'{target}_edge_' + str(lig1) + '_' + str(lig2) for lig1, lig2 in zip(df[0].values, df[1].values)])
   
    # remove unit of exp. values
    df['exp_DDG'] = df['exp. DeltaG [kcal/mol]'].apply(lambda x: x.magnitude)
    df['exp_dDDG'] = df['exp. Error [kcal/mol]'].apply(lambda x: x.magnitude)

    # filter only to relevant columns
    df = df.filter(items = [0,1,'exp_DDG', 'exp_dDDG'])
    df[0]= df[0].astype(str)
    df[1]= df[1].astype(str)
    return df
getExpResults('tnks2')

Unnamed: 0,0,1,exp_DDG,exp_dDDG
tnks2_edge_3a_7,3a,7,2.61,0.0
tnks2_edge_3a_5m,3a,5m,-1.71,0.0
tnks2_edge_3a_3b,3a,3b,-0.53,0.0
tnks2_edge_5f_5a,5f,5a,-0.5,0.0
tnks2_edge_5f_5i,5f,5i,-1.82,0.0
tnks2_edge_5f_5l,5f,5l,0.17,0.0
tnks2_edge_8e_5k,8e,5k,-0.72,0.0
tnks2_edge_8e_8a,8e,8a,-0.56,0.0
tnks2_edge_8e_8b,8e,8b,-1.69,0.0
tnks2_edge_8e_8c,8e,8c,0.12,0.0


In [5]:
author = "hahn"
software = "experiment"

In [6]:
for target in targets.target_dict:
    df = getExpResults(target)
    if df is None:
        continue
    os.makedirs(os.path.join(path, targets.get_target_dir(target)), exist_ok=True)
    os.makedirs(os.path.join(path, targets.get_target_dir(target), results_dir), exist_ok=True)
    with open(os.path.join(path, targets.get_target_dir(target), results_dir, f'{target}_{software}_{author}.yaml'), 'w') as file:
        df = df.filter([0,1,'exp_DDG', 'exp_dDDG'])
        df.rename(columns={0: 'ligandA', 1: 'ligandB', 'exp_DDG': 'DDG', 'exp_dDDG': 'dDDG'}, inplace=True)
        # set a default error for unknown experimental error of 0.64 kcal/mol
        dddg = np.sqrt(2)*0.28
        print(df['dDDG'])
        df['dDDG'] = df['dDDG'].apply(lambda x: dddg if x < 1e-6 else x)
        print(df['dDDG'])
        df['unit']='kilocalories / mole'
        yaml.dump(df.T.to_dict(), file)

  result[:] = values


jnk1_edge_17124-1_18634-1    0.29
jnk1_edge_18626-1_18624-1    0.21
jnk1_edge_18636-1_18625-1    0.36
jnk1_edge_18632-1_18624-1    0.31
jnk1_edge_18635-1_18625-1    0.18
jnk1_edge_18626-1_18658-1    0.17
jnk1_edge_18639-1_18658-1    0.15
jnk1_edge_18626-1_18625-1    0.21
jnk1_edge_18638-1_18658-1    0.12
jnk1_edge_18628-1_18624-1    0.23
jnk1_edge_18631-1_18660-1    0.24
jnk1_edge_18638-1_18634-1    0.09
jnk1_edge_18626-1_18632-1    0.30
jnk1_edge_18626-1_18630-1    0.14
jnk1_edge_18631-1_18624-1    0.27
jnk1_edge_18629-1_18627-1    0.19
jnk1_edge_18634-1_18637-1    0.27
jnk1_edge_18626-1_18627-1    0.22
jnk1_edge_18631-1_18652-1    0.25
jnk1_edge_18637-1_18631-1    0.35
jnk1_edge_18626-1_18634-1    0.15
jnk1_edge_18633-1_18624-1    0.25
jnk1_edge_17124-1_18631-1    0.37
jnk1_edge_18627-1_18630-1    0.17
jnk1_edge_18659-1_18634-1    0.08
jnk1_edge_18636-1_18624-1    0.35
jnk1_edge_18626-1_18628-1    0.23
jnk1_edge_18626-1_18660-1    0.16
jnk1_edge_18626-1_18659-1    0.16
jnk1_edge_1863

ptp1b_edge_23466_23475              0.0
ptp1b_edge_23467_23466              0.0
ptp1b_edge_23467_23468              0.0
ptp1b_edge_23467_23469              0.0
ptp1b_edge_23467_23470              0.0
ptp1b_edge_23467_23473              0.0
ptp1b_edge_23467_23474              0.0
ptp1b_edge_23467_23475              0.0
ptp1b_edge_23467_23476              0.0
ptp1b_edge_23469_23472              0.0
ptp1b_edge_23469_20669_2qbr         0.0
ptp1b_edge_23471_23466              0.0
ptp1b_edge_23471_23468              0.0
ptp1b_edge_23471_23470              0.0
ptp1b_edge_23473_20669_2qbr         0.0
ptp1b_edge_23474_23466              0.0
ptp1b_edge_23476_23466              0.0
ptp1b_edge_23477_23466              0.0
ptp1b_edge_23477_23467              0.0
ptp1b_edge_23477_23479              0.0
ptp1b_edge_23477_23482              0.0
ptp1b_edge_23477_23483              0.0
ptp1b_edge_23477_23330_2qbq         0.0
ptp1b_edge_23480_23479              0.0
ptp1b_edge_23480_23482              0.0


mcl1_edge_50_60    0.15
mcl1_edge_56_35    0.22
mcl1_edge_65_60    0.19
mcl1_edge_26_57    0.28
mcl1_edge_58_60    0.04
                   ... 
mcl1_edge_43_47    0.17
mcl1_edge_67_37    0.30
mcl1_edge_42_64    0.23
mcl1_edge_51_45    0.28
mcl1_edge_68_23    0.26
Name: dDDG, Length: 71, dtype: float64
mcl1_edge_50_60    0.15
mcl1_edge_56_35    0.22
mcl1_edge_65_60    0.19
mcl1_edge_26_57    0.28
mcl1_edge_58_60    0.04
                   ... 
mcl1_edge_43_47    0.17
mcl1_edge_67_37    0.30
mcl1_edge_42_64    0.23
mcl1_edge_51_45    0.28
mcl1_edge_68_23    0.26
Name: dDDG, Length: 71, dtype: float64
bace_edge_CAT-13b_CAT-17g    0.0
bace_edge_CAT-13a_CAT-17g    0.0
bace_edge_CAT-13e_CAT-17g    0.0
bace_edge_CAT-4m_CAT-4c      0.0
bace_edge_CAT-13k_CAT-4d     0.0
bace_edge_CAT-24_CAT-17e     0.0
bace_edge_CAT-13g_CAT-17g    0.0
bace_edge_CAT-13d_CAT-13h    0.0
bace_edge_CAT-13a_CAT-17i    0.0
bace_edge_CAT-4m_CAT-13j     0.0
bace_edge_CAT-13a_CAT-13m    0.0
bace_edge_CAT-4l_CAT-13k     0.

bace_p2_edge_32_L_31_L    0.60
bace_p2_edge_32_L_29_L    0.55
bace_p2_edge_32_L_35_L    0.49
bace_p2_edge_32_L_30_L    0.49
bace_p2_edge_28_L_35_L    0.18
bace_p2_edge_39_L_35_L    0.11
bace_p2_edge_35_L_29_L    0.28
bace_p2_edge_39_L_34_L    0.10
bace_p2_edge_39_L_37_L    0.17
bace_p2_edge_39_L_28_L    0.16
bace_p2_edge_39_L_36_L    0.13
bace_p2_edge_31_L_30_L    0.38
bace_p2_edge_31_L_29_L    0.44
bace_p2_edge_34_L_29_L    0.28
bace_p2_edge_34_L_30_L    0.15
bace_p2_edge_34_L_36_L    0.15
bace_p2_edge_37_L_36_L    0.21
bace_p2_edge_28_L_36_L    0.19
bace_p2_edge_38_L_36_L    0.19
bace_p2_edge_33_L_30_L    0.42
bace_p2_edge_30_L_29_L    0.29
bace_p2_edge_33_L_29_L    0.48
bace_p2_edge_28_L_38_L    0.21
bace_p2_edge_28_L_29_L    0.30
bace_p2_edge_28_L_37_L    0.23
bace_p2_edge_38_L_37_L    0.23
Name: dDDG, dtype: float64
bace_p2_edge_32_L_31_L    0.60
bace_p2_edge_32_L_29_L    0.55
bace_p2_edge_32_L_35_L    0.49
bace_p2_edge_32_L_30_L    0.49
bace_p2_edge_28_L_35_L    0.18
bace_p2_edge

pfkfb3_edge_43_48    0.0
pfkfb3_edge_43_44    0.0
pfkfb3_edge_30_56    0.0
pfkfb3_edge_30_70    0.0
pfkfb3_edge_65_34    0.0
                    ... 
pfkfb3_edge_68_53    0.0
pfkfb3_edge_33_23    0.0
pfkfb3_edge_33_36    0.0
pfkfb3_edge_33_70    0.0
pfkfb3_edge_39_37    0.0
Name: dDDG, Length: 66, dtype: float64
pfkfb3_edge_43_48    0.39598
pfkfb3_edge_43_44    0.39598
pfkfb3_edge_30_56    0.39598
pfkfb3_edge_30_70    0.39598
pfkfb3_edge_65_34    0.39598
                      ...   
pfkfb3_edge_68_53    0.39598
pfkfb3_edge_33_23    0.39598
pfkfb3_edge_33_36    0.39598
pfkfb3_edge_33_70    0.39598
pfkfb3_edge_39_37    0.39598
Name: dDDG, Length: 66, dtype: float64
pde10_edge_7395_3484    0.0
pde10_edge_7395_0738    0.0
pde10_edge_8041_4415    0.0
pde10_edge_4189_4415    0.0
pde10_edge_1939_3032    0.0
pde10_edge_5544_3806    0.0
pde10_edge_4754_4147    0.0
pde10_edge_3806_6221    0.0
pde10_edge_3806_0340    0.0
pde10_edge_3806_1423    0.0
pde10_edge_3806_5973    0.0
pde10_edge_4096_4147

tnks2_edge_3a_7     0.0
tnks2_edge_3a_5m    0.0
tnks2_edge_3a_3b    0.0
tnks2_edge_5f_5a    0.0
tnks2_edge_5f_5i    0.0
tnks2_edge_5f_5l    0.0
tnks2_edge_8e_5k    0.0
tnks2_edge_8e_8a    0.0
tnks2_edge_8e_8b    0.0
tnks2_edge_8e_8c    0.0
tnks2_edge_5l_5e    0.0
tnks2_edge_5l_5j    0.0
tnks2_edge_5l_5p    0.0
tnks2_edge_8b_8a    0.0
tnks2_edge_8b_8f    0.0
tnks2_edge_8b_8d    0.0
tnks2_edge_5g_5a    0.0
tnks2_edge_5g_5b    0.0
tnks2_edge_5g_5d    0.0
tnks2_edge_5g_5h    0.0
tnks2_edge_1b_1a    0.0
tnks2_edge_1b_5h    0.0
tnks2_edge_1b_5d    0.0
tnks2_edge_1b_5n    0.0
tnks2_edge_1b_5c    0.0
tnks2_edge_5e_5a    0.0
tnks2_edge_5e_5i    0.0
tnks2_edge_5e_5h    0.0
tnks2_edge_5e_5k    0.0
tnks2_edge_5e_5p    0.0
tnks2_edge_5e_5n    0.0
tnks2_edge_5i_5a    0.0
tnks2_edge_5i_5o    0.0
tnks2_edge_5i_5m    0.0
tnks2_edge_8d_1a    0.0
tnks2_edge_8d_8f    0.0
tnks2_edge_8d_8c    0.0
tnks2_edge_5b_5o    0.0
tnks2_edge_5b_5n    0.0
tnks2_edge_5b_5c    0.0
tnks2_edge_5b_5j    0.0
tnks2_edge_5h_5m

# pmx calculations with openFF parameters (Hahn et al.)

In [12]:
# function to retrieve data from PLBenchmarks calculations, soted in 00_data/input 
def getRawResults(target, forcefield='openff-1.0.0.offxml'):
    file_path = os.path.join(path, 
                                   '..', 
                                   '00_data', 
                                   'input',  
                                   f'{target}_{forcefield}.dat'
                                  )
    if not os.path.exists(file_path):
        print(f'File {file_path} does not exist.')
        return
    # read in result file
    res = pd.read_csv(file_path,
                      header=None, 
                      comment='#', 
                      skipinitialspace=True, 
                      names=['edge', 'calc DDG', 'calc dDDG' , 'add dDDG'], 
                      sep=' ')
    res.index = res['edge']
    # read in exp. data
    edg = edges.EdgeSet(target)
    df = edg.get_dataframe(columns=[0,1, 'exp. DeltaG [kcal/mol]', 'exp. Error [kcal/mol]'])
    df.index = pd.Series(['edge_' + str(lig1) + '_' + str(lig2) for lig1, lig2 in zip(df[0].values, df[1].values)])
   
    # copy in calculated values
    df['calc_DDG'] = res.loc[:,'calc DDG']
    df['calc_dDDG'] = res.loc[:,'calc dDDG']
    df['calc_dDDG(additional)'] = res.loc[:, 'add dDDG']
    
    # remove unit of calculated values
    df['exp_DDG'] = df['exp. DeltaG [kcal/mol]'].apply(lambda x: x.magnitude)
    df['exp_dDDG'] = df['exp. Error [kcal/mol]'].apply(lambda x: x.magnitude)

    # filter only to relevant columns
    df = df.filter(items = [0,1,'exp_DDG', 'exp_dDDG', 'calc_DDG', 'calc_dDDG', 'calc_dDDG(additional)'])
    df[0]= df[0].astype(str)
    df[1]= df[1].astype(str)
    df.index = pd.Series([f'{target}_edge_' + str(lig1) + '_' + str(lig2) for lig1, lig2 in zip(df[0].values, df[1].values)])
    return df
getRawResults('cmet')

Unnamed: 0,0,1,exp_DDG,exp_dDDG,calc_DDG,calc_dDDG,calc_dDDG(additional)
cmet_edge_CHEMBL3402741_400_CHEMBL3402756_2.7,CHEMBL3402741_400,CHEMBL3402756_2.7,-2.98,0.0,-11.68,2.63,
cmet_edge_CHEMBL3402741_400_CHEMBL3402763_90,CHEMBL3402741_400,CHEMBL3402763_90,-0.89,0.0,1.97,1.01,
cmet_edge_CHEMBL3402741_400_CHEMBL3402764_90,CHEMBL3402741_400,CHEMBL3402764_90,-0.89,0.0,3.64,0.45,
cmet_edge_CHEMBL3402742_23_CHEMBL3402756_2.7,CHEMBL3402742_23,CHEMBL3402756_2.7,-1.27,0.0,-1.1,1.3,
cmet_edge_CHEMBL3402742_23_CHEMBL3402763_90,CHEMBL3402742_23,CHEMBL3402763_90,0.82,0.0,0.25,0.18,
cmet_edge_CHEMBL3402743_42_CHEMBL3402742_23,CHEMBL3402743_42,CHEMBL3402742_23,-0.36,0.0,1.13,0.21,
cmet_edge_CHEMBL3402743_42_CHEMBL3402756_2.7,CHEMBL3402743_42,CHEMBL3402756_2.7,-1.63,0.0,-0.14,0.38,
cmet_edge_CHEMBL3402743_42_CHEMBL3402758_10,CHEMBL3402743_42,CHEMBL3402758_10,-0.85,0.0,-4.22,1.41,
cmet_edge_CHEMBL3402743_42_CHEMBL3402760_1,CHEMBL3402743_42,CHEMBL3402760_1,-2.22,0.0,-2.14,2.05,
cmet_edge_CHEMBL3402743_42_CHEMBL3402762_1,CHEMBL3402743_42,CHEMBL3402762_1,-2.22,0.0,-3.44,1.07,


In [13]:
author = "hahn"
software = "pmx"
forcefield = "openff-1.0.0.offxml"

In [14]:
for target in targets.target_dict:
    df = getRawResults(target, forcefield)
    if df is None:
        continue
    os.makedirs(os.path.join(path, targets.get_target_dir(target), results_dir), exist_ok=True)
    with open(os.path.join(path, targets.get_target_dir(target), results_dir, f'{target}_{software}_{forcefield}_{author}.yaml'), 'w') as file:
        df = df.filter([0,1,'calc_DDG', 'calc_dDDG'])
        df.rename(columns={0: 'ligandA', 1: 'ligandB', 'calc_DDG': 'DDG', 'calc_dDDG': 'dDDG'}, inplace=True)
        df['unit']='kilocalories / mole'
        yaml.dump(df.T.to_dict(), file)

## Gather data from Activity Cliffs, Perez-Benito et al. 

In [10]:
target="ros1"
file_name = f'{path}/../00_data/input/{target}_opls3e.csv'
if not os.path.exists(file_name):
    print(f'File {file_name} does not exist.')
data = pd.read_csv(file_name, header=0, comment='#')
print(data)
data['Ligand1'] = data['Ligand1'].astype(str)
# data['Ligand1'] = [x.split('_')[1] for x in data['Ligand1']]
data['Ligand2'] = data['Ligand2'].astype(str)
# data['Ligand2'] = [x.split('_')[1] for x in data['Ligand2']]
data['edge'] = pd.Series([f'{target}_edge_{a.split("_")[1]}_{b.split("_")[1]}' for a, b in zip(data['Ligand1'].values, data['Ligand2'].values)])
data.index=data['edge']
data.drop(columns='edge', inplace=True)
print(data)
for e, edg in edges.EdgeSet(target).items():
    if f'{target}_{e}' not in data.index:
        print(e)
        edg_dict = edg.get_dict()
#         print(edg_dict)
        n_index = f'{target}_edge_{e.split("_")[2]}_{e.split("_")[1]}'
        if n_index in data.index:
            row = data.loc[n_index, :]
            n_row = pd.Series({"Ligand1": row["Ligand2"],
                               "Ligand2": row["Ligand1"],
                                "edge": e,
                               "FEP": -row["FEP"],
                               "FEP Error": row["FEP Error"]
                              }
                                  )
            data.loc[e,:] = n_row
            data.drop(n_index, inplace=True)
        else:
            print(e)
data.to_csv('tmp.csv', sep=',', columns=["Ligand1", "Ligand2", "FEP", "FEP Error"], index=None)

     Ligand1   Ligand2   FEP  FEP Error
0   lig_0681  lig_1537  0.92       0.06
1   lig_0681  lig_1872 -0.47       0.06
2   lig_0681  lig_5109  1.60       0.15
3   lig_0681  lig_7454  1.14       0.09
4   lig_1537  lig_0529 -2.12       0.07
..       ...       ...   ...        ...
57  lig_1872  lig_6770  2.57       0.24
58  lig_3507  lig_6770  0.53       0.25
59  lig_5602  lig_7454  2.09       0.20
60  lig_6674  lig_6770  1.95       0.23
61  lig_6770  lig_8550 -1.68       0.29

[62 rows x 4 columns]
                      Ligand1   Ligand2   FEP  FEP Error
edge                                                    
ros1_edge_0681_1537  lig_0681  lig_1537  0.92       0.06
ros1_edge_0681_1872  lig_0681  lig_1872 -0.47       0.06
ros1_edge_0681_5109  lig_0681  lig_5109  1.60       0.15
ros1_edge_0681_7454  lig_0681  lig_7454  1.14       0.09
ros1_edge_1537_0529  lig_1537  lig_0529 -2.12       0.07
...                       ...       ...   ...        ...
ros1_edge_1872_6770  lig_1872  lig_6770  

In [11]:
def get_perez_results(target):  
    file_name = f'{path}/../00_data/input/{target}_opls3e.csv'
    if not os.path.exists(file_name):
        print(f'File {file_name} does not exist.')
        return None
    data = pd.read_csv(file_name, header=0, comment='#')
    data['Ligand1'] = data['Ligand1'].astype(str)
    data['Ligand1'] = [x.split('_')[1] for x in data['Ligand1']]
    data['Ligand2'] = data['Ligand2'].astype(str)
    data['Ligand2'] = [x.split('_')[1] for x in data['Ligand2']]
    data['edge'] = pd.Series([f'{target}_edge_{a}_{b}' for a, b in zip(data['Ligand1'].values, data['Ligand2'].values)])
    data.index=data['edge']
    data.drop(columns='edge', inplace=True)
    return data

In [12]:
author = "perez"
for target in ['ros1', 'pde10']:
    df = get_perez_results(target)
    if df is None:
        continue
    os.makedirs(os.path.join(path, targets.get_target_dir(target), results_dir), exist_ok=True)
    forcefield = 'opls3e'
    software = 'fep+'
    unit = 'kilocalories / mole'
    print(df)
    df.rename(columns={'Ligand1': 'ligandA', 'Ligand2': 'ligandB', f'FEP': 'DDG', f'FEP Error': 'dDDG'}, inplace=True)
    df['unit'] = unit
    if forcefield.startswith('fep'):
        forcefield = f'opls3e_{forcefield[-1]}'
    with open(os.path.join(path, targets.get_target_dir(target), results_dir, f'{target}_{software}_{forcefield}_{author}.yaml'), 'w') as file:
        yaml.dump(df.T.to_dict(), file)

                    Ligand1 Ligand2   FEP  FEP Error
edge                                                
ros1_edge_0681_1537    0681    1537  0.92       0.06
ros1_edge_0681_1872    0681    1872 -0.47       0.06
ros1_edge_0681_5109    0681    5109  1.60       0.15
ros1_edge_0681_7454    0681    7454  1.14       0.09
ros1_edge_1537_0529    1537    0529 -2.12       0.07
...                     ...     ...   ...        ...
ros1_edge_1872_6770    1872    6770  2.57       0.24
ros1_edge_3507_6770    3507    6770  0.53       0.25
ros1_edge_5602_7454    5602    7454  2.09       0.20
ros1_edge_6674_6770    6674    6770  1.95       0.23
ros1_edge_6770_8550    6770    8550 -1.68       0.29

[62 rows x 4 columns]
                     Ligand1 Ligand2        FEP  FEP Error
edge                                                      
pde10_edge_0340_0738    0340    0738  -1.750000       0.17
pde10_edge_0340_4754    0340    4754  -0.780000       0.10
pde10_edge_1038_5687    1038    5687   0.190000     

# Gather data from Gapsys et al.
retrieve from https://github.com/deGrootLab/pmx

In [4]:
cmet_dict = {'CHEMBL3402753_200': 'CHEMBL3402753_200_13',
 'CHEMBL3402759_5.7': 'CHEMBL3402759_5.7',
 'CHEMBL3402747_3400': 'CHEMBL3402747_3400_7',
 'CHEMBL3402744_300': 'CHEMBL3402744_300_4',
 'CHEMBL3402745_200': 'CHEMBL3402745_200_5',
 'CHEMBL3402761_1': 'CHEMBL3402761_1_21',
 'CHEMBL3402750_400': 'CHEMBL3402750_400_10',
 'CHEMBL3402743_42': 'CHEMBL3402743_42',
 'CHEMBL3402752_30000': 'CHEMBL3402752_30000_12',
 'CHEMBL3402755_4200': 'CHEMBL3402755_4200_15',
 'CHEMBL3402758_10': 'CHEMBL3402758_10',
 'CHEMBL3402749_500': 'CHEMBL3402749_500_9',
 'CHEMBL3402757_6.5': 'CHEMBL3402757_6.5',
 'CHEMBL3402765_11-charged-pKa-8.1': 'CHEMBL3402765_11-charged-pKa-8.1',
 'CHEMBL3402762_1': 'CHEMBL3402762_1',
 'CHEMBL3402742_23': 'CHEMBL3402742_23',
 'CHEMBL3402754_40': 'CHEMBL3402754_40_14',
 'CHEMBL3402748_5300': 'CHEMBL3402748_5300_8',
 'CHEMBL3402741_400': 'CHEMBL3402741_400',
 'CHEMBL3402763_90': 'CHEMBL3402763_90',
 'CHEMBL3402764_90': 'CHEMBL3402764_90',
 'CHEMBL3402751_2100': 'CHEMBL3402751_2100_11',
 'CHEMBL3402756_2.7': 'CHEMBL3402756_2.7',
 'CHEMBL3402760_1': 'CHEMBL3402760_1'}

In [5]:
temp_directory = tempfile.mkdtemp()

git.Repo.clone_from('https://github.com/deGrootLab/pmx', temp_directory, branch='master', depth=1)

<git.repo.base.Repo '/tmp/tmpiqftyrw2/.git'>

In [6]:
def getGapsysResults(target):  
    file_name = f'{temp_directory}/protLig_benchmark/ddg_data/{target}.dat'
    print(file_name)
    if not os.path.exists(file_name):
        print(f'File {file_name} does not exist.')
        return None
    data = pd.read_csv(f'{temp_directory}/protLig_benchmark/ddg_data/{target}.dat', sep='\s+', header=None, comment='#',
                   names=['edge', 'exp', 'gaff', 'dgaff', 'cgenff', 'dcgenff', 'cons', 'dcons', 'fep5', 'dfep5', 'fep1', 'dfep1'])
    data['dexp']=pd.Series([0.0]*data.shape[0])
    df = edges.EdgeSet(target).get_dataframe()
    if target == 'jnk1':
        df.index = pd.Series([f'{str(a).split("-")[0]}_{str(b).split("-")[0]}' for a, b in zip(df[0].values, df[1].values)])
    else:
        df.index = pd.Series([f'{a}_{b}' for a, b in zip(df[0].values, df[1].values)])
    newdata = data.copy()
    print(newdata['edge'], df.index)
    newdata.index=newdata['edge']

    newdata.columns = pd.MultiIndex.from_arrays([np.array(newdata.columns), ['', 'exp'] + ['pmx'] * 6 + ['fep'] * 4 + ['exp'], [''] + ['kj/mol'] * 12], names=['forcefield', 'method', 'unit'])
    
    newdata.loc[:,('ligandA', '', '')] = df[0].apply(lambda x: str(int(float(x))) if str(x).endswith('.0') else x).astype(str)
    newdata.loc[:,('ligandB', '', '')] = df[1].apply(lambda x: str(int(float(x))) if str(x).endswith('.0') else x).astype(str)
    newdata = newdata.drop(columns=('edge', '', ''))
    newdata.sort_index(axis=1, level=1, inplace=True, sort_remaining=False)
    newdata.index=[f'{target}_edge_{x[("ligandA", "", "")]}_{x[("ligandB", "", "")]}' for i, x in newdata.iterrows()]
    
    return newdata
getGapsysResults('cmet')

/tmp/tmpiqftyrw2/protLig_benchmark/ddg_data/cmet.dat


  result[:] = values


0          CHEMBL3402744_300_4_CHEMBL3402745_200_5
1         CHEMBL3402747_3400_7_CHEMBL3402745_200_5
2        CHEMBL3402747_3400_7_CHEMBL3402748_5300_8
3       CHEMBL3402747_3400_7_CHEMBL3402751_2100_11
4       CHEMBL3402748_5300_8_CHEMBL3402751_2100_11
5         CHEMBL3402749_500_9_CHEMBL3402747_3400_7
6         CHEMBL3402749_500_9_CHEMBL3402748_5300_8
7        CHEMBL3402749_500_9_CHEMBL3402751_2100_11
8          CHEMBL3402749_500_9_CHEMBL3402754_40_14
9        CHEMBL3402750_400_10_CHEMBL3402747_3400_7
10       CHEMBL3402750_400_10_CHEMBL3402748_5300_8
11        CHEMBL3402750_400_10_CHEMBL3402749_500_9
12      CHEMBL3402752_30000_12_CHEMBL3402744_300_4
13     CHEMBL3402752_30000_12_CHEMBL3402747_3400_7
14     CHEMBL3402752_30000_12_CHEMBL3402748_5300_8
15      CHEMBL3402752_30000_12_CHEMBL3402749_500_9
16      CHEMBL3402752_30000_12_CHEMBL3402754_40_14
17    CHEMBL3402752_30000_12_CHEMBL3402755_4200_15
18       CHEMBL3402753_200_13_CHEMBL3402748_5300_8
19        CHEMBL3402753_200_13_

forcefield,ligandA,ligandB,exp,dexp,fep5,dfep5,fep1,dfep1,gaff,dgaff,cgenff,dcgenff,cons,dcons
method,Unnamed: 1_level_1,Unnamed: 2_level_1,exp,exp,fep,fep,fep,fep,pmx,pmx,pmx,pmx,pmx,pmx
unit,Unnamed: 1_level_2,Unnamed: 2_level_2,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol
cmet_edge_CHEMBL3402744_300_4_CHEMBL3402745_200_5,CHEMBL3402744_300_4,CHEMBL3402745_200_5,-1.01,0.0,-3.24,0.27,-3.58,0.68,-3.43,0.4,-2.34,0.41,-2.88,0.56
cmet_edge_nan_nan,,,-7.02,0.0,-6.33,0.49,-5.01,0.89,-6.85,2.33,-18.11,1.66,-12.48,5.62
cmet_edge_nan_nan,,,1.1,0.0,6.19,0.4,4.97,0.39,3.86,0.91,-6.37,1.36,-1.25,5.11
cmet_edge_CHEMBL3402747_3400_7_CHEMBL3402751_2100_11,CHEMBL3402747_3400_7,CHEMBL3402751_2100_11,-1.19,0.0,-2.4,0.64,-5.43,0.41,-1.06,0.83,-0.73,1.16,-0.9,0.57
cmet_edge_CHEMBL3402748_5300_8_CHEMBL3402751_2100_11,CHEMBL3402748_5300_8,CHEMBL3402751_2100_11,-2.29,0.0,-9.22,0.99,-10.0,0.46,-4.83,0.22,2.57,0.59,-1.13,3.69
cmet_edge_nan_nan,,,4.75,0.0,4.97,0.96,5.08,1.53,1.94,0.6,7.3,1.19,4.62,2.65
cmet_edge_CHEMBL3402749_500_9_CHEMBL3402748_5300_8,CHEMBL3402749_500_9,CHEMBL3402748_5300_8,5.85,0.0,11.39,0.75,10.78,0.82,19.71,0.71,9.58,2.21,14.64,5.06
cmet_edge_nan_nan,,,3.56,0.0,2.26,0.76,1.19,1.11,2.82,0.7,6.59,1.83,4.7,1.94
cmet_edge_nan_nan,,,-6.26,0.0,-9.97,0.33,-11.73,0.88,-6.58,0.95,-5.82,1.18,-6.2,0.65
cmet_edge_nan_nan,,,5.31,0.0,8.35,0.22,10.47,0.87,5.81,0.25,4.14,1.03,4.98,0.85


In [52]:
def getGapsysSingleRepeatsResults(target, ff):
    file_name = f'{temp_directory}/protLig_benchmark/dg_data_allRepeats/{target}_{ff}_protein.dat'
    print(file_name)
    if not os.path.exists(file_name):
        print(f'File {file_name} does not exist.')
        return None
    data_complex = pd.read_csv(file_name, sep='\s+', header=None, comment='#',
                   names=['edge', 'dgcomplex1', 'e_dgcomplex1', 'dgcomplex2', 'e_dgcomplex2', 'dgcomplex3', 'e_dgcomplex3'])
    data_complex.index=data_complex['edge']
    file_name = f'{temp_directory}/protLig_benchmark/dg_data_allRepeats/{target}_{ff}_water.dat'
    print(file_name)
    if not os.path.exists(file_name):
        print(f'File {file_name} does not exist.')
        return None
    data_water = pd.read_csv(file_name, sep='\s+', header=None, comment='#',
                   names=['edge', 'dgwater1', 'e_dgwater1', 'dgwater2', 'e_dgwater2', 'dgwater3', 'e_dgwater3'])
    data_water.index=data_water['edge']
   # data['dexp']=pd.Series([0.0]*data.shape[0])
    df = edges.EdgeSet(target).get_dataframe()
    if target == 'jnk1':
        df.index = pd.Series([f'{str(a).split("-")[0]}_{str(b).split("-")[0]}' for a, b in zip(df[0].values, df[1].values)])
    else:
        df.index = pd.Series([f'{a}_{b}' for a, b in zip(df[0].values, df[1].values)])

    d = []
    for edge, row in df.iterrows():
        print(row)
        nedge=edge
#         print(edge, data_water.index)
#         if target=='cmet':
#             for key, item in cmet_dict.items():
#                 nedge=nedge.replace(key, item)
        if nedge not in data_water.index:
            continue
        for i in range(1,4):
            wc='water'
            d.append(
                dict(
                val=unit_registry.Quantity(
                    data_water.loc[nedge,f'dg{wc}{i}'], 'kilojoules / mole'
                ).to('kilocalories / mole').magnitude,
                err= unit_registry.Quantity(
                    data_water.loc[nedge,f'e_dg{wc}{i}'], 'kilojoules / mole'
                ).to('kilocalories / mole').magnitude,
                aerr= np.nan,
                conv=np.nan,
                env=wc,
                repeat=i,
                target=target,
                edge=f'edge_{row[0]}_{row[1]}',
                failed='False'
                )
            )

            wc='complex'
            d.append(
                dict(
                val=unit_registry.Quantity(
                    data_complex.loc[nedge,f'dg{wc}{i}'], 'kilojoules / mole'
                ).to('kilocalories / mole').magnitude,
                err= unit_registry.Quantity(
                    data_complex.loc[nedge,f'e_dg{wc}{i}'], 'kilojoules / mole'
                ).to('kilocalories / mole').magnitude,
                aerr= np.nan,
                conv=np.nan,
                env=wc,
                repeat=i,
                target=target,
                edge=f'edge_{row[0]}_{row[1]}',
                failed='False'
                )
            )

    newdata=pd.DataFrame(d)    
#     newdata.columns = pd.MultiIndex.from_arrays([np.array(newdata.columns), ['', 'exp'] + ['pmx'] * 6 + ['fep'] * 4 + ['exp'], [''] + ['kj/mol'] * 12], names=['forcefield', 'method', 'unit'])
    
#     newdata.loc[:,('ligandA', '', '')] = df[0].apply(lambda x: str(int(float(x))) if str(x).endswith('.0') else x).astype(str)
#     newdata.loc[:,('ligandB', '', '')] = df[1].apply(lambda x: str(int(float(x))) if str(x).endswith('.0') else x).astype(str)
#     newdata = newdata.drop(columns=('edge', '', ''))
#     newdata.sort_index(axis=1, level=1, inplace=True, sort_remaining=False)
#     newdata.index=[f'{target}_edge_{x[("ligandA", "", "")]}_{x[("ligandB", "", "")]}' for i, x in newdata.iterrows()]
    
    return newdata
getGapsysSingleRepeatsResults('cmet', 'cgenff')

/tmp/tmpiqftyrw2/protLig_benchmark/dg_data_allRepeats/cmet_cgenff_protein.dat
/tmp/tmpiqftyrw2/protLig_benchmark/dg_data_allRepeats/cmet_cgenff_water.dat
0                                                         CHEMBL3402741_400
1                                                         CHEMBL3402756_2.7
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       [H][c]1[c]([H])[c]([N]([H])[C](=[O])[O][C]...
Smiles2                       [H][C]1=[C]([H])[C]([c]2[c]([H])[c]([F])[c...
exp. DeltaG [kcal/mol]                             -2.98 kilocalorie / mole
exp. Error [kcal/mol]                                0.0 kilocalorie / mole
Name: CHEMBL3402741_400_CHEMBL3402756_2.7, dtype: object
0                                                         CHEMBL3402741_400
1                                                          CHEMBL3402763_90
Mol1                         

Unnamed: 0,val,err,aerr,conv,env,repeat,target,edge,failed
0,-2.595602,0.050191,,,water,1,cmet,edge_CHEMBL3402744_300_4_CHEMBL3402745_200_5,False
1,-3.044933,0.074092,,,complex,1,cmet,edge_CHEMBL3402744_300_4_CHEMBL3402745_200_5,False
2,-2.581262,0.052581,,,water,2,cmet,edge_CHEMBL3402744_300_4_CHEMBL3402745_200_5,False
3,-3.3174,0.059751,,,complex,2,cmet,edge_CHEMBL3402744_300_4_CHEMBL3402745_200_5,False
4,-2.643403,0.064532,,,water,3,cmet,edge_CHEMBL3402744_300_4_CHEMBL3402745_200_5,False
5,-3.133365,0.054971,,,complex,3,cmet,edge_CHEMBL3402744_300_4_CHEMBL3402745_200_5,False
6,2.808317,0.069312,,,water,1,cmet,edge_CHEMBL3402747_3400_7_CHEMBL3402751_2100_11,False
7,2.454589,0.090822,,,complex,1,cmet,edge_CHEMBL3402747_3400_7_CHEMBL3402751_2100_11,False
8,2.805927,0.059751,,,water,2,cmet,edge_CHEMBL3402747_3400_7_CHEMBL3402751_2100_11,False
9,3.169216,0.074092,,,complex,2,cmet,edge_CHEMBL3402747_3400_7_CHEMBL3402751_2100_11,False


In [53]:
for ff in ['gaff', 'cgenff']:
    dfs = []
    for target in targets.target_dict:
        df = getGapsysSingleRepeatsResults(target, ff)
        if df is not None and df.shape[0]>0:
            dfs.append(df)
    df = pd.concat(dfs, axis=0)
    df.to_csv(f'00b_finished_simulations_{ff}.csv')

/tmp/tmpiqftyrw2/protLig_benchmark/dg_data_allRepeats/jnk1_gaff_protein.dat
/tmp/tmpiqftyrw2/protLig_benchmark/dg_data_allRepeats/jnk1_gaff_water.dat


  result[:] = values


0                                                                   17124-1
1                                                                   18634-1
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       CCOc1c(c(cc(n1)NC(=O)Cc2cc(c(cc2OC)Br)OC)N...
Smiles2                       CCOc1c(c(cc(n1)NC(=O)Cc2cc(ccc2OC)OC)N)C#N...
exp. DeltaG [kcal/mol]                             -0.33 kilocalorie / mole
exp. Error [kcal/mol]                               0.29 kilocalorie / mole
Name: 17124_18634, dtype: object
0                                                                   18626-1
1                                                                   18624-1
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       CCOc1c(c(cc(n1)NC(=O)Cc2c

0                                                                  49220392
1                                                                  49137530
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       CCOc1ccc(c(c1)c2nnc3n2c4cc(ccc4nc3C)CN5CCO...
Smiles2                       CCCCOc1cc(cnc1)c2nnc3n2c4cc(ccc4nc3C)CN5CC...
exp. DeltaG [kcal/mol]                               0.1 kilocalorie / mole
exp. Error [kcal/mol]                                0.5 kilocalorie / mole
Name: 49220392_49137530, dtype: object
0                                                                  49932714
1                                                                  49137530
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       Cc1c2nnc(n2c3cc(ccc

/tmp/tmpiqftyrw2/protLig_benchmark/dg_data_allRepeats/thrombin_gaff_protein.dat
/tmp/tmpiqftyrw2/protLig_benchmark/dg_data_allRepeats/thrombin_gaff_water.dat
0                                                                        1d
1                                                                        6e
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       c1ccc(cc1)C[C@H](C(=O)N2CCC[C@H]2C(=O)NCc3...
Smiles2                       c1ccc(cc1)C[C@H](C(=O)N2CCC[C@H]2C(=O)NCc3...
exp. DeltaG [kcal/mol]                             -0.67 kilocalorie / mole
exp. Error [kcal/mol]                                0.1 kilocalorie / mole
Name: 1d_6e, dtype: object
0                                                                        1d
1                                                                         5
Mol1                          <img data-content="rdkit/

0                                                                   p38a_2u
1                                                                   p38a_2q
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       [H][C]1=[N][C]([N]([H])[C]2([H])[C]([H])([...
Smiles2                       [H][C]1=[N][C]([N]([H])[C]2([H])[C]([H])([...
exp. DeltaG [kcal/mol]                               0.0 kilocalorie / mole
exp. Error [kcal/mol]                               0.13 kilocalorie / mole
Name: p38a_2u_p38a_2q, dtype: object
0                                                                 p38a_3fln
1                                                                   p38a_2g
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       [H][C]1=[N][C]([N]([H

0                                                                     23466
1                                                                     23475
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       [H]c1c(c(c(c(c1[H])N([H])[H])[H])C2=C(C(=C...
Smiles2                       [H]c1c(c(c(c(c1[H])N([H])C2(C(C(C(C2([H])[...
exp. DeltaG [kcal/mol]                             -0.88 kilocalorie / mole
exp. Error [kcal/mol]                                0.0 kilocalorie / mole
Name: 23466_23475, dtype: object
0                                                                     23467
1                                                                     23466
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       [H]c1c(c(c(c(c1[H])OC([H]

/tmp/tmpiqftyrw2/protLig_benchmark/dg_data_allRepeats/cdk2_gaff_water.dat
0                                                                        22
1                                                                      1h1r
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       [H][C]1=[N][C]2=[C]([O][C]([H])([H])[C]3([...
Smiles2                       [H]c1c(c(c(c(c1[H])Cl)[H])N([H])c2nc3c(c(n...
exp. DeltaG [kcal/mol]                              0.18 kilocalorie / mole
exp. Error [kcal/mol]                               0.09 kilocalorie / mole
Name: 22_1h1r, dtype: object
0                                                                      1h1s
1                                                                      1oiy
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecu

0                                                         CHEMBL3402741_400
1                                                         CHEMBL3402756_2.7
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       [H][c]1[c]([H])[c]([N]([H])[C](=[O])[O][C]...
Smiles2                       [H][C]1=[C]([H])[C]([c]2[c]([H])[c]([F])[c...
exp. DeltaG [kcal/mol]                             -2.98 kilocalorie / mole
exp. Error [kcal/mol]                                0.0 kilocalorie / mole
Name: CHEMBL3402741_400_CHEMBL3402756_2.7, dtype: object
0                                                         CHEMBL3402741_400
1                                                          CHEMBL3402763_90
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       [

0                                          CHEMBL3402765_11-charged-pKa-8.1
1                                                           CHEMBL3402760_1
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       [H][C]1=[C]([H])[C]([c]2[c]([H])[c]([F])[c...
Smiles2                       [H][C]1=[C]([H])[C]([c]2[c]([H])[c]([H])[c...
exp. DeltaG [kcal/mol]                             -1.43 kilocalorie / mole
exp. Error [kcal/mol]                                0.0 kilocalorie / mole
Name: CHEMBL3402765_11-charged-pKa-8.1_CHEMBL3402760_1, dtype: object
0                                          CHEMBL3402765_11-charged-pKa-8.1
1                                                          CHEMBL3402764_90
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1           

0                                                                   CAT-13b
1                                                                   CAT-17g
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       [H][C]1=[C]([H])[C]([C]2=[C]([H])[C]([C@]3...
Smiles2                       [H][C]1=[N][C]([H])=[C]([F])[C]([H])=[C]1[...
exp. DeltaG [kcal/mol]                             -0.62 kilocalorie / mole
exp. Error [kcal/mol]                                0.0 kilocalorie / mole
Name: CAT-13b_CAT-17g, dtype: object
0                                                                   CAT-13a
1                                                                   CAT-17g
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       [H][C]1=[C]([H])[C]([

0                                                                        33
1                                                                        13
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       [H][C]1=[N][C]([H])=[C]([H])[C]([C]2=[C]([...
Smiles2                       [H][C]1=[C]([H])[C]([C]2=[C]([H])[C]([H])=...
exp. DeltaG [kcal/mol]                             -2.51 kilocalorie / mole
exp. Error [kcal/mol]                                0.0 kilocalorie / mole
Name: 33_13, dtype: object
0                                                                        33
1                                                                        25
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       [H][C]1=[N][C]([H])=[C]([H])[C]

0                                                                      32_L
1                                                                      31_L
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       [H][C]1=[N][C]([H])=[C]([H])[C]([H])=[C]1[...
Smiles2                       [H][C]1=[N][C]([H])=[C]([H])[C]([H])=[C]1[...
exp. DeltaG [kcal/mol]                               0.0 kilocalorie / mole
exp. Error [kcal/mol]                                0.6 kilocalorie / mole
Name: 32_L_31_L, dtype: object
0                                                                      32_L
1                                                                      29_L
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       [H][C]1=[N][C]([H])=[C]([H]

/tmp/tmpiqftyrw2/protLig_benchmark/dg_data_allRepeats/tyk2_gaff_water.dat
0                                                                    jmc_23
1                                                                    ejm_55
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       [H]c1c(c(c(c(c1[H])Cl)C(=O)N([H])c2c(c(nc(...
Smiles2                       [H]c1c(c(c(c(c1[H])Cl)C(=O)N([H])c2c(c(nc(...
exp. DeltaG [kcal/mol]                              2.52 kilocalorie / mole
exp. Error [kcal/mol]                                0.0 kilocalorie / mole
Name: jmc_23_ejm_55, dtype: object
0                                                                    ejm_44
1                                                                    ejm_55
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/

0                                                                   17124-1
1                                                                   18634-1
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       CCOc1c(c(cc(n1)NC(=O)Cc2cc(c(cc2OC)Br)OC)N...
Smiles2                       CCOc1c(c(cc(n1)NC(=O)Cc2cc(ccc2OC)OC)N)C#N...
exp. DeltaG [kcal/mol]                             -0.33 kilocalorie / mole
exp. Error [kcal/mol]                               0.29 kilocalorie / mole
Name: 17124_18634, dtype: object
0                                                                   18626-1
1                                                                   18624-1
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       CCOc1c(c(cc(n1)NC(=O)Cc2c

0                                                                  49220392
1                                                                  49137530
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       CCOc1ccc(c(c1)c2nnc3n2c4cc(ccc4nc3C)CN5CCO...
Smiles2                       CCCCOc1cc(cnc1)c2nnc3n2c4cc(ccc4nc3C)CN5CC...
exp. DeltaG [kcal/mol]                               0.1 kilocalorie / mole
exp. Error [kcal/mol]                                0.5 kilocalorie / mole
Name: 49220392_49137530, dtype: object
0                                                                  49932714
1                                                                  49137530
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       Cc1c2nnc(n2c3cc(ccc

0                                                                        1d
1                                                                        6e
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       c1ccc(cc1)C[C@H](C(=O)N2CCC[C@H]2C(=O)NCc3...
Smiles2                       c1ccc(cc1)C[C@H](C(=O)N2CCC[C@H]2C(=O)NCc3...
exp. DeltaG [kcal/mol]                             -0.67 kilocalorie / mole
exp. Error [kcal/mol]                                0.1 kilocalorie / mole
Name: 1d_6e, dtype: object
0                                                                        1d
1                                                                         5
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       c1ccc(cc1)C[C@H](C(=O)N2CCC[C@H

0                                                                   p38a_2u
1                                                                   p38a_2q
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       [H][C]1=[N][C]([N]([H])[C]2([H])[C]([H])([...
Smiles2                       [H][C]1=[N][C]([N]([H])[C]2([H])[C]([H])([...
exp. DeltaG [kcal/mol]                               0.0 kilocalorie / mole
exp. Error [kcal/mol]                               0.13 kilocalorie / mole
Name: p38a_2u_p38a_2q, dtype: object
0                                                                 p38a_3fln
1                                                                   p38a_2g
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       [H][C]1=[N][C]([N]([H

0                                                                     23466
1                                                                     23475
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       [H]c1c(c(c(c(c1[H])N([H])[H])[H])C2=C(C(=C...
Smiles2                       [H]c1c(c(c(c(c1[H])N([H])C2(C(C(C(C2([H])[...
exp. DeltaG [kcal/mol]                             -0.88 kilocalorie / mole
exp. Error [kcal/mol]                                0.0 kilocalorie / mole
Name: 23466_23475, dtype: object
0                                                                     23467
1                                                                     23466
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       [H]c1c(c(c(c(c1[H])OC([H]

/tmp/tmpiqftyrw2/protLig_benchmark/dg_data_allRepeats/cdk2_cgenff_protein.dat
/tmp/tmpiqftyrw2/protLig_benchmark/dg_data_allRepeats/cdk2_cgenff_water.dat
0                                                                        22
1                                                                      1h1r
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       [H][C]1=[N][C]2=[C]([O][C]([H])([H])[C]3([...
Smiles2                       [H]c1c(c(c(c(c1[H])Cl)[H])N([H])c2nc3c(c(n...
exp. DeltaG [kcal/mol]                              0.18 kilocalorie / mole
exp. Error [kcal/mol]                               0.09 kilocalorie / mole
Name: 22_1h1r, dtype: object
0                                                                      1h1s
1                                                                      1oiy
Mol1                          <img data-content="rdkit/mo

0                                                         CHEMBL3402741_400
1                                                         CHEMBL3402756_2.7
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       [H][c]1[c]([H])[c]([N]([H])[C](=[O])[O][C]...
Smiles2                       [H][C]1=[C]([H])[C]([c]2[c]([H])[c]([F])[c...
exp. DeltaG [kcal/mol]                             -2.98 kilocalorie / mole
exp. Error [kcal/mol]                                0.0 kilocalorie / mole
Name: CHEMBL3402741_400_CHEMBL3402756_2.7, dtype: object
0                                                         CHEMBL3402741_400
1                                                          CHEMBL3402763_90
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       [

0                                                                        50
1                                                                        60
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       [H]c1c(c2c(c(c1[H])Cl)C(=C(N2[H])C(=O)[O-]...
Smiles2                       [H]c1c(c(c2c(c1[H])C(=C(S2)C(=O)[O-])C([H]...
exp. DeltaG [kcal/mol]                              0.41 kilocalorie / mole
exp. Error [kcal/mol]                               0.15 kilocalorie / mole
Name: 50_60, dtype: object
0                                                                        56
1                                                                        35
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       [H]c1c(c(c2c(c1[H])C(=C(N2C([H]

/tmp/tmpiqftyrw2/protLig_benchmark/dg_data_allRepeats/bace_cgenff_water.dat
0                                                                   CAT-13b
1                                                                   CAT-17g
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       [H][C]1=[C]([H])[C]([C]2=[C]([H])[C]([C@]3...
Smiles2                       [H][C]1=[N][C]([H])=[C]([F])[C]([H])=[C]1[...
exp. DeltaG [kcal/mol]                             -0.62 kilocalorie / mole
exp. Error [kcal/mol]                                0.0 kilocalorie / mole
Name: CAT-13b_CAT-17g, dtype: object
0                                                                   CAT-13a
1                                                                   CAT-17g
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rd

/tmp/tmpiqftyrw2/protLig_benchmark/dg_data_allRepeats/bace_hunt_cgenff_protein.dat
/tmp/tmpiqftyrw2/protLig_benchmark/dg_data_allRepeats/bace_hunt_cgenff_water.dat
0                                                                        33
1                                                                        13
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       [H][C]1=[N][C]([H])=[C]([H])[C]([C]2=[C]([...
Smiles2                       [H][C]1=[C]([H])[C]([C]2=[C]([H])[C]([H])=...
exp. DeltaG [kcal/mol]                             -2.51 kilocalorie / mole
exp. Error [kcal/mol]                                0.0 kilocalorie / mole
Name: 33_13, dtype: object
0                                                                        33
1                                                                        25
Mol1                          <img data-content="

0                                                                      32_L
1                                                                      31_L
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       [H][C]1=[N][C]([H])=[C]([H])[C]([H])=[C]1[...
Smiles2                       [H][C]1=[N][C]([H])=[C]([H])[C]([H])=[C]1[...
exp. DeltaG [kcal/mol]                               0.0 kilocalorie / mole
exp. Error [kcal/mol]                                0.6 kilocalorie / mole
Name: 32_L_31_L, dtype: object
0                                                                      32_L
1                                                                      29_L
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       [H][C]1=[N][C]([H])=[C]([H]

0                                                                      33_L
1                                                                      30_L
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       [H][C]1=[N][C]([H])=[C]([H])[C]([H])=[C]1[...
Smiles2                       [H][C]1=[N][C]([H])=[C]([H])[C]([H])=[C]1[...
exp. DeltaG [kcal/mol]                             -0.11 kilocalorie / mole
exp. Error [kcal/mol]                               0.42 kilocalorie / mole
Name: 33_L_30_L, dtype: object
0                                                                      30_L
1                                                                      29_L
Mol1                          <img data-content="rdkit/molecule" src="da...
Mol2                          <img data-content="rdkit/molecule" src="da...
Smiles1                       [H][C]1=[N][C]([H])=[C]([H]

In [None]:
author = "gapsys"
for target in targets.target_dict:
    original_df = getGapsysResults(target)
    if original_df is None:
        continue
    os.makedirs(os.path.join(path, targets.get_target_dir(target), results_dir), exist_ok=True)
    for forcefield in ['fep5', 'fep1', 'gaff', 'cgenff']:
        df = original_df.loc[:, np.in1d(original_df.columns.get_level_values(0), ['ligandA', 'ligandB', f'{forcefield}', f'd{forcefield}'])].copy()
        software = list(df.columns.get_level_values(1))[list(df.columns.get_level_values(0)).index(f'{forcefield}')]
        unit = list(df.columns.get_level_values(2))[list(df.columns.get_level_values(0)).index(f'{forcefield}')]
        if unit == 'kj/mol':
            unit = 'kilojoules / mole'
        df.columns = df.columns.get_level_values(0)
        df.rename(columns={0: 'ligandA', 1: 'ligandB', f'{forcefield}': 'DDG', f'd{forcefield}': 'dDDG'}, inplace=True)
        df['unit'] = unit
        if forcefield.startswith('fep'):
            forcefield = f'opls3e_{forcefield[-1]}'
        with open(os.path.join(path, targets.get_target_dir(target), results_dir, f'{target}_{software}_{forcefield}_{author}.yaml'), 'w') as file:
            yaml.dump(df.T.to_dict(), file)
    
    forcefield = 'exp'
    df = original_df.loc[:, np.in1d(original_df.columns.get_level_values(0), ['ligandA', 'ligandB', f'{forcefield}', f'd{forcefield}'])].copy()
    software = list(df.columns.get_level_values(1))[list(df.columns.get_level_values(0)).index(f'{forcefield}')]
    unit = list(df.columns.get_level_values(2))[list(df.columns.get_level_values(0)).index(f'{forcefield}')]
    if unit == 'kj/mol':
        unit = 'kilojoules / mole'
    df.columns = df.columns.get_level_values(0)
    df.rename(columns={0: 'ligandA', 1: 'ligandB', f'{forcefield}': 'DDG', f'd{forcefield}': 'dDDG'}, inplace=True)
    df['unit'] = unit
    with open(os.path.join(path, targets.get_target_dir(target), results_dir, f'{target}_experiment_{author}.yaml'), 'w') as file:
        yaml.dump(df.T.to_dict(), file)

# Compare experimental values stored in repository with Gapsys exp. data

In [44]:
for target in targets.target_dict:
    author = "hahn"
    software = "experiment"
    file_name = os.path.join(path, targets.get_target_dir(target), results_dir, 
                                   f'{target}_{software}_{author}.yaml'
                       )
    if os.path.exists(file_name):
        with open(file_name, 'r') as file:
            data1 = yaml.safe_load(file)
    else:
        print(f"File {file_name} for target {target} not available")
        continue
        
    author = "gapsys"
    software = "experiment"
    file_name = os.path.join(path, targets.get_target_dir(target), results_dir, 
                                   f'{target}_{software}_{author}.yaml'
                       )
    if os.path.exists(file_name):
        with open(file_name, 'r') as file:
            data2 = yaml.safe_load(file)
    else:
        print(f"File {file_name} for target {target} not available")
        continue
        
    for e, edata in data1.items():
        if e in data2:
            v1 = unit_registry.Quantity(edata['DDG'], edata['unit'])
            v2 = unit_registry.Quantity(data2[e]['DDG'], data2[e]['unit'])
            if not np.isclose(v1.to('kilocalories / mole').magnitude, 
                              v2.to('kilocalories / mole').magnitude, 
                              atol=.05,
                              equal_nan=False):
                print(target, e, v1.to('kilocalories / mole'), v2.to('kilocalories / mole'))
        else:
            print(target, e)

cmet cmet_edge_CHEMBL3402741_400_CHEMBL3402756_2.7
cmet cmet_edge_CHEMBL3402741_400_CHEMBL3402763_90
cmet cmet_edge_CHEMBL3402741_400_CHEMBL3402764_90
cmet cmet_edge_CHEMBL3402742_23_CHEMBL3402756_2.7
cmet cmet_edge_CHEMBL3402742_23_CHEMBL3402763_90
cmet cmet_edge_CHEMBL3402743_42_CHEMBL3402742_23
cmet cmet_edge_CHEMBL3402743_42_CHEMBL3402756_2.7
cmet cmet_edge_CHEMBL3402743_42_CHEMBL3402758_10
cmet cmet_edge_CHEMBL3402743_42_CHEMBL3402760_1
cmet cmet_edge_CHEMBL3402743_42_CHEMBL3402762_1
cmet cmet_edge_CHEMBL3402743_42_CHEMBL3402763_90
cmet cmet_edge_CHEMBL3402743_42_CHEMBL3402764_90
cmet cmet_edge_CHEMBL3402744_300_4_CHEMBL3402752_30000_12
cmet cmet_edge_CHEMBL3402744_300_4_CHEMBL3402756_2.7
cmet cmet_edge_CHEMBL3402744_300_4_CHEMBL3402757_6.5
cmet cmet_edge_CHEMBL3402745_200_5_CHEMBL3402742_23
cmet cmet_edge_CHEMBL3402745_200_5_CHEMBL3402743_42
cmet cmet_edge_CHEMBL3402745_200_5_CHEMBL3402748_5300_8
cmet cmet_edge_CHEMBL3402745_200_5_CHEMBL3402752_30000_12
cmet cmet_edge_CHEMBL34027

# Get results from Wang et. al., JACS 2015
Input file taken from https://pubs.acs.org/doi/suppl/10.1021/ja512751q/suppl_file/ja512751q_si_003.xlsx (retrieval 2020-09-21) and converted to csv file

In [18]:
def getWangResults(target):    
    file_name = f'{path}/../00_data/input/ja512751q_si_003.csv'
    if not os.path.exists(file_name):
        print(f'File {file_name} does not exist.')        
        return None
    data = pd.read_csv(file_name, sep=',')
    data['system'] = data['system'].apply(lambda x: str(x).lower())
    indeces = data.loc[data['system']!='nan', :].index
    data_per_target = [data.loc[indeces[i]:indeces[i+1]-1,:] for i in range(indeces.shape[0]-1)]
    data_per_target = {d['system'].iloc[0]: d for d in data_per_target}
    for t, d in data_per_target.items():
        d.index = [f'{target}_edge_{x["Ligand1"]}_{x["Ligand2"]}' for i, x in d.iterrows()]
        d = d[['Ligand1', 'Ligand2', 'bennett_ddG', 'bennett_error']]
        d = d.rename(columns={'Ligand1': 'LigandA',
                             'Ligand2': 'LigandB',
                             'bennett_ddG': 'DDG',
                             'bennett_error': 'dDDG'})
        d['unit'] = 'kilocalories / mole'
        data_per_target[t] = d
    if target in data_per_target:
        return data_per_target[target]
    else:
        return None
getWangResults('jnk1').head()

Unnamed: 0,LigandA,LigandB,DDG,dDDG,unit
jnk1_edge_17124-1_18634-1,17124-1,18634-1,0.47,0.08,kilocalories / mole
jnk1_edge_18626-1_18624-1,18626-1,18624-1,0.76,0.08,kilocalories / mole
jnk1_edge_18636-1_18625-1,18636-1,18625-1,-0.3,0.09,kilocalories / mole
jnk1_edge_18632-1_18624-1,18632-1,18624-1,0.6,0.09,kilocalories / mole
jnk1_edge_18635-1_18625-1,18635-1,18625-1,0.97,0.07,kilocalories / mole


In [19]:
author = 'wang'
software = 'fep+'
forcefield = 'opls2.1'

In [20]:
for target in targets.target_dict:
    df = getWangResults(target)
    if df is None:
        continue
    os.makedirs(os.path.join(path, targets.get_target_dir(target), results_dir), exist_ok=True)
    with open(os.path.join(path, targets.get_target_dir(target), results_dir, f'{target}_{software}_{forcefield}_{author}.yaml'), 'w') as file:
        yaml.dump(df.T.to_dict(), file)

# Gather fep-benchmark data (Schindler, Merck KGaA)

In [13]:
# Create temporary dir
temp_directory = tempfile.mkdtemp()
repo = git.Repo.clone_from('https://github.com/MCompChem/fep-benchmark', temp_directory, branch='v1.0')

In [14]:
# dict to match names
# <name in Schindler et al.> : <name in protein ligand benchmark>
cmet_dict = {'CHEMBL3402753_200': 'CHEMBL3402753_200_13',
 'CHEMBL3402759_5.7': 'CHEMBL3402759_5.7',
 'CHEMBL3402747_3400': 'CHEMBL3402747_3400_7',
 'CHEMBL3402744_300': 'CHEMBL3402744_300_4',
 'CHEMBL3402745_200': 'CHEMBL3402745_200_5',
 'CHEMBL3402761_1': 'CHEMBL3402761_1_21',
 'CHEMBL3402750_400': 'CHEMBL3402750_400_10',
 'CHEMBL3402743_42': 'CHEMBL3402743_42',
 'CHEMBL3402752_30000': 'CHEMBL3402752_30000_12',
 'CHEMBL3402755_4200': 'CHEMBL3402755_4200_15',
 'CHEMBL3402758_10': 'CHEMBL3402758_10',
 'CHEMBL3402749_500': 'CHEMBL3402749_500_9',
 'CHEMBL3402757_6.5': 'CHEMBL3402757_6.5',
 'CHEMBL3402765_11-charged-pKa-8.1': 'CHEMBL3402765_11-charged-pKa-8.1',
 'CHEMBL3402762_1': 'CHEMBL3402762_1',
 'CHEMBL3402742_23': 'CHEMBL3402742_23',
 'CHEMBL3402754_40': 'CHEMBL3402754_40_14',
 'CHEMBL3402748_5300': 'CHEMBL3402748_5300_8',
 'CHEMBL3402741_400': 'CHEMBL3402741_400',
 'CHEMBL3402763_90': 'CHEMBL3402763_90',
 'CHEMBL3402764_90': 'CHEMBL3402764_90',
 'CHEMBL3402751_2100': 'CHEMBL3402751_2100_11',
 'CHEMBL3402756_2.7': 'CHEMBL3402756_2.7',
 'CHEMBL3402760_1': 'CHEMBL3402760_1'}
shp2_dict = {'SHP099-1/Example 7': 'SHP099-1',
 'Example 22': 'E22',
 'Example 29': 'E29',
 '11': '11',
 '4': '4',
 'Example 2': 'E2',
 'Example 14': 'E14',
 'Example 26': 'E26',
 'SHP836-2': 'SHP836-2',
 'Example 6': 'E6',
 'Example 1': 'E1',
 '3': '3',
 'Example 9': 'E9',
 'Example 8': 'E8',
 '7': '7',
 'Example 24': 'E24',
 'Example 28': 'E28',
 '10': '10',
 'Example 25': 'E25',
 'Example 27': 'E27',
 '6': '6',
 'Example 30': 'E30',
 'Example 5': 'E5',
 'Example 4': 'E4',
 'Example 23': 'E23',
 'Example 3': 'E3'}

In [15]:
def getSchindlerResults(target):
    file_name = f'{temp_directory}/{target}/results_edges_5ns.csv'
    if not os.path.exists(file_name):
        print(f'File {file_name} does not exist.')
        return None
    fepbenchmark5 = pd.read_csv(file_name)

    fepbenchmark5['ligandA'] = fepbenchmark5['Ligand1'].apply(lambda x: str(int(float(x))) if str(x).endswith('.0') else x)
    fepbenchmark5['ligandB'] = fepbenchmark5['Ligand2'].apply(lambda x: str(int(float(x))) if str(x).endswith('.0') else x)
    if target == 'cmet':
        for i, row in fepbenchmark5.iterrows():
            for lig in ['ligandA', 'ligandB']:
                if row[lig] in cmet_dict:
                    fepbenchmark5.loc[i, lig] = cmet_dict[row[lig]]    
    if target == "shp2":
        file_name = f'{temp_directory}/{target}/results_edges_20ns.csv'
        if not os.path.exists(file_name):
            print(f'File {file_name} does not exist.')
            return None
        fepbenchmark20= pd.read_csv(file_name)

        fepbenchmark20['ligandA'] = fepbenchmark20['Ligand1'].apply(lambda x: str(int(float(x))) if str(x).endswith('.0') else x)
        fepbenchmark20['ligandB'] = fepbenchmark20['Ligand2'].apply(lambda x: str(int(float(x))) if str(x).endswith('.0') else x)
        llist = []
        for i, row in fepbenchmark5.iterrows():
            for lig in ['ligandA', 'ligandB']:
                if row[lig] in shp2_dict:
                    fepbenchmark5.loc[i, lig] = shp2_dict[row[lig]]
            if pd.isna(row["FEP"]):
                nrow = fepbenchmark20.loc[i,:]
                assert row['Ligand1'] == nrow['Ligand1']
                assert row['Ligand2'] == nrow['Ligand2']
                fepbenchmark5.loc[i,"FEP"] = nrow["FEP"]
                fepbenchmark5.loc[i,"FEP Error"] = nrow["FEP Error"]
                llist.append(f'edge_{fepbenchmark5.loc[i,"ligandA"]}_{fepbenchmark5.loc[i,"ligandB"]}')
        print(f"{target} 20 ns runs",  llist)
    fepbenchmark5 = fepbenchmark5.loc[:, ['ligandA', 'ligandB', 'FEP', 'FEP Error']]
    fepbenchmark5 = fepbenchmark5.rename(columns={
                             'FEP': 'DDG',
                             'FEP Error': 'dDDG'})
    fepbenchmark5.index = [f'{target}_edge_{x["ligandA"]}_{x["ligandB"]}' for i, x in fepbenchmark5.iterrows()]
    fepbenchmark5['unit'] = 'kilocalories / mole'
    return fepbenchmark5
getSchindlerResults('hif2a').head()

Unnamed: 0,ligandA,ligandB,DDG,dDDG,unit
hif2a_edge_338_165,338,165,-0.44,0.25,kilocalories / mole
hif2a_edge_338_215,338,215,-0.76,0.13,kilocalories / mole
hif2a_edge_338_163,338,163,-1.02,0.1,kilocalories / mole
hif2a_edge_43_235,43,235,-0.2,0.1,kilocalories / mole
hif2a_edge_43_54,43,54,0.9,0.08,kilocalories / mole


In [33]:
author = 'schindler'
software = 'fep+'
forcefield = 'opls3e'

In [34]:
for target in targets.target_dict:
    df = getSchindlerResults(target)
    if df is None:
        continue
    os.makedirs(os.path.join(path, targets.get_target_dir(target), results_dir), exist_ok=True)
    with open(os.path.join(path, 
                           targets.get_target_dir(target), 
                           results_dir, 
                           f'{target}_{software}_{forcefield}_{author}.yaml'), 
              'w') as file:
        yaml.dump(df.T.to_dict(), file)

File /tmp/tmpaia1ucnz/jnk1/results_edges_5ns.csv does not exist.
File /tmp/tmpaia1ucnz/pde2/results_edges_5ns.csv does not exist.
File /tmp/tmpaia1ucnz/thrombin/results_edges_5ns.csv does not exist.
File /tmp/tmpaia1ucnz/p38/results_edges_5ns.csv does not exist.
File /tmp/tmpaia1ucnz/ptp1b/results_edges_5ns.csv does not exist.
File /tmp/tmpaia1ucnz/galectin/results_edges_5ns.csv does not exist.
File /tmp/tmpaia1ucnz/cdk2/results_edges_5ns.csv does not exist.
File /tmp/tmpaia1ucnz/mcl1/results_edges_5ns.csv does not exist.
File /tmp/tmpaia1ucnz/bace/results_edges_5ns.csv does not exist.
File /tmp/tmpaia1ucnz/bace_hunt/results_edges_5ns.csv does not exist.
File /tmp/tmpaia1ucnz/bace_p2/results_edges_5ns.csv does not exist.
File /tmp/tmpaia1ucnz/tyk2/results_edges_5ns.csv does not exist.
File /tmp/tmpaia1ucnz/ros1/results_edges_5ns.csv does not exist.
File /tmp/tmpaia1ucnz/pde10/results_edges_5ns.csv does not exist.
shp2 20 ns runs ['edge_SHP099-1_E22', 'edge_SHP099-1_E2', 'edge_SHP099-1_

In [35]:
target='syk'
for target in targets.target_dict:
    author = "hahn"
    software = "experiment"
    file_name = os.path.join(path, targets.get_target_dir(target), results_dir, 
                                   f'{target}_{software}_{author}.yaml'
                       )
    if os.path.exists(file_name):
        with open(file_name, 'r') as file:
            data1 = yaml.safe_load(file)
    else:
        print(f"File {file_name} for target {target} not available")
        continue
        
    file_name = f'{temp_directory}/{target}/results_edges_5ns.csv'
    if not os.path.exists(file_name):
        print(f'File {file_name} does not exist.')
        continue
    fepbenchmark5 = pd.read_csv(file_name)
    fepbenchmark5['ligandA'] = fepbenchmark5['Ligand1'].apply(lambda x: str(int(float(x))) if str(x).endswith('.0') else x)
    fepbenchmark5['ligandB'] = fepbenchmark5['Ligand2'].apply(lambda x: str(int(float(x))) if str(x).endswith('.0') else x)
    if target == 'cmet':
        for i, row in fepbenchmark5.iterrows():
            for lig in ['ligandA', 'ligandB']:
                if row[lig] in cmet_dict:
                    fepbenchmark5.loc[i, lig] = cmet_dict[row[lig]]    
    if target == "shp2":
        file_name = f'{temp_directory}/{target}/results_edges_20ns.csv'
        if not os.path.exists(file_name):
            print(f'File {file_name} does not exist.')
            continue
        fepbenchmark20= pd.read_csv(file_name)

        fepbenchmark20['ligandA'] = fepbenchmark20['Ligand1'].apply(lambda x: str(int(float(x))) if str(x).endswith('.0') else x)
        fepbenchmark20['ligandB'] = fepbenchmark20['Ligand2'].apply(lambda x: str(int(float(x))) if str(x).endswith('.0') else x)
        llist = []
        for i, row in fepbenchmark5.iterrows():
            for lig in ['ligandA', 'ligandB']:
                if row[lig] in shp2_dict:
                    fepbenchmark5.loc[i, lig] = shp2_dict[row[lig]]
            if pd.isna(row["FEP"]):
                nrow = fepbenchmark20.loc[i,:]
                assert row['Ligand1'] == nrow['Ligand1']
                assert row['Ligand2'] == nrow['Ligand2']
                fepbenchmark5.loc[i,"FEP"] = nrow["FEP"]
                fepbenchmark5.loc[i,"FEP Error"] = nrow["FEP Error"]
                llist.append(f'edge_{fepbenchmark5.loc[i,"ligandA"]}_{fepbenchmark5.loc[i,"ligandB"]}')
        print(f"{target} 20 ns runs",  llist)
    fepbenchmark5 = fepbenchmark5.rename(columns={
                             'FEP': 'DDG',
                             'FEP Error': 'dDDG'})
    fepbenchmark5.index = [f'{target}_edge_{x["ligandA"]}_{x["ligandB"]}' for i, x in fepbenchmark5.iterrows()]
    fepbenchmark5['unit'] = 'kilocalories / mole'
    for e, edata in data1.items():
        v1 = unit_registry.Quantity(edata['DDG'], edata['unit'])
        v2 = unit_registry.Quantity(fepbenchmark5.loc[e, 'Exp.'], fepbenchmark5.loc[e, 'unit'])
        if not np.isclose(v1.to('kilocalories / mole').magnitude, 
                          v2.to('kilocalories / mole').magnitude, 
                              atol=.05,
                              equal_nan=False):
            print(target, e, v1.to('kilocalories / mole'), v2.to('kilocalories / mole'))
        #else:
        #    print(target, e)
    #print(data1, fepbenchmark5)

File /tmp/tmpaia1ucnz/jnk1/results_edges_5ns.csv does not exist.
File /tmp/tmpaia1ucnz/pde2/results_edges_5ns.csv does not exist.
File /tmp/tmpaia1ucnz/thrombin/results_edges_5ns.csv does not exist.
File /tmp/tmpaia1ucnz/p38/results_edges_5ns.csv does not exist.
File /tmp/tmpaia1ucnz/ptp1b/results_edges_5ns.csv does not exist.
File /tmp/tmpaia1ucnz/galectin/results_edges_5ns.csv does not exist.
File /tmp/tmpaia1ucnz/cdk2/results_edges_5ns.csv does not exist.
File /tmp/tmpaia1ucnz/mcl1/results_edges_5ns.csv does not exist.
File /tmp/tmpaia1ucnz/bace/results_edges_5ns.csv does not exist.
File /tmp/tmpaia1ucnz/bace_hunt/results_edges_5ns.csv does not exist.
File /tmp/tmpaia1ucnz/bace_p2/results_edges_5ns.csv does not exist.
File /tmp/tmpaia1ucnz/tyk2/results_edges_5ns.csv does not exist.
File /tmp/tmpaia1ucnz/ros1/results_edges_5ns.csv does not exist.
File /tmp/tmpaia1ucnz/pde10/results_edges_5ns.csv does not exist.
shp2 20 ns runs ['edge_SHP099-1_E22', 'edge_SHP099-1_E2', 'edge_SHP099-1_

# Create a simple, dump null model (all activities set to 0)

In [26]:
def getNullModell(target):
    edg = edges.EdgeSet(target)
    df = edg.get_dataframe(columns=[0,1])
    df = df.rename(columns={0: "ligandA", 1: "ligandB"})
    df.index = pd.Series([f'{target}_edge_' + str(lig1) + '_' + str(lig2) for lig1, lig2 in zip(df["ligandA"].values, df["ligandB"].values)])

    
    df['ligandA']= df['ligandA'].astype(str)
    df['ligandB']= df['ligandB'].astype(str)
    df['DDG'] = 0.0
    df['dDDG'] = 0.1
    df['unit'] = 'kilocalories / mole'
    return df
getNullModell('cmet').head()

Unnamed: 0,ligandA,ligandB,DDG,dDDG,unit
cmet_edge_CHEMBL3402741_400_CHEMBL3402756_2.7,CHEMBL3402741_400,CHEMBL3402756_2.7,0.0,0.1,kilocalories / mole
cmet_edge_CHEMBL3402741_400_CHEMBL3402763_90,CHEMBL3402741_400,CHEMBL3402763_90,0.0,0.1,kilocalories / mole
cmet_edge_CHEMBL3402741_400_CHEMBL3402764_90,CHEMBL3402741_400,CHEMBL3402764_90,0.0,0.1,kilocalories / mole
cmet_edge_CHEMBL3402742_23_CHEMBL3402756_2.7,CHEMBL3402742_23,CHEMBL3402756_2.7,0.0,0.1,kilocalories / mole
cmet_edge_CHEMBL3402742_23_CHEMBL3402763_90,CHEMBL3402742_23,CHEMBL3402763_90,0.0,0.1,kilocalories / mole


In [27]:
author = 'hahn'
software = 'null'
forcefield = 'null'

In [28]:
for target in targets.target_dict:
    df = getNullModell(target)
    if df is None:
        continue
    os.makedirs(os.path.join(path, targets.get_target_dir(target), results_dir), exist_ok=True)
    with open(os.path.join(path, targets.get_target_dir(target), results_dir, f'{target}_{software}_{forcefield}_{author}.yaml'), 'w') as file:
        yaml.dump(df.T.to_dict(), file)