# Gather data from various source into the analysis repository

In [15]:
# imports
import os
import shutil
import numpy as np
import pandas as pd
import yaml
import git
import tempfile
import sys
sys.path.append(os.path.join(os.getcwd(), '..'))

import pint
unit_registry = pint.UnitRegistry()

from PLBenchmarks import targets, ligands, edges
from IPython.core.display import HTML

from tqdm.notebook import tqdm

import benchmarkpl
path = benchmarkpl.__path__[0]

# Set path of data directory

In [16]:
targets.set_data_dir(os.path.join(path))
# directory name where results for each target are stored
results_dir = '10_results'

# Number of targets, ligands and edges in the data set

In [17]:
nligs, nedgs = 0, 0
print(f'{"Target":10s} {"Num Ligs":>10s} {"Num Edges":>10s}')
print(33 * '-')
for target in tqdm(targets.target_dict):
    print(f'{target:10s} {len(ligands.LigandSet(target)):10d} {len(edges.EdgeSet(target)):10d}')
    nligs += len(ligands.LigandSet(target))
    nedgs += len(edges.EdgeSet(target))
print(33 * '-')
print(f'{"total":10s} {nligs:10d} {nedgs:10d}')

Target       Num Ligs  Num Edges
---------------------------------


  0%|          | 0/22 [00:00<?, ?it/s]

jnk1               21         31
pde2               21         34
thrombin           11         16
p38                34         56
ptp1b              23         49
galectin            8          7
cdk2               16         25
cmet               24         57
mcl1               42         71
bace               36         58
bace_hunt          32         60
bace_p2            12         26
tyk2               16         24
ros1               28         61
eg5                28         65
cdk8               33         54
hif2a              42         92
pfkfb3             40         66
pde10              35         59
shp2               26         56
syk                44        101
tnks2              27         60
---------------------------------
total             599       1128


# Experimental values stored in repository
Retrieve the experimental values stored in the dataset

In [4]:
# function to retrieve exp. data from PLBenchmarks 
def getExpResults(target):
    edg = edges.EdgeSet(target)
    df = edg.get_dataframe(columns=[0,1, 'exp. DeltaG [kcal/mol]', 'exp. Error [kcal/mol]'])
    df.index = pd.Series([f'{target}_edge_' + str(lig1) + '_' + str(lig2) for lig1, lig2 in zip(df[0].values, df[1].values)])
   
    # remove unit of exp. values
    df['exp_DDG'] = df['exp. DeltaG [kcal/mol]'].apply(lambda x: x.magnitude)
    df['exp_dDDG'] = df['exp. Error [kcal/mol]'].apply(lambda x: x.magnitude)

    # filter only to relevant columns
    df = df.filter(items = [0,1,'exp_DDG', 'exp_dDDG'])
    df[0]= df[0].astype(str)
    df[1]= df[1].astype(str)
    return df
getExpResults('tyk2').head()

Unnamed: 0,0,1,exp_DDG,exp_dDDG
tyk2_edge_jmc_23_ejm_55,jmc_23,ejm_55,2.52,0.0
tyk2_edge_ejm_44_ejm_55,ejm_44,ejm_55,-1.8,0.0
tyk2_edge_ejm_49_ejm_31,ejm_49,ejm_31,-1.81,0.0
tyk2_edge_ejm_31_ejm_46,ejm_31,ejm_46,-1.79,0.0
tyk2_edge_jmc_28_jmc_27,jmc_28,jmc_27,-0.3,0.0


In [5]:
author = "hahn"
software = "experiment"

In [7]:
for target in targets.target_dict:
    df = getExpResults(target)
    if df is None:
        continue
    os.makedirs(os.path.join(path, targets.get_target_dir(target)), exist_ok=True)
    os.makedirs(os.path.join(path, targets.get_target_dir(target), results_dir), exist_ok=True)
    with open(os.path.join(path, targets.get_target_dir(target), results_dir, f'{target}_{software}_{author}.yaml'), 'w') as file:
        df = df.filter([0,1,'exp_DDG', 'exp_dDDG'])
        df.rename(columns={0: 'ligandA', 1: 'ligandB', 'exp_DDG': 'DDG', 'exp_dDDG': 'dDDG'}, inplace=True)
        df['unit']='kilocalories / mole'
        yaml.dump(df.T.to_dict(), file)

# pmx calculations with openFF parameters (Hahn et al.)

In [12]:
# function to retrieve data from PLBenchmarks calculations, soted in 00_data/input 
def getRawResults(target, forcefield='openff-1.0.0.offxml'):
    file_path = os.path.join(path, 
                                   '..', 
                                   '00_data', 
                                   'input',  
                                   f'{target}_{forcefield}.dat'
                                  )
    if not os.path.exists(file_path):
        print(f'File {file_path} does not exist.')
        return
    # read in result file
    res = pd.read_csv(file_path,
                      header=None, 
                      comment='#', 
                      skipinitialspace=True, 
                      names=['edge', 'calc DDG', 'calc dDDG' , 'add dDDG'], 
                      sep=' ')
    res.index = res['edge']
    # read in exp. data
    edg = edges.EdgeSet(target)
    df = edg.get_dataframe(columns=[0,1, 'exp. DeltaG [kcal/mol]', 'exp. Error [kcal/mol]'])
    df.index = pd.Series(['edge_' + str(lig1) + '_' + str(lig2) for lig1, lig2 in zip(df[0].values, df[1].values)])
   
    # copy in calculated values
    df['calc_DDG'] = res.loc[:,'calc DDG']
    df['calc_dDDG'] = res.loc[:,'calc dDDG']
    df['calc_dDDG(additional)'] = res.loc[:, 'add dDDG']
    
    # remove unit of calculated values
    df['exp_DDG'] = df['exp. DeltaG [kcal/mol]'].apply(lambda x: x.magnitude)
    df['exp_dDDG'] = df['exp. Error [kcal/mol]'].apply(lambda x: x.magnitude)

    # filter only to relevant columns
    df = df.filter(items = [0,1,'exp_DDG', 'exp_dDDG', 'calc_DDG', 'calc_dDDG', 'calc_dDDG(additional)'])
    df[0]= df[0].astype(str)
    df[1]= df[1].astype(str)
    df.index = pd.Series([f'{target}_edge_' + str(lig1) + '_' + str(lig2) for lig1, lig2 in zip(df[0].values, df[1].values)])
    return df
getRawResults('cmet')

Unnamed: 0,0,1,exp_DDG,exp_dDDG,calc_DDG,calc_dDDG,calc_dDDG(additional)
cmet_edge_CHEMBL3402741_400_CHEMBL3402756_2.7,CHEMBL3402741_400,CHEMBL3402756_2.7,-2.98,0.0,-11.68,2.63,
cmet_edge_CHEMBL3402741_400_CHEMBL3402763_90,CHEMBL3402741_400,CHEMBL3402763_90,-0.89,0.0,1.97,1.01,
cmet_edge_CHEMBL3402741_400_CHEMBL3402764_90,CHEMBL3402741_400,CHEMBL3402764_90,-0.89,0.0,3.64,0.45,
cmet_edge_CHEMBL3402742_23_CHEMBL3402756_2.7,CHEMBL3402742_23,CHEMBL3402756_2.7,-1.27,0.0,-1.1,1.3,
cmet_edge_CHEMBL3402742_23_CHEMBL3402763_90,CHEMBL3402742_23,CHEMBL3402763_90,0.82,0.0,0.25,0.18,
cmet_edge_CHEMBL3402743_42_CHEMBL3402742_23,CHEMBL3402743_42,CHEMBL3402742_23,-0.36,0.0,1.13,0.21,
cmet_edge_CHEMBL3402743_42_CHEMBL3402756_2.7,CHEMBL3402743_42,CHEMBL3402756_2.7,-1.63,0.0,-0.14,0.38,
cmet_edge_CHEMBL3402743_42_CHEMBL3402758_10,CHEMBL3402743_42,CHEMBL3402758_10,-0.85,0.0,-4.22,1.41,
cmet_edge_CHEMBL3402743_42_CHEMBL3402760_1,CHEMBL3402743_42,CHEMBL3402760_1,-2.22,0.0,-2.14,2.05,
cmet_edge_CHEMBL3402743_42_CHEMBL3402762_1,CHEMBL3402743_42,CHEMBL3402762_1,-2.22,0.0,-3.44,1.07,


In [13]:
author = "hahn"
software = "pmx"
forcefield = "openff-1.0.0.offxml"

In [14]:
for target in targets.target_dict:
    df = getRawResults(target, forcefield)
    if df is None:
        continue
    os.makedirs(os.path.join(path, targets.get_target_dir(target), results_dir), exist_ok=True)
    with open(os.path.join(path, targets.get_target_dir(target), results_dir, f'{target}_{software}_{forcefield}_{author}.yaml'), 'w') as file:
        df = df.filter([0,1,'calc_DDG', 'calc_dDDG'])
        df.rename(columns={0: 'ligandA', 1: 'ligandB', 'calc_DDG': 'DDG', 'calc_dDDG': 'dDDG'}, inplace=True)
        df['unit']='kilocalories / mole'
        yaml.dump(df.T.to_dict(), file)

## Gather data from Activity Cliffs, Perez-Benito et al. 

In [10]:
target="ros1"
file_name = f'{path}/../00_data/input/{target}_opls3e.csv'
if not os.path.exists(file_name):
    print(f'File {file_name} does not exist.')
data = pd.read_csv(file_name, header=0, comment='#')
print(data)
data['Ligand1'] = data['Ligand1'].astype(str)
# data['Ligand1'] = [x.split('_')[1] for x in data['Ligand1']]
data['Ligand2'] = data['Ligand2'].astype(str)
# data['Ligand2'] = [x.split('_')[1] for x in data['Ligand2']]
data['edge'] = pd.Series([f'{target}_edge_{a.split("_")[1]}_{b.split("_")[1]}' for a, b in zip(data['Ligand1'].values, data['Ligand2'].values)])
data.index=data['edge']
data.drop(columns='edge', inplace=True)
print(data)
for e, edg in edges.EdgeSet(target).items():
    if f'{target}_{e}' not in data.index:
        print(e)
        edg_dict = edg.get_dict()
#         print(edg_dict)
        n_index = f'{target}_edge_{e.split("_")[2]}_{e.split("_")[1]}'
        if n_index in data.index:
            row = data.loc[n_index, :]
            n_row = pd.Series({"Ligand1": row["Ligand2"],
                               "Ligand2": row["Ligand1"],
                                "edge": e,
                               "FEP": -row["FEP"],
                               "FEP Error": row["FEP Error"]
                              }
                                  )
            data.loc[e,:] = n_row
            data.drop(n_index, inplace=True)
        else:
            print(e)
data.to_csv('tmp.csv', sep=',', columns=["Ligand1", "Ligand2", "FEP", "FEP Error"], index=None)

     Ligand1   Ligand2   FEP  FEP Error
0   lig_0681  lig_1537  0.92       0.06
1   lig_0681  lig_1872 -0.47       0.06
2   lig_0681  lig_5109  1.60       0.15
3   lig_0681  lig_7454  1.14       0.09
4   lig_1537  lig_0529 -2.12       0.07
..       ...       ...   ...        ...
57  lig_1872  lig_6770  2.57       0.24
58  lig_3507  lig_6770  0.53       0.25
59  lig_5602  lig_7454  2.09       0.20
60  lig_6674  lig_6770  1.95       0.23
61  lig_6770  lig_8550 -1.68       0.29

[62 rows x 4 columns]
                      Ligand1   Ligand2   FEP  FEP Error
edge                                                    
ros1_edge_0681_1537  lig_0681  lig_1537  0.92       0.06
ros1_edge_0681_1872  lig_0681  lig_1872 -0.47       0.06
ros1_edge_0681_5109  lig_0681  lig_5109  1.60       0.15
ros1_edge_0681_7454  lig_0681  lig_7454  1.14       0.09
ros1_edge_1537_0529  lig_1537  lig_0529 -2.12       0.07
...                       ...       ...   ...        ...
ros1_edge_1872_6770  lig_1872  lig_6770  

In [11]:
def get_perez_results(target):  
    file_name = f'{path}/../00_data/input/{target}_opls3e.csv'
    if not os.path.exists(file_name):
        print(f'File {file_name} does not exist.')
        return None
    data = pd.read_csv(file_name, header=0, comment='#')
    data['Ligand1'] = data['Ligand1'].astype(str)
    data['Ligand1'] = [x.split('_')[1] for x in data['Ligand1']]
    data['Ligand2'] = data['Ligand2'].astype(str)
    data['Ligand2'] = [x.split('_')[1] for x in data['Ligand2']]
    data['edge'] = pd.Series([f'{target}_edge_{a}_{b}' for a, b in zip(data['Ligand1'].values, data['Ligand2'].values)])
    data.index=data['edge']
    data.drop(columns='edge', inplace=True)
    return data

In [12]:
author = "perez"
for target in ['ros1', 'pde10']:
    df = get_perez_results(target)
    if df is None:
        continue
    os.makedirs(os.path.join(path, targets.get_target_dir(target), results_dir), exist_ok=True)
    forcefield = 'opls3e'
    software = 'fep+'
    unit = 'kilocalories / mole'
    print(df)
    df.rename(columns={'Ligand1': 'ligandA', 'Ligand2': 'ligandB', f'FEP': 'DDG', f'FEP Error': 'dDDG'}, inplace=True)
    df['unit'] = unit
    if forcefield.startswith('fep'):
        forcefield = f'opls3e_{forcefield[-1]}'
    with open(os.path.join(path, targets.get_target_dir(target), results_dir, f'{target}_{software}_{forcefield}_{author}.yaml'), 'w') as file:
        yaml.dump(df.T.to_dict(), file)

                    Ligand1 Ligand2   FEP  FEP Error
edge                                                
ros1_edge_0681_1537    0681    1537  0.92       0.06
ros1_edge_0681_1872    0681    1872 -0.47       0.06
ros1_edge_0681_5109    0681    5109  1.60       0.15
ros1_edge_0681_7454    0681    7454  1.14       0.09
ros1_edge_1537_0529    1537    0529 -2.12       0.07
...                     ...     ...   ...        ...
ros1_edge_1872_6770    1872    6770  2.57       0.24
ros1_edge_3507_6770    3507    6770  0.53       0.25
ros1_edge_5602_7454    5602    7454  2.09       0.20
ros1_edge_6674_6770    6674    6770  1.95       0.23
ros1_edge_6770_8550    6770    8550 -1.68       0.29

[62 rows x 4 columns]
                     Ligand1 Ligand2        FEP  FEP Error
edge                                                      
pde10_edge_0340_0738    0340    0738  -1.750000       0.17
pde10_edge_0340_4754    0340    4754  -0.780000       0.10
pde10_edge_1038_5687    1038    5687   0.190000     

# Gather data from Gapsys et al.
retrieve from https://github.com/deGrootLab/pmx

In [18]:
cmet_dict = {'CHEMBL3402753_200': 'CHEMBL3402753_200_13',
 'CHEMBL3402759_5.7': 'CHEMBL3402759_5.7',
 'CHEMBL3402747_3400': 'CHEMBL3402747_3400_7',
 'CHEMBL3402744_300': 'CHEMBL3402744_300_4',
 'CHEMBL3402745_200': 'CHEMBL3402745_200_5',
 'CHEMBL3402761_1': 'CHEMBL3402761_1_21',
 'CHEMBL3402750_400': 'CHEMBL3402750_400_10',
 'CHEMBL3402743_42': 'CHEMBL3402743_42',
 'CHEMBL3402752_30000': 'CHEMBL3402752_30000_12',
 'CHEMBL3402755_4200': 'CHEMBL3402755_4200_15',
 'CHEMBL3402758_10': 'CHEMBL3402758_10',
 'CHEMBL3402749_500': 'CHEMBL3402749_500_9',
 'CHEMBL3402757_6.5': 'CHEMBL3402757_6.5',
 'CHEMBL3402765_11-charged-pKa-8.1': 'CHEMBL3402765_11-charged-pKa-8.1',
 'CHEMBL3402762_1': 'CHEMBL3402762_1',
 'CHEMBL3402742_23': 'CHEMBL3402742_23',
 'CHEMBL3402754_40': 'CHEMBL3402754_40_14',
 'CHEMBL3402748_5300': 'CHEMBL3402748_5300_8',
 'CHEMBL3402741_400': 'CHEMBL3402741_400',
 'CHEMBL3402763_90': 'CHEMBL3402763_90',
 'CHEMBL3402764_90': 'CHEMBL3402764_90',
 'CHEMBL3402751_2100': 'CHEMBL3402751_2100_11',
 'CHEMBL3402756_2.7': 'CHEMBL3402756_2.7',
 'CHEMBL3402760_1': 'CHEMBL3402760_1'}

In [19]:
temp_directory = tempfile.mkdtemp()

git.Repo.clone_from('https://github.com/deGrootLab/pmx', temp_directory, branch='master', depth=1)

<git.repo.base.Repo '/tmp/tmp25a3ix6h/.git'>

In [20]:
def getGapsysResults(target):  
    file_name = f'{temp_directory}/protLig_benchmark/ddg_data/{target}.dat'
    print(file_name)
    if not os.path.exists(file_name):
        print(f'File {file_name} does not exist.')
        return None
    data = pd.read_csv(f'{temp_directory}/protLig_benchmark/ddg_data/{target}.dat', sep='\s+', header=None, comment='#',
                   names=['edge', 'exp', 'gaff', 'dgaff', 'cgenff', 'dcgenff', 'cons', 'dcons', 'fep5', 'dfep5', 'fep1', 'dfep1'])
    data['dexp']=pd.Series([0.0]*data.shape[0])
    df = edges.EdgeSet(target).get_dataframe()
    if target == 'jnk1':
        df.index = pd.Series([f'{str(a).split("-")[0]}_{str(b).split("-")[0]}' for a, b in zip(df[0].values, df[1].values)])
    else:
        df.index = pd.Series([f'{a}_{b}' for a, b in zip(df[0].values, df[1].values)])
    newdata = data.copy()
    print(newdata['edge'], df.index)
    newdata.index=newdata['edge']

    newdata.columns = pd.MultiIndex.from_arrays([np.array(newdata.columns), ['', 'exp'] + ['pmx'] * 6 + ['fep'] * 4 + ['exp'], [''] + ['kj/mol'] * 12], names=['forcefield', 'method', 'unit'])
    
    newdata.loc[:,('ligandA', '', '')] = df[0].apply(lambda x: str(int(float(x))) if str(x).endswith('.0') else x).astype(str)
    newdata.loc[:,('ligandB', '', '')] = df[1].apply(lambda x: str(int(float(x))) if str(x).endswith('.0') else x).astype(str)
    newdata = newdata.drop(columns=('edge', '', ''))
    newdata.sort_index(axis=1, level=1, inplace=True, sort_remaining=False)
    newdata.index=[f'{target}_edge_{x[("ligandA", "", "")]}_{x[("ligandB", "", "")]}' for i, x in newdata.iterrows()]
    
    return newdata
getGapsysResults('cmet')

/tmp/tmp25a3ix6h/protLig_benchmark/ddg_data/cmet.dat
0          CHEMBL3402744_300_4_CHEMBL3402745_200_5
1         CHEMBL3402747_3400_7_CHEMBL3402745_200_5
2        CHEMBL3402747_3400_7_CHEMBL3402748_5300_8
3       CHEMBL3402747_3400_7_CHEMBL3402751_2100_11
4       CHEMBL3402748_5300_8_CHEMBL3402751_2100_11
5         CHEMBL3402749_500_9_CHEMBL3402747_3400_7
6         CHEMBL3402749_500_9_CHEMBL3402748_5300_8
7        CHEMBL3402749_500_9_CHEMBL3402751_2100_11
8          CHEMBL3402749_500_9_CHEMBL3402754_40_14
9        CHEMBL3402750_400_10_CHEMBL3402747_3400_7
10       CHEMBL3402750_400_10_CHEMBL3402748_5300_8
11        CHEMBL3402750_400_10_CHEMBL3402749_500_9
12      CHEMBL3402752_30000_12_CHEMBL3402744_300_4
13     CHEMBL3402752_30000_12_CHEMBL3402747_3400_7
14     CHEMBL3402752_30000_12_CHEMBL3402748_5300_8
15      CHEMBL3402752_30000_12_CHEMBL3402749_500_9
16      CHEMBL3402752_30000_12_CHEMBL3402754_40_14
17    CHEMBL3402752_30000_12_CHEMBL3402755_4200_15
18       CHEMBL3402753_200_13

forcefield,ligandA,ligandB,exp,dexp,fep5,dfep5,fep1,dfep1,gaff,dgaff,cgenff,dcgenff,cons,dcons
method,Unnamed: 1_level_1,Unnamed: 2_level_1,exp,exp,fep,fep,fep,fep,pmx,pmx,pmx,pmx,pmx,pmx
unit,Unnamed: 1_level_2,Unnamed: 2_level_2,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol
cmet_edge_CHEMBL3402744_300_4_CHEMBL3402745_200_5,CHEMBL3402744_300_4,CHEMBL3402745_200_5,-1.01,0.0,-3.24,0.27,-3.58,0.68,-3.43,0.4,-2.34,0.41,-2.88,0.56
cmet_edge_nan_nan,,,-7.02,0.0,-6.33,0.49,-5.01,0.89,-6.85,2.33,-18.11,1.66,-12.48,5.62
cmet_edge_nan_nan,,,1.1,0.0,6.19,0.4,4.97,0.39,3.86,0.91,-6.37,1.36,-1.25,5.11
cmet_edge_CHEMBL3402747_3400_7_CHEMBL3402751_2100_11,CHEMBL3402747_3400_7,CHEMBL3402751_2100_11,-1.19,0.0,-2.4,0.64,-5.43,0.41,-1.06,0.83,-0.73,1.16,-0.9,0.57
cmet_edge_CHEMBL3402748_5300_8_CHEMBL3402751_2100_11,CHEMBL3402748_5300_8,CHEMBL3402751_2100_11,-2.29,0.0,-9.22,0.99,-10.0,0.46,-4.83,0.22,2.57,0.59,-1.13,3.69
cmet_edge_nan_nan,,,4.75,0.0,4.97,0.96,5.08,1.53,1.94,0.6,7.3,1.19,4.62,2.65
cmet_edge_CHEMBL3402749_500_9_CHEMBL3402748_5300_8,CHEMBL3402749_500_9,CHEMBL3402748_5300_8,5.85,0.0,11.39,0.75,10.78,0.82,19.71,0.71,9.58,2.21,14.64,5.06
cmet_edge_nan_nan,,,3.56,0.0,2.26,0.76,1.19,1.11,2.82,0.7,6.59,1.83,4.7,1.94
cmet_edge_nan_nan,,,-6.26,0.0,-9.97,0.33,-11.73,0.88,-6.58,0.95,-5.82,1.18,-6.2,0.65
cmet_edge_nan_nan,,,5.31,0.0,8.35,0.22,10.47,0.87,5.81,0.25,4.14,1.03,4.98,0.85


In [21]:
author = "gapsys"
for target in targets.target_dict:
    original_df = getGapsysResults(target)
    if original_df is None:
        continue
    os.makedirs(os.path.join(path, targets.get_target_dir(target), results_dir), exist_ok=True)
    for forcefield in ['fep5', 'fep1', 'gaff', 'cgenff']:
        df = original_df.loc[:, np.in1d(original_df.columns.get_level_values(0), ['ligandA', 'ligandB', f'{forcefield}', f'd{forcefield}'])].copy()
        software = list(df.columns.get_level_values(1))[list(df.columns.get_level_values(0)).index(f'{forcefield}')]
        unit = list(df.columns.get_level_values(2))[list(df.columns.get_level_values(0)).index(f'{forcefield}')]
        if unit == 'kj/mol':
            unit = 'kilojoules / mole'
        df.columns = df.columns.get_level_values(0)
        df.rename(columns={0: 'ligandA', 1: 'ligandB', f'{forcefield}': 'DDG', f'd{forcefield}': 'dDDG'}, inplace=True)
        df['unit'] = unit
        if forcefield.startswith('fep'):
            forcefield = f'opls3e_{forcefield[-1]}'
        with open(os.path.join(path, targets.get_target_dir(target), results_dir, f'{target}_{software}_{forcefield}_{author}.yaml'), 'w') as file:
            yaml.dump(df.T.to_dict(), file)
    
    forcefield = 'exp'
    df = original_df.loc[:, np.in1d(original_df.columns.get_level_values(0), ['ligandA', 'ligandB', f'{forcefield}', f'd{forcefield}'])].copy()
    software = list(df.columns.get_level_values(1))[list(df.columns.get_level_values(0)).index(f'{forcefield}')]
    unit = list(df.columns.get_level_values(2))[list(df.columns.get_level_values(0)).index(f'{forcefield}')]
    if unit == 'kj/mol':
        unit = 'kilojoules / mole'
    df.columns = df.columns.get_level_values(0)
    df.rename(columns={0: 'ligandA', 1: 'ligandB', f'{forcefield}': 'DDG', f'd{forcefield}': 'dDDG'}, inplace=True)
    df['unit'] = unit
    with open(os.path.join(path, targets.get_target_dir(target), results_dir, f'{target}_experiment_{author}.yaml'), 'w') as file:
        yaml.dump(df.T.to_dict(), file)

/tmp/tmp25a3ix6h/protLig_benchmark/ddg_data/jnk1.dat
0     17124_18631
1     17124_18634
2     18626_18624
3     18626_18625
4     18626_18627
5     18626_18628
6     18626_18630
7     18626_18632
8     18626_18634
9     18626_18658
10    18626_18659
11    18626_18660
12    18627_18630
13    18628_18624
14    18629_18627
15    18631_18624
16    18631_18652
17    18631_18660
18    18632_18624
19    18633_18624
20    18634_18637
21    18635_18624
22    18635_18625
23    18636_18624
24    18636_18625
25    18637_18631
26    18638_18634
27    18638_18658
28    18639_18634
29    18639_18658
30    18659_18634
Name: edge, dtype: object Index(['17124_18634', '18626_18624', '18636_18625', '18632_18624',
       '18635_18625', '18626_18658', '18639_18658', '18626_18625',
       '18638_18658', '18628_18624', '18631_18660', '18638_18634',
       '18626_18632', '18626_18630', '18631_18624', '18629_18627',
       '18634_18637', '18626_18627', '18631_18652', '18637_18631',
       '18626_18634', '18633

0      02_ligOMe_05_ligOEt
1     03_ligNHMe_02_ligOMe
2    03_ligNHMe_04_ligNMe2
3     04_ligNMe2_08_ligNH2
4        06_ligPyr_01_ligF
5         07_ligOH_01_ligF
6       07_ligOH_02_ligOMe
Name: edge, dtype: object Index(['02_ligOMe_05_ligOEt', '07_ligOH_02_ligOMe', '07_ligOH_01_ligF',
       '03_ligNHMe_02_ligOMe', '03_ligNHMe_04_ligNMe2', '04_ligNMe2_08_ligNH2',
       '06_ligPyr_01_ligF'],
      dtype='object')
/tmp/tmp25a3ix6h/protLig_benchmark/ddg_data/cdk2.dat
0       17_1h1q
1         17_21
2         17_22
3     1h1r_1oi9
4       1h1r_21
5     1h1s_1oiy
6       1h1s_26
7       1oi9_20
8     1oiu_1h1q
9       1oiu_26
10    1oiy_1h1q
11    1oiy_1oi9
12      1oiy_29
13      1oiy_31
14      1oiy_32
15      20_1h1q
16      22_1h1r
17      26_1h1q
18      26_1oi9
19        28_26
20        28_31
21        29_26
22        30_26
23        30_31
24        31_32
Name: edge, dtype: object Index(['22_1h1r', '1h1s_1oiy', '1oiu_26', '26_1h1q', '17_1h1q', '1oiy_1oi9',
       '29_26', '1h1s_26',



0     26_44
1     26_57
2     26_64
3     27_23
4     27_45
      ...  
66    67_58
67    67_61
68    67_63
69    68_23
70    68_45
Name: edge, Length: 71, dtype: object Index(['50_60', '56_35', '65_60', '26_57', '58_60', '62_45', '60_36', '30_27',
       '33_27', '43_27', '67_58', '67_32', '30_40', '38_60', '41_35', '54_23',
       '56_60', '66_42', '29_40', '26_44', '49_35', '29_35', '42_51', '39_32',
       '35_37', '28_35', '35_53', '67_63', '27_45', '41_32', '67_53', '35_33',
       '27_46', '66_23', '67_61', '57_23', '30_35', '61_60', '67_31', '32_46',
       '35_60', '31_35', '62_26', '35_36', '26_64', '38_35', '35_34', '29_27',
       '48_27', '68_45', '63_60', '54_42', '44_23', '28_27', '67_27', '52_60',
       '27_23', '49_67', '28_47', '67_52', '30_48', '67_35', '32_34', '65_67',
       '67_50', '35_39', '43_47', '67_37', '42_64', '51_45', '68_23'],
      dtype='object')
/tmp/tmp25a3ix6h/protLig_benchmark/ddg_data/bace.dat
0     CAT-13a_CAT-13m
1     CAT-13a_CAT-17g
2     CA

# Compare experimental values stored in repository with Gapsys exp. data

In [22]:
for target in targets.target_dict:
    author = "hahn"
    software = "experiment"
    file_name = os.path.join(path, targets.get_target_dir(target), results_dir, 
                                   f'{target}_{software}_{author}.yaml'
                       )
    if os.path.exists(file_name):
        with open(file_name, 'r') as file:
            data1 = yaml.safe_load(file)
    else:
        print(f"File {file_name} for target {target} not available")
        continue
        
    author = "gapsys"
    software = "experiment"
    file_name = os.path.join(path, targets.get_target_dir(target), results_dir, 
                                   f'{target}_{software}_{author}.yaml'
                       )
    if os.path.exists(file_name):
        with open(file_name, 'r') as file:
            data2 = yaml.safe_load(file)
    else:
        print(f"File {file_name} for target {target} not available")
        continue
        
    for e, edata in data1.items():
        if e in data2:
            v1 = unit_registry.Quantity(edata['DDG'], edata['unit'])
            v2 = unit_registry.Quantity(data2[e]['DDG'], data2[e]['unit'])
            if not np.isclose(v1.to('kilocalories / mole').magnitude, 
                              v2.to('kilocalories / mole').magnitude, 
                              atol=.05,
                              equal_nan=False):
                print(target, e, v1.to('kilocalories / mole'), v2.to('kilocalories / mole'))
        else:
            print(target, e)

cmet cmet_edge_CHEMBL3402741_400_CHEMBL3402756_2.7
cmet cmet_edge_CHEMBL3402741_400_CHEMBL3402763_90
cmet cmet_edge_CHEMBL3402741_400_CHEMBL3402764_90
cmet cmet_edge_CHEMBL3402742_23_CHEMBL3402756_2.7
cmet cmet_edge_CHEMBL3402742_23_CHEMBL3402763_90
cmet cmet_edge_CHEMBL3402743_42_CHEMBL3402742_23
cmet cmet_edge_CHEMBL3402743_42_CHEMBL3402756_2.7
cmet cmet_edge_CHEMBL3402743_42_CHEMBL3402758_10
cmet cmet_edge_CHEMBL3402743_42_CHEMBL3402760_1
cmet cmet_edge_CHEMBL3402743_42_CHEMBL3402762_1
cmet cmet_edge_CHEMBL3402743_42_CHEMBL3402763_90
cmet cmet_edge_CHEMBL3402743_42_CHEMBL3402764_90
cmet cmet_edge_CHEMBL3402744_300_4_CHEMBL3402752_30000_12
cmet cmet_edge_CHEMBL3402744_300_4_CHEMBL3402756_2.7
cmet cmet_edge_CHEMBL3402744_300_4_CHEMBL3402757_6.5
cmet cmet_edge_CHEMBL3402745_200_5_CHEMBL3402742_23
cmet cmet_edge_CHEMBL3402745_200_5_CHEMBL3402743_42
cmet cmet_edge_CHEMBL3402745_200_5_CHEMBL3402748_5300_8
cmet cmet_edge_CHEMBL3402745_200_5_CHEMBL3402752_30000_12
cmet cmet_edge_CHEMBL34027

# Get results from Wang et. al., JACS 2015
Input file taken from https://pubs.acs.org/doi/suppl/10.1021/ja512751q/suppl_file/ja512751q_si_003.xlsx (retrieval 2020-09-21) and converted to csv file

In [18]:
def getWangResults(target):    
    file_name = f'{path}/../00_data/input/ja512751q_si_003.csv'
    if not os.path.exists(file_name):
        print(f'File {file_name} does not exist.')        
        return None
    data = pd.read_csv(file_name, sep=',')
    data['system'] = data['system'].apply(lambda x: str(x).lower())
    indeces = data.loc[data['system']!='nan', :].index
    data_per_target = [data.loc[indeces[i]:indeces[i+1]-1,:] for i in range(indeces.shape[0]-1)]
    data_per_target = {d['system'].iloc[0]: d for d in data_per_target}
    for t, d in data_per_target.items():
        d.index = [f'{target}_edge_{x["Ligand1"]}_{x["Ligand2"]}' for i, x in d.iterrows()]
        d = d[['Ligand1', 'Ligand2', 'bennett_ddG', 'bennett_error']]
        d = d.rename(columns={'Ligand1': 'LigandA',
                             'Ligand2': 'LigandB',
                             'bennett_ddG': 'DDG',
                             'bennett_error': 'dDDG'})
        d['unit'] = 'kilocalories / mole'
        data_per_target[t] = d
    if target in data_per_target:
        return data_per_target[target]
    else:
        return None
getWangResults('jnk1').head()

Unnamed: 0,LigandA,LigandB,DDG,dDDG,unit
jnk1_edge_17124-1_18634-1,17124-1,18634-1,0.47,0.08,kilocalories / mole
jnk1_edge_18626-1_18624-1,18626-1,18624-1,0.76,0.08,kilocalories / mole
jnk1_edge_18636-1_18625-1,18636-1,18625-1,-0.3,0.09,kilocalories / mole
jnk1_edge_18632-1_18624-1,18632-1,18624-1,0.6,0.09,kilocalories / mole
jnk1_edge_18635-1_18625-1,18635-1,18625-1,0.97,0.07,kilocalories / mole


In [19]:
author = 'wang'
software = 'fep+'
forcefield = 'opls2.1'

In [20]:
for target in targets.target_dict:
    df = getWangResults(target)
    if df is None:
        continue
    os.makedirs(os.path.join(path, targets.get_target_dir(target), results_dir), exist_ok=True)
    with open(os.path.join(path, targets.get_target_dir(target), results_dir, f'{target}_{software}_{forcefield}_{author}.yaml'), 'w') as file:
        yaml.dump(df.T.to_dict(), file)

# Gather fep-benchmark data (Schindler, Merck KGaA)

In [21]:
# Create temporary dir
temp_directory = tempfile.mkdtemp()
repo = git.Repo.clone_from('https://github.com/MCompChem/fep-benchmark', temp_directory, branch='v1.0')

In [22]:
# dict to match names
# <name in Schindler et al.> : <name in protein ligand benchmark>
cmet_dict = {'CHEMBL3402753_200': 'CHEMBL3402753_200_13',
 'CHEMBL3402759_5.7': 'CHEMBL3402759_5.7',
 'CHEMBL3402747_3400': 'CHEMBL3402747_3400_7',
 'CHEMBL3402744_300': 'CHEMBL3402744_300_4',
 'CHEMBL3402745_200': 'CHEMBL3402745_200_5',
 'CHEMBL3402761_1': 'CHEMBL3402761_1_21',
 'CHEMBL3402750_400': 'CHEMBL3402750_400_10',
 'CHEMBL3402743_42': 'CHEMBL3402743_42',
 'CHEMBL3402752_30000': 'CHEMBL3402752_30000_12',
 'CHEMBL3402755_4200': 'CHEMBL3402755_4200_15',
 'CHEMBL3402758_10': 'CHEMBL3402758_10',
 'CHEMBL3402749_500': 'CHEMBL3402749_500_9',
 'CHEMBL3402757_6.5': 'CHEMBL3402757_6.5',
 'CHEMBL3402765_11-charged-pKa-8.1': 'CHEMBL3402765_11-charged-pKa-8.1',
 'CHEMBL3402762_1': 'CHEMBL3402762_1',
 'CHEMBL3402742_23': 'CHEMBL3402742_23',
 'CHEMBL3402754_40': 'CHEMBL3402754_40_14',
 'CHEMBL3402748_5300': 'CHEMBL3402748_5300_8',
 'CHEMBL3402741_400': 'CHEMBL3402741_400',
 'CHEMBL3402763_90': 'CHEMBL3402763_90',
 'CHEMBL3402764_90': 'CHEMBL3402764_90',
 'CHEMBL3402751_2100': 'CHEMBL3402751_2100_11',
 'CHEMBL3402756_2.7': 'CHEMBL3402756_2.7',
 'CHEMBL3402760_1': 'CHEMBL3402760_1'}
shp2_dict = {'SHP099-1/Example 7': 'SHP099-1',
 'Example 22': 'E22',
 'Example 29': 'E29',
 '11': '11',
 '4': '4',
 'Example 2': 'E2',
 'Example 14': 'E14',
 'Example 26': 'E26',
 'SHP836-2': 'SHP836-2',
 'Example 6': 'E6',
 'Example 1': 'E1',
 '3': '3',
 'Example 9': 'E9',
 'Example 8': 'E8',
 '7': '7',
 'Example 24': 'E24',
 'Example 28': 'E28',
 '10': '10',
 'Example 25': 'E25',
 'Example 27': 'E27',
 '6': '6',
 'Example 30': 'E30',
 'Example 5': 'E5',
 'Example 4': 'E4',
 'Example 23': 'E23',
 'Example 3': 'E3'}

In [23]:
def getSchindlerResults(target):
    file_name = f'{temp_directory}/{target}/results_edges_5ns.csv'
    if not os.path.exists(file_name):
        print(f'File {file_name} does not exist.')
        return None
    fepbenchmark5 = pd.read_csv(file_name)

    fepbenchmark5['ligandA'] = fepbenchmark5['Ligand1'].apply(lambda x: str(int(float(x))) if str(x).endswith('.0') else x)
    fepbenchmark5['ligandB'] = fepbenchmark5['Ligand2'].apply(lambda x: str(int(float(x))) if str(x).endswith('.0') else x)
    if target == 'cmet':
        for i, row in fepbenchmark5.iterrows():
            for lig in ['ligandA', 'ligandB']:
                if row[lig] in cmet_dict:
                    fepbenchmark5.loc[i, lig] = cmet_dict[row[lig]]    
    if target == "shp2":
        file_name = f'{temp_directory}/{target}/results_edges_20ns.csv'
        if not os.path.exists(file_name):
            print(f'File {file_name} does not exist.')
            return None
        fepbenchmark20= pd.read_csv(file_name)

        fepbenchmark20['ligandA'] = fepbenchmark20['Ligand1'].apply(lambda x: str(int(float(x))) if str(x).endswith('.0') else x)
        fepbenchmark20['ligandB'] = fepbenchmark20['Ligand2'].apply(lambda x: str(int(float(x))) if str(x).endswith('.0') else x)
        llist = []
        for i, row in fepbenchmark5.iterrows():
            for lig in ['ligandA', 'ligandB']:
                if row[lig] in shp2_dict:
                    fepbenchmark5.loc[i, lig] = shp2_dict[row[lig]]
            if pd.isna(row["FEP"]):
                nrow = fepbenchmark20.loc[i,:]
                assert row['Ligand1'] == nrow['Ligand1']
                assert row['Ligand2'] == nrow['Ligand2']
                fepbenchmark5.loc[i,"FEP"] = nrow["FEP"]
                fepbenchmark5.loc[i,"FEP Error"] = nrow["FEP Error"]
                llist.append(f'edge_{fepbenchmark5.loc[i,"ligandA"]}_{fepbenchmark5.loc[i,"ligandB"]}')
        print(f"{target} 20 ns runs",  llist)
    fepbenchmark5 = fepbenchmark5.loc[:, ['ligandA', 'ligandB', 'FEP', 'FEP Error']]
    fepbenchmark5 = fepbenchmark5.rename(columns={
                             'FEP': 'DDG',
                             'FEP Error': 'dDDG'})
    fepbenchmark5.index = [f'{target}_edge_{x["ligandA"]}_{x["ligandB"]}' for i, x in fepbenchmark5.iterrows()]
    fepbenchmark5['unit'] = 'kilocalories / mole'
    return fepbenchmark5
getSchindlerResults('hif2a').head()

Unnamed: 0,ligandA,ligandB,DDG,dDDG,unit
hif2a_edge_338_165,338,165,-0.44,0.25,kilocalories / mole
hif2a_edge_338_215,338,215,-0.76,0.13,kilocalories / mole
hif2a_edge_338_163,338,163,-1.02,0.1,kilocalories / mole
hif2a_edge_43_235,43,235,-0.2,0.1,kilocalories / mole
hif2a_edge_43_54,43,54,0.9,0.08,kilocalories / mole


In [24]:
author = 'schindler'
software = 'fep+'
forcefield = 'opls3e'

In [25]:
for target in targets.target_dict:
    df = getSchindlerResults(target)
    if df is None:
        continue
    os.makedirs(os.path.join(path, targets.get_target_dir(target), results_dir), exist_ok=True)
    with open(os.path.join(path, 
                           targets.get_target_dir(target), 
                           results_dir, 
                           f'{target}_{software}_{forcefield}_{author}.yaml'), 
              'w') as file:
        yaml.dump(df.T.to_dict(), file)

File /tmp/tmpgr9o8r9s/jnk1/results_edges_5ns.csv does not exist.
File /tmp/tmpgr9o8r9s/pde2/results_edges_5ns.csv does not exist.
File /tmp/tmpgr9o8r9s/thrombin/results_edges_5ns.csv does not exist.
File /tmp/tmpgr9o8r9s/p38/results_edges_5ns.csv does not exist.
File /tmp/tmpgr9o8r9s/ptp1b/results_edges_5ns.csv does not exist.
File /tmp/tmpgr9o8r9s/galectin/results_edges_5ns.csv does not exist.
File /tmp/tmpgr9o8r9s/cdk2/results_edges_5ns.csv does not exist.
File /tmp/tmpgr9o8r9s/mcl1/results_edges_5ns.csv does not exist.
File /tmp/tmpgr9o8r9s/bace/results_edges_5ns.csv does not exist.
File /tmp/tmpgr9o8r9s/bace_hunt/results_edges_5ns.csv does not exist.
File /tmp/tmpgr9o8r9s/bace_p2/results_edges_5ns.csv does not exist.
File /tmp/tmpgr9o8r9s/tyk2/results_edges_5ns.csv does not exist.
File /tmp/tmpgr9o8r9s/ros1/results_edges_5ns.csv does not exist.
File /tmp/tmpgr9o8r9s/pde10/results_edges_5ns.csv does not exist.
shp2 20 ns runs ['edge_SHP099-1_E22', 'edge_SHP099-1_E2', 'edge_SHP099-1_

# Create a simple, dump null model (all activities set to 0)

In [26]:
def getNullModell(target):
    edg = edges.EdgeSet(target)
    df = edg.get_dataframe(columns=[0,1])
    df = df.rename(columns={0: "ligandA", 1: "ligandB"})
    df.index = pd.Series([f'{target}_edge_' + str(lig1) + '_' + str(lig2) for lig1, lig2 in zip(df["ligandA"].values, df["ligandB"].values)])

    
    df['ligandA']= df['ligandA'].astype(str)
    df['ligandB']= df['ligandB'].astype(str)
    df['DDG'] = 0.0
    df['dDDG'] = 0.1
    df['unit'] = 'kilocalories / mole'
    return df
getNullModell('cmet').head()

Unnamed: 0,ligandA,ligandB,DDG,dDDG,unit
cmet_edge_CHEMBL3402741_400_CHEMBL3402756_2.7,CHEMBL3402741_400,CHEMBL3402756_2.7,0.0,0.1,kilocalories / mole
cmet_edge_CHEMBL3402741_400_CHEMBL3402763_90,CHEMBL3402741_400,CHEMBL3402763_90,0.0,0.1,kilocalories / mole
cmet_edge_CHEMBL3402741_400_CHEMBL3402764_90,CHEMBL3402741_400,CHEMBL3402764_90,0.0,0.1,kilocalories / mole
cmet_edge_CHEMBL3402742_23_CHEMBL3402756_2.7,CHEMBL3402742_23,CHEMBL3402756_2.7,0.0,0.1,kilocalories / mole
cmet_edge_CHEMBL3402742_23_CHEMBL3402763_90,CHEMBL3402742_23,CHEMBL3402763_90,0.0,0.1,kilocalories / mole


In [27]:
author = 'hahn'
software = 'null'
forcefield = 'null'

In [28]:
for target in targets.target_dict:
    df = getNullModell(target)
    if df is None:
        continue
    os.makedirs(os.path.join(path, targets.get_target_dir(target), results_dir), exist_ok=True)
    with open(os.path.join(path, targets.get_target_dir(target), results_dir, f'{target}_{software}_{forcefield}_{author}.yaml'), 'w') as file:
        yaml.dump(df.T.to_dict(), file)