# Gather data from various sources into the analysis repository

In [1]:
# imports
import os
import shutil
import numpy as np
import pandas as pd
import yaml
import git
import tempfile
import sys
sys.path.append(os.path.join(os.getcwd(), '..'))

import pint
unit_registry = pint.UnitRegistry()

from PLBenchmarks import targets, ligands, edges
from IPython.core.display import HTML

from tqdm.notebook import tqdm

import benchmarkpl
# First entry of the benchmarkpl package search path; every data and result
# location used in this notebook is built relative to it.
path = benchmarkpl.__path__[0]



# Set path of data directory

In [2]:
# PLBenchmarks reads its data from the benchmarkpl package directory
targets.set_data_dir(path)
# name of the per-target sub-directory that receives the result files
results_dir = '10_results'

# Number of targets, ligands and edges in the data set

In [3]:
# Print one row per target with its number of ligands and edges, followed by the totals.
nligs, nedgs = 0, 0
print(f'{"Target":10s} {"Num Ligs":>10s} {"Num Edges":>10s}')
print(33 * '-')
for target in tqdm(targets.target_dict):
    # Build each set only once per target instead of once for printing and
    # once more for summing.
    n_target_ligs = len(ligands.LigandSet(target))
    n_target_edgs = len(edges.EdgeSet(target))
    print(f'{target:10s} {n_target_ligs:10d} {n_target_edgs:10d}')
    nligs += n_target_ligs
    nedgs += n_target_edgs
print(33 * '-')
print(f'{"total":10s} {nligs:10d} {nedgs:10d}')

Target       Num Ligs  Num Edges
---------------------------------


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))

jnk1               21         31
thrombin           11         16
p38                34         56
ptp1b              23         49
cdk2               16         25
mcl1               42         71
bace               36         58
tyk2               16         24

---------------------------------
total             199        330


# Experimental values stored in repository
Retrieve the experimental values stored in the dataset

In [4]:
# function to retrieve exp. data from PLBenchmarks
def getExpResults(target):
    """Return the experimental values of all edges of ``target``.

    Parameters
    ----------
    target : str
        Target name, e.g. ``'tyk2'``.

    Returns
    -------
    pandas.DataFrame
        One row per edge, indexed by ``{target}_edge_{ligand A}_{ligand B}``.
        Columns ``0`` and ``1`` hold the two ligand names as strings,
        ``exp_DDG`` and ``exp_dDDG`` hold the ``.magnitude`` of the
        experimental value and of its error.
    """
    table = edges.EdgeSet(target).get_dataframe(
        columns=[0, 1, 'exp. DeltaG [kcal/mol]', 'exp. Error [kcal/mol]']
    )
    edge_labels = [
        f'{target}_edge_{first!s}_{second!s}'
        for first, second in zip(table[0].values, table[1].values)
    ]
    table.index = pd.Series(edge_labels)

    # keep only the plain number of each quantity (drops the unit)
    for plain_name, quantity_name in (('exp_DDG', 'exp. DeltaG [kcal/mol]'),
                                      ('exp_dDDG', 'exp. Error [kcal/mol]')):
        table[plain_name] = table[quantity_name].map(lambda quantity: quantity.magnitude)

    # restrict to the relevant columns and store ligand names as strings
    result = table.filter(items=[0, 1, 'exp_DDG', 'exp_dDDG'])
    for ligand_column in (0, 1):
        result[ligand_column] = result[ligand_column].astype(str)
    return result
getExpResults('tyk2').head()

Unnamed: 0,0,1,exp_DDG,exp_dDDG
tyk2_edge_jmc_23_ejm_55,jmc_23,ejm_55,2.52,0.0
tyk2_edge_ejm_44_ejm_55,ejm_44,ejm_55,-1.8,0.0
tyk2_edge_ejm_49_ejm_31,ejm_49,ejm_31,-1.81,0.0
tyk2_edge_ejm_31_ejm_46,ejm_31,ejm_46,-1.79,0.0
tyk2_edge_jmc_28_jmc_27,jmc_28,jmc_27,-0.3,0.0


In [5]:
# Labels of the experimental data set; the next cell combines them into the
# YAML file name {target}_{software}_{author}.yaml.
author = "hahn"
software = "experiment"

In [6]:
# Write the experimental values of every target to
# <target dir>/<results_dir>/{target}_{software}_{author}.yaml
for target in targets.target_dict:
    df = getExpResults(target)
    if df is None:
        continue
    # makedirs() also creates missing parent directories, so a single call
    # covers both the target directory and its results sub-directory.
    target_results_dir = os.path.join(path, targets.get_target_dir(target), results_dir)
    os.makedirs(target_results_dir, exist_ok=True)

    # Prepare the table BEFORE opening the file: open(..., 'w') truncates an
    # existing file, so a failure while preparing must not leave it empty.
    df = df.filter([0, 1, 'exp_DDG', 'exp_dDDG'])
    df = df.rename(columns={0: 'ligandA', 1: 'ligandB', 'exp_DDG': 'DDG', 'exp_dDDG': 'dDDG'})
    df['unit'] = 'kilocalories / mole'
    # one mapping per edge label: {column name: value}
    records = df.T.to_dict()

    with open(os.path.join(target_results_dir, f'{target}_{software}_{author}.yaml'), 'w') as file:
        yaml.dump(records, file)

# pmx calculations with openFF parameters (Hahn et al.)

In [7]:
# function to retrieve data from PLBenchmarks calculations, stored in 00_data/input
def getRawResults(target, forcefield='openff-1.0.0.offxml'):
    """Combine calculated and experimental values of all edges of ``target``.

    The calculated values are read from
    ``<path>/../00_data/input/{target}_{forcefield}.dat``.

    Parameters
    ----------
    target : str
        Target name, e.g. ``'tyk2'``.
    forcefield : str
        Force field label that forms the second part of the file name.

    Returns
    -------
    pandas.DataFrame or None
        One row per edge, indexed by ``{target}_edge_{ligand A}_{ligand B}``,
        with the ligand names (columns ``0`` and ``1``, as strings), the
        experimental values (``exp_DDG``, ``exp_dDDG``) and the calculated
        values (``calc_DDG``, ``calc_dDDG``, ``calc_dDDG(additional)``).
        ``None`` (after printing a message) if the input file does not exist.
    """
    file_path = os.path.join(path, '..', '00_data', 'input', f'{target}_{forcefield}.dat')
    if not os.path.exists(file_path):
        print(f'File {file_path} does not exist.')
        return None

    # calculated values: space separated, no header line, '#' starts a comment
    calculated = pd.read_csv(
        file_path,
        sep=' ',
        header=None,
        comment='#',
        skipinitialspace=True,
        names=['edge', 'calc DDG', 'calc dDDG', 'add dDDG'],
    )
    calculated.index = calculated['edge']

    # experimental values, temporarily labelled like the 'edge' column of the
    # result file so that the assignments below align row by row
    table = edges.EdgeSet(target).get_dataframe(
        columns=[0, 1, 'exp. DeltaG [kcal/mol]', 'exp. Error [kcal/mol]']
    )
    table.index = pd.Series(
        [f'edge_{first!s}_{second!s}' for first, second in zip(table[0].values, table[1].values)]
    )

    for new_name, file_name in (('calc_DDG', 'calc DDG'),
                                ('calc_dDDG', 'calc dDDG'),
                                ('calc_dDDG(additional)', 'add dDDG')):
        table[new_name] = calculated.loc[:, file_name]

    # keep only the plain number of each experimental quantity (drops the unit)
    for plain_name, quantity_name in (('exp_DDG', 'exp. DeltaG [kcal/mol]'),
                                      ('exp_dDDG', 'exp. Error [kcal/mol]')):
        table[plain_name] = table[quantity_name].map(lambda quantity: quantity.magnitude)

    # restrict to the relevant columns and store ligand names as strings
    table = table.filter(
        items=[0, 1, 'exp_DDG', 'exp_dDDG', 'calc_DDG', 'calc_dDDG', 'calc_dDDG(additional)']
    )
    for ligand_column in (0, 1):
        table[ligand_column] = table[ligand_column].astype(str)

    # final row labels carry the target name as prefix
    table.index = pd.Series(
        [f'{target}_edge_{first!s}_{second!s}' for first, second in zip(table[0].values, table[1].values)]
    )
    return table
getRawResults('tyk2').head()

Unnamed: 0,0,1,exp_DDG,exp_dDDG,calc_DDG,calc_dDDG,calc_dDDG(additional)
tyk2_edge_jmc_23_ejm_55,jmc_23,ejm_55,2.52,0.0,-0.33,0.19,
tyk2_edge_ejm_44_ejm_55,ejm_44,ejm_55,-1.8,0.0,-4.19,0.29,
tyk2_edge_ejm_49_ejm_31,ejm_49,ejm_31,-1.81,0.0,-0.87,0.12,
tyk2_edge_ejm_31_ejm_46,ejm_31,ejm_46,-1.79,0.0,-1.04,0.21,
tyk2_edge_jmc_28_jmc_27,jmc_28,jmc_27,-0.3,0.0,-0.69,0.05,


In [8]:
# Labels of the pmx/openFF data set; the next cell passes `forcefield` to
# getRawResults() and combines all three into the YAML file name
# {target}_{software}_{forcefield}_{author}.yaml.
author = "hahn"
software = "pmx"
forcefield = "openff-1.0.0.offxml"

In [9]:
# Write the calculated values of every target to
# <target dir>/<results_dir>/{target}_{software}_{forcefield}_{author}.yaml
for target in targets.target_dict:
    df = getRawResults(target, forcefield)
    if df is None:
        # no input file for this target / force field combination
        continue
    target_results_dir = os.path.join(path, targets.get_target_dir(target), results_dir)
    os.makedirs(target_results_dir, exist_ok=True)

    # Prepare the table BEFORE opening the file: open(..., 'w') truncates an
    # existing file, so a failure while preparing must not leave it empty.
    df = df.filter([0, 1, 'calc_DDG', 'calc_dDDG'])
    df = df.rename(columns={0: 'ligandA', 1: 'ligandB', 'calc_DDG': 'DDG', 'calc_dDDG': 'dDDG'})
    df['unit'] = 'kilocalories / mole'
    # one mapping per edge label: {column name: value}
    records = df.T.to_dict()

    with open(os.path.join(target_results_dir, f'{target}_{software}_{forcefield}_{author}.yaml'), 'w') as file:
        yaml.dump(records, file)

# Gather data from Gapsys et al.
The data are retrieved from https://github.com/deGrootLab/pmx

In [10]:
# Fresh temporary directory that receives the pmx checkout; getGapsysResults()
# reads protLig_benchmark/ddg_data/<target>.dat from it.
# NOTE(review): nothing in the visible notebook removes this directory again —
# consider deleting it (e.g. with shutil.rmtree) once the data has been gathered.
temp_directory = tempfile.mkdtemp()

# Clone the 'master' branch with depth=1 (shallow clone).
# NOTE(review): no commit or tag is pinned, so the retrieved data follows
# whatever 'master' points to at execution time.
git.Repo.clone_from('https://github.com/deGrootLab/pmx', temp_directory, branch='master', depth=1)

<git.repo.base.Repo '/tmp/tmpq1u45fh7/.git'>

In [12]:
def getGapsysResults(target):
    """Load the Gapsys et al. results of one target from the pmx checkout.

    Reads ``{temp_directory}/protLig_benchmark/ddg_data/{target}.dat`` and
    attaches the ligand names of the PLBenchmarks edge set of ``target``.

    Parameters
    ----------
    target : str
        Target name, e.g. ``'jnk1'``.

    Returns
    -------
    pandas.DataFrame or None
        One row per edge of the data file, indexed by
        ``{target}_edge_{ligandA}_{ligandB}``. The columns form a three-level
        MultiIndex with the levels ``forcefield`` (column name), ``method``
        (``''``, ``'exp'``, ``'pmx'`` or ``'fep'``) and ``unit`` (``''`` or
        ``'kj/mol'``). ``None`` (after printing a message) if the data file
        does not exist.
    """
    file_name = f'{temp_directory}/protLig_benchmark/ddg_data/{target}.dat'
    if not os.path.exists(file_name):
        print(f'File {file_name} does not exist.')
        return None
    # raw string: '\s' inside a normal string literal is an invalid escape sequence
    data = pd.read_csv(file_name, sep=r'\s+', header=None, comment='#',
                       names=['edge', 'exp', 'gaff', 'dgaff', 'cgenff', 'dcgenff', 'cons', 'dcons', 'fep5', 'dfep5', 'fep1', 'dfep1'])
    # the file carries no experimental error, so it is set to zero
    data['dexp'] = 0.0

    # Edge set of PLBenchmarks, labelled like the 'edge' column of the data file
    # so that ligand names can be assigned by row label below.
    df = edges.EdgeSet(target).get_dataframe()
    if target == 'jnk1':
        # for jnk1 only the part before the first '-' of a ligand name is used in the label
        df.index = pd.Series([f'{str(a).split("-")[0]}_{str(b).split("-")[0]}' for a, b in zip(df[0].values, df[1].values)])
    else:
        df.index = pd.Series([f'{a}_{b}' for a, b in zip(df[0].values, df[1].values)])

    newdata = data.copy()
    newdata.index = newdata['edge']
    # levels: column name / method that produced the column / unit label
    newdata.columns = pd.MultiIndex.from_arrays([np.array(newdata.columns), ['', 'exp'] + ['pmx'] * 6 + ['fep'] * 4 + ['exp'], [''] + ['kj/mol'] * 12], names=['forcefield', 'method', 'unit'])

    def _ligand_name(name):
        # names that look like floats ('23.0') are written as integers ('23')
        return str(int(float(name))) if str(name).endswith('.0') else name

    newdata.loc[:, ('ligandA', '', '')] = df[0].apply(_ligand_name).astype(str)
    newdata.loc[:, ('ligandB', '', '')] = df[1].apply(_ligand_name).astype(str)
    newdata = newdata.drop(columns=('edge', '', ''))
    # group the columns by method
    newdata.sort_index(axis=1, level=1, inplace=True, sort_remaining=False)
    newdata.index = [f'{target}_edge_{x[("ligandA", "", "")]}_{x[("ligandB", "", "")]}' for i, x in newdata.iterrows()]

    return newdata
getGapsysResults('jnk1').head()

forcefield,ligandA,ligandB,exp,dexp,fep5,dfep5,fep1,dfep1,gaff,dgaff,cgenff,dcgenff,cons,dcons
method,Unnamed: 1_level_1,Unnamed: 2_level_1,exp,exp,fep,fep,fep,fep,pmx,pmx,pmx,pmx,pmx,pmx
unit,Unnamed: 1_level_2,Unnamed: 2_level_2,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol
jnk1_edge_17124-1_18631-1,17124-1,18631-1,1.09,0.0,6.35,0.29,6.51,0.34,5.57,3.06,3.25,0.42,4.41,1.54
jnk1_edge_17124-1_18634-1,17124-1,18634-1,-1.34,0.0,2.44,0.18,1.56,0.57,2.09,0.67,1.05,0.61,1.57,0.58
jnk1_edge_18626-1_18624-1,18626-1,18624-1,1.59,0.0,4.49,0.17,4.25,0.43,4.71,0.38,0.48,0.14,2.6,2.11
jnk1_edge_18626-1_18625-1,18626-1,18625-1,3.22,0.0,6.05,0.14,5.62,0.48,2.96,0.47,1.99,1.75,2.47,0.82
jnk1_edge_18626-1_18627-1,18626-1,18627-1,1.63,0.0,1.66,0.34,1.98,0.3,1.81,0.32,0.66,0.36,1.23,0.58


In [13]:
author = "gapsys"


def _gapsys_subset(original_df, column):
    """Extract one value column (plus its error and the ligand names).

    Parameters
    ----------
    original_df : pandas.DataFrame
        Table returned by getGapsysResults() (three-level column index).
    column : str
        First-level name of the value column, e.g. ``'gaff'`` or ``'exp'``;
        the matching error column is ``'d' + column``.

    Returns
    -------
    (pandas.DataFrame, str)
        The flat table with the columns ``ligandA``, ``ligandB``, ``DDG``,
        ``dDDG`` and ``unit``, and the second-level ("method") label of
        ``column``.
    """
    level0 = original_df.columns.get_level_values(0)
    subset = original_df.loc[:, np.isin(level0, ['ligandA', 'ligandB', column, f'd{column}'])].copy()
    position = list(subset.columns.get_level_values(0)).index(column)
    method = subset.columns.get_level_values(1)[position]
    unit = subset.columns.get_level_values(2)[position]
    if unit == 'kj/mol':
        # spell the unit out in the YAML files
        unit = 'kilojoules / mole'
    # flatten the column index to the first level and use the common names
    subset.columns = subset.columns.get_level_values(0)
    subset = subset.rename(columns={column: 'DDG', f'd{column}': 'dDDG'})
    subset['unit'] = unit
    return subset, method


for target in targets.target_dict:
    original_df = getGapsysResults(target)
    if original_df is None:
        continue
    target_results_dir = os.path.join(path, targets.get_target_dir(target), results_dir)
    os.makedirs(target_results_dir, exist_ok=True)

    # calculated values: one file per force field
    for column in ['fep5', 'fep1', 'gaff', 'cgenff']:
        df, software = _gapsys_subset(original_df, column)
        # 'fep5' / 'fep1' are stored under the labels 'opls3e_5' / 'opls3e_1'
        forcefield = f'opls3e_{column[-1]}' if column.startswith('fep') else column
        with open(os.path.join(target_results_dir, f'{target}_{software}_{forcefield}_{author}.yaml'), 'w') as file:
            yaml.dump(df.T.to_dict(), file)

    # experimental values as listed in the same data file
    df, _ = _gapsys_subset(original_df, 'exp')
    with open(os.path.join(target_results_dir, f'{target}_experiment_{author}.yaml'), 'w') as file:
        yaml.dump(df.T.to_dict(), file)

# Compare experimental values stored in repository with Gapsys exp. data

In [14]:
# For every target, load the experimental values written for "hahn" and for
# "gapsys" and report the edges whose values differ by more than the tolerance.
software = "experiment"
for target in targets.target_dict:
    target_results_dir = os.path.join(path, targets.get_target_dir(target), results_dir)

    experimental = {}
    for author in ("hahn", "gapsys"):
        file_name = os.path.join(target_results_dir, f'{target}_{software}_{author}.yaml')
        if not os.path.exists(file_name):
            print(f"File {file_name} for target {target} not available")
            break
        with open(file_name, 'r') as file:
            experimental[author] = yaml.safe_load(file)
    else:
        # only reached when both files were loaded (no break above)
        data1, data2 = experimental["hahn"], experimental["gapsys"]
        for e, edata in data1.items():
            if e not in data2:
                # edge is present in the first data set only
                print(target, e)
                continue
            v1 = unit_registry.Quantity(edata['DDG'], edata['unit']).to('kilocalories / mole')
            v2 = unit_registry.Quantity(data2[e]['DDG'], data2[e]['unit']).to('kilocalories / mole')
            values_agree = np.isclose(v1.magnitude, v2.magnitude, atol=.05, equal_nan=False)
            if not values_agree:
                print(target, e, v1, v2)

bace bace_edge_CAT-24_CAT-17e 1.74 kilocalorie / mole 1.32887189292543 kilocalorie / mole
bace bace_edge_CAT-24_CAT-17i 2.29 kilocalorie / mole 1.8809751434034416 kilocalorie / mole


# Get results from Wang et al., JACS 2015
Input file taken from https://pubs.acs.org/doi/suppl/10.1021/ja512751q/suppl_file/ja512751q_si_003.xlsx (retrieved 2020-09-21) and converted to a CSV file

In [15]:
def getWangResults(target):
    """Return the results of Wang et al. (JACS 2015) for one target.

    The values are read from ``<path>/../00_data/input/ja512751q_si_003.csv``.
    In that table only the first row of each system carries a name in the
    ``system`` column; all following rows up to the next name belong to the
    same system.

    Parameters
    ----------
    target : str
        Target name; compared with the lower-cased system names of the table.

    Returns
    -------
    pandas.DataFrame or None
        One row per edge, indexed by ``{target}_edge_{Ligand1}_{Ligand2}``,
        with the columns ``LigandA``, ``LigandB``, ``DDG``, ``dDDG`` and
        ``unit``. ``None`` if the input file does not exist (a message is
        printed) or if the table contains no system named ``target``.
    """
    file_name = f'{path}/../00_data/input/ja512751q_si_003.csv'
    if not os.path.exists(file_name):
        print(f'File {file_name} does not exist.')
        return None
    data = pd.read_csv(file_name, sep=',')
    # empty cells become the string 'nan'
    data['system'] = data['system'].apply(lambda x: str(x).lower())

    # Row positions at which a new system starts. Each block ends right before
    # the next start; the LAST block runs to the end of the table (it was
    # dropped when only pairs of consecutive starts were used).
    starts = [position for position, name in enumerate(data['system']) if name != 'nan']
    stops = starts[1:] + [len(data)]
    blocks = {data['system'].iloc[start]: (start, stop) for start, stop in zip(starts, stops)}
    if target not in blocks:
        return None

    start, stop = blocks[target]
    block = data.iloc[start:stop]
    labels = [f'{target}_edge_{lig1}_{lig2}' for lig1, lig2 in zip(block['Ligand1'], block['Ligand2'])]

    # NOTE(review): the other result writers in this notebook name the ligand
    # columns 'ligandA' / 'ligandB' (lower-case 'l'); the capitalised names are
    # kept here so that the returned columns do not change — confirm which
    # spelling the consumers of the YAML files expect.
    d = block[['Ligand1', 'Ligand2', 'bennett_ddG', 'bennett_error']].rename(
        columns={'Ligand1': 'LigandA',
                 'Ligand2': 'LigandB',
                 'bennett_ddG': 'DDG',
                 'bennett_error': 'dDDG'})
    d.index = labels
    d['unit'] = 'kilocalories / mole'
    return d
getWangResults('jnk1').head()

Unnamed: 0,LigandA,LigandB,DDG,dDDG,unit
jnk1_edge_17124-1_18634-1,17124-1,18634-1,0.47,0.08,kilocalories / mole
jnk1_edge_18626-1_18624-1,18626-1,18624-1,0.76,0.08,kilocalories / mole
jnk1_edge_18636-1_18625-1,18636-1,18625-1,-0.3,0.09,kilocalories / mole
jnk1_edge_18632-1_18624-1,18632-1,18624-1,0.6,0.09,kilocalories / mole
jnk1_edge_18635-1_18625-1,18635-1,18625-1,0.97,0.07,kilocalories / mole


In [16]:
# Labels of the Wang et al. data set; the next cell combines them into the
# YAML file name {target}_{software}_{forcefield}_{author}.yaml.
author = 'wang'
software = 'fep+'
forcefield = 'opls2.1'

In [17]:
# Write the Wang et al. values of every target that occurs in their table to
# <target dir>/<results_dir>/{target}_{software}_{forcefield}_{author}.yaml
for target in targets.target_dict:
    df = getWangResults(target)
    if df is not None:
        target_results_dir = os.path.join(path, targets.get_target_dir(target), results_dir)
        os.makedirs(target_results_dir, exist_ok=True)
        yaml_name = f'{target}_{software}_{forcefield}_{author}.yaml'
        with open(os.path.join(target_results_dir, yaml_name), 'w') as file:
            # one mapping per edge label: {column name: value}
            yaml.dump(df.T.to_dict(), file)