# Gather data from various sources into the analysis repository

In [1]:
# imports
import os
import shutil
import numpy as np
import pandas as pd
import yaml
import git
import tempfile
import sys
sys.path.append(os.path.join(os.getcwd(), '..'))

import pint
unit_registry = pint.UnitRegistry()

from PLBenchmarks import targets, ligands, edges
from IPython.core.display import HTML

from tqdm.notebook import tqdm

import benchmarkpl
path = benchmarkpl.__path__[0]



_ColormakerRegistry()

# Set path of data directory

In [2]:
# Name of the sub-directory in which the results for each target are stored.
results_dir = '10_results'
# Use the directory of the imported benchmarkpl package as the PLBenchmarks data directory.
targets.set_data_dir(os.path.join(path))

# Number of targets, ligands and edges in the data set

In [3]:
# Print a per-target table of ligand and edge counts, followed by the totals.
nligs, nedgs = 0, 0
print(f'{"Target":10s} {"Num Ligs":>10s} {"Num Edges":>10s}')
print(33 * '-')
for target in tqdm(targets.target_dict):
    # Construct each set only once per target and reuse the counts for both
    # the table row and the running totals.
    n_target_ligs = len(ligands.LigandSet(target))
    n_target_edgs = len(edges.EdgeSet(target))
    print(f'{target:10s} {n_target_ligs:10d} {n_target_edgs:10d}')
    nligs += n_target_ligs
    nedgs += n_target_edgs
print(33 * '-')
print(f'{"total":10s} {nligs:10d} {nedgs:10d}')

Target       Num Ligs  Num Edges
---------------------------------


  0%|          | 0/22 [00:00<?, ?it/s]

jnk1               21         31
pde2               21         34
thrombin           11         16
p38                34         56
ptp1b              23         49
galectin            8          7
cdk2               16         25
cmet               24         57
mcl1               42         71
bace               36         58
bace_hunt          32         60
bace_p2            12         26
tyk2               16         24
ros1               28         61
eg5                28         65
cdk8               33         54
hif2a              42         92
pfkfb3             40         66
pde10              35         59
shp2               26         56
syk                44        101
tnks2              27         60
---------------------------------
total             599       1128


# Gather data from Gapsys et al.

In [4]:
def read_neq_results(fname):
    """Extract the BAR summary numbers from a results file.

    Every line of ``fname`` is checked, in order, for one of the markers
    ``'BAR: dG'``, ``'BAR: Std Err (bootstrap)'``, ``'BAR: Std Err (analytical)'``
    and ``'BAR: Conv'``. For the first marker found on a line, one
    whitespace-separated token of that line is converted to ``float`` and
    appended to the result: the second-to-last token for the first three
    markers, the last token for ``'BAR: Conv'``.

    Parameters
    ----------
    fname : str
        Path of the results file.

    Returns
    -------
    list of float
        The extracted values in the order in which their lines appear in the
        file. An empty list (after printing a message) if the file does not
        exist.

    Raises
    ------
    ValueError
        If a selected token cannot be converted to ``float``.
    IndexError
        If a matching line has too few tokens.
    """
    if not os.path.exists(fname):
        print('File does not exist')
        return []

    # (marker substring, index of the numeric token in the split line).
    # The order matters: the first marker contained in a line wins.
    markers = (
        ('BAR: dG', -2),
        ('BAR: Std Err (bootstrap)', -2),
        ('BAR: Std Err (analytical)', -2),
        ('BAR: Conv', -1),
    )

    out = []
    # The context manager closes the file even if parsing raises.
    with open(fname, 'r') as fp:
        for line in fp:
            tokens = line.split()
            for marker, position in markers:
                if marker in line:
                    out.append(float(tokens[position]))
                    break
    return out


In [5]:
def output_textfile(edgs, df, fname, unit='kcal/mol'):
    """Write the mean relative free energy of every edge to a text file.

    The file starts with two ``#`` header lines (column names and the unit),
    followed by one line per row of ``edgs`` with the edge name, the value and
    its error, each formatted with two decimals.

    Parameters
    ----------
    edgs : pandas.DataFrame
        One row per edge; columns ``0`` and ``1`` hold the two ligand names.
    df : pandas.DataFrame
        Must contain a row labelled ``'edge_<ligA>_<ligB>_ddg'`` for every edge,
        with columns ``'val'`` and ``'err'``.
    fname : str
        Path of the file to write; an existing file is overwritten.
    unit : str, optional
        Unit named in the header line. The default matches the values written
        by ``getGapsysResults``, which converts everything to kcal/mol before
        calling this function.

    Raises
    ------
    KeyError
        If ``df`` lacks the ``'..._ddg'`` row of an edge.
    """
    # The context manager closes the file even if a lookup below fails.
    with open(fname, 'w') as fp:
        fp.write('#%6s  %6s  %6s\n' % ('1_edge', '2_ddg', '3_ddg_err'))
        fp.write(f'#all values in {unit}\n')

        for _, row in edgs.iterrows():
            edge = f'edge_{row[0]}_{row[1]}'
            val = df.loc[f'{edge}_ddg', 'val']
            err = df.loc[f'{edge}_ddg', 'err']

            fp.write('%10s  %4.2f  %4.2f\n' % (edge, val, err))


In [6]:
# Name mapping for the ligands of the 'cmet' target. Some names differ between
# the two sides only by a trailing '_<number>' suffix; the others are identical.
# The literal below is written as {name without suffix: name with suffix}.
cmet_dict = {'lig_CHEMBL3402753_200': 'lig_CHEMBL3402753_200_13',
 'lig_CHEMBL3402759_5.7': 'lig_CHEMBL3402759_5.7',
 'lig_CHEMBL3402747_3400': 'lig_CHEMBL3402747_3400_7',
 'lig_CHEMBL3402744_300': 'lig_CHEMBL3402744_300_4',
 'lig_CHEMBL3402745_200': 'lig_CHEMBL3402745_200_5',
 'lig_CHEMBL3402761_1': 'lig_CHEMBL3402761_1_21',
 'lig_CHEMBL3402750_400': 'lig_CHEMBL3402750_400_10',
 'lig_CHEMBL3402743_42': 'lig_CHEMBL3402743_42',
 'lig_CHEMBL3402752_30000': 'lig_CHEMBL3402752_30000_12',
 'lig_CHEMBL3402755_4200': 'lig_CHEMBL3402755_4200_15',
 'lig_CHEMBL3402758_10': 'lig_CHEMBL3402758_10',
 'lig_CHEMBL3402749_500': 'lig_CHEMBL3402749_500_9',
 'lig_CHEMBL3402757_6.5': 'lig_CHEMBL3402757_6.5',
 'lig_CHEMBL3402765_11-charged-pKa-8.1': 'lig_CHEMBL3402765_11-charged-pKa-8.1',
 'lig_CHEMBL3402762_1': 'lig_CHEMBL3402762_1',
 'lig_CHEMBL3402742_23': 'lig_CHEMBL3402742_23',
 'lig_CHEMBL3402754_40': 'lig_CHEMBL3402754_40_14',
 'lig_CHEMBL3402748_5300': 'lig_CHEMBL3402748_5300_8',
 'lig_CHEMBL3402741_400': 'lig_CHEMBL3402741_400',
 'lig_CHEMBL3402763_90': 'lig_CHEMBL3402763_90',
 'lig_CHEMBL3402764_90': 'lig_CHEMBL3402764_90',
 'lig_CHEMBL3402751_2100': 'lig_CHEMBL3402751_2100_11',
 'lig_CHEMBL3402756_2.7': 'lig_CHEMBL3402756_2.7',
 'lig_CHEMBL3402760_1': 'lig_CHEMBL3402760_1'}
# Invert the mapping and strip the 'lig_' prefix on both sides, so that the
# suffixed name becomes the key and the name without suffix the value.
# getGapsysResults looks up the ligand names of each cmet edge here to build
# the edge directory name. The inversion keeps every entry only if all values
# of the literal are distinct.
cmet_dict = {value.replace('lig_', ''):key.replace('lig_', '') for key, value in cmet_dict.items()}
# Bare expression: displays the inverted mapping as the cell output.
cmet_dict

{'CHEMBL3402753_200_13': 'CHEMBL3402753_200',
 'CHEMBL3402759_5.7': 'CHEMBL3402759_5.7',
 'CHEMBL3402747_3400_7': 'CHEMBL3402747_3400',
 'CHEMBL3402744_300_4': 'CHEMBL3402744_300',
 'CHEMBL3402745_200_5': 'CHEMBL3402745_200',
 'CHEMBL3402761_1_21': 'CHEMBL3402761_1',
 'CHEMBL3402750_400_10': 'CHEMBL3402750_400',
 'CHEMBL3402743_42': 'CHEMBL3402743_42',
 'CHEMBL3402752_30000_12': 'CHEMBL3402752_30000',
 'CHEMBL3402755_4200_15': 'CHEMBL3402755_4200',
 'CHEMBL3402758_10': 'CHEMBL3402758_10',
 'CHEMBL3402749_500_9': 'CHEMBL3402749_500',
 'CHEMBL3402757_6.5': 'CHEMBL3402757_6.5',
 'CHEMBL3402765_11-charged-pKa-8.1': 'CHEMBL3402765_11-charged-pKa-8.1',
 'CHEMBL3402762_1': 'CHEMBL3402762_1',
 'CHEMBL3402742_23': 'CHEMBL3402742_23',
 'CHEMBL3402754_40_14': 'CHEMBL3402754_40',
 'CHEMBL3402748_5300_8': 'CHEMBL3402748_5300',
 'CHEMBL3402741_400': 'CHEMBL3402741_400',
 'CHEMBL3402763_90': 'CHEMBL3402763_90',
 'CHEMBL3402764_90': 'CHEMBL3402764_90',
 'CHEMBL3402751_2100_11': 'CHEMBL3402751_2100',
 

In [7]:
def getGapsysResults(target, forcefield):
    """Collect the per-edge free energy results of one target and write them to 00_data/input.

    For every edge of ``target`` the files
    ``analyze/<ff>/<target>/<leg>/<edge>/analyze<run>/results.dat`` are read for
    the legs ``water`` and ``protein`` (stored as ``complex``) and the runs 1-3.
    The values are converted from kJ/mol to kcal/mol, the relative free energy
    ``ddg = complex - water`` is computed per run, and the three runs are
    combined into a mean with an error estimate obtained by drawing ``bootnum``
    normally distributed samples per run.

    Two files are written to ``<path>/../00_data/input``:
    ``<target>_<forcefield>.dat`` (via ``output_textfile``; mean and error per
    edge) and ``<target>_<forcefield>.csv`` (all per-leg, per-run and mean values).

    Parameters
    ----------
    target : str
        Target name as known to ``edges.EdgeSet``.
    forcefield : str
        Force field label used in the output file names. It is translated to
        the directory name below ``analyze/`` for ``'openff-2.0.0-rc.1.offxml'``
        and ``'gaff'`` and used unchanged otherwise.

    Returns
    -------
    None
        Also when ``analyze/<ff>/<target>/`` does not exist; nothing is written
        in that case.

    Notes
    -----
    * The ``analyze/`` paths are relative to the current working directory.
    * The error estimate uses NumPy's global random state, which is not seeded
      here, so the written errors differ slightly between calls.
    """
    edgs = edges.EdgeSet(target).get_dataframe()

    bootnum = 1000  # number of samples drawn per run for the error estimate
    runs = range(1, 4)  # repeats are stored in analyze1 ... analyze3
    fields = ['val', 'err', 'aerr', 'conv']

    # Long table: one row per '<edge>_<leg><run>', '<edge>_ddg<run>' and '<edge>_ddg'.
    df = pd.DataFrame()
    # Wide table: one row per edge, columns keyed by (leg, repeat, field). It
    # starts without columns and is enlarged by the assignments below.
    index = pd.MultiIndex.from_tuples([], names=['leg', 'repeat', ''])
    newdf = pd.DataFrame(columns=index)

    # (directory name on disk, leg name used in the output tables)
    legs = [('water', 'water'), ('protein', 'complex')]

    if forcefield == 'openff-2.0.0-rc.1.offxml':
        nFF = 'openff-2.0rc1'
    elif forcefield == 'gaff':
        nFF = 'gaff2_sh'
    else:
        nFF = forcefield

    if not os.path.exists(f'analyze/{nFF}/{target}/'):
        return None

    def to_kcal(value):
        # results.dat values are treated as kJ/mol; the tables hold kcal/mol.
        return unit_registry.Quantity(value, 'kilojoules/mole').to('kilocalories/mole').magnitude

    for _, row in edgs.iterrows():
        edge = f'edge_{row[0]}_{row[1]}'

        # Two targets use different edge directory names on disk.
        if target == 'shp2':
            dir_edge = edge.replace('E', 'Example_')
            dir_edge = dir_edge.replace('SHP099-1', 'SHP099-1_Example_7')
        elif target == 'cmet':
            dir_edge = f'edge_{cmet_dict[row[0]]}_{cmet_dict[row[1]]}'
        else:
            dir_edge = edge

        for wc, nwc in legs:
            for run in runs:
                result = f'analyze/{nFF}/{target}/{wc}/{dir_edge}/analyze{run}/results.dat'
                print(result)
                foo = read_neq_results(result)
                key = f'{edge}_{nwc}{run}'
                # All four values are indexed below, so a partially written
                # results file is treated like a missing one instead of
                # failing with an IndexError.
                if len(foo) >= 4:
                    # NOTE(review): the index-to-field mapping presumes that
                    # results.dat lists dG, analytical error, bootstrap error
                    # and convergence in this order - confirm.
                    df.loc[key, 'val'] = to_kcal(foo[0])
                    df.loc[key, 'err'] = to_kcal(foo[2])
                    df.loc[key, 'aerr'] = to_kcal(foo[1])
                    df.loc[key, 'conv'] = foo[3]
                else:
                    for t in fields:
                        df.loc[key, t] = np.nan
                    print('Results could not be read')
                for t in fields:
                    newdf.loc[f'{edge}', (nwc, run, t)] = df.loc[key, t]

        vals = []
        errs = []
        aerrs = []
        for run in runs:
            ##### calculate ddg #####
            ddg = df.loc[f'{edge}_complex{run}', 'val'] - df.loc[f'{edge}_water{run}', 'val']
            # errors of the two legs are added in quadrature
            err = np.sqrt(np.power(df.loc[f'{edge}_complex{run}', 'err'], 2.0) +
                          np.power(df.loc[f'{edge}_water{run}', 'err'], 2.0))
            aerr = np.sqrt(np.power(df.loc[f'{edge}_complex{run}', 'aerr'], 2.0) +
                           np.power(df.loc[f'{edge}_water{run}', 'aerr'], 2.0))
            df.loc[f'{edge}_ddg{run}', 'val'] = ddg
            df.loc[f'{edge}_ddg{run}', 'err'] = err
            df.loc[f'{edge}_ddg{run}', 'aerr'] = aerr
            newdf.loc[f'{edge}', ('ddg', run, 'val')] = ddg
            newdf.loc[f'{edge}', ('ddg', run, 'err')] = err
            newdf.loc[f'{edge}', ('ddg', run, 'aerr')] = aerr
            vals.append(ddg)
            errs.append(err)
            aerrs.append(aerr)

        ###### calculate mean ddg with err ######
        mean = np.average(vals)
        df.loc[f'{edge}_ddg', 'val'] = mean

        # 1) one normal distribution per run, centred on its ddg with its error as width
        distribs = [np.random.normal(v, e, size=bootnum) for v, e in zip(vals, errs)]
        if len(distribs) > 1:
            distr = np.vstack(distribs)
            # 2) standard error over the runs for every sample column, averaged.
            # Plain float() replaces np.float, which was removed in NumPy 1.24.
            stderr = np.mean(np.sqrt(np.var(distr, ddof=1, axis=0) / float(len(distribs))))
        else:
            stderr = errs[0]
        df.loc[f'{edge}_ddg', 'err'] = stderr

        # NOTE(review): 'aerr' is never assigned for the '<edge>_ddg' row, so the
        # ('ddg_mean', '-', 'aerr') entry is NaN - confirm whether a combined
        # analytical error should be computed here.
        for t in ['val', 'err', 'aerr']:
            newdf.loc[f'{edge}', ('ddg_mean', '-', t)] = df.loc[f'{edge}_ddg', t]

    ###### output ######
    output_textfile(edgs, df, os.path.join(path, '..', '00_data', 'input', f'{target}_{forcefield}.dat'))

    newdf.to_csv(os.path.join(path, '..', '00_data', 'input', f'{target}_{forcefield}.csv'), float_format='%10.2f')


#     newdata = data.copy()
#     newdata.index=newdata['edge']
#     newdata.columns = pd.MultiIndex.from_arrays([np.array(newdata.columns), ['', 'exp'] + ['pmx'] * 6 + ['fep'] * 4 + ['exp'], [''] + ['kj/mol'] * 12], names=['forcefield', 'method', 'unit'])
    
#     newdata.loc[:,('ligandA', '', '')] = df[0].apply(lambda x: str(int(float(x))) if str(x).endswith('.0') else x).astype(str)
#     newdata.loc[:,('ligandB', '', '')] = df[1].apply(lambda x: str(int(float(x))) if str(x).endswith('.0') else x).astype(str)
#     newdata = newdata.drop(columns=('edge', '', ''))
#     newdata.sort_index(axis=1, level=1, inplace=True, sort_remaining=False)
#     newdata.index=[f'{target}_edge_{x[("ligandA", "", "")]}_{x[("ligandB", "", "")]}' for i, x in newdata.iterrows()]
    
#     return newdata
# getGapsysResults('cmet')

In [8]:
# function to retrieve data from PLBenchmarks calculations, stored in 00_data/input
def getRawResults(target, forcefield='openff-1.0.0.offxml'):
    """Join the calculated and the experimental values of all edges of a target.

    The calculated values are read from
    ``<path>/../00_data/input/<target>_<forcefield>.dat``; the experimental
    values come from ``edges.EdgeSet(target)``. Rows are matched on the label
    ``'edge_<ligA>_<ligB>'``.

    Parameters
    ----------
    target : str
        Target name.
    forcefield : str, optional
        Force field label that is part of the input file name.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``0``, ``1`` (ligand names as strings), ``exp_DDG``,
        ``exp_dDDG``, ``calc_DDG``, ``calc_dDDG`` and ``calc_dDDG(additional)``,
        indexed by ``'<target>_edge_<ligA>_<ligB>'``. ``None`` (after printing
        a message) if the input file does not exist.
    """
    file_path = os.path.join(path, '..', '00_data', 'input', f'{target}_{forcefield}.dat')
    if not os.path.exists(file_path):
        print(f'File {file_path} does not exist.')
        return

    # calculated values, labelled by the edge name found in the first column
    calculated = pd.read_csv(
        file_path,
        sep=' ',
        skipinitialspace=True,
        comment='#',
        header=None,
        names=['edge', 'calc DDG', 'calc dDDG' , 'add dDDG'],
    )
    calculated.index = calculated['edge']

    # experimental values, labelled the same way so that the columns align
    table = edges.EdgeSet(target).get_dataframe(
        columns=[0, 1, 'exp. DeltaG [kcal/mol]', 'exp. Error [kcal/mol]']
    )
    table.index = pd.Series(
        [f'edge_{lig_a!s}_{lig_b!s}' for lig_a, lig_b in zip(table[0].values, table[1].values)]
    )

    # copy in the calculated values (aligned on the edge label)
    copied_columns = (
        ('calc DDG', 'calc_DDG'),
        ('calc dDDG', 'calc_dDDG'),
        ('add dDDG', 'calc_dDDG(additional)'),
    )
    for source, destination in copied_columns:
        table[destination] = calculated.loc[:, source]

    # strip the unit from the experimental values
    unit_columns = (
        ('exp. DeltaG [kcal/mol]', 'exp_DDG'),
        ('exp. Error [kcal/mol]', 'exp_dDDG'),
    )
    for source, destination in unit_columns:
        table[destination] = table[source].apply(lambda quantity: quantity.magnitude)

    # keep only the relevant columns, in this order
    table = table.filter(
        items=[0, 1, 'exp_DDG', 'exp_dDDG', 'calc_DDG', 'calc_dDDG', 'calc_dDDG(additional)']
    )
    for ligand_column in (0, 1):
        table[ligand_column] = table[ligand_column].astype(str)

    # final labels carry the target name as a prefix
    table.index = pd.Series(
        [f'{target}_edge_{lig_a!s}_{lig_b!s}' for lig_a, lig_b in zip(table[0].values, table[1].values)]
    )
    return table

In [10]:
# Collect the pmx results and export them as one YAML file per target and force field.
author = "gapsys"
software = "pmx"
for target in ['pfkfb3']: #['cdk8', 'cmet', 'eg5', 'hif2a', 'pfkfb3', 'shp2', 'syk', 'tnks2']:
    print(target)
    for forcefield in ['openff-2.0.0-rc.1.offxml']:
        print(f"    {forcefield}")

        getGapsysResults(target, forcefield)
        df = getRawResults(target, forcefield)
        if df is None:
            continue

        # Prepare the exported table before the output file is opened, so a
        # failure here does not leave a truncated, empty YAML file behind.
        df = (
            df.filter([0, 1, 'calc_DDG', 'calc_dDDG'])
              .rename(columns={0: 'ligandA', 1: 'ligandB', 'calc_DDG': 'DDG', 'calc_dDDG': 'dDDG'})
        )
        df['unit'] = 'kilocalories / mole'

        target_results_dir = os.path.join(path, targets.get_target_dir(target), results_dir)
        os.makedirs(target_results_dir, exist_ok=True)
        yaml_path = os.path.join(target_results_dir, f'{target}_{software}_{forcefield}_{author}.yaml')
        with open(yaml_path, 'w') as file:
            # one mapping per edge: {edge label: {column: value}}
            yaml.dump(df.T.to_dict(), file)

pfkfb3
    openff-2.0.0-rc.1.offxml
analyze/openff-2.0rc1/pfkfb3/water/edge_43_48/analyze1/results.dat
analyze/openff-2.0rc1/pfkfb3/water/edge_43_48/analyze2/results.dat
analyze/openff-2.0rc1/pfkfb3/water/edge_43_48/analyze3/results.dat
analyze/openff-2.0rc1/pfkfb3/protein/edge_43_48/analyze1/results.dat
analyze/openff-2.0rc1/pfkfb3/protein/edge_43_48/analyze2/results.dat
analyze/openff-2.0rc1/pfkfb3/protein/edge_43_48/analyze3/results.dat
analyze/openff-2.0rc1/pfkfb3/water/edge_43_44/analyze1/results.dat
analyze/openff-2.0rc1/pfkfb3/water/edge_43_44/analyze2/results.dat
analyze/openff-2.0rc1/pfkfb3/water/edge_43_44/analyze3/results.dat
analyze/openff-2.0rc1/pfkfb3/protein/edge_43_44/analyze1/results.dat
analyze/openff-2.0rc1/pfkfb3/protein/edge_43_44/analyze2/results.dat
analyze/openff-2.0rc1/pfkfb3/protein/edge_43_44/analyze3/results.dat
analyze/openff-2.0rc1/pfkfb3/water/edge_30_56/analyze1/results.dat
analyze/openff-2.0rc1/pfkfb3/water/edge_30_56/analyze2/results.dat
analyze/openff

analyze/openff-2.0rc1/pfkfb3/protein/edge_56_69/analyze3/results.dat
analyze/openff-2.0rc1/pfkfb3/water/edge_59_35/analyze1/results.dat
analyze/openff-2.0rc1/pfkfb3/water/edge_59_35/analyze2/results.dat
analyze/openff-2.0rc1/pfkfb3/water/edge_59_35/analyze3/results.dat
analyze/openff-2.0rc1/pfkfb3/protein/edge_59_35/analyze1/results.dat
analyze/openff-2.0rc1/pfkfb3/protein/edge_59_35/analyze2/results.dat
analyze/openff-2.0rc1/pfkfb3/protein/edge_59_35/analyze3/results.dat
analyze/openff-2.0rc1/pfkfb3/water/edge_59_52/analyze1/results.dat
analyze/openff-2.0rc1/pfkfb3/water/edge_59_52/analyze2/results.dat
analyze/openff-2.0rc1/pfkfb3/water/edge_59_52/analyze3/results.dat
analyze/openff-2.0rc1/pfkfb3/protein/edge_59_52/analyze1/results.dat
analyze/openff-2.0rc1/pfkfb3/protein/edge_59_52/analyze2/results.dat
analyze/openff-2.0rc1/pfkfb3/protein/edge_59_52/analyze3/results.dat
analyze/openff-2.0rc1/pfkfb3/water/edge_59_47/analyze1/results.dat
analyze/openff-2.0rc1/pfkfb3/water/edge_59_47/an

analyze/openff-2.0rc1/pfkfb3/water/edge_47_20/analyze1/results.dat
analyze/openff-2.0rc1/pfkfb3/water/edge_47_20/analyze2/results.dat
analyze/openff-2.0rc1/pfkfb3/water/edge_47_20/analyze3/results.dat
analyze/openff-2.0rc1/pfkfb3/protein/edge_47_20/analyze1/results.dat
analyze/openff-2.0rc1/pfkfb3/protein/edge_47_20/analyze2/results.dat
analyze/openff-2.0rc1/pfkfb3/protein/edge_47_20/analyze3/results.dat
analyze/openff-2.0rc1/pfkfb3/water/edge_26_69/analyze1/results.dat
analyze/openff-2.0rc1/pfkfb3/water/edge_26_69/analyze2/results.dat
analyze/openff-2.0rc1/pfkfb3/water/edge_26_69/analyze3/results.dat
analyze/openff-2.0rc1/pfkfb3/protein/edge_26_69/analyze1/results.dat
analyze/openff-2.0rc1/pfkfb3/protein/edge_26_69/analyze2/results.dat
analyze/openff-2.0rc1/pfkfb3/protein/edge_26_69/analyze3/results.dat
analyze/openff-2.0rc1/pfkfb3/water/edge_26_70/analyze1/results.dat
analyze/openff-2.0rc1/pfkfb3/water/edge_26_70/analyze2/results.dat
analyze/openff-2.0rc1/pfkfb3/water/edge_26_70/anal

analyze/openff-2.0rc1/pfkfb3/water/edge_68_53/analyze2/results.dat
analyze/openff-2.0rc1/pfkfb3/water/edge_68_53/analyze3/results.dat
analyze/openff-2.0rc1/pfkfb3/protein/edge_68_53/analyze1/results.dat
analyze/openff-2.0rc1/pfkfb3/protein/edge_68_53/analyze2/results.dat
analyze/openff-2.0rc1/pfkfb3/protein/edge_68_53/analyze3/results.dat
analyze/openff-2.0rc1/pfkfb3/water/edge_33_23/analyze1/results.dat
analyze/openff-2.0rc1/pfkfb3/water/edge_33_23/analyze2/results.dat
analyze/openff-2.0rc1/pfkfb3/water/edge_33_23/analyze3/results.dat
analyze/openff-2.0rc1/pfkfb3/protein/edge_33_23/analyze1/results.dat
analyze/openff-2.0rc1/pfkfb3/protein/edge_33_23/analyze2/results.dat
analyze/openff-2.0rc1/pfkfb3/protein/edge_33_23/analyze3/results.dat
analyze/openff-2.0rc1/pfkfb3/water/edge_33_36/analyze1/results.dat
analyze/openff-2.0rc1/pfkfb3/water/edge_33_36/analyze2/results.dat
analyze/openff-2.0rc1/pfkfb3/water/edge_33_36/analyze3/results.dat
analyze/openff-2.0rc1/pfkfb3/protein/edge_33_36/an