# Gather data from various source into the analysis repository

In [1]:
# imports
import os
import shutil
import numpy as np
import pandas as pd
import yaml
import git
import tempfile

import pint
unit_registry = pint.UnitRegistry()

from PLBenchmarks import targets, ligands, edges
from IPython.core.display import HTML

from tqdm.notebook import tqdm

import benchmarkpl
path = benchmarkpl.__path__[0]



_ColormakerRegistry()

# Set path of data directory

In [2]:
targets.setDataDir(os.path.join(path))
# directory name where results for each target are stored
results_dir = '10_results'

# Number of targets, ligands and edges in the data set

In [3]:
nligs, nedgs = 0, 0
print(f'{"Target":10s} {"Num Ligs":>10s} {"Num Edges":>10s}')
print(33 * '-')
for target in tqdm(targets.target_list):
    target = target["name"]
    print(f'{target:10s} {len(ligands.ligandSet(target)):10d} {len(edges.edgeSet(target)):10d}')
    nligs += len(ligands.ligandSet(target))
    nedgs += len(edges.edgeSet(target))
print(33 * '-')
print(f'{"total":10s} {nligs:10d} {nedgs:10d}')

Target       Num Ligs  Num Edges
---------------------------------


HBox(children=(FloatProgress(value=0.0, max=22.0), HTML(value='')))

jnk1               21         31
pde2               21         34
thrombin           11         16
p38                34         56
ptp1b              23         49
galectin            8          7
cdk2               16         25
cmet               18         35
mcl1               42         71
bace               36         58
bace_hunt          32         60
bace_p2            12         26
tyk2               16         24
ros1               28         27
eg5                28         63
cdk8               33         54
hif2a              42         92
pfkfb3             40         66
pde10              35         36
shp2               26         56
syk                44         99
tnks2              27         60

---------------------------------
total             593       1045


# Experimental values stored in repository
Retrieve the experimental values stored in the dataset

In [5]:
# function to retrieve exp. data from PLBenchmarks 
def getExpResults(target):
    edg = edges.edgeSet(target)
    df = edg.getDF(columns=[0,1, 'exp. DeltaG [kcal/mol]', 'exp. Error [kcal/mol]'])
    df.index = pd.Series(['edge_' + str(lig1) + '_' + str(lig2) for lig1, lig2 in zip(df[0].values, df[1].values)])
   
    # remove unit of exp. values
    df['exp_DDG'] = df['exp. DeltaG [kcal/mol]'].apply(lambda x: x.magnitude)
    df['exp_dDDG'] = df['exp. Error [kcal/mol]'].apply(lambda x: x.magnitude)

    # filter only to relevant columns
    df = df.filter(items = [0,1,'exp_DDG', 'exp_dDDG'])
    df[0]= df[0].astype(str)
    df[1]= df[1].astype(str)
    return df
getExpResults('tyk2').head()

Unnamed: 0,0,1,exp_DDG,exp_dDDG
edge_jmc_23_ejm_55,jmc_23,ejm_55,2.52,0.0
edge_ejm_44_ejm_55,ejm_44,ejm_55,-1.8,0.0
edge_ejm_49_ejm_31,ejm_49,ejm_31,-1.81,0.0
edge_ejm_31_ejm_46,ejm_31,ejm_46,-1.79,0.0
edge_jmc_28_jmc_27,jmc_28,jmc_27,-0.3,0.0


In [6]:
author = "hahn"
software = "experiment"

In [7]:
for target in targets.target_list:
    target = target["name"]
    df = getExpResults(target)
    if df is None:
        continue
    os.makedirs(os.path.join(path, targets.getTargetDir(target)), exist_ok=True)
    os.makedirs(os.path.join(path, targets.getTargetDir(target), results_dir), exist_ok=True)
    with open(os.path.join(path, targets.getTargetDir(target), results_dir, f'{target}_{software}_{author}.yaml'), 'w') as file:
        df = df.filter([0,1,'exp_DDG', 'exp_dDDG'])
        df.rename(columns={0: 'ligandA', 1: 'ligandB', 'exp_DDG': 'DDG', 'exp_dDDG': 'dDDG'}, inplace=True)
        df['unit']='kilocalories / mole'
        yaml.dump(df.T.to_dict(), file)

# pmx calculations with openFF parameters (Hahn et al.)

In [9]:
# function to retrieve data from PLBenchmarks calculations, soted in 00_data/input 
def getRawResults(target, forcefield='openff-1.0.0.offxml'):
    file_path = os.path.join(path, 
                                   '..', 
                                   '00_data', 
                                   'input',  
                                   f'{target}_{forcefield}.dat'
                                  )
    if not os.path.exists(file_path):
        print(f'File {file_path} does not exist.')
        return
    # read in result file
    res = pd.read_csv(file_path,
                      header=None, 
                      comment='#', 
                      skipinitialspace=True, 
                      names=['edge', 'calc DDG', 'calc dDDG' , 'add dDDG'], 
                      sep=' ')
    res.index = res['edge']
    # read in exp. data
    edg = edges.edgeSet(target)
    df = edg.getDF(columns=[0,1, 'exp. DeltaG [kcal/mol]', 'exp. Error [kcal/mol]'])
    df.index = pd.Series(['edge_' + str(lig1) + '_' + str(lig2) for lig1, lig2 in zip(df[0].values, df[1].values)])
   
    # copy in calculated values
    df['calc_DDG'] = res.loc[:,'calc DDG']
    df['calc_dDDG'] = res.loc[:,'calc dDDG']
    df['calc_dDDG(additional)'] = res.loc[:, 'add dDDG']
    
    # remove unit of calculated values
    df['exp_DDG'] = df['exp. DeltaG [kcal/mol]'].apply(lambda x: x.magnitude)
    df['exp_dDDG'] = df['exp. Error [kcal/mol]'].apply(lambda x: x.magnitude)

    # filter only to relevant columns
    df = df.filter(items = [0,1,'exp_DDG', 'exp_dDDG', 'calc_DDG', 'calc_dDDG', 'calc_dDDG(additional)'])
    df[0]= df[0].astype(str)
    df[1]= df[1].astype(str)
    return df
getRawResults('tyk2').head()

Unnamed: 0,0,1,exp_DDG,exp_dDDG,calc_DDG,calc_dDDG,calc_dDDG(additional)
edge_jmc_23_ejm_55,jmc_23,ejm_55,2.52,0.0,-0.33,0.19,
edge_ejm_44_ejm_55,ejm_44,ejm_55,-1.8,0.0,-4.19,0.29,
edge_ejm_49_ejm_31,ejm_49,ejm_31,-1.81,0.0,-0.87,0.12,
edge_ejm_31_ejm_46,ejm_31,ejm_46,-1.79,0.0,-1.04,0.21,
edge_jmc_28_jmc_27,jmc_28,jmc_27,-0.3,0.0,-0.69,0.05,


In [10]:
author = "hahn"
software = "pmx"
forcefield = "openff-1.0.0.offxml"

In [11]:
for target in targets.target_list:
    target = target["name"]
    df = getRawResults(target, forcefield)
    if df is None:
        continue
    os.makedirs(os.path.join(path, targets.getTargetDir(target), results_dir), exist_ok=True)
    with open(os.path.join(path, targets.getTargetDir(target), results_dir, f'{target}_{software}_{forcefield}_{author}.yaml'), 'w') as file:
        df = df.filter([0,1,'calc_DDG', 'calc_dDDG'])
        df.rename(columns={0: 'ligandA', 1: 'ligandB', 'calc_DDG': 'DDG', 'calc_dDDG': 'dDDG'}, inplace=True)
        df['unit']='kilocalories / mole'
        yaml.dump(df.T.to_dict(), file)

# Gather data from Gapsys et al.
retrieve from https://github.com/deGrootLab/pmx

In [12]:
temp_directory = tempfile.mkdtemp()

git.Repo.clone_from('https://github.com/deGrootLab/pmx', temp_directory, branch='master', depth=1)

<git.repo.base.Repo '/tmp/tmpl_rd37hv/.git'>

In [13]:
def getGapsysResults(target):  
    file_name = f'{temp_directory}/protLig_benchmark/ddg_data/{target}.dat'
    if not os.path.exists(file_name):
        print(f'File {file_name} does not exist.')
        return None
    data = pd.read_csv(f'{temp_directory}/protLig_benchmark/ddg_data/{target}.dat', sep='\s+', header=None, comment='#',
                   names=['edge', 'exp', 'gaff', 'dgaff', 'cgenff', 'dcgenff', 'cons', 'dcons', 'fep5', 'dfep5', 'fep1', 'dfep1'])
    data['dexp']=pd.Series([0.0]*data.shape[0])
    df = edges.edgeSet(target).getDF()
    if target == 'jnk1':
        df.index = pd.Series([f'{str(a).split("-")[0]}_{str(b).split("-")[0]}' for a, b in zip(df[0].values, df[1].values)])
    else:
        df.index = pd.Series([f'{a}_{b}' for a, b in zip(df[0].values, df[1].values)])
    
    newdata = data.copy()
    newdata.index=newdata['edge']
    newdata.columns = pd.MultiIndex.from_arrays([np.array(newdata.columns), ['', 'exp'] + ['pmx'] * 6 + ['fep'] * 4 + ['exp'], [''] + ['kj/mol'] * 12], names=['forcefield', 'method', 'unit'])
    
    newdata.loc[:,('ligandA', '', '')] = df[0].apply(lambda x: str(int(float(x))) if str(x).endswith('.0') else x).astype(str)
    newdata.loc[:,('ligandB', '', '')] = df[1].apply(lambda x: str(int(float(x))) if str(x).endswith('.0') else x).astype(str)
    newdata = newdata.drop(columns=('edge', '', ''))
    newdata.sort_index(axis=1, level=1, inplace=True, sort_remaining=False)
    newdata.index=[f'edge_{x[("ligandA", "", "")]}_{x[("ligandB", "", "")]}' for i, x in newdata.iterrows()]
    
    return newdata
getGapsysResults('pde2').head()

forcefield,ligandA,ligandB,exp,dexp,fep5,dfep5,fep1,dfep1,gaff,dgaff,cgenff,dcgenff,cons,dcons
method,Unnamed: 1_level_1,Unnamed: 2_level_1,exp,exp,fep,fep,fep,fep,pmx,pmx,pmx,pmx,pmx,pmx
unit,Unnamed: 1_level_2,Unnamed: 2_level_2,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol,kj/mol
edge_43249674_48022468,43249674,48022468,-0.8,0.0,-1.3,0.24,-0.4,2.35,-2.02,0.22,-2.22,0.61,-2.12,0.28
edge_43249674_49175789,43249674,49175789,2.7,0.0,-12.36,1.53,-9.87,3.13,-9.23,1.34,-14.99,1.41,-12.11,2.84
edge_48009208_43249674,48009208,43249674,-3.16,0.0,-0.61,0.61,-0.08,2.89,-2.27,0.31,-1.12,0.56,-1.7,0.58
edge_48009208_49137374,48009208,49137374,-1.15,0.0,-0.54,0.8,1.6,3.01,-0.46,1.44,-2.71,1.49,-1.59,1.28
edge_48168913_48022468,48168913,48022468,-6.43,0.0,-0.25,0.32,1.09,2.31,-4.55,2.6,-7.61,0.54,-6.08,1.69


In [14]:
author = "gapsys"
for target in targets.target_list:
    target = target["name"]
    original_df = getGapsysResults(target)
    if original_df is None:
        continue
    os.makedirs(os.path.join(path, targets.getTargetDir(target), results_dir), exist_ok=True)
    for forcefield in ['fep5', 'fep1', 'gaff', 'cgenff']:
        df = original_df.loc[:, np.in1d(original_df.columns.get_level_values(0), ['ligandA', 'ligandB', f'{forcefield}', f'd{forcefield}'])]
        software = list(df.columns.get_level_values(1))[list(df.columns.get_level_values(0)).index(f'{forcefield}')]
        unit = list(df.columns.get_level_values(2))[list(df.columns.get_level_values(0)).index(f'{forcefield}')]
        if unit == 'kj/mol':
            unit = 'kilojoules / mole'
        df.columns = df.columns.get_level_values(0)
        df.rename(columns={0: 'ligandA', 1: 'ligandB', f'{forcefield}': 'DDG', f'd{forcefield}': 'dDDG'}, inplace=True)
        df['unit'] = unit
        if forcefield.startswith('fep'):
            forcefield = f'opls3e_{forcefield[-1]}'
        with open(os.path.join(path, targets.getTargetDir(target), results_dir, f'{target}_{software}_{forcefield}_{author}.yaml'), 'w') as file:
            yaml.dump(df.T.to_dict(), file)
    
    forcefield = 'exp'
    df = original_df.loc[:, np.in1d(original_df.columns.get_level_values(0), ['ligandA', 'ligandB', f'{forcefield}', f'd{forcefield}'])]
    software = list(df.columns.get_level_values(1))[list(df.columns.get_level_values(0)).index(f'{forcefield}')]
    unit = list(df.columns.get_level_values(2))[list(df.columns.get_level_values(0)).index(f'{forcefield}')]
    if unit == 'kj/mol':
        unit = 'kilojoules / mole'
    df.columns = df.columns.get_level_values(0)
    df.rename(columns={0: 'ligandA', 1: 'ligandB', f'{forcefield}': 'DDG', f'd{forcefield}': 'dDDG'}, inplace=True)
    df['unit'] = unit
    with open(os.path.join(path, targets.getTargetDir(target), results_dir, f'{target}_experiment_{author}.yaml'), 'w') as file:
        yaml.dump(df.T.to_dict(), file)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


File /tmp/tmpl_rd37hv/protLig_benchmark/ddg_data/ros1.dat does not exist.
File /tmp/tmpl_rd37hv/protLig_benchmark/ddg_data/eg5.dat does not exist.
File /tmp/tmpl_rd37hv/protLig_benchmark/ddg_data/cdk8.dat does not exist.
File /tmp/tmpl_rd37hv/protLig_benchmark/ddg_data/hif2a.dat does not exist.
File /tmp/tmpl_rd37hv/protLig_benchmark/ddg_data/pfkfb3.dat does not exist.
File /tmp/tmpl_rd37hv/protLig_benchmark/ddg_data/pde10.dat does not exist.
File /tmp/tmpl_rd37hv/protLig_benchmark/ddg_data/shp2.dat does not exist.
File /tmp/tmpl_rd37hv/protLig_benchmark/ddg_data/syk.dat does not exist.
File /tmp/tmpl_rd37hv/protLig_benchmark/ddg_data/tnks2.dat does not exist.


# Compare experimental values stored in repository with Gapsys exp. data

In [15]:
for target in targets.target_list:
    target = target["name"]
    author = "hahn"
    software = "experiment"
    file_name = os.path.join(path, targets.getTargetDir(target), results_dir, 
                                   f'{target}_{software}_{author}.yaml'
                       )
    if os.path.exists(file_name):
        with open(file_name, 'r') as file:
            data1 = yaml.safe_load(file)
    else:
        print(f"File {file_name} for target {target} not available")
        continue
        
    author = "gapsys"
    software = "experiment"
    file_name = os.path.join(path, targets.getTargetDir(target), results_dir, 
                                   f'{target}_{software}_{author}.yaml'
                       )
    if os.path.exists(file_name):
        with open(file_name, 'r') as file:
            data2 = yaml.safe_load(file)
    else:
        print(f"File {file_name} for target {target} not available")
        continue
        
    for e, edata in data1.items():
        if e in data2:
            v1 = unit_registry.Quantity(edata['DDG'], edata['unit'])
            v2 = unit_registry.Quantity(data2[e]['DDG'], data2[e]['unit'])
            if not np.isclose(v1, v2, atol=unit_registry.Quantity(.05, 'kilocalories / mole'), equal_nan=False):
                print(target, e, v1.to('kilocalories / mole'), v2.to('kilocalories / mole'))
        else:
            print(target, e)

cmet edge_CHEMBL3402748_5300_8_CHEMBL3402748_5300_8_alt
cmet edge_CHEMBL3402748_5300_8_CHEMBL3402748_5300_8_alt_pairing
cmet edge_CHEMBL3402748_5300_8_CHEMBL3402748_5300_8_taut
cmet edge_CHEMBL3402748_5300_8_alt_CHEMBL3402748_5300_8
cmet edge_CHEMBL3402748_5300_8_alt_pairing_CHEMBL3402748_5300_8
cmet edge_CHEMBL3402750_400_10_CHEMBL3402750_400_10_alt
cmet edge_CHEMBL3402750_400_10_CHEMBL3402750_400_10_alt_pairing
cmet edge_CHEMBL3402750_400_10_alt_CHEMBL3402748_5300_8_alt
cmet edge_CHEMBL3402750_400_10_alt_CHEMBL3402750_400_10
cmet edge_CHEMBL3402750_400_10_alt_pairing_CHEMBL3402750_400_10
bace edge_CAT-24_CAT-17e 1.74 kilocalorie / mole 1.32887189292543 kilocalorie / mole
bace edge_CAT-24_CAT-17i 2.29 kilocalorie / mole 1.8809751434034416 kilocalorie / mole
bace_p2 edge_32_L_35_L 0.52 kilocalorie / mole 0.5999043977055448 kilocalorie / mole
bace_p2 edge_33_L_30_L -0.11 kilocalorie / mole -0.20076481835564053 kilocalorie / mole
bace_p2 edge_34_L_29_L 0.15 kilocalorie / mole 0.200764818

# Get results from Wang et. al., JACS 2015
Input file taken from https://pubs.acs.org/doi/suppl/10.1021/ja512751q/suppl_file/ja512751q_si_003.xlsx (retrieval 2020-09-21) and converted to csv file

In [16]:
def getWangResults(target):    
    file_name = f'{path}/../00_data/input/ja512751q_si_003.csv'
    if not os.path.exists(file_name):
        print(f'File {file_name} does not exist.')        
        return None
    data = pd.read_csv(file_name, sep=',')
    data['system'] = data['system'].apply(lambda x: str(x).lower())
    indeces = data.loc[data['system']!='nan', :].index
    data_per_target = [data.loc[indeces[i]:indeces[i+1]-1,:] for i in range(indeces.shape[0]-1)]
    data_per_target = {d['system'].iloc[0]: d for d in data_per_target}
    for t, d in data_per_target.items():
        d.index = [f'edge_{x["Ligand1"]}_{x["Ligand2"]}' for i, x in d.iterrows()]
        d = d[['Ligand1', 'Ligand2', 'bennett_ddG', 'bennett_error']]
        d = d.rename(columns={'Ligand1': 'LigandA',
                             'Ligand2': 'LigandB',
                             'bennett_ddG': 'DDG',
                             'bennett_error': 'dDDG'})
        d['unit'] = 'kilocalories / mole'
        data_per_target[t] = d
    if target in data_per_target:
        return data_per_target[target]
    else:
        return None
getWangResults('jnk1').head()

Unnamed: 0,LigandA,LigandB,DDG,dDDG,unit
edge_17124-1_18634-1,17124-1,18634-1,0.47,0.08,kilocalories / mole
edge_18626-1_18624-1,18626-1,18624-1,0.76,0.08,kilocalories / mole
edge_18636-1_18625-1,18636-1,18625-1,-0.3,0.09,kilocalories / mole
edge_18632-1_18624-1,18632-1,18624-1,0.6,0.09,kilocalories / mole
edge_18635-1_18625-1,18635-1,18625-1,0.97,0.07,kilocalories / mole


In [17]:
author = 'wang'
software = 'fep+'
forcefield = 'opls2.1'

In [18]:
for target in targets.target_list:
    target = target["name"]
    df = getWangResults(target)
    if df is None:
        continue
    os.makedirs(os.path.join(path, targets.getTargetDir(target), results_dir), exist_ok=True)
    with open(os.path.join(path, targets.getTargetDir(target), results_dir, f'{target}_{software}_{forcefield}_{author}.yaml'), 'w') as file:
        yaml.dump(df.T.to_dict(), file)

# Gather fep-benchmark data (Schindler, Merck KGaA)

In [19]:
# Create temporary dir
temp_directory = tempfile.mkdtemp()
repo = git.Repo.clone_from('https://github.com/MCompChem/fep-benchmark', temp_directory, branch='v1.0')

In [20]:
def getSchindlerResults(target):
    file_name = f'{temp_directory}/{target}/results_edges_5ns.csv'
    if not os.path.exists(file_name):
        print(f'File {file_name} does not exist.')
        return None
    fepbenchmark5 = pd.read_csv(file_name)

    fepbenchmark5['LigandA'] = fepbenchmark5['Ligand1'].apply(lambda x: str(int(float(x))) if str(x).endswith('.0') else x)
    fepbenchmark5['LigandB'] = fepbenchmark5['Ligand2'].apply(lambda x: str(int(float(x))) if str(x).endswith('.0') else x)
    fepbenchmark5 = fepbenchmark5.loc[:, ['LigandA', 'LigandB', 'FEP', 'FEP Error']]
    fepbenchmark5 = fepbenchmark5.rename(columns={
                             'FEP': 'DDG',
                             'FEP Error': 'dDDG'})
    fepbenchmark5.index = [f'edge_{x["LigandA"]}_{x["LigandB"]}' for i, x in fepbenchmark5.iterrows()]
    fepbenchmark5['unit'] = 'kilocalories / mole'
    return fepbenchmark5
getSchindlerResults('cmet').head()

Unnamed: 0,LigandA,LigandB,DDG,dDDG,unit
edge_CHEMBL3402765_11-charged-pKa-8.1_CHEMBL3402744_300,CHEMBL3402765_11-charged-pKa-8.1,CHEMBL3402744_300,2.33,0.13,kilocalories / mole
edge_CHEMBL3402765_11-charged-pKa-8.1_CHEMBL3402745_200,CHEMBL3402765_11-charged-pKa-8.1,CHEMBL3402745_200,1.75,0.13,kilocalories / mole
edge_CHEMBL3402765_11-charged-pKa-8.1_CHEMBL3402743_42,CHEMBL3402765_11-charged-pKa-8.1,CHEMBL3402743_42,0.43,0.11,kilocalories / mole
edge_CHEMBL3402765_11-charged-pKa-8.1_CHEMBL3402764_90,CHEMBL3402765_11-charged-pKa-8.1,CHEMBL3402764_90,1.06,0.1,kilocalories / mole
edge_CHEMBL3402765_11-charged-pKa-8.1_CHEMBL3402760_1,CHEMBL3402765_11-charged-pKa-8.1,CHEMBL3402760_1,-1.94,0.17,kilocalories / mole


In [21]:
author = 'schindler'
software = 'fep+'
forcefield = 'opls3e'

In [22]:
for target in targets.target_list:
    target = target["name"]
    df = getSchindlerResults(target)
    if df is None:
        continue
    os.makedirs(os.path.join(path, targets.getTargetDir(target), results_dir), exist_ok=True)
    with open(os.path.join(path, targets.getTargetDir(target), results_dir, f'{target}_{software}_{forcefield}_{author}.yaml'), 'w') as file:
        yaml.dump(df.T.to_dict(), file)

File /tmp/tmp9nkgjck9/jnk1/results_edges_5ns.csv does not exist.
File /tmp/tmp9nkgjck9/pde2/results_edges_5ns.csv does not exist.
File /tmp/tmp9nkgjck9/thrombin/results_edges_5ns.csv does not exist.
File /tmp/tmp9nkgjck9/p38/results_edges_5ns.csv does not exist.
File /tmp/tmp9nkgjck9/ptp1b/results_edges_5ns.csv does not exist.
File /tmp/tmp9nkgjck9/galectin/results_edges_5ns.csv does not exist.
File /tmp/tmp9nkgjck9/cdk2/results_edges_5ns.csv does not exist.
File /tmp/tmp9nkgjck9/mcl1/results_edges_5ns.csv does not exist.
File /tmp/tmp9nkgjck9/bace/results_edges_5ns.csv does not exist.
File /tmp/tmp9nkgjck9/bace_hunt/results_edges_5ns.csv does not exist.
File /tmp/tmp9nkgjck9/bace_p2/results_edges_5ns.csv does not exist.
File /tmp/tmp9nkgjck9/tyk2/results_edges_5ns.csv does not exist.
File /tmp/tmp9nkgjck9/ros1/results_edges_5ns.csv does not exist.
File /tmp/tmp9nkgjck9/pde10/results_edges_5ns.csv does not exist.


# Create a simple, dump null model (all activities set to 0)

In [23]:
def getNullModell(target):
    edg = edges.edgeSet(target)
    df = edg.getDF(columns=[0,1])
    df = df.rename(columns={0: "ligandA", 1: "ligandB"})
    df.index = pd.Series(['edge_' + str(lig1) + '_' + str(lig2) for lig1, lig2 in zip(df["ligandA"].values, df["ligandB"].values)])

    
    df['ligandA']= df['ligandA'].astype(str)
    df['ligandB']= df['ligandB'].astype(str)
    df['DDG'] = 0.0
    df['dDDG'] = 0.1
    df['unit'] = 'kilocalories / mole'
    return df
getNullModell('cmet').head()

Unnamed: 0,ligandA,ligandB,DDG,dDDG,unit
edge_CHEMBL3402752_30000_12_CHEMBL3402748_5300_8,CHEMBL3402752_30000_12,CHEMBL3402748_5300_8,0.0,0.1,kilocalories / mole
edge_CHEMBL3402752_30000_12_CHEMBL3402747_3400_7,CHEMBL3402752_30000_12,CHEMBL3402747_3400_7,0.0,0.1,kilocalories / mole
edge_CHEMBL3402752_30000_12_CHEMBL3402749_500_9,CHEMBL3402752_30000_12,CHEMBL3402749_500_9,0.0,0.1,kilocalories / mole
edge_CHEMBL3402752_30000_12_CHEMBL3402754_40_14,CHEMBL3402752_30000_12,CHEMBL3402754_40_14,0.0,0.1,kilocalories / mole
edge_CHEMBL3402752_30000_12_CHEMBL3402755_4200_15,CHEMBL3402752_30000_12,CHEMBL3402755_4200_15,0.0,0.1,kilocalories / mole


In [24]:
author = 'hahn'
software = 'null'
forcefield = 'null'

In [25]:
for target in targets.target_list:
    target = target["name"]
    df = getNullModell(target)
    if df is None:
        continue
    os.makedirs(os.path.join(path, targets.getTargetDir(target), results_dir), exist_ok=True)
    with open(os.path.join(path, targets.getTargetDir(target), results_dir, f'{target}_{software}_{forcefield}_{author}.yaml'), 'w') as file:
        yaml.dump(df.T.to_dict(), file)