# Inspect data and search for failed simulations

In [1]:
import os
import re
import sys

import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from  plotly import colors
import pandas as pd

from rdkit import Chem
from rdkit.Chem import Draw, PandasTools, rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D, IPythonConsole

rdDepictor.SetPreferCoordGen(True)
from IPython.display import SVG
import rdkit

from svgutils import transform as sg

from IPython.core.display import HTML
from scipy.stats import norm

from PLBenchmarks import targets, ligands, edges

from tqdm.notebook import tqdm

sys.path.append(os.path.join(os.getcwd(), '..'))
import benchmarkpl
path = benchmarkpl.__path__[0]
targets.set_data_dir(path)



_ColormakerRegistry()

# Read in data for Parsley forcefield

Function to read in data

In [2]:
from benchmarkpl import load_data

# load all data into one dataframe

In [3]:
forcefield = 'openff-1.2.0.offxml'
dfs = []
for target in tqdm(targets.target_dict):
    df = load_data.getDetailedResults(target, forcefield=forcefield)
    if df is None:
        continue
    for env in ['complex', 'water']:
        for rep in range(1,4):
            if str(rep) in df.columns.get_level_values(1):
                sub_df = df.loc[:, (env, str(rep), slice(None))].copy()
                sub_df.columns = sub_df.columns.get_level_values(2)
                sub_df['env'] = env
                sub_df['repeat'] = rep
                sub_df['target'] = target
                sub_df['edge'] = sub_df.index
                sub_df.reset_index(drop=True, inplace=True)
                dfs.append(sub_df)
all_sims = pd.concat(dfs, ignore_index=True)
all_sims.reset_index(drop=True, inplace=True)
all_sims.head()

  0%|          | 0/22 [00:00<?, ?it/s]

Unnamed: 0,val,err,aerr,conv,env,repeat,target,edge
0,,,,,complex,1,jnk1,edge_17124-1_18634-1
1,,,,,complex,1,jnk1,edge_18626-1_18624-1
2,,,,,complex,1,jnk1,edge_18636-1_18625-1
3,,,,,complex,1,jnk1,edge_18632-1_18624-1
4,,,,,complex,1,jnk1,edge_18635-1_18625-1


# Filter out simulations with run issues (nan values as results)

In [4]:
isna = all_sims.isna()
all_sims['failed'] = False
for i, row in isna.iterrows():
    if np.any(row):
        all_sims.loc[i, 'failed'] = True
print(f'There are {all_sims.loc[all_sims["failed"]].shape[0]} failed out of {all_sims.shape[0]} simulations')
all_sims.loc[all_sims['failed']]

There are 2960 failed out of 6264 simulations


Unnamed: 0,val,err,aerr,conv,env,repeat,target,edge,failed
0,,,,,complex,1,jnk1,edge_17124-1_18634-1,True
1,,,,,complex,1,jnk1,edge_18626-1_18624-1,True
2,,,,,complex,1,jnk1,edge_18636-1_18625-1,True
3,,,,,complex,1,jnk1,edge_18632-1_18624-1,True
4,,,,,complex,1,jnk1,edge_18635-1_18625-1,True
...,...,...,...,...,...,...,...,...,...
4957,,,,,water,3,pde10,edge_8414_5644,True
4958,,,,,water,3,pde10,edge_5644_9211,True
4959,,,,,water,3,pde10,edge_5644_0309,True
4960,,,,,water,3,pde10,edge_5670_9211,True


In [5]:
unique_failed_edges = {}

for i, row in tqdm(all_sims.loc[all_sims['failed']].iterrows()):
    target = row['target']
    edge = row['edge']
    env = row['env']
    repeat= row ['repeat']
    if not target in unique_failed_edges:
        unique_failed_edges[target] = {}
    if not edge in unique_failed_edges[target]:
        unique_failed_edges[target][edge] = {}
    if not env in unique_failed_edges[target][edge]:
        unique_failed_edges[target][edge][env] = []
    unique_failed_edges[target][edge][env].append(repeat)
unique_failed_edges

0it [00:00, ?it/s]

{'jnk1': {'edge_17124-1_18634-1': {'complex': [1, 2, 3], 'water': [1, 2, 3]},
  'edge_18626-1_18624-1': {'complex': [1, 2, 3], 'water': [1, 2, 3]},
  'edge_18636-1_18625-1': {'complex': [1, 2, 3], 'water': [1, 2, 3]},
  'edge_18632-1_18624-1': {'complex': [1, 2, 3], 'water': [1, 2, 3]},
  'edge_18635-1_18625-1': {'complex': [1, 2, 3], 'water': [1, 2, 3]},
  'edge_18626-1_18658-1': {'complex': [1, 2, 3], 'water': [1, 2, 3]},
  'edge_18639-1_18658-1': {'complex': [1, 2, 3], 'water': [1, 2, 3]},
  'edge_18626-1_18625-1': {'complex': [1, 2, 3], 'water': [1, 2, 3]},
  'edge_18638-1_18658-1': {'complex': [1, 2, 3], 'water': [1, 2, 3]},
  'edge_18628-1_18624-1': {'complex': [1, 2, 3], 'water': [1, 2, 3]},
  'edge_18631-1_18660-1': {'complex': [1, 2, 3], 'water': [1, 2, 3]},
  'edge_18638-1_18634-1': {'complex': [1, 2, 3], 'water': [1, 2, 3]},
  'edge_18626-1_18632-1': {'complex': [1, 2, 3], 'water': [1, 2, 3]},
  'edge_18626-1_18630-1': {'complex': [1, 2, 3], 'water': [1, 2, 3]},
  'edge_1863

In [6]:
from benchmarkpl import drawing

d2ds = []
if os.path.exists('../../../02_benchmark_calculations/'):
    targets.set_data_dir('../../../02_benchmark_calculations/')
for target in unique_failed_edges.keys():
    eSet = edges.EdgeSet(target)
    for edge in unique_failed_edges[target].keys():
        print(target)
        df = eSet[edge].get_dataframe()
        df['target'] = target
        
        text = ''
        for env in unique_failed_edges[target][edge].keys():
            text+=(f'{env} (repeats {", ".join([str(rep) for rep in unique_failed_edges[target][edge][env]])}), ')
        text += f'DDG_exp = {df["exp. DeltaG [kcal/mol]"].magnitude}'\
            f' ({df["exp. Error [kcal/mol]"].magnitude}) kcal/mol'
        
        # check whether image exists
        os.makedirs(os.path.join(path, targets.get_target_dir(target), '11_failed'), exist_ok=True)
        file_path = os.path.join(path, targets.get_target_dir(target), '11_failed', f'{edge}.svg')
        if os.path.exists(file_path):
            with open(file_path, 'r') as file:
                img = file.read()
        else:
            # visualization
            m1 = Chem.SDMolSupplier(f'{targets.data_path}/{targets.get_target_dir(target)}/02_ligands/lig_{df[0]}/crd/lig_{df[0]}.sdf', removeHs=False)[0]
            m2 = Chem.SDMolSupplier(f'{targets.data_path}/{targets.get_target_dir(target)}/02_ligands/lig_{df[1]}/crd/lig_{df[1]}.sdf', removeHs=False)[0]
            pairs = np.loadtxt(f'{targets.data_path}/{targets.get_target_dir(target)}/03_hybrid/edge_{df[0]}_{df[1]}/water/crd/pairs.dat')
            # decrement pairs to match rdkit counting from 0!
            pairs -= 1
            img = drawing.drawPerturbation(m1, # rdkit molecule 1
                                           m2, # rdkit molecule 2
                                           pairs, # pairs, np array or list of lists
                                           target=target, # string with target name
                                           n1=df[0], # name mol 1
                                           n2=df[1], # name  mol 2
                                           text=text# additional text
                                  )

            with open(file_path, 'w') as file:
                file.write(img)
        #df['img'] = drawPerturbation(m1, m2, pairs, target=t, n1=df[0], n2=df[1]).GetDrawingText()
        d2ds.append(img)

HTML(''.join(d2ds))

jnk1
jnk1
jnk1
jnk1
jnk1
jnk1
jnk1
jnk1
jnk1
jnk1
jnk1
jnk1
jnk1
jnk1
jnk1
jnk1
jnk1
jnk1
jnk1
jnk1
jnk1
jnk1
jnk1
jnk1
jnk1
jnk1
jnk1
jnk1
jnk1
jnk1
jnk1
pde2
pde2
pde2
pde2
pde2
pde2
pde2
pde2
pde2
pde2
pde2
pde2
pde2
pde2
pde2
pde2
pde2
pde2
pde2
pde2
pde2
pde2
pde2
pde2
pde2
pde2
pde2
pde2
pde2
pde2
pde2
pde2
pde2
pde2
thrombin
thrombin
thrombin
thrombin
thrombin
thrombin
thrombin
thrombin
thrombin
thrombin
thrombin
thrombin
thrombin
thrombin
thrombin
thrombin
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
p38
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
ptp1b
galectin
gale

KeyError: 'edge_3806-mvEster_0340'

In [None]:
#HTML(nanEdges.to_html())

In [None]:
#save failed edges to separate csv file and remove them from dataframe
all_sims.loc[all_sims['failed']].to_csv(f'failed_simulations_{forcefield}.csv')
all_sims = all_sims.drop(labels=all_sims.loc[all_sims['failed']].index, axis=0)
all_sims.head()

In [None]:
# save finished simulations
all_sims.to_csv(f'finished_simulations_{forcefield}.csv')

# Look at convergence criteria and set a criterion

In [None]:
# Extract convergence 
conv_thres = 0.8
all_sims['bConv'] = all_sims['conv'] < conv_thres
print(f'Simulations converged (Convergence < {conv_thres}):\n\
in water: {all_sims["bConv"].loc[all_sims["env"]=="water"].sum()} \
out of {all_sims.loc[all_sims["env"]=="water"].shape[0]}\n\
in complex: {all_sims["bConv"].loc[all_sims["env"]=="complex"].sum()} \
out of {all_sims.loc[all_sims["env"]=="complex"].shape[0]} simulations.')

In [None]:
# Extract convergence
err_thres = 1.0
all_sims['bErr'] = (all_sims['err'] < err_thres).values
print(f'Simulations converged (Bootstrap error < {err_thres}):\n\
in water: {all_sims["bErr"].loc[all_sims["env"]=="water"].sum()} \
out of {all_sims.loc[all_sims["env"]=="water"].shape[0]}\n\
in complex: {all_sims["bErr"].loc[all_sims["env"]=="complex"].sum()} \
out of {all_sims.loc[all_sims["env"]=="complex"].shape[0]} simulations.')

In [None]:
# Extract convergence
aerr_thres = 1.0
all_sims['bAerr'] = (all_sims['aerr'] < aerr_thres).values
print(f'Simulations converged (Analytical error < {aerr_thres}):\n\
in water: {all_sims["bAerr"].loc[all_sims["env"]=="water"].sum()} \
out of {all_sims.loc[all_sims["env"]=="water"].shape[0]}\n\
in complex: {all_sims["bAerr"].loc[all_sims["env"]=="complex"].sum()} \
out of {all_sims.loc[all_sims["env"]=="complex"].shape[0]} simulations.')

In [None]:
all_sims['include'] = all_sims['bAerr'] & all_sims['bConv']
print(f'Included simulations:\n\
in water: {all_sims["include"].loc[all_sims["env"]=="water"].sum()} \
out of {all_sims.loc[all_sims["env"]=="water"].shape[0]}\n\
in complex: {all_sims["include"].loc[all_sims["env"]=="complex"].sum()} \
out of {all_sims.loc[all_sims["env"]=="complex"].shape[0]} simulations.')

In [None]:
import itertools
import plotly
cols = plotly.colors.DEFAULT_PLOTLY_COLORS

fig = make_subplots(rows=2, cols=2, shared_xaxes=True, shared_yaxes=True)

# Add traces
col = 0
for env, rep in itertools.product(['water', 'complex'], ['1', '2', '3']):
    idx = np.logical_and(all_sims['env']==env, all_sims['repeat'] == int(rep))
    conv = all_sims.loc[idx, 'conv']
    aerr = all_sims.loc[idx, 'aerr']
    err = all_sims.loc[idx, 'err']
    text = [f'{row["target"]}:{row["edge"]}' for i, row in all_sims.loc[idx].iterrows()]
    if env == 'water':
        col=0
    else:
        col=1
    fig.add_trace(go.Scatter(x=conv, y=aerr,
                    mode='markers',
                    hovertext=text,
                    name=f'{env}',
                    opacity=.8,
                    marker_color=cols[col%6]), 
                 col=1,
                 row=1)
    fig.add_trace(go.Scatter(x=aerr, y=err,
                    mode='markers',
                    hovertext=text,
                    name=f'{env}',
                    marker_color=cols[col%6],
                    opacity=.8,
                    showlegend=False), 
                 col=2,
                 row=2)
    fig.add_trace(go.Scatter(x=conv, y=err,
                    mode='markers',
                    hovertext=text,
                    name=f'{env}',
                    opacity=.8,
                    marker_color=cols[col%6],
                    showlegend=False), 
                 col=1,
                 row=2)
fig.add_trace(go.Scatter(x=all_sims.loc[np.invert(all_sims['include']), 'conv'], 
                         y=all_sims.loc[np.invert(all_sims['include']), 'aerr'],
            mode='markers',
            name=f'not converged',
            opacity=.8,
                         marker_size=3,
            marker_color='black',
            showlegend=False), 
         col=1,
         row=1)
fig.add_trace(go.Scatter(x=all_sims.loc[np.invert(all_sims['include']), 'conv'], 
                         y=all_sims.loc[np.invert(all_sims['include']), 'err'],
            mode='markers',
            name=f'not converged',
            opacity=.8,
                         marker_size=3,
            marker_color='black',
            showlegend=False), 
         col=1,
         row=2)
fig.add_trace(go.Scatter(x=all_sims.loc[np.invert(all_sims['include']), 'aerr'], 
                         y=all_sims.loc[np.invert(all_sims['include']), 'err'],
            mode='markers',
            name=f'not converged',
            opacity=.8,
                         marker_size=3,
            marker_color='black',
            showlegend=False), 
         col=2,
         row=2)
fig.update_layout(
    yaxis3 = dict(title='Bootstrap Error [kcal mol<sup>-1</sup>]'),
    xaxis3=dict(title='Convergence'), 
    yaxis=dict(range=[0,5], title='Analytical Error [kcal mol<sup>-1</sup>]'),
    xaxis4=dict(range=[0,5], title='Analytical Error [kcal mol<sup>-1</sup>]'),)
fig.show()

In [None]:
unique_nonconverged_edges = {}

for i, row in tqdm(all_sims.loc[np.invert(all_sims['include'])].iterrows()):
    target = row['target']
    edge = row['edge']
    env = row['env']
    repeat= row ['repeat']
    if not target in unique_nonconverged_edges:
        unique_nonconverged_edges[target] = {}
    if not edge in unique_nonconverged_edges[target]:
        unique_nonconverged_edges[target][edge] = {}
    if not env in unique_nonconverged_edges[target][edge]:
        unique_nonconverged_edges[target][edge][env] = []
    unique_nonconverged_edges[target][edge][env].append(repeat)
unique_nonconverged_edges

In [None]:
d2ds = []
if os.path.exists('../../../02_benchmark_calculations/'):
    targets.set_data_dir('../../../02_benchmark_calculations/')
for target in unique_nonconverged_edges.keys():
    eSet = edges.EdgeSet(target)
    for edge in unique_nonconverged_edges[target].keys():
        df = eSet[edge].get_dataframe()
        df['target'] = target
        
        text = ''
        for env in unique_nonconverged_edges[target][edge].keys():
            text+=(f'{env} (repeats {", ".join([str(rep) for rep in unique_nonconverged_edges[target][edge][env]])}), ')
        text += f'DDG_exp = {df["exp. DeltaG [kcal/mol]"].magnitude}'\
            f' ({df["exp. Error [kcal/mol]"].magnitude}) kcal/mol'
        
        # check whether image exists
        os.makedirs(os.path.join(path, targets.get_target_dir(target), '12_not_converged'), exist_ok=True)
        file_path = os.path.join(path, targets.get_target_dir(target), '12_not_converged', f'{edge}.svg')
        if os.path.exists(file_path):
            with open(file_path, 'r') as file:
                img = file.read()
        else:
            # visualization
            m1 = Chem.SDMolSupplier(f'{targets.data_path}/{targets.get_target_dir(target)}/02_ligands/lig_{df[0]}/crd/lig_{df[0]}.sdf', removeHs=False)[0]
            m2 = Chem.SDMolSupplier(f'{targets.data_path}/{targets.get_target_dir(target)}/02_ligands/lig_{df[1]}/crd/lig_{df[1]}.sdf', removeHs=False)[0]
            pairs = np.loadtxt(f'{targets.data_path}/{targets.get_target_dir(target)}/03_hybrid/edge_{df[0]}_{df[1]}/water/crd/pairs.dat')
            # decrement pairs to match rdkit counting from 0!
            pairs -= 1
            img = drawing.drawPerturbation(m1, # rdkit molecule 1
                                   m2, # rdkit molecule 2
                                   pairs, # pairs, np array or list of lists
                                   target=target, # string with target name
                                   n1=df[0], # name mol 1
                                   n2=df[1], # name  mol 2
                                   text=text# additional text
                                  )

            with open(file_path, 'w') as file:
                file.write(img)
        #df['img'] = drawPerturbation(m1, m2, pairs, target=t, n1=df[0], n2=df[1]).GetDrawingText()
        d2ds.append(img)

HTML(''.join(d2ds))

# Filter out non-converged simulations

In [None]:
# remove non-converged simulations and save converged simulations to file
all_sims = all_sims.drop(labels=all_sims.loc[np.invert(all_sims['include'])].index, axis=0)
all_sims = all_sims.drop(labels=['include', 'failed', 'bConv', 'bErr', 'bAerr'], axis=1)
all_sims.to_csv(f'converged_simulations_{forcefield}.csv')
all_sims.head()