# Draw Venn plots of successes/outliers compared to experimental values

In [1]:
import os
import sys
sys.path.append(os.path.join(os.getcwd(), '..'))

import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from  plotly import colors
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True) 
import pandas as pd
import yaml
import pint
unit_registry = pint.UnitRegistry()

from tqdm.notebook import tqdm

from PLBenchmarks import targets, ligands, edges


import benchmarkpl
path = benchmarkpl.__path__[0]
targets.set_data_dir(path)
results_dir = '10_results'



# Read in data

In [2]:
all_edges = pd.read_csv("../03_comparison_experiment/03a_all_edges_all_ffs.csv", index_col=0)
identifiers = [idx[4:] for idx in all_edges if idx.startswith("DDG")]
all_edges.head()

Unnamed: 0,target,edge,ligandA,ligandB,unit,DDG_Exp.,dDDG_Exp.,DDG_OpenFF-1.0,dDDG_OpenFF-1.0,DDG_OpenFF-1.0_converged,...,error_GAFF2,abserror_GAFF2,error_cGenFF,abserror_cGenFF,error_Consensus_OpenFF_GAFF2_cGenFF,abserror_Consensus_OpenFF_GAFF2_cGenFF,error_Consensus_OpenFF_GAFF2,abserror_Consensus_OpenFF_GAFF2,error_Consensus_all,abserror_Consensus_all
jnk1_edge_17124-1_18631-1,jnk1,edge_17124-1_18631-1,17124-1,18631-1,kilocalories / mole,0.26,0.37,1.19,0.096086,1.19,...,1.071262,1.071262,0.516769,0.516769,0.646112,0.646112,0.784876,0.784876,1.222263,1.222263
jnk1_edge_17124-1_18634-1,jnk1,edge_17124-1_18634-1,17124-1,18634-1,kilocalories / mole,-0.33,0.29,0.58,0.128639,0.58,...,0.829522,0.829522,0.580956,0.580956,0.852556,0.852556,0.928604,0.928604,0.798413,0.798413
jnk1_edge_18626-1_18624-1,jnk1,edge_18626-1_18624-1,18626-1,18624-1,kilocalories / mole,0.38,0.21,0.556667,0.099301,0.556667,...,0.745717,0.745717,-0.265277,0.265277,0.309516,0.309516,0.616033,0.616033,0.446727,0.446727
jnk1_edge_18626-1_18625-1,jnk1,edge_18626-1_18625-1,18626-1,18625-1,kilocalories / mole,0.77,0.21,-0.03,0.107462,-0.03,...,-0.062543,0.062543,-0.294379,0.294379,-0.388337,0.388337,-0.155679,0.155679,0.143932,0.143932
jnk1_edge_18626-1_18627-1,jnk1,edge_18626-1_18627-1,18626-1,18627-1,kilocalories / mole,0.39,0.22,0.14,0.046151,0.14,...,0.0426,0.0426,-0.232256,0.232256,-0.020344,0.020344,0.064101,0.064101,-0.12406,0.12406


In [3]:
for idx in identifiers[1:]:
    all_edges[f'error_{idx}'] = all_edges[f'DDG_{idx}'] - all_edges['DDG_Exp.']
    all_edges[f'abserror_{idx}'] = all_edges[f'error_{idx}'].abs()

In [4]:
def get_inliers(threshold):
    inliers = all_edges.filter(items=[f'abserror_{idx}' for idx in identifiers[1:]])
    for idx in identifiers[1:]:
        inliers.loc[:, f'inlier_{idx}'] = inliers.loc[:, f'abserror_{idx}'] <= threshold
        
        inliers.loc[inliers.loc[:, f'abserror_{idx}'].isna(), f'inlier_{idx}'] = np.nan
    inliers = inliers.filter(items=[f'inlier_{idx}' for idx in identifiers[1:]])
    return inliers
inliers = get_inliers(3)
inliers.isna().sum(), (inliers == 1).sum(), (inliers == 0).sum()

(inlier_OpenFF-1.0                         0
 inlier_OpenFF-1.0_converged             260
 inlier_OpenFF-1.2                       577
 inlier_OpenFF-1.2_converged             731
 inlier_OpenFF-2.0                         0
 inlier_OpenFF-2.0_converged             278
 inlier_OPLS3e                            12
 inlier_GAFF2                              0
 inlier_cGenFF                             0
 inlier_Consensus_OpenFF_GAFF2_cGenFF      0
 inlier_Consensus_OpenFF_GAFF2             0
 inlier_Consensus_all                     12
 dtype: int64,
 inlier_OpenFF-1.0                       1043
 inlier_OpenFF-1.0_converged              825
 inlier_OpenFF-1.2                        492
 inlier_OpenFF-1.2_converged              367
 inlier_OpenFF-2.0                       1049
 inlier_OpenFF-2.0_converged              821
 inlier_OPLS3e                           1084
 inlier_GAFF2                            1051
 inlier_cGenFF                           1019
 inlier_Consensus_OpenFF_GAFF2_

In [5]:
def get_outliers(threshold):
    outliers = all_edges.filter(items=[f'abserror_{idx}' for idx in identifiers[1:]])
    for idx in identifiers[1:]:
        outliers.loc[:, f'outlier_{idx}'] = outliers.loc[:, f'abserror_{idx}'] > threshold
        
        outliers.loc[outliers.loc[:, f'abserror_{idx}'].isna(), f'outlier_{idx}'] = np.nan
    outliers = outliers.filter(items=[f'outlier_{idx}' for idx in identifiers[1:]])
    return outliers
outliers = get_outliers(3)
outliers.isna().sum(), (outliers == 1).sum(), (outliers == 0).sum()

(outlier_OpenFF-1.0                         0
 outlier_OpenFF-1.0_converged             260
 outlier_OpenFF-1.2                       577
 outlier_OpenFF-1.2_converged             731
 outlier_OpenFF-2.0                         0
 outlier_OpenFF-2.0_converged             278
 outlier_OPLS3e                            12
 outlier_GAFF2                              0
 outlier_cGenFF                             0
 outlier_Consensus_OpenFF_GAFF2_cGenFF      0
 outlier_Consensus_OpenFF_GAFF2             0
 outlier_Consensus_all                     12
 dtype: int64,
 outlier_OpenFF-1.0                        85
 outlier_OpenFF-1.0_converged              43
 outlier_OpenFF-1.2                        59
 outlier_OpenFF-1.2_converged              30
 outlier_OpenFF-2.0                        79
 outlier_OpenFF-2.0_converged              29
 outlier_OPLS3e                            32
 outlier_GAFF2                             77
 outlier_cGenFF                           109
 outlier_Consensus_

In [6]:
def get_overlap(dataframe, idx1, idx2, idx3, which='outlier'):
    # following order: (100, 010, 110, 001, 101, 011, 111)
    ynn = []
    nyn = []
    nny = []
    yyn = []
    yny = []
    nyy = []
    for i, row in dataframe.iterrows():
        row[row.isna()] = False
        ynn.append(    row[f'{which}_{idx1}'] and 
                   not row[f'{which}_{idx2}'] and 
                   not row[f'{which}_{idx3}'])
        nyn.append(not row[f'{which}_{idx1}'] and 
                       row[f'{which}_{idx2}'] and 
                   not row[f'{which}_{idx3}'])
        nny.append(not row[f'{which}_{idx1}'] and 
                   not row[f'{which}_{idx2}'] and 
                       row[f'{which}_{idx3}'])
        yyn.append(    row[f'{which}_{idx1}'] and 
                       row[f'{which}_{idx2}'] and 
                   not row[f'{which}_{idx3}'])
        yny.append(    row[f'{which}_{idx1}'] and 
                   not row[f'{which}_{idx2}'] and 
                       row[f'{which}_{idx3}'])
        nyy.append(not row[f'{which}_{idx1}'] and 
                       row[f'{which}_{idx2}'] and 
                       row[f'{which}_{idx3}'])
    overlap = []
    sizes = [dataframe[dataframe[f'{which}_{idx}']==1].shape[0]
             for idx in [idx1, idx2, idx3]
            ]
    for n in [ynn, nyn, yyn, nny, yny, nyy]:
        overlap.append(np.sum(n))
    overlap.append(sizes[0]-np.sum(ynn)-np.sum(yyn)-np.sum(yny))
    overlap.append(sizes[1]-np.sum(nyn)-np.sum(yyn)-np.sum(nyy))
    overlap.append(sizes[2]-np.sum(nny)-np.sum(nyy)-np.sum(yny))
    overlap.append(sizes[0])
    overlap.append(sizes[1])
    overlap.append(sizes[2])
    return overlap

In [7]:
from matplotlib_venn import _venn3
def venn_plot(idx1, idx2, idx3, threshold, which='outlier'):
    labels = [idx1, idx2, idx3]
    if which=='outlier':
        numbers = get_outliers(threshold)
        color_number = 6
        title = f'Outliers with Δ(ΔΔG) > {threshold} kcal mol<sup>-1</sup>'
    elif which=='inlier':
        numbers = get_inliers(threshold)
        color_number = 0
        title = f'Successes with Δ(ΔΔG) <= {threshold} kcal mol<sup>-1</sup>'
    else:
        raise ValueError(f'{which} argument not known.')
    
    numbers = numbers[[f'{which}_{i}' for i in np.unique([idx1, idx2, idx3])]]
    numbers = numbers[~numbers.isna().any(axis=1)]
    overlap = get_overlap(numbers, idx1, idx2, idx3, which=which)
    sizes = overlap[-3:]
    sim_sizes = [
        numbers.shape[0] - numbers[f'{which}_{idx}'].isna().sum() for idx in [idx1, idx2, idx3]
    ]
    areas = _venn3.compute_venn3_areas(overlap[:7])
    centers, radii = _venn3.solve_venn3_circles(areas)
    regions = _venn3.compute_venn3_regions(centers, radii)
    label_positions = np.array([centers[0] + np.array([-radii[0] / 2, radii[0]])*1.1,
                                centers[1] + np.array([radii[1] / 2, radii[1]])*1.1,
                                centers[2] + np.array([0.0, -radii[2] * 1.1])])
    #colors = _venn3.compute_venn3_colors(set_colors)
    
    subset_positions = np.array([r.label_position() for r in regions])
    subset_labels = [f'{int(s):d}' for s in overlap[:7]]
    
    fig = go.Figure()
    colorway=colors.qualitative.Safe + colors.qualitative.Vivid
    # Create scatter trace of text labels
    fig.add_trace(go.Scatter(
        x=label_positions[:,0],
        y=label_positions[:,1],
        text=[f'{l} ({s}/{a})' for l, s, a in zip(labels, sizes, sim_sizes)],
        mode="text",
        textfont=dict(
            color="black",
            size=18
        )
    ))
    fig.add_trace(go.Scatter(
        x=subset_positions[:,0],
        y=subset_positions[:,1],
        text=subset_labels,
        mode="text",
        textfont=dict(
            color="black",
            size=18
        )
    ))
    # Update axes properties
    fig.update_xaxes(
        showticklabels=False,
        showgrid=False,
        zeroline=False,
    )

    fig.update_yaxes(
        showticklabels=False,
        showgrid=False,
        zeroline=False,
    )

    # Add circles
    for i in range(3):
        fig.add_shape(
                type="circle",
                fillcolor=colors.qualitative.Prism[color_number+i],
                x0=centers[i][0]-radii[i],
                y0=centers[i][1]-radii[i],
                x1=centers[i][0]+radii[i],
                y1=centers[i][1]+radii[i],
                line_color=colors.qualitative.Prism[color_number+i]
            )
    fig.update_shapes(dict(
        opacity=0.5,
        xref="x",
        yref="y",
        layer="below"
    ))
    axlim = (np.max(np.fabs(centers)) + np.max(np.fabs(radii)))*1.2
    # Update figure dimensions
    fig.update_layout(
        title={
            'text': title,
            'y':0.95,
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': {'size': 24}},
        margin=dict(
            l=30,
            r=30,
            b=30,
            t=30
        ),
        xaxis=dict(range=[-axlim, axlim]),
        yaxis=dict(range=[-axlim, axlim]),
        height=800,
        width=800,
        plot_bgcolor="white",
        showlegend=False
    )
    fig.write_image(f'04g_venn_{idx1}_{idx2}_{idx3}_{threshold}_{which}.svg')
    return fig

In the following interactive cell, a Venn plot is created. The three calculated sets can be chosen in the dropdown menus of `idx1`, `idx2` and `idx3`. A `threshold` (kcal/mol) can be chosen next and whether you want to see the successes/inliers or the outliers. The Venn plot is based on the comparison between calculated set and experimental values of edges/relative free energies (DDG values). The sizes of the different fields are written into the fields. The numbers in brackets behind the force field name are the number of successes or outliers and the total number of available simulations. 

In [8]:
from ipywidgets import widgets, interact
out = interact(venn_plot, idx1=identifiers[1:], idx2=identifiers[1:], idx3=identifiers[1:], threshold=np.arange(0, 5, 0.5), which=['inlier', 'outlier'])

interactive(children=(Dropdown(description='idx1', options=('OpenFF-1.0', 'OpenFF-1.0_converged', 'OpenFF-1.2'…

In [9]:
from rdkit import Chem
from rdkit.Chem import Draw, PandasTools, rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D, IPythonConsole

def get_outliers(threshold):
    outliers = all_edges.copy()
    for idx in identifiers[1:]:
        outliers.loc[:, f'outlier_{idx}'] = outliers.loc[:, f'abserror_{idx}'] > threshold
        
        outliers.loc[outliers.loc[:, f'abserror_{idx}'].isna(), f'outlier_{idx}'] = np.nan
    return outliers

In [10]:
def make_html(edges_df, sets=identifiers):
    columns = ['Perturbation']
    for i, idx in enumerate(sets):
        name = idx
        edges_df.loc[:,f'DDG {name}'] = [
            f'{np.round(row[f"DDG_{idx}"], 1)} ({np.round(row[f"dDDG_{idx}"], 1)})' 
            for i, row in edges_df.iterrows()
        ] 
        columns.append(f'DDG {name}')
    edges_df = edges_df[columns]
    edges_df.reset_index(inplace=True, drop=True)
    html = edges_df.to_html(escape=False, float_format=lambda x: f'{x:.1f}')
    return html

In [11]:
def get_overlap_edges(dataframe, idx1, idx2, idx3, which='outlier'):
    # following order: (100, 010, 110, 001, 101, 011, 111)
    ynn = []
    nyn = []
    nny = []
    yyn = []
    yny = []
    nyy = []
    yyy = []
    nnn = []
    for i, row in dataframe.iterrows():
        row[row.isna()] = False
        ynn.append(    row[f'{which}_{idx1}'] and 
                   not row[f'{which}_{idx2}'] and 
                   not row[f'{which}_{idx3}'])
        nyn.append(not row[f'{which}_{idx1}'] and 
                       row[f'{which}_{idx2}'] and 
                   not row[f'{which}_{idx3}'])
        nny.append(not row[f'{which}_{idx1}'] and 
                   not row[f'{which}_{idx2}'] and 
                       row[f'{which}_{idx3}'])
        yyn.append(    row[f'{which}_{idx1}'] and 
                       row[f'{which}_{idx2}'] and 
                   not row[f'{which}_{idx3}'])
        yny.append(    row[f'{which}_{idx1}'] and 
                   not row[f'{which}_{idx2}'] and 
                       row[f'{which}_{idx3}'])
        nyy.append(not row[f'{which}_{idx1}'] and 
                       row[f'{which}_{idx2}'] and 
                       row[f'{which}_{idx3}'])
        yyy.append(    row[f'{which}_{idx1}'] and 
                       row[f'{which}_{idx2}'] and 
                       row[f'{which}_{idx3}'])
        nnn.append(not row[f'{which}_{idx1}'] and 
                   not row[f'{which}_{idx2}'] and 
                   not row[f'{which}_{idx3}'])
    overlap = {}
    for name, n in zip(['ynn', 'nyn', 'yyn', 'nny', 'yny', 'nyy', 'yyy', 'nnn'], [ynn, nyn, yyn, nny, yny, nyy, yyy, nnn]):
        n = np.array(n).astype(bool)
        overlap[name] = dataframe.loc[n]
    return overlap

In [12]:
def create_perturbation_visualization(df, text='', img_size=('400px', '200px'), directory='13_outliers', redraw=False):
    import benchmarkpl
    path = benchmarkpl.__path__[0]
    # check whether image exists
    os.makedirs(os.path.join(path, targets.get_target_dir(df["target"]), directory), exist_ok=True)
    file_path = os.path.join(path, targets.get_target_dir(df["target"]), directory, f'{df["edge"]}.svg')
    if not redraw and os.path.exists(file_path):
        with open(file_path, 'r') as file:
            img = file.read()
    else:
        # visualization
        target_path = f'{targets.data_path}/{targets.get_target_dir(df["target"])}'
        m1 = Chem.SDMolSupplier(
            f'{target_path}/02_ligands/lig_{df["ligandA"]}/crd/lig_{df["ligandA"]}.sdf', 
            removeHs=False)[0]
        m2 = Chem.SDMolSupplier(
            f'{target_path}/02_ligands/lig_{df["ligandB"]}/crd/lig_{df["ligandB"]}.sdf', 
            removeHs=False)[0]
        pairs = np.loadtxt(
            f'{target_path}/03_hybrid/edge_{df["ligandA"]}_{df["ligandB"]}/water/crd/pairs.dat'
        )
        # decrement pairs to match rdkit counting from 0!
        pairs -= 1
        
        img = drawing.drawPerturbationBare(m1, # rdkit molecule 1
                                       m2, # rdkit molecule 2
                                       pairs, # pairs, np array or list of lists
                                       target=df["target"], # string with target name
                                       n1=df["ligandA"], # name mol 1
                                       n2=df["ligandB"], # name  mol 2
                                       text=text # additional text
                                      )
        
        with open(file_path, 'w') as file:
            file.write(img)
    original = sg.fromstring(img)
    original.set_size(img_size)
    svgstring = original.to_str().decode("utf-8").rstrip()
    svgstring = '\n'.join(svgstring.split('\n')[1:])
    return svgstring

In [13]:
from matplotlib_venn import _venn3
from benchmarkpl import drawing
from svgutils import transform as sg

from IPython.core.display import HTML

def get_edges(idx1, idx2, idx3, threshold, subset='yyy', which='outlier'):
    labels = [idx1, idx2, idx3]
    if which=='outlier':
        numbers = get_outliers(threshold)
        title = f'Outliers with Δ(ΔΔG) > {threshold} kcal mol<sup>-1</sup>'
    elif which=='inlier':
        numbers = get_inliers(threshold)
        title = f'Successes with Δ(ΔΔG) <= {threshold} kcal mol<sup>-1</sup>'
    else:
        raise ValueError(f'{which} argument not known.')
        
    overlap_edges = get_overlap_edges(numbers, idx1, idx2, idx3, which=which)
    if os.path.exists('../../../02_benchmark_calculations/'):
        targets.set_data_dir('../../../02_benchmark_calculations/')
    for i, row in overlap_edges[subset].iterrows():    
        text = ''
        svgstring = create_perturbation_visualization(row, 
                                                     text=text, 
                                                      img_size=('400px', '200px'), 
                                                      directory='14_venn',
                                                     redraw=True)
        overlap_edges[subset].loc[i, "Perturbation"] = svgstring
    targets.set_data_dir('../benchmarkpl/')
    return HTML(make_html(overlap_edges[subset]))
edges_df = get_edges('OpenFF-2.0', 'GAFF2', 'OPLS3e', threshold=2, subset='yyy')
edges_df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Unnamed: 0,Perturbation,DDG Exp.,DDG OpenFF-1.0,DDG OpenFF-1.0_converged,DDG OpenFF-1.2,DDG OpenFF-1.2_converged,DDG OpenFF-2.0,DDG OpenFF-2.0_converged,DDG OPLS3e,DDG GAFF2,DDG cGenFF,DDG Consensus_OpenFF_GAFF2_cGenFF,DDG Consensus_OpenFF_GAFF2,DDG Consensus_all
0,\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\npde2: 49072088 -> 482712494907208848271249,-2.1 (0.2),1.6 (0.4),1.6 (0.5),nan (nan),nan (nan),0.2 (0.4),0.2 (0.4),0.6 (0.5),1.3 (0.5),1.4 (0.4),1.7 (0.3),1.6 (0.6),0.9 (0.2)
1,\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\ncmet: CHEMBL3402743_42 -> CHEMBL3402758_10CHEMBL3402743_42CHEMBL3402758_10,-0.8 (0.4),-4.2 (1.4),nan (nan),-1.2 (0.9),nan (nan),-3.3 (1.6),nan (nan),-4.6 (0.6),-3.8 (1.5),-4.0 (0.9),-3.2 (1.0),-2.9 (1.1),-3.9 (0.6)
2,\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\ncmet: CHEMBL3402743_42 -> CHEMBL3402762_1CHEMBL3402743_42CHEMBL3402762_1,-2.2 (0.4),-3.4 (1.1),nan (nan),-4.0 (1.2),nan (nan),-4.3 (0.8),nan (nan),-5.9 (0.7),-7.7 (2.1),-3.7 (1.8),-5.3 (1.3),-5.0 (2.0),-5.4 (0.7)
3,\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\ncmet: CHEMBL3402747_3400_7 -> CHEMBL3402755_4200_15CHEMBL3402747_3400_7CHEMBL3402755_4200_15,0.1 (0.4),-2.9 (0.3),-2.9 (0.3),-2.8 (0.2),-2.8 (0.2),-2.6 (0.1),-2.6 (0.1),-3.1 (0.1),-2.5 (0.1),-1.1 (0.2),-2.1 (0.5),-2.6 (0.1),-2.3 (0.1)
4,\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nmcl1: 48 -> 274827,0.5 (0.1),-1.2 (0.5),-1.2 (0.5),nan (nan),nan (nan),-2.9 (0.5),-2.9 (0.5),-2.2 (0.2),-2.2 (0.2),-2.0 (0.8),-2.9 (0.3),-2.6 (0.2),-2.4 (0.2)
5,\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nmcl1: 67 -> 316731,-0.3 (0.0),3.7 (0.3),3.7 (0.3),nan (nan),nan (nan),2.1 (0.2),nan (nan),2.6 (0.7),3.5 (0.5),3.1 (0.4),2.6 (0.5),2.8 (0.7),2.8 (0.2)
6,\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nmcl1: 67 -> 356735,-1.2 (0.2),3.1 (0.2),3.1 (0.2),nan (nan),nan (nan),1.9 (0.4),1.9 (0.4),1.2 (0.1),2.5 (0.6),2.8 (0.1),1.9 (0.5),1.5 (0.2),2.1 (0.2)
7,\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nmcl1: 67 -> 506750,-1.8 (0.2),0.4 (0.3),0.4 (0.3),nan (nan),nan (nan),1.0 (0.1),1.0 (0.1),0.9 (0.2),2.0 (0.4),1.3 (0.3),1.7 (0.3),1.6 (0.5),1.3 (0.1)
8,\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nmcl1: 67 -> 526752,-1.6 (0.1),0.9 (0.3),0.9 (0.3),nan (nan),nan (nan),0.7 (0.1),0.7 (0.1),1.2 (1.8),1.0 (0.1),1.3 (0.1),1.0 (0.2),0.8 (0.3),1.0 (0.5)
9,\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nbace: CAT-13k -> CAT-4dCAT-13kCAT-4d,0.6 (0.4),-0.6 (1.6),nan (nan),nan (nan),nan (nan),-2.0 (0.7),-2.0 (0.7),3.1 (0.2),-1.6 (0.4),-1.4 (0.5),-1.8 (0.4),-1.8 (0.7),-0.4 (0.3)


# Filter based on common outliers in all force fields

In [14]:
def filter_common(dataframe, sets, threshold, which='outlier'):
    if which=='outlier':
        numbers = get_outliers(threshold)
        title = f'Outliers with Δ(ΔΔG) > {threshold} kcal mol<sup>-1</sup>'
    elif which=='inlier':
        numbers = get_inliers(threshold)
        title = f'Successes with Δ(ΔΔG) <= {threshold} kcal mol<sup>-1</sup>'
    else:
        raise ValueError(f'{which} argument not known.')
    for i, row in numbers.iterrows():
        row[row.isna()] = False
        numbers.loc[i, 'outlierfilter'] = np.all([row[f'{which}_{idx}'] for idx in sets])
    return numbers

In [15]:
def show_common_outliers(target, sets, threshold):
    numbers = filter_common(all_edges, list(sets), threshold)
    if target != 'all':
        numbers = numbers[numbers['target']==target]
    numbers = numbers[numbers['outlierfilter']]
    if os.path.exists('../../../02_benchmark_calculations/'):
        targets.set_data_dir('../../../02_benchmark_calculations/')
    numbers['Perturbation'] = None
    for i, row in numbers.iterrows(): 
        text = ''
        svgstring = create_perturbation_visualization(row, 
                                                     text=text, 
                                                      img_size=('400px', '200px'), 
                                                      directory='14_venn',
                                                     redraw=True)
        numbers.loc[i, "Perturbation"] = svgstring
    targets.set_data_dir('../benchmarkpl/')
    return HTML(make_html(numbers, sets=['Exp.']+list(sets)))

In [16]:
show_common_outliers('hif2a', sets=['OpenFF-2.0', 'GAFF2', 'OPLS3e'], threshold=2)

Unnamed: 0,Perturbation,DDG Exp.,DDG OpenFF-2.0,DDG GAFF2,DDG OPLS3e
0,\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nhif2a: 206 -> 2320623,3.1 (0.4),-3.1 (0.6),-2.5 (1.0),0.9 (0.1)
1,\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nhif2a: 206 -> 4220642,1.2 (0.4),-1.4 (0.3),-1.3 (0.6),-1.1 (0.1)
2,\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nhif2a: 237 -> 227237227,-1.2 (0.4),-7.8 (0.2),-6.3 (0.5),-4.4 (0.2)
3,\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nhif2a: 237 -> 254237254,0.0 (0.4),-3.0 (0.4),-2.4 (0.4),-3.0 (0.2)
4,\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nhif2a: 237 -> 290237290,-1.0 (0.4),-6.0 (0.8),-4.2 (0.4),-3.4 (0.2)
5,\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nhif2a: 237 -> 3123731,0.7 (0.4),-2.4 (0.1),-3.9 (0.4),-1.7 (0.3)
6,\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nhif2a: 289 -> 3528935,3.8 (0.4),-1.4 (0.9),0.1 (0.9),0.7 (0.1)
7,\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nhif2a: 35 -> 7a357a,-2.9 (0.4),1.1 (0.4),1.2 (0.3),2.8 (0.1)
8,\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nhif2a: 67 -> 25667256,-0.5 (0.4),3.5 (0.5),4.1 (1.0),2.6 (0.1)
9,\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nhif2a: 7a -> 237a23,3.0 (0.4),0.0 (0.4),0.0 (0.5),0.2 (0.1)


In [17]:
out = interact(show_common_outliers, target=['all']+list(targets.target_dict.keys()), sets=widgets.SelectMultiple(
    options=identifiers[1:],
    description='Sets',
    disabled=False,
    button_style='info'
),
              threshold=[3.0, 2.5, 2.0, 1.5, 1.0])

interactive(children=(Dropdown(description='target', options=('all', 'jnk1', 'pde2', 'thrombin', 'p38', 'ptp1b…