# Draw Venn plots of successes/outliers compared to experimental values

In [1]:
import os
import sys
sys.path.append(os.path.join(os.getcwd(), '..'))

import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from  plotly import colors
import pandas as pd
import yaml
import pint
unit_registry = pint.UnitRegistry()

from tqdm.notebook import tqdm

from PLBenchmarks import targets, ligands, edges


import benchmarkpl
path = benchmarkpl.__path__[0]
targets.set_data_dir(path)
results_dir = '10_results'



_ColormakerRegistry()

# Read in data

### Function to read in Parsley data

In [2]:
names = ['experiment',
        'OpenFF-1.0',
        'OpenFF-1.0-converged',
        'OpenFF-1.0-filtered',
        'OpenFF-1.2',
        'OpenFF-1.2-converged',
        'OpenFF-1.2-filtered',
        'OpenFF-2.0-RC1',
        'OpenFF-2.0-RC1-converged',
        'OpenFF-2.0-RC1-filtered',
        'GAFF2',
         'cGenFF',
         'opls3e-gap',
         'opls3e-sch',
         'opls3e-per'
        ]
identifiers = [f"experiment_hahn",
              f"pmx_openff-1.0.0.offxml_hahn",
              f"pmx_converged_openff-1.0.0.offxml_hahn",
              f"pmx_repeatfilter_openff-1.0.0.offxml_hahn",
              f"pmx_openff-1.2.0.offxml_gapsys",
              f"pmx_converged_openff-1.2.0.offxml_gapsys",
              f"pmx_repeatfilter_openff-1.2.0.offxml_gapsys",
              f"pmx_openff-2.0.0-rc.1.offxml_gapsys",
              f"pmx_converged_openff-2.0.0-rc.1.offxml_gapsys",
              f"pmx_repeatfilter_openff-2.0.0-rc.1.offxml_gapsys",
              f"pmx_gaff_gapsys",
              f"pmx_cgenff_gapsys",
              f"fep_opls3e_5_gapsys",
              f"fep+_opls3e_schindler",
              f"fep+_opls3e_perez",
             ]

In [3]:
data = {}
for target in tqdm(targets.target_dict.keys()):
    data[target] = {}
    for idx in identifiers:
        file_name = os.path.join(path, targets.get_target_dir(target), results_dir,
                                       f'{target}_{idx}.yaml'
                           )
        if os.path.exists(file_name):
            with open(file_name, 'r') as file:
                data[target][idx] = yaml.safe_load(file)
        else:
            print(f"File {file_name} for target {target} not available")

  0%|          | 0/22 [00:00<?, ?it/s]

File /projects/CNS/OGA/FEP_compare/openforcefield/03_benchmark_analysis/benchmarkpl/benchmarkpl/2019-09-23_jnk1/10_results/jnk1_pmx_converged_openff-1.2.0.offxml_gapsys.yaml for target jnk1 not available
File /projects/CNS/OGA/FEP_compare/openforcefield/03_benchmark_analysis/benchmarkpl/benchmarkpl/2019-09-23_jnk1/10_results/jnk1_pmx_repeatfilter_openff-1.2.0.offxml_gapsys.yaml for target jnk1 not available
File /projects/CNS/OGA/FEP_compare/openforcefield/03_benchmark_analysis/benchmarkpl/benchmarkpl/2019-09-23_jnk1/10_results/jnk1_pmx_openff-2.0.0-rc.1.offxml_gapsys.yaml for target jnk1 not available
File /projects/CNS/OGA/FEP_compare/openforcefield/03_benchmark_analysis/benchmarkpl/benchmarkpl/2019-09-23_jnk1/10_results/jnk1_fep+_opls3e_schindler.yaml for target jnk1 not available
File /projects/CNS/OGA/FEP_compare/openforcefield/03_benchmark_analysis/benchmarkpl/benchmarkpl/2019-09-23_jnk1/10_results/jnk1_fep+_opls3e_perez.yaml for target jnk1 not available
File /projects/CNS/OGA/F

File /projects/CNS/OGA/FEP_compare/openforcefield/03_benchmark_analysis/benchmarkpl/benchmarkpl/2020-02-05_bace_hunt/10_results/bace_hunt_pmx_openff-2.0.0-rc.1.offxml_gapsys.yaml for target bace_hunt not available
File /projects/CNS/OGA/FEP_compare/openforcefield/03_benchmark_analysis/benchmarkpl/benchmarkpl/2020-02-05_bace_hunt/10_results/bace_hunt_fep+_opls3e_schindler.yaml for target bace_hunt not available
File /projects/CNS/OGA/FEP_compare/openforcefield/03_benchmark_analysis/benchmarkpl/benchmarkpl/2020-02-05_bace_hunt/10_results/bace_hunt_fep+_opls3e_perez.yaml for target bace_hunt not available
File /projects/CNS/OGA/FEP_compare/openforcefield/03_benchmark_analysis/benchmarkpl/benchmarkpl/2020-02-06_bace_p2/10_results/bace_p2_pmx_openff-2.0.0-rc.1.offxml_gapsys.yaml for target bace_p2 not available
File /projects/CNS/OGA/FEP_compare/openforcefield/03_benchmark_analysis/benchmarkpl/benchmarkpl/2020-02-06_bace_p2/10_results/bace_p2_fep+_opls3e_schindler.yaml for target bace_p2 no

In [4]:
all_edges = pd.DataFrame()
for target, tdata in tqdm(data.items()):
    dfs = []
    for software, sdata in tdata.items():
        df = pd.DataFrame(sdata).T
        df['target'] = target
        df['edge'] = [f'edge_{row["ligandA"]}_{row["ligandB"]}' for i, row in df.iterrows()] 
        for i, row in df.iterrows():
            df.loc[i, f'DDG_{software}'] = unit_registry.Quantity(row['DDG'], row['unit']).to('kilocalories/mole').magnitude
            df.loc[i, f'dDDG_{software}'] = unit_registry.Quantity(row['dDDG'], row['unit']).to('kilocalories/mole').magnitude
        df = df.drop(labels=['DDG', 'dDDG'], axis=1)
        dfs.append(df)
    if len(dfs) > 0:
        df = pd.concat(dfs, axis=1)
        df = df.loc[:,~df.columns.duplicated()]
        all_edges = all_edges.append(df)
all_edges.head()

  0%|          | 0/22 [00:00<?, ?it/s]

Unnamed: 0,ligandA,ligandB,unit,target,edge,DDG_experiment_hahn,dDDG_experiment_hahn,DDG_pmx_openff-1.0.0.offxml_hahn,dDDG_pmx_openff-1.0.0.offxml_hahn,DDG_pmx_converged_openff-1.0.0.offxml_hahn,...,DDG_pmx_converged_openff-1.2.0.offxml_gapsys,dDDG_pmx_converged_openff-1.2.0.offxml_gapsys,DDG_pmx_repeatfilter_openff-1.2.0.offxml_gapsys,dDDG_pmx_repeatfilter_openff-1.2.0.offxml_gapsys,DDG_pmx_openff-2.0.0-rc.1.offxml_gapsys,dDDG_pmx_openff-2.0.0-rc.1.offxml_gapsys,DDG_fep+_opls3e_schindler,dDDG_fep+_opls3e_schindler,DDG_fep+_opls3e_perez,dDDG_fep+_opls3e_perez
jnk1_edge_17124-1_18631-1,17124-1,18631-1,kilocalories / mole,jnk1,edge_17124-1_18631-1,0.26,0.37,1.19,0.1,1.19,...,,,,,,,,,,
jnk1_edge_17124-1_18634-1,17124-1,18634-1,kilocalories / mole,jnk1,edge_17124-1_18634-1,-0.33,0.29,0.58,0.13,0.58,...,,,,,,,,,,
jnk1_edge_18626-1_18624-1,18626-1,18624-1,kilocalories / mole,jnk1,edge_18626-1_18624-1,0.38,0.21,0.56,0.09,0.556667,...,,,,,,,,,,
jnk1_edge_18626-1_18625-1,18626-1,18625-1,kilocalories / mole,jnk1,edge_18626-1_18625-1,0.77,0.21,-0.03,0.11,-0.03,...,,,,,,,,,,
jnk1_edge_18626-1_18627-1,18626-1,18627-1,kilocalories / mole,jnk1,edge_18626-1_18627-1,0.39,0.22,0.14,0.05,0.14,...,,,,,,,,,,


In [5]:
def combine_sets(sets, idx_new, name_new):
    for i, row in all_edges.iterrows():
        for idx in sets:
            if not pd.isna(row[f'DDG_{idx}']):
                all_edges.loc[i, f'DDG_{idx_new}'] = row[f'DDG_{idx}']
                all_edges.loc[i, f'dDDG_{idx_new}'] = row[f'dDDG_{idx}']
    if idx_new not in identifiers:
        identifiers.append(idx_new)
    else:
        raise Exception("idx_new already in identifiers")
    if name_new not in names:
        names.append(name_new)
    else:
        raise Exception("name_new already in identifiers")
    assert len(identifiers) == len(names)
combine_sets(['fep_opls3e_5_gapsys', 'fep+_opls3e_schindler'], 
             'fep+_opls3e',
            'opls3e')
all_edges.head()

Unnamed: 0,ligandA,ligandB,unit,target,edge,DDG_experiment_hahn,dDDG_experiment_hahn,DDG_pmx_openff-1.0.0.offxml_hahn,dDDG_pmx_openff-1.0.0.offxml_hahn,DDG_pmx_converged_openff-1.0.0.offxml_hahn,...,DDG_pmx_repeatfilter_openff-1.2.0.offxml_gapsys,dDDG_pmx_repeatfilter_openff-1.2.0.offxml_gapsys,DDG_pmx_openff-2.0.0-rc.1.offxml_gapsys,dDDG_pmx_openff-2.0.0-rc.1.offxml_gapsys,DDG_fep+_opls3e_schindler,dDDG_fep+_opls3e_schindler,DDG_fep+_opls3e_perez,dDDG_fep+_opls3e_perez,DDG_fep+_opls3e,dDDG_fep+_opls3e
jnk1_edge_17124-1_18631-1,17124-1,18631-1,kilocalories / mole,jnk1,edge_17124-1_18631-1,0.26,0.37,1.19,0.1,1.19,...,,,,,,,,,1.517686,0.069312
jnk1_edge_17124-1_18634-1,17124-1,18634-1,kilocalories / mole,jnk1,edge_17124-1_18634-1,-0.33,0.29,0.58,0.13,0.58,...,,,,,,,,,0.583174,0.043021
jnk1_edge_18626-1_18624-1,18626-1,18624-1,kilocalories / mole,jnk1,edge_18626-1_18624-1,0.38,0.21,0.56,0.09,0.556667,...,,,,,,,,,1.073136,0.040631
jnk1_edge_18626-1_18625-1,18626-1,18625-1,kilocalories / mole,jnk1,edge_18626-1_18625-1,0.77,0.21,-0.03,0.11,-0.03,...,,,,,,,,,1.445985,0.033461
jnk1_edge_18626-1_18627-1,18626-1,18627-1,kilocalories / mole,jnk1,edge_18626-1_18627-1,0.39,0.22,0.14,0.05,0.14,...,,,,,,,,,0.39675,0.081262


In [6]:
for idx in identifiers[1:]:
    all_edges[f'error_{idx}'] = all_edges[f'DDG_{idx}'] - all_edges['DDG_experiment_hahn']
    all_edges[f'abserror_{idx}'] = all_edges[f'error_{idx}'].abs()

In [7]:
def get_inliers(threshold):
    inliers = all_edges.filter(items=[f'abserror_{idx}' for idx in identifiers[1:]])
    for idx in identifiers[1:]:
        inliers.loc[:, f'inlier_{idx}'] = inliers.loc[:, f'abserror_{idx}'] <= threshold
        
        inliers.loc[inliers.loc[:, f'abserror_{idx}'].isna(), f'inlier_{idx}'] = np.nan
    inliers = inliers.filter(items=[f'inlier_{idx}' for idx in identifiers[1:]])
    return inliers
inliers = get_inliers(3)
inliers.isna().sum(), (inliers == 1).sum(), (inliers == 0).sum()

(inlier_pmx_openff-1.0.0.offxml_hahn                           7
 inlier_pmx_converged_openff-1.0.0.offxml_hahn                88
 inlier_pmx_repeatfilter_openff-1.0.0.offxml_hahn            269
 inlier_pmx_openff-1.2.0.offxml_gapsys                       586
 inlier_pmx_converged_openff-1.2.0.offxml_gapsys             647
 inlier_pmx_repeatfilter_openff-1.2.0.offxml_gapsys          745
 inlier_pmx_openff-2.0.0-rc.1.offxml_gapsys                  584
 inlier_pmx_converged_openff-2.0.0-rc.1.offxml_gapsys        644
 inlier_pmx_repeatfilter_openff-2.0.0-rc.1.offxml_gapsys     735
 inlier_pmx_gaff_gapsys                                       10
 inlier_pmx_cgenff_gapsys                                      8
 inlier_fep_opls3e_5_gapsys                                  670
 inlier_fep+_opls3e_schindler                                596
 inlier_fep+_opls3e_perez                                   1015
 inlier_fep+_opls3e                                          139
 dtype: int64,
 inlier_pm

In [8]:
def get_outliers(threshold):
    outliers = all_edges.filter(items=[f'abserror_{idx}' for idx in identifiers[1:]])
    for idx in identifiers[1:]:
        outliers.loc[:, f'outlier_{idx}'] = outliers.loc[:, f'abserror_{idx}'] > threshold
        
        outliers.loc[outliers.loc[:, f'abserror_{idx}'].isna(), f'outlier_{idx}'] = np.nan
    outliers = outliers.filter(items=[f'outlier_{idx}' for idx in identifiers[1:]])
    return outliers
outliers = get_outliers(3)
outliers.isna().sum(), (outliers == 1).sum(), (outliers == 0).sum()

(outlier_pmx_openff-1.0.0.offxml_hahn                           7
 outlier_pmx_converged_openff-1.0.0.offxml_hahn                88
 outlier_pmx_repeatfilter_openff-1.0.0.offxml_hahn            269
 outlier_pmx_openff-1.2.0.offxml_gapsys                       586
 outlier_pmx_converged_openff-1.2.0.offxml_gapsys             647
 outlier_pmx_repeatfilter_openff-1.2.0.offxml_gapsys          745
 outlier_pmx_openff-2.0.0-rc.1.offxml_gapsys                  584
 outlier_pmx_converged_openff-2.0.0-rc.1.offxml_gapsys        644
 outlier_pmx_repeatfilter_openff-2.0.0-rc.1.offxml_gapsys     735
 outlier_pmx_gaff_gapsys                                       10
 outlier_pmx_cgenff_gapsys                                      8
 outlier_fep_opls3e_5_gapsys                                  670
 outlier_fep+_opls3e_schindler                                596
 outlier_fep+_opls3e_perez                                   1015
 outlier_fep+_opls3e                                          139
 dtype: in

In [9]:
def get_overlap(dataframe, idx1, idx2, idx3, which='outlier'):
    # following order: (100, 010, 110, 001, 101, 011, 111)
    ynn = []
    nyn = []
    nny = []
    yyn = []
    yny = []
    nyy = []
    for i, row in dataframe.iterrows():
        row[row.isna()] = False
        ynn.append(    row[f'{which}_{idx1}'] and 
                   not row[f'{which}_{idx2}'] and 
                   not row[f'{which}_{idx3}'])
        nyn.append(not row[f'{which}_{idx1}'] and 
                       row[f'{which}_{idx2}'] and 
                   not row[f'{which}_{idx3}'])
        nny.append(not row[f'{which}_{idx1}'] and 
                   not row[f'{which}_{idx2}'] and 
                       row[f'{which}_{idx3}'])
        yyn.append(    row[f'{which}_{idx1}'] and 
                       row[f'{which}_{idx2}'] and 
                   not row[f'{which}_{idx3}'])
        yny.append(    row[f'{which}_{idx1}'] and 
                   not row[f'{which}_{idx2}'] and 
                       row[f'{which}_{idx3}'])
        nyy.append(not row[f'{which}_{idx1}'] and 
                       row[f'{which}_{idx2}'] and 
                       row[f'{which}_{idx3}'])
    overlap = []
    sizes = [dataframe[dataframe[f'{which}_{idx}']==1].shape[0]
             for idx in [idx1, idx2, idx3]
            ]
    for n in [ynn, nyn, yyn, nny, yny, nyy]:
        overlap.append(np.sum(n))
    overlap.append(sizes[0]-np.sum(ynn)-np.sum(yyn)-np.sum(yny))
    overlap.append(sizes[1]-np.sum(nyn)-np.sum(yyn)-np.sum(nyy))
    overlap.append(sizes[2]-np.sum(nny)-np.sum(nyy)-np.sum(yny))
    overlap.append(sizes[0])
    overlap.append(sizes[1])
    overlap.append(sizes[2])
    return overlap

In [10]:
from matplotlib_venn import _venn3
def venn_plot(idx1, idx2, idx3, threshold, which='outlier'):
    labels = [names[identifiers.index(idx)] for idx in [idx1, idx2, idx3]]
    if which=='outlier':
        numbers = get_outliers(threshold)
        color_number = 6
        title = f'Outliers with Δ(ΔΔG) > {threshold} kcal mol<sup>-1</sup>'
    elif which=='inlier':
        numbers = get_inliers(threshold)
        color_number = 0
        title = f'Successes with Δ(ΔΔG) <= {threshold} kcal mol<sup>-1</sup>'
    else:
        raise ValueError(f'{which} argument not known.')
    
    numbers = numbers[[f'{which}_{i}' for i in np.unique([idx1, idx2, idx3])]]
    numbers = numbers[~numbers.isna().any(axis=1)]
    print(numbers.head())
    overlap = get_overlap(numbers, idx1, idx2, idx3, which=which)
    sizes = overlap[-3:]
    sim_sizes = [
        numbers.shape[0] - numbers[f'{which}_{idx}'].isna().sum() for idx in [idx1, idx2, idx3]
    ]
    areas = _venn3.compute_venn3_areas(overlap[:7])
    centers, radii = _venn3.solve_venn3_circles(areas)
    regions = _venn3.compute_venn3_regions(centers, radii)
    label_positions = np.array([centers[0] + np.array([-radii[0] / 2, radii[0]])*1.1,
                                centers[1] + np.array([radii[1] / 2, radii[1]])*1.1,
                                centers[2] + np.array([0.0, -radii[2] * 1.1])])
    #colors = _venn3.compute_venn3_colors(set_colors)
    
    subset_positions = np.array([r.label_position() for r in regions])
    subset_labels = [f'{int(s):d}' for s in overlap[:7]]
    
    fig = go.Figure()
    colorway=colors.qualitative.Safe + colors.qualitative.Vivid
    # Create scatter trace of text labels
    fig.add_trace(go.Scatter(
        x=label_positions[:,0],
        y=label_positions[:,1],
        text=[f'{l} ({s}/{a})' for l, s, a in zip(labels, sizes, sim_sizes)],
        mode="text",
        textfont=dict(
            color="black",
            size=18
        )
    ))
    fig.add_trace(go.Scatter(
        x=subset_positions[:,0],
        y=subset_positions[:,1],
        text=subset_labels,
        mode="text",
        textfont=dict(
            color="black",
            size=18
        )
    ))
    # Update axes properties
    fig.update_xaxes(
        showticklabels=False,
        showgrid=False,
        zeroline=False,
    )

    fig.update_yaxes(
        showticklabels=False,
        showgrid=False,
        zeroline=False,
    )

    # Add circles
    for i in range(3):
        fig.add_shape(
                type="circle",
                fillcolor=colors.qualitative.Prism[color_number+i],
                x0=centers[i][0]-radii[i],
                y0=centers[i][1]-radii[i],
                x1=centers[i][0]+radii[i],
                y1=centers[i][1]+radii[i],
                line_color=colors.qualitative.Prism[color_number+i]
            )
    fig.update_shapes(dict(
        opacity=0.5,
        xref="x",
        yref="y",
        layer="below"
    ))
    axlim = (np.max(np.fabs(centers)) + np.max(np.fabs(radii)))*1.2
    # Update figure dimensions
    fig.update_layout(
        title={
            'text': title,
            'y':0.95,
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': {'size': 24}},
        margin=dict(
            l=30,
            r=30,
            b=30,
            t=30
        ),
        xaxis=dict(range=[-axlim, axlim]),
        yaxis=dict(range=[-axlim, axlim]),
        height=800,
        width=800,
        plot_bgcolor="white",
        showlegend=False
    )
    fig.write_image(f'venn_{idx1}_{idx2}_{idx3}_{threshold}_{which}.svg')
    return fig

ModuleNotFoundError: No module named 'matplotlib_venn'

In the following interactive cell, a Venn plot is created. The three calculated sets can be chosen in the dropdown menus of `idx1`, `idx2` and `idx3`. A `threshold` (kcal/mol) can be chosen next and whether you want to see the successes/inliers or the outliers. The Venn plot is based on the comparison between calculated set and experimental values of edges/relative free energies (DDG values). The sizes of the different fields are written into the fields. The numbers in brackets behind the force field name are the number of successes or outliers and the total number of available simulations. 

In [None]:
from ipywidgets import widgets, interact
out = interact(venn_plot, idx1=identifiers[1:], idx2=identifiers[1:], idx3=identifiers[1:], threshold=np.arange(0, 5, 0.5), which=['inlier', 'outlier'])

In [12]:
from rdkit import Chem
from rdkit.Chem import Draw, PandasTools, rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D, IPythonConsole

def get_outliers(threshold):
    outliers = all_edges.copy()
    for idx in identifiers[1:]:
        outliers.loc[:, f'outlier_{idx}'] = outliers.loc[:, f'abserror_{idx}'] > threshold
        
        outliers.loc[outliers.loc[:, f'abserror_{idx}'].isna(), f'outlier_{idx}'] = np.nan
    return outliers

In [13]:
def make_html(edges_df, sets=identifiers):
    columns = ['Perturbation']
    for i, idx in enumerate(sets):
        name = names[identifiers.index(idx)]
        edges_df.loc[:,f'DDG {name}'] = [
            f'{np.round(row[f"DDG_{idx}"], 1)} ({np.round(row[f"dDDG_{idx}"], 1)})' 
            for i, row in edges_df.iterrows()
        ] 
        columns.append(f'DDG {name}')
    edges_df = edges_df[columns]
    edges_df.reset_index(inplace=True, drop=True)
    html = edges_df.to_html(escape=False, float_format=lambda x: f'{x:.1f}')
    return html

In [14]:
def get_overlap_edges(dataframe, idx1, idx2, idx3, which='outlier'):
    # following order: (100, 010, 110, 001, 101, 011, 111)
    ynn = []
    nyn = []
    nny = []
    yyn = []
    yny = []
    nyy = []
    yyy = []
    nnn = []
    for i, row in dataframe.iterrows():
        row[row.isna()] = False
        ynn.append(    row[f'{which}_{idx1}'] and 
                   not row[f'{which}_{idx2}'] and 
                   not row[f'{which}_{idx3}'])
        nyn.append(not row[f'{which}_{idx1}'] and 
                       row[f'{which}_{idx2}'] and 
                   not row[f'{which}_{idx3}'])
        nny.append(not row[f'{which}_{idx1}'] and 
                   not row[f'{which}_{idx2}'] and 
                       row[f'{which}_{idx3}'])
        yyn.append(    row[f'{which}_{idx1}'] and 
                       row[f'{which}_{idx2}'] and 
                   not row[f'{which}_{idx3}'])
        yny.append(    row[f'{which}_{idx1}'] and 
                   not row[f'{which}_{idx2}'] and 
                       row[f'{which}_{idx3}'])
        nyy.append(not row[f'{which}_{idx1}'] and 
                       row[f'{which}_{idx2}'] and 
                       row[f'{which}_{idx3}'])
        yyy.append(    row[f'{which}_{idx1}'] and 
                       row[f'{which}_{idx2}'] and 
                       row[f'{which}_{idx3}'])
        nnn.append(not row[f'{which}_{idx1}'] and 
                   not row[f'{which}_{idx2}'] and 
                   not row[f'{which}_{idx3}'])
    overlap = {}
    for name, n in zip(['ynn', 'nyn', 'yyn', 'nny', 'yny', 'nyy', 'yyy', 'nnn'], [ynn, nyn, yyn, nny, yny, nyy, yyy, nnn]):
        n = np.array(n).astype(bool)
        overlap[name] = dataframe.loc[n]
    return overlap

In [15]:
def create_perturbation_visualization(df, text='', img_size=('400px', '200px'), directory='13_outliers', redraw=False):
    import benchmarkpl
    path = benchmarkpl.__path__[0]
    # check whether image exists
    os.makedirs(os.path.join(path, targets.get_target_dir(df["target"]), directory), exist_ok=True)
    file_path = os.path.join(path, targets.get_target_dir(df["target"]), directory, f'{df["edge"]}.svg')
    if not redraw and os.path.exists(file_path):
        with open(file_path, 'r') as file:
            img = file.read()
    else:
        # visualization
        target_path = f'{targets.data_path}/{targets.get_target_dir(df["target"])}'
        m1 = Chem.SDMolSupplier(
            f'{target_path}/02_ligands/lig_{df["ligandA"]}/crd/lig_{df["ligandA"]}.sdf', 
            removeHs=False)[0]
        m2 = Chem.SDMolSupplier(
            f'{target_path}/02_ligands/lig_{df["ligandB"]}/crd/lig_{df["ligandB"]}.sdf', 
            removeHs=False)[0]
        pairs = np.loadtxt(
            f'{target_path}/03_hybrid/edge_{df["ligandA"]}_{df["ligandB"]}/water/crd/pairs.dat'
        )
        # decrement pairs to match rdkit counting from 0!
        pairs -= 1
        
        img = drawing.drawPerturbationBare(m1, # rdkit molecule 1
                                       m2, # rdkit molecule 2
                                       pairs, # pairs, np array or list of lists
                                       target=df["target"], # string with target name
                                       n1=df["ligandA"], # name mol 1
                                       n2=df["ligandB"], # name  mol 2
                                       text=text # additional text
                                      )
        
        with open(file_path, 'w') as file:
            file.write(img)
    original = sg.fromstring(img)
    original.set_size(img_size)
    svgstring = original.to_str().decode("utf-8").rstrip()
    svgstring = '\n'.join(svgstring.split('\n')[1:])
    return svgstring

In [16]:
from matplotlib_venn import _venn3
from benchmarkpl import drawing
from svgutils import transform as sg

from IPython.core.display import HTML

def get_edges(idx1, idx2, idx3, threshold, subset='yyy', which='outlier'):
    labels = [names[identifiers.index(idx)] for idx in [idx1, idx2, idx3]]
    if which=='outlier':
        numbers = get_outliers(threshold)
        title = f'Outliers with Δ(ΔΔG) > {threshold} kcal mol<sup>-1</sup>'
    elif which=='inlier':
        numbers = get_inliers(threshold)
        title = f'Successes with Δ(ΔΔG) <= {threshold} kcal mol<sup>-1</sup>'
    else:
        raise ValueError(f'{which} argument not known.')
        
    overlap_edges = get_overlap_edges(numbers, idx1, idx2, idx3, which=which)
    if os.path.exists('../../../02_benchmark_calculations/'):
        targets.set_data_dir('../../../02_benchmark_calculations/')
    for i, row in overlap_edges[subset].iterrows():    
        text = ''
        svgstring = create_perturbation_visualization(row, 
                                                     text=text, 
                                                      img_size=('400px', '200px'), 
                                                      directory='14_venn',
                                                     redraw=True)
        overlap_edges[subset].loc[i, "Perturbation"] = svgstring
    targets.set_data_dir('../benchmarkpl/')
    return HTML(make_html(overlap_edges[subset]))
edges_df = get_edges('pmx_repeatfilter_openff-1.0.0.offxml_hahn', 'pmx_gaff_gapsys', 'fep_opls3e_5_gapsys', threshold=2, subset='yyy')
edges_df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Perturbation,DDG experiment,DDG OpenFF-1.0,DDG OpenFF-1.0-converged,DDG OpenFF-1.0-filtered,DDG OpenFF-1.2,DDG OpenFF-1.2-converged,DDG OpenFF-1.2-filtered,DDG OpenFF-2.0-RC1,DDG OpenFF-2.0-RC1-converged,DDG OpenFF-2.0-RC1-filtered,DDG GAFF2,DDG cGenFF,DDG opls3e-gap,DDG opls3e-sch,DDG opls3e-per,DDG opls3e
0,\n\n \n \n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO\nN\nN\nN\nN\nN\nN\nO\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO\nN\nN\nN\nN\nN\nN\nO\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\npde2: 49072088 -> 482712494907208848271249,-2.1 (0.2),1.6 (0.4),1.6 (0.4),1.6 (0.4),nan (nan),nan (nan),nan (nan),nan (nan),nan (nan),nan (nan),1.3 (0.5),1.4 (0.4),0.6 (0.5),nan (nan),nan (nan),0.6 (0.5)
1,\n\n \n \n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO-\nO\nO\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nO\nH\nH\nH\nH\nCl\nH\nH\nH\nH\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO-\nO\nN\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nO\nH\nF\nF\nF\nH\nH\nH\nmcl1: 67 -> 316731,-0.3 (0.0),3.7 (0.2),3.7 (0.3),3.7 (0.3),nan (nan),nan (nan),nan (nan),nan (nan),nan (nan),nan (nan),3.5 (0.5),3.1 (0.4),2.6 (0.7),nan (nan),nan (nan),2.6 (0.7)
2,\n\n \n \n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO-\nO\nO\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nO\nH\nH\nH\nH\nCl\nH\nH\nH\nH\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO-\nO\nN\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nO\nH\nH\nH\nH\nCl\nH\nH\nmcl1: 67 -> 356735,-1.2 (0.2),3.1 (0.2),3.1 (0.2),3.1 (0.2),nan (nan),nan (nan),nan (nan),nan (nan),nan (nan),nan (nan),2.5 (0.6),2.8 (0.1),1.2 (0.1),nan (nan),nan (nan),1.2 (0.1)
3,\n\n \n \n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO-\nO\nO\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nO\nH\nH\nH\nH\nCl\nH\nH\nH\nH\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO-\nO\nN\nH\nH\nH\nH\nCl\nH\nH\nH\nH\nH\nH\nO\nH\nH\nH\nCl\nH\nH\nH\nH\nH\nmcl1: 67 -> 506750,-1.8 (0.2),0.4 (0.3),0.4 (0.3),0.4 (0.3),nan (nan),nan (nan),nan (nan),nan (nan),nan (nan),nan (nan),2.0 (0.4),1.3 (0.3),0.9 (0.2),nan (nan),nan (nan),0.9 (0.2)
4,\n\n \n \n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO-\nO\nO\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nO\nH\nH\nH\nH\nCl\nH\nH\nH\nH\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO-\nO\nN\nH\nH\nCl\nH\nH\nH\nH\nH\nH\nH\nH\nO\nH\nH\nH\nH\nCl\nH\nH\nmcl1: 67 -> 526752,-1.6 (0.1),0.9 (0.3),0.9 (0.3),0.9 (0.3),nan (nan),nan (nan),nan (nan),nan (nan),nan (nan),nan (nan),1.0 (0.1),1.3 (0.1),1.2 (1.8),nan (nan),nan (nan),1.2 (1.8)
5,\n\n \n \n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nN+\nO\nN\nN\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nN+\nO\nN\nN\nCl\nH\nN\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nbace: CAT-4a -> CAT-13kCAT-4aCAT-13k,-1.8 (0.0),0.6 (0.1),0.6 (0.1),0.6 (0.1),nan (nan),nan (nan),nan (nan),nan (nan),nan (nan),nan (nan),0.5 (0.8),1.8 (0.3),-4.3 (0.1),nan (nan),nan (nan),-4.3 (0.1)
6,\n\n \n \n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nN+\nO\nN\nN\nH\nH\nH\nH\nH\nN\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nN+\nO\nN\nN\nN\nH\nN\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nbace: CAT-4m -> CAT-4lCAT-4mCAT-4l,-0.2 (0.0),1.8 (0.8),1.9 (0.8),1.9 (0.8),nan (nan),nan (nan),nan (nan),nan (nan),nan (nan),nan (nan),2.1 (0.7),1.3 (0.2),2.2 (0.1),nan (nan),nan (nan),2.2 (0.1)
7,\n\n \n \n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCl\nO\nN\nN\nN\nO\nCl\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCl\nO\nN\nN\nN\nO\nCl\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nO\nH\ntyk2: ejm_44 -> ejm_55ejm_44ejm_55,-1.8 (0.0),-4.2 (0.3),-4.2 (0.3),-4.2 (0.3),nan (nan),nan (nan),nan (nan),nan (nan),nan (nan),nan (nan),-4.2 (0.3),-3.4 (0.8),-3.9 (0.1),nan (nan),nan (nan),-3.9 (0.1)


# Filter based on common outliers in all force fields

In [17]:
def filter_common(dataframe, sets, threshold, which='outlier'):
    if which=='outlier':
        numbers = get_outliers(threshold)
        title = f'Outliers with Δ(ΔΔG) > {threshold} kcal mol<sup>-1</sup>'
    elif which=='inlier':
        numbers = get_inliers(threshold)
        title = f'Successes with Δ(ΔΔG) <= {threshold} kcal mol<sup>-1</sup>'
    else:
        raise ValueError(f'{which} argument not known.')
    for i, row in numbers.iterrows():
        row[row.isna()] = False
        numbers.loc[i, 'outlierfilter'] = np.all([row[f'{which}_{idx}'] for idx in sets])
    return numbers

In [18]:
def show_common_outliers(target, sets, threshold):
    numbers = filter_common(all_edges, list(sets), threshold)
    if target != 'all':
        numbers = numbers[numbers['target']==target]
    numbers = numbers[numbers['outlierfilter']]
    if os.path.exists('../../../02_benchmark_calculations/'):
        targets.set_data_dir('../../../02_benchmark_calculations/')
    numbers['Perturbation'] = None
    for i, row in numbers.iterrows(): 
        text = ''
        svgstring = create_perturbation_visualization(row, 
                                                     text=text, 
                                                      img_size=('400px', '200px'), 
                                                      directory='14_venn',
                                                     redraw=True)
        numbers.loc[i, "Perturbation"] = svgstring
    targets.set_data_dir('../benchmarkpl/')
    return HTML(make_html(numbers, sets=['experiment_hahn']+list(sets)))

In [19]:
show_common_outliers('hif2a', sets=['openff', 'pmx_gaff_gapsys', 'fep+_opls3e'], threshold=2)

Unnamed: 0,Perturbation,DDG experiment,DDG openff,DDG gaff2,DDG opls3e
0,\n\n \n \n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO\nO\nF\nS\nO\nO\nF\nF\nN\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO\nO\nF\nF\nS\nO\nO\nF\nF\nO\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nhif2a: 206 -> 2320623,3.1 (0.0),-3.9 (0.4),-2.5 (1.0),0.9 (0.1)
1,\n\n \n \n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO\nO\nF\nS\nO\nO\nF\nF\nN\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO\nS\nO\nO\nF\nF\nCl\nO\nF\nN\nH\nH\nH\nH\nH\nH\nH\nH\nH\nhif2a: 206 -> 4220642,1.2 (0.0),-1.6 (0.2),-1.2 (0.6),-1.1 (0.1)
2,\n\n \n \n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO\nO\nS\nO\nO\nF\nF\nN\nF\nF\nF\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO\nO\nF\nS\nO\nO\nF\nN\nF\nF\nF\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nhif2a: 237 -> 227237227,-1.2 (0.0),-7.4 (0.8),-6.3 (0.5),-4.4 (0.2)
3,\n\n \n \n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO\nO\nS\nO\nO\nF\nF\nN\nF\nF\nF\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO\nO\nF\nS\nO\nO\nN\nF\nF\nN\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nhif2a: 237 -> 254237254,0.0 (0.0),-3.2 (0.3),-2.4 (0.4),-3.0 (0.2)
4,\n\n \n \n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO\nO\nS\nO\nO\nF\nF\nN\nF\nF\nF\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nN\nO\nF\nS\nO\nO\nF\nF\nN\nF\nF\nF\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nhif2a: 237 -> 290237290,-1.0 (0.0),-6.7 (0.6),-4.2 (0.4),-3.4 (0.2)
5,\n\n \n \n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO\nO\nS\nO\nO\nF\nF\nN\nF\nF\nF\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO\nS\nO\nO\nF\nF\nN\nN\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nhif2a: 237 -> 3123731,0.7 (0.0),-3.6 (0.4),-3.9 (0.4),-1.7 (0.3)
6,\n\n \n \n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO\nO\nF\nS\nO\nO\nF\nN\nF\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO\nCl\nS\nO\nO\nF\nF\nN\nO\nF\nH\nH\nH\nH\nH\nH\nH\nH\nH\nhif2a: 289 -> 3528935,3.8 (0.0),-0.7 (1.2),0.1 (0.9),0.7 (0.1)
7,\n\n \n \n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO\nCl\nS\nO\nO\nF\nF\nN\nO\nF\nH\nH\nH\nH\nH\nH\nH\nH\nH\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO\nO\nF\nCl\nS\nO\nO\nF\nF\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nhif2a: 35 -> 7a357a,-2.9 (0.0),1.6 (0.4),1.2 (0.3),2.8 (0.1)
8,\n\n \n \n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO\nO\nF\nF\nF\nF\nN\nH\nH\nH\nH\nH\nH\nH\nH\nH\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO\nO\nF\nF\nS\nO\nO\nN\nF\nF\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nhif2a: 67 -> 25667256,-0.5 (0.0),4.7 (0.9),4.1 (1.0),2.6 (0.1)
9,\n\n \n \n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO\nO\nF\nCl\nS\nO\nO\nF\nF\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nO\nO\nF\nF\nS\nO\nO\nF\nF\nO\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nH\nhif2a: 7a -> 237a23,3.0 (0.0),-0.2 (0.2),0.0 (0.5),0.2 (0.1)


In [20]:
out = interact(show_common_outliers, target=['all']+list(targets.target_dict.keys()), sets=widgets.SelectMultiple(
    options=identifiers[1:],
    description='Sets',
    disabled=False,
    button_style='info', # 'success', 'info', 'warning', 'danger' or ''
# #     tooltips=,
#     icons=['check'] * (len(names)-1)
),
              threshold=[3.0, 2.5, 2.0, 1.5, 1.0])

interactive(children=(Dropdown(description='target', options=('all', 'jnk1', 'pde2', 'thrombin', 'p38', 'ptp1b…

In [21]:
all_edges = all_edges[np.invert(all_edges['target'].isna())]

In [22]:
author = "hahn"
software = "pmx_commonfilter"
forcefield = "openff"
numbers = filter_common(all_edges, sets=['openff_repeatfilter', 'pmx_gaff_gapsys', 'pmx_cgenff_gapsys', 'fep+_opls3e'], threshold=2.0)
for i, row in tqdm(numbers.iterrows()):
    if row['outlierfilter'] :
        numbers.loc[i, 'DDG_openff_commonfilter'] = np.nan
        numbers.loc[i, 'dDDG_openff_commonfilter'] = np.nan
    else:
        numbers.loc[i, 'DDG_openff_commonfilter'] = row['DDG_openff_repeatfilter']
        numbers.loc[i, 'dDDG_openff_commonfilter'] = row['dDDG_openff_repeatfilter']
for target in targets.target_dict:
    os.makedirs(os.path.join(path, targets.get_target_dir(target), results_dir), exist_ok=True)
    with open(os.path.join(path, targets.get_target_dir(target), results_dir, f'{target}_{software}_{forcefield}_{author}.yaml'), 'w') as file:
        df = numbers.loc[all_edges['target']==target].filter(['ligandA','ligandB','DDG_openff_commonfilter', 'dDDG_openff_commonfilter'])
        df.rename(columns={'DDG_openff_commonfilter': 'DDG', 'dDDG_openff_commonfilter': 'dDDG'}, inplace=True)
        df['unit']='kilocalories / mole'        
        yaml.dump(df.T.to_dict(), file)

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




In [23]:
print(1103-sum(numbers['DDG_openff_commonfilter'].isna()))
print(1103-sum(numbers['DDG_openff_repeatfilter'].isna()))
print(1103-sum(numbers['DDG_openff_converged'].isna()))
print(1103-sum(numbers['DDG_openff'].isna()))

868
890
1032
1097


In [24]:
sum(numbers['outlierfilter'])

22

In [25]:
print(os.path.join(path, targets.get_target_dir(target), results_dir, f'{target}_{software}_{forcefield}_{author}.yaml'))

/projects/CNS/OGA/FEP_compare/openforcefield/03_benchmark_analysis/benchmarkpl/benchmarkpl/2020-02-04_bace_2/10_results/bace_2_pmx_commonfilter_openff_hahn.yaml
