This notebook converts the Epik microscopic results to type I pKas.

In [1]:
import os
# Point OpenEye at a license file, but only when the environment does not
# already provide one: the fallback below is a machine-specific path and must
# not override a valid OE_LICENSE configured elsewhere.
# This has to run before the openeye imports that follow.
os.environ.setdefault("OE_LICENSE", "C:\\Users\\rustenba\\Downloads\\oe_license.txt")

from openeye import oechem
import openeye
import oenotebook as oenb
import csv

from pathlib import Path
import pandas as pd
import numpy as np

# Fail fast with a clear message when no usable OpenEye license is found.
# (The built-in exception is spelled RuntimeError; the previous spelling
# would itself have raised a NameError.)
if not openeye.OEChemIsLicensed():
    raise RuntimeError("Can't find a valid OpenEye license")

### Loading microstates from SAMPL6 files

SAMPL6 microstate definitions are loaded into a dictionary using `load_microstate_dictionary`.

By default it reads the files from www.github.com/MobleyLab/SAMPL6; a local copy is stored in the `microstates` folder.

The states are read in as SMILES. For a proper comparison of microstates, explicit hydrogens are added to the molecules using the `add_h` function.



In [2]:
def add_h(mol: oechem.OEMol):
    """Add explicit hydrogens to every atom of a molecule.

    Nothing is returned; the molecule passed in is the one that gets updated.
    """
    atom_iter = mol.GetAtoms()
    for current_atom in atom_iter:
        oechem.OEAddExplicitHydrogens(mol, current_atom)

def load_microstate_dictionary(sampl_id: str, microstate_folder: str= "./microstates/"):
    """Load a dictionary of microstates as defined by SAMPL from the microstate files.

    Parameters
    ----------
    sampl_id - the identifier for the molecule in sampl, e.g. SM12
    microstate_folder - folder containing the ``<sampl_id>_microstates.csv`` files

    Returns
    -------
    dict - key : microstate_id, value : oe_mol (OEGraphMol with explicit hydrogens)

    Raises
    ------
    ValueError - if no microstate file for ``sampl_id`` exists in ``microstate_folder``
    """
    import warnings

    microstates = dict()

    # Locate the file containing state definitions
    filename = "{}_microstates.csv".format(sampl_id)
    full_path = os.path.join(microstate_folder, filename)
    if not Path(full_path).is_file():
        raise ValueError("No microstate definitions were found for molecule {}. Check for typos.".format(sampl_id))

    # SAMPL6 CSV files are not OpenEye compatible, so instead we read the csv line by line and translate the smiles.
    # newline='' is what the csv module asks for so that it can handle line endings itself.
    with open(full_path, 'r', newline='') as csvfile:
        csvreader = csv.DictReader(csvfile, delimiter=',', quotechar='"')

        for row in csvreader:
            key = row['microstate ID']
            smiles = row['canonical isomeric SMILES']
            val = oechem.OEGraphMol()
            # OESmilesToMol reports failure through its return value; surface it
            # instead of silently storing a molecule that was not parsed properly.
            # The entry is still stored so existing results are unchanged.
            if not oechem.OESmilesToMol(val, smiles):
                warnings.warn(
                    "Could not parse SMILES {!r} for microstate {} of {}.".format(smiles, key, sampl_id)
                )
            add_h(val)
            microstates[key] = val

    return microstates

### Loading results from Epik

This function loads the results for a single SAMPL6 molecule at a single pH. By default it uses
the `output/microscopic` folder, which contains pKa predictions between 2.0 and 12.0 for states with a population of > $e^{-10 RT}$.

In [3]:
def load_sdf(sampl_id:str, pH_str: str, epik_output_folder:str="./output/microscopic/") -> list:
    """Load a single result from Epik files using SAMPL ID, string of the pH with 1 decimal point.

    The file read is ``<epik_output_folder>/<sampl_id>/<sampl_id>.epik.<pH_str>.sdf``.

    Returns
    -------
    list - OEMols with SD tags containing epik information, every OEMol represents one microstate

    Raises
    ------
    ValueError - if the expected SDF file does not exist
    IOError - if the file cannot be opened, or its format does not carry SD data
    """
    # Locate the file containing the Epik states
    filename = "{}.epik.{}.sdf".format(sampl_id, pH_str)
    full_path = os.path.join(epik_output_folder, sampl_id, filename)
    if not Path(full_path).is_file():
        raise ValueError("No result file was found for molecule {} at pH {}. Check for typos.".format(sampl_id, pH_str))

    ifs = oechem.oemolistream()
    if not ifs.open(full_path):
        raise IOError("Could not read input file.")

    # From here on the stream is open: close it on every exit path, including
    # the "No SD data" error below, which used to leave it open.
    try:
        if not oechem.OEIsSDDataFormat(ifs.GetFormat()):
            raise IOError("No SD data")

        # Need graphmols for SDF loading. Each molecule is copied before it is
        # stored, so the list holds its own objects rather than the ones
        # handed out by the stream iterator.
        return [oechem.OEGraphMol(mol) for mol in ifs.GetOEGraphMols()]
    finally:
        ifs.close()

def delete_atom(mol: oechem.OEGraphMol, atom_index: int):
    """Remove one atom from ``mol`` and return that same molecule.

    ``atom_index`` is the position of the atom in the order produced by
    ``mol.GetAtoms()``. The molecule is modified in place.
    """
    atoms = list(mol.GetAtoms())
    mol.DeleteAtom(atoms[atom_index])
    return mol

def protonate_atom(mol: oechem.OEGraphMol, parent_index: int):
    """Attach a new hydrogen to one atom of ``mol`` and return that same molecule.

    ``parent_index`` is the position of the receiving atom in the order
    produced by ``mol.GetAtoms()``. The molecule is modified in place; only an
    atom and a bond are added, formal charges are not touched.
    """
    heavy_atom = list(mol.GetAtoms())[parent_index]
    new_hydrogen = mol.NewAtom(1)  # atomic number 1
    mol.NewBond(heavy_atom, new_hydrogen)
    return mol


def match_subset(pattern: oechem.OEGraphMol, target:oechem.OEGraphMol):
    """Search ``target`` for the substructure query built from ``pattern``.

    Returns the result of a single-match search, i.e. whether ``pattern`` is
    contained in ``target``. ``target`` is passed through ``OEPrepareSearch``
    before the search is run.
    """
    search = oechem.OESubSearch(
        pattern,
        # Atoms are equal if they have the same atomic number
        # (so explicit hydrogens are needed as well for a match)
        oechem.OEExprOpts_AtomicNumber,
        # Single and double bonds are considered identical (resonance, chirality fix)
        oechem.OEExprOpts_EqSingleDouble,
    )
    oechem.OEPrepareSearch(target, search)
    return search.SingleMatch(target)

def match_microstates(mol1, mol2):
    """Two states are the same when each one is contained in the other."""
    forward = match_subset(mol1, mol2)
    if not forward:
        # No need to run the reverse search once one direction has failed.
        return forward
    return match_subset(mol2, mol1)

# Self-test of the microstate matching
#
# mol1: benzene written with the electrons shifted (charge-separated form)
# mol2: regular benzene
# mol3: the charge-separated form with one hydrogen relocated
#       (two hydrogens on the anionic carbon, none on the cationic one)
mol1, mol2, mol3 = oechem.OEMol(), oechem.OEMol(), oechem.OEMol()
for _test_mol, _test_smiles in (
    (mol1, "C1=C[CH-]=CC=[CH+]1"),
    (mol2, "c1ccccc1"),
    (mol3, "C1=C[CH2-]=CC=[C+]1"),
):
    oechem.OESmilesToMol(_test_mol, _test_smiles)
    add_h(_test_mol)

# The two drawings of benzene must be recognised as one state, in either order.
if not match_microstates(mol1, mol2):
    raise RuntimeError("These should be resonance structures")
if not match_microstates(mol2, mol1):
    raise RuntimeError("Order should not matter.")

# Moving a hydrogen gives a different state, whichever benzene it is compared to.
if match_microstates(mol1, mol3):
    raise RuntimeError("These should not be equal states.")
if match_microstates(mol2, mol3):
    raise RuntimeError("These should not be equal states either.")
    
def map_result(microstates: dict, molecule: oechem.OEGraphMol):
    """Return the ID of the first microstate that matches ``molecule``.

    ``microstates`` maps microstate IDs to molecules and is searched in its
    iteration order. The string "Unmatched" is returned when nothing matches.
    """
    matching_ids = (
        state_id
        for state_id, reference in microstates.items()
        if match_microstates(molecule, reference)
    )
    # State was out of SAMPL (pun intended)
    return next(matching_ids, "Unmatched")

def load_pkas(sampl_id:str, pH_str: str, epik_output_folder:str="./output/microscopic/") -> pd.DataFrame:
    """Load the pKa values from a single Epik result file.

    Reads ``<epik_output_folder>/<sampl_id>/<sampl_id>.epik.<pH_str>.prop_atoms.csv``
    together with the matching SDF (via ``load_sdf``) and the SAMPL microstate
    definitions (via ``load_microstate_dictionary`` with its default folder).
    For every atom row that carries a pKa, the protonated and the deprotonated
    form of the state are built and each is mapped to a SAMPL microstate ID.

    Parameters
    ----------
    sampl_id - the identifier for the molecule in sampl, e.g. SM12
    pH_str - the pH as it appears in the Epik file names
    epik_output_folder - root folder of the Epik output

    Returns
    -------
    pd.DataFrame with pKa values and atoms. Columns: ``index`` (the pre-reset
    row index), ``Microstate``, ``Atom``, ``AtomicNumber``, ``pKa`` and
    ``Uncertainty`` (both formatted as strings with two decimals), ``Acid?``,
    ``Protonated``, ``Deprotonated``, ``Protonated_ID``, ``Deprotonated_ID``.
    The ``*_ID`` columns hold "Unmatched" when no SAMPL microstate matches.

    Raises
    ------
    ValueError - if the ``prop_atoms.csv`` file does not exist; errors from
    ``load_sdf`` and ``load_microstate_dictionary`` propagate unchanged.
    """
    # molecular structures
    mols = load_sdf(sampl_id, pH_str, epik_output_folder)
    microstates = load_microstate_dictionary(sampl_id)
    # Locate the file containing the per-atom Epik properties
    filename = "{}.epik.{}.prop_atoms.csv".format(sampl_id, pH_str)
    full_path = os.path.join(epik_output_folder, sampl_id, filename)
    mypath = Path(full_path)
    if not mypath.is_file():
        raise ValueError("No result file was found for molecule {} at pH {}. Check for typos.".format(sampl_id, pH_str))
    df = pd.read_csv(mypath)
    # Keep only the atoms for which a pKa is reported
    df = df.dropna(subset=['r_epik_H2O_pKa'])
    df = df[['Structure', 'Atom', 'i_m_atomic_number', 'r_epik_H2O_pKa', 'r_epik_H2O_pKa_uncertainty']]
    df.columns = ['Microstate', 'Atom', 'AtomicNumber', 'pKa', 'Uncertainty']
    # make states and atoms 0 indexed
    # (they are used below as positions in `mols` and in each molecule's atom list;
    #  assumes the CSV numbers both from 1 in the same order as the SDF — TODO confirm)
    df['Microstate'] = df['Microstate'].apply(lambda state: state -1)
    df['Atom'] = df['Atom'].apply(lambda atom: atom -1)
    # pKa and uncertainty become two-decimal strings from here on
    df['pKa'] = df['pKa'].apply(lambda pka: "{:.2f}".format(pka))
    df['Uncertainty'] = df['Uncertainty'].apply(lambda u: "{:.2f}".format(u))

    # Check if pKa is for acid or base
    # A row whose atom is a hydrogen (atomic number 1) is treated as an acid.
    df['Acid?'] = df['AtomicNumber'].apply(lambda num: True if num == 1 else False)
    # Start both forms as independent copies of the state's molecule
    df['Protonated'] = df['Microstate'].apply(lambda state: oechem.OEGraphMol(mols[state]))
    df['Deprotonated'] = df['Microstate'].apply(lambda state: oechem.OEGraphMol(mols[state]))
    # Acid rows: the deprotonated form is the state with that hydrogen removed.
    # Other rows: the protonated form is the state with a hydrogen added to that atom.
    # NOTE(review): delete_atom / protonate_atom change the molecule in place, so
    # these two apply calls rely on the function running exactly once per row — confirm
    # for the pandas version in use.
    df['Deprotonated'] = df.apply(lambda row: delete_atom(row['Deprotonated'], row['Atom']) if row['Acid?'] else row['Deprotonated'], axis=1)
    df['Protonated'] = df.apply(lambda row: protonate_atom(row['Protonated'], row['Atom']) if not row['Acid?'] else row['Protonated'], axis=1)
    # Translate both forms into SAMPL microstate IDs ("Unmatched" if there is none)
    df['Protonated_ID'] = df.apply(lambda row: map_result(microstates, row['Protonated']) , axis=1)
    df['Deprotonated_ID'] = df.apply(lambda row: map_result(microstates, row['Deprotonated']), axis=1)

    # reset_index() without drop=True keeps the old row labels as an 'index' column
    return df.reset_index()


In [5]:
# Gather the pKa tables for all 24 SAMPL6 molecules at every pH from 2.0 to 12.0
# (steps of 0.1). The per-file frames are collected in a list and concatenated
# once at the end: growing a DataFrame with .append inside the loop copies the
# whole table on every iteration, and DataFrame.append no longer exists in
# pandas >= 2.0.
pka_frames = []
for molecule in range(24):
    SAMPL_ID = "SM{:02d}".format(molecule+1)
    print(SAMPL_ID)
    # Fails early if the microstate definitions for this molecule are missing.
    microstates = load_microstate_dictionary(SAMPL_ID)
    for pH in np.linspace(2.0, 12.0, 101):
        # Progress marker at every even pH
        if pH % 2.0 == 0:
            print(pH)
        pH_str = '{:.1f}'.format(pH)
        results = load_pkas(SAMPL_ID, pH_str)
        pka_frames.append(results)

full_df = pd.concat(pka_frames, ignore_index=True)

    


SM01
2.0
4.0
6.0
8.0
10.0
12.0
SM02
2.0
4.0
6.0
8.0
10.0
12.0
SM03
2.0
4.0
6.0
8.0
10.0
12.0
SM04
2.0
4.0
6.0
8.0
10.0
12.0
SM05
2.0
4.0
6.0
8.0
10.0
12.0
SM06
2.0
4.0
6.0
8.0
10.0
12.0
SM07
2.0
4.0
6.0
8.0
10.0
12.0
SM08
2.0
4.0
6.0
8.0
10.0
12.0
SM09
2.0
4.0
6.0
8.0
10.0
12.0
SM10
2.0
4.0
6.0
8.0
10.0
12.0
SM11
2.0
4.0
6.0
8.0
10.0
12.0
SM12
2.0
4.0
6.0
8.0
10.0
12.0
SM13
2.0
4.0
6.0
8.0
10.0
12.0
SM14
2.0
4.0
6.0
8.0
10.0
12.0
SM15
2.0
4.0
6.0
8.0
10.0
12.0
SM16
2.0
4.0
6.0
8.0
10.0
12.0
SM17
2.0
4.0
6.0
8.0
10.0
12.0
SM18
2.0
4.0
6.0
8.0
10.0
12.0
SM19
2.0
4.0
6.0
8.0
10.0
12.0
SM20
2.0
4.0
6.0
8.0
10.0
12.0
SM21
2.0
4.0
6.0
8.0
10.0
12.0
SM22
2.0
4.0
6.0
8.0
10.0
12.0
SM23
2.0
4.0
6.0
8.0
10.0
12.0
SM24
2.0
4.0
6.0
8.0
10.0
12.0


In [22]:
# Keep only rows where both the protonated and the deprotonated form were matched.
# Most unmatched entries were states that were too exotic (pentavalent nitrogen),
# or had a pKa outside of a reasonable range to have been included in SAMPL6.
filtered_df = full_df[
    (full_df["Protonated_ID"] != "Unmatched")
    & (full_df["Deprotonated_ID"] != "Unmatched")
]


In [23]:
# The same transition is reported at many pH values; keep the first occurrence
# of each (protonated state, deprotonated state, pKa) combination.
filtered_df = filtered_df.drop_duplicates(
    subset=["Protonated_ID", "Deprotonated_ID", "pKa"],
    keep="first",
)
# Columns that make up the type I table
typeI = filtered_df.loc[:, ["Protonated_ID", "Deprotonated_ID", "pKa", "Uncertainty"]]

In [24]:
# Pass the path rather than an open() handle: pandas then opens and closes the
# file itself, so the data is flushed to disk and no file handle is left open.
typeI.to_csv("typeI-raw-microscopic.csv", index=False)