### Notebook for making images with RDKit

In [1]:
import requests
import svgutils.transform as sg

import rdkit
from rdkit import Chem
from IPython.display import SVG
from rdkit.Chem import ChemicalFeatures
from rdkit import RDConfig
from rdkit.Chem import Draw
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem import rdDepictor
rdDepictor.SetPreferCoordGen(True)
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Descriptors
from rdkit.Chem.Descriptors import MolWt,NumRotatableBonds,HeavyAtomCount
from rdkit.Chem import PandasTools
from rdkit import RDLogger  

from tdc.single_pred import HTS
from tqdm import tqdm
tqdm.pandas()

print(rdkit.__version__)

2021.09.4


#### some helper functions

In [2]:
def neutralize_atoms(smi):
    #RDLogger.DisableLog('rdApp*')                                                                                                                                                       
    mol = Chem.MolFromSmiles(smi)
    pattern = Chem.MolFromSmarts("[+1!h0!$([*]~[-1,-2,-3,-4]),-1!$([*]~[+1,+2,+3,+4])]")
    at_matches = mol.GetSubstructMatches(pattern)
    at_matches_list = [y[0] for y in at_matches]
    if len(at_matches_list) > 0:
        for at_idx in at_matches_list:
            atom = mol.GetAtomWithIdx(at_idx)
            chg = atom.GetFormalCharge()
            hcount = atom.GetTotalNumHs()
            atom.SetFormalCharge(0)
            atom.SetNumExplicitHs(hcount - chg)
            atom.UpdatePropertyCache()
    #RDLogger.DisableLog('rdApp*')                                                                                                                                                       
    smi = Chem.MolToSmiles(mol)
    return smi


def stripSalts(smi):
    smi_longest = smi.split(".")
    return max(smi_longest, key=len)


def calc_3_descriptors(smi):
    mol = Chem.MolFromSmiles(smi)
    if mol:
        mw,rotors,hvys = [x(mol) for x in [MolWt, NumRotatableBonds, HeavyAtomCount]]
        res = [mw,rotors,hvys]
    else:
        res = [None] * 3
    return res


def commonAtoms(smi):
    commonAtomNumbers = [1,6,7,8,9,15,16,17,35,53]
    mol = Chem.MolFromSmiles(smi)
    res = True
    if mol:
        for atm in mol.GetAtoms():
            if not atm.GetAtomicNum() in commonAtomNumbers:
                res = False
                break
    else:
        res = False
    return res


def getMolListFromDataFrame(df, prop_name):
    molObjList = []    
    for ind, row in df.iterrows():
        mol = Chem.MolFromSmiles(row['SMILES'])
        for prop_name in [s for s in row.index if s !="SMILES"]: # all other cols except smiles 
            mol.SetProp(prop_name,str(row[prop_name])) # only str allowed in mol props
        molObjList.append(mol)
    return molObjList   

### Retrieve the HIV data set from TD commons and store in data frame
(https://tdcommons.ai/single_pred_tasks/hts/#hiv)

In [3]:
data = HTS(name = 'HIV')
df = data.get_data()
#df.drop(['Drug'], axis=1, inplace=True)
df.rename(columns={'Drug_ID':'MolName','Drug':'SMILES','Y':'HIV_active'},inplace=True)

Found local copy...
Loading...
Done!


In [4]:
df["HIV_active"].value_counts()[1]



1443

In [5]:
# Silence warning messages

# add column flagging compounds with uncommon atom types
numCpdsBefore = df.shape[0]

# remove compounds with uncommon atoms types
df['OnlyCommonAtms'] = df.SMILES.progress_apply(commonAtoms)
df = df.loc[(df['OnlyCommonAtms'] == True)]

# removing salts, cleaning up SMILES
df['SMILES'] = df.SMILES.progress_apply(stripSalts) 

# neutralize
df['SMILES'] = df.SMILES.progress_apply(neutralize_atoms) # clean SMILES

# calculate and filter on three simple descriptors (RDkit)
df['desc'] = df.SMILES.progress_apply(calc_3_descriptors)
desc_cols = ['MW','Rotors','Heavys']
df[desc_cols] = df.desc.to_list()
df.drop("desc",axis=1,inplace=True)

# filter on simple properties
df = df.loc[(df['Heavys'] > 5) & 
            (df['Heavys'] < 50) &
            (df['Rotors'] < 18) &
            (df['MW'] > 100) &
            (df['MW'] < 900) #&
            #(df['OnlyCommonAtms'] == True)
            ]

# drop columns from dataframe
df.drop(["OnlyCommonAtms",'Heavys','MW','Rotors'], axis=1,inplace=True)

numCpdsAfter = df.shape[0]

print(f"Starting with {numCpdsBefore} compounds. This many remaining {numCpdsAfter} after filtering.")

100%|██████████| 41127/41127 [00:07<00:00, 5429.43it/s]
100%|██████████| 39227/39227 [00:00<00:00, 1075781.11it/s]
  1%|▏         | 521/39227 [00:00<00:08, 4354.28it/s]


KeyboardInterrupt: 

## Sort the HIV actives and inactives into separate dateframes and lists

In [None]:
df_inactives = df.loc[(df['HIV_active'] == 0)]
inactivesList = getMolListFromDataFrame(df_inactives, "MolName")

df_actives = df.loc[(df['HIV_active'] == 1)]
activesList = getMolListFromDataFrame(df_actives, "MolName")

print(f"Number of actives: {len(activesList)} and inactives: {len(inactivesList)}")

Number of actives: 1088 and inactives: 36554


### export images as png files 
(saved in ./images folder)

In [None]:
path = "MolFromSmilesImages/"
for active in activesList:
    molname = active.GetProp("MolName")
    molname = molname.replace(" ", "_")
    filename = path + molname + "_Active.png"
    Chem.Draw.MolToFile(active, filename, size=(200,150))#, kekulize=True, wedgeBonds=True, imageType=None, fitImage=False, options=None)
print('Done')

Done


In [None]:
# and the inactives
for inactive in inactivesList:
    molname = inactive.GetProp("MolName")
    molname = molname.replace(" ", "_")
    filename = path + molname + "_Inactive.png"
    Chem.Draw.MolToFile(inactive, filename, size=(200,150))#, kekulize=True, wedgeBonds=True, imageType=None, fitImage=False, options=None)
print('Done')


Done


In [6]:
import deepchem as dc
model = ChemCeption(img_spec: str = 'std', img_size: int = 80, base_filters: int = 16, inception_blocks: Dict = {'A': 3, 'B': 3, 'C': 3}, n_tasks: int = 10, n_classes: int = 2, augment: bool = False, mode: str = 'regression', **kwargs)

SyntaxError: invalid syntax (Temp/ipykernel_17372/1781818280.py, line 2)

In [18]:
import numpy as np
from collections import Counter

lst = [1,1,1,0,0]
arr = np.array(lst)


class_list = cs_data_balance(arr)
class_list

lst = [[1,0],[1,0],[1,0],[0,1],[0,1]]
arr = np.array(lst)

arr[class_list]


array([[1, 0],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [1, 0]])

In [8]:
def cs_data_balance(class_list):
    # Count how many samples for each class is present
    counts = Counter(class_list)

    # Determine max class and count
    maxclass, maxcount = Counter(class_list).most_common(1)[0]

    # Construct separate list of each class to match max class and concat to single list
    index_lists = []
    for key in counts.keys():
        tmp_list = [i for i in range(len(class_list)) if class_list[i] == key]
        index_lists.append(tmp_list)
        # Oversample non-max class until max count is reached
        if len(tmp_list) < maxcount:
            index_lists.append(np.random.choice(tmp_list, size=maxcount - len(tmp_list)))  # , replace=True))
    index_list = np.concatenate(index_lists)
    np.random.shuffle(index_list)

    return index_list