## Substituent Polarity 

In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem.rdMMPA import FragmentMol
from operator import itemgetter
import useful_rdkit_utils as uru
import mols2grid
from tqdm.auto import tqdm

In [4]:
# Make the scaffold_finder helper module importable.
# Option 1 (disabled): download scaffold_finder.py into the working directory.
# import requests
# lib_file = requests.get("https://raw.githubusercontent.com/PatWalters/practical_cheminformatics_tutorials/main/sar_analysis/scaffold_finder.py")
# ofs = open("scaffold_finder.py","w")
# print(lib_file.text,file=ofs)
# ofs.close()
# Option 2 (active): add a local checkout of the tutorials repository to the import path.
# NOTE(review): the absolute path below is machine-specific; the import will fail on
# other machines unless the path is adjusted or Option 1 is re-enabled.
import sys
sys.path.append('/home/dom/Projects/Playground/practical_cheminformatics_tutorials/sar_analysis')
# NOTE(review): neither imported name is used in the cells visible here - confirm they are still needed.
from scaffold_finder import generate_fragments, cleanup_fragment

Useful utility functions

In [5]:
def remove_map_nums(mol):
    """
    Set the atom map number of every atom in a molecule to 0.
    The molecule is modified in place and nothing is returned.

    :param mol: object exposing GetAtoms(), whose atoms expose SetAtomMapNum()
    """
    for atom in mol.GetAtoms():
        atom.SetAtomMapNum(0)

def sort_fragments(mol):
    """
    Transform a molecule with multiple fragments into a list of molecules that is sorted by number of atoms
    from largest to smallest. Atom map numbers are reset to 0 on every fragment.

    :param mol: molecule that may consist of several disconnected fragments
    :return: list of fragment molecules, largest first; fragments with the same number of atoms
        keep the order in which Chem.GetMolFrags returned them
    """
    frag_list = list(Chem.GetMolFrags(mol, asMols=True))
    # plain loop: remove_map_nums is called only for its side effect on each fragment
    for frag in frag_list:
        remove_map_nums(frag)
    # list.sort is stable, also with reverse=True, so ties keep their original relative order
    frag_list.sort(key=lambda frag: frag.GetNumAtoms(), reverse=True)
    return frag_list

def smi2logp(smi):
    """
    Parse a SMILES string and return the result of uru.MolLogP for the parsed molecule.

    :param smi: SMILES string
    :return: value returned by uru.MolLogP, or NaN when the SMILES cannot be parsed
    """
    mol = Chem.MolFromSmiles(smi)
    # MolFromSmiles signals a parse failure by returning None instead of raising;
    # return NaN so a single bad SMILES does not abort a DataFrame.apply over many rows
    if mol is None:
        return float("nan")
    return uru.MolLogP(mol)

### Read Input
Read the ChEMBL drug list, a space-separated file with a SMILES string and a name on each line, into a dataframe.

In [7]:
# The input file has no header row: each line holds a SMILES and a name separated by a space
infile_url = "https://raw.githubusercontent.com/PatWalters/datafiles/main/chembl_drugs.smi"
drug_df = pd.read_csv(
    infile_url,
    sep=" ",
    names=["SMILES", "Name"],
)

Add an RDKit molecule to the dataframe, then keep only the largest fragment of each molecule.

In [8]:
# Parse every SMILES, then pass each parsed molecule through uru.get_largest_fragment
drug_df["mol"] = (
    drug_df["SMILES"]
    .apply(Chem.MolFromSmiles)
    .apply(uru.get_largest_fragment)
)

### Generate Scaffold-Substituent Pairs
Iterate over the molecules in the dataframe and transform each one into scaffold-substituent pairs by making a single cut at a time.

In [9]:
row_list = []
# Select the columns by name rather than unpacking drug_df.values, which silently
# depends on the positional order (and number) of the dataframe's columns.
# zip() has no length, so pass total= to keep the progress bar's total count.
for smiles, name, mol in tqdm(
    zip(drug_df["SMILES"], drug_df["Name"], drug_df["mol"]),
    total=len(drug_df),
):
    frag_list = FragmentMol(mol, maxCuts=1)
    # only the second element of each FragmentMol result is used
    for _, frag_mol in frag_list:
        # sort_fragments returns the pieces largest first, so the larger piece lands in
        # the "Core" column and the smaller one in "R_group"
        pair_list = sort_fragments(frag_mol)
        row_list.append([smiles] + [Chem.MolToSmiles(x) for x in pair_list] + [name])
row_df = pd.DataFrame(row_list, columns=["SMILES", "Core", "R_group", "Name"])
row_df

  0%|          | 0/1203 [00:00<?, ?it/s]

Unnamed: 0,SMILES,Core,R_group,Name
0,Nc1ccc(S(=O)(=O)Nc2ccccn2)cc1,*c1ccc(S(=O)(=O)Nc2ccccn2)cc1,*N,CHEMBL700
1,Nc1ccc(S(=O)(=O)Nc2ccccn2)cc1,*S(=O)(=O)Nc1ccccn1,*c1ccc(N)cc1,CHEMBL700
2,Nc1ccc(S(=O)(=O)Nc2ccccn2)cc1,*NS(=O)(=O)c1ccc(N)cc1,*c1ccccn1,CHEMBL700
3,CCC(C)C1(CC)C(=O)[N-]C(=O)NC1=O.[Na+],*CC(C)C1(CC)C(=O)[N-]C(=O)NC1=O,*C,CHEMBL1200982
4,CCC(C)C1(CC)C(=O)[N-]C(=O)NC1=O.[Na+],*C(C)C1(CC)C(=O)[N-]C(=O)NC1=O,*CC,CHEMBL1200982
...,...,...,...,...
8104,CONC(=O)Nc1ccc(-c2sc3c(c2CN(C)C)c(=O)n(-c2ccc(...,*c1ccc(-n2c(=O)c3c(CN(C)C)c(-c4ccc(NC(=O)NOC)c...,*OC,CHEMBL1800159
8105,CONC(=O)Nc1ccc(-c2sc3c(c2CN(C)C)c(=O)n(-c2ccc(...,*Oc1ccc(-n2c(=O)c3c(CN(C)C)c(-c4ccc(NC(=O)NOC)...,*C,CHEMBL1800159
8106,CONC(=O)Nc1ccc(-c2sc3c(c2CN(C)C)c(=O)n(-c2ccc(...,*n1c(=O)n(-c2ccc(OC)nn2)c(=O)c2c(CN(C)C)c(-c3c...,*Cc1c(F)cccc1F,CHEMBL1800159
8107,CONC(=O)Nc1ccc(-c2sc3c(c2CN(C)C)c(=O)n(-c2ccc(...,*Cn1c(=O)n(-c2ccc(OC)nn2)c(=O)c2c(CN(C)C)c(-c3...,*c1c(F)cccc1F,CHEMBL1800159


Create a dataframe with substituents and number of occurrences. Change **min_occurrences** below to a smaller value to increase the number of substituents shown. 

In [12]:
# Minimum number of occurrences a substituent needs in order to be shown
min_occurrences = 20
# One row per distinct R-group SMILES with the number of times it occurs
substituent_df = (
    row_df["R_group"]
    .value_counts()
    .rename_axis("SMILES")
    .reset_index(name="Count")
)
substituent_df["LogP"] = substituent_df["SMILES"].apply(smi2logp)
# Keep the frequent substituents and order them by LogP, lowest first
frequent_substituent_df = (
    substituent_df
    .loc[substituent_df["Count"] >= min_occurrences]
    .sort_values("LogP")
)

Display the substituents in a grid. 

In [13]:
# Show the frequent substituents in a grid with the img, LogP and Count fields
mols2grid.display(
    frequent_substituent_df,
    subset=["img", "LogP", "Count"],
    # format LogP with two decimal places
    transform={"LogP": lambda x: f"{x:.2f}"},
    selection=False,
    # CSS rule that hides elements with the data-mols2grid-id-display class
    custom_css=""".data-mols2grid-id-display { display: none }""",
)