In [1]:
import rdkit
from rdkit import Chem
from rdkit.Chem import rdFMCS
import pandas as pd

In [None]:
#identify the subtructures of interest:
#random smiles list
smiles_list=["Fc1cc(F)c(F)c(CCOc2cncc3nnc(-c4ccc(Cl)cc4)n23)c1", 
             "Fc1ccc(C(F)(F)COc2cncc3nnc(-c4ccc(Cl)cc4)n23)cc1", 
             "FC(F)(COc1cncc2nnc(-c3ccc(Cl)cc3)n12)c1ccccc1", 
             "N#Cc1cccc(CCOc2cncc3nnc(-c4ccc(C(F)(F)F)cc4)n23)c1",
             "FC(F)Oc1ccc(-c2nnc3cncc(OCCc4cccc(C(F)(F)F)c4)n23)cc1",
             "Fc1ccc(-c2nnc3cncc(OCCc4ccc5ccccc5c4)n23)cc1F",
             "COc1cc(-c2nnc3cncc(OCCc4cccc(F)c4)n23)ccc1OC(F)F",
             "FC(F)(COc1cncc2nnc(-c3ccc(Cl)cc3Cl)n12)c1ccccc1",
             "FC(F)(F)Oc1ccc(-c2nnc3cncc(OCCc4ccc(C(F)(F)F)nc4)n23)cc1",
             "FC(F)(F)Oc1ccc(-c2nnc3cncc(OCC4CCCO4)n23)cc1"
            ]

mols=[]
for smile in smiles_list:
    mol=Chem.MolFromSmarts(smile)
    mols += [mol]

#use rdFMCS to identify maximum common substructure on the above molecules (to get variable benzene bonds) 
rdFMCS.FindMCS(mols).smartsString

In [None]:
#includes aspirin and core substructres to check the patterns
smiles_list=["Fc1cc(F)c(F)c(CCOc2cncc3nnc(-c4ccc(Cl)cc4)n23)c1", 
             "Fc1ccc(C(F)(F)COc2cncc3nnc(-c4ccc(Cl)cc4)n23)cc1", 
             "FC(F)(COc1cncc2nnc(-c3ccc(Cl)cc3)n12)c1ccccc1", 
             "N#Cc1cccc(CCOc2cncc3nnc(-c4ccc(C(F)(F)F)cc4)n23)c1",
             "FC(F)Oc1ccc(-c2nnc3cncc(OCCc4cccc(C(F)(F)F)c4)n23)cc1",
             "Fc1ccc(-c2nnc3cncc(OCCc4ccc5ccccc5c4)n23)cc1F",
             "COc1cc(-c2nnc3cncc(OCCc4cccc(F)c4)n23)ccc1OC(F)F",
             "FC(F)(COc1cncc2nnc(-c3ccc(Cl)cc3Cl)n12)c1ccccc1",
             "FC(F)(F)Oc1ccc(-c2nnc3cncc(OCCc4ccc(C(F)(F)F)nc4)n23)cc1",
             "FC(F)(F)Oc1ccc(-c2nnc3cncc(OCC4CCCO4)n23)cc1",
             "CC(=O)OC1=CC=CC=C1C(=O)O",
             "*c1cncc2nnc(C3=CC=CC=C3)n12",
             "CC1=CC=C(C=C1)c1nnc2cncc(*)n12"
            ]

#substructures of interest:
#max substructure identified in previous step
pattern_maxsubstr = "[#6](-[#8]-[#6]1:[#6]:[#7]:[#6]:[#6]2:[#7]:1:[#6](:[#7]:[#7]:2)-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1)-[#6]-[#6]:,-[#6]:,-[#6]"
#Right Hand Side substituent with phenyl ring (any atom on left substituent)
pattern_phenyl = "[*]-c1cncc2nnc(-c3ccccc3)n12"
#RHS substituent with phenyl ring (any atom on left substituent) and para substituent on the RHS phenyl
pattern_phenyl_para = "[*]-c1ccc(cc1)-c1nnc2cncc(-[*])n12"

def search_structure(pattern):
    for idx,smiles in enumerate(smiles_list):
        m = Chem.MolFromSmarts(smiles)
        print("Structure {}: pattern found {}".format(idx+1,m.HasSubstructMatch(pattern)))

search_structure(Chem.MolFromSmarts(pattern_maxsubstr))
search_structure(Chem.MolFromSmarts(pattern_phenyl))
search_structure(Chem.MolFromSmarts(pattern_phenyl_para))

In [None]:
#smiles list with chosen substructure on the RHS:
smiles_list_chosen=[
             "Fc1cc(F)c(F)c(CCOc2cncc3nnc(-c4ccc(Cl)cc4)n23)c1", #para EOS-0-00111
             "CCn1cc(-c2ccnc(-c3nnc4cncc(OCc5ccc(F)c(F)c5F)n34)c2)cn1", #meta left heteroatom p2   
             "COc1cc(-c2nnc3cncc(OCCc4cccc(F)c4)n23)ccc1OC(F)F", #meta and para right EOS-0-00650
             "Fc1ccc(-c2ccc(-c3nnc4cncc(OCc5cccnn5)n34)cc2Cl)c(F)n1", #meta and para left EOS-2-04413
             "N#Cc1ccc(-c2nnc3cncc(Oc4ncnc(-c5ccc(F)c(F)c5)n4)n23)c(F)c1", #ortho and para right EOS-08055
             "Cc1cc(-c2nnco2)cc(F)c1Oc1cncc2nnc(-c3ccc(F)c(F)c3F)n12", #ortho meta para right EOS-2-11835
             "Fc1ccc(CCOc2cncc3nnc(-c4ccc5ccc(F)c(F)c5c4)n23)cn1", #naphtalene EOS-0-01293
             "Cc1ncc(-c2cnc(-c3nnc4cncc(OCc5ccc(F)c(F)c5F)n34)cn2)cn1", #para heteroatoms p2,5 EOS-2-00811
             "Fc1ccncc1-c1cncc(-c2nnc3cncc(OCCc4cncc(Cl)n4)n23)c1", #meta R heteroatom p2, EOS-2-03026
             "Cc1ccc(-c2cc(-c3nnc4cncc(OCc5ccc(F)c(Cl)c5)n34)[nH]n2)c(F)n1" #not benzene
            ]

#substructures of interest:
#Right Hand Side substituent with phenyl ring containing heteroatoms
pattern_heteroaryl = "*-c1cncc2nnc(n12)-[*]:1:[*]:[*]:[*]:[*]:[*]:1"

#RHS substituent with phenyl ring (no heteroatoms)
pattern_phenyl = "[*]-c1cncc2nnc(-c3ccccc3)n12"

#RHS substituent with phenyl ring (with heteroatoms) and para substituent on the RHS phenyl
pattern_heteroaryl_para = "*-c1cncc2nnc(n12)-[*]:1:[*]:c:[*](-[*]):[*]:[*]:1"

#RHS substituent with phenyl ring (with heteroatoms) and para substituent on the RHS phenyl
pattern_heteroaryl_meta = "*-c1cncc2nnc(n12)-[*]:1:[*]:[*]:[*]:[*](-[*]):[*]:1"

#RHS substituent with phenyl ring (with heteroatoms) and para substituent on the RHS phenyl
pattern_heteroaryl_orto = "*-c1cncc2nnc(n12)-[*]:1:[*]:[*]:[*]:[*]:[*]:1-[*]"

#RHS substituent with a napthalene 
pattern_naphthalene = "[*]-c1cncc2nnc(-c3ccc4ccccc4c3)n12"

def search_structure(pattern):
    for idx,smiles in enumerate(smiles_list_chosen):
        m = Chem.MolFromSmarts(smiles)
        print("Structure {}: pattern found {}".format(idx+1,m.HasSubstructMatch(pattern)))

search_structure(Chem.MolFromSmarts(pattern_naphthalene))

#1 para EOS-0-00111
#2 meta left heteroatom p2   
#3 meta and para right EOS-0-00650
#4 meta and para left EOS-2-04413
#5 ortho and para right EOS-08055
#6 ortho meta para right EOS-2-11835
#7 naphthalene EOS-0-01293
#8 para heteroatoms p2,5 EOS-2-00811
#9 meta R heteroatom p2, EOS-2-03026
#10 not benzene


### Add columns of interest to results excel sheet

In [None]:
df = pd.read_csv("../postprocess/210530_EOSI_OSM_Series4_1000.csv")
smiles=df["Smiles"].tolist()

new_list=[]
def substructure(pattern):
    new_list.clear()
    for smi in smiles:
        mol=Chem.MolFromSmarts(smi)
        substr=mol.HasSubstructMatch(pattern)
        if substr == True:
            value = 1
        else:
            value = 0
        new_list.append(value)
    return new_list

substructure(Chem.MolFromSmarts(pattern_heteroaryl))
df["Heteroaryl"]=new_list
substructure(Chem.MolFromSmarts(pattern_phenyl))
df["Phenyl"]=new_list
substructure(Chem.MolFromSmarts(pattern_heteroaryl_para))
df["Para"]=new_list
substructure(Chem.MolFromSmarts(pattern_heteroaryl_meta))
df["Meta"]=new_list
substructure(Chem.MolFromSmarts(pattern_heteroaryl_orto))
df["Orto"]=new_list


df.to_csv("../postprocess/210530_EOSI_OSM_Series4_1000_substr.csv", index=False)


### Divide smiles according to their substructure

#### Divide Active smiles according to substructure

In [105]:
df = pd.read_csv("../data/OriginalData/series4_processed.csv")
df = df.query("bin_activity != 0")
smiles=df["smiles"].tolist()
print(len(smiles))

new_list=[]
def has_substructure(pattern): #function to identify the smiles with certain substructure patterns
    new_list.clear()
    for smi in smiles:
        mol=Chem.MolFromSmarts(smi)
        substr=mol.HasSubstructMatch(pattern)
        if substr == True:
            new_list.append(smi)
        else:
            pass
    return new_list

def check_repeated(original, selected): #check if we have gotten all smiles without repeating
    notselected=list(set(original).difference(selected))
    allnew=[]
    allnew.extend(selected)
    allnew.extend(notselected)
    if len(original) == len(allnew):
        pass
    else:
        print("smiles do not match")

#patterns of interest
pattern_pyrazineether="[*]-[#8]-c1cncc2nnc(-[*])n12"
pattern_pyrazineamide="[*]-[#7]-[#6](=O)-c1cncc2nnc(-[*])n12"
pattern_pyrazinecarbon="[*]-[#6]-c1cncc2nnc(-[*])n12"
pattern_triazoloaryl_pyrazineether="[*]-[#8]-c1cncc2nnc(-c3ccccc3)n12"
pattern_triazolocyclic_pyrazineether="[*]-[#8]-c1cncc2nnc(-[*]-3-[*]-[*]-[*]-[*]-[*]-3)n12"
pattern_triazoloaryl_pyrazinecarbon="[*]-[#6]-c1cncc2nnc(-c3ccccc3)n12"
pattern_triazolocyclic_pyrazinecarbon="[*]-[#6]-c1cncc2nnc(-[*]-3-[*]-[*]-[*]-[*]-[*]-3)n12"
pattern_triazoloaryl_pyrazineamide="[*]-[#7]-[#6](=O)-c1cncc2nnc(-c3ccccc3)n12"
pattern_triazolocyclic_pyrazineamide="[*]-[#7]-[#6](=O)-c1cncc2nnc(-[*]-3-[*]-[*]-[*]-[*]-[*]-3)n12"


#create three lists for substructure patterns related to the Pyrazine substituents
has_substructure(Chem.MolFromSmarts(pattern_pyrazineether))
with open("../data/OriginalData/pyrazineether_act.txt", 'w') as file_handler:
    for item in new_list:
        file_handler.write("{}\n".format(item))
print(len(new_list))
pyrazineether=[]
pyrazineether.extend(new_list)

has_substructure(Chem.MolFromSmarts(pattern_pyrazineamide))
with open("../data/OriginalData/pyrazineamide_act.txt", 'w') as file_handler:
    for item in new_list:
        file_handler.write("{}\n".format(item))
print(len(new_list))
pyrazineamide=[]
pyrazineamide.extend(new_list)

has_substructure(Chem.MolFromSmarts(pattern_pyrazinecarbon))
with open("../data/OriginalData/pyrazinecarbon_act.txt", 'w') as file_handler:
    for item in new_list:
        file_handler.write("{}\n".format(item))
print(len(new_list))
pyrazinecarbon=[]
pyrazinecarbon.extend(new_list)
#carbon and amide substituents can be mixed, substract the amides from the carbons
pyrazinecarbon_short=list(set(pyrazinecarbon).difference(pyrazineamide))
print(len(pyrazinecarbon_short))

allpyr=[]
allpyr.extend(pyrazineether)
allpyr.extend(pyrazineamide)
allpyr.extend(pyrazinecarbon_short)

check_repeated(smiles, allpyr)

#list of smiles that do not match any of the defined pyrazine substitutents
notselected_pyr=list(set(smiles).difference(allpyr))
with open("../data/OriginalData/notselected_pyr_act.txt", 'w') as file_handler:
    for item in notselected_pyr:
        file_handler.write("{}\n".format(item))


#select smiles that have certain pyrazine and triazolo substituents
has_substructure(Chem.MolFromSmarts(pattern_triazoloaryl_pyrazineether))
with open("../data/OriginalData/triazoloaryl_pyrazineether_act.txt", 'w') as file_handler:
    for item in new_list:
        file_handler.write("{}\n".format(item))
print(len(new_list))
triazoloaryl_pyrazineether=[]
triazoloaryl_pyrazineether.extend(new_list)

has_substructure(Chem.MolFromSmarts(pattern_triazolocyclic_pyrazineether))
with open("../data/OriginalData/triazolocyclic_pyrazineether_act.txt", 'w') as file_handler:
    for item in new_list:
        file_handler.write("{}\n".format(item))
print(len(new_list))
triazolocyclic_pyrazineether=[]
triazolocyclic_pyrazineether.extend(new_list)

has_substructure(Chem.MolFromSmarts(pattern_triazoloaryl_pyrazinecarbon))
print(len(new_list))
triazoloaryl_pyrazinecarbon=[]
triazoloaryl_pyrazinecarbon.extend(new_list)
        
has_substructure(Chem.MolFromSmarts(pattern_triazolocyclic_pyrazinecarbon))
print(len(new_list))
triazolocyclic_pyrazinecarbon=[]
triazolocyclic_pyrazinecarbon.extend(new_list)

#carbon and amide substituents can be mixed, substract the amides from the carbons
triazoloaryl_pyrazinecarbon_short=list(set(triazoloaryl_pyrazinecarbon).difference(triazoloaryl_pyrazineamide))
with open("../data/OriginalData/triazoloaryl_pyrazinecarbon_act.txt", 'w') as file_handler:
    for item in triazoloaryl_pyrazinecarbon_short:
        file_handler.write("{}\n".format(item))
print(len(triazoloaryl_pyrazinecarbon_short))

triazolocyclic_pyrazinecarbon_short=list(set(triazolocyclic_pyrazinecarbon).difference(triazolocyclic_pyrazineamide))
print(len(triazolocyclic_pyrazinecarbon_short))
with open("../data/OriginalData/triazoloaryl_pyrazinecarbon_act.txt", 'w') as file_handler:
    for item in triazolocyclic_pyrazinecarbon_short:
        file_handler.write("{}\n".format(item))

alltry=[]
alltry.extend(triazoloaryl_pyrazineether)
alltry.extend(triazolocyclic_pyrazineether)
alltry.extend(triazoloaryl_pyrazineamide)
alltry.extend(triazolocyclic_pyrazineamide)
alltry.extend(triazoloaryl_pyrazinecarbon_short)
alltry.extend(triazolocyclic_pyrazinecarbon_short)

check_repeated(smiles, alltry)


#list of smiles that do not match any of the defined pyrazine/triazolo substitutents
notselected_try=list(set(smiles).difference(alltry))
with open("../data/OriginalData/notselected_try_act.txt", 'w') as file_handler:
    for item in notselected_try:
        file_handler.write("{}\n".format(item))


162
129
20
24
4
124
1
24
0
4
0
smiles do not match
