In [2]:
from pathlib import Path
TOP = Path.cwd().as_posix().replace('notebooks', '')


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import openpyxl
from rdkit import Chem

In [4]:
raw_dir = Path(TOP) /'data'/'raw'
interim_dir = Path(TOP) /'data'/'interim'
processed_dir = Path(TOP) /'data'/'processed'
external_dir = Path(TOP) /'data'/'external'

In [5]:
def has_metal_atom(smiles):
    metal_dict = {'Na': 11 , 'Mg': 12, 'Si': 14, 'K':19, 'Ca':20, 'Mn':25, 'Fe':26, 'Cu':29, 'Zn':30, 'Co':27, 'Ni':28, 'As': 33, 'Cr':24, 'Hg':80, 'Pb':82, 'V':23, 'Al':13, 'Ag':47, 'Cd':48, 'B':5, 'Ti': 22, 'Se': 34, 'Sn': 50, 'Sb':51, 'Be': 4, 'Zr': 40, 'Nb': 41, 'Mo': 42, 'Te':52, 'Ba':56, 'W':74, 'Au': 79, 'Bi': 83}
    essential_metal_dict = {'Na': 11 , 'K':19,'Mg': 12,  'Ca':20, 'Fe':26,  'Mn':25, 'Co':27, 'Cu':29, 'Zn':30, 'Mo': 42}
    mol = Chem.MolFromSmiles(smiles)
    if mol  is None:
        return False
    return any(atom for atom in mol.GetAtoms() if atom.GetAtomicNum() in list(metal_dict.values()))

In [6]:
def metal_ions(smiles):
    essential_metal = ['[Na+]', '[K+]', '[Mg++]',  '[Ca++]', '[Fe+3]',  '[Mn++]', '[Co+]', '[Cu++]', '[Zn++]', '[Mo++]']
    mol = Chem.MolFromSmiles(smiles)
    if mol  is None:
        return False
    return any(mol.HasSubstructMatch(Chem.MolFromSmarts(e)) for e in essential_metal)

In [7]:
def P_inorg(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol  is None:
        return False
    return mol.HasSubstructMatch(Chem.MolFromSmarts('[OH]P(=[O])([OH])[OH]'))

In [13]:
DLCs = pd.read_excel(raw_dir/'exclusions.xlsx', sheet_name = 'DLCs2')

In [14]:
DLCs

Unnamed: 0,ID,SMARTS
0,biphenyl,"c1cc(ccc1)c1ccccc1.[F,Br,Cl,#1].[F,Br,Cl,#1]"
1,dibenzofuran,"c1cccc2oc3ccccc3c12.[F,Br,Cl,#1].[F,Br,Cl,#1]"
2,dibenzodioxin,"c1ccc2Oc3ccccc3Oc2c1.[F,Br,Cl,#1].[F,Br,Cl,#1]"


In [17]:
dlc_dict = { k:v for (k,v) in zip(DLCs['ID'], DLCs['SMARTS'])}

In [18]:
dlc_dict

{'biphenyl': 'c1cc(ccc1)c1ccccc1.[F,Br,Cl,#1].[F,Br,Cl,#1]',
 'dibenzofuran': 'c1cccc2oc3ccccc3c12.[F,Br,Cl,#1].[F,Br,Cl,#1]',
 'dibenzodioxin ': 'c1ccc2Oc3ccccc3Oc2c1.[F,Br,Cl,#1].[F,Br,Cl,#1]'}

In [9]:
HPC = pd.read_excel(raw_dir/'exclusions.xlsx', sheet_name = 'HPC')

In [11]:
hpc_dict = {}
for i,group in HPC.groupby('HPC'):
    key = [name for name in group['HPC'] if name!=' '][0]
    value = set(group['SMARTS'])
    hpc_dict[key]=value

In [12]:
hpc_dict

{'Aflatoxin': {'COC1=C2C3=C(C(=O)OCC3)C(=O)OC2=C2C3C=COC3OC2=C1',
  'O=C1C=Cc2ccccc2O1',
  'O=c1ccc2ccccc2(o1)',
  '[#6]1[#6]2[#6]([#8][#6]1)[#8]c1c2c2c(c([#8][#6h3])c1)[#6]1=[#6]([#6](=[#8])[#8]2)[#6](=[#8])[#6][#6]1',
  '[#6]1[#6]2[#6]([#8][#6]1)[#8]c1c2c2c(c(c1)[O][CH3])[#6]1=[#6]([#6]([#8][#6][#6]1)=[#8])[#6](=[#8])[#8]2',
  '[#6]1[#6]2[#6]([#8][#6]=1)[#8]c1c2c2c(c(c1)[O][CH3])[#6]1=[#6]([#6]([#6][#6]1)=[#8])[#6](=[#8])[#8]2',
  '[#6]1[#6]2[#6]([#8][#6]=1)[#8]c1c2c2c(c(c1)[O][CH3])[#6]1=[#6]([#6]([#8][#6][#6]1)=[#8])[#6](=[#8])[#8]2'},
 'Azoxy': {'C=[$(N=N);!$(N=N=N);!$(N=NN)]',
  'CN=NO',
  'NN=O',
  '[$(C=[N+]=[N-]);!$(C=[N+]=[N-]=N);!$(C=[N+]=[N-]N)]',
  '[C,#1]N=[NX2][C,#1]'},
 'Benzidine': {'c2c(c1ccc(N)cc1)ccc(N)c2'},
 'Nitroso': {'[C,c]N[NX2;v3]=O',
  '[a;!$(a(a[A;!#1])(a[A;!#1]));!$(aa[CX3](=O)[OX2H1]);!$(aa[SX4](=[OX1])(=[OX1])([O]));!$(aaa[SX4](=[OX1])(=[OX1])([O]));!$(aaaa[SX4](=[OX1])(=[OX1])([O]));!$(aaaaa[SX4](=[OX1])(=[OX1])([O]));!$(aaaaaa[SX4](=[OX1])(=[OX1])([O]))

In [19]:
all_fda = pd.read_excel(raw_dir/'FDA_COC.xlsx')

In [21]:
all_fda[all_fda['TTC_Cat'] =='COC']

Unnamed: 0,COC,SMARTS,Name,Source,TTC_Cat,Unnamed: 5
77,biphenyl,"c1cc(ccc1)c1ccccc1.[F,Br,Cl,#1].[F,Br,Cl,#1]",,,COC,
78,biphenyl,"[Cl,Br,F,I]c1ccc(cc1)!@c2ccc(cc2)[Cl,Br,F,I]","Halogenated PAH (naphthalenes, biphenyls, diph...",SA31b_nogen,COC,
79,biphenyl,"c1cc(ccc1[!R]c2ccc(cc2)[Cl,Br,F,I])[Cl,Br,F,I]","Halogenated PAH (naphthalenes, biphenyls, diph...",SA31b_nogen,COC,
80,dibenzodioxin,"c1ccc2Oc3cc(ccc3(Oc2(c1)))[Cl,Br,F,I]",Halogenated dibenzodioxins (Nongenotoxic carc...,SA31c_nogen,COC,
81,dibenzodioxin,"c1ccc2Oc3ccccc3Oc2c1.[F,Br,Cl,#1].[F,Br,Cl,#1]",,,COC,
82,dibenzofuran,"c1cccc2oc3ccccc3c12.[F,Br,Cl,#1].[F,Br,Cl,#1]",,,COC,


In [24]:
fda_dict = {}
for i,group in all_fda[all_fda['TTC_Cat'] =='COC'].groupby('COC'):
    key = [name for name in group['COC'] if name!=' '][0]
    value = set(group['SMARTS'])
    fda_dict[key]=value

In [25]:
fda_dict

{'biphenyl': {'[Cl,Br,F,I]c1ccc(cc1)!@c2ccc(cc2)[Cl,Br,F,I]',
  'c1cc(ccc1)c1ccccc1.[F,Br,Cl,#1].[F,Br,Cl,#1]',
  'c1cc(ccc1[!R]c2ccc(cc2)[Cl,Br,F,I])[Cl,Br,F,I]'},
 'dibenzodioxin': {'c1ccc2Oc3cc(ccc3(Oc2(c1)))[Cl,Br,F,I]',
  'c1ccc2Oc3ccccc3Oc2c1.[F,Br,Cl,#1].[F,Br,Cl,#1]'},
 'dibenzofuran': {'c1cccc2oc3ccccc3c12.[F,Br,Cl,#1].[F,Br,Cl,#1]'}}

In [31]:
all_fda[all_fda['TTC_Cat'] =='HPC']

Unnamed: 0,COC,SMARTS,Name,Source,TTC_Cat,Unnamed: 5
39,Aflatoxin,COC1=C2C3=C(C(=O)OCC3)C(=O)OC2=C2C3C=COC3OC2=C1,,,HPC,
40,Aflatoxin,O=c1ccc2ccccc2(o1),,SA30_gen,HPC,
41,Aflatoxin,O=C1C=Cc2ccccc2O1,,SA30_gen,HPC,
42,Aflatoxin,[#6]1[#6]2[#6]([#8][#6]=1)[#8]c1c2c2c(c(c1)[O]...,,,HPC,
43,Aflatoxin,[#6]1[#6]2[#6]([#8][#6]1)[#8]c1c2c2c(c([#8][#6...,,,HPC,
44,Aflatoxin,[#6]1[#6]2[#6]([#8][#6]1)[#8]c1c2c2c(c(c1)[O][...,,,HPC,
45,Aflatoxin,[#6]1[#6]2[#6]([#8][#6]=1)[#8]c1c2c2c(c(c1)[O]...,,,HPC,
46,Azoxy,NN=O,,,HPC,
47,Azoxy,"[C,#1]N=[NX2][C,#1]",Aliphatic azo and azoxy,SA14_gen,HPC,
48,Azoxy,[$(C=[N+]=[N-]);!$(C=[N+]=[N-]=N);!$(C=[N+]=[N...,Aliphatic azo and azoxy,SA14_gen,HPC,


In [32]:
hpc_dict = {}
for i,group in all_fda[all_fda['TTC_Cat'] =='HPC'].groupby('COC'):
    key = [name for name in group['COC'] if name!=' '][0]
    value = set(group['SMARTS'])
    hpc_dict[key]=value

In [33]:
hpc_dict

{'Aflatoxin': {'COC1=C2C3=C(C(=O)OCC3)C(=O)OC2=C2C3C=COC3OC2=C1',
  'O=C1C=Cc2ccccc2O1',
  'O=c1ccc2ccccc2(o1)',
  '[#6]1[#6]2[#6]([#8][#6]1)[#8]c1c2c2c(c([#8][#6h3])c1)[#6]1=[#6]([#6](=[#8])[#8]2)[#6](=[#8])[#6][#6]1',
  '[#6]1[#6]2[#6]([#8][#6]1)[#8]c1c2c2c(c(c1)[O][CH3])[#6]1=[#6]([#6]([#8][#6][#6]1)=[#8])[#6](=[#8])[#8]2',
  '[#6]1[#6]2[#6]([#8][#6]=1)[#8]c1c2c2c(c(c1)[O][CH3])[#6]1=[#6]([#6]([#6][#6]1)=[#8])[#6](=[#8])[#8]2',
  '[#6]1[#6]2[#6]([#8][#6]=1)[#8]c1c2c2c(c(c1)[O][CH3])[#6]1=[#6]([#6]([#8][#6][#6]1)=[#8])[#6](=[#8])[#8]2'},
 'Azoxy': {'C=[$(N=N);!$(N=N=N);!$(N=NN)]',
  'CN=NO',
  'NN=O',
  '[$(C=[N+]=[N-]);!$(C=[N+]=[N-]=N);!$(C=[N+]=[N-]N)]',
  '[$([$([CX4]([*!C])([*!C])([*!C])),$([CX4]([*!C])([*!C])([*!C])[CX4]([*!C])([*!C])([*!C])),$([CX4]([*!C])([*!C])([*!C])[CX4]([*!C])([*!C])[CX4]([*!C])([*!C])([*!C])),$([CX4]([CX4]([*!C])([*!C])([*!C]))([CX4]([*!C])([*!C])([*!C]))([*!C])([*!C])),$([CX4]([*!C])([*!C])([*!C])[CX4]([*!C])([*!C])[CX4]([*!C])([*!C])[CX4]([*!C])([*!C])

In [28]:
all_fda[all_fda['Source'] =='nelms']

Unnamed: 0,COC,SMARTS,Name,Source,TTC_Cat,Unnamed: 5
92,carbamates,"[NX3]([CX4,#1,S])([CX4,#1,S])C(=[S])[S-]",,nelms,carbamates,
93,carbamates,"[NX3]([CX4&!R,#1,S])([CX4&!R,#1])C(=[O,S])[O,S...",,nelms,carbamates,
94,OPs,CP(=O)(C)O,,nelms,OPs,
95,OPs,SP(S)S,,nelms,OPs,
96,OPs,"OP(=[O])([O,S])N[C&!R,#1]",,nelms,OPs,
97,OPs,"[O,S]P(=[O,SX])([O,S]C)[O,S]",,nelms,OPs,
98,OPs,"OP(=[SX1])([S])[O,C]",,nelms,OPs,
99,OPs,[#1]OP(=[O])(O[#1])C*,,nelms,OPs,
100,OPs,[#6]OP(=[SX1])([#6])O[#6],,nelms,OPs,
101,OPs,SP(=O)(S)[S],,nelms,OPs,


In [29]:
op_dict = {}
for i,group in all_fda[all_fda['Source'] =='nelms'].groupby('COC'):
    key = [name for name in group['COC'] if name!=' '][0]
    value = set(group['SMARTS'])
    op_dict[key]=value

In [30]:
op_dict

{'OPs': {'CP(=O)(C)O',
  'OP(=[O])([O,S])N[C&!R,#1]',
  'OP(=[SX1])([S])[O,C]',
  'SP(=O)(S)[S]',
  'SP(S)S',
  '[#1]OP(=[O])(O[#1])C*',
  '[#6]OP(=[SX1])([#6])O[#6]',
  '[O,S]P(=[O,SX])([O,S]C)[O,S]'},
 'carbamates': {'[NX3]([CX4&!R,#1,S])([CX4&!R,#1])C(=[O,S])[O,S][#6,N,#1]',
  '[NX3]([CX4,#1,S])([CX4,#1,S])C(=[S])[S-]'}}