In [52]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.feature_selection import VarianceThreshold

from rdkit import Chem
from rdkit.Chem import Descriptors

In [53]:
# The following functions are from the work of Feinstein et al.
def count_cf_bonds(mol):
    """Count the carbon-fluorine bonds matched in a molecule.

    Parameters:
    mol: RDKit molecule to search.

    Returns:
    int: Number of substructure matches of the SMARTS pattern ``C~F``.
         In SMARTS, ``C`` is an aliphatic carbon and ``~`` is any bond, so
         fluorines attached to aromatic carbons are not counted.
    """
    cf_pattern = Chem.MolFromSmarts('C~F')
    return len(mol.GetSubstructMatches(cf_pattern))

# Convert negative-log LD50 values to mg/kg doses
def convert_to_mgkg(neglogld50s, smiles):
    """Convert negative-log LD50 values into mg/kg doses.

    Parameters:
    neglogld50s (iterable of float): Negative log10 LD50 values
        (assumed to be -log10 of a dose in mol/kg -- TODO confirm units).
    smiles (iterable of str): SMILES strings, paired positionally with
        ``neglogld50s``; each is parsed to obtain the molecular weight.

    Returns:
    list: One value per input pair, computed as
          ``10**(-neglogld50) * 1000 * molecular_weight``.
    """
    def _single_dose(neglog, smi):
        # Molecular weight of the parsed structure scales the molar dose
        mol_weight = Descriptors.MolWt(Chem.MolFromSmiles(smi))
        return (10**(-1*neglog)) * 1000 * mol_weight

    return [_single_dose(neglog, smi) for neglog, smi in zip(neglogld50s, smiles)]

# Convert negative-log LD50 values to EPA categories (via mg/kg doses)
def convert_to_epa(neglog_values, smiles):
    """Bin negative-log LD50 values into EPA category labels 0-3.

    The values are first converted to mg/kg with ``convert_to_mgkg`` and then
    cut into right-inclusive intervals:
    (-inf, 50] -> 0, (50, 500] -> 1, (500, 5000] -> 2, (5000, inf) -> 3.

    Parameters:
    neglog_values (iterable of float): Negative log10 LD50 values.
    smiles (iterable of str): SMILES strings paired with ``neglog_values``.

    Returns:
    pd.Categorical: One category label per input pair.
    """
    category_edges = [-np.inf, 50, 500, 5000, np.inf]
    category_labels = [0, 1, 2, 3]
    doses_mgkg = convert_to_mgkg(neglog_values, smiles)
    return pd.cut(doses_mgkg, bins=category_edges, labels=category_labels)

In [54]:
def safe_smiles(smiles_series, remove_stereochemistry = True):
    """
    Converts a series of SMILES strings into canonical SMILES after validating 
    the conversion from SMILES to molecule and back to SMILES.

    Entries that cannot be processed (unparsable SMILES, missing values, or
    any other error raised during conversion) are reported with ``print`` and
    mapped to None, so callers can remove them with ``dropna``.
    
    Parameters:
    smiles_series (pd.Series): A pandas Series containing SMILES strings.
    remove_stereochemistry (bool): If True (default), the canonical SMILES is
        written with ``isomericSmiles=False``; otherwise the RDKit default
        output is used.
    
    Returns:
    pd.Series: A pandas Series with the same index and name as the input,
               containing canonical SMILES strings, or None for invalid SMILES.
    """
    def safe_smiles_to_smiles(smiles, idx):
        try:
            # Convert SMILES to molecule
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                raise ValueError("Invalid molecule")
            # Convert molecule back to canonical SMILES, optionally
            # getting rid of stereochemistry first
            if remove_stereochemistry:
                return Chem.MolToSmiles(mol, isomericSmiles=False)
            return Chem.MolToSmiles(mol)
        except Exception as e:
            # Handle any errors and log the index and SMILES string
            print(f"Error at index {idx}: processing SMILES {smiles}, Error: {e}")
            return None

    # Walk (index, value) pairs directly. Looking the index up by value
    # (series == smiles) is quadratic, reports the wrong row for repeated
    # SMILES, and raises IndexError for NaN entries because NaN != NaN.
    canonical = [
        safe_smiles_to_smiles(smiles, idx)
        for idx, smiles in smiles_series.items()
    ]
    return pd.Series(
        canonical,
        index=smiles_series.index,
        name=smiles_series.name,
        dtype=object,
    )

In [55]:
# Load the LD50 table (SMILES + descriptors) and discard every column that
# contains at least one missing value
ldtoxdb = (
    pd.read_csv('../../data/ldtoxdb-mordred.csv')
    .dropna(axis='columns')
)

In [56]:
# SMILES CANONIZATION
ldtoxdb['smiles'] = safe_smiles(ldtoxdb.SMI, remove_stereochemistry=True)
# Drop rows whose SMILES could not be canonicalized. The explicit copy makes
# the filtered frame independent before new columns are added to it below.
ldtoxdb = ldtoxdb.dropna(subset=['smiles']).copy()

# Get other info: parsed molecule and its molecular weight.
# Use the imported `Descriptors` module directly (as convert_to_mgkg does)
# instead of reaching it through the `Chem.Descriptors` attribute.
ldtoxdb['rd_mol'] = ldtoxdb['smiles'].apply(Chem.MolFromSmiles)
ldtoxdb['mol_wt'] = ldtoxdb['rd_mol'].apply(Descriptors.MolWt)

# Find PFAS like: a compound is flagged when count_cf_bonds reports at
# least two C-F matches
ldtoxdb['n_cf_bonds'] = ldtoxdb['rd_mol'].apply(count_cf_bonds)
ldtoxdb['is_pfas_like'] = ldtoxdb['n_cf_bonds'] >= 2

In [57]:
# Every row whose canonical SMILES occurs more than once (all occurrences kept)
duplicates = ldtoxdb.loc[ldtoxdb['smiles'].duplicated(keep=False)]

# Share of rows that belong to a duplicate group; 0 for an empty frame
if len(ldtoxdb) > 0:
    duplicate_percentage = (len(duplicates) / len(ldtoxdb)) * 100
else:
    duplicate_percentage = 0
print(f"Percentage of duplicates in 'smiles': {duplicate_percentage:.2f}%")

# Keep only the first row of each duplicate group
ldtoxdb = ldtoxdb.drop_duplicates(subset=['smiles'], keep='first')

duplicates.head()

Percentage of duplicates in 'smiles': 15.53%


Unnamed: 0,SMI,INCHI,INCHIKEY,NeglogLD50,Source,nAcid,nBase,SpMax_A,SpMAD_A,nSpiro,...,JGI7,JGI8,JGI9,JGI10,JGT10,smiles,rd_mol,mol_wt,n_cf_bonds,is_pfas_like
3,CC(Oc1ccc(Cl)cc1Cl)C(=O)O,InChI=1S/C9H8Cl2O3/c1-5(9(12)13)14-8-3-2-6(10)...,MZHCENGPTKEIGP-UHFFFAOYSA-N,2.532971,CATMOS,1,0,2.301365,1.180914,0,...,0.029514,0.0,0.0,0.0,0.586752,CC(Oc1ccc(Cl)cc1Cl)C(=O)O,<rdkit.Chem.rdchem.Mol object at 0x000001E40C7...,235.066,0,False
4,COC(=O)C=Cc1ccccc1,InChI=1S/C10H10O2/c1-12-10(11)8-7-9-5-3-2-4-6-...,CCRCUPLGCSFEDV-UHFFFAOYSA-N,1.793378,CATMOS,0,0,2.172565,1.279536,0,...,0.017795,0.020408,0.0,0.0,0.359828,COC(=O)C=Cc1ccccc1,<rdkit.Chem.rdchem.Mol object at 0x000001E40C7...,162.188,0,False
8,CC(=O)OCC=C(C)CCC=C(C)C,InChI=1S/C12H20O2/c1-10(2)6-5-7-11(3)8-9-14-12...,HIGQPQRQIQDZMP-UHFFFAOYSA-N,1.491494,CATMOS,0,0,2.074313,1.102721,0,...,0.015625,0.009877,0.02,0.0,0.512729,CC(=O)OCC=C(C)CCC=C(C)C,<rdkit.Chem.rdchem.Mol object at 0x000001E40C7...,196.29,0,False
29,C(=NC(N=Cc1ccco1)c1ccco1)c1ccco1,InChI=1S/C15H12N2O3/c1-4-12(18-7-1)10-16-15(14...,CYGDSXFTXXFMNI-UHFFFAOYSA-N,2.826515,CATMOS,0,0,2.302776,1.332675,0,...,0.00659,0.002267,0.002346,0.0,0.240784,C(=NC(N=Cc1ccco1)c1ccco1)c1ccco1,<rdkit.Chem.rdchem.Mol object at 0x000001E40C7...,268.272,0,False
51,CC(C=O)=Cc1ccccc1,InChI=1S/C10H10O/c1-9(8-11)7-10-5-3-2-4-6-10/h...,VLUMOWNVWOXZAU-UHFFFAOYSA-N,1.853161,CATMOS,0,0,2.186699,1.254226,0,...,0.027778,0.0,0.0,0.0,0.381507,CC(C=O)=Cc1ccccc1,<rdkit.Chem.rdchem.Mol object at 0x000001E4181...,146.189,0,False


In [58]:
# Load the PFAS reference set and canonicalize its SMILES
# (stereochemistry removed, which is safe_smiles' default)
pfas8k = pd.read_csv('../../data/pfas8k-mordred.csv')
pfas8k['canon_smi'] = safe_smiles(pfas8k['SMILES'], remove_stereochemistry=True)
# Discard entries whose SMILES could not be canonicalized
pfas8k = pfas8k.dropna(subset=['canon_smi'])

[18:30:43] Explicit valence for atom # 1 Cl, 3, is greater than permitted


Error at index 4135: processing SMILES F[Cl](C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F)[Si](Cl)(Cl)C1=C(F)C(F)=C(F)C(F)=C1F, Error: Invalid molecule


In [59]:
# Every PFAS row whose canonical SMILES occurs more than once
duplicates_pfas8k = pfas8k[pfas8k['canon_smi'].duplicated(keep=False)]

# Calculate the percentage of duplicates (0 for an empty frame)
duplicate_percentage = (len(duplicates_pfas8k) / len(pfas8k)) * 100 if len(pfas8k) > 0 else 0
# Report the column that was actually checked: pfas8k is de-duplicated on
# 'canon_smi', it has no 'smiles' column
print(f"Percentage of duplicates in 'canon_smi': {duplicate_percentage:.2f}%")
# Drop duplicates, keeping the first row of each group
pfas8k = pfas8k.drop_duplicates(subset='canon_smi', keep='first')

duplicates_pfas8k.head()

Percentage of duplicates in 'smiles': 2.49%


Unnamed: 0.1,Unnamed: 0,index,INPUT,FOUND_BY,DTXSID,PREFERRED_NAME,SMILES,INCHI_STRING,rd_mol,nAcid,...,JGI3,JGI4,JGI5,JGI6,JGI7,JGI8,JGI9,JGI10,JGT10,canon_smi
0,0,0,DTXSID1020562,DSSTox_Substance_Id,DTXSID1020562,Enflurane,FC(F)OC(F)(F)C(F)Cl,"InChI=1/C3H2ClF5O/c4-1(5)3(8,9)10-2(6)7/h1-2H",<rdkit.Chem.rdchem.Mol object at 0x0000025DD94...,0,...,0.068182,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.744934,FC(F)OC(F)(F)C(F)Cl
5,5,5,DTXSID4022369,DSSTox_Substance_Id,DTXSID4022369,Fulvestrant,[H][C@@]12CC[C@H](O)[C@@]1(C)CC[C@]1([H])C3=C(...,InChI=1S/C32H47F5O3S/c1-30-17-15-26-25-12-11-2...,<rdkit.Chem.rdchem.Mol object at 0x0000025DD94...,0,...,0.047917,0.041347,0.027907,0.021649,0.013305,0.011596,0.008941,0.006173,0.520035,CC12CCC3c4ccc(O)cc4CC(CCCCCCCCCS(=O)CCCC(F)(F)...
21,21,21,DTXSID3031860,DSSTox_Substance_Id,DTXSID3031860,Perfluorodecanoic acid,OC(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(...,"InChI=1S/C10HF19O2/c11-2(12,1(30)31)3(13,14)4(...",<rdkit.Chem.rdchem.Mol object at 0x0000025DD94...,1,...,0.078526,0.049855,0.034259,0.02481,0.018601,0.014216,0.010833,0.006612,0.968804,O=C(O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(...
24,24,24,DTXSID8031863,DSSTox_Substance_Id,DTXSID8031863,Perfluorononanoic acid,OC(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(...,"InChI=1S/C9HF17O2/c10-2(11,1(27)28)3(12,13)4(1...",<rdkit.Chem.rdchem.Mol object at 0x0000025DD94...,1,...,0.077899,0.049333,0.033769,0.024295,0.017992,0.013374,0.008,0.0,0.956689,O=C(O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(...
25,25,25,DTXSID3031864,DSSTox_Substance_Id,DTXSID3031864,Perfluorooctanesulfonic acid,OS(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C...,"InChI=1S/C8HF17O3S/c9-1(10,3(13,14)5(17,18)7(2...",<rdkit.Chem.rdchem.Mol object at 0x0000025DD94...,1,...,0.078125,0.049524,0.033951,0.02449,0.018229,0.013717,0.01,0.0,0.973627,O=S(=O)(O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C...


In [60]:
# Flag compounds whose canonical SMILES also appears in the PFAS reference set
ldtoxdb['is_pfas'] = ldtoxdb['smiles'].isin(pfas8k['canon_smi'])
# Attach the EPA category label derived from NeglogLD50
ldtoxdb['EPA'] = convert_to_epa(
    neglog_values=ldtoxdb['NeglogLD50'],
    smiles=ldtoxdb['smiles'],
)
# Lower-case every column name. After this runs, the 'NeglogLD50' spelling no
# longer exists, so the cell cannot be re-run on the same frame.
ldtoxdb.columns = ldtoxdb.columns.map(str.lower)

In [61]:
# Persist the annotated table without the row index.
# NOTE(review): the 'rd_mol' column holds RDKit Mol objects, so it is written
# as their repr strings -- confirm downstream readers do not rely on it.
ldtoxdb.to_csv(path_or_buf='../../data/full_dataset.csv', index=False)

In [62]:
# Rows flagged as PFAS (reference-set match) or PFAS-like (C-F bond count)
pfas = ldtoxdb.loc[ldtoxdb['is_pfas'] | ldtoxdb['is_pfas_like']]

# Complement: rows where neither flag is set
non_pfas = ldtoxdb.loc[~ldtoxdb['is_pfas'] & ~ldtoxdb['is_pfas_like']]

In [63]:
# Write the two partitions to separate CSV files, omitting the row index
non_pfas.to_csv(path_or_buf='../../data/nonpfas_dataset.csv', index=False)
pfas.to_csv(path_or_buf='../../data/pfas_dataset.csv', index=False)

In [64]:
# Number of compounds remaining after cleaning and de-duplication
ldtoxdb.shape[0]

12238