In [160]:
import numpy as np
import os
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Fragments, rdMolDescriptors, rdchem, PandasTools, EState
import matplotlib.pyplot as plt

In [161]:
from rdkit.Chem import AllChem
from rdkit.Chem import Draw, BondType, AllChem as Chem, rdFingerprintGenerator
from rdkit.Chem import rdmolops
from collections import defaultdict
from multiprocessing import Pool, cpu_count

### potential PBT chemicals identified by Strempel et al

In [162]:
Stremper_et_al = pd.read_csv('Strempel_etal_PBTcompounds.csv')
Stremper_et_al

Unnamed: 0,CAS,R,SMILES,PBT_label
0,50-29-3,pr,Clc1ccc(cc1)C(c1ccc(Cl)cc1)C(Cl)(Cl)Cl,1
1,50-41-9,pr,CCN(CC)CCOc1ccc(cc1)C(=C(Cl)c1ccccc1)c1ccccc1,1
2,50-52-2,pr,CSc1ccc2Sc3ccccc3N(CCC3CCCCN3C)c2c1,1
3,53-19-0,pr,Clc1ccc(cc1)C(C(Cl)Cl)c1ccccc1Cl,1
4,53-69-0,pr,Cc1cc(C)c2nc3ccc4ccccc4c3cc2c1,1
...,...,...,...,...
2780,187348-02-3,n,ClCC12C(Cl)C(Cl)C(CC1(Cl)Cl)C2(CCl)C(Cl)Cl,1
2781,189084-62-6,n,Brc1cccc(Br)c1Oc1ccc(Br)c(Br)c1,1
2782,190383-43-8,n,CCC(Cc1ccccc1)c1cc(O)c(c(=O)o1)C(C1CC1)c1cccc(...,1
2783,226256-56-0,n,C[C@H](NCCCc1cccc(c1)C(F)(F)F)c1cccc2ccccc12,1


### expert-verified PBT chemicals from the European Chemicals Agency (ECHA) PBT/vPvB assessments

In [163]:
ECHA_expert_verified = pd.read_csv('expert-verified_PBT_chemicals_ECHA.csv')
ECHA_expert_verified

Unnamed: 0,SMILES,Source,PBT_label
0,c1ccc(-c2cccc(-c3ccccc3)c2)cc1,https://echa.europa.eu/pbt,1
1,O=S(=O)(c1ccc(Cl)cc1)c1ccc(Cl)cc1,https://echa.europa.eu/pbt,1
2,COc1ccc(C(c2ccc(OC)cc2)C(Cl)(Cl)Cl)cc1,https://echa.europa.eu/pbt,1
3,c1cc2ccc3cccc4ccc(c1)c2c34,https://echa.europa.eu/pbt,1
4,c1ccc2sc(SN(C3CCCCC3)C3CCCCC3)nc2c1,https://echa.europa.eu/pbt,1
...,...,...,...
76,N.O=S(=O)(O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F...,https://echa.europa.eu/pbt,1
77,O=C1OC(=O)c2c(Br)c(Br)c(Br)c(Br)c21,https://echa.europa.eu/pbt,0
78,O=C([O-])C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(...,https://echa.europa.eu/pbt,1
79,c1ccc2c(c1)-c1cccc3cccc-2c13,https://echa.europa.eu/pbt,1


### ECHA PBT assessment list

In [164]:
ECHA_PBT_assessment_list = pd.read_csv('ECHA_PBT_assessment_list.csv')
ECHA_PBT_assessment_list

Unnamed: 0,SMILES,Source,PBT_label
0,Cc1ccc2ccccc2c1,https://echa.europa.eu/information-on-chemical...,0
1,O=[N+]([O-])c1ccc(Oc2ccc(Cl)cc2Cl)cc1,https://echa.europa.eu/information-on-chemical...,1
2,CCCC[Sn](CCCC)(CCCC)O[Sn](CCCC)(CCCC)CCCC,https://echa.europa.eu/information-on-chemical...,1
3,C[Pb](C)(C)C,https://echa.europa.eu/information-on-chemical...,1
4,Clc1ccc(C(Cl)(Cl)Cl)cc1,https://echa.europa.eu/information-on-chemical...,0
...,...,...,...
59,CCCCCCCCCCCCCCCCCCOC(=O)CCc1cc(C(C)(C)C)c(O)c(...,https://echa.europa.eu/information-on-chemical...,0
60,CC(C)(C)c1cc(CCC(=O)OCC(COC(=O)CCc2cc(C(C)(C)C...,https://echa.europa.eu/information-on-chemical...,0
61,CCCCCCCCCCCCCCCS(=O)(=O)Oc1ccccc1,https://echa.europa.eu/information-on-chemical...,0
62,CC[Pb](CC)(CC)CC,https://echa.europa.eu/information-on-chemical...,0


### the ECHA list of substances subject to POP Regulation

In [165]:
ECHA_POP_regulation = pd.read_csv('ECHA_substances_POP_Regulation.csv')
ECHA_POP_regulation

Unnamed: 0,SMILES,Source,PBT_label
0,Br[C@H]1CC[C@@H](Br)[C@H](Br)CC[C@@H](Br)[C@@H...,https://www.echa.europa.eu/list-of-substances-...,1
1,Br[C@H]1CC[C@H](Br)[C@H](Br)CC[C@@H](Br)[C@H](...,https://www.echa.europa.eu/list-of-substances-...,1
2,Brc1cc(Br)c(Oc2ccc(Br)c(Br)c2Br)cc1Br,https://www.echa.europa.eu/list-of-substances-...,1
3,Brc1cc(Br)c(Oc2ccc(Br)c(Br)c2)c(Br)c1,https://www.echa.europa.eu/list-of-substances-...,1
4,Brc1ccc(Oc2ccc(Br)c(Br)c2Br)c(Br)c1,https://www.echa.europa.eu/list-of-substances-...,1
...,...,...,...
213,O=C([O-])C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(C(F)...,https://www.echa.europa.eu/list-of-substances-...,1
214,c1ccc2cc3c(cc2c1)-c1cccc2cccc-3c12,https://www.echa.europa.eu/list-of-substances-...,1
215,c1ccc2c(c1)-c1cccc3c1c-2cc1ccccc13,https://www.echa.europa.eu/list-of-substances-...,1
216,O=C(O)CC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)...,https://www.echa.europa.eu/list-of-substances-...,1


### the new POP list under the Stockholm Convention

In [166]:
POP_list_stockolm = pd.read_csv('new POP list under the Stockholm Convention.csv')
POP_list_stockolm

Unnamed: 0,SMILES,Source,PBT_label
0,CCCC(Cl)CCCC(Cl)CCC(Cl)CCC(Cl)CCC(Cl)CCCC(Cl)CCC,http://www.pops.int/TheConvention/ThePOPs/TheN...,1
1,CCCC(Cl)CCC(Cl)CC(Cl)C(Cl)CCC(Cl)CCC(Cl)C(Cl)C...,http://www.pops.int/TheConvention/ThePOPs/TheN...,1
2,COc1c(Cl)c(Cl)c(Cl)c(Cl)c1Cl,http://www.pops.int/TheConvention/ThePOPs/TheN...,1
3,CC(Cl)CCCC(Cl)CCCC(Cl)CCCC(Cl)CCCC(Cl)CCCC(Cl)...,http://www.pops.int/TheConvention/ThePOPs/TheN...,1
4,CCCCCCCCCCCC(=O)Oc1c(Cl)c(Cl)c(Cl)c(Cl)c1Cl,http://www.pops.int/TheConvention/ThePOPs/TheN...,1
5,O.[Na+].[O-]c1c(Cl)c(Cl)c(Cl)c(Cl)c1Cl,http://www.pops.int/TheConvention/ThePOPs/TheN...,1
6,[Na+].[O-]c1c(Cl)c(Cl)c(Cl)c(Cl)c1Cl,http://www.pops.int/TheConvention/ThePOPs/TheN...,1
7,CCC(Cl)C(Cl)C(Cl)CC(Cl)C(Cl)C(C)Cl,http://www.pops.int/TheConvention/ThePOPs/TheN...,1
8,Brc1cc(Br)c(Oc2cc(Br)c(Br)cc2Br)c(Br)c1,http://www.pops.int/TheConvention/ThePOPs/TheN...,1
9,Brc1cc(Br)c(Oc2c(Br)cc(Br)cc2Br)c(Br)c1,http://www.pops.int/TheConvention/ThePOPs/TheN...,1


### ECHA-registered substances

In [167]:
ECHA_reg_substances = pd.read_csv('ECHA-registered_substances.csv')
ECHA_reg_substances

Unnamed: 0,SMILES,Source,PBT_label
0,CC1(C)[C@@H]2CC[C@@]1(C)C(=O)C2,https://echa.europa.eu/information-on-chemical...,0
1,Cl.NC(N)=NCCC[C@H](N)C(=O)O,https://echa.europa.eu/information-on-chemical...,0
2,O=C(O)[C@H](O)[C@@H](O)C(=O)O,https://echa.europa.eu/information-on-chemical...,0
3,O=C(O)[C@@H](O)[C@H](O)C(=O)O,https://echa.europa.eu/information-on-chemical...,0
4,CC1=CCC(/C=C/C(C)(C)C(C)O)C1(C)C,https://echa.europa.eu/information-on-chemical...,0
...,...,...,...
2882,CC1(C)[C@@H]2CC[C@@]1(C)[C@@H](O)C2,https://echa.europa.eu/information-on-chemical...,0
2883,CC1(C)[C@@H]2CC[C@]1(C)[C@H](O)C2,https://echa.europa.eu/information-on-chemical...,0
2884,CC1(C)C2CC[C@]1(C)C(O)C2,https://echa.europa.eu/information-on-chemical...,0
2885,CC1(C)[C@@H]2CC[C@@]1(C)[C@H](O)C2,https://echa.europa.eu/information-on-chemical...,0


In [168]:
data_concat = pd.concat([Stremper_et_al, ECHA_expert_verified, ECHA_PBT_assessment_list, ECHA_POP_regulation, POP_list_stockolm, ECHA_reg_substances], ignore_index=True)
data_concat

Unnamed: 0,CAS,R,SMILES,PBT_label,Source
0,50-29-3,pr,Clc1ccc(cc1)C(c1ccc(Cl)cc1)C(Cl)(Cl)Cl,1,
1,50-41-9,pr,CCN(CC)CCOc1ccc(cc1)C(=C(Cl)c1ccccc1)c1ccccc1,1,
2,50-52-2,pr,CSc1ccc2Sc3ccccc3N(CCC3CCCCN3C)c2c1,1,
3,53-19-0,pr,Clc1ccc(cc1)C(C(Cl)Cl)c1ccccc1Cl,1,
4,53-69-0,pr,Cc1cc(C)c2nc3ccc4ccccc4c3cc2c1,1,
...,...,...,...,...,...
6067,,,CC1(C)[C@@H]2CC[C@@]1(C)[C@@H](O)C2,0,https://echa.europa.eu/information-on-chemical...
6068,,,CC1(C)[C@@H]2CC[C@]1(C)[C@H](O)C2,0,https://echa.europa.eu/information-on-chemical...
6069,,,CC1(C)C2CC[C@]1(C)C(O)C2,0,https://echa.europa.eu/information-on-chemical...
6070,,,CC1(C)[C@@H]2CC[C@@]1(C)[C@H](O)C2,0,https://echa.europa.eu/information-on-chemical...


In [169]:
data_PBT_concat= data_concat.drop(['Source', 'CAS', 'R'], axis= 1)
data_PBT_concat

Unnamed: 0,SMILES,PBT_label
0,Clc1ccc(cc1)C(c1ccc(Cl)cc1)C(Cl)(Cl)Cl,1
1,CCN(CC)CCOc1ccc(cc1)C(=C(Cl)c1ccccc1)c1ccccc1,1
2,CSc1ccc2Sc3ccccc3N(CCC3CCCCN3C)c2c1,1
3,Clc1ccc(cc1)C(C(Cl)Cl)c1ccccc1Cl,1
4,Cc1cc(C)c2nc3ccc4ccccc4c3cc2c1,1
...,...,...
6067,CC1(C)[C@@H]2CC[C@@]1(C)[C@@H](O)C2,0
6068,CC1(C)[C@@H]2CC[C@]1(C)[C@H](O)C2,0
6069,CC1(C)C2CC[C@]1(C)C(O)C2,0
6070,CC1(C)[C@@H]2CC[C@@]1(C)[C@H](O)C2,0


### Data Cleaning and pre-processing

In [170]:
def neutralize_atoms(mol):
    if not mol:
        return None
    pattern = Chem.MolFromSmarts("[+1!h0!$([*]~[-1,-2,-3,-4]),-1!$([*]~[+1,+2,+3,+4])]")
    at_matches = mol.GetSubstructMatches(pattern)
    at_matches_list = [y[0] for y in at_matches]
    try:
        if len(at_matches_list) > 0:
            for at_idx in at_matches_list:
                atom = mol.GetAtomWithIdx(at_idx)
                chg = atom.GetFormalCharge()
                hcount = atom.GetTotalNumHs()
                atom.SetFormalCharge(0)
                atom.SetNumExplicitHs(hcount - chg)
                atom.UpdatePropertyCache()
    except:
        return mol
    return mol


def get_largest_frag(mol):
    if not mol:
        return None
    mol_frags = rdmolops.GetMolFrags(mol, asMols = True)
    largest_mol = max(mol_frags, default=mol, key=lambda m: m.GetNumAtoms())
    Chem.RemoveStereochemistry(largest_mol)
    return largest_mol

In [171]:
data_PBT_concat['mol'] = data_PBT_concat.apply(lambda x: Chem.MolFromSmiles(x['SMILES']), axis=1)

[16:52:28] Can't kekulize mol.  Unkekulized atoms: 9 10 11
[16:52:28] Explicit valence for atom # 7 N, 4, is greater than permitted
[16:52:28] Explicit valence for atom # 1 C, 5, is greater than permitted
[16:52:28] Explicit valence for atom # 8 N, 4, is greater than permitted
[16:52:28] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6 7 9
[16:52:28] Explicit valence for atom # 9 N, 4, is greater than permitted
[16:52:28] Explicit valence for atom # 7 N, 4, is greater than permitted
[16:52:28] Explicit valence for atom # 10 N, 4, is greater than permitted
[16:52:28] Explicit valence for atom # 8 O, 3, is greater than permitted
[16:52:28] Explicit valence for atom # 6 N, 4, is greater than permitted
[16:52:28] Explicit valence for atom # 22 O, 3, is greater than permitted
[16:52:28] Can't kekulize mol.  Unkekulized atoms: 17 19 20 21 22 23 24
[16:52:28] Explicit valence for atom # 11 O, 3, is greater than permitted
[16:52:28] Can't kekulize mol.  Unkekulized atoms: 4 5 6 7 8 10 11 12 1

In [172]:
# 1. Get largest fragment
data_PBT_concat['mol_no-mc'] = data_PBT_concat.apply(lambda x: get_largest_frag(x['mol']), axis=1)
# 2. Neutralise molecules
data_PBT_concat['mol_no-mc-neutral'] = data_PBT_concat.apply(lambda x: neutralize_atoms(x['mol_no-mc']), axis=1)
# 3. canonicalise everything  
data_PBT_concat['comparator_smiles'] =data_PBT_concat.apply(lambda x: Chem.MolToSmiles(x['mol_no-mc-neutral']) if x['mol_no-mc-neutral'] is not None else None,  axis=1)

[16:52:29] Explicit valence for atom # 4 B, 5, is greater than permitted
[16:52:29] Explicit valence for atom # 3 B, 5, is greater than permitted
[16:52:29] Explicit valence for atom # 1 B, 5, is greater than permitted


In [173]:
df = data_PBT_concat[data_PBT_concat.duplicated(keep=False, subset=['comparator_smiles'])]
# df.drop(labels=['original_mol'], inplace=True, axis=1)
pair_list = df.groupby(by=['comparator_smiles']).apply(lambda x: tuple(x.index))
pair_list = pair_list.tolist()
print(pair_list)

[(4777, 5938), (549, 2930, 2931, 3165), (2683, 2684), (1364, 1758), (3157, 3167), (3161, 3171), (3156, 3166), (3158, 3168), (2933, 3130), (3162, 3172), (2932, 3129), (2877, 2889), (806, 2695), (3159, 3169), (3164, 3174), (3061, 3132), (2781, 3062, 3133), (3064, 3135), (3063, 3134), (2934, 3131), (3163, 3173), (335, 368), (3160, 3170), (3583, 5898), (145, 880), (2798, 2843), (2879, 2913), (3281, 6006), (4990, 6014), (3544, 5887), (5412, 5413), (627, 2967, 3098), (3615, 5903), (3031, 3108), (3032, 3109), (3008, 3106), (3009, 3107), (3048, 3101), (449, 4067, 5876), (3939, 4023, 6050), (205, 766), (3662, 5997), (3326, 6016), (4022, 6059), (4020, 4521), (4520, 5700), (3050, 3117), (3049, 3116), (3033, 3120), (3026, 3119), (3007, 3118), (1180, 2482, 2966, 3110), (907, 4066, 5867), (2814, 2842), (252, 363), (1252, 3038, 3097), (4064, 5866), (394, 613), (215, 604), (4250, 6008), (3373, 6053), (4190, 4191), (3826, 3966), (3925, 3926), (4197, 4198), (4746, 4747, 4748, 4927, 5333), (4749, 4901), 

In [174]:
len(pair_list)

593

In [175]:
pairs_dicts = []

for pair in pair_list:
    # Iterate over each element in the tuple
    for i in range(len(pair)):
        for j in range(i + 1, len(pair)):
            # Create pairs from the tuple
            i1, i2 = pair[i], pair[j]
            d = {
                'smi1_original': df.loc[i1]['SMILES'],
                'smi2_original': df.loc[i2]['SMILES'],
                'smi1_standardized': Chem.MolToSmiles(Chem.MolFromSmiles(df.loc[i1]['comparator_smiles'])),
                'smi2_standardized': Chem.MolToSmiles(Chem.MolFromSmiles(df.loc[i2]['comparator_smiles'])),
                'label_1': df.loc[i1]['PBT_label'],
                'label_2': df.loc[i2]['PBT_label'],
            }
            pairs_dicts.append(d)

# Create a DataFrame from the list of dictionaries and save it as a CSV file
pd.DataFrame(pairs_dicts).to_csv('pair_comparisons_newdataPBT.csv', index=False)

In [176]:
pairs = pd.read_csv('pair_comparisons_newdataPBT.csv')
pairs

Unnamed: 0,smi1_original,smi2_original,smi1_standardized,smi2_standardized,label_1,label_2
0,BrC(Br)Br,BrC(Br)Br,BrC(Br)Br,BrC(Br)Br,0,0
1,BrC1CCC(Br)C(Br)CCC(Br)C(Br)CCC1Br,Br[C@H]1CC[C@@H](Br)[C@H](Br)CC[C@@H](Br)[C@@H...,BrC1CCC(Br)C(Br)CCC(Br)C(Br)CCC1Br,BrC1CCC(Br)C(Br)CCC(Br)C(Br)CCC1Br,1,1
2,BrC1CCC(Br)C(Br)CCC(Br)C(Br)CCC1Br,Br[C@H]1CC[C@H](Br)[C@H](Br)CC[C@@H](Br)[C@H](...,BrC1CCC(Br)C(Br)CCC(Br)C(Br)CCC1Br,BrC1CCC(Br)C(Br)CCC(Br)C(Br)CCC1Br,1,1
3,BrC1CCC(Br)C(Br)CCC(Br)C(Br)CCC1Br,Br[C@H]1CC[C@@H](Br)[C@H](Br)CC[C@H](Br)[C@H](...,BrC1CCC(Br)C(Br)CCC(Br)C(Br)CCC1Br,BrC1CCC(Br)C(Br)CCC(Br)C(Br)CCC1Br,1,1
4,Br[C@H]1CC[C@@H](Br)[C@H](Br)CC[C@@H](Br)[C@@H...,Br[C@H]1CC[C@H](Br)[C@H](Br)CC[C@@H](Br)[C@H](...,BrC1CCC(Br)C(Br)CCC(Br)C(Br)CCC1Br,BrC1CCC(Br)C(Br)CCC(Br)C(Br)CCC1Br,1,1
...,...,...,...,...,...,...
2008,c1ccc2cc3c(cc2c1)c1cccc2cccc3c12,c1ccc2cc3c(cc2c1)-c1cccc2cccc-3c12,c1ccc2cc3c(cc2c1)-c1cccc2cccc-3c12,c1ccc2cc3c(cc2c1)-c1cccc2cccc-3c12,1,1
2009,c1ccc2cc3c(cc2c1)-c1cccc2cccc-3c12,c1ccc2cc3c(cc2c1)-c1cccc2cccc-3c12,c1ccc2cc3c(cc2c1)-c1cccc2cccc-3c12,c1ccc2cc3c(cc2c1)-c1cccc2cccc-3c12,1,1
2010,c1ccc2cc3c(cc2c1)c1ccccc1c1ccccc31,c1ccc2cc3c(cc2c1)c1ccccc1c1ccccc31,c1ccc2cc3c4ccccc4c4ccccc4c3cc2c1,c1ccc2cc3c4ccccc4c4ccccc4c3cc2c1,1,1
2011,c1ccc2sc(SN(C3CCCCC3)C3CCCCC3)nc2c1,c1ccc2sc(SN(C3CCCCC3)C3CCCCC3)nc2c1,c1ccc2sc(SN(C3CCCCC3)C3CCCCC3)nc2c1,c1ccc2sc(SN(C3CCCCC3)C3CCCCC3)nc2c1,1,1


In [177]:
unique_smis = list(set(pairs['smi1_standardized']))
keep = {}
for smi in unique_smis:
    rows = pairs[pairs['smi1_standardized']==smi]
    labels = list(rows['label_1']) + list(rows['label_2'])
    if len(set(labels)) <= 1:
        keep[smi] = labels[0]

In [178]:
len(unique_smis)

593

In [179]:
# Assuming standardized_smiles_data is your standardized SMILES data
data_PBT_concat['standardized_smiles'] = data_PBT_concat['comparator_smiles']

In [180]:
data_PBT_concat = data_PBT_concat[~data_PBT_concat['standardized_smiles'].isin(unique_smis)]
data_PBT_concat

Unnamed: 0,SMILES,PBT_label,mol,mol_no-mc,mol_no-mc-neutral,comparator_smiles,standardized_smiles
4,Cc1cc(C)c2nc3ccc4ccccc4c3cc2c1,1,<rdkit.Chem.rdchem.Mol object at 0x7f6d62434e10>,<rdkit.Chem.rdchem.Mol object at 0x7f6d606a7210>,<rdkit.Chem.rdchem.Mol object at 0x7f6d606a7210>,Cc1cc(C)c2nc3ccc4ccccc4c3cc2c1,Cc1cc(C)c2nc3ccc4ccccc4c3cc2c1
5,c1ccc2ccc3cc4c(ccc5ccccc45)cc3c2c1,1,<rdkit.Chem.rdchem.Mol object at 0x7f6d62434e70>,<rdkit.Chem.rdchem.Mol object at 0x7f6d606a7270>,<rdkit.Chem.rdchem.Mol object at 0x7f6d606a7270>,c1ccc2c(c1)ccc1cc3c(ccc4ccccc43)cc12,c1ccc2c(c1)ccc1cc3c(ccc4ccccc43)cc12
6,Cc1ccc2cc3c4ccccc4ccc3c3CCc1c23,1,<rdkit.Chem.rdchem.Mol object at 0x7f6d62434750>,<rdkit.Chem.rdchem.Mol object at 0x7f6d606a71b0>,<rdkit.Chem.rdchem.Mol object at 0x7f6d606a71b0>,Cc1ccc2cc3c(ccc4ccccc43)c3c2c1CC3,Cc1ccc2cc3c(ccc4ccccc43)c3c2c1CC3
9,O=C(CCC1CCCC1)OC1CCC2C3CCC4=CC(=O)CCC4(C)C3CCC12C,1,<rdkit.Chem.rdchem.Mol object at 0x7f6d62434390>,<rdkit.Chem.rdchem.Mol object at 0x7f6d606a7690>,<rdkit.Chem.rdchem.Mol object at 0x7f6d606a7690>,CC12CCC(=O)C=C1CCC1C2CCC2(C)C(OC(=O)CCC3CCCC3)...,CC12CCC(=O)C=C1CCC1C2CCC2(C)C(OC(=O)CCC3CCCC3)...
10,CCN(CC)CCN1c2ccccc2Sc2ccc(Cl)cc12,1,<rdkit.Chem.rdchem.Mol object at 0x7f6d62434990>,<rdkit.Chem.rdchem.Mol object at 0x7f6d606a7750>,<rdkit.Chem.rdchem.Mol object at 0x7f6d606a7750>,CCN(CC)CCN1c2ccccc2Sc2ccc(Cl)cc21,CCN(CC)CCN1c2ccccc2Sc2ccc(Cl)cc21
...,...,...,...,...,...,...,...
5860,CN(C)CCCN(C)C,0,<rdkit.Chem.rdchem.Mol object at 0x7f6d606a1ed0>,<rdkit.Chem.rdchem.Mol object at 0x7f6d6070da50>,<rdkit.Chem.rdchem.Mol object at 0x7f6d6070da50>,CN(C)CCCN(C)C,CN(C)CCCN(C)C
5861,C[N+](C)(C)C1CCCCC1.[OH-],0,<rdkit.Chem.rdchem.Mol object at 0x7f6d606a1f30>,<rdkit.Chem.rdchem.Mol object at 0x7f6d6070dab0>,<rdkit.Chem.rdchem.Mol object at 0x7f6d6070dab0>,C[N+](C)(C)C1CCCCC1,C[N+](C)(C)C1CCCCC1
5862,c1ccc(N(CC2CO2)CC2CO2)cc1,0,<rdkit.Chem.rdchem.Mol object at 0x7f6d606a1f90>,<rdkit.Chem.rdchem.Mol object at 0x7f6d6070db10>,<rdkit.Chem.rdchem.Mol object at 0x7f6d6070db10>,c1ccc(N(CC2CO2)CC2CO2)cc1,c1ccc(N(CC2CO2)CC2CO2)cc1
5863,CCCCCCCCCCCC(=O)N(CCO)CCO,0,<rdkit.Chem.rdchem.Mol object at 0x7f6d606a3030>,<rdkit.Chem.rdchem.Mol object at 0x7f6d6070db70>,<rdkit.Chem.rdchem.Mol object at 0x7f6d6070db70>,CCCCCCCCCCCC(=O)N(CCO)CCO,CCCCCCCCCCCC(=O)N(CCO)CCO


In [181]:
#add the things from keep dict to it

In [182]:
starting_PBTdat = pd.concat([pd.DataFrame([{'standardized_smiles':k, 'PBT_label':v} for k,v in keep.items()]), data_PBT_concat])
starting_PBTdat

Unnamed: 0,standardized_smiles,PBT_label,SMILES,mol,mol_no-mc,mol_no-mc-neutral,comparator_smiles
0,CCCCCCCCCCCCO,0,,,,,
1,C=CC(=O)OCC(O)CC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F...,1,,,,,
2,Cc1cccc(Cc2ccccc2)c1Cc1ccccc1,0,,,,,
3,COC(F)(C(F)(F)C(F)(F)F)C(F)(C(F)(F)F)C(F)(F)F,0,,,,,
4,Nc1cccc(Cl)c1,0,,,,,
...,...,...,...,...,...,...,...
5860,CN(C)CCCN(C)C,0,CN(C)CCCN(C)C,<rdkit.Chem.rdchem.Mol object at 0x7f6d606a1ed0>,<rdkit.Chem.rdchem.Mol object at 0x7f6d6070da50>,<rdkit.Chem.rdchem.Mol object at 0x7f6d6070da50>,CN(C)CCCN(C)C
5861,C[N+](C)(C)C1CCCCC1,0,C[N+](C)(C)C1CCCCC1.[OH-],<rdkit.Chem.rdchem.Mol object at 0x7f6d606a1f30>,<rdkit.Chem.rdchem.Mol object at 0x7f6d6070dab0>,<rdkit.Chem.rdchem.Mol object at 0x7f6d6070dab0>,C[N+](C)(C)C1CCCCC1
5862,c1ccc(N(CC2CO2)CC2CO2)cc1,0,c1ccc(N(CC2CO2)CC2CO2)cc1,<rdkit.Chem.rdchem.Mol object at 0x7f6d606a1f90>,<rdkit.Chem.rdchem.Mol object at 0x7f6d6070db10>,<rdkit.Chem.rdchem.Mol object at 0x7f6d6070db10>,c1ccc(N(CC2CO2)CC2CO2)cc1
5863,CCCCCCCCCCCC(=O)N(CCO)CCO,0,CCCCCCCCCCCC(=O)N(CCO)CCO,<rdkit.Chem.rdchem.Mol object at 0x7f6d606a3030>,<rdkit.Chem.rdchem.Mol object at 0x7f6d6070db70>,<rdkit.Chem.rdchem.Mol object at 0x7f6d6070db70>,CCCCCCCCCCCC(=O)N(CCO)CCO


In [183]:
starting_PBT = starting_PBTdat.iloc[:, :2]
starting_PBT

Unnamed: 0,standardized_smiles,PBT_label
0,CCCCCCCCCCCCO,0
1,C=CC(=O)OCC(O)CC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F...,1
2,Cc1cccc(Cc2ccccc2)c1Cc1ccccc1,0
3,COC(F)(C(F)(F)C(F)(F)F)C(F)(C(F)(F)F)C(F)(F)F,0
4,Nc1cccc(Cl)c1,0
...,...,...
5860,CN(C)CCCN(C)C,0
5861,C[N+](C)(C)C1CCCCC1,0
5862,c1ccc(N(CC2CO2)CC2CO2)cc1,0
5863,CCCCCCCCCCCC(=O)N(CCO)CCO,0


In [184]:
starting_PBT = starting_PBT.drop_duplicates()
starting_PBT

Unnamed: 0,standardized_smiles,PBT_label
0,CCCCCCCCCCCCO,0
1,C=CC(=O)OCC(O)CC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F...,1
2,Cc1cccc(Cc2ccccc2)c1Cc1ccccc1,0
3,COC(F)(C(F)(F)C(F)(F)F)C(F)(C(F)(F)F)C(F)(F)F,0
4,Nc1cccc(Cl)c1,0
...,...,...
5860,CN(C)CCCN(C)C,0
5861,C[N+](C)(C)C1CCCCC1,0
5862,c1ccc(N(CC2CO2)CC2CO2)cc1,0
5863,CCCCCCCCCCCC(=O)N(CCO)CCO,0


### Check Validity with RDkit

In [185]:
# Define the to_mol function to convert SMILES to molecule
def to_mol(smi):
    try:
        return Chem.MolFromSmiles(smi)
    except:
        return None

# Apply the function and filter out rows with invalid SMILES
valid_df_PBT = starting_PBT[starting_PBT["standardized_smiles"].apply(lambda smi: to_mol(smi) is not None)]

# Reset the index of the resulting DataFrame
valid_df_PBT.reset_index(drop=True, inplace=True)

# Display the cleaned DataFrame
valid_df_PBT

[16:53:18] Can't kekulize mol.  Unkekulized atoms: 3 4 5 6 7
[16:53:18] Explicit valence for atom # 4 B, 5, is greater than permitted
[16:53:18] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4
[16:53:18] Explicit valence for atom # 3 B, 5, is greater than permitted
[16:53:18] Explicit valence for atom # 1 B, 5, is greater than permitted


Unnamed: 0,standardized_smiles,PBT_label
0,CCCCCCCCCCCCO,0
1,C=CC(=O)OCC(O)CC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F...,1
2,Cc1cccc(Cc2ccccc2)c1Cc1ccccc1,0
3,COC(F)(C(F)(F)C(F)(F)F)C(F)(C(F)(F)F)C(F)(F)F,0
4,Nc1cccc(Cl)c1,0
...,...,...
5125,CN(C)CCCN(C)C,0
5126,C[N+](C)(C)C1CCCCC1,0
5127,c1ccc(N(CC2CO2)CC2CO2)cc1,0
5128,CCCCCCCCCCCC(=O)N(CCO)CCO,0


In [186]:
### final_correct_startingPBTdata.csv