# Cleaning PubChemLite (Feb 2025)

In [23]:
import pandas as pd
import numpy as np

import rdkit
from rdkit import Chem, rdBase, RDLogger, DataStructs
RDLogger.DisableLog('rdApp.*')
from rdkit.Chem import AllChem, Draw, inchi, rdDepictor, PandasTools, SaltRemover, Descriptors
from rdkit.Chem.Draw import IPythonConsole, rdMolDraw2D
from rdkit.Chem.MolStandardize import rdMolStandardize

import pickle

In [24]:
pubchemlite_orig = pd.read_csv('PubChemLite_CCSbase_20250228.csv')

pubchemlite = pubchemlite_orig[['SMILES', 'InChIKey', 'CompoundName', 'Identifier', 'XLogP', 'MonoisotopicMass']]
pubchemlite.rename(columns={'Identifier': 'pubchem_cid',
                                   'CompoundName':'name',
                                   'MonoisotopicMass':'monoisotopic_mass_pubchem',
                                   'InChIKey':'InChIKey_pubchem',
                                   'SMILES':'SMILES_pubchem',
                                   'XLogP':'XLogP_pubchem'}, inplace=True)
pubchemlite

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pubchemlite.rename(columns={'Identifier': 'pubchem_cid',


Unnamed: 0,SMILES_pubchem,InChIKey_pubchem,name,pubchem_cid,XLogP_pubchem,monoisotopic_mass_pubchem
0,C1=CC(C(C(=C1)C(=O)O)O)O,INCSWYKICIYAHB-UHFFFAOYSA-N,"5,6-dihydroxycyclohexa-1,3-diene-1-carboxylic ...",3,-0.3,156.042259
1,CC(CN)O,HXKKHQJGJAFBHI-UHFFFAOYSA-N,1-aminopropan-2-ol,4,-1.0,75.068414
2,C(C(=O)COP(=O)(O)O)N,HIQNVODXENYOFK-UHFFFAOYSA-N,(3-amino-2-oxopropyl) dihydrogen phosphate,5,-5.0,169.014009
3,C1=CC(=C(C=C1[N+](=O)[O-])[N+](=O)[O-])Cl,VYZAHLCBVHPDDF-UHFFFAOYSA-N,"1-chloro-2,4-dinitrobenzene",6,2.3,201.978134
4,CCC(C)(C(C(=O)O)O)O,PDGXJDXVGMHUIR-UHFFFAOYSA-N,"2,3-dihydroxy-3-methylpentanoic acid",8,-0.4,148.073559
...,...,...,...,...,...,...
408805,C1=CC(=C(C=C1NC(=O)NC2=C(C=C(C=C2)Cl)C3=C(C=CC...,CAWJHCNZWVFAPL-UHFFFAOYSA-N,"1-[4-chloro-2-[6-chloro-3-(2,4-dichlorophenoxy...",172643734,9.1,599.913558
408806,B1(OC(C(O1)(C)C)(C)C)C2=CC(=CN=C2)OC3COC3,VLOPGBXDLHSTBE-UHFFFAOYSA-N,"3-(oxetan-3-yloxy)-5-(4,4,5,5-tetramethyl-1,3,...",172644635,,277.148538
408807,B1(OC(C(O1)(C)C)(C)C)C2=C(C=C(C=C2)C3COC3)C#N,YMEYBJPAQUDGPW-UHFFFAOYSA-N,"5-(oxetan-3-yl)-2-(4,4,5,5-tetramethyl-1,3,2-d...",172645001,,285.153624
408808,B1(OC(C(O1)(C)C)(C)C)C2=C(C=CC(=C2)C3COC3)C#N,IYBNKVGFHAWEDF-UHFFFAOYSA-N,"4-(oxetan-3-yl)-2-(4,4,5,5-tetramethyl-1,3,2-d...",172645595,,285.153624


### Cleaning of the SMILES and convertion to ROMol objects
The InChIKey, new SMILES and InChIKey14 will be computed from the ROMol objects after desalting and uncharging has been done. 

In [25]:
PandasTools.AddMoleculeColumnToFrame(pubchemlite, smilesCol='SMILES_pubchem', molCol='ROMol')

pubchemlite.ROMol.isna().sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frame[molCol] = frame[smilesCol].map(Chem.MolFromSmiles)


np.int64(9)

In [26]:
pubchemlite[pubchemlite.ROMol.isna()==True]

Unnamed: 0,SMILES_pubchem,InChIKey_pubchem,name,pubchem_cid,XLogP_pubchem,monoisotopic_mass_pubchem,ROMol
13982,O=Cl(=O)(=O)F,XHFXMNZYIKFCPN-UHFFFAOYSA-N,perchloryl fluoride,24258,3.3,101.952,
14115,FBr(F)F,FQFKTKUFHWNTBN-UHFFFAOYSA-N,trifluoro-lambda3-bromane,24594,2.6,135.91355,
14117,FBr(F)(F)(F)F,XHVUVQAANZKEKF-UHFFFAOYSA-N,pentafluoro-lambda5-bromane,24606,3.7,173.91035,
14137,FCl(F)F,JOHWNGGYGAVMGU-UHFFFAOYSA-N,trifluoro-lambda3-chlorane,24637,2.5,91.964062,
32298,FCl(F)(F)(F)F,KNSWNNXPAWSACI-UHFFFAOYSA-N,pentafluoro-lambda5-chlorane,61654,3.6,129.960869,
46001,FI(F)(F)(F)(F)(F)F,XRURPHMPXJDCOO-UHFFFAOYSA-N,heptafluoro-lambda7-iodane,85645,4.7,259.89329,
162802,C1=CC=C2C(=C1)O[Si-2]34(O2)(OC5=CC=CC=C5O3)OC6...,JIICOOGKGWBPSB-UHFFFAOYSA-N,,3470663,,352.040315,
266999,CC1=CC(=[N+](C(=C1)C)[Br-][N+]2=C(C=C(C=C2C)C)C)C,DWDPGQBPOJKMQW-UHFFFAOYSA-N,"2,4,6-trimethyl-1-(2,4,6-trimethylpyridin-1-iu...",44629788,5.5,321.09664,
277611,C1=CC=C(C=C1)[Si](Cl)(Cl)[Cl-]Cl,MIUKAZMTBNWUDP-UHFFFAOYSA-N,dichloro-chlorochloranuidyl-phenylsilane,53400412,,244.891462,


In [27]:
pubchemlite = pubchemlite.dropna(subset=['ROMol'])

In [28]:
salts = pubchemlite.SMILES_pubchem.str.contains(r'\.')

salts.sum() #Is 0 so no salt removal needed

np.int64(0)

In [29]:
def contains_carbon(mol):
    try:
        atomic_no = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
        return bool(6 in atomic_no)
    except:
        return np.nan

pubchemlite['contains_carbon'] = pubchemlite.ROMol.apply(lambda x: contains_carbon(x))
pubchemlite = pubchemlite[pubchemlite.contains_carbon == True]
pubchemlite = pubchemlite.drop(columns=['contains_carbon'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pubchemlite['contains_carbon'] = pubchemlite.ROMol.apply(lambda x: contains_carbon(x))


In [30]:
uncharger = rdMolStandardize.Uncharger()  # neutralize the molecule (if possible)
pubchemlite['ROMol'] = pubchemlite.ROMol.apply(lambda x: uncharger.uncharge(x))

# def neutralize_atoms(mol):
#     try:
#         pattern = Chem.MolFromSmarts("[+1!h0!$([*]~[-1,-2,-3,-4]),-1!$([*]~[+1,+2,+3,+4])]")
#         at_matches = mol.GetSubstructMatches(pattern)
#         at_matches_list = [y[0] for y in at_matches]
#         if len(at_matches_list) > 0:
#             for at_idx in at_matches_list:
#                 atom = mol.GetAtomWithIdx(at_idx)
#                 chg = atom.GetFormalCharge()
#                 hcount = atom.GetTotalNumHs()
#                 atom.SetFormalCharge(0)
#                 atom.SetNumExplicitHs(hcount - chg)
#                 atom.UpdatePropertyCache()
#         return mol
#     except:
#         return mol

#pubchemlite_subset['ROMol'] = pubchemlite_subset.ROMol.apply(lambda x: neutralize_atoms(x))
pubchemlite.ROMol.apply(lambda x: Chem.RemoveStereochemistry(x)) 
pubchemlite['SMILES'] = pubchemlite.ROMol.apply(lambda x: Chem.MolToSmiles(x))
pubchemlite['InChIKey'] = pubchemlite.ROMol.apply(lambda x: inchi.MolToInchiKey(x))
pubchemlite['InChIKey14'] = pubchemlite.InChIKey.apply(lambda x: x.split('-')[0])

In [31]:
pubchemlite_dupl = pubchemlite[pubchemlite.duplicated(subset=['InChIKey', 'SMILES'], keep=False)]
pubchemlite_dupl = pubchemlite_dupl.sort_values(by='InChIKey')
pubchemlite_dupl

Unnamed: 0,SMILES_pubchem,InChIKey_pubchem,name,pubchem_cid,XLogP_pubchem,monoisotopic_mass_pubchem,ROMol,SMILES,InChIKey,InChIKey14
212955,C1=CC=C2C(=C1)N=NN2[O-],VGZMHTGJWOPMTH-UHFFFAOYSA-N,1-oxidobenzotriazole,11805229,0.7,134.035437,<rdkit.Chem.rdchem.Mol object at 0x1739d5770>,On1nnc2ccccc21,ASOKPJOREAFHNY-UHFFFAOYSA-N,ASOKPJOREAFHNY
40303,C1=CC=C2C(=C1)N=NN2O,ASOKPJOREAFHNY-UHFFFAOYSA-N,1-hydroxybenzotriazole,75771,1.2,135.043262,<rdkit.Chem.rdchem.Mol object at 0x1693cf450>,On1nnc2ccccc21,ASOKPJOREAFHNY-UHFFFAOYSA-N,ASOKPJOREAFHNY
72524,C=C1C=NC2=CC=CC=C12,BCNUXXXHEIUHJB-UHFFFAOYSA-N,3-methylideneindole,170404,1.7,129.057849,<rdkit.Chem.rdchem.Mol object at 0x14e988eb0>,C=C1C=Nc2ccccc21,BCNUXXXHEIUHJB-UHFFFAOYSA-N,BCNUXXXHEIUHJB
327565,[CH+]=C1C=NC2=CC=CC=C12,NSQIQDGIPQVASR-UHFFFAOYSA-N,3-methylideneindole,123131584,1.7,128.050024,<rdkit.Chem.rdchem.Mol object at 0x176c96500>,C=C1C=Nc2ccccc21,BCNUXXXHEIUHJB-UHFFFAOYSA-N,BCNUXXXHEIUHJB
259091,C1=CC=C2C(=C1)N(C(=O)C(O2)O)[O-],QUBXBHWFQCVXQY-UHFFFAOYSA-N,"2-hydroxy-4-oxido-1,4-benzoxazin-3-one",25203008,-0.1,180.029683,<rdkit.Chem.rdchem.Mol object at 0x14d15a0a0>,O=C1C(O)Oc2ccccc2N1O,COVOPZQGJGUPEY-UHFFFAOYSA-N,COVOPZQGJGUPEY
15950,C1=CC=C2C(=C1)N(C(=O)C(O2)O)O,COVOPZQGJGUPEY-UHFFFAOYSA-N,"2,4-dihydroxy-1,4-benzoxazin-3-one",28495,0.0,181.037508,<rdkit.Chem.rdchem.Mol object at 0x14d8959a0>,O=C1C(O)Oc2ccccc2N1O,COVOPZQGJGUPEY-UHFFFAOYSA-N,COVOPZQGJGUPEY
393497,CC(=C[CH+]C(=O)O)C,XZWWIRTVQPJTML-UHFFFAOYSA-O,4-methylpent-3-enoic acid,135939139,1.4,113.060255,<rdkit.Chem.rdchem.Mol object at 0x1499d4f90>,CC(C)=CCC(=O)O,CQJHAULYLJXJNL-UHFFFAOYSA-N,CQJHAULYLJXJNL
35975,CC(=CCC(=O)O)C,CQJHAULYLJXJNL-UHFFFAOYSA-N,4-methylpent-3-enoic acid,68158,1.4,114.06808,<rdkit.Chem.rdchem.Mol object at 0x300f18200>,CC(C)=CCC(=O)O,CQJHAULYLJXJNL-UHFFFAOYSA-N,CQJHAULYLJXJNL
320975,CN(C)CCCN(O)S(=O)(=O)CCC(C(C(C(C(C(F)(F)F)(F)F...,CUJCSBFFGKTHSK-UHFFFAOYSA-N,"N-[3-(dimethylamino)propyl]-3,3,4,4,5,5,6,6,7,...",102055960,4.2,528.07523,<rdkit.Chem.rdchem.Mol object at 0x13e478890>,CN(C)CCCN(O)S(=O)(=O)CCC(F)(F)C(F)(F)C(F)(F)C(...,CUJCSBFFGKTHSK-UHFFFAOYSA-N,CUJCSBFFGKTHSK
69861,CN(C)CCCN([O-])S(=O)(=O)CCC(C(C(C(C(C(F)(F)F)(...,COSQFXYPUSWVAZ-UHFFFAOYSA-N,"N-[3-(dimethylamino)propyl]-3,3,4,4,5,5,6,6,7,...",157338,4.2,527.067405,<rdkit.Chem.rdchem.Mol object at 0x14e9d4040>,CN(C)CCCN(O)S(=O)(=O)CCC(F)(F)C(F)(F)C(F)(F)C(...,CUJCSBFFGKTHSK-UHFFFAOYSA-N,CUJCSBFFGKTHSK


In [32]:
pubchemlite_dupl['monoisotopic_mass_new'] = pubchemlite_dupl.ROMol.apply(lambda x: Descriptors.ExactMolWt(x)) #calculates the monoisotopic mass of the molecule to determine which represenetation is the correct one
pubchemlite_dupl['monoisotopic_mass_diff'] = abs(pubchemlite_dupl.monoisotopic_mass_new - pubchemlite_dupl.monoisotopic_mass_pubchem) #gives absolute mass difference

pubchemlite_dupl_removed = pubchemlite_dupl[~(pubchemlite_dupl.monoisotopic_mass_diff > 1)] #Removes the duplicates with a mass difference of more than 1 Da
pubchemlite_dupl_removed = pubchemlite_dupl_removed.drop(columns=['monoisotopic_mass_new', 'monoisotopic_mass_diff']) #drops the columns concerning the mass difference
pubchemlite_dupl_removed

Unnamed: 0,SMILES_pubchem,InChIKey_pubchem,name,pubchem_cid,XLogP_pubchem,monoisotopic_mass_pubchem,ROMol,SMILES,InChIKey,InChIKey14
40303,C1=CC=C2C(=C1)N=NN2O,ASOKPJOREAFHNY-UHFFFAOYSA-N,1-hydroxybenzotriazole,75771,1.2,135.043262,<rdkit.Chem.rdchem.Mol object at 0x1693cf450>,On1nnc2ccccc21,ASOKPJOREAFHNY-UHFFFAOYSA-N,ASOKPJOREAFHNY
72524,C=C1C=NC2=CC=CC=C12,BCNUXXXHEIUHJB-UHFFFAOYSA-N,3-methylideneindole,170404,1.7,129.057849,<rdkit.Chem.rdchem.Mol object at 0x14e988eb0>,C=C1C=Nc2ccccc21,BCNUXXXHEIUHJB-UHFFFAOYSA-N,BCNUXXXHEIUHJB
15950,C1=CC=C2C(=C1)N(C(=O)C(O2)O)O,COVOPZQGJGUPEY-UHFFFAOYSA-N,"2,4-dihydroxy-1,4-benzoxazin-3-one",28495,0.0,181.037508,<rdkit.Chem.rdchem.Mol object at 0x14d8959a0>,O=C1C(O)Oc2ccccc2N1O,COVOPZQGJGUPEY-UHFFFAOYSA-N,COVOPZQGJGUPEY
35975,CC(=CCC(=O)O)C,CQJHAULYLJXJNL-UHFFFAOYSA-N,4-methylpent-3-enoic acid,68158,1.4,114.06808,<rdkit.Chem.rdchem.Mol object at 0x300f18200>,CC(C)=CCC(=O)O,CQJHAULYLJXJNL-UHFFFAOYSA-N,CQJHAULYLJXJNL
320975,CN(C)CCCN(O)S(=O)(=O)CCC(C(C(C(C(C(F)(F)F)(F)F...,CUJCSBFFGKTHSK-UHFFFAOYSA-N,"N-[3-(dimethylamino)propyl]-3,3,4,4,5,5,6,6,7,...",102055960,4.2,528.07523,<rdkit.Chem.rdchem.Mol object at 0x13e478890>,CN(C)CCCN(O)S(=O)(=O)CCC(F)(F)C(F)(F)C(F)(F)C(...,CUJCSBFFGKTHSK-UHFFFAOYSA-N,CUJCSBFFGKTHSK
318649,C/C(=C/C(=O)N(CCCC1C(=O)NC(C(=O)NC(C(=O)NC(C(=...,DKSPATNIHZSGOP-YZSKCGIOSA-N,(Z)-5-[3-[5-[3-[[(E)-4-carboxy-3-methylbut-2-e...,91819949,-3.2,999.403285,<rdkit.Chem.rdchem.Mol object at 0x13e4c46d0>,CC(=CC(=O)N(O)CCCC1NC(=O)CNC(=O)C(CO)NC(=O)C(C...,DKSPATNIHZSGOP-UHFFFAOYSA-N,DKSPATNIHZSGOP
830,COC1=CC2=C(C=C1)N(C(=O)C(O2)O)O,GDNZNIJPBQATCZ-UHFFFAOYSA-N,"2,4-dihydroxy-7-methoxy-1,4-benzoxazin-3-one",2358,0.0,211.048072,<rdkit.Chem.rdchem.Mol object at 0x1763025e0>,COc1ccc2c(c1)OC(O)C(=O)N2O,GDNZNIJPBQATCZ-UHFFFAOYSA-N,GDNZNIJPBQATCZ
37218,C1=CC=C(C=C1)PC2=CC=CC=C2,GPAYUJZHTULNBE-UHFFFAOYSA-N,diphenylphosphane,70017,3.0,186.059837,<rdkit.Chem.rdchem.Mol object at 0x1693965e0>,c1ccc(Pc2ccccc2)cc1,GPAYUJZHTULNBE-UHFFFAOYSA-N,GPAYUJZHTULNBE
161598,C1=CC=C(C=C1)CC(=S)NO,IHTJGIKQNHDTSX-UHFFFAOYSA-N,N-hydroxy-2-phenylethanethioamide,3295299,1.2,167.040485,<rdkit.Chem.rdchem.Mol object at 0x14a3acd60>,ONC(=S)Cc1ccccc1,IHTJGIKQNHDTSX-UHFFFAOYSA-N,IHTJGIKQNHDTSX
70075,C1=CN(C(=N1)[N+](=O)[O-])CC(=O)NO,KAIQUJDTIFXCLP-UHFFFAOYSA-N,N-hydroxy-2-(2-nitroimidazol-1-yl)acetamide,159484,-1.1,186.038905,<rdkit.Chem.rdchem.Mol object at 0x14e9c2260>,O=C(Cn1ccnc1[N+](=O)[O-])NO,KAIQUJDTIFXCLP-UHFFFAOYSA-N,KAIQUJDTIFXCLP


In [None]:
pubchemlite_no_dupl = pubchemlite[~pubchemlite.InChIKey.isin(pubchemlite_dupl.InChIKey)] #Remove any duplicates from the original dataframe

pubchemlite_std = pd.concat([pubchemlite_no_dupl, pubchemlite_dupl_removed], ignore_index=True).reset_index(drop=True) #removes the duplicates from the original dataframe and appends the new ones


In [50]:
pubchemlite_std.shape

(407967, 10)

In [None]:
# with open('2025-04-09_PubChemLite_smiles_cleaned.pkl', 'wb') as f:
#       pickle.dump(pubchemlite, f)

### Standardization of the data using the PubChem StandardUtils

Divide into multiple batches to process over multiple evenings, and then add onto each other when they are all done. 

In [None]:
with open('2025-04-09_PubChemLite_smiles_cleaned.pkl', 'rb') as f:
    pubchemlite = pickle.load(f)

In [None]:
pubchemlite_batches = np.array_split(pubchemlite, 4)

  return bound(*args, **kwds)


In [5]:
pubchemlite_batches[0]

Unnamed: 0,SMILES,InChIKey,CompoundName,PubChem_cid,XLogP,MonoisotopicMass,ROMol,InChIKey14
0,O=C(O)C1=CC=CC(O)C1O,INCSWYKICIYAHB-UHFFFAOYSA-N,"5,6-dihydroxycyclohexa-1,3-diene-1-carboxylic ...",3,-0.3,156.042259,<rdkit.Chem.rdchem.Mol object at 0x169ff38d0>,INCSWYKICIYAHB
1,CC(O)CN,HXKKHQJGJAFBHI-UHFFFAOYSA-N,1-aminopropan-2-ol,4,-1.0,75.068414,<rdkit.Chem.rdchem.Mol object at 0x169ff3920>,HXKKHQJGJAFBHI
2,NCC(=O)COP(=O)(O)O,HIQNVODXENYOFK-UHFFFAOYSA-N,(3-amino-2-oxopropyl) dihydrogen phosphate,5,-5.0,169.014009,<rdkit.Chem.rdchem.Mol object at 0x169ff3970>,HIQNVODXENYOFK
3,O=[N+]([O-])c1ccc(Cl)c([N+](=O)[O-])c1,VYZAHLCBVHPDDF-UHFFFAOYSA-N,"1-chloro-2,4-dinitrobenzene",6,2.3,201.978134,<rdkit.Chem.rdchem.Mol object at 0x169ff39c0>,VYZAHLCBVHPDDF
4,CCC(C)(O)C(O)C(=O)O,PDGXJDXVGMHUIR-UHFFFAOYSA-N,"2,3-dihydroxy-3-methylpentanoic acid",8,-0.4,148.073559,<rdkit.Chem.rdchem.Mol object at 0x169ff3a10>,PDGXJDXVGMHUIR
...,...,...,...,...,...,...,...,...
101987,COC(=O)c1cnc(N)[nH]1,NDTWFWAWLBGFQI-UHFFFAOYSA-N,methyl 2-amino-1H-imidazole-5-carboxylate,588062,-0.1,141.053826,<rdkit.Chem.rdchem.Mol object at 0x32e3a74c0>,NDTWFWAWLBGFQI
101988,O=C(Cn1cncn1)c1ccc(F)cc1F,XCHRPVARHBCFMJ-UHFFFAOYSA-N,"1-(2,4-difluorophenyl)-2-(1,2,4-triazol-1-yl)e...",588080,1.7,223.055718,<rdkit.Chem.rdchem.Mol object at 0x32e3a7510>,XCHRPVARHBCFMJ
101989,O=C(Cl)c1ccc(F)cc1F,JSWRVDNTKPAJLB-UHFFFAOYSA-N,"2,4-difluorobenzoyl chloride",588081,2.6,175.984049,<rdkit.Chem.rdchem.Mol object at 0x32e3a7560>,JSWRVDNTKPAJLB
101990,O=C(Cl)c1cc(F)ccc1F,RLRUKKDFNWXXRT-UHFFFAOYSA-N,"2,5-difluorobenzoyl chloride",588082,2.6,175.984049,<rdkit.Chem.rdchem.Mol object at 0x32e3a75b0>,RLRUKKDFNWXXRT


In [41]:
pubchemlite_head = pubchemlite.head(1000)

In [43]:
from standardizeUtils.standardizeUtils import standardize_structure_with_pubchem
from standardizeUtils.standardizeUtils import standardize_structure_list_with_pubchem

import time
import requests  



def standardize_in_batches(smiles_list, initial_batch_size=1000):
    """
    Standardizes SMILES in batches, reducing batch size on failure.
    Input: SMILES list
    Output: Standardized SMILES list
    """
    total_smiles = len(smiles_list)
    processed_smiles = []
    idx = 0
    batch_size = initial_batch_size # Starting with 1000

    while idx < total_smiles:
        attempt_success = False
        current_batch_size = batch_size

        while current_batch_size >= 100:
            try:
                print(f"Processing batch {idx}:{idx + current_batch_size} (size={current_batch_size})")
                batch = smiles_list[idx : idx + current_batch_size]
                standardized_batch = standardize_structure_list_with_pubchem(batch, 'smiles')
                processed_smiles.extend(standardized_batch)
                idx += current_batch_size
                attempt_success = True
                break  

            except requests.exceptions.RequestException as e:
                print(f"Error occurred: {e}, reducing batch size...")
                current_batch_size //= 2  # Reduce batch size by half on failure > can be changed
                time.sleep(2)  

        if not attempt_success:
            print(f"Skipping batch {idx}:{idx + current_batch_size} due to repeated failures.")
            idx += current_batch_size  

    return processed_smiles

pubchemlite_head['std_SMILES'] = standardize_in_batches(pubchemlite_head.SMILES.to_list())

Processing batch 0:1000 (size=1000)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pubchemlite_head['std_SMILES'] = standardize_in_batches(pubchemlite_head.SMILES.to_list())


In [None]:
pubchemlite_head[pubchemlite_head.SMILES_pubchem!=pubchemlite_head.std_SMILES][['SMILES_pubchem', 'std_SMILES']] # The SMILES from standardization and pubchem smiles might in some cases differ, standardization is therefore needed. 

Unnamed: 0,SMILES_pubchem,std_SMILES
237,CN1CC(=O)N=C1N,CN1CC(=O)NC1=N


In [14]:
#pubchemlite_batches[0]['std_SMILES'] = standardize_in_batches(pubchemlite_batches[0].SMILES.to_list())
#pubchemlite_batches[1]['std_SMILES'] = standardize_in_batches(pubchemlite_batches[1].SMILES.to_list())
pubchemlite_batches[2]['std_SMILES'] = standardize_in_batches(pubchemlite_batches[2].SMILES.to_list())
#pubchemlite_batches[3]['std_SMILES'] = standardize_in_batches(pubchemlite_batches[3].SMILES.to_list())

with open('2025-03-19_PubChemLite_std_smiles_batch3.pkl', 'wb') as f:
    pickle.dump(pubchemlite_batches[2], f)

Processing batch 0:1000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "Fc1ccc([BH](F)(F)F)cc1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CCCCC#C[BH](F)(F)F".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "O=Cc1ccccc1[BH](F)(F)F".


Processing batch 1000:2000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CC(=O)O[BH](OC(C)=O)OC(C)=O".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "C[BH](C)(C)C".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "c1cnn([BH](n2cccn2)n2cccn2)c1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "Cc1cc(C)n([BH](n2nc(C)cc2C)n2nc(C)cc2C)n1".
The standardization request failed. The response contains the following "PCT-Status" tag: <

Processing batch 2000:3000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)C#Cc1ccccc1".


Processing batch 3000:4000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)CCCCl".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "C=C[BH](F)(F)F".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CC[BH](CC)CC".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "C[Si](C)(C)C#C[BH](F)(F)F".


Processing batch 4000:5000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)C1CC1c1ccccc1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "Cc1ccc(C=C[BH](F)(F)F)cc1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "FC(F)(F)C(F)(F)[PH](F)(F)(F)(C(F)(F)C(F)(F)F)C(F)(F)C(F)(F)F".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "Fc1cccc(F)c1[BH](F)(F)F".
The standardization request failed. The response contains the follow

Processing batch 5000:6000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)c1ccc(Cl)cc1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "O=C1O[BH]2(OC1=O)OC(=O)C(=O)O2".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CO[BH](OC)(OC)C(F)(F)F".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "O=C(O)c1cccc([BH](F)(F)F)c1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Sta

Processing batch 6000:7000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "Fc1cc(F)cc([BH](F)(F)F)c1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CC(=O)c1ccc([BH](F)(F)F)cc1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CC(C)(C)c1ccc([BH](F)(F)F)cc1".


Processing batch 7000:8000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CC[BH](F)(F)F".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "C=Cc1ccc([BH](F)(F)F)cc1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "N#CC[BH](F)(F)F".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)CBr".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exceptio

Processing batch 8000:9000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)C1CCCCC1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "O=Cc1cc([BH](F)(F)F)cs1".


Processing batch 9000:10000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "Cc1cc(C)c(P(c2c(C)cc(C)cc2C)c2c(F)c(F)c([BH](c3c(F)c(F)c(F)c(F)c3F)c3c(F)c(F)c(F)c(F)c3F)c(F)c2F)c(C)c1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "C=C(C)[BH](F)(F)F".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "O=C1O[BH]2(OC(=O)c3ccccc3O2)Oc2ccccc21".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)C(F)(F)C(F)(F)F".
The standardizat

Processing batch 10000:11000 (size=1000)
Processing batch 11000:12000 (size=1000)
Processing batch 12000:13000 (size=1000)
Processing batch 13000:14000 (size=1000)
Processing batch 14000:15000 (size=1000)
Processing batch 15000:16000 (size=1000)
Processing batch 16000:17000 (size=1000)
Processing batch 17000:18000 (size=1000)
Processing batch 18000:19000 (size=1000)
Processing batch 19000:20000 (size=1000)
Processing batch 20000:21000 (size=1000)
Processing batch 21000:22000 (size=1000)
Processing batch 22000:23000 (size=1000)
Processing batch 23000:24000 (size=1000)
Processing batch 24000:25000 (size=1000)
Processing batch 25000:26000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CCC[BH](CCC)(CCC)CCC".


Processing batch 26000:27000 (size=1000)
Processing batch 27000:28000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)c1cc2ccccc2o1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "O=C(Nc1ccccc1)c1ccc([BH](F)(F)F)cc1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)c1ccc(I)cc1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CC(C)(C)OC(=O)n1c([BH](F)(F)F)cc2ccccc21".


Processing batch 28000:29000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)CN1CCCCC1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CN(Cc1ccccc1)C[BH](F)(F)F".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CCC(C)C(NC(=O)C(C)NC(=O)CNC(=O)C(C)NC(=O)C(CCCCN)NC(=O)C(CO)NC(=O)C(C)NC(=O)C(CCSC)NC(=O)CN)C(=O)NC(C)C(=O)NCC(=O)NC(CCCCN)C(=O)NC(C(=O)NC(C)C(=O)NC(CCCCN)C(=O)NC(C(=O)NC(C)C(=O)NC(CC(C)C)C(=O)NC(CCCCN)C(=O)NC(C)C(=O)NC(CC(C)C)C(N)=O)C(C)C)C(C)CC".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="

Processing batch 29000:30000 (size=1000)
Processing batch 30000:31000 (size=1000)
Processing batch 31000:32000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CN(C)[AlH](N(C)C)(N(C)C)N(C)C".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "OCCN(CCO[AlH]12OCCN(CCO1)CCO2)CCO[AlH]12OCCN(CCO1)CCO2".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "FC(F)(F)C(O[AlH](OC(C(F)(F)F)(C(F)(F)F)C(F)(F)F)(OC(C(F)(F)F)(C(F)(F)F)C(F)(F)F)OC(C(F)(F)F)(C(F)(F)F)C(F)(F)F)(C(F)(F)F)C(F)(F)F".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' fo

Processing batch 32000:33000 (size=1000)
Processing batch 33000:34000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)c1cccc(Br)c1".


Processing batch 34000:35000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)c1cncc2ccccc12".


Processing batch 35000:36000 (size=1000)
Processing batch 36000:37000 (size=1000)
Processing batch 37000:38000 (size=1000)
Processing batch 38000:39000 (size=1000)
Processing batch 39000:40000 (size=1000)
Processing batch 40000:41000 (size=1000)
Processing batch 41000:42000 (size=1000)
Processing batch 42000:43000 (size=1000)
Processing batch 43000:44000 (size=1000)
Processing batch 44000:45000 (size=1000)
Processing batch 45000:46000 (size=1000)
Processing batch 46000:47000 (size=1000)
Processing batch 47000:48000 (size=1000)
Processing batch 48000:49000 (size=1000)
Processing batch 49000:50000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)CN1CCCC1".


Processing batch 50000:51000 (size=1000)
Processing batch 51000:52000 (size=1000)
Processing batch 52000:53000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "O=Cc1ccc(F)c([BH](F)(F)F)c1".


Processing batch 53000:54000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "C[Si](C)(C)CCOC[BH](F)(F)F".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CC(C)O[BH](OC(C)C)(OC(C)C)c1ccccn1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)C=Cc1ccc(Br)cc1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CSCCC(NC(=O)C(N)CCCCN)C(=O)NC(CCC(N)=O)C(=O)NC(C(=O)NC(CC(=O)O)C(=O)OC(=O)C1CCCN1C(=O)C1CCCN1C(=O)C(CC(=O)O)NC(=O)C(C

Processing batch 54000:55000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "Cc1ccc([BH](F)(F)F)o1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "O=Cc1ccc([BH](F)(F)F)o1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)c1ccn[nH]1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)c1ccncc1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-er

Processing batch 55000:56000 (size=1000)
Processing batch 56000:57000 (size=1000)
Processing batch 57000:58000 (size=1000)
Processing batch 58000:59000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CC(C)(C)OC(=O)NCC[BH](F)(F)F".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "N#CCC[BH](F)(F)F".


Processing batch 59000:60000 (size=1000)
Processing batch 60000:61000 (size=1000)
Processing batch 61000:62000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CC(c1ccccc1)[BH](F)(F)F".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)c1ccc(CN2CCOCC2)cc1".


Processing batch 62000:63000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)c1cccs1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CCCCOc1ccc([BH](F)(F)F)cc1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)c1ccccc1OCc1ccccc1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "Cc1cc(OCc2ccccc2)ccc1[BH](F)(F)F".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-

Processing batch 63000:64000 (size=1000)
Processing batch 64000:65000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)COc1ccccc1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "O=Cc1ccc([BH](F)(F)F)s1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)c1cc2ccccc2s1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)CNC1CCCCC1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="da

Processing batch 65000:66000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "N#Cc1ccccc1[BH](F)(F)F".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "Oc1ccc([BH](F)(F)F)cc1".


Processing batch 66000:67000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CCCCC[BH](F)(F)F".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)c1cccc(I)c1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)c1ccc2nonc2c1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "O=C(c1ccc([BH](F)(F)F)cc1)N1CCOCC1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status va

Processing batch 67000:68000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CC(C)(C)OC(=O)CC[BH](F)(F)F".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)CC1CCCC1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)CC1CCC1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CC=C[BH](F)(F)F".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/

Processing batch 68000:69000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "COCCC[BH](F)(F)F".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)CC1CC1".


Processing batch 69000:70000 (size=1000)
Processing batch 70000:71000 (size=1000)
Processing batch 71000:72000 (size=1000)
Processing batch 72000:73000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)c1ccccn1".


Processing batch 73000:74000 (size=1000)
Processing batch 74000:75000 (size=1000)
Processing batch 75000:76000 (size=1000)
Processing batch 76000:77000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CC(C)(C)OC(=O)NC[BH](F)(F)F".


Processing batch 77000:78000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "OCc1cccc([BH](F)(F)F)c1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "OC[BH](F)(F)F".


Processing batch 78000:79000 (size=1000)
Processing batch 79000:80000 (size=1000)
Processing batch 80000:81000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CN(C)CCC[BH](F)(F)F".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "Nc1cccc([BH](F)(F)F)c1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)C=CC1CC1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CCC(C)(C)[BH](F)(F)F".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"

Processing batch 81000:82000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "OC1CN(C[BH](F)(F)F)C1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CC(=O)NC1C(OC(C)C(NC(=O)C(C)NC(=O)C(CC(N)=O)NC(=O)C(CO)NC(=O)CNC(=O)CNC(=O)C(CCC(=O)O)NC(=O)C(CCC(=O)O)NC(=O)C(CO)NC(=O)C2CCC(=O)N2)C(=O)NC(CCCCN)C(=O)NC(CCCCN)C(=O)N2CCCC2C(=O)NC(CNC(C(=O)NC(CC(C)C)C(=O)O)C(C)C)Cc2ccc(O)cc2)CC(CO)C(O)C1OC1CC(CO)C(O)C(O)C1O".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "C=C(CN1CCOCC1)[BH](F)(F)F".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-St

Processing batch 82000:83000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)CCCc1ccccc1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CCC(C)C(NC(=O)C(Cc1ccc(O)cc1)NC(=O)CNC(=O)C(CC(=O)O)NC(=O)C(CCCCN)NC(=O)C(N)C(C)C)C(=O)NC(C(=O)NC(CC(=O)O)C(=O)NC(CC(=O)O)C(=O)NC(CCCNC(=N)N)C(=O)NC(CC(N)=O)C(=O)NC1CSSCC(C(=O)NC(CC(N)=O)C(N)=O)NC(=O)C(CCCNC(=N)N)NC(=O)CNC(=O)C2CCCN2C(=O)CNC(=O)C(CCCCN)NC(=O)C(C(C)O)NC(=O)C(CCCNC(=N)N)NC(=O)C(C(C)C)NC(=O)C(Cc2cnc[nH]2)NC(=O)C(CC(=O)O)NC(=O)C2CCCN2C(=O)C(C(C)C)NC(=O)C(CCCCN)NC(=O)C(Cc2ccc(O)cc2)NC(=O)C2CSSCC3NC(=O)C(CCC(=O)O)NC(=O)C(CCC(=O)O)NC(=O)C(CC(N)=O)NC(=O)C4CSSCC(NC(=O)C(C)NC(=O)C(CC(N)=O)NC(=O)CNC(=O)C(Cc5ccc(O)cc5)NC(=O)C5CCCN5C(=O)C(CO)NC(=O)C(C)NC(=O)C(Cc5c[nH]c6c

Processing batch 83000:84000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "O=C1c2ccccc2C(=O)N1C[BH](F)(F)F".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CCCCN(CCCC)[AlH](N(CCCC)CCCC)(N(CCCC)CCCC)N(CCCC)CCCC".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "[N-]=[N+]=NCc1ccccc1[BH](F)(F)F".


Processing batch 84000:85000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "O=C1O[BH](F)(F)OC1=O".


Processing batch 85000:86000 (size=1000)
Processing batch 86000:87000 (size=1000)
Processing batch 87000:88000 (size=1000)
Processing batch 88000:89000 (size=1000)
Processing batch 89000:90000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)C1CCOCC1".


Processing batch 90000:91000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)c1ccccc1OC(F)(F)F".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "COc1ccc([BH](F)(F)F)cc1OC".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CCCCc1ccc([BH](F)(F)F)cc1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)c1ccc2c(c1)CCO2".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-S

Processing batch 91000:92000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "COCCOCC[BH](F)(F)F".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CC1CCC([BH](F)(F)F)CC1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CCOCCC[BH](F)(F)F".


Processing batch 92000:93000 (size=1000)
Processing batch 93000:94000 (size=1000)
Processing batch 94000:95000 (size=1000)
Processing batch 95000:96000 (size=1000)
Processing batch 96000:97000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "O=C(c1ccc([BH](F)(F)F)cc1)N1CCCCC1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "O=C(c1cccc([BH](F)(F)F)c1)N1CCCCC1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CCN(CC)C(=O)c1ccc([BH](F)(F)F)cc1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "O=C(NCc1ccccc1)c1ccc([BH](F)(F)F)cc1".


Processing batch 97000:98000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CON(C)C(=O)c1ccc([BH](F)(F)F)cc1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CC1(C)c2ccccc2-c2ccc([BH](F)(F)F)cc21".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "[N-]=[N+]=NCc1ccc([BH](F)(F)F)cc1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "[N-]=[N+]=NCc1cccc([BH](F)(F)F)c1".
The standardization request failed. The response contains the fol

Processing batch 98000:99000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CC(C)(C)OC(=O)N1CC([BH](F)(F)F)C1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)C1COC1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CC(C)(C)OC(=O)N1CCC([BH](F)(F)F)CC1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CC(C)(C)OC(=O)N1CCC([BH](F)(F)F)C1".


Processing batch 99000:100000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "F[BH](F)(F)CCc1ccccn1".


Processing batch 100000:101000 (size=1000)
Processing batch 101000:102000 (size=1000)


The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "CC(C)C(N)C(=O)OC(C)C1CC=CCCC(O)C(C)(C)C2CCC(C)C3(O2)O[BH]24OC(C(=O)O1)C1(OC(CCC1C)C(C)(C)C(O)CCCC1CC(OC(=O)C3O2)C(C)O1)O4".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "COCCNC(=O)c1ccc([BH](F)(F)F)cc1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "COCCNC(=O)c1cccc([BH](F)(F)F)c1".
The standardization request failed. The response contains the following "PCT-Status" tag: <PCT-Status value="data-error"/>
An exception of class InputDataError was thrown. Return 'None' for query "O=C(c1ccc([BH](F)(F)

In [12]:
pubchemlite_batches[1][pubchemlite_batches[1].std_SMILES.isna()]

Unnamed: 0,SMILES,InChIKey,CompoundName,PubChem_cid,XLogP,MonoisotopicMass,ROMol,InChIKey14,std_SMILES
118475,[2H][BH]([2H])([2H])C#N,BBWWRPHGUGQYTM-BMSJAHLVSA-N,cyano(trideuterio)boranuide,2724022,,43.054685,<rdkit.Chem.rdchem.Mol object at 0x338242980>,BBWWRPHGUGQYTM,
125203,FC(F)(F)c1cc([BH](c2cc(C(F)(F)F)cc(C(F)(F)F)c2...,LSJLFVQOCPUFMW-UHFFFAOYSA-N,"tetrakis[3,5-bis(trifluoromethyl)phenyl]boranuide",2778201,,863.064881,<rdkit.Chem.rdchem.Mol object at 0x33ae56840>,LSJLFVQOCPUFMW,
125930,F[BH](F)(F)c1ccccc1,TWOKLPOPVVGTTR-UHFFFAOYSA-N,trifluoro(phenyl)boranuide,2782845,,145.04364,<rdkit.Chem.rdchem.Mol object at 0x33b610c70>,TWOKLPOPVVGTTR,
125931,COc1ccccc1[BH](F)(F)F,UMTMKESCTWSUTC-UHFFFAOYSA-N,trifluoro-(2-methoxyphenyl)boranuide,2782847,,175.054204,<rdkit.Chem.rdchem.Mol object at 0x33b610cc0>,UMTMKESCTWSUTC,
125932,Cc1ccccc1[BH](F)(F)F,ZMZPFKAYZNLWMF-UHFFFAOYSA-N,trifluoro-(2-methylphenyl)boranuide,2782849,,159.05929,<rdkit.Chem.rdchem.Mol object at 0x33b610d10>,ZMZPFKAYZNLWMF,
125933,COc1ccc([BH](F)(F)F)cc1F,ZRSQBNZILRUUOM-UHFFFAOYSA-N,trifluoro-(3-fluoro-4-methoxyphenyl)boranuide,2782851,,193.044783,<rdkit.Chem.rdchem.Mol object at 0x33b610d60>,ZRSQBNZILRUUOM,
125934,COc1ccc([BH](F)(F)F)cc1,ZBWDBWABYIOLLG-UHFFFAOYSA-N,trifluoro-(4-methoxyphenyl)boranuide,2782853,,175.054204,<rdkit.Chem.rdchem.Mol object at 0x33b610db0>,ZBWDBWABYIOLLG,
125935,CS(=O)(=O)c1ccccc1[BH](F)(F)F,VUPYXBLIMCNRBG-UHFFFAOYSA-N,trifluoro-(2-methylsulfonylphenyl)boranuide,2782857,,223.02119,<rdkit.Chem.rdchem.Mol object at 0x33b610e00>,VUPYXBLIMCNRBG,
125936,CS(=O)(=O)Nc1cccc([BH](F)(F)F)c1,ZKTVPPQABWFBNG-UHFFFAOYSA-N,trifluoro-[3-(methanesulfonamido)phenyl]boranuide,2782859,,238.032089,<rdkit.Chem.rdchem.Mol object at 0x33b610e50>,ZKTVPPQABWFBNG,
125937,F[BH](F)(F)c1cccc(Cl)c1,UBLNYMJDVJUGMF-UHFFFAOYSA-N,(3-chlorophenyl)-trifluoroboranuide,2782861,,179.004668,<rdkit.Chem.rdchem.Mol object at 0x33b610ea0>,UBLNYMJDVJUGMF,


### Connection to terminal to make fingerprints from the SMILES

Will connect to the SIRIUS CLI to use the fingerprinter function, that will convert the standardized SMILES to fingerprints. 

Code made by Assist. Prof. Ida Rahu. 

In [None]:
import subprocess

import pandas as pd
import numpy as np

import requests

email = 'ellinor.samuelsson-hoppe@su.se'
password = 'Canmake123321?'

#subprocess.run('/Applications/sirius.app/Contents/MacOS/sirius') #added to work with sirius command

process = subprocess.Popen(['/Applications/sirius.app/Contents/MacOS/sirius', 'login', '-u', email, '-p'], 
                           stdout=subprocess.PIPE, 
                           stderr=subprocess.PIPE,
                           stdin=subprocess.PIPE,
                           text=True)

process.communicate(input=password + '\n')

stdout, stderr = process.communicate()
print('STDOUT:\n', stdout)
print('STDERR:\n', stderr)

path = '/Users/elli/Library/CloudStorage/OneDrive-Kruvelab/Ellinor - Master thesis/Data/PubChemLite/'

smiles_file = path + '\\.tsv'
output_file = path + '\\.tsv'
fp_version_file = path + '\\.tsv'
charge = 1 # 1 for ESI+/ -1 for ESI-
command = ['/Applications/sirius.app/Contents/MacOS/sirius', '-i', smiles_file, 'fingerprinter', '--charge', str(charge), '-o', output_file, '-v', fp_version_file]

result = subprocess.run(command, capture_output=True, text=True)

print('STDOUT:\n', result.stdout)
print('STDERR:\n', result.stderr)

STDOUT:
 Enter value for --password (Console password input.): 
Active Subscription is: 'sub|adc29910-cf73-11ec-9d64-0242ac120002 - Academic License'.
Login successful!

STDERR:
 Mar 12, 2025 3:18:15 PM org.apache.commons.beanutils.FluentPropertyBeanIntrospector introspect
INFO: Error when creating PropertyDescriptor for public final void org.apache.commons.configuration2.AbstractConfiguration.setProperty(java.lang.String,java.lang.Object)! Ignoring this property.
Mar 12, 2025 3:18:16 PM de.unijena.bioinf.ms.frontend.core.ApplicationCore <clinit>
INFO: Sirius Workspace Successfull initialized at: /Users/elli/.sirius-5.8
Mar 12, 2025 3:18:16 PM de.unijena.bioinf.ms.frontend.core.ApplicationCore <clinit>
INFO: You run SIRIUS 5.8.7-SNAPSHOT on mac os x_x86_64
Mar 12, 2025 3:18:16 PM de.unijena.bioinf.ms.frontend.core.ApplicationCore <clinit>
INFO: You run SIRIUS in 'CLI' mode.
Mar 12, 2025 3:18:16 PM de.unijena.bioinf.ms.frontend.core.ApplicationCore <clinit>
INFO: Sirius was compiled wit

In [None]:
with open('2025-03-12_PubChemLite_cleaned_smiles.pkl', 'wb') as f:
    pickle.dump(pubchemlite_subset, f)