In [1]:
import warnings
import numpy as np
import pandas as pd
from pubchempy import get_compounds, Compound

# Install a blanket "ignore" filter: no category is given, so every warning
# category is suppressed from here on (including deprecation notices raised by
# the libraries used in the cells below).
warnings.filterwarnings("ignore")

In [2]:
def _lookup_isomeric_smiles(identifier):
    """Return the isomeric SMILES PubChem reports for ``identifier``.

    ``identifier`` (a CAS number or a chemical name) is searched in the PubChem
    'name' namespace. The sentinel 'error' is returned when the search yields
    no compound, or when the selected compound has no isomeric SMILES.
    """
    smiles = 'error'
    # NOTE(review): when several compounds match, the LAST one wins. This is
    # kept from the original implementation; confirm it is intended, since
    # taking the first hit is the more common convention.
    for compound in get_compounds(identifier, 'name'):
        smiles = compound.isomeric_smiles
    # A compound without a SMILES is a failed lookup, not a valid value.
    return 'error' if smiles is None else smiles


def _resolve_smiles(df, position, smi):
    """Choose the supplementary SMILES for the row at integer ``position``.

    Priority: the row's own SMILES, then a lookup by CAS number, then a lookup
    by chemical name. 'null' is returned when all three are missing.
    """
    if not pd.isnull(smi):
        return smi
    # Columns are read lazily so that frames in which every SMILES is present
    # do not need the 'cas_number' / 'chemical_name' columns at all.
    cas = df["cas_number"].iloc[position]
    if not pd.isnull(cas):
        return _lookup_isomeric_smiles(cas)
    chem = df["chemical_name"].iloc[position]
    if pd.isnull(chem):
        return 'null'
    return _lookup_isomeric_smiles(chem)


def get_smiles(df):
    """Collect and complete SMILES into a 'smiles_supplementary' column.

    For every row the existing 'smiles' value is reused when present.
    Otherwise an isomeric SMILES is looked up via the 'cas_number', or failing
    that the 'chemical_name'. Rows with none of the three get the sentinel
    'null'; rows whose lookup fails or finds nothing get the sentinel 'error'.

    Rows are addressed by position, so the frame may carry any index (filtered,
    non-contiguous, duplicated). The column is also created for an empty frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'smiles' column. 'cas_number' and 'chemical_name' are
        read only for rows whose SMILES is missing.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with 'smiles_supplementary' added or
        overwritten.
    """
    resolved = []
    for position, smi in enumerate(df["smiles"]):
        # noinspection PyBroadException
        try:
            value = _resolve_smiles(df, position, smi)
        except Exception:
            # Best effort: any lookup/data failure marks the row as 'error'.
            # KeyboardInterrupt / SystemExit are deliberately NOT caught so a
            # long-running batch of network requests can still be aborted.
            value = 'error'
        resolved.append(value)

    # Single positional assignment: independent of the index labels.
    df["smiles_supplementary"] = resolved
    return df

In [3]:
def del_uncertain(df):
    """Split rows into failed SMILES lookups and rows with a usable SMILES.

    The 'smiles_supplementary' column is interpreted as follows:

    * exactly 'error'            -> goes to the error table;
    * exactly 'null', or missing -> dropped from both tables;
    * anything else              -> kept as a usable SMILES.

    The input frame is left untouched. Both returned frames are copies whose
    'smiles_supplementary' column is cast to string and whose index is reset
    to 0..n-1.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'smiles_supplementary' column.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(rows_with_error, rows_with_usable_smiles)``.
    """
    supplementary = df["smiles_supplementary"]
    # Record missing cells BEFORE the string cast: afterwards NaN/None may turn
    # into the literal text 'nan'/'None' and would pass as a valid SMILES.
    is_missing = supplementary.isna()
    as_text = supplementary.astype(str)

    # Compare against the sentinels exactly rather than by regex substring.
    is_error = as_text.eq('error') & ~is_missing
    is_null = as_text.eq('null') | is_missing

    # assign() returns a new frame, so the caller's column keeps its dtype.
    normalised = df.assign(smiles_supplementary=as_text)

    df1 = normalised.loc[is_error].reset_index(drop=True)
    df3 = normalised.loc[~(is_error | is_null)].reset_index(drop=True)

    return df1, df3

In [4]:
if __name__ == "__main__":

    # Load the input workbook and fill in the supplementary SMILES column.
    data = get_smiles(pd.read_excel("2_delete_uncertain_VDss.xlsx"))

    # Split the table: rows whose lookup failed vs. rows with a usable SMILES.
    smiles_error, delete_uncertain = del_uncertain(data)

    # Write both tables to Excel, leaving out the row index.
    smiles_error.to_excel(excel_writer='3_smiles_error.xlsx', index=False)
    delete_uncertain.to_excel(excel_writer='3_delete_uncertain_smiles.xlsx', index=False)