In [1]:
import warnings
import pandas as pd
from pubchempy import get_compounds, Compound

# NOTE(review): this silences every warning category for the whole session,
# so deprecation or pandas warnings raised by later cells will not be shown.
warnings.filterwarnings("ignore")

In [2]:
# Load the two input workbooks:
#   modify_data   - rows whose SMILES are looked up again further down
#   del_uncertain - table the recovered rows are appended to at the end
modify_data = pd.read_excel("3_modify.xlsx")
del_uncertain = pd.read_excel("3_delete_uncertain_smiles.xlsx")
# Report the (rows, columns) of each frame, one per line.
print(modify_data.shape, del_uncertain.shape, sep="\n")

(128, 13)
(7435, 11)


In [3]:
def _lookup_isomeric_smiles(chem):
    """Return the isomeric SMILES found for the name ``chem``, or ``'error'``."""
    smiles = 'error'
    # A missing name (NaN/None, e.g. an empty spreadsheet cell) cannot be
    # resolved, so it is never sent to the lookup service.
    if pd.isna(chem):
        return smiles
    try:
        # When several compounds match the name, the last one returned wins.
        for compound in get_compounds(chem, 'name'):
            smiles = compound.isomeric_smiles
    except Exception:
        # Best effort: any lookup failure leaves the value collected so far
        # (normally the 'error' sentinel). Only ``Exception`` is caught so
        # that Ctrl-C / kernel interrupt still stops a long-running loop.
        pass
    return smiles


def get_smiles(df):
    """Generate isomeric SMILES for the chemical names stored in ``df``.

    For every row, the names in columns ``name1`` and ``name2`` are looked up
    with ``get_compounds(name, 'name')`` and the results are written to the
    columns ``smiles_1`` and ``smiles_2`` respectively. A name that is
    missing, yields no compound, or whose lookup raises is recorded as the
    string ``'error'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``name1`` and ``name2``. It is modified in
        place; existing ``smiles_1`` / ``smiles_2`` columns are overwritten.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``smiles_1`` and ``smiles_2`` filled in.
    """
    column_pairs = (('name1', 'smiles_1'), ('name2', 'smiles_2'))
    for name_column, smiles_column in column_pairs:
        # Assigning the whole column positionally works for any index,
        # not only the default 0..n-1 RangeIndex.
        df[smiles_column] = [
            _lookup_isomeric_smiles(chem) for chem in df[name_column]
        ]

    return df

In [4]:
def choice_smiles(df):
    """Compare the two looked-up SMILES per row and keep the trusted one.

    Reads the columns ``smiles_1``, ``smiles_2``, ``name1`` and ``name2`` and
    writes to ``smiles_supplementary`` and ``chemical_name``:

    * only ``smiles_2`` is valid -> ``smiles_supplementary`` takes
      ``smiles_2`` and ``chemical_name`` takes ``name2``;
    * only ``smiles_1`` is valid -> ``smiles_supplementary`` takes
      ``smiles_1`` and ``chemical_name`` takes ``name1``;
    * both are valid and identical -> ``smiles_supplementary`` takes
      ``smiles_1``; ``chemical_name`` is left as it is;
    * both are ``'error'``, or both are valid but differ -> the row is left
      untouched.

    "Valid" here means "not equal to the string ``'error'``".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the four input columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object after the updates.
    """
    # Plain boolean arrays select rows by position, so the function does not
    # depend on the frame having the default 0..n-1 index.
    first_failed = (df["smiles_1"] == 'error').to_numpy()
    second_failed = (df["smiles_2"] == 'error').to_numpy()
    identical = (df["smiles_1"] == df["smiles_2"]).to_numpy()

    only_second = first_failed & ~second_failed
    only_first = ~first_failed & second_failed
    both_agree = ~first_failed & ~second_failed & identical

    # Each assignment is skipped when no row qualifies, so a target column
    # is only created or touched if there is something to write.
    if only_second.any():
        # Only name2 produced a SMILES: keep it and adopt name2 as the name.
        df.loc[only_second, "smiles_supplementary"] = (
            df.loc[only_second, "smiles_2"].to_numpy()
        )
        df.loc[only_second, "chemical_name"] = (
            df.loc[only_second, "name2"].to_numpy()
        )
    if only_first.any():
        # Only name1 produced a SMILES: keep it and adopt name1 as the name.
        df.loc[only_first, "smiles_supplementary"] = (
            df.loc[only_first, "smiles_1"].to_numpy()
        )
        df.loc[only_first, "chemical_name"] = (
            df.loc[only_first, "name1"].to_numpy()
        )
    if both_agree.any():
        # name1 and name2 produced the same SMILES: keep one copy of it.
        df.loc[both_agree, "smiles_supplementary"] = (
            df.loc[both_agree, "smiles_1"].to_numpy()
        )

    return df

In [5]:
def update(df, df1):
    """Drop rows with an unresolved SMILES from ``df`` and append to ``df1``.

    Rows of ``df`` whose ``smiles_supplementary`` value contains the text
    ``'error'`` are removed. Rows whose value is missing are kept, since a
    missing cell does not contain the marker. The helper columns ``name1``,
    ``name2``, ``smiles_1`` and ``smiles_2`` are then dropped and the
    remaining rows are appended below ``df1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``smiles_supplementary`` plus the four helper columns.
        It is not modified.
    df1 : pandas.DataFrame
        Frame the cleaned rows are appended to. It is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df1`` followed by the cleaned rows of ``df``, with a fresh
        0..n-1 index.

    Raises
    ------
    KeyError
        If ``smiles_supplementary`` or any helper column is absent from
        ``df``.
    """
    # ``na=False`` keeps the mask strictly boolean even when a cell is
    # missing; ``regex=False`` because 'error' is a literal marker.
    has_error = df["smiles_supplementary"].str.contains(
        'error', regex=False, na=False
    )
    # ``~`` is the boolean negation for masks; ``drop`` returns a new frame,
    # so neither the caller's frame nor a temporary slice is mutated.
    resolved = df.loc[~has_error].drop(
        columns=['name1', 'name2', 'smiles_1', 'smiles_2']
    )
    # ``ignore_index=True`` renumbers the combined rows from 0.
    return pd.concat([df1, resolved], ignore_index=True)

In [6]:
if __name__ == "__main__":
    # Look up SMILES for both name columns, then decide per row which
    # SMILES (and which chemical name) to keep.
    modify_data = choice_smiles(get_smiles(modify_data))

    # Remove the rows still marked as errors and append the remainder to
    # the previously cleaned table.
    data = update(modify_data, del_uncertain)

    # Persist the merged table without writing the index column.
    data.to_excel("4_delete_uncertain_smiles.xlsx", index=False)

In [7]:
# Show the (rows, columns) of the merged table that was written to disk above.
data.shape

(7473, 11)