In [1]:
import numpy as np
import pandas as pd
from rdkit import Chem, RDLogger
from rdkit.Chem import AllChem, Descriptors
from mordred import Calculator, descriptors

In [2]:
# Load the cleaned dataset that the main cell passes to convert_3d.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
alldata = pd.read_excel(r"/opt/jupyter_data/data_clean/cleaning/6_alldata.xlsx")

In [3]:
def convert_3d(df, sdf_fname="LogS_3d.sdf", random_seed=9, max_iters=400):
    """Convert the SMILES in ``df`` to 3D structures and write them to an SDF file.

    For every row the SMILES is parsed, the row's metadata columns are attached
    as SDF properties, hydrogens are added, a 3D conformer is embedded and then
    optimised (MMFF94s first, UFF when the MMFF force field cannot be set up).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "SMILES", "CAS_number", "VDss(L/kg)", "SD",
        "fu_h", "reference_number", "t_1/2", "InChI", "MW", "logP" and
        "logVDss". Its index is reset **in place** to 0..n-1.
    sdf_fname : str
        Path of the SDF file to write.
    random_seed : int
        Seed passed to the conformer embedding.
    max_iters : int
        Iteration budget of the first optimisation pass; a non-converged MMFF
        run is retried once with twice this budget.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the rows that could not be parsed, embedded or
        optimised. The index keeps the labels of the surviving rows, so it has
        gaps wherever a molecule failed.
    """
    # reset the index so that row_idx is a plain 0..n-1 position
    df.reset_index(drop=True, inplace=True)

    # (SDF property tag, DataFrame column) pairs copied verbatim into the SDF
    prop_columns = [
        ("CAS_number", "CAS_number"),
        ("VDss(L/kg)", "VDss(L/kg)"),
        ("SD", "SD"),
        ("fu(h)", "fu_h"),
        ("reference_number", "reference_number"),
        ("t 1/2", "t_1/2"),
        ("SMILES", "SMILES"),
        ("InChI", "InChI"),
        ("MW", "MW"),
        ("logP", "logP"),
        ("logVDss", "logVDss"),
    ]

    failed_idx = []
    writer = Chem.SDWriter(sdf_fname)
    # try/finally guarantees the SDF is flushed and closed even if a row raises
    try:
        for row_idx, row in df.iterrows():
            smiles = row["SMILES"]
            # a missing (NaN) SMILES is treated like an unparsable one
            mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
            if mol is None:
                print(f"cannot read molecule {row_idx}")
                failed_idx.append(row_idx)
                continue

            # use the CAS number as the molecule name, falling back to the SMILES
            if pd.notna(row["CAS_number"]):
                mol_name = str(row["CAS_number"])
            else:
                mol_name = str(smiles)
            # "_Name" is the SDF title line; "name" is stored as a regular property
            mol.SetProp("_Name", mol_name)
            mol.SetProp("name", mol_name)
            for tag, column in prop_columns:
                mol.SetProp(tag, str(row[column]))

            try:
                mol = Chem.AddHs(mol)
                # EmbedMolecule returns -1 when no conformer could be generated
                if AllChem.EmbedMolecule(mol, randomSeed=random_seed) < 0:
                    print(f"cannot embed 3D conformer for {row_idx}")
                    failed_idx.append(row_idx)
                    continue
                mini_tag = AllChem.MMFFOptimizeMolecule(mol, "MMFF94s", maxIters=max_iters)
                # 0 optimize converged
                # -1 can not set up force field
                # 1 more iterations required
                if mini_tag == 1:
                    AllChem.MMFFOptimizeMolecule(mol, "MMFF94s", maxIters=max_iters * 2)
                elif mini_tag == -1:
                    AllChem.UFFOptimizeMolecule(mol, maxIters=max_iters)
                writer.write(mol)
            except Exception:
                # best effort: report the molecule and keep going
                print(f"cannot set up force field for {row_idx}")
                failed_idx.append(row_idx)
    finally:
        writer.close()

    # drop failed molecules
    df_valid = df.drop(failed_idx)

    return df_valid

In [4]:
def drop_error_data(df, sdf_fname="LogS_3d.sdf", smiles_col="smiles"):
    """Keep only the rows of ``df`` whose SMILES occurs as a line of an SDF file.

    A row is kept when the value in ``smiles_col`` equals some line of
    ``sdf_fname`` after surrounding whitespace is removed from that line.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to filter; it is not modified.
    sdf_fname : str
        SDF file to search. Defaults to "LogS_3d.sdf".
    smiles_col : str
        Name of the column in ``df`` that holds the SMILES. Defaults to "smiles".

    Returns
    -------
    pandas.DataFrame
        The matching rows in their original order, with the same columns and
        dtypes as ``df`` and a fresh 0..k-1 index.
    """
    # A set gives O(1) membership tests; strip() removes the newline and padding
    # only, so SMILES that start or end with 'n' or a backslash stay intact.
    with open(sdf_fname) as sdf_file:
        sdf_lines = {line.strip() for line in sdf_file}

    # Positional boolean mask: independent of the index labels of df, so frames
    # with gaps in their index are filtered correctly.
    found = df[smiles_col].map(lambda smi: smi in sdf_lines).astype(bool)

    return df[found.to_numpy()].reset_index(drop=True)

In [5]:
def calculator_descriptors(mol):
    """Compute Mordred descriptors, 3D ones included, for the molecules of an SDF.

    ``mol`` is handed to ``Chem.SDMolSupplier``; the notebook passes the path of
    the SDF file to read. The result of ``Calculator.pandas`` is returned as is.
    """
    sdf_supplier = Chem.SDMolSupplier(mol)
    # ignore_3D=False keeps the descriptors that need 3D coordinates
    descriptor_calc = Calculator(descriptors, ignore_3D=False)
    return descriptor_calc.pandas(sdf_supplier)

In [6]:
def transform(df):
    """Convert every boolean column of ``df`` to integers (False -> 0, True -> 1).

    The encoding is fixed, so it no longer depends on which value happens to
    appear first in a column. Non-boolean columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to convert; it is modified **in place**.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    bool_columns = df.select_dtypes(include="bool").columns
    for col in bool_columns:
        df[col] = df[col].astype(int)

    return df

In [7]:
def delete_error(df, df1, max_strings=30):
    """Remove features and rows of ``df`` that contain string cells.

    A column holding more than ``max_strings`` string cells is dropped
    entirely. For a column holding between 1 and ``max_strings`` string cells,
    the rows containing those strings are dropped instead, from ``df`` and,
    by the same index labels, from ``df1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix; modified **in place**.
    df1 : pandas.DataFrame
        Companion table expected to share the index labels of ``df``;
        modified **in place**.
    max_strings : int
        Critical value separating "drop the column" from "drop the rows".

    Returns
    -------
    tuple
        ``(df, df1, error_low)``: the two frames (same objects, re-indexed to
        0..k-1) and the list of index labels of the rows that were removed.
    """
    def _is_str(value):
        return isinstance(value, str)

    # True wherever a cell holds a string; computed once for the whole frame
    str_mask = pd.DataFrame(
        {col: df[col].map(_is_str).astype(bool) for col in df.columns},
        index=df.index,
    )
    # total number of string cells in each column
    str_counts = {col: int(str_mask[col].sum()) for col in df.columns}

    # columns whose number of strings is greater than the critical value
    error_columns = [col for col in df.columns if str_counts[col] > max_strings]
    # columns with only a few strings: the offending rows are removed instead
    error_data = [col for col in df.columns if 1 <= str_counts[col] <= max_strings]

    # rows holding a string in at least one of the "few strings" columns
    bad_rows = str_mask[error_data].any(axis=1).to_numpy(dtype=bool)
    error_low = df.index[bad_rows].tolist()

    df.drop(columns=error_columns, inplace=True)
    df.drop(index=error_low, inplace=True)
    df1.drop(index=error_low, inplace=True)

    df.reset_index(drop=True, inplace=True)
    df1.reset_index(drop=True, inplace=True)

    return df, df1, error_low

In [8]:
if __name__ == "__main__":
    # silence RDKit messages below CRITICAL
    lg = RDLogger.logger()
    lg.setLevel(RDLogger.CRITICAL)
    # single definition of the SDF path shared by the writer and the reader
    sdf_path = "../../../VDss_dataset/VDss_3d.sdf"
    # generate 3D files
    df_unique = convert_3d(df=alldata, sdf_fname=sdf_path)
    # delete data that failed to generate 3D successfully
    # NOTE(review): as called here, drop_error_data matches against "LogS_3d.sdf",
    # not the SDF written above — confirm that this is intended.
    data = drop_error_data(df_unique)
    # compute descriptors; the result gets its own name so the
    # calculator_descriptors function is not overwritten and the cell can be re-run
    descriptor_table = calculator_descriptors(sdf_path)
    # bool type descriptor converted to numeric type
    features = transform(descriptor_table)
    # save feature and data files
    features.to_excel("descriptors.xlsx", index=False)
    data.to_excel("data.xlsx", index=False)

100%|██████████| 2420/2420 [03:38<00:00, 11.07it/s]


In [9]:
# read back the descriptor matrix and the data table that the previous cell saved to disk
features = pd.read_excel(r"descriptors.xlsx")
data = pd.read_excel(r"data.xlsx")

In [10]:
# output feature information (number of rows/columns and dtype breakdown)
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2420 entries, 0 to 2419
Columns: 1826 entries, ABC to mZagreb2
dtypes: float64(948), int64(341), object(537)
memory usage: 33.7+ MB


In [11]:
# remove wrong rows and columns in the feature matrix; the same rows are removed from `data`,
# and error_lows receives the indices of the removed rows
features, data, error_lows = delete_error(features, data)

In [12]:
# turn the list of removed row indices into a table (rebinds error_lows from list to DataFrame)
error_lows = pd.DataFrame(error_lows)

In [13]:
# count the number of fully duplicated rows in the feature matrix
features.duplicated().sum()

0

In [14]:
# report the dimensions of the feature matrix after delete_error
print("feature shape: ", features.shape)

feature shape:  (2370, 1443)


In [15]:
# save the cleaned feature matrix and the data table with the same rows removed
features.to_excel("descriptors_1.xlsx", index=False)
data.to_excel("final_data.xlsx", index=False)