In [19]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors
from rdkit.Chem.rdmolops import RDKFingerprint
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.ML.Descriptors import MoleculeDescriptors as md

In [20]:
def molecule_from_smiles(smiles):
    """Parse a SMILES string and reduce it to its largest fragment.

    Parameters
    ----------
    smiles : str
        SMILES representation of a compound, possibly including counter-ions.

    Returns
    -------
    tuple
        ``(molecule, status)`` where ``molecule`` is the sanitized RDKit
        molecule restricted to its largest fragment, or ``None`` when no
        usable molecule was produced. ``status`` is one of:

        * ``"succeed"`` - parsing, fragment selection and sanitization worked;
        * ``"failed"`` - the SMILES was blank or could not be parsed;
        * ``"error: <message>"`` - an exception was raised along the way.
    """
    # A blank SMILES carries no structure. RDKit is expected to parse an empty
    # string into an atom-less molecule rather than None, which would then be
    # reported as "succeed"; reject it up front so it is logged as a failure.
    if isinstance(smiles, str) and not smiles.strip():
        return None, "failed"

    try:
        # Extract molecule
        molecule = Chem.MolFromSmiles(smiles, sanitize=True)
        if molecule is None:
            return None, "failed"
        # Remove salts by keeping only the largest fragment
        clean_molecule = rdMolStandardize.LargestFragmentChooser()
        molecule = clean_molecule.choose(molecule)
        # Sanitize molecule again to reflect changes
        Chem.SanitizeMol(molecule)
        return molecule, "succeed"

    except Exception as e:
        # Deliberate best-effort: one bad entry must not abort a whole batch,
        # so the message is returned as the status instead of propagating.
        return None, f"error: {e}"
    
def calculate_descriptors(molecule):
    """Evaluate every descriptor listed in ``Descriptors._descList`` on a molecule.

    Parameters
    ----------
    molecule
        Molecule object passed straight to the descriptor calculator.

    Returns
    -------
    dict
        Descriptor name mapped to its calculated value, in registry order.
    """
    # The registry yields (name, function) pairs; only the names are needed here
    names = [name for name, _ in Descriptors._descList]
    # One calculator evaluates all named descriptors in a single call
    values = md.MolecularDescriptorCalculator(names).CalcDescriptors(molecule)
    return {name: value for name, value in zip(names, values)}

def calculate_fingerprints(molecule):
    """Compute two 2048-bit fingerprints and return their on-bit indices.

    Parameters
    ----------
    molecule
        Molecule object passed to both fingerprint functions.

    Returns
    -------
    dict
        ``"Morgan2048"``: on-bit indices of the Morgan fingerprint (radius 2);
        ``"RDkit2048"``: on-bit indices of the RDKit fingerprint.
    """
    def _on_bits(bit_vector):
        # Store only the indices of set bits instead of the full bit vector
        return list(bit_vector.GetOnBits())

    return {
        # Morgan fingerprint: each atom plus its surroundings up to radius 2
        "Morgan2048": _on_bits(
            rdMolDescriptors.GetMorganFingerprintAsBitVect(molecule, radius=2, nBits=2048)
        ),
        # RDKit fingerprint
        "RDkit2048": _on_bits(RDKFingerprint(molecule, fpSize=2048)),
    }

In [None]:
# Read the input workbook; the first spreadsheet row is skipped before the header
dataset = pd.read_excel("in_chemico_dataset.xlsx", engine="openpyxl", skiprows=1)

# Per-row accumulators, kept in the same order as the dataset rows
descriptor_rows = []
fingerprint_rows = []
state_molecules = []
molecules = []

# Build a molecule for every SMILES entry, then derive its features
smiles_column = dataset["SMILES code"].astype(str)
for smiles in smiles_column:
    molecule, state = molecule_from_smiles(smiles)
    state_molecules.append(state)
    molecules.append(molecule)

    if molecule is not None:
        descriptor_rows.append(calculate_descriptors(molecule))
        fingerprint_rows.append(calculate_fingerprints(molecule))
    else:
        # Empty placeholders keep the feature rows aligned with the dataset rows
        descriptor_rows.append({})
        fingerprint_rows.append({})

# Turn the collected records into frames (one row per input row)
descriptor_data = pd.DataFrame(descriptor_rows)
fingerprint_data = pd.DataFrame(fingerprint_rows)

# Place original columns, descriptors and fingerprints side by side
frames = [dataset.reset_index(drop=True), descriptor_data, fingerprint_data]
output = pd.concat(frames, axis=1)
# Record the parsing outcome of every row as the last column
output["MoleculeStatus"] = state_molecules

# Write the combined table to a new workbook
with pd.ExcelWriter("in_chemico_dataset_processed.xlsx", engine="openpyxl") as writer:
    output.to_excel(writer, index=False, sheet_name="Descriptors-Fingerprints")

# Report the final size and show the first rows as plain text
print(f"Rows: {len(output)}/Columns: {output.shape[1]}")
preview = output.head()
print(preview.to_string(index=False))

[13:47:33] Running LargestFragmentChooser
[13:47:33] Fragment: C=C[C@H]1CN2CC[C@H]1C[C@H]2[C@H](O)c1ccnc2ccc(OC)cc12
[13:47:33] New largest fragment: C=C[C@H]1CN2CC[C@H]1C[C@H]2[C@H](O)c1ccnc2ccc(OC)cc12 (48)
[13:47:33] Fragment: Cl
[13:47:33] Running LargestFragmentChooser
[13:47:33] Running LargestFragmentChooser
[13:47:33] Running LargestFragmentChooser
[13:47:33] Running LargestFragmentChooser
[13:47:33] Running LargestFragmentChooser
[13:47:33] Running LargestFragmentChooser
[13:47:33] Running LargestFragmentChooser
[13:47:33] Running LargestFragmentChooser
[13:47:33] Running LargestFragmentChooser
[13:47:33] Fragment: CN(C)CCCN1c2ccccc2Sc2ccc(Cl)cc21
[13:47:33] New largest fragment: CN(C)CCCN1c2ccccc2Sc2ccc(Cl)cc21 (40)
[13:47:33] Fragment: Cl
[13:47:33] Running LargestFragmentChooser
[13:47:33] Running LargestFragmentChooser
[13:47:33] Fragment: CC1c2cccc(O)c2C(O)=C2C(=O)C3(O)C(O)=C(C(N)=O)C(=O)C(N(C)C)C3C(O)C21
[13:47:33] New largest fragment: CC1c2cccc(O)c2C(O)=C2C(=O)C3(O)C(O

Rows: 162/Columns: 232
                          Name                                                                                              IUPAC name CAS registry number    Structure  Phototoxicity                                                      SMILES code                            Sources               Note    Unnamed: 8 Unnamed: 9  Unnamed: 10 Unnamed: 11  MaxAbsEStateIndex  MaxEStateIndex  MinAbsEStateIndex  MinEStateIndex      qed       SPS   MolWt  HeavyAtomMolWt  ExactMolWt  NumValenceElectrons  NumRadicalElectrons  MaxPartialCharge  MinPartialCharge  MaxAbsPartialCharge  MinAbsPartialCharge  FpDensityMorgan1  FpDensityMorgan2  FpDensityMorgan3  BCUT2D_MWHI  BCUT2D_MWLOW  BCUT2D_CHGHI  BCUT2D_CHGLO  BCUT2D_LOGPHI  BCUT2D_LOGPLOW  BCUT2D_MRHI  BCUT2D_MRLOW   AvgIpc  BalabanJ    BertzCT      Chi0     Chi0n     Chi0v      Chi1    Chi1n    Chi1v    Chi2n    Chi2v    Chi3n    Chi3v    Chi4n    Chi4v  HallKierAlpha           Ipc    Kappa1   Kappa2   Kappa3  LabuteASA  PE