# 🧬 Step 3 — Descriptor Calculation  
**📅 Date:** 21 August 2025  
**👩‍🔬 Author:** Mohammed Farzana Begum  
**📁 Notebook Path:** notebooks/step3_descriptor_setup.ipynb  

---

## 🎯 Objective  
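Generate molecular descriptors for the curated meroterpenoids using RDKit and Mordred: basic physicochemical properties (e.g. molecular weight, LogP, TPSA) and Morgan/ECFP fingerprints. These descriptors serve as the input features for ML modeling in Step 4.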


In [1]:
from rdkit import Chem
# Smoke test: parse a trivial SMILES (ethanol) to confirm RDKit is importable and working.
ethanol_mol = Chem.MolFromSmiles("CCO")
print(ethanol_mol)


<rdkit.Chem.rdchem.Mol object at 0x00000204CC10EDD0>


In [2]:
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem, rdMolDescriptors
import pandas as pd
import numpy as np


## 📥 Load Curated SMILES  
Load canonical SMILES from the cleaned dataset. These molecules have been standardized and deduplicated in Step 2.


In [6]:
import os

# Show the kernel's working directory; the relative "../data" paths used in
# this notebook are resolved against it.
working_dir = os.getcwd()
print(working_dir)


c:\Users\farza\Biotecnika-ML-Screening-of-Plant-Meroterpenoids-for-Anticancer-Activity-Farzana\notebooks


In [2]:
import pandas as pd

# Read the raw compound table and preview its first rows.
df_raw = pd.read_csv(filepath_or_buffer="../data/raw_meroterpenoids.csv")
df_raw.head()


Unnamed: 0,compound_id,smiles
0,CMPD001,CC1=CC(=O)C=CC1O
1,CMPD002,CCC2C1CCC(C2)C(=O)O
2,CMPD003,CC(C)C1=CC=C(C=C1)C(C)C(=O)O


In [4]:
from rdkit import Chem
from rdkit.Chem import SaltRemover
import pandas as pd

# Re-read the raw table so this cell does not depend on an earlier cell's state.
df_raw = pd.read_csv(filepath_or_buffer="../data/raw_meroterpenoids.csv")

def standardize_smiles(smiles):
    """Return the salt-stripped canonical SMILES for ``smiles``, or ``None``.

    Parameters
    ----------
    smiles : str
        Input SMILES string.

    Returns
    -------
    str or None
        Canonical SMILES produced by ``Chem.MolToSmiles``. ``None`` when the
        input is not a non-blank string, cannot be parsed, has no atoms left
        after salt stripping, or fails sanitization.
    """
    # Missing values (NaN/None coming from pandas) and blank strings are not
    # molecules; reject them before handing anything to RDKit.
    if not isinstance(smiles, str) or not smiles.strip():
        return None
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        mol = SaltRemover.SaltRemover().StripMol(mol)
        # If every fragment was removed as a salt, nothing is left. An empty
        # molecule would serialize to "" and slip past a later dropna().
        if mol.GetNumAtoms() == 0:
            return None
        Chem.SanitizeMol(mol)
        return Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        # Best-effort curation: any RDKit failure marks the row as unusable.
        # Unlike a bare ``except:``, KeyboardInterrupt/SystemExit still propagate.
        return None

# Canonicalize every raw SMILES, keep only the rows that produced a result,
# then persist the curated table and preview it.
df_raw["canonical_smiles"] = df_raw["smiles"].map(standardize_smiles)
has_canonical = df_raw["canonical_smiles"].notna()
df_curated = df_raw[has_canonical]
df_curated.to_csv("../data/curated_smiles.csv", index=False)
df_curated.head()


Unnamed: 0,compound_id,smiles,canonical_smiles
0,CMPD001,CC1=CC(=O)C=CC1O,CC1=CC(=O)C=CC1O
1,CMPD002,C1CCC2C(C1)CCC2C(=O)O,O=C(O)C1CCC2CCCCC21
2,CMPD003,CC(C)C1=CC=C(C=C1)C(C)C(=O)O,CC(C)c1ccc(C(C)C(=O)O)cc1


In [5]:
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem, rdMolDescriptors
import pandas as pd
import numpy as np

# Load the curated SMILES table written by the standardization cell.
df = pd.read_csv(filepath_or_buffer="../data/curated_smiles.csv")

# Define descriptor functions
def compute_basic_descriptors(mol):
    """Compute a fixed set of RDKit descriptors for one molecule.

    Parameters
    ----------
    mol : RDKit Mol
        Molecule to describe.

    Returns
    -------
    dict
        Descriptor name -> value, in the order MolWt, LogP, TPSA, NumHDonors,
        NumHAcceptors, NumRotatableBonds, RingCount.
    """
    # (column name, RDKit function) pairs; the dict preserves this order.
    descriptor_fns = (
        ("MolWt", Descriptors.MolWt),
        ("LogP", Descriptors.MolLogP),
        ("TPSA", rdMolDescriptors.CalcTPSA),
        ("NumHDonors", rdMolDescriptors.CalcNumHBD),
        ("NumHAcceptors", rdMolDescriptors.CalcNumHBA),
        ("NumRotatableBonds", rdMolDescriptors.CalcNumRotatableBonds),
        ("RingCount", rdMolDescriptors.CalcNumRings),
    )
    return {label: fn(mol) for label, fn in descriptor_fns}

def compute_ecfp(mol, n_bits=1024, radius=2):
    """Return the Morgan (ECFP-style) bit fingerprint of ``mol`` as a 0/1 vector.

    Parameters
    ----------
    mol : RDKit Mol
        Molecule to fingerprint.
    n_bits : int, default 1024
        Length of the folded fingerprint.
    radius : int, default 2
        Morgan radius.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``n_bits`` with 1 at every set bit.
    """
    # Local import: the cell's import line only brings in AllChem, and the
    # generator API lives in its own module.
    from rdkit.Chem import rdFingerprintGenerator

    # Use the fingerprint-generator API (the one the later cells in this
    # notebook use) in place of the deprecated
    # AllChem.GetMorganFingerprintAsBitVect. With default options it is
    # intended as a drop-in replacement.
    generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)
    fp = generator.GetFingerprint(mol)

    vec = np.zeros(n_bits, dtype=int)
    # Set all on-bits in one indexed assignment instead of a Python loop.
    on_bits = np.fromiter(fp.GetOnBits(), dtype=np.intp)
    vec[on_bits] = 1
    return vec

# Build descriptor matrix: one row per compound holding the basic descriptors
# followed by the ECFP_<i> fingerprint bits.
rows = []
skipped_ids = []  # compounds whose SMILES is missing or cannot be parsed
for _, row in df.iterrows():
    smiles = row["canonical_smiles"]
    # A blank CSV field is read back as NaN, which Chem.MolFromSmiles cannot
    # accept; treat it like an unparsable SMILES instead of crashing.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        skipped_ids.append(row["compound_id"])
        continue
    desc = compute_basic_descriptors(mol)
    fp = compute_ecfp(mol)
    desc.update({f"ECFP_{i}": bit for i, bit in enumerate(fp)})
    desc["compound_id"] = row["compound_id"]
    rows.append(desc)

# Report dropped compounds so a shorter matrix is not a silent surprise.
if skipped_ids:
    print(f"Skipped {len(skipped_ids)} compound(s) with missing/invalid SMILES: {skipped_ids}")

# Save matrix
desc_df = pd.DataFrame(rows).set_index("compound_id")
desc_df.to_csv("../data/descriptor_matrix.csv")
desc_df.head()




Unnamed: 0_level_0,MolWt,LogP,TPSA,NumHDonors,NumHAcceptors,NumRotatableBonds,RingCount,ECFP_0,ECFP_1,ECFP_2,...,ECFP_1014,ECFP_1015,ECFP_1016,ECFP_1017,ECFP_1018,ECFP_1019,ECFP_1020,ECFP_1021,ECFP_1022,ECFP_1023
compound_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CMPD001,124.139,0.4325,37.3,1,2,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
CMPD002,168.236,2.2874,37.3,1,1,1,2,0,0,0,...,0,0,0,0,0,1,0,0,0,1
CMPD003,192.258,2.9981,37.3,1,1,3,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [1]:
import pandas as pd

# Placeholder inputs: replace both lists with the real curated IDs and SMILES.
# WARNING: this cell overwrites ../data/curated_smiles.csv with whatever is
# listed here.
compound_ids = ['CMPD001', 'CMPD002', 'CMPD003']  # Replace with your actual IDs
canonical_smiles = ['CC(C)C1=CC=CC=C1C(=O)O', 'COC1=CC=CC=C1C(=O)O', 'CCC(=O)OC1=CC=CC=C1']  # Replace with your actual SMILES

# Create the DataFrame. The column names must be exactly 'Compound_ID' and
# 'SMILES': the fingerprint cells that read this file select those names, and
# the previous 'compound_id' spelling made them fail with a KeyError.
curated_df = pd.DataFrame({
    'Compound_ID': compound_ids,
    'SMILES': canonical_smiles
})

# Save to CSV
curated_df.to_csv('../data/curated_smiles.csv', index=False)
print("✅ curated_smiles.csv saved to data/ folder")


✅ curated_smiles.csv saved to data/ folder


In [3]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator

# Read the curated table and parse each SMILES into an RDKit Mol.
df_smiles = pd.read_csv('../data/curated_smiles.csv')
df_smiles['Mol'] = [Chem.MolFromSmiles(smi) for smi in df_smiles['SMILES']]

# Morgan fingerprint generator: radius 2, folded to 1024 bits.
generator = GetMorganGenerator(fpSize=1024, radius=2)

# Generate fingerprints
def get_ecfp(mol):
    """Return the fingerprint of ``mol`` from the module-level ``generator`` as a plain list."""
    return [bit for bit in generator.GetFingerprint(mol)]

# An invalid SMILES parses to None, and fingerprinting None raises. Drop those
# rows first (and say so) so one bad entry cannot abort the whole export.
parsed = df_smiles[df_smiles['Mol'].notna()].reset_index(drop=True)
n_dropped = len(df_smiles) - len(parsed)
if n_dropped:
    print(f"Skipping {n_dropped} row(s) with unparsable SMILES")

# Apply fingerprint function; each row of fp_df holds one compound's bits.
fp_matrix = parsed['Mol'].apply(get_ecfp)
fp_df = pd.DataFrame(fp_matrix.tolist())

# Combine with Compound_ID. Both frames share the same fresh 0..n-1 index,
# so the column-wise concat lines up row for row.
df_descriptors = pd.concat([parsed[['Compound_ID']], fp_df], axis=1)

# Save descriptor matrix
# NOTE(review): this path is also written by the earlier descriptor cell
# (basic descriptors + ECFP); running this cell replaces that file.
df_descriptors.to_csv('../data/descriptor_matrix.csv', index=False)


In [4]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator

# NOTE(review): this cell repeats the preceding fingerprint cell.
# Read the curated table and parse each SMILES into an RDKit Mol.
df_smiles = pd.read_csv('../data/curated_smiles.csv')
df_smiles['Mol'] = [Chem.MolFromSmiles(smi) for smi in df_smiles['SMILES']]

# Morgan fingerprint generator: radius 2, folded to 1024 bits.
generator = GetMorganGenerator(fpSize=1024, radius=2)

# Generate fingerprints
def get_ecfp(mol):
    """Return the fingerprint of ``mol`` from the module-level ``generator`` as a plain list."""
    return [bit for bit in generator.GetFingerprint(mol)]

# An invalid SMILES parses to None, and fingerprinting None raises. Drop those
# rows first (and say so) so one bad entry cannot abort the whole export.
parsed = df_smiles[df_smiles['Mol'].notna()].reset_index(drop=True)
n_dropped = len(df_smiles) - len(parsed)
if n_dropped:
    print(f"Skipping {n_dropped} row(s) with unparsable SMILES")

fp_matrix = parsed['Mol'].apply(get_ecfp)
fp_df = pd.DataFrame(fp_matrix.tolist())

# Combine with Compound_ID. Both frames share the same fresh 0..n-1 index,
# so the column-wise concat lines up row for row.
df_descriptors = pd.concat([parsed[['Compound_ID']], fp_df], axis=1)

# Save descriptor matrix
df_descriptors.to_csv('../data/descriptor_matrix.csv', index=False)


In [5]:
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
from rdkit import Chem, DataStructs
import numpy as np
import pandas as pd

# Re-read the curated table and parse every SMILES into an RDKit Mol.
df_smiles = pd.read_csv('../data/curated_smiles.csv')
df_smiles['Mol'] = [Chem.MolFromSmiles(smi) for smi in df_smiles['SMILES']]

# Shared Morgan generator (radius 2, 1024 bits) read by mol_to_fp.
generator = GetMorganGenerator(fpSize=1024, radius=2)

def mol_to_fp(mol):
    """Return the fingerprint of ``mol`` (from the module-level ``generator``) as an integer numpy array."""
    # One-element placeholder buffer; ConvertToNumpyArray fills it in place.
    # Presumably RDKit resizes it to the fingerprint length; see the RDKit docs.
    bits = np.zeros((1,), dtype=int)
    fingerprint = generator.GetFingerprint(mol)
    DataStructs.ConvertToNumpyArray(fingerprint, bits)
    return bits

# An invalid SMILES parses to None and cannot be fingerprinted. Skip those
# rows (and report them) instead of letting one bad entry abort the export.
parsed_ok = df_smiles['Mol'].notna()
if not parsed_ok.all():
    print("Skipping unparsable SMILES for:", df_smiles.loc[~parsed_ok, 'Compound_ID'].tolist())

fp_array = np.array([mol_to_fp(mol) for mol in df_smiles.loc[parsed_ok, 'Mol']])
df_fp = pd.DataFrame(fp_array)
# Insert by position: the kept IDs are in the same order as the rows of fp_array.
df_fp.insert(0, 'Compound_ID', df_smiles.loc[parsed_ok, 'Compound_ID'].to_numpy())
df_fp.to_csv('../data/descriptor_matrix.csv', index=False)


In [7]:
import os
import shutil

# Back up the descriptor matrix. `shutil` is imported here because no cell in
# this notebook imports it, and the backup folder is created first so the copy
# cannot fail on a fresh checkout. The copy call stays the last expression so
# the cell still displays the destination path.
os.makedirs('../backups', exist_ok=True)
shutil.copy('../data/descriptor_matrix.csv', '../backups/descriptor_matrix_backup.csv')


'../backups/descriptor_matrix_backup.csv'

In [6]:
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
import pandas as pd
import numpy as np

# Load the virtual library table.
df = pd.read_csv(filepath_or_buffer='../data/virtual_library.csv')

# Morgan fingerprint generator (radius 2, 1024 bits) read by smiles_to_fp.
generator = GetMorganGenerator(fpSize=1024, radius=2)

def smiles_to_fp(smiles):
    """Return the Morgan fingerprint of ``smiles`` as a numpy array, or ``None``.

    Parameters
    ----------
    smiles : str
        SMILES string to fingerprint.

    Returns
    -------
    numpy.ndarray or None
        Fingerprint from the module-level ``generator`` converted with
        ``np.array``; ``None`` when ``smiles`` is not a string (e.g. NaN from
        a blank CSV field) or RDKit cannot parse it.
    """
    # Missing values arrive from pandas as float NaN; passing a non-string to
    # Chem.MolFromSmiles raises, which would abort the whole .apply().
    if not isinstance(smiles, str):
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    fp = generator.GetFingerprint(mol)
    return np.array(fp)

# Fingerprint every SMILES and discard the ones that could not be parsed
# (smiles_to_fp returns None for those).
fps = df['SMILES'].apply(smiles_to_fp).dropna()

# One row per surviving compound; fps.index still points at the source rows,
# so it is used to pull the matching identifiers.
fp_matrix = pd.DataFrame(fps.tolist())
kept_rows = df.loc[fps.index]
fp_matrix['Compound_ID'] = kept_rows['Compound_ID'].values
fp_matrix['SMILES'] = kept_rows['SMILES'].values

fp_matrix.to_csv('../data/virtual_descriptors.csv', index=False)


In [7]:
import pandas as pd
from rdkit import Chem

# Re-read the curated table for the validity audit.
df = pd.read_csv(filepath_or_buffer='../data/curated_smiles.csv')

def is_valid(smiles):
    """Return True when ``smiles`` is a non-blank string that RDKit can parse.

    Non-string values (e.g. NaN from a blank CSV field, or None) and blank
    strings are reported as invalid rather than being passed to RDKit, where
    a non-string argument would raise and abort the surrounding ``.apply()``.
    """
    if not isinstance(smiles, str) or not smiles.strip():
        return False
    return Chem.MolFromSmiles(smiles) is not None

# Flag each row, then write the rows RDKit could not parse to their own file.
df['Valid'] = df['SMILES'].map(is_valid)
invalid_mask = df['Valid'].eq(False)
df_invalid = df[invalid_mask]
df_invalid.to_csv('../data/invalid_smiles.csv', index=False)


In [8]:
# Row count of the curated SMILES table.
df_curated = pd.read_csv(filepath_or_buffer='../data/curated_smiles.csv')
n_curated = len(df_curated.index)
print("Total curated SMILES:", n_curated)


Total curated SMILES: 119


In [9]:
# Row count of the fingerprint table.
# NOTE(review): this reads virtual_descriptors.csv (built from
# virtual_library.csv), not descriptor_matrix.csv. Confirm that this is the
# table meant to be compared against the curated set.
df_fp = pd.read_csv(filepath_or_buffer='../data/virtual_descriptors.csv')
n_described = len(df_fp.index)
print("Total descriptors generated:", n_described)


Total descriptors generated: 110


In [10]:
# Curated compounds that have no row in the fingerprint table.
curated_ids = set(df_curated['Compound_ID'])
described_ids = set(df_fp['Compound_ID'])
missing_ids = curated_ids.difference(described_ids)
print("Missing Compound_IDs:", missing_ids)


Missing Compound_IDs: {'M024', 'M054', 'M055', 'M080', 'M036', 'M051', 'M056', 'M046', 'M049'}
