In [1]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Fragments
from collections import Counter

# Load the dataset from the Excel workbook
datasets = pd.read_excel('ML-improve.xlsx')

# Extract the SMILES column. Bracket indexing is used instead of attribute
# access so the lookup can never be shadowed by a DataFrame attribute or
# method, and a missing column fails with an explicit KeyError.
smis = datasets["SMILES"]

# SMARTS queries for the functional groups of interest, keyed by display name.
# Entries are kept in their original order because dict order is preserved
# when the mapping is iterated.
functional_groups = {
    # Oxygen with exactly one attached hydrogen. This also matches the O-H
    # inside carboxylic and sulfonic acids, hence "Total".
    "Total Hydroxyl (-OH)": "[OH]",
    # Carbonyl carbon bonded directly to a hydroxyl oxygen.
    "Carboxyl (-COOH)": "C(=O)[OH]",
    # Six-membered ring made entirely of aromatic carbons.
    "Benzene Ring": "c1ccccc1",
    # Oxygen single-bonded to a carbon on each side.
    # NOTE: the alkoxy oxygen of an ester satisfies this pattern as well.
    "Ether (-O-)": "[#6]-O-[#6]",
    # Halogen atoms.
    "Fluorine (-F)": "[F]",
    "Chlorine (-Cl)": "[Cl]",
    # Carbon carrying three hydrogens.
    "Methyl (-CH3)": "[CH3]",
    # Sulfonyl sulfur bonded to a hydroxyl oxygen.
    "Sulfonic Acid (-SO3H)": "S(=O)(=O)[OH]",
    # Carbonyl carbon bonded to an oxygen that continues to a carbon, so the
    # free acid (oxygen bonded to hydrogen) is excluded.
    "Ester (-COO-)": "C(=O)O[#6]",
    # Nitrogen carrying two hydrogens.
    "Amine (-NH2)": "[NH2]",
    # Carbonyl carbon bonded to a nitrogen that has a further carbon
    # substituent; primary amides (-CONH2) are therefore not matched.
    "Amide (-CONH-)": "C(=O)N[#6]",
    # Sulfonyl sulfur bonded to a nitrogen that has a carbon substituent.
    "Sulfonamide (-SO2NH-)": "S(=O)(=O)N[#6]",
}

# Compiled SMARTS queries, keyed by SMARTS string. Parsing a SMARTS does not
# depend on the molecule being searched, so each pattern is compiled once and
# reused for every molecule instead of being re-parsed on every call.
_compiled_smarts = {}


def _get_pattern(smarts):
    """Return the compiled query for *smarts*, compiling it on first use.

    Raises:
        ValueError: If ``Chem.MolFromSmarts`` cannot parse *smarts* (it
            returns ``None`` in that case).
    """
    pattern = _compiled_smarts.get(smarts)
    if pattern is None:
        pattern = Chem.MolFromSmarts(smarts)
        if pattern is None:
            raise ValueError(f"Invalid SMARTS pattern: {smarts!r}")
        _compiled_smarts[smarts] = pattern
    return pattern


# Function to detect functional groups in a molecule
def detect_fragments(mol):
    """Report which entries of ``functional_groups`` are present in *mol*.

    Args:
        mol: An RDKit molecule to search.

    Returns:
        A ``Counter`` mapping each matching group name to 1. This is a
        presence flag per molecule: a group that occurs several times in
        *mol* is still counted once. Groups that do not match are absent.
    """
    return Counter(
        group
        for group, smarts in functional_groups.items()
        if mol.HasSubstructMatch(_get_pattern(smarts))
    )

# Count functional groups across the entire dataset.
# NOTE: detect_fragments() reports each group at most once per molecule, so
# every total below is the number of molecules containing that group, not the
# number of individual occurrences.
total_fragments = Counter()
for smi in smis:
    # Empty spreadsheet cells are loaded as NaN (a float), which the SMILES
    # parser rejects with an exception; skip anything that is not text.
    if not isinstance(smi, str):
        continue
    mol = Chem.MolFromSmiles(smi)
    # MolFromSmiles returns None for a SMILES it cannot parse; skip those.
    if mol is None:
        continue
    total_fragments.update(detect_fragments(mol))

# Get the 12 most common functional groups
top_12_fragments = total_fragments.most_common(12)

# Output the rank, name, molecule count and SMARTS pattern of each top group.
# NOTE(review): the printed label reads "SMILES", but the value is the SMARTS
# query from functional_groups. The label is left unchanged so that output
# stays comparable with earlier runs.
for idx, (frag_name, count) in enumerate(top_12_fragments, 1):
    smarts = functional_groups.get(frag_name, "N/A")
    print(f"{idx}. {frag_name}: {count} times, SMILES: {smarts}")




1. Benzene Ring: 1299 times, SMILES: c1ccccc1
2. Methyl (-CH3): 1276 times, SMILES: [CH3]
3. Total Hydroxyl (-OH): 1201 times, SMILES: [OH]
4. Carboxyl (-COOH): 594 times, SMILES: C(=O)[OH]
5. Ether (-O-): 545 times, SMILES: [#6]-O-[#6]
6. Amine (-NH2): 435 times, SMILES: [NH2]
7. Chlorine (-Cl): 354 times, SMILES: [Cl]
8. Amide (-CONH-): 308 times, SMILES: C(=O)N[#6]
9. Fluorine (-F): 243 times, SMILES: [F]
10. Ester (-COO-): 205 times, SMILES: C(=O)O[#6]
11. Sulfonamide (-SO2NH-): 158 times, SMILES: S(=O)(=O)N[#6]
12. Sulfonic Acid (-SO3H): 92 times, SMILES: S(=O)(=O)[OH]
