In [2]:
import os
import rdkit
import mordred
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import SDMolSupplier
from mordred import Calculator, descriptors

In [3]:
def sanitize_add_hydrogens_and_force_field(mol_file, output_folder, random_seed=-1):
    """Sanitize a MOL file, add hydrogens, build 3D coordinates and MMFF-optimize it.

    The result is written to ``output_folder`` as ``<input stem>_sanitized.mol``.
    Any failure is reported on stdout and the function returns without raising,
    so a batch run over many files is not interrupted by one bad molecule.

    Parameters
    ----------
    mol_file : str
        Path to the input ``.mol`` file.
    output_folder : str
        Existing folder that receives the processed file.
    random_seed : int, optional
        Seed forwarded to ``AllChem.EmbedMolecule``. The default ``-1`` keeps
        RDKit's own default (non-deterministic) seeding; pass a fixed
        non-negative value to make the generated conformer reproducible.

    Returns
    -------
    None
    """
    try:
        mol = Chem.MolFromMolFile(mol_file, sanitize=False)
        if mol is None:
            # Without this check the failure shows up later as an opaque
            # argument-type error inside SanitizeMol.
            raise ValueError("RDKit could not parse the file")
        Chem.SanitizeMol(mol)
        mol = Chem.AddHs(mol)

        # EmbedMolecule signals failure through its return value (-1), not an exception.
        if AllChem.EmbedMolecule(mol, randomSeed=random_seed) == -1:
            raise ValueError("3D embedding failed, no conformer was generated")

        # MMFFOptimizeMolecule: 0 = converged, 1 = more iterations needed,
        # -1 = the force field could not be set up for this molecule.
        mmff_status = AllChem.MMFFOptimizeMolecule(mol)
        if mmff_status == -1:
            print(f"Warning: MMFF could not be set up for {mol_file}; saving the unoptimized embedded geometry")
        elif mmff_status == 1:
            print(f"Warning: MMFF optimization did not converge for {mol_file}")

        output_file = os.path.splitext(os.path.basename(mol_file))[0] + '_sanitized.mol'
        output_path = os.path.join(output_folder, output_file)
        Chem.MolToMolFile(mol, output_path)
        print(f"Sanitized molecule with force field saved to {output_path}")
    except Exception as e:
        # Deliberate best-effort: report and continue with the next file.
        print(f"Error processing file {mol_file}: {e}")

def sanitize_folder(input_folder, output_folder):
    """Run ``sanitize_add_hydrogens_and_force_field`` on every ``.mol`` file in a folder.

    Parameters
    ----------
    input_folder : str
        Folder that is scanned (non-recursively) for files ending in ``.mol``.
    output_folder : str
        Folder that receives the processed files; created if it is missing.

    Returns
    -------
    None
        A message is printed and nothing is created when ``input_folder``
        is not an existing directory.
    """
    # isdir (not exists): a path to a regular file must be rejected here,
    # otherwise os.listdir below raises NotADirectoryError.
    if not os.path.isdir(input_folder):
        print(f"Folder {input_folder} does not exist.")
        return

    # exist_ok avoids the check-then-create race and makes re-runs harmless.
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so the processing order (and the printed log) is reproducible.
    for file in sorted(os.listdir(input_folder)):
        file_path = os.path.join(input_folder, file)
        # Skip sub-directories that merely happen to be named "*.mol".
        if file.endswith(".mol") and os.path.isfile(file_path):
            sanitize_add_hydrogens_and_force_field(file_path, output_folder)

if __name__ == "__main__":
    # Typed or pasted paths often carry stray whitespace or start with "~";
    # normalize them so a valid folder is not reported as missing.
    input_folder = os.path.expanduser(
        input("Enter the path to the folder containing .mol files: ").strip()
    )
    output_folder = os.path.expanduser(
        input("Enter the path to the folder where sanitized files should be saved: ").strip()
    )
    sanitize_folder(input_folder, output_folder)


Enter the path to the folder containing .mol files:  /home/akm/Felix_ML/Moldescr_poly/siRNA_ML/New_revision_polymers
Enter the path to the folder where sanitized files should be saved:  /home/akm/Felix_ML/Moldescr_poly/siRNA_ML/New_revision_polymers/cleaned


Sanitized molecule with force field saved to /home/akm/Felix_ML/Moldescr_poly/siRNA_ML/New_revision_polymers/cleaned/APmitacrylat_sanitized.mol
Sanitized molecule with force field saved to /home/akm/Felix_ML/Moldescr_poly/siRNA_ML/New_revision_polymers/cleaned/OA_mitDA_sanitized.mol
Sanitized molecule with force field saved to /home/akm/Felix_ML/Moldescr_poly/siRNA_ML/New_revision_polymers/cleaned/TBS mit buacrylat_sanitized.mol
Sanitized molecule with force field saved to /home/akm/Felix_ML/Moldescr_poly/siRNA_ML/New_revision_polymers/cleaned/TBS mit acrylat_sanitized.mol
Sanitized molecule with force field saved to /home/akm/Felix_ML/Moldescr_poly/siRNA_ML/New_revision_polymers/cleaned/OA_mitbuacrylate_sanitized.mol


In [4]:
from rdkit.Chem import AllChem, Descriptors3D
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.Descriptors3D import (
        Asphericity,
        Eccentricity,
        InertialShapeFactor,
        NPR1,
        NPR2,
        PMI1,
        PMI2,
        PMI3,
        RadiusOfGyration,
        SpherocityIndex,
        PBF,
    )

# (column name, callable) pairs for the 3D shape descriptors, in output-column order.
# Built from an ordered mapping so each label sits directly beside its function.
DdescList_3D = list(
    {
        "Asphericity": Asphericity,
        "Eccentricity": Eccentricity,
        "InertialShapeFactor": InertialShapeFactor,
        "NPR1": NPR1,
        "NPR2": NPR2,
        "PMI1": PMI1,
        "PMI2": PMI2,
        "PMI3": PMI3,
        "RadiusOfGyration": RadiusOfGyration,
        "SpherocityIndex": SpherocityIndex,
        "PBF": PBF,
    }.items()
)
def calculate_descriptors(mol_file):
    """Compute every descriptor in ``Descriptors.descList`` and ``DdescList_3D`` for one MOL file.

    Parameters
    ----------
    mol_file : str
        Path to the ``.mol`` file to read.

    Returns
    -------
    dict or None
        Mapping of descriptor name to value, 2D descriptors first and the 3D
        ones after them. A descriptor whose function raises is stored as
        ``None`` and listed in a single summary line on stdout. ``None`` is
        returned (after a message) when RDKit cannot read the file.

    Notes
    -----
    NOTE(review): the file is read with ``MolFromMolFile`` defaults. Confirm
    whether the 3D descriptors are meant to be computed with or without
    explicit hydrogens; this function does not change the reader settings.
    """
    mol = Chem.MolFromMolFile(mol_file)
    if mol is None:
        print(f"Could not read a molecule from {mol_file}; skipping.")
        return None

    values = {}
    failures = []
    # One pass over both descriptor tables; insertion order is 2D, then 3D.
    for desc_name, desc_func in list(Descriptors.descList) + list(DdescList_3D):
        try:
            values[desc_name] = desc_func(mol)
        except Exception as e:
            # Best-effort: keep the column, but remember why it is empty.
            values[desc_name] = None
            failures.append(f"{desc_name} ({e})")

    if failures:
        print(
            f"{mol_file}: {len(failures)} descriptor(s) could not be computed "
            f"and were set to None: {', '.join(failures)}"
        )

    return values

def main(folder_path):
    """Compute descriptors for every ``.mol`` file in a folder and save them to Excel.

    One row per readable molecule is written to ``descriptors.xlsx`` inside
    ``folder_path``, with a leading ``Filename`` column holding the file name
    without its ``_sanitized`` suffix.

    Parameters
    ----------
    folder_path : str
        Folder that is scanned (non-recursively) for files ending in ``.mol``.

    Returns
    -------
    None
        Nothing is written when the folder is missing or when no file yields
        descriptors; a message is printed instead.
    """
    if not os.path.isdir(folder_path):
        print("Folder does not exist.")
        return

    suffix = '_sanitized'
    records = []
    # Sorted so the row order of the spreadsheet is reproducible across runs.
    for mol_file in sorted(f for f in os.listdir(folder_path) if f.endswith('.mol')):
        values = calculate_descriptors(os.path.join(folder_path, mol_file))
        if not values:
            continue
        # Strip only a trailing "_sanitized" before the extension, so a name
        # that contains the same text elsewhere is left intact.
        stem, ext = os.path.splitext(mol_file)
        if stem.endswith(suffix):
            stem = stem[:-len(suffix)]
        values['Filename'] = stem + ext
        records.append(values)

    if not records:
        # An empty frame has no 'Filename' column, so selecting it would raise KeyError.
        print(f"No descriptors could be computed for any .mol file in {folder_path}; nothing saved.")
        return

    df = pd.DataFrame(records)

    # Move the identifier column to the front.
    ordered_cols = ['Filename'] + [col for col in df.columns if col != 'Filename']
    df = df[ordered_cols]

    output_excel = os.path.join(folder_path, 'descriptors.xlsx')
    df.to_excel(output_excel, index=False)
    print(f"Descriptors saved to {output_excel}")

if __name__ == "__main__":
    # Typed or pasted paths often carry stray whitespace or start with "~";
    # normalize them so a valid folder is not reported as missing.
    folder_path = os.path.expanduser(
        input("Enter the path to the folder containing .mol files: ").strip()
    )
    main(folder_path)


Enter the path to the folder containing .mol files:  /home/akm/Felix_ML/Moldescr_poly/siRNA_ML/New_revision_polymers/cleaned


Descriptors saved to /home/akm/Felix_ML/Moldescr_poly/siRNA_ML/New_revision_polymers/cleaned/descriptors.xlsx
