In [12]:
#load, sanitize add H atoms and forcefield, saves the processed files in a specified output folder
def sanitize_add_hydrogens_and_force_field(mol_file, output_folder):
    try:
        # Load mol file
        mol = Chem.MolFromMolFile(mol_file, sanitize=False)
         # Sanitize molecule
        Chem.SanitizeMol(mol)
        # Add hydrogen atoms
        mol = Chem.AddHs(mol)
        
        # Add force field
        AllChem.EmbedMolecule(mol, randomSeed = 42, useBasicKnowledge=True)
        AllChem.MMFFOptimizeMolecule(mol)
        
        # Write the sanitized molecule with hydrogens and force field added
        output_file = os.path.splitext(os.path.basename(mol_file))[0] + '_sanitized.mol'
        output_path = os.path.join(output_folder, output_file)
        Chem.MolToMolFile(mol, output_path)
    except Exception as e:
        print(f"Error processing file {mol_file}: {e}")

def sanitize_folder(input_folder, output_folder):
    if not os.path.exists(input_folder):
        print(f"Folder {input_folder} does not exist.")
        return
    
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    for file in os.listdir(input_folder):
        if file.endswith(".mol"):
            file_path = os.path.join(input_folder, file)
            sanitize_add_hydrogens_and_force_field(file_path, output_folder)

if __name__ == "__main__":
    input_folder = '/Users/nora/Desktop/PhD_LMU/Polymer Descriptors'
    output_folder =  os.path.join(input_folder, 'sanitized')
    sanitize_folder(input_folder, output_folder)


Sanitized molecule with force field saved to /Users/nora/Desktop/PhD_LMU/Polymer Descriptors/sanitized/Rui22E63_sanitized.mol
Sanitized molecule with force field saved to /Users/nora/Desktop/PhD_LMU/Polymer Descriptors/sanitized/Tzeng13B5S3_sanitized.mol
Sanitized molecule with force field saved to /Users/nora/Desktop/PhD_LMU/Polymer Descriptors/sanitized/Tzeng13B3S5_sanitized.mol
Sanitized molecule with force field saved to /Users/nora/Desktop/PhD_LMU/Polymer Descriptors/sanitized/Yan17si_C10-SH_sanitized.mol
Sanitized molecule with force field saved to /Users/nora/Desktop/PhD_LMU/Polymer Descriptors/sanitized/Tzeng13B3S4_sanitized.mol
Sanitized molecule with force field saved to /Users/nora/Desktop/PhD_LMU/Polymer Descriptors/sanitized/Rui22E65_sanitized.mol
Sanitized molecule with force field saved to /Users/nora/Desktop/PhD_LMU/Polymer Descriptors/sanitized/Rui22E58_sanitized.mol
Sanitized molecule with force field saved to /Users/nora/Desktop/PhD_LMU/Polymer Descriptors/sanitized/



Sanitized molecule with force field saved to /Users/nora/Desktop/PhD_LMU/Polymer Descriptors/sanitized/Yan17si_A19_sanitized.mol
Sanitized molecule with force field saved to /Users/nora/Desktop/PhD_LMU/Polymer Descriptors/sanitized/Yan17si_A4_sanitized.mol
Sanitized molecule with force field saved to /Users/nora/Desktop/PhD_LMU/Polymer Descriptors/sanitized/Rui22B7Sc90_sanitized.mol
Sanitized molecule with force field saved to /Users/nora/Desktop/PhD_LMU/Polymer Descriptors/sanitized/Rui22E56_sanitized.mol
Sanitized molecule with force field saved to /Users/nora/Desktop/PhD_LMU/Polymer Descriptors/sanitized/Tzeng13E3_sanitized.mol
Sanitized molecule with force field saved to /Users/nora/Desktop/PhD_LMU/Polymer Descriptors/sanitized/Tzeng13E5_sanitized.mol
Sanitized molecule with force field saved to /Users/nora/Desktop/PhD_LMU/Polymer Descriptors/sanitized/Tzeng13E7_sanitized.mol
Sanitized molecule with force field saved to /Users/nora/Desktop/PhD_LMU/Polymer Descriptors/sanitized/Tzen

In [None]:
import os
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from mordred import Calculator, descriptors
from rdkit.Chem import SDMolSupplier


# Adjusted process_monomers function
def process_monomers(input_df, main_df, column_name, suffix):
    dfs = []
    for monomer in input_df[column_name]:
        monomer_rows = main_df[main_df['Filename'].str.replace('_sanitized', '') == monomer]  # Ensure filenames match
        # Rename columns except 'Filename'
        monomer_rows = monomer_rows.rename(columns={col: f'{col}{suffix}' for col in monomer_rows.columns if col != 'Filename'})
        dfs.append(monomer_rows)
    return pd.concat(dfs, ignore_index=True, sort=False)
    
def construct_polymer_name(row):
    paper_component = str(row['paper'])
    monomer1_component = str(row['monomer 1'])
    distribution_component = str(row['distribution'])
    monomer2_component = str(row['monomer 2'])
    return f"{paper_component}_{monomer1_component}_{distribution_component}{monomer2_component}"
    
# Ensure to adjust for the correct path handling
output_folder = os.path.join(input_folder, 'sanitized')  

# Mordred Descriptor, can be replaced with any descriptor calculator
# Initialize the calculator mordred
calc = Calculator(descriptors, ignore_3D=False)
data = []
for filename in os.listdir(output_folder):
    if filename.endswith('.mol'):
        file_path = os.path.join(output_folder, filename)
        # Load the molecule from the .mol file
        suppl = SDMolSupplier(file_path)
        for mol in suppl:
            if mol is not None:
                desc_values = calc(mol)
                # Filename modification is not needed here since it's done in process_monomers
                data.append([filename.replace('_sanitized', '')] + list(desc_values.values()))

columns = ['Filename'] + list(desc_values.keys())
df = pd.DataFrame(data, columns=columns)

# write output file to check
output_file = os.path.join(input_folder, 'mordred_descriptors.xlsx')  
df.to_excel(output_file, index=False)

#Concatenation
# Ensure you have the Excel file path
input_file_path = os.path.join(input_folder, 'Input_polymer_composition_siRNA.xlsx')# Adjust if Input File Changes
main_file_path = os.path.join(input_folder, 'mordred_descriptors.xlsx') 

# Load the main DataFrame from the Excel file
main_df = pd.read_excel(main_file_path)
input_df = pd.read_excel(input_file_path)

# Processing monomers and concatenating DataFrames
hydrophil_rows_df = process_monomers(input_df, main_df, 'hydrophil', '1')
lipophil_rows_df = process_monomers(input_df, main_df, 'lipophil', '2')
endcapping_rows_df = process_monomers(input_df, main_df, 'endcapping', '3')

polymer_descriptors_df = input_df[['lipophil_w', 'molecular_weight', 'cell type']].copy()

combined_df = pd.concat([hydrophil_rows_df, lipophil_rows_df, endcapping_rows_df, polymer_descriptors_df], axis=1)

combined_df['Polymer Name'] = input_df.apply(construct_polymer_name, axis=1)

combined_df = combined_df[['Polymer Name'] + [col for col in combined_df.columns if col != 'Polymer Name']]

# Ensure the 'Filename' columns are correctly managed
combined_df = combined_df.drop(columns=[col for col in combined_df.columns if col.startswith('Filename')])

print("Contents of combined_df:")
print(combined_df)


# Specify the path and name of the Excel file you want to save#
output_excel_path = os.path.join(input_folder, 'combined_data.xlsx')

# Save combined_df to an Excel file, without including the index
combined_df.to_excel(output_excel_path, index=False)

print(f"DataFrame saved to {output_excel_path}")

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


Contents of combined_df:
                  Polymer Name  \
0           Yan17_PE8K_A1_0CSH   
1        Yan17_PE8K_A1_0.2C6SH   
2       Yan17_PE8K_A1_0.33C6SH   
3        Yan17_PE8K_A1_0.5C6SH   
4       Yan17_PE8K_A1_0.67C6SH   
..                         ...   
617  Rui22_B7Sc90_0.15Sc12-E63   
618   Rui22_B7Sc90_0.5Sc12-E63   
619  Rui22_B7Sc90_0.65Sc12-E63   
620   Rui22_B7Sc90_0.8Sc12-E63   
621     Rui22_B7Sc90_1Sc12-E63   

                                                  ABC1  \
0    module 'numpy' has no attribute 'float'.\n`np....   
1    module 'numpy' has no attribute 'float'.\n`np....   
2    module 'numpy' has no attribute 'float'.\n`np....   
3    module 'numpy' has no attribute 'float'.\n`np....   
4    module 'numpy' has no attribute 'float'.\n`np....   
..                                                 ...   
617  module 'numpy' has no attribute 'float'.\n`np....   
618  module 'numpy' has no attribute 'float'.\n`np....   
619  module 'numpy' has no attribute 'float'