In [None]:
# Notebook Cell 1: Compute and Save Bertz Scores

import os
import pandas as pd
from rdkit import Chem
from rdkit.Chem.GraphDescriptors import BertzCT
from rdkit.Chem import rdMolDescriptors
from tqdm import tqdm

# ---------------------------
# CONFIGURATION
# ---------------------------
# Molecule table read by this cell (relative path, resolved against the
# kernel's working directory).
input_csv = "../pubchem/train_200k.csv"

# Folder that receives this cell's artefact, and the CSV written into it.
output_dir = "../indigo_simple_render"
bertz_csv = os.path.join(output_dir, "bertz_scores.csv")

# Create the folder up front; an already existing folder is not an error.
os.makedirs(output_dir, exist_ok=True)

# ---------------------------
# STEP 1: READ CSV & COMPUTE MOLECULE PROPERTIES
# ---------------------------
# Only two columns of the input are read below: pubchem_cid and SMILES.
# Any additional columns (e.g. InChI, num_atoms) are ignored.
df = pd.read_csv(input_csv)

# Fixed output schema. Passing it to the DataFrame constructor guarantees that
# the saved CSV has a header row even when no molecule could be processed; an
# empty record list would otherwise give a column-less frame and a CSV that
# pd.read_csv cannot parse.
molecule_columns = [
    "pubchem_cid",
    "SMILES",
    "bertz",
    "hydrogen_atom_count",
    "has_aromatic_ring",
]

molecule_data = []
unparsed_count = 0  # rows for which Chem.MolFromSmiles returned None
failed_count = 0    # rows that raised an exception while being processed
print("Computing Bertz scores and other properties...")
for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing molecules"):
    smiles = row['SMILES']
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Unparsable SMILES: skip the row, but keep count so the drop is visible.
            unparsed_count += 1
            continue
        # Compute Bertz complexity
        bertz = BertzCT(mol)
        # Sum of GetTotalNumHs() over every atom of the molecule
        hydrogen_count = sum(atom.GetTotalNumHs() for atom in mol.GetAtoms())
        # Check if molecule has aromatic rings (using RDKit's descriptor)
        num_aromatic_rings = rdMolDescriptors.CalcNumAromaticRings(mol)
        has_aromatic = num_aromatic_rings > 0

        molecule_data.append({
            "pubchem_cid": row["pubchem_cid"],
            "SMILES": smiles,
            "bertz": bertz,
            "hydrogen_atom_count": hydrogen_count,
            "has_aromatic_ring": has_aromatic
        })
    except Exception as e:
        # Best effort: one bad row must not abort the whole run.
        failed_count += 1
        # tqdm.write keeps the message from being interleaved with the progress bar.
        tqdm.write(f"Error processing index {idx} with SMILES '{smiles}': {e}")

# Convert to DataFrame and save all molecule data (including Bertz scores)
molecule_df = pd.DataFrame(molecule_data, columns=molecule_columns)
molecule_df.to_csv(bertz_csv, index=False)
print(f"Saved Bertz scores and molecule properties for {len(molecule_df)} molecules to {bertz_csv}")
if unparsed_count or failed_count:
    print(f"Skipped {unparsed_count} unparsable SMILES and {failed_count} rows that raised errors.")


Computing Bertz scores and other properties...


Processing molecules:  52%|█████▏    | 104294/200000 [01:45<01:35, 997.24it/s] [22:30:49] Conflicting single bond directions around double bond at index 16.
[22:30:49]   BondStereo set to STEREONONE and single bond directions set to NONE.
Processing molecules: 100%|██████████| 200000/200000 [03:22<00:00, 986.15it/s] 


Saved Bertz scores and molecule properties for 200000 molecules to indigo_simple_render/bertz_scores.csv


In [None]:
# Notebook Cell 2: Render Images for Simplest Molecules (Final Version with White Background & Error Handling)

import os
import pandas as pd
from indigo import Indigo
from indigo.renderer import IndigoRenderer
from tqdm import tqdm

# ---------------------------
# CONFIGURATION
# ---------------------------
# Working folder for this cell (relative to the kernel's working directory).
# NOTE(review): Notebook Cell 1 sets its output_dir to "../indigo_simple_render"
# while this cell uses "indigo_simple_render" -- confirm which one is intended.
output_dir = "indigo_simple_render"

# CSV generated from Notebook Cell 1
bertz_csv = "indigo_simple_render/bertz_scores.csv"

# Destination folder for the rendered PNG files; created here if missing.
images_dir = os.path.join(output_dir, "images")
os.makedirs(images_dir, exist_ok=True)

# Index CSV describing the rendered molecules (written at the end of the cell).
output_csv = os.path.join(output_dir, "simple molecules.csv")

# ---------------------------
# STEP 1: LOAD THE MOLECULE PROPERTIES CSV
# ---------------------------
df = pd.read_csv(bertz_csv)

# Keep only rows with a strictly positive Bertz score; a score of 0 is
# assumed to mark an invalid molecule.
df = df.loc[df["bertz"].gt(0)]

# Order by ascending Bertz complexity and keep the first 5000 rows.
simple_df = df.sort_values("bertz").iloc[:5000]
print(f"Selected {len(simple_df)} molecules with the simplest Bertz complexity (excluding bertz==0).")

# ---------------------------
# STEP 2: SET UP INDIGO INSTANCE & INDIGO RENDERER
# ---------------------------
indigo = Indigo()
renderer = IndigoRenderer(indigo)  # Use IndigoRenderer for rendering
# The option is set on the Indigo instance, not on a molecule, so it is
# configured once here instead of on every loop iteration.
indigo.setOption("render-background-color", "1, 1, 1")  # White background

# ---------------------------
# STEP 3: RENDER MOLECULES AS IMAGES
# ---------------------------
# One record per molecule that was both loaded and rendered successfully.
output_records = []
print("Rendering images for each molecule...")

for _, row in tqdm(simple_df.iterrows(), total=len(simple_df), desc="Rendering Images"):
    smiles = row["SMILES"]
    try:
        # Load the molecule into Indigo
        indigo_mol = indigo.loadMolecule(smiles)
        # Compute a 2D layout to improve rendering
        indigo_mol.layout()
    except Exception as e:
        # tqdm.write keeps the message from being interleaved with the progress bar.
        tqdm.write(f"Indigo failed to load molecule '{smiles}': {e}")
        continue  # Skip this molecule if it fails to load

    # Create an image ID using the PubChem CID
    image_id = f"mol_{row['pubchem_cid']}"
    file_name = f"{image_id}.png"
    file_path = os.path.join(images_dir, file_name)

    try:
        # Call renderToFile on the renderer instance. The previous class-level
        # form, IndigoRenderer.renderToFile(renderer, obj=..., filename=...),
        # failed for every molecule with
        # "renderToFile() missing 1 required positional argument: 'self'".
        renderer.renderToFile(indigo_mol, file_path)
    except Exception as e:
        tqdm.write(f"Failed to render molecule '{smiles}': {e}")
        continue  # Skip this molecule if rendering fails

    # If successfully rendered, add to CSV
    output_records.append({
        "image_id": image_id,
        "file_path": file_path,
        "SMILES": smiles,
        "hydrogen_atom_count": row["hydrogen_atom_count"],
        "has_aromatic_ring": row["has_aromatic_ring"]
    })

# ---------------------------
# STEP 4: SAVE THE OUTPUT CSV
# ---------------------------
# One row per successfully rendered molecule. The column list is passed
# explicitly so the CSV still gets a header row when nothing was rendered
# (an empty record list would otherwise produce a column-less frame).
output_df = pd.DataFrame(
    output_records,
    columns=[
        "image_id",
        "file_path",
        "SMILES",
        "hydrogen_atom_count",
        "has_aromatic_ring",
    ],
)
output_df.to_csv(output_csv, index=False)
print(f"Rendered {len(output_records)} molecules. CSV saved at {output_csv}")


Selected 5000 molecules with the simplest Bertz complexity (excluding bertz==0).
Rendering images for each molecule...


Rendering Images:   8%|▊         | 424/5000 [00:00<00:01, 4234.53it/s]

Failed to render molecule 'C.[Si]': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule '[Li+].[AlH4-]': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'S.[Mn]': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule '[Mg]=S': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'C.[Pd]': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'P.[Tl]': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'S=[Rh]': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'O.[Al]': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CO': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule '[F-].[Cs+]': renderToFile() missing 1 required positional argument: 'self'
Failed to ren

Rendering Images:  19%|█▊        | 936/5000 [00:00<00:00, 4749.66it/s]

Failed to render molecule 'C[C@H]([CH][CH][CH]CO)OC=O': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'C1CCC(C(C1)CO)O': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'C1CC[C@@H]([C@@H](C1)CO)O': renderToFile() missing 1 required positional argument: 'self'


Rendering Images:  28%|██▊       | 1411/5000 [00:00<00:00, 4615.11it/s]

Failed to render molecule 'CCCC[C@H]1[C@H](O1)CO': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CC(C)(C)C(C=O)Br': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'C1CCC(C(C1)N)N.[Cl-].[Cl-].[Br-].[Br-].[Pt+4]': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'C1COCCN1CCNN': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule '[14CH2](C(=O)O)C(=O)O': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CNCCCC(=O)OC': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CCCCOS(=O)CC': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CC(C[N+](C)(C)CCl)O': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'COC(=O)CCCSC': renderToFile() missing 1 required positional argument: 'self'
Fa

Rendering Images:  47%|████▋     | 2328/5000 [00:00<00:00, 4350.78it/s]

Failed to render molecule 'CCCC(C)COC(=O)CCl': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CC(=O)[O-].CC(=O)[O-].CC(=O)[O-].[Ce+3]': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CC(=O)[O-].CC(=O)[O-].CC(=O)[O-].[Ti+3]': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CC(=O)[O-].CC(=O)[O-].CC(=O)[O-].[Sc+3]': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'C[NH+]1CCN(CC1)CCCNN.[Cl-]': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CC(C)(C)[C@@H](C(=O)[O-])[NH3+]': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'C[N+](C)(C)C(=O)C=C.[Cl-]': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CC(C)P(N1CC1)N2CC2': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CCO

Rendering Images:  64%|██████▎   | 3184/5000 [00:00<00:00, 4047.54it/s]

Failed to render molecule 'C([C@H]([C@@H]([C@@H]([C@H](C=O)O)O)O)O)O': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'C([13C@H]([C@@H]([C@@H]([C@H](C=O)O)O)O)O)O': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'C([C@H]([C@H]([C@@H]([C@@H]([14CH]=O)O)O)O)O)O': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'C1=CC=NC(=C1)[Te]': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CCN(CC)CC.CCN(CC)C(=S)S': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CCCC(CO)NCCNC(CCC)CO.Cl.Cl': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CCCCCC/C=C/[C@@H](CC(C)C)O': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'C(=C(Br)Br)([N+](=O)[O-])Br': renderToFile() missing 1 required positional argument: 'self'
Failed to rende

Rendering Images:  80%|███████▉  | 3975/5000 [00:00<00:00, 3763.57it/s]

Failed to render molecule 'CC1OCC2C(C2(Cl)Cl)CO1': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CN1C=NC2C1NCNC2': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'C\1=N/NC(=S)N/N=C/NN1': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CCCCC1CC(OC1=O)CO': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CCCCCC1C(OC(=O)O1)C': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CC(=O)OC1CCCCC1OC': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CC1=C(C(=O)OC1)C': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CO.CO.C1=CC(=CC=C1Cl)[Bi]': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'C[C@H]1CCC=C(C1=O)C': renderToFile() missing 1 required positional argument: 'self'
Fa

Rendering Images:  95%|█████████▍| 4728/5000 [00:01<00:00, 3606.94it/s]

Failed to render molecule 'CCCCC1CCC(CC1)C(=O)NN': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CC(C)CC(=O)NCN1CCCCC1': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CCOC(=O)CCC1CCC(CC1)N': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CC(=O)NC(CCS)C(=O)NN': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'C[C@H]([C@@H](C(=O)OC)N)OC(C)(C)C': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CCOC[C@@H](C)[C@H]([C@@H](C(=O)O)NC)O': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'COC(=O)/C=C\1/CCCCCO1': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CC/C(=C\1/CCCCCN1)/C=O': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'C1CCC(=CCCCC(=O)O)C1': renderToFile() mis

Rendering Images: 100%|██████████| 5000/5000 [00:01<00:00, 3929.07it/s]

Failed to render molecule 'C1[C@H](C[C@H]2[C@H](O[C@@H]([C@@H]1N2)CO)O)O': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CCOP(=O)(/C=C/C(C)C)OCC': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CC(=C)[C@@H]1CC[C@@](C=C1)(C)O': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CC1=CC(=O)CC[C@H]1C(C)C': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'C/C(=C\CC1C(O1)(C)C)/C=C': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CC(C)C1CC=C(C1)C(=O)C': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CCOC(=O)C(CN(C)C(C)(C)C)Cl': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CC1C2CC1C3(C(C2)O3)C': renderToFile() missing 1 required positional argument: 'self'
Failed to render molecule 'CCCCCCCCCCC1CCOC1=O': ren


