In [1]:
import pandas as pd
from rdkit import Chem

# Read the source table of molecules into a DataFrame
csv_path = "synthetic/indigo_resize.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)

# Function to determine if a molecule has an aromatic ring
def has_aromatic_ring(smiles):
    """Flag whether a SMILES string describes a molecule with aromatic atoms.

    Args:
        smiles: SMILES string to parse. Any non-string value (for example the
            float NaN pandas uses for an empty CSV cell, or None) is treated
            as an invalid entry instead of being handed to RDKit.

    Returns:
        1 if at least one atom of the parsed molecule is flagged aromatic by
        RDKit, 0 if none is, and None when the input is not a string or RDKit
        cannot parse it.
    """
    # Guard before calling RDKit: MolFromSmiles expects a string and raises on
    # other types, which would abort a whole DataFrame.apply over one bad cell.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None  # Invalid SMILES

    # 1 if any atom is aromatic, 0 otherwise
    return int(any(atom.GetIsAromatic() for atom in mol.GetAtoms()))

# Add has_aromatic_ring column (None marks rows whose SMILES could not be used)
df["has_aromatic_ring"] = df["SMILES"].apply(has_aromatic_ring)

# Drop invalid SMILES rows; copy so the column assignment below works on an
# independent frame rather than a slice of the original
df = df.dropna(subset=["has_aromatic_ring"]).copy()

# Missing values may have left the labels as float/object; store plain 0/1
# integers so the saved file looks the same whether or not rows were dropped
df["has_aromatic_ring"] = df["has_aromatic_ring"].astype(int)

# Balance the dataset
class_counts = df["has_aromatic_ring"].value_counts()
if len(class_counts) < 2:
    # With a single class (or no rows) there is nothing to balance against;
    # fail loudly instead of writing a one-class file labelled "balanced"
    raise ValueError(
        "Cannot balance dataset: expected both aromatic and non-aromatic "
        f"molecules, found classes {sorted(class_counts.index.tolist())}"
    )
min_count = int(class_counts.min())

# Undersample majority class (fixed seed keeps the selection reproducible)
balanced_df = df.groupby("has_aromatic_ring").sample(n=min_count, random_state=42)

# Save balanced dataset
balanced_csv_path = "synthetic/indigo_balanced.csv"
balanced_df.to_csv(balanced_csv_path, index=False)

print(f"Balanced dataset saved to {balanced_csv_path}")




Balanced dataset saved to synthetic/indigo_balanced.csv
