In [7]:
from torch_geometric.datasets import QM9
import pandas as pd
import torch

# Instantiate the PyG QM9 dataset rooted at 'qm9_data'
dataset = QM9(root='qm9_data')

# Names for the 19-target variant of QM9 in PyG, in target-column order
target_names_19 = [
    'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve',
    'U0', 'U', 'H', 'G', 'Cv',
    'U0_atom', 'U_atom', 'H_atom', 'G_atom',
    'A', 'B', 'C',
]

# One record per molecule: its SMILES string and the U0 target value
rows = []
for mol in dataset:
    targets = mol.y.view(-1)                        # flatten to [num_targets]
    # Truncate the name list to the number of targets actually present,
    # then locate U0 by name rather than by a hard-coded position
    u0_pos = target_names_19[:targets.numel()].index('U0')
    rows.append(dict(smiles=mol.smiles, U0=float(targets[u0_pos].item())))

df = pd.DataFrame(rows)
print(df.head(), df.shape)
df.to_csv("qm9_smiles_U0.csv", index=False)


              smiles           U0
0  [H]C([H])([H])[H] -1101.487793
1       [H]N([H])[H] -1538.147705
2            [H]O[H] -2079.077881
3          [H]C#C[H] -2103.669434
4             [H]C#N -2541.866943 (130831, 2)


In [10]:
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem

def morgan_fp(smiles, radius=2, n_bits=2048):
    """Return the Morgan bit fingerprint of a SMILES string as a uint8 array.

    Parameters
    ----------
    smiles : str
        SMILES string to parse with RDKit.
    radius : int, default 2
        Radius passed to ``AllChem.GetMorganFingerprintAsBitVect``.
    n_bits : int, default 2048
        Length of the bit vector and of the returned array.

    Returns
    -------
    numpy.ndarray or None
        1-D ``uint8`` array of length ``n_bits``; ``None`` when ``smiles`` is
        not a string (e.g. ``None`` or a float NaN from a missing table cell)
        or when RDKit cannot parse it.
    """
    # MolFromSmiles raises on non-string input instead of returning None, which
    # would abort a whole featurisation loop on a single missing value; report
    # such inputs the same way as an unparseable SMILES.
    if not isinstance(smiles, str):
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    # NOTE(review): newer RDKit releases flag GetMorganFingerprintAsBitVect as
    # deprecated in favour of rdFingerprintGenerator — confirm before upgrading.
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    arr = np.zeros((n_bits,), dtype=np.uint8)
    # RDKit fills the array in-place:
    Chem.DataStructs.ConvertToNumpyArray(fp, arr)
    return arr

# Fingerprint every SMILES; failures get an all-zero placeholder so that
# fps stays aligned with df until the mask is applied
fps, keep_mask = [], []
for smi in df['smiles']:
    bits = morgan_fp(smi, radius=2, n_bits=2048)
    parsed = bits is not None
    keep_mask.append(parsed)
    fps.append(bits if parsed else np.zeros(2048, dtype=np.uint8))

# Keep only the molecules RDKit could parse, in both the table and the matrix
df_fp = df.loc[keep_mask].reset_index(drop=True)
X_fp = np.stack([fps[i] for i, parsed in enumerate(keep_mask) if parsed], axis=0)

print("Fingerprint matrix:", X_fp.shape)   # (N, 2048)

# Assemble the output table: smiles first, one column per bit, U0 last
bit_columns = [f'fp_{i}' for i in range(X_fp.shape[1])]
fp_df = pd.DataFrame(X_fp, columns=bit_columns)
fp_df.insert(0, 'smiles', df_fp['smiles'].values)
fp_df['U0'] = df_fp['U0'].values

# Persist as Parquet (via fastparquet) and as CSV
fp_df.to_parquet("qm9_fp_U0.parquet", engine="fastparquet", index=False)
fp_df.to_csv("qm9_fp_U0.csv", index=False)


Fingerprint matrix: (129012, 2048)


In [11]:
# --- deps ---
import numpy as np, pandas as pd, os
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from sklearn.model_selection import train_test_split

# Reload the minimal SMILES/U0 table written by the first step
df = pd.read_csv("qm9_smiles_U0.csv")   # columns: smiles, U0

# Build an RDKit calculator covering every registered descriptor.
# NOTE(review): _descList is an underscore-prefixed RDKit attribute — confirm
# it is still available when changing RDKit versions.
desc_list = Descriptors._descList              # list of (name, fn)
desc_names = [name for name, *_ in desc_list]
calc = MoleculeDescriptors.MolecularDescriptorCalculator(desc_names)

def rdkit_descriptors(smiles):
    """Compute the descriptor vector for one SMILES string.

    Uses the module-level ``calc`` descriptor calculator.

    Parameters
    ----------
    smiles : str
        SMILES string to parse with RDKit.

    Returns
    -------
    tuple or None
        The values returned by ``calc.CalcDescriptors``; ``None`` when
        ``smiles`` is not a string (e.g. a float NaN produced by ``read_csv``
        for a missing cell), when RDKit cannot parse it, or when the
        descriptor calculation raises.
    """
    # MolFromSmiles sits outside the try-block and raises on non-string input,
    # so one missing value would abort the whole loop; treat it as unparseable.
    if not isinstance(smiles, str):
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    try:
        return calc.CalcDescriptors(mol)      # tuple of floats
    except Exception:
        # Deliberate best-effort: a molecule whose descriptors cannot be
        # computed is skipped by the caller rather than stopping the run.
        return None

# Compute descriptors for every SMILES; failures get a NaN placeholder row so
# desc_rows stays aligned with df until the mask is applied
desc_rows = []
keep_mask = []
for smi in df["smiles"]:
    row = rdkit_descriptors(smi)
    ok = row is not None
    keep_mask.append(ok)
    desc_rows.append(row if ok else [np.nan]*len(desc_names))

# Filter the descriptor table and the source table with the same mask
desc_df = (
    pd.DataFrame(desc_rows, columns=desc_names)
    .loc[keep_mask]
    .reset_index(drop=True)
)
meta = df.loc[keep_mask].reset_index(drop=True)

# Attach meta + target: smiles as first column, U0 as last
desc_df.insert(0, "smiles", meta["smiles"].values)
desc_df["U0"] = meta["U0"].values

# --- Cleaning ---
non_feature_cols = ("smiles", "U0")   # never treated as descriptor columns

# 1) Infinite values become NaN
desc_df = desc_df.replace([np.inf, -np.inf], np.nan)

# 2) Remove descriptor columns with no non-NaN value at all
all_nan_cols = [
    col for col in desc_df.columns
    if col not in non_feature_cols and desc_df[col].isna().all()
]
desc_df = desc_df.drop(columns=all_nan_cols)

# 3) Remove descriptor columns with at most one distinct non-NaN value
const_cols = [
    col for col in desc_df.columns
    if col not in non_feature_cols and desc_df[col].nunique(dropna=True) <= 1
]
desc_df = desc_df.drop(columns=const_cols)

# 4) Fill the remaining NaNs with each column's own median
num_cols = [col for col in desc_df.columns if col not in non_feature_cols]
desc_df[num_cols] = desc_df[num_cols].apply(lambda col: col.fillna(col.median()))

print("Descriptor table shape:", desc_df.shape)
print("Dropped all-NaN:", len(all_nan_cols), "| Constant:", len(const_cols))

# --- Save (CSV + Parquet fallback) ---
def save_table(df, base):
    """Write ``df`` to ``<base>.csv`` and, if possible, to ``<base>.parquet``.

    The CSV is always written first. Parquet is attempted with the pyarrow
    engine and then with fastparquet; if both attempts raise, a warning with
    both error messages is printed and only the CSV remains.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to persist (written without its index).
    base : str
        Output path without extension.
    """
    df.to_csv(f"{base}.csv", index=False)

    parquet_path = f"{base}.parquet"
    failures = []
    for engine in ("pyarrow", "fastparquet"):
        try:
            if engine == "fastparquet":
                # Import explicitly so a missing package is recorded as this
                # engine's failure
                import fastparquet  # noqa: F401
            df.to_parquet(parquet_path, engine=engine, index=False)
        except Exception as err:
            failures.append(err)
        else:
            return
    print(f"[warn] parquet failed; wrote CSV only. ({failures[0]} | {failures[1]})")

# Persist the cleaned descriptor table under the "qm9_desc_U0" base name
save_table(df=desc_df, base="qm9_desc_U0")

Descriptor table shape: (129012, 196)
Dropped all-NaN: 0 | Constant: 23


In [12]:
def move_target_first(df, target="U0"):
    """Return ``df`` with the ``target`` column moved to the front.

    All other columns keep their relative order. The input frame is not
    modified; column selection produces a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a column named ``target``.
    target : str, default "U0"
        Name of the column to place first.
    """
    others = [name for name in df.columns.tolist() if name != target]
    return df[[target, *others]]


In [13]:
# Rewrite each CSV in place with the U0 target as its first column.
# NOTE(review): only the .csv files are rewritten here; the .parquet copies
# saved earlier keep their original column order.
smiles_csv = "qm9_smiles_U0.csv"
df = move_target_first(pd.read_csv(smiles_csv))
df.to_csv(smiles_csv, index=False)

fp_csv = "qm9_fp_U0.csv"
fp_df = move_target_first(pd.read_csv(fp_csv))
fp_df.to_csv(fp_csv, index=False)

desc_csv = "qm9_desc_U0.csv"
desc_df = move_target_first(pd.read_csv(desc_csv))
desc_df.to_csv(desc_csv, index=False)



In [14]:
import pandas as pd

# Reload both feature tables from disk and report how many rows each holds
fp = pd.read_csv("qm9_fp_U0.csv")
desc = pd.read_csv("qm9_desc_U0.csv")

print("FP rows:", len(fp))
print("DESC rows:", len(desc))


FP rows: 129012
DESC rows: 129012


In [15]:
# Compare the distinct SMILES of the two tables. Sets collapse repeated
# strings, so these counts are over unique SMILES, not over rows.
fp_set = {*fp["smiles"]}
desc_set = {*desc["smiles"]}

print("Only in FP:", len(fp_set.difference(desc_set)))
print("Only in DESC:", len(desc_set.difference(fp_set)))
print("Intersection:", len(fp_set.intersection(desc_set)))


Only in FP: 0
Only in DESC: 0
Intersection: 128910


In [16]:
# Row-by-row check: both tables must list the same SMILES in the same positions
fp_smiles = fp["smiles"].tolist()
desc_smiles = desc["smiles"].tolist()
same_order = fp_smiles == desc_smiles
print("Same order:", same_order)


Same order: True


In [17]:
# Identify the first mismatch, if any
# Report the first row at which the two SMILES columns disagree, if any.
# zip() stops at the shorter column, so a pure difference in row count would
# slip through the loop; it is checked explicitly once the common prefix matches.
for i, (s_fp, s_desc) in enumerate(zip(fp["smiles"], desc["smiles"])):
    if s_fp != s_desc:
        print("First mismatch at row:", i)
        print("FP:", s_fp)
        print("DESC:", s_desc)
        break
else:
    if len(fp) != len(desc):
        print("Row counts differ:", len(fp), "(FP) vs", len(desc), "(DESC)")
    else:
        print("All SMILES match in order.")


All SMILES match in order.
