In [1]:
import tensorflow_datasets as tfds
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from tqdm.auto import tqdm

In [2]:
# Fetch the QM9 training split. as_supervised=False keeps every sample as a
# dictionary of named features, so properties can be looked up by key later.
qm9 = tfds.load(
    name="qm9",
    split="train",
    as_supervised=False,
)
print(qm9)

2025-11-21 18:55:20.318893: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-11-21 18:55:20.932479: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-11-21 18:55:24.700758: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-11-21 18:55:26.116481: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


<_PrefetchDataset element_spec={'A': TensorSpec(shape=(), dtype=tf.float32, name=None), 'B': TensorSpec(shape=(), dtype=tf.float32, name=None), 'C': TensorSpec(shape=(), dtype=tf.float32, name=None), 'Cv': TensorSpec(shape=(), dtype=tf.float32, name=None), 'G': TensorSpec(shape=(), dtype=tf.float32, name=None), 'G_atomization': TensorSpec(shape=(), dtype=tf.float32, name=None), 'H': TensorSpec(shape=(), dtype=tf.float32, name=None), 'H_atomization': TensorSpec(shape=(), dtype=tf.float32, name=None), 'InChI': TensorSpec(shape=(), dtype=tf.string, name=None), 'InChI_relaxed': TensorSpec(shape=(), dtype=tf.string, name=None), 'Mulliken_charges': TensorSpec(shape=(29,), dtype=tf.float32, name=None), 'SMILES': TensorSpec(shape=(), dtype=tf.string, name=None), 'SMILES_relaxed': TensorSpec(shape=(), dtype=tf.string, name=None), 'U': TensorSpec(shape=(), dtype=tf.float32, name=None), 'U0': TensorSpec(shape=(), dtype=tf.float32, name=None), 'U0_atomization': TensorSpec(shape=(), dtype=tf.float3

In [3]:
# Extracting data from dataset
def extract_qm9_data(dataset, properties, max_samples=10000):
    """Collect SMILES strings and selected property values from a dataset.

    Args:
        dataset: Object accepted by ``tfds.as_numpy`` whose samples are
            mappings with a bytes ``"SMILES"`` entry and one entry per
            requested property.
        properties: Names of the sample entries to gather, in output order.
        max_samples: Upper bound on the number of samples collected.

    Returns:
        A ``(smiles_list, labels)`` tuple: the UTF-8 decoded SMILES strings
        and, for each of them, the list of values in ``properties`` order.
    """
    smiles_list, labels = [], []

    for index, record in enumerate(tfds.as_numpy(dataset)):
        # Stop as soon as the requested number of samples has been gathered.
        if index >= max_samples:
            break

        smiles_list.append(record["SMILES"].decode("utf-8"))
        labels.append([record[name] for name in properties])

    return smiles_list, labels

# Target properties
TARGET_PROPERTIES = ["mu", "alpha", "gap", "Cv", "num_atoms"]

# Extracted data
smiles, y = extract_qm9_data(qm9, TARGET_PROPERTIES, max_samples=5000)

# Train test split on the raw targets, so the scaler below never sees the
# held-out samples (fitting it before the split leaks test-set statistics).
X_train, X_test, y_train_raw, y_test_raw = train_test_split(
    smiles, y, test_size=0.2, random_state=42
)

# Scaling target: statistics come from the training targets only and are
# then re-used unchanged for the test targets.
label_scaler = StandardScaler()
y_train = label_scaler.fit_transform(y_train_raw)
y_test = label_scaler.transform(y_test_raw)

# Full scaled label matrix, kept under its original name for any later cell
# that still refers to it; it now uses the train-only statistics.
y_scaled = label_scaler.transform(y)

2025-11-21 18:55:51.600749: I tensorflow/core/kernels/data/tf_record_dataset_op.cc:396] The default buffer size is 262144, which is overridden by the user specified `buffer_size` of 8388608
2025-11-21 18:55:57.540361: W tensorflow/core/kernels/data/cache_dataset_ops.cc:917] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [4]:
# Tokenize SMILES
class QM9Dataset(Dataset):
    """Torch dataset pairing SMILES strings with regression targets.

    Each item is tokenized on access and returned as a dictionary with
    ``input_ids``, ``attention_mask`` and ``labels`` entries.

    Args:
        smiles: Indexable collection of SMILES strings.
        targets: Indexable collection of per-sample target values, aligned
            with ``smiles``.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")`` that
            returns a mapping with ``input_ids`` and ``attention_mask``.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, smiles, targets, tokenizer, max_length=128):
        self.smiles = smiles
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.smiles)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors and float labels."""
        enc = self.tokenizer(
            self.smiles[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        # squeeze(0) removes only the leading batch dimension added by
        # return_tensors="pt"; a bare squeeze() would also collapse any other
        # size-1 dimension (e.g. when max_length == 1).
        return {
            'input_ids': enc['input_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.targets[idx], dtype=torch.float)
        }

# Model definition: pretrained ChemBERTa encoder with a multi-output regression head
class ChemBERTaMulti(torch.nn.Module):
    """Pretrained transformer encoder followed by a multi-output MLP head.

    Args:
        n_outputs: Number of values predicted per input sequence.
        model_name: Checkpoint identifier passed to
            ``AutoModel.from_pretrained``. Defaults to the ChemBERTa
            checkpoint this notebook was written against.
    """

    def __init__(self, n_outputs, model_name="seyonec/ChemBERTa-zinc-base-v1"):
        super().__init__()
        # Pre trained model
        self.encoder = AutoModel.from_pretrained(model_name)
        # Size the head from the encoder's own configuration instead of a
        # hard-coded 768, so a checkpoint with a different width still works.
        hidden_size = self.encoder.config.hidden_size
        # Head output
        self.head = torch.nn.Sequential(
            torch.nn.Linear(hidden_size, 256),
            torch.nn.ReLU(),
            torch.nn.Linear(256, n_outputs)
        )

    def forward(self, input_ids, attention_mask):
        """Encode the batch and regress from the hidden state at position 0.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Attention mask tensor forwarded to the encoder.

        Returns:
            The head's output for each sequence in the batch.
        """
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Use the first sequence position as the pooled representation.
        pooled = outputs.last_hidden_state[:, 0, :]
        return self.head(pooled)

In [5]:
# Seed torch's global RNG: it drives the random initialisation of the
# regression head, dropout, and the shuffling order of the training loader.
# The train/test split is already pinned with random_state=42.
torch.manual_seed(42)

tokenizer = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")

train_ds = QM9Dataset(X_train, y_train, tokenizer)
test_ds = QM9Dataset(X_test, y_test, tokenizer)

# Only the training loader is shuffled; evaluation order stays fixed.
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=32)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Place the model on its device first, so the optimizer is constructed over
# the parameters in the location where they will actually be trained.
model = ChemBERTaMulti(n_outputs=len(TARGET_PROPERTIES))
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
loss_fn = torch.nn.MSELoss()

# Last expression of the cell: display the model architecture.
model

ChemBERTaMulti(
  (encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(767, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Laye

In [None]:
# Training: fine-tune encoder and head together, reporting the mean batch
# loss after every epoch.
for epoch in range(50):
    model.train()
    total_loss = 0

    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for batch in progress:
        # Move the three tensors of the batch onto the training device.
        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        )

        optimizer.zero_grad()
        preds = model(input_ids, attention_mask)
        loss = loss_fn(preds, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    mean_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} Loss: {mean_loss:.4f}")

In [None]:
# Evaluation
model.eval()
preds_all, true_all = [], []

# Gradients are not needed for inference.
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels']  # stays on CPU; only used for metrics
        preds = model(input_ids, attention_mask)

        preds_all.append(preds.cpu().numpy())
        true_all.append(labels.numpy())

# Stack the per-batch arrays into (n_samples, n_properties) matrices.
preds_all = np.vstack(preds_all)
true_all = np.vstack(true_all)

# NOTE: the labels fed to the datasets are the standardized targets, so MAE
# and RMSE below are expressed in standardized units, not physical ones.
# Apply label_scaler.inverse_transform to both arrays first if metrics in the
# original units are wanted.
print("===== ChemBERTa Metrics =====")
# Iterate over TARGET_PROPERTIES (the list that defined the label columns)
# rather than a second hard-coded copy, so names and columns cannot drift.
for i, prop in enumerate(TARGET_PROPERTIES):
    mae = mean_absolute_error(true_all[:, i], preds_all[:, i])
    rmse = np.sqrt(mean_squared_error(true_all[:, i], preds_all[:, i]))
    r2 = r2_score(true_all[:, i], preds_all[:, i])
    print(f"{prop}: MAE={mae:.4f}, RMSE={rmse:.4f}, R2={r2:.4f}")