In [29]:
import os
import random
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from transformers import (
    AutoModelForMaskedLM,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup,
)

# ------------------------------------------------------------------
# Reproducibility & device safety
# ------------------------------------------------------------------
# One seed is pushed into every RNG the notebook relies on:
# Python's `random`, NumPy's legacy global generator, and PyTorch.
SEED = 38
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
_cuda_available = torch.cuda.is_available()
DEVICE = torch.device("cuda") if _cuda_available else torch.device("cpu")
if _cuda_available:
    # Seed the CUDA generators explicitly as well.
    torch.cuda.manual_seed_all(SEED)
print(f"Using device: {DEVICE}")

Using device: cuda


In [30]:
# Single source of truth for the pretrained checkpoint used by both the
# tokenizer and the classifier.
CHECKPOINT = "seyonec/ChemBERTa-zinc-base-v1"

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)

# Sequence-classification model with two output labels (binary task),
# moved to the selected device once it is loaded.
model = AutoModelForSequenceClassification.from_pretrained(
    CHECKPOINT,
    num_labels=2,
)
model = model.to(DEVICE)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at seyonec/ChemBERTa-zinc-base-v1 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
# Read the HIV table, discard the 'activity' column and expose
# 'HIV_active' under the name 'label'.
df = (
    pd.read_csv('./data/HIV.csv')
    .drop(columns='activity')
    .rename(columns={'HIV_active': 'label'})
)

# Two stratified 80/20 cuts with the same seed: the first holds out the test
# set, the second carves the validation set out of the remaining rows.
split_options = {"test_size": 0.20, "random_state": SEED}
train_df, test_df = train_test_split(df, stratify=df["label"], **split_options)
train_df, val_df = train_test_split(
    train_df, stratify=train_df["label"], **split_options
)

print(f"Train size: {len(train_df)}, Validation size: {len(val_df)}, Test size: {len(test_df)}")

Train size: 26320, Validation size: 6581, Test size: 8226


In [32]:
def encode(texts):
    """Tokenize ``texts`` and return ``(input_ids, attention_mask)``.

    ``texts`` is materialised as a list before it is handed to the
    module-level ``tokenizer``. Sequences are truncated and padded to a
    fixed length of 256 tokens, and PyTorch tensors are requested
    (``return_tensors="pt"``).
    """
    tokenizer_options = dict(
        padding="max_length",
        truncation=True,
        max_length=256,
        return_tensors="pt",
    )
    batch = tokenizer(list(texts), **tokenizer_options)
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    return input_ids, attention_mask

# Human-readable names for the two integer class ids.
label_map = {0: "inactive", 1: "active"}


def build_dataset(frame: pd.DataFrame, shuffle: bool = True, batch_size: int = 32):
    """Tokenize a split and wrap it in a ``DataLoader``.

    Parameters
    ----------
    frame:
        DataFrame providing a ``"smiles"`` column (passed to ``encode``) and
        a ``"label"`` column (converted to a tensor of targets).
    shuffle:
        Whether the loader reshuffles the samples on every pass. Defaults to
        ``True`` so existing callers keep the previous behaviour; pass
        ``False`` for evaluation splits so batches follow the row order of
        ``frame``.
    batch_size:
        Number of samples per batch (default 32, the previous fixed value).

    Returns
    -------
    DataLoader
        Yields ``(input_ids, attention_mask, labels)`` batches.
    """
    input_ids, attention_mask = encode(frame["smiles"])
    labels = torch.tensor(frame["label"].values)
    dataset = TensorDataset(input_ids, attention_mask, labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


# Only the training split needs reshuffling between epochs; the validation
# split is iterated in a fixed order.
train_loader = build_dataset(train_df, shuffle=True)
val_loader = build_dataset(val_df, shuffle=False)

In [33]:
# Number of passes over the training set. The linear schedule below is sized
# from this value, so it must equal the EPOCHS value the training cell
# iterates over; otherwise the learning rate would not reach zero exactly at
# the end of training.
EPOCHS = 3

optimiser = AdamW(model.parameters(), lr=2e-5)

# The scheduler is stepped once per batch, hence batches-per-epoch * epochs.
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimiser, num_warmup_steps=0, num_training_steps=total_steps
)

In [34]:
def epoch_metrics(logits_list, labels_list):
    """Compute accuracy and macro-averaged F1 from per-batch outputs.

    Both arguments are lists of tensors that are concatenated first.
    Predicted classes are the argmax of the logits along dimension 1.
    Returns the tuple ``(accuracy, macro_f1)``.
    """
    stacked_logits = torch.cat(logits_list)
    stacked_labels = torch.cat(labels_list)

    y_hat = stacked_logits.argmax(dim=1).cpu().numpy()
    y_ref = stacked_labels.cpu().numpy()

    return accuracy_score(y_ref, y_hat), f1_score(y_ref, y_hat, average="macro")


def save_checkpoint(model, optimiser, path: Path):
    """Write the model and optimiser state dicts to ``path``.

    Missing parent directories of ``path`` are created first. The saved
    object is a dict with the keys ``"model_state_dict"`` and
    ``"optimizer_state_dict"``. A confirmation line is printed afterwards.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    payload = {
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimiser.state_dict(),
    }
    torch.save(payload, path)
    print(f"Model saved to {path}")

In [35]:
EPOCHS = 3
for epoch in range(1, EPOCHS + 1):
    print(f"\nEpoch {epoch}/{EPOCHS}")

    # ---- optimisation pass over the training batches ---------------
    model.train()
    running_loss = 0.0
    for batch in tqdm(train_loader, desc="Training"):
        ids, masks, labels = (tensor.to(DEVICE) for tensor in batch)

        optimiser.zero_grad()
        outputs = model(
            input_ids=ids,
            attention_mask=masks,
            labels=labels,
            return_dict=True,
        )
        loss = outputs.loss
        loss.backward()
        # Cap the gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimiser.step()
        # The learning-rate schedule advances once per batch.
        scheduler.step()

        running_loss += loss.item()

    # Mean of the per-batch training losses.
    train_loss = running_loss / len(train_loader)

    # ---- gradient-free pass over the validation batches ------------
    model.eval()
    val_loss = 0.0
    all_logits, all_labels = [], []
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            ids, masks, labels = (tensor.to(DEVICE) for tensor in batch)
            outputs = model(
                input_ids=ids,
                attention_mask=masks,
                labels=labels,
                return_dict=True,
            )
            val_loss += outputs.loss.item()
            # Keep logits and labels on the CPU for the metric computation.
            all_logits.append(outputs.logits.cpu())
            all_labels.append(labels.cpu())

    # Mean of the per-batch validation losses, then accuracy / macro F1.
    val_loss /= len(val_loader)
    acc, f1 = epoch_metrics(all_logits, all_labels)

    epoch_summary = (
        f"Train loss: {train_loss:.4f} | "
        f"Val loss: {val_loss:.4f} | "
        f"Accuracy: {acc:.4f} | "
        f"F1: {f1:.4f}"
    )
    print(epoch_summary)

# ------------------------------------------------------------------
# Persist the weights and optimiser state reached after the last epoch
# ------------------------------------------------------------------
checkpoint_path = Path("models/chemb_hiv_bert.pth")
save_checkpoint(model, optimiser, checkpoint_path)


Epoch 1/3


Training: 100%|██████████| 823/823 [02:11<00:00,  6.28it/s]
Validating: 100%|██████████| 206/206 [00:10<00:00, 19.19it/s]


Train loss: 0.1327 | Val loss: 0.1328 | Accuracy: 0.9669 | F1: 0.5848

Epoch 2/3


Training: 100%|██████████| 823/823 [02:11<00:00,  6.25it/s]
Validating: 100%|██████████| 206/206 [00:10<00:00, 19.04it/s]


Train loss: 0.1066 | Val loss: 0.1205 | Accuracy: 0.9676 | F1: 0.6680

Epoch 3/3


Training: 100%|██████████| 823/823 [02:11<00:00,  6.26it/s]
Validating: 100%|██████████| 206/206 [00:10<00:00, 19.03it/s]


Train loss: 0.0915 | Val loss: 0.1260 | Accuracy: 0.9664 | F1: 0.6766
Model saved to models/chemb_hiv_bert.pth


In [37]:
# Tokenize the held-out split and wrap it in a DataLoader.
test_loader = build_dataset(test_df)

# Switch to inference mode and collect logits and gold labels batch by batch.
model.eval()
logits_list, labels_list = [], []

with torch.no_grad():
    for ids, masks, labels in tqdm(test_loader, desc="Testing"):
        # Labels are not passed to the model here, so they stay where the
        # loader put them; only the model inputs are moved to the device.
        ids = ids.to(DEVICE)
        masks = masks.to(DEVICE)
        outputs = model(input_ids=ids, attention_mask=masks, return_dict=True)
        logits_list.append(outputs.logits.cpu())
        labels_list.append(labels)

# Flatten the per-batch results; the predicted class is the argmax logit.
logits = torch.cat(logits_list)
y_true = torch.cat(labels_list).numpy()
y_pred = logits.argmax(dim=1).numpy()

# Accuracy plus macro-averaged precision / recall / F1.
accuracy = accuracy_score(y_true, y_pred)
precision, recall, f1 = (
    scorer(y_true, y_pred, average="macro")
    for scorer in (precision_score, recall_score, f1_score)
)

print("\n=== Test-set metrics ===")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1-score : {f1:.4f}")

print("\n--- Per-class report ---")
target_names = ["negative", "positive"]
per_class_report = classification_report(y_true, y_pred, target_names=target_names)
print(per_class_report)

print("\n--- Confusion matrix ---")
print(confusion_matrix(y_true, y_pred))

Testing: 100%|██████████| 258/258 [00:13<00:00, 19.47it/s]


=== Test-set metrics ===
Accuracy : 0.9711
Precision: 0.8207
Recall   : 0.6749
F1-score : 0.7245

--- Per-class report ---
              precision    recall  f1-score   support

    negative       0.98      0.99      0.99      7937
    positive       0.66      0.36      0.46       289

    accuracy                           0.97      8226
   macro avg       0.82      0.67      0.72      8226
weighted avg       0.97      0.97      0.97      8226


--- Confusion matrix ---
[[7885   52]
 [ 186  103]]



