### Install packages & libraries

In [None]:
!pip install --pre deepchem

In [None]:
# !pip install transformers==4.30.2
!pip install simpletransformers
!pip install datasets
!pip install wandb
!pip install matplotlib
!pip install torch==2.6.0

In [None]:
import sys

!test -d bertviz_repo && echo "FYI: bertviz_repo directory already exists, to pull latest version uncomment this line: !rm -r bertviz_repo"
!test -d bertviz_repo || git clone https://github.com/jessevig/bertviz bertviz_repo
if not 'bertviz_repo' in sys.path:
  sys.path += ['bertviz_repo']

In [4]:
import sys

# !git clone https://github.com/seyonechithrananda/bert-loves-chemistry.git
# Expose the local bert-loves-chemistry checkout to imports. The membership check
# keeps repeated runs of this cell from appending the same entry again.
if "./bert-loves-chemistry" not in sys.path:
    sys.path.append("./bert-loves-chemistry")

### Experiment settings & configuration

In [None]:
import os
# Restrict this process to physical GPU 4. The key must be spelled exactly
# CUDA_VISIBLE_DEVICES (a misspelled key is silently ignored) and it has to be set
# before any CUDA-using library initializes. With the mask in effect, the selected
# GPU is exposed to the process as device index 0.
os.environ["CUDA_VISIBLE_DEVICES"] = "4"
# Disable tokenizer worker parallelism.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Import required libraries
import pandas as pd
import numpy as np
import os
import sklearn.metrics
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import wandb
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')
import deepchem
deepchem.__version__

from rdkit import Chem

In [None]:
# Configuration
DATA_PATH = '' # Path to the dataset
MODEL_FOLDER = '' # Path to save the model
# Read the WandB key from the environment so that no credential is stored in the
# notebook. An empty string means "no key supplied".
WANDB_API_KEY = os.environ.get('WANDB_API_KEY', '')

# Splitting method configuration
SPLITTING_METHOD = "random"  # Options: "random", "stratified_kfold", "scaffold"
N_SPLITS = 5  # For K-fold cross-validation
TEST_SIZE = 0.2
VALID_SIZE = 0.2
SEED = 42

# Model hyperparameters
model_name = 'ChemBERTa-molecular-classification'
project_name = 'molecular-classification'
EPOCHS = 10
BATCH_SIZE = 16
learning_rate = 2e-5
optimizer = 'AdamW'
patience = 3
manual_seed = SEED  # reuse the split seed for model training

print(f"Configuration loaded - Splitting method: {SPLITTING_METHOD}")

Configuration loaded - Splitting method: random


In [None]:
# Initialize wandb
# Log in only when a real key was supplied: an empty value or the untouched template
# placeholder is treated as "not provided" instead of being sent to wandb.login.
if WANDB_API_KEY and WANDB_API_KEY != 'YOUR_WANDB_API_KEY':
    wandb.login(key=WANDB_API_KEY)
else:
    print("Warning: WANDB_API_KEY not provided. Wandb logging may not work.")

In [4]:
def compute_metrics(preds, labels):
    """
    Compute classification metrics including sensitivity and specificity.

    Args:
        preds: 2-D array of per-class scores, shape (n_samples, n_classes).
        labels: 1-D array of true labels, shape (n_samples,).
            Assumes classes are encoded as integers 0..n_classes-1 -- TODO confirm
            against the dataset's label column.

    Returns:
        dict: for two classes, the confusion-matrix counts (TN, FP, FN, TP) plus
        sensitivity and specificity; otherwise only the accuracy.
    """
    pred_labels = preds.argmax(axis=1)

    # Pin the label set from the score matrix width. Without it, a batch in which
    # only one class occurs yields a 1x1 matrix and a binary problem would silently
    # fall through to the multi-class branch.
    n_classes = preds.shape[1]
    cm = confusion_matrix(labels, pred_labels, labels=list(range(n_classes)))

    # Handle binary classification
    if cm.shape == (2, 2):
        TN, FP, FN, TP = cm.ravel()
        # Report 0 rather than dividing by zero when a class has no true samples.
        sensitivity = TP / (TP + FN) if (TP + FN) > 0 else 0
        specificity = TN / (TN + FP) if (TN + FP) > 0 else 0

        metrics = {
            "TN": int(TN),
            "FP": int(FP),
            "FN": int(FN),
            "TP": int(TP),
            "sensitivity": float(sensitivity),
            "specificity": float(specificity),
        }
    else:
        # Multi-class classification
        accuracy = accuracy_score(labels, pred_labels)
        metrics = {
            "accuracy": float(accuracy)
        }

    # Logging is best-effort (e.g. no active wandb run), but only ordinary errors are
    # suppressed so that KeyboardInterrupt/SystemExit still propagate.
    try:
        wandb.log(metrics)
    except Exception:
        pass

    return metrics

In [None]:
# Extra keyword arguments for the wandb run: name the run after the model.
wandb_kwargs = {'name': model_name}

# Training/evaluation options handed to ClassificationModel via `args=`.
# NOTE(review): 'compute_metrics' (and possibly 'auto_weights' / 'save_limit') do not
# look like documented simpletransformers options -- confirm they are actually
# honoured; extra metrics are usually passed as keyword arguments to
# train_model()/eval_model().
classification_args = {
    # Evaluation schedule
    'evaluate_each_epoch': True,
    'evaluate_during_training_verbose': True,
    'evaluate_during_training': True,
    # Checkpointing: keep only the best model
    'best_model_dir': MODEL_FOLDER,
    'no_save': False,
    'save_eval_checkpoints': False,
    'save_model_every_epoch': False,
    'save_best_model': True,
    'save_steps': -1,
    'save_limit': 1,
    # Training length and early stopping on validation loss
    'num_train_epochs': EPOCHS,
    'use_early_stopping': True,
    'early_stopping_patience': patience,
    'early_stopping_delta': 0.001,
    'early_stopping_metric': 'eval_loss',
    'early_stopping_metric_minimize': True,
    'early_stopping_consider_epochs': True,
    # Optimisation
    'fp16': False,
    'optimizer': optimizer,
    'adam_betas': (0.95, 0.999),
    'learning_rate': learning_rate,
    'manual_seed': manual_seed,
    'train_batch_size': BATCH_SIZE,
    'eval_batch_size': BATCH_SIZE,
    'auto_weights': True,
    # Experiment tracking
    'wandb_project': project_name,
    'wandb_kwargs': wandb_kwargs,
    'compute_metrics': compute_metrics,
    # Run preprocessing and evaluation in-process
    'use_multiprocessing' : False,
    'use_multiprocessing_for_evaluation': False,
}


print("Model configuration completed.")

### Dataset loading & splitting

In [None]:
# Read the CSV at DATA_PATH and report what was loaded
df = pd.read_csv(DATA_PATH)
print(f"Loaded CSV with shape: {df.shape}")
print("Columns:", df.columns.tolist())

# Both columns used downstream must be present before going any further
required_columns = ("SMILES", "label")
assert all(column in df.columns for column in required_columns), "Expected columns: SMILES and label"

# Keep only rows where both the SMILES string and the label are available
df = df.dropna(subset=list(required_columns))
print(f"After removing NaN values: {df.shape}")

# Show the class balance, first as raw counts and then as fractions
for heading, as_fraction in (
    ("\nLabel distribution:", False),
    ("\nLabel distribution (normalized):", True),
):
    print(heading)
    print(df['label'].value_counts(normalize=as_fraction))

In [None]:
# Scaffold splitting function
def scaffold_split(df, test_size=0.2, valid_size=0.2, seed=42):
    """
    Split dataset based on molecular scaffolds to avoid data leakage.

    Rows are grouped by the Murcko scaffold SMILES of their molecule so that every
    row sharing a scaffold ends up in the same partition. Rows whose SMILES cannot be
    parsed, or whose scaffold computation fails, are grouped by the raw SMILES value.

    Groups are visited from largest to smallest and assigned greedily: to the test set
    until it holds at least ``test_size`` of the rows, then to the validation set until
    it holds at least ``valid_size`` of the rows, and the remainder to the training
    set. Because whole groups are assigned, partitions can exceed their targets.
    NOTE(review): this places the largest scaffolds in test first; common scaffold
    splitters fill train first -- confirm this ordering is intended.

    Args:
        df: DataFrame with a 'SMILES' column.
        test_size: target fraction of rows for the test set.
        valid_size: target fraction of rows for the validation set.
        seed: value used to seed NumPy's global RNG.

    Returns:
        tuple: (train_df, valid_df, test_df), each with a fresh 0..n-1 index.
    """
    # Seeds the global NumPy RNG; the split below draws no random numbers itself.
    np.random.seed(seed)

    # Map scaffold key -> positional row indices
    scaffolds = defaultdict(list)

    for idx, smiles in enumerate(df['SMILES']):
        try:
            mol = Chem.MolFromSmiles(smiles)
            if mol is not None:
                scaffold = MurckoScaffold.MurckoScaffoldSmiles(mol=mol, includeChirality=False)
            else:
                # If molecule parsing fails, use SMILES as scaffold
                scaffold = smiles
        except Exception:
            # If scaffold generation fails, use SMILES as scaffold. Only ordinary
            # errors are caught so that KeyboardInterrupt still stops a long run.
            scaffold = smiles
        scaffolds[scaffold].append(idx)

    # Largest groups first; the sort is stable, so ties keep first-seen order
    scaffold_sets = sorted(scaffolds.values(), key=len, reverse=True)

    # Row-count targets for the held-out partitions
    total_size = len(df)
    test_target = int(total_size * test_size)
    valid_target = int(total_size * valid_size)

    train_indices, valid_indices, test_indices = [], [], []
    valid_size_current, test_size_current = 0, 0

    for scaffold_set in scaffold_sets:
        if test_size_current < test_target:
            test_indices.extend(scaffold_set)
            test_size_current += len(scaffold_set)
        elif valid_size_current < valid_target:
            valid_indices.extend(scaffold_set)
            valid_size_current += len(scaffold_set)
        else:
            train_indices.extend(scaffold_set)

    # Indices are positional (from enumerate), hence iloc
    train_df = df.iloc[train_indices].reset_index(drop=True)
    valid_df = df.iloc[valid_indices].reset_index(drop=True)
    test_df = df.iloc[test_indices].reset_index(drop=True)

    return train_df, valid_df, test_df

In [None]:
# Partition the dataset according to SPLITTING_METHOD
def _describe_partitions(train_part, valid_part, test_part):
    """Print the shape and the normalized label mix of each partition."""
    print(f"Train: {train_part.shape}")
    print(f"Valid: {valid_part.shape}")
    print(f"Test : {test_part.shape}")

    # Check label distribution
    print("\nLabel distribution:")
    print("Train:\n", train_part["label"].value_counts(normalize=True))
    print("Valid:\n", valid_part["label"].value_counts(normalize=True))
    print("Test:\n", test_part["label"].value_counts(normalize=True))


# Reject unsupported methods before doing any work
if SPLITTING_METHOD not in ("random", "scaffold", "stratified_kfold"):
    raise ValueError(f"Unknown splitting method: {SPLITTING_METHOD}")

if SPLITTING_METHOD == "stratified_kfold":
    print(f"Using Stratified K-Fold Cross-Validation (k={N_SPLITS})...")

    # Hold out a stratified test set; the remainder is split into folds at training time
    train_val_df, test_df = train_test_split(
        df,
        test_size=TEST_SIZE,
        stratify=df["label"],
        random_state=SEED,
    )

    print(f"Train+Valid: {train_val_df.shape}")
    print(f"Test: {test_df.shape}")
else:
    if SPLITTING_METHOD == "random":
        print("Using Random Splitting...")

        # Carve off the test set first, then take the validation set out of the
        # remainder. VALID_SIZE is a fraction of that remainder, so the validation
        # share of the full dataset is VALID_SIZE * (1 - TEST_SIZE).
        train_val_df, test_df = train_test_split(df, test_size=TEST_SIZE, random_state=SEED)
        train_df, valid_df = train_test_split(train_val_df, test_size=VALID_SIZE, random_state=SEED)
    else:
        print("Using Scaffold Splitting...")

        train_df, valid_df, test_df = scaffold_split(df, TEST_SIZE, VALID_SIZE, SEED)

    _describe_partitions(train_df, valid_df, test_df)

    # Log once per epoch: number of training batches, never less than one step
    classification_args['logging_steps'] = max(1, len(train_df) // BATCH_SIZE)

### Training & Evaluation

In [None]:
# Train and evaluate according to the split prepared earlier:
#   * "random" / "scaffold": one training run, validated on valid_df, scored on test_df
#   * "stratified_kfold": one model per fold of train_val_df, scored on that fold's
#     validation part, followed by a mean/std accuracy summary
if SPLITTING_METHOD in ["random", "scaffold"]:
    print(f"\nTraining with {SPLITTING_METHOD} splitting...")

    # Prepare data for simpletransformers (rename columns)
    train_df_model = train_df.rename(columns={"SMILES": "text", "label": "labels"}).copy()
    valid_df_model = valid_df.rename(columns={"SMILES": "text", "label": "labels"}).copy()
    test_df_model = test_df.rename(columns={"SMILES": "text", "label": "labels"}).copy()
    classification_args["output_dir"] = f"outputs/exp_2_bbbp_{SPLITTING_METHOD}/"

    # Initialize model
    model = ClassificationModel('roberta', 'DeepChem/ChemBERTa-77M-MLM', args=classification_args)

    # Train model
    print("Starting training...")
    model.train_model(train_df_model, eval_df=valid_df_model)

    # Evaluate on test set
    print("\nEvaluating on test set...")
    result, model_outputs, wrong_predictions = model.eval_model(test_df_model, acc=accuracy_score)
    print("Test Results:", result)

elif SPLITTING_METHOD == "stratified_kfold":
    print(f"\nTraining with Stratified K-Fold Cross-Validation (k={N_SPLITS})...")

    from sklearn.model_selection import StratifiedKFold
    from sklearn.metrics import accuracy_score

    X = train_val_df["SMILES"]
    y = train_val_df["label"]
    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

    acc_list = []
    fold_results = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), start=1):
        print(f"\n=== Fold {fold}/{N_SPLITS} ===")

        fold_train_df = train_val_df.iloc[train_idx].rename(columns={"SMILES": "text", "label": "labels"}).copy()
        fold_val_df = train_val_df.iloc[val_idx].rename(columns={"SMILES": "text", "label": "labels"}).copy()

        # Per-fold settings: log once per epoch and keep each fold's artifacts apart
        classification_args["logging_steps"] = max(1, len(fold_train_df) // BATCH_SIZE)
        fold_dir = os.path.join(MODEL_FOLDER, f"fold_{fold}")
        os.makedirs(fold_dir, exist_ok=True)
        classification_args["best_model_dir"] = fold_dir
        # Name the output directory after the active splitting method so that k-fold
        # runs are not written under a "random" experiment folder.
        # NOTE(review): this branch uses "clintox" while the single-split branch uses
        # "bbbp" -- confirm which dataset name is intended.
        classification_args["output_dir"] = f"outputs/exp_2_clintox_{SPLITTING_METHOD}/fold_{fold}"

        # Use the default CUDA device, as the single-split branch does, rather than a
        # hard-coded device index that only exists on hosts exposing five or more GPUs.
        model = ClassificationModel("roberta", "DeepChem/ChemBERTa-77M-MLM", args=classification_args)

        print(f"Training fold {fold}...")
        model.train_model(fold_train_df, eval_df=fold_val_df)

        result, _, _ = model.eval_model(fold_val_df, acc=accuracy_score)
        acc = result.get("acc", 0)
        print(f"Fold {fold} Accuracy: {acc:.4f}")

        acc_list.append(acc)
        fold_results.append(result)

    # K-Fold summary
    print("\nCross-validation summary:")
    for i, acc in enumerate(acc_list, start=1):
        print(f"Fold-{i}: {acc:.4f}")
    print(f"Mean Accuracy: {np.mean(acc_list):.4f} ± {np.std(acc_list):.4f}")