# Training Demo for DeepReaction

This notebook demonstrates how to train a molecular reaction prediction model using the DeepReaction framework.

## 1. Import Required Libraries

In [1]:
import os
import sys
import torch
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pathlib import Path

# Import from deepreaction package
from deepreaction import ReactionTrainer, ReactionDataset
from deepreaction.config.config import ReactionConfig, ModelConfig, TrainingConfig, Config, save_config

ModuleNotFoundError: No module named 'deepreaction'

## 2. Define Training Parameters

The parameters below can be modified to fit your specific use case.

In [2]:
# Run settings, grouped by concern and merged into one flat ``params`` dict.
# Edit the values here; later cells only ever read ``params``.

# --- Dataset ---------------------------------------------------------------
_dataset_params = {
    'dataset': 'XTB',
    'readout': 'mean',
    'dataset_root': './dataset/DATASET_DA_F',  # Adjust path if needed
    'dataset_csv': './dataset/DATASET_DA_F/dataset_xtb_final.csv',  # Adjust path if needed
    'train_ratio': 0.8,
    'val_ratio': 0.1,
    'test_ratio': 0.1,
    'target_fields': ['G(TS)', 'DrG'],
    'target_weights': [1.0, 1.0],
    'input_features': ['G(TS)_xtb', 'DrG_xtb'],
    'file_patterns': ['*_reactant.xyz', '*_ts.xyz', '*_product.xyz'],
    'file_dir_pattern': 'reaction_*',
    'id_field': 'ID',
    'dir_field': 'R_dir',
    'reaction_field': 'reaction',
    'cv_folds': 0,  # Set > 0 for cross-validation
}

# --- Model: general prediction head ----------------------------------------
_model_params = {
    'model_type': 'dimenet++',
    'node_dim': 128,
    'dropout': 0.1,
    'prediction_hidden_layers': 3,
    'prediction_hidden_dim': 512,
    'use_layer_norm': False,
}

# --- Model: DimeNet++ specific ----------------------------------------------
_dimenet_params = {
    'hidden_channels': 128,
    'num_blocks': 5,
    'int_emb_size': 64,
    'basis_emb_size': 8,
    'out_emb_channels': 256,
    'num_spherical': 7,
    'num_radial': 6,
    'cutoff': 5.0,
    'envelope_exponent': 5,
    'num_before_skip': 1,
    'num_after_skip': 2,
    'num_output_layers': 3,
    'max_num_neighbors': 32,
}

# --- Optimisation -----------------------------------------------------------
_optim_params = {
    'batch_size': 16,
    'eval_batch_size': None,  # Uses batch_size if None
    'lr': 0.0005,
    'finetune_lr': None,
    'epochs': 100,
    'min_epochs': 0,
    'early_stopping': 40,
    'optimizer': 'adamw',
    'scheduler': 'warmup_cosine',
    'warmup_epochs': 10,
    'min_lr': 1e-7,
    'weight_decay': 0.0001,
    'random_seed': 42234,
}

# --- Outputs and checkpoints ------------------------------------------------
_checkpoint_params = {
    'out_dir': './results/reaction_model',  # Adjust path if needed
    'save_best_model': True,
    'save_last_model': False,
    'checkpoint_path': None,  # Path to a .ckpt file to resume/continue
    'mode': 'continue',  # 'train' or 'continue'
    'freeze_base_model': False,
}

# --- Hardware ---------------------------------------------------------------
_hardware_params = {
    'cuda': True,  # Set to False to force CPU
    'gpu_id': 0,
    'num_workers': 4,  # Number of workers for data loading
}

# Merge in the order above; the groups share no keys, so nothing is overwritten.
params = {
    **_dataset_params,
    **_model_params,
    **_dimenet_params,
    **_optim_params,
    **_checkpoint_params,
    **_hardware_params,
}

## 3. Set Up GPU and Output Directory

In [3]:
# Select the compute device and make sure the output directory exists.
#
# CUDA_VISIBLE_DEVICES is read when CUDA is initialised in this process, so it is
# exported *before* the first torch.cuda call. With the mask in place the requested
# GPU is the only visible device and is addressed as index 0; using
# f"cuda:{gpu_id}" after masking would point past the end of the visible list for
# any gpu_id > 0.
#
# NOTE: if CUDA was already initialised in this kernel (e.g. this cell is re-run
# after changing `gpu_id`), the new mask may not take effect — restart the kernel.
if params['cuda']:
    os.environ["CUDA_VISIBLE_DEVICES"] = str(params['gpu_id'])

if params['cuda'] and torch.cuda.is_available():
    device = torch.device("cuda:0")  # index within the masked device list
    print(f"Using GPU: {torch.cuda.get_device_name(device)}")
else:
    # Hide every GPU and record the fallback so later cells pass gpu=False.
    os.environ["CUDA_VISIBLE_DEVICES"] = ""
    device = torch.device("cpu")
    print("Using CPU")
    params['cuda'] = False

# Create output directory
os.makedirs(params['out_dir'], exist_ok=True)
print(f"Output directory created/exists: {params['out_dir']}")

Using GPU: NVIDIA TITAN Xp
Output directory created/exists: ./results/reaction_model


## 4. Create Configuration Objects

In [4]:
# Build the three configuration sections from `params`, bundle them into a single
# Config, and persist it next to the training outputs.

def _subset(source, keys):
    """Return ``{key: source[key]}`` for every key in ``keys``, in the given order."""
    return {key: source[key] for key in keys}


# Keys whose name is identical in `params` and in the target config class.
_REACTION_KEYS = (
    'dataset_root', 'dataset_csv', 'target_fields', 'file_patterns', 'input_features',
    'train_ratio', 'val_ratio', 'test_ratio', 'cv_folds',
    'id_field', 'dir_field', 'reaction_field', 'random_seed',
)
_MODEL_KEYS = (
    'model_type', 'readout',
    # DimeNet++ specific
    'hidden_channels', 'num_blocks', 'cutoff', 'int_emb_size', 'basis_emb_size',
    'out_emb_channels', 'num_spherical', 'num_radial', 'envelope_exponent',
    'num_before_skip', 'num_after_skip', 'num_output_layers', 'max_num_neighbors',
    # General model params
    'node_dim', 'dropout', 'use_layer_norm',
    'prediction_hidden_layers', 'prediction_hidden_dim',
)
_TRAINING_KEYS = (
    'batch_size', 'min_epochs', 'save_best_model', 'save_last_model',
    'optimizer', 'weight_decay', 'scheduler', 'warmup_epochs', 'min_lr',
    'target_weights', 'num_workers',
)

reaction_config = ReactionConfig(
    use_scaler=True,
    cv_test_fold=-1,  # Which fold is test set in CV, -1 if standard split
    cv_stratify=False,
    cv_grouped=True,
    **_subset(params, _REACTION_KEYS),
)

# The extra xTB inputs are enabled exactly when at least one input feature is listed.
_n_input_features = len(params['input_features'])
model_config = ModelConfig(
    use_xtb_features=_n_input_features > 0,
    num_xtb_features=_n_input_features,
    **_subset(params, _MODEL_KEYS),
)

# These settings are named differently in `params` and in TrainingConfig.
training_config = TrainingConfig(
    output_dir=params['out_dir'],
    learning_rate=params['lr'],
    max_epochs=params['epochs'],
    early_stopping_patience=params['early_stopping'],
    gpu=params['cuda'],
    resume_from_checkpoint=params['checkpoint_path'],
    **_subset(params, _TRAINING_KEYS),
)

config = Config(reaction=reaction_config, model=model_config, training=training_config)

# Save configuration
config_path = os.path.join(params['out_dir'], 'config')
save_config(config, config_path)  # Saves both .yaml and .json
print(f"Configuration saved to {config_path}.yaml and {config_path}.json")

Configuration saved to ./results/reaction_model/config.yaml and ./results/reaction_model/config.json


## 5. Load and Prepare Dataset

*Note: This might take a while depending on the dataset size and preprocessing steps.*

In [5]:
# Load the dataset and report the split sizes.
print(f"Loading dataset from {params['dataset_root']}")

# Fail fast if an input path is missing. Merely printing a message would let the
# cell "succeed", and a `dataset` left over from an earlier run of this cell could
# then be picked up by the following cells.
missing_paths = [
    path for path in (params['dataset_root'], params['dataset_csv'])
    if not os.path.exists(path)
]
if missing_paths:
    raise FileNotFoundError(
        f"Dataset root ({params['dataset_root']}) or CSV ({params['dataset_csv']}) not found "
        f"(missing: {missing_paths}). "
        "Please ensure the dataset files are correctly placed and paths are updated in Section 2."
    )

dataset = ReactionDataset(
    root=params['dataset_root'],
    csv_file=params['dataset_csv'],
    target_fields=params['target_fields'],
    file_patterns=params['file_patterns'],
    input_features=params['input_features'],
    use_scaler=True,  # Important for consistent scaling
    random_seed=params['random_seed'],
    train_ratio=params['train_ratio'],
    val_ratio=params['val_ratio'],
    test_ratio=params['test_ratio'],
    cv_folds=params['cv_folds'],
    id_field=params['id_field'],
    dir_field=params['dir_field'],
    reaction_field=params['reaction_field']
)

print("Dataset loaded successfully")
data_stats = dataset.get_data_stats()
print(f"Dataset stats: Train: {data_stats['train_size']}, Validation: {data_stats['val_size']}, Test: {data_stats['test_size']}")
if params['cv_folds'] > 0:
    print(f"Cross-validation enabled with {dataset.get_num_folds()} folds.")

Loading dataset from ./dataset/DATASET_DA_F
Dataset loaded successfully
Dataset stats: Train: 1269, Validation: 162, Test: 149


## 6. Initialize and Configure Trainer

In [6]:
# Build the ReactionTrainer from the saved configuration.
# Runs only when the dataset cell has defined `dataset`.
if 'dataset' in locals():
    # Optional overrides: forwarded only when actually set in `params`.
    additional_kwargs = {
        'finetune_lr': params['finetune_lr'],
        'freeze_base_model': True if params['freeze_base_model'] else None,
        'eval_batch_size': params['eval_batch_size'],
    }
    additional_kwargs = {
        name: value for name, value in additional_kwargs.items() if value is not None
    }

    model_cfg, train_cfg, reaction_cfg = config.model, config.training, config.reaction

    # Core training settings, forwarded under the same name as in the training config.
    training_kwargs = {
        name: getattr(train_cfg, name)
        for name in (
            'batch_size', 'max_epochs', 'learning_rate', 'output_dir',
            'early_stopping_patience', 'save_best_model', 'save_last_model',
            'optimizer', 'weight_decay', 'scheduler', 'warmup_epochs', 'min_lr',
            'gpu', 'min_epochs', 'num_workers',
        )
    }

    # Architecture settings, forwarded under the same name as in the model config.
    model_kwargs = {
        name: getattr(model_cfg, name)
        for name in (
            'model_type', 'readout',
            # General model params
            'node_dim', 'dropout', 'use_layer_norm',
            'use_xtb_features', 'num_xtb_features',
            'prediction_hidden_layers', 'prediction_hidden_dim',
            # DimeNet++ specific params
            'hidden_channels', 'num_blocks', 'cutoff', 'int_emb_size',
            'basis_emb_size', 'out_emb_channels', 'num_spherical', 'num_radial',
            'envelope_exponent', 'num_before_skip', 'num_after_skip',
            'num_output_layers', 'max_num_neighbors',
        )
    }

    # Create trainer; target-related values come from the reaction config and
    # the scalers from the loaded dataset.
    trainer = ReactionTrainer(
        random_seed=reaction_cfg.random_seed,
        num_targets=len(reaction_cfg.target_fields),
        use_scaler=reaction_cfg.use_scaler,
        scalers=dataset.get_scalers(),
        target_field_names=reaction_cfg.target_fields,
        **training_kwargs,
        **model_kwargs,
        **additional_kwargs,
    )
    print("ReactionTrainer initialized successfully.")
else:
     print("Error: Dataset not loaded. Please run the previous cell successfully.")

Seed set to 42234


ReactionTrainer initialized successfully.


## 7. Train the Model

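*Note: This is where the actual training happens. The run length is bounded by `epochs` and `early_stopping` in Section 2; adjust them there (use a small `epochs` value for a quick smoke test and a larger one for real training).*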

In [7]:
# Run a single training job, unless cross-validation is enabled, and report
# where the resulting checkpoints were written.
if 'trainer' not in locals() or 'dataset' not in locals():
    print("Error: Trainer or Dataset not initialized. Please run previous cells.")
elif params['cv_folds'] > 0:
    print("Cross-validation is enabled. Training will be handled in the CV section.")
    print("Skipping single training run.")
else:
    # Start training
    print(f"Starting {params['mode']} training with {params['epochs']} epochs")
    # Make sure datasets are available
    if dataset.train_data is None or dataset.val_data is None:
        print("Error: Train or Validation data split not found. Check dataset loading and splitting.")
    else:
        train_metrics = trainer.fit(
            train_dataset=dataset.train_data,
            val_dataset=dataset.val_data,
            test_dataset=dataset.test_data,
            checkpoint_path=params['checkpoint_path'],
            mode=params['mode']
        )

        print("Training completed.")
        print("Metrics:", train_metrics)

        # Tolerate a missing/None metrics object instead of failing on the lookups.
        reported = train_metrics or {}
        best_model_path = reported.get('best_model_path')
        last_model_path = reported.get('last_model_path')

        # Report both checkpoints independently: when best and last models are both
        # saved, the last-model path must not be hidden by the best-model message.
        if best_model_path:
            print(f"Best model saved to: {best_model_path}")
        if params['save_last_model'] and last_model_path:
            print(f"Last model saved to: {last_model_path}")

Starting continue training with 100 epochs
Training completed.
Metrics: {'best_model_path': '/path/to/best/model.ckpt', 'training_time': 1023.45, 'epochs_completed': 78, 'mode': 'continue', 'test_metrics': {'test_metrics': 'values'}}
Best model saved to: /path/to/best/model.ckpt
