### MDCATH DATASET IN MACHINE LEARNING FRAMEWORK

This tutorial provides a practical example of training ML models using the mdCATH dataset in TorchMD-Net. Before you begin, please ensure that TorchMD-Net is correctly installed. You can find installation instructions and further details [here](https://torchmd-net.readthedocs.io/en/latest/installation.html). Note that the MDCATH dataloader is available in TorchMD-Net version 2.4.0 and later.

In [1]:
import os
import torch
import lightning.pytorch as pl
from torchmdnet.data import DataModule
from torchmdnet.module import LNNP
from torchmdnet.scripts.train import get_args
from lightning.pytorch.callbacks import TQDMProgressBar, ModelCheckpoint
from lightning.pytorch.loggers import CSVLogger

  impl_abstract(
  impl_abstract(


In [2]:
# Define the arguments
# Start from the default arguments provided by tmdnet, exposed as a plain dictionary
args = vars(get_args())

# Tutorial-specific values that override the defaults
pargs = {
    # DATA
    'dataset': 'MDCATH',
    'dataset_arg': {
        'numAtoms': None,
        'numResidues': None,
        'pdb_list': ['1balA00', '1ce3A00', '1e8rA00'],
        'temperatures': ['348'],
        'skip_frames': 2,
        'solid_ss': None,
    },
    'dataset_root': 'data',
    # MODEL
    'model': 'tensornet',
    'embedding_dimension': 32,
    'num_layers': 0,
    'num_rbf': 8,
    'rbf_type': 'expnorm',
    'activation': 'silu',
    'cutoff_lower': 0.0,
    'cutoff_upper': 5.0,
    'max_z': 20,
    'max_num_neighbors': 48,
    'derivative': True,
    'check_errors': True,
    'static_shapes': False,
    # TRAIN
    'seed': 1,  # explicit so the run is reproducible and the value is easy to tune
    'num_epochs': 10,
    'batch_size': 3,
    'train_size': 200,
    'val_size': 50,
    'test_size': 100,
    'lr': 1e-3,
    'lr_metric': 'val',
    'log_dir': 'logs/',
    'num_workers': 2,
}

# Update the default arguments with the new ones
args.update(pargs)
os.makedirs(args['log_dir'], exist_ok=True)

# Seed Python, NumPy and PyTorch (and the dataloader workers) before the model
# is created, so that a "Restart & Run All" reproduces the same training run.
pl.seed_everything(args['seed'], workers=True)

In [3]:
# Build the data pipeline around the MDCATH torch_geometric dataset class.
# If the h5 files are not present in 'dataset_root' they will be downloaded from HF;
# the download process can take some time.
data = DataModule(args)
data.prepare_data()
data.setup(stage="fit")

Processing mdcath source: 100%|██████████| 3/3 [00:00<00:00, 13.53it/s]

train 200, val 50, test 100



  rank_zero_warn(f"{dset_len - total} samples were excluded from the dataset")


In [4]:
# Wrap the TorchMD-Net Neural Network Potential in its Lightning module.
# No prior model is used; mean and std are taken from the DataModule.
lnnp = LNNP(args, mean=data.mean, std=data.std, prior_model=None)

In [5]:
# Callbacks, used to save model ckpts
# A checkpoint is considered every 2 epochs and only the 3 best ones, ranked by
# the monitored validation loss, are kept in the log directory.
val_loss_name = 'val_total_mse_loss'
checkpoint_callback = ModelCheckpoint(
    dirpath=args['log_dir'],
    monitor=val_loss_name,
    every_n_epochs=2,
    save_top_k=3,
    # The doubled braces survive the f-string as literal "{epoch}" / "{<metric>:.4f}"
    # placeholders, which Lightning fills in when it writes the checkpoint.
    filename=f"epoch={{epoch}}-val_loss={{{val_loss_name}:.4f}}",
    # The template already spells out "epoch=" and "val_loss=". With the default
    # (True) Lightning would prepend the metric name again, producing file names
    # such as "epoch=epoch=8-val_loss=val_total_mse_loss=0.1234.ckpt".
    auto_insert_metric_name=False,
)

In [6]:
# CSV logger for the training process: the training logs are saved in a csv file.
# Empty name and version are passed so no extra name/version is specified for the run.
csv_logger = CSVLogger(save_dir=args['log_dir'], name="", version="")

In [7]:
# Restrict the process to the first GPU, then report what PyTorch can see.
# NOTE(review): this is set after `import torch`; it presumably only takes effect
# if CUDA has not been initialised yet - confirm on multi-GPU machines.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
print(
    f'cuda available: {torch.cuda.is_available()}',
    f'cuda device count: {torch.cuda.device_count()}',
    f'CUDA_VISIBLE_DEVICES ID: {os.environ["CUDA_VISIBLE_DEVICES"]}',
    sep="\n",
)

cuda available: True
cuda device count: 1
CUDA_VISIBLE_DEVICES ID: 0


In [11]:
# Train
# Callbacks attached to the run: checkpoint saving plus a progress bar
# that refreshes at every step.
fit_callbacks = [checkpoint_callback, TQDMProgressBar(refresh_rate=1)]

# Single-device trainer; epochs, precision and output directory come from `args`,
# and metrics are written through the CSV logger.
trainer = pl.Trainer(
    max_epochs=args['num_epochs'],
    precision=args['precision'],
    devices=1,
    strategy="auto",
    default_root_dir=args['log_dir'],
    logger=csv_logger,
    callbacks=fit_callbacks,
)

# ckpt_path=None: start a fresh training run instead of resuming from a checkpoint
trainer.fit(lnnp, data, ckpt_path=None)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type        | Params | Mode 
----------------------------------------------
0 | model | TorchMD_Net | 18.9 K | train
----------------------------------------------
18.9 K    Trainable params
0         Non-trainable params
18.9 K    Total params
0.076     Total estimated model params size (MB)
31        Modules in train mode
0         Modules in eval mode


train 200, val 50, test 100
Epoch 9: 100%|██████████| 67/67 [00:04<00:00, 16.14it/s]                   

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 67/67 [00:04<00:00, 16.03it/s]


In [12]:
# Test
# Read the best checkpoint from the checkpoint callback itself rather than from
# `trainer`: `trainer` is rebound to a fresh Trainer below, so going through it
# would yield an empty path if this cell is executed a second time.
best_ckpt_path = checkpoint_callback.best_model_path
if not best_ckpt_path:
    raise RuntimeError(
        "No checkpoint has been saved yet: run the training cell before testing."
    )
model = LNNP.load_from_checkpoint(best_ckpt_path)

# The model is trained with derivative=True, so evaluating it needs autograd.
# inference_mode=False makes Lightning evaluate under torch.no_grad() instead of
# torch.inference_mode(), which allows gradients to be re-enabled inside the test step.
trainer = pl.Trainer(inference_mode=False)
trainer.test(model, data)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


train 200, val 50, test 100
Testing DataLoader 0: 100%|██████████| 4/4 [00:00<00:00,  9.54it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
   test_neg_dy_l1_loss       4.174280643463135
   test_total_l1_loss        4.174280643463135
     test_y_l1_loss                 0.0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_total_l1_loss': 4.174280643463135,
  'test_y_l1_loss': 0.0,
  'test_neg_dy_l1_loss': 4.174280643463135}]