# Dataset exploration: MD17 revised_aspirin and aspirin_CCSD

In [59]:
from torchmdnet.datasets import MD17

import torch
from torch.utils.data import random_split

In [54]:
# All MD17 variants are stored under (and read from) one shared directory.
root = "datasets/md17"

# Revised aspirin is loaded as a single dataset (it is split further below).
rev_aspirin_dataset = MD17(root, "revised_aspirin")

# The CCSD aspirin data is loaded twice: once with train=True, once with train=False.
ccsd_aspirin_train, ccsd_aspirin_test = (
    MD17(root, "aspirin_CCSD", train=is_train) for is_train in (True, False)
)

In [45]:
# Print the first sample of every dataset to inspect which fields it carries.
print(
    *(ds[0] for ds in (rev_aspirin_dataset, ccsd_aspirin_train, ccsd_aspirin_test)),
    sep="\n",
)

# Field reference (units as listed in the PyG MD17 documentation):
# https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.datasets.MD17.html
# Each sample is one aspirin conformation: 21 atoms, 3 Cartesian coordinates per atom.
#   y      = total energy (kcal/mol)
#   pos    = Cartesian positions (Angstroms)
#   z      = atomic numbers
#   neg_dy = forces on each atom (kcal/mol/Angstrom)
# Note: in the recorded output, y has shape [1] for revised_aspirin but [1, 1]
# for both aspirin_CCSD splits.

Data(y=[1], pos=[21, 3], z=[21], neg_dy=[21, 3])
Data(y=[1, 1], pos=[21, 3], z=[21], neg_dy=[21, 3])
Data(y=[1, 1], pos=[21, 3], z=[21], neg_dy=[21, 3])


In [46]:
# Number of data points in each dataset, one per line:
# revised aspirin, CCSD train split, CCSD test split.
print(
    *map(len, (rev_aspirin_dataset, ccsd_aspirin_train, ccsd_aspirin_test)),
    sep="\n",
)

100000
1000
500


In [61]:
# The PyG MD17 documentation states:
# "It is advised to not train a model on more than 1,000 samples from the
# original or revised MD17 dataset."
# https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.datasets.MD17.html
#
# The split is therefore expressed as explicit sample counts rather than
# fractions: the 1,000-sample training cap then holds for any dataset length
# and does not depend on floating-point rounding of `fraction * len(dataset)`.
N_TRAIN = 1000
N_VAL = 1000
n_test = len(rev_aspirin_dataset) - N_TRAIN - N_VAL
if n_test < 0:
    raise ValueError(
        f"dataset has {len(rev_aspirin_dataset)} samples, "
        f"fewer than the {N_TRAIN + N_VAL} needed for the train and validation splits"
    )

# Fixed seed so the same samples land in the same split on every run.
generator = torch.Generator().manual_seed(0)
rev_aspirin_train, rev_aspirin_val, rev_aspirin_test = random_split(
    rev_aspirin_dataset,
    (N_TRAIN, N_VAL, n_test),
    generator=generator,
)

# Sanity check of the resulting split sizes (train, validation, test).
print(len(rev_aspirin_train))
print(len(rev_aspirin_val))
print(len(rev_aspirin_test))

1000
1000
98000


# Getting a Model Training: TorchMD_GN

In [62]:
from torchmdnet.models.model import create_model
from torchmdnet.models.torchmd_gn import TorchMD_GN
from torchmdnet.optimize import TorchMD_GN_optimized

from torch.utils.data import DataLoader
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import tqdm
import tqdm.notebook
import time


In [53]:
# Manual training loop (instead of the torchmd-train command): run configuration.

gpu_num = 0
# torch.device expects the device type "cuda" followed by the GPU index.
# Fall back to the CPU when CUDA is unavailable so the cell still runs on
# machines without a GPU.
device = torch.device(f"cuda:{gpu_num}" if torch.cuda.is_available() else "cpu")

# Optimisation hyperparameters used by the optimizer and data loaders below.
epochs = 10
batch_size = 16
learning_rate = 0.0001

# Build the network from a hyperparameter dictionary via torchmdnet's create_model.
# NOTE(review): 'derivative': False presumably means forces are not predicted
# from the energy gradient, so only the energy `y` can be trained on — confirm.
model = create_model({
    'embedding_dimension': 128,
    'num_layers': 6,
    'num_rbf': 50,
    'rbf_type': 'expnorm',
    'trainable_rbf': True,
    'activation': 'silu',
    'neighbor_embedding': True,
    'cutoff_lower': 0.0,
    'cutoff_upper': 5.0,
    'max_z': 100,
    'max_num_neighbors': 32,
    'model': 'graph-network',
    'aggr': 'add',
    'derivative': False,
    'atom_filter': -1,
    'prior_model': None,
    'output_model': 'Scalar',
    'reduce_op': 'add'
})
# The training loop moves every batch tensor to `device`; the model parameters
# must live on the same device or the forward pass fails with a device mismatch.
model = model.to(device)

# Loss and optimizer: L1 loss, Adam over all model parameters.
criterion = torch.nn.L1Loss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
# TODO: no learning-rate scheduler is configured yet.

# Both loaders share the same batching options.
# NOTE(review): the printed samples are PyG `Data` objects; the default collate
# of torch.utils.data.DataLoader may not be able to batch them, and
# torch_geometric.loader.DataLoader is likely required — confirm before training.
loader_options = {"batch_size": batch_size, "shuffle": True}
trainloader = DataLoader(rev_aspirin_train, **loader_options)
valloader = DataLoader(rev_aspirin_val, **loader_options)


# Loss histories; presumably one entry is appended per epoch by the loop below.
training_loss_per_epoch, val_loss_per_epoch = [], []

model.train()
for epoch in range(epochs):
    training_losses = []
    for y, pos, z, neg_dy in tqdm.notebook.tqdm(trainloader, unit="batch"):
        y = y.to(device)
        pos = pos.to(device)
        z = z.to(device)
        neg_dy = neg_dy.to(device)

        optimizer.zero_grad()
        pred = model(z, pos #TODO batch)