In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from mlelec.data.dataset import precomputed_molecules, MoleculeDataset, MLDataset, get_dataloader
import torch
from ase.io import read
import ase
from mlelec.models.linear import LinearTargetModel
from mlelec.features.acdc import compute_features_for_target

In [3]:
import sys 
import pyscf, pyscfad
# Record the interpreter and library versions used for the results below.
print(sys.version_info)
print(pyscf.__version__)
print(pyscfad.__version__)


sys.version_info(major=3, minor=11, micro=5, releaselevel='final', serial=0)
2.3.0
0.1.2


In [4]:
# Load the first 100 frames of the water dataset with Fock and dipole targets;
# overlap matrices and orbital labels are requested as auxiliary data.
water_data = MoleculeDataset(
    mol_name='water_1000',
    frame_slice=slice(0, 100),
    device='cuda',
    aux=['overlap', 'orbitals'],
    target=["fock", "dipole_moment"],
)
ml_data = MLDataset(
    molecule_data=water_data,
    device='cuda',
    model_strategy="coupled",
)

# Fixed seed so the shuffle (and therefore the split) is repeatable.
ml_data._shuffle(random_seed=5380)
# 70% train / 20% validation; the remainder is presumably the test set - TODO confirm.
ml_data._split_indices(train_frac=0.7, val_frac=0.2)

# assumed args.model_type = 'acdc'
# Features are only computed when the dataset does not already provide them.
if ml_data.features is None:
    ml_data._set_features(
        compute_features_for_target(ml_data, device='cuda')
    )


Loading structures
examples/data/water_1000/sto-3g/fock.hickle
examples/data/water_1000/sto-3g/dipole_moment.hickle
Computing features with default hypers




In [5]:
# Build the train / validation / test loaders from the split dataset.
# NOTE(review): model_return='tensor' presumably selects dense-tensor targets - confirm against get_dataloader.
train_dl, val_dl, test_dl = get_dataloader(ml_data, model_return= 'tensor')

## Training on a tiny dataset for now

In [6]:
# Model trained directly on the Fock-matrix target in the loop below
# (one layer, 16 hidden units, no bias, placed on the GPU).
model = LinearTargetModel(dataset = ml_data, nlayers = 1, nhidden = 16, bias = False, device = 'cuda')

In [18]:
# `math` (not the complex-math module `cmath`) is the natural home of the float infinity.
from math import inf

# Lowest validation loss seen so far; starting at +inf makes the first
# validation pass always count as an improvement.
best = inf
# Training stops once the number of consecutive non-improving validation
# checks exceeds this value.
early_stop_criteria = 10

In [8]:
import mlelec.metrics as mlmetrics

# --- Direct training of the model on the Fock-matrix target -----------------
# Relies on `model`, `train_dl`, `val_dl`, `ml_data` and `early_stop_criteria`
# defined in earlier cells.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# NOTE(review): the scheduler is created but never stepped; the
# `scheduler.step(train_loss)` call in the loop is left disabled as in the original.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=10, verbose=True)
val_interval = 10
loss_fn = getattr(mlmetrics, 'L2_loss')
losses = []
early_stop_count = 0
# Every other piece of run state (optimizer, losses, early_stop_count) is reset
# in this cell, so reset the best validation loss too. Otherwise a re-run would
# compare against a stale value left in the kernel by a previous run.
best = float('inf')

for epoch in range(300):
    model.train(True)
    train_loss = 0
    for data in train_dl:
        optimizer.zero_grad()
        pred = model(data['input'], return_type='tensor', batch_indices=data['idx'])
        loss = loss_fn(pred, data['output'])
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
    # Per-epoch loss accumulated over all training batches.
    losses.append(train_loss)
    # scheduler.step(train_loss)
    model.train(False)

    if epoch % val_interval == 0:
        val_loss = 0
        # Validation needs no gradients: skip graph construction to save memory and time.
        with torch.no_grad():
            for data in val_dl:
                pred = model(data['input'], return_type='tensor', batch_indices=data['idx'])
                vloss = loss_fn(pred, data['output'])
                val_loss += vloss.item()

        if val_loss < best:
            # New best validation loss: checkpoint the weights and reset the patience counter.
            best = val_loss
            torch.save(model.state_dict(), 'best_model.pt')
            early_stop_count = 0
        else:
            early_stop_count += 1

        if early_stop_count > early_stop_criteria:
            print(f'Early stopping at epoch {epoch}')
            print(f'Epoch {epoch}, train loss {train_loss/len(ml_data.train_idx)}')

            print(f'Epoch {epoch} val loss {val_loss/len(ml_data.val_idx)}')
            break

        # Report on validation epochs only, so `val_loss` is always fresh
        # (the original hard-coded `% 10`, which only matched while val_interval == 10).
        print(f'Epoch {epoch}, train loss {train_loss/len(ml_data.train_idx)}')

        print(f'Epoch {epoch} val loss {val_loss/len(ml_data.val_idx)}')
    

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

# `losses` holds one accumulated training loss per epoch (filled by the training loop).
fig, ax = plt.subplots()
ax.loglog(losses)
# Label the figure so it can be read on its own; the trailing `;` hides the repr.
ax.set(xlabel='epoch', ylabel='training loss (summed over batches)', title='Training loss');

NameError: name 'losses' is not defined

### Feed the model's Fock prediction into PySCF to compute the dipole moment

In [7]:
import os
# Select the PyTorch backend for pyscfad. This is set before the pyscfad imports
# below; presumably the variable is read at import time - TODO confirm.
os.environ['PYSCFAD_BACKEND']='torch'

import torch
from pyscf import gto

# NOTE(review): `np` here is pyscfad's numpy module, not the plain `numpy` package.
from pyscfad import numpy as np
from pyscfad import ops
from pyscfad.ml.scf import hf
import pyscf.pbc.tools.pyscf_ase as pyscf_ase

Using PyTorch backend.




In [8]:
from mlelec.data.pyscf_calculator import _instantiate_pyscf_mol
from mlelec.utils.twocenter_utils import fix_orbital_order, unfix_orbital_order
import mlelec.metrics as mlmetrics

In [9]:
import sys
import os
class HiddenPrints:
    """Context manager that silences ``print`` by redirecting ``sys.stdout`` to the null device.

    On entry the current ``sys.stdout`` is remembered and replaced by a handle
    on ``os.devnull``; on exit the remembered stream is restored and the
    null-device handle is closed. ``sys._jupyter_stdout`` is kept in sync with
    ``sys.stdout`` in both directions.
    """

    def __enter__(self):
        self._original_stdout = sys.stdout
        # Keep our own reference to the handle so __exit__ closes exactly this
        # file, even if the body of the `with` block reassigns sys.stdout.
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        sys._jupyter_stdout = sys.stdout
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore first, then close only the handle we opened. Closing
        # `sys.stdout` here could close a stream that belongs to someone else.
        sys.stdout = self._original_stdout
        sys._jupyter_stdout = sys.stdout
        self._devnull.close()

In [10]:
def compute_dipole_moment(frames, fock_predictions, overlaps):
    """Compute one dipole moment per frame from Fock matrices through pyscfad.

    For every frame a pyscf molecule is instantiated, ``mf.eig`` is solved for
    the frame's Fock and overlap matrices, occupations are derived from the
    orbital energies, and the dipole is evaluated from the resulting 1-RDM.

    Args:
        frames: iterable of structures accepted by ``_instantiate_pyscf_mol``.
        fock_predictions: indexable; ``fock_predictions[i]`` is the Fock-matrix
            tensor of frame ``i`` (callers in this notebook convert it to pyscf
            orbital order beforehand).
        overlaps: indexable; ``overlaps[i]`` is the overlap matrix of frame ``i``.

    Returns:
        torch.Tensor: the per-frame dipoles stacked along a new first dimension.
        The result stays connected to the autograd graph of ``fock_predictions``.
    """
    dipoles = []
    for i, frame in enumerate(frames):
        mol = _instantiate_pyscf_mol(frame)
        mf = hf.SCF(mol)

        # Cast to float64 *inside* the autograd graph. The previous
        # `torch.autograd.Variable(..., requires_grad=True)` wrapper produced a
        # new leaf detached from the model, so a loss on the dipoles could not
        # back-propagate into the model parameters.
        fock = fock_predictions[i].to(torch.float64)
        if not fock.requires_grad:
            # Plain inputs (no graph attached): keep the old behaviour of
            # tracking gradients with respect to the Fock matrix itself.
            # `detach()` avoids flipping requires_grad on the caller's tensor.
            fock = fock.detach().requires_grad_(True)

        mo_energy, mo_coeff = mf.eig(fock, overlaps[i])
        mo_occ = mf.get_occ(mo_energy)  # get_occ returns a numpy array
        mo_occ = ops.convert_to_tensor(mo_occ)
        dm1 = mf.make_rdm1(mo_coeff, mo_occ)
        dip = mf.dip_moment(dm=dm1)
        dipoles.append(dip)
    return torch.stack(dipoles)

In [12]:

from IPython.utils import io

# Everything written to stdout inside the block is captured instead of shown
# (presumably pyscf prints a dipole line per frame - TODO confirm).
with io.capture_output() as captured:
    # Call the model object rather than `.forward()` directly, consistent with
    # the training loops (assuming LinearTargetModel is an nn.Module, this also
    # runs any registered hooks).
    fock_predictions = model(ml_data.features, return_type='tensor')
    # convert prediction back to pyscf order
    fock_predictions = unfix_orbital_order(fock_predictions, ml_data.structures, ml_data.molecule_data.aux_data['orbitals'])
    dipole_predictions = compute_dipole_moment(ml_data.structures, fock_predictions, ml_data.molecule_data.aux_data['overlap'])

In [13]:
# L2 loss between predicted and reference dipoles over every frame in the
# dataset, reported divided by the number of predictions.
reference_dipoles = ml_data.molecule_data.target['dipole_moment']
error = mlmetrics.L2_loss(dipole_predictions, reference_dipoles)
print('MSE on dipole', error/len(dipole_predictions))

MSE on dipole tensor(1.2969, device='cuda:0', dtype=torch.float64, grad_fn=<DivBackward0>)


## Indirect learning of the dipole moment through PySCFAD

In [19]:
import mlelec.metrics as mlmetrics

In [20]:
def compute_dipole_moment_from_batchidx(ml_data: MLDataset, batch_fock, batch_indices):
    """Dipole moments for one batch of predicted Fock matrices.

    Args:
        ml_data: dataset providing ``structures`` and the auxiliary
            ``'orbitals'`` / ``'overlap'`` data.
        batch_fock: predicted Fock matrices for the batch.
        batch_indices: indices of the batch's frames within ``ml_data``.

    Returns:
        Whatever ``compute_dipole_moment`` returns for the selected frames.
    """
    aux = ml_data.molecule_data.aux_data
    # Structures that belong to this batch.
    frames = [ml_data.structures[idx] for idx in batch_indices]
    # Bring the predictions back to pyscf orbital order before handing them on.
    fock_pyscf = unfix_orbital_order(batch_fock, frames, aux['orbitals'])
    overlaps = aux['overlap'][batch_indices]
    return compute_dipole_moment(frames, fock_pyscf, overlaps)


In [None]:
# `math` (not the complex-math module `cmath`) is the natural home of the float infinity.
from math import inf

# Lowest validation loss seen so far; starting at +inf makes the first
# validation pass always count as an improvement.
best = inf
# Training stops once the number of consecutive non-improving validation
# checks exceeds this value.
early_stop_criteria = 10

In [21]:
# Validation is run every `val_interval` epochs in the training loop below.
val_interval = 10

# Fresh model and optimizer for the dipole-driven training run.
model = LinearTargetModel(
    dataset=ml_data,
    nlayers=1,
    nhidden=16,
    bias=False,
    device='cuda',
)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    factor=0.5,
    patience=10,
    verbose=True,
)

In [22]:
# --- Indirect training: the loss is taken on the dipole moment obtained by
# --- passing the predicted Fock matrices through pyscfad.
# Relies on `model`, `optimizer`, `val_interval`, `train_dl`, `val_dl`,
# `ml_data`, `io` and `early_stop_criteria` defined in earlier cells.
loss_fn = getattr(mlmetrics, 'L2_loss')
losses = []
early_stop_count = 0
# The model is new, so the best validation loss must start fresh as well.
# Otherwise early stopping would compare against a stale value left in the
# kernel by a previous run.
best = float('inf')
dipole_targets = ml_data.molecule_data.target['dipole_moment']

for epoch in range(300):
    model.train(True)
    train_loss = 0
    for data in train_dl:
        optimizer.zero_grad()
        batch_indices = data['idx']
        pred = model(data['input'], return_type='tensor', batch_indices=batch_indices)
        # Capture stdout produced while the dipoles are computed.
        with io.capture_output() as captured:
            train_dip_pred = compute_dipole_moment_from_batchidx(ml_data, pred, batch_indices=batch_indices)
        loss = loss_fn(train_dip_pred, dipole_targets[batch_indices])
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
    # Per-epoch loss accumulated over all training batches.
    losses.append(train_loss)
    # scheduler.step(train_loss)
    model.train(False)

    if epoch % val_interval == 0:
        val_loss = 0
        # Validation needs no gradients: skip graph construction to save memory and time.
        with torch.no_grad():
            for data in val_dl:
                batch_indices = data['idx']
                pred = model(data['input'], return_type='tensor', batch_indices=batch_indices)
                with io.capture_output() as captured:
                    val_dip_pred = compute_dipole_moment_from_batchidx(ml_data, pred, batch_indices=batch_indices)
                vloss = loss_fn(val_dip_pred, dipole_targets[batch_indices])
                val_loss += vloss.item()

        if val_loss < best:
            best = val_loss
            # torch.save(model.state_dict(), 'best_model_dipole.pt')
            early_stop_count = 0
        else:
            early_stop_count += 1

        if early_stop_count > early_stop_criteria:
            print(f'Early stopping at epoch {epoch}')
            print(f'Epoch {epoch}, train loss {train_loss/len(ml_data.train_idx)}')

            print(f'Epoch {epoch} val loss {val_loss/len(ml_data.val_idx)}')
            break

        # Report on validation epochs only, so `val_loss` is always fresh
        # (the original hard-coded `% 10`, which only matched while val_interval == 10).
        print(f'Epoch {epoch}, train loss {train_loss/len(ml_data.train_idx)}')

        print(f'Epoch {epoch} val loss {val_loss/len(ml_data.val_idx)}')
    

Epoch 0, train loss 5.532426070307148
Epoch 0 val loss 3.5213770260359607
Epoch 10, train loss 5.544810590305144
Epoch 10 val loss 4.055684175440703
Epoch 20, train loss 4.964313791188237
Epoch 20 val loss 6.295765874356405


The default features and model are quite poor, so it is no surprise that the losses are high.

In [None]:
# Reference on accumulated gradients in PyTorch:
# https://stackoverflow.com/questions/62067400/understanding-accumulated-gradients-in-pytorch
    

In [None]:
# Gradient of each Cartesian component of the predicted dipole (p vector)
# with respect to the positions of the corresponding system.
# NOTE(review): `predicted_xyz`, `gradients` and `systems` are not defined
# anywhere in the visible notebook - this cell needs them from elsewhere.
for ifr, pred in enumerate(predicted_xyz[:]):
    # component 0 -> x, 1 -> y, 2 -> z
    for comp in range(3):
        # retain_graph=True keeps the graph alive for the remaining components.
        gradients[ifr][:, comp, :] = torch.autograd.grad(
            pred[comp], systems[ifr].positions, retain_graph=True
        )[0]

## Calculate the target dipole moment of the water molecule in case the data is not found

In [None]:
from mlelec.data.pyscf_calculator import calculator

# Set up the pyscf calculator for the first frame of the water dataset,
# requesting both the Fock matrix and the dipole moment as targets.
calc_settings = dict(
    path="examples/data/water_1000/",
    mol_name="water_1000",
    frame_slice="0:1",
    target=['fock', 'dipole_moment'],
)
calc = calculator(**calc_settings)

# Run the calculation in the minimal sto-3g basis.
calc.calculate(basis_set="sto-3g", verbose=1)

Loading
Number of frames:  1
['0 O 1s    ', '0 O 2s    ', '0 O 2px   ', '0 O 2py   ', '0 O 2pz   ', '1 H 1s    ', '2 H 1s    ']
converged: True
Dipole moment(X, Y, Z, Debye):  1.50259,  1.24095,  0.00000


In [None]:
# Inspect the atomic-orbital labels collected by the calculator
# (the output below is keyed by 8 and 1, presumably atomic numbers - TODO confirm).
calc.ao_labels

defaultdict(list, {8: ['1s', '2s', '2px', '2py', '2pz'], 1: ['1s']})

In [None]:
# Write the computed targets to disk.
# NOTE(review): this saves under 'examples/data/water/', whereas the dataset
# above is loaded from 'examples/data/water_1000/' - confirm the path is intended.
calc.save_results(path= 'examples/data/water/')

1 s
2 s
2 px
2 py
2 pz
1 s
{8: [[1, 0, 0], [2, 0, 0], [2, 1, 1], [2, 1, -1], [2, 1, 0]], 1: [[1, 0, 0]]}
All done, results saved at:  examples/data/water/


