# Generate Approximate Hessians
As in the previous notebook, we fit an approximate model and use it to compute the Hessian. Instead of treating the Hessian entries as separate parameters, here we try to fit a forcefield to the data and take the Hessian from that forcefield.

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
from jitterbug.model.dscribe.local import make_gpr_model, train_model, DScribeLocalCalculator, DScribeLocalEnergyModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from dscribe.descriptors.soap import SOAP
from ase.vibrations import VibrationsData
from ase.db import connect
from pathlib import Path
from tqdm import tqdm
from io import StringIO
import pandas as pd
import numpy as np
import warnings
import torch
import json
import ase

Configuration

In [None]:
# ASE database holding the sampled structures used to fit the model
db_path = '../1_explore-sampling-methods/data/simple-uniform/caffeine_pm7_None_at_pm7_None_d=5.00e-03.db'
# Device used for training: prefer the GPU but fall back to the CPU so the
# notebook still runs on machines without CUDA
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Whether to re-run the notebook even if the output file already exists
overwrite = True
# Number of inducing points given to the GPR model
inducing_points = 256
# SOAP descriptor settings (passed as l_max, n_max and r_cut)
l_max = 3
n_max = 1
cutoff = 6
# Initial lengthscale given to the GPR model
initial_lengthscale = 100

Derived

In [None]:
# Split the database file name (minus its 3-character ".db" ending) at the last
# underscore into the run name and the sampling options
db_file = Path(db_path)
run_name, sampling_options = db_file.name[:-3].rsplit("_", 1)

# The directory holding the database names the sampling method
sampling_name = db_file.parent.name

# Location of the exact reference for this run
exact_path = Path('../0_create-test-set/data/exact/') / f'{run_name}_d=0.01-ase.json'

# Name and directory used for everything this notebook writes
out_name = f'{run_name}_{sampling_name}_{sampling_options}'
out_dir = Path('data/soap/')

Skip if done

In [None]:
# Stop early when the final output is already on disk and overwriting is disabled
full_output = out_dir / f'{out_name}-full.json'
if not overwrite and full_output.exists():
    raise ValueError('Already done!')

## Read in the Data
Get all computations for the desired calculation and the exact solution

In [None]:
# Convert every row of the database into an Atoms object
with connect(db_path) as db:
    data = []
    for row in db.select(''):
        data.append(row.toatoms())
print(f'Loaded {len(data)} structures')

Read in the exact Hessian

In [None]:
# Load the reference vibrational data from the file at `exact_path`
with open(exact_path, 'r') as fp:
    exact_vibs = VibrationsData.read(fp)

In [None]:
# Hessian (as a 2D matrix) and zero-point energy of the exact reference,
# kept for comparison against the approximate results below
exact_hess = exact_vibs.get_hessian_2d()
exact_zpe = exact_vibs.get_zero_point_energy()

## Fit a Hessian with All Data
Fit a model using the parameters set in the configuration cell above

In [None]:
# Descriptor for a non-periodic system, covering the unique elements of the first structure
elements = list(set(data[0].get_chemical_symbols()))
soap = SOAP(species=elements, r_cut=cutoff, n_max=n_max, l_max=l_max, periodic=False)

In [None]:
def _make_gpr(x):
    """Build a GPR model from the notebook's configuration.

    `x` is forwarded unchanged as the second argument of `make_gpr_model`
    (presumably the descriptors used to initialize the model; confirm against `make_gpr_model`).
    """
    return make_gpr_model(
        data[0].get_atomic_numbers(),
        x,
        num_inducing_points=inducing_points,
        fix_inducing_points=True,
        use_ard_kernel=True,
        initial_lengthscale=initial_lengthscale,
    )


# Energy model built around the first structure as the reference geometry
model = DScribeLocalEnergyModel(
    reference=data[0],
    model_fn=_make_gpr,
    descriptors=soap,
    num_calculators=1,
    device=device,
    train_options={
        'steps': 1024,
        'batch_size': 128,
        'learning_rate': 0.01,
        'patience': 128,
        'verbose': True,
    },
)

Train the model on all of the data

In [None]:
%%time
# Silence every warning raised while training
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # Train on all structures; the result is indexed as `hess_models[0]`
    # and passed to `model.mean_hessian` in later cells
    hess_models = model.train(data)

Compare the energies

In [None]:
# Energies stored with each structure in the dataset
true_e = np.array([atoms.get_potential_energy() for atoms in data])

In [None]:
# Energies of the same structures as predicted by the first trained model
pred_e = np.array([hess_models[0].get_potential_energy(atoms) for atoms in data])

In [None]:
# Mean absolute error between the true and predicted energies, printed after scaling by 1000
mae = np.mean(np.abs(true_e - pred_e))
print(f'MAE: {mae * 1000:.2e} meV')

In [None]:
fig, ax = plt.subplots(figsize=(3, 3))

# Plot both sets of energies relative to the lowest true energy, scaled by 1000 (labelled meV)
pred_rel = 1000 * (pred_e - true_e.min())
true_rel = 1000 * (true_e - true_e.min())
ax.scatter(pred_rel, true_rel, s=5, alpha=0.8)

# Give both axes the same limits, taken from the union of the two autoscaled ranges,
# so that predictions outside the range of the true energies are not clipped
x_lo, x_hi = ax.get_xlim()
y_lo, y_hi = ax.get_ylim()
lims = (min(x_lo, y_lo), max(x_hi, y_hi))
ax.set_xlim(lims)
ax.set_ylim(lims)

# Parity line: points on it are predicted exactly
ax.plot(lims, lims, 'k--')

ax.set_xlabel('E, ML (meV)')
ax.set_ylabel('E, True (meV)')

Compare the forces estimated at a zero displacement to the true value

In [None]:
# Forces stored with the first structure in the dataset
actual_forces = data[0].get_forces()

In [None]:
# Forces on that same structure as predicted by the first trained model
pred_forces = hess_models[0].get_forces(data[0])

In [None]:
# Largest absolute component of the predicted forces
print(f'Maximum force: {np.abs(pred_forces).max():.2e} eV/Angstrom')

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(4, 2))

# One panel per set of forces, drawn on a shared color scale
panels = [('Actual', actual_forces), ('Estimated', pred_forces)]
for ax, (title, forces) in zip(axs, panels):
    ax.matshow(forces, cmap='RdBu', vmin=-0.05, vmax=0.05, aspect='auto')
    ax.set_title(title, fontsize=10)

    # Hide the tick labels on both axes
    ax.set_xticklabels([])
    ax.set_yticklabels([])

fig.tight_layout()

Get the mean Hessian

In [None]:
%%time
# Hessian obtained from the trained model(s) via the model's `mean_hessian` method
approx_hessian = model.mean_hessian(hess_models)

Compare to exact answer

In [None]:
# Top-left 3x3 block of the exact Hessian
exact_hess[:3, :3]

In [None]:
# Top-left 3x3 block of the approximate Hessian, for comparison with the exact one
approx_hessian[:3, :3]

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(4, 2))

# One panel per Hessian, drawn on a shared color scale
panels = [('Exact', exact_hess), ('Approximate', approx_hessian)]
for ax, (title, hessian) in zip(axs, panels):
    ax.matshow(hessian, cmap='RdBu', vmin=-100, vmax=100)
    ax.set_title(title, fontsize=10)

    # Hide the tick labels on both axes
    ax.set_xticklabels([])
    ax.set_yticklabels([])

fig.tight_layout()

Get the zero point energy

In [None]:
# Build vibrational data from the approximate Hessian, using the first structure as the geometry
approx_vibs = VibrationsData.from_2d(data[0], approx_hessian)

In [None]:
# Zero-point energy from the approximate Hessian
approx_vibs.get_zero_point_energy()

In [None]:
# Zero-point energy from the exact Hessian, for comparison
exact_zpe

The two differ, but I'm not sure how important the difference is.

Save it to disk

In [None]:
# Write to `out_dir` (set in the "Derived" cell) rather than a second hard-coded path,
# so the file lands exactly where the "skip if done" check looks for it
out_dir.mkdir(exist_ok=True, parents=True)
with open(out_dir / f'{out_name}-full.json', 'w') as fp:
    approx_vibs.write(fp)

## Plot as a Function of Data
See what happens as we add more data to the training set

In [None]:
# Turn off verbose training output before the repeated fits below
model.train_options['verbose'] = False

In [None]:
# 16 evenly spaced training-set sizes between 5 and the full dataset, truncated to integers
steps = np.linspace(5, len(data), num=16, dtype=int)
preview = ", ".join(str(step) for step in steps[:5])
print(f'Plotting at {len(steps)} steps: {preview}, ...')

In [None]:
# Fit on progressively larger subsets, recording the Hessian and ZPE at each size.
# Subset-specific names are used so the full-data results computed earlier
# (`approx_hessian`, `approx_vibs`) are not overwritten by this loop.
zpes = []
with open(out_dir / f'{out_name}-increment.json', 'w') as fp:
    for count in tqdm(steps):
        # Train on the first `count` structures, silencing any warnings
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            subset_models = model.train(data[:count])

        subset_hessian = model.mean_hessian(subset_models)

        # Save the incremental result as one JSON record per line
        print(json.dumps({'count': int(count), 'hessian': subset_hessian.tolist()}), file=fp)

        # Compute the ZPE
        subset_vibs = VibrationsData.from_2d(data[0], subset_hessian)
        zpes.append(subset_vibs.get_zero_point_energy())

Plot the ZPE as a function of the number of training points. The dashed line marks the exact ZPE.

In [None]:
fig, ax = plt.subplots(figsize=(3.5, 2))

# Slice the steps to the number of ZPEs so the x and y lengths always match
n_done = len(zpes)
ax.plot(steps[:n_done], zpes)

# Horizontal dashed line at the exact ZPE, spanning the full x-range
ax.set_xlim([0, steps.max()])
ax.plot(ax.get_xlim(), [exact_zpe, exact_zpe], 'k--')

ax.set_xlabel('Energies')
ax.set_ylabel('ZPE (eV)')

fig.tight_layout()