To install rascal:
(NOTE: See the top-level README for the most up-to-date installation instructions.)
+ mkdir ../build 
+ cd ../build
+ cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=OFF ..
+ make -j 4
+ make install

In [None]:
%matplotlib inline
from matplotlib import pylab as plt

import time
import rascal
import json

import ase
from ase.io import read, write
from ase.build import make_supercell
from ase.visualize import view
import numpy as np

from rascal.representations import SphericalInvariants
from rascal.models import Kernel, train_gap_model
from rascal.models.asemd import ASEMLCalculator
from rascal.utils import from_dict, to_dict, CURFilter, dump_obj, load_obj, get_score, print_score

In [None]:
import urllib.request
# Example dataset: a collection of distorted ethanol molecules from the ANI-1 dataset
# (see https://github.com/isayev/ANI1_dataset) with energies and forces computed using DFTB+
# (see https://www.dftbplus.org/).
# The URL below points at a fixed commit hash of the example-data repository.
url = 'https://raw.githubusercontent.com/lab-cosmo/librascal-example-data/833b4336a7daf471e16993158322b3ea807b9d3f/inputs/molecule_conformers_dftb.xyz'
# Download the file from `url`. No target filename is given to `urlretrieve`, so the
# data is saved to a temporary file; its path (e.g. '/tmp/tmpb48zma.txt') ends up in
# `structures_fn` and the response headers in `headers`.
structures_fn, headers = urllib.request.urlretrieve(url)
# Bare expression: shown as the cell output in the notebook.
structures_fn

# Utility functions

In [None]:
def extract_ref(frames, info_key='dft_formation_energy_per_atom_in_eV', array_key='zeros'):
    """Collect per-structure targets and per-atom arrays from a set of frames.

    Parameters
    ----------
    frames : iterable
        Structures exposing an ``info`` mapping, ``get_positions()`` and
        ``get_array(name)`` (e.g. ``ase.Atoms`` objects).
    info_key : str
        Key of ``frame.info`` holding the per-structure reference value.
    array_key : str or None
        Name of the per-atom array to collect from each frame. The special
        value ``'zeros'`` collects an array of zeros shaped like the frame
        positions instead; ``None`` collects nothing.

    Returns
    -------
    y : numpy.ndarray
        The ``info_key`` values, one entry per frame.
    f : numpy.ndarray or list
        The collected per-atom arrays concatenated along the first axis, or
        an empty list when nothing was collected (``array_key is None`` or
        no frames).

    Raises
    ------
    ValueError
        If the collected arrays cannot be concatenated (e.g. mismatched
        trailing dimensions).
    """
    y, f = [], []
    for frame in frames:
        y.append(frame.info[info_key])
        if array_key is None:
            continue
        if array_key == 'zeros':
            f.append(np.zeros(frame.get_positions().shape))
        else:
            f.append(frame.get_array(array_key))
    y = np.array(y)
    # np.concatenate rejects an empty sequence; guard for that case explicitly
    # instead of swallowing every exception, so genuine shape errors surface.
    if f:
        f = np.concatenate(f)
    return y, f


# Build a Force Field

In [None]:
# N: how many structures are read from the file
# f: how many of those are used to train the model (80%)
N = 100
f = int(0.8*N)

# read the first N structures of the downloaded file
frames = read(structures_fn, ':{}'.format(N))

# sorted unique atomic numbers found across all the loaded structures
global_species = np.unique(
    [number for frame in frames for number in frame.get_atomic_numbers()]
)

# shuffle the structure indices (fixed seed, so the split is reproducible)
# and cut them into a training part and a test part
ids = list(range(N))
np.random.seed(10)
np.random.shuffle(ids)

train_ids, test_ids = ids[:f], ids[f:]

frames_train = [frames[idx] for idx in train_ids]
frames_test = [frames[idx] for idx in test_ids]

# Isolated atom contributions, keyed by atomic number
# (presumably in the same energy unit as the reference energies -- confirm)
self_contributions = {
    1: -6.492647589968434,
    6: -38.054950840332474,
    8: -83.97955098636527,
}

In [None]:
# extract the reference energies (y_*) and forces (f_*) of the train and test
# structures from the 'dftb_energy_eV' info entry and the
# 'dftb_forces_eV_per_Ang' per-atom array
y_train, f_train = extract_ref(frames_train,'dftb_energy_eV','dftb_forces_eV_per_Ang')
y_test, f_test = extract_ref(frames_test,'dftb_energy_eV','dftb_forces_eV_per_Ang')

In [None]:
# hyperparameters of the spherical expansion / power-spectrum representation;
# gradients are requested because the model is trained on forces as well
hypers = {
    "soap_type": "PowerSpectrum",
    "interaction_cutoff": 3.5,
    "max_radial": 6,
    "max_angular": 6,
    "gaussian_sigma_constant": 0.4,
    "gaussian_sigma_type": "Constant",
    "cutoff_smooth_width": 0.5,
    "normalize": False,
    "radial_basis": "GTO",
    "compute_gradients": True,
    "expansion_by_species_method": 'structure wise',
}

# build the representation calculator, then evaluate it on every loaded
# structure (training and test structures alike)
soap = SphericalInvariants(**hypers)
managers = soap.transform(frames)

In [None]:
# select the sparse points for the sparse kernel method with CUR
# n_sparse is keyed by atomic number (1, 6, 8); presumably the values are the
# number of sparse points to select for that species -- confirm against CURFilter
# NOTE(review): `managers` holds the representation of all loaded structures
# (train and test), not only the training set -- confirm this is intended
n_sparse = {1:40,6:30,8:20}
compressor = CURFilter(soap, n_sparse, act_on='sample per species')
X_sparse = compressor.select_and_filter(managers)

In [None]:
# set up the sparse kernel on top of the representation calculator
zeta = 1
kernel = Kernel(soap, name='GAP', zeta=zeta, target_type='Structure', kernel_type='Sparse')

# pick the already computed representations of the training structures
managers_train = managers.get_subset(train_ids)

In [None]:
# build the KNM matrix for training with forces and energies (see train_gap_model for more details)
# kernel between the training structures and the sparse points
KNM = kernel(managers_train, X_sparse)
# same kernel evaluated with grad=(True, False); presumably the rows used to fit the forces
KNM_down = kernel(managers_train, X_sparse, grad=(True, False))
# stack the second block below the first one
KNM = np.vstack([KNM, KNM_down])
# drop the reference to the second block (presumably to release its memory)
# and rebind the name to an empty list
del KNM_down
KNM_down = []


In [None]:
# train a GAP model on the reference energies and forces
# grad_train receives the negated reference forces (presumably because it expects
# energy gradients, F = -dE/dx -- confirm against train_gap_model)
model = train_gap_model(kernel, frames_train, KNM, X_sparse, y_train, self_contributions, 
                        grad_train=-f_train, lambdas=[0.1, 0.01], jitter=1e-13)

In [None]:
# # the model can be serialized to a python dictionary
# model_serialized = to_dict(model)
# # and recovered from such dictionary
# model_copy = from_dict(model_serialized)

In [None]:
# make predictions on the test set
# reuse the representations computed earlier, restricted to the test structures
managers_test = managers.get_subset(test_ids)
# predicted energies and forces, compared with y_test / f_test in the next cell
y_pred = model.predict(managers_test)
f_pred = model.predict_forces(managers_test)

In [None]:
# basic assessment of the quality of the trained model:
# print the scores for the energies and for the (flattened) force components
print_score(y_pred, y_test)
print_score(f_pred.flatten(), f_test.flatten())
# correlation plot: reference energies on the x axis, predictions on the y axis
# (the axis labels must follow the argument order of plt.plot)
plt.plot(y_test, y_pred, 'o')
plt.title("correlation plot")
plt.xlabel("reference energies [eV]")
plt.ylabel("predicted energies [eV]")

The correlation isn't great, but it's enough to get a working potential.

In order to improve the accuracy and start getting _quantitative_ accuracy, we would need to increase the number of sparse environments (`n_sparse` above), as well as the number of training structures, both of which would also require more memory and time to train.  Be aware that you can easily exceed the memory limits even of typical supercomputer nodes with just a few hundred sparse points and a few thousand training atoms!

In [None]:
# save the model to a file in json format for future use
# (the same path is read back with load_obj in the dimer section)
dump_obj('/tmp/mymodel.json', model)

# Test the model on dimer configurations

In [None]:
# you can load the previously trained model from the json file written by dump_obj;
# this rebinds `model` to the object read from disk
model = load_obj('/tmp/mymodel.json')

In [None]:
# atom pairs given by atomic number (H is 1, C is 6 and O is 8);
# the first atom of each pair is the one placed at the origin
pairs = [[1,1],[6,6],[8,8],[6,1],[8,1],[6,8]]
ndists = 40 # number of distances to look at
dists = np.linspace(0.1,4.9,ndists) # evenly spaced distance list, can be changed
# one configuration is built per (pair, distance) combination
print('Number of configurations: ', len(pairs)*len(dists))

In [None]:
# One two-atom structure per (pair, distance) combination, ordered pair by pair:
# ase builds a 10 x 10 x 10 non-periodic cell with the first atom at the origin
# and the second one displaced along x.
# Note that `frames` is rebound here and no longer refers to the loaded dataset.
frames = [
    ase.Atoms(numbers=species, pbc=False, cell=np.eye(3)*10,
              positions=[[0,0,0],[separation,0,0]])
    for species in pairs
    for separation in dists
]
# compute the representation of the dimers and predict their energies
X = soap.transform(frames)
e_pairs = model.predict(X)
# shift the predictions in place so that their mean is zero
e_pairs -= e_pairs.mean()

In [None]:
# One figure per pair: predicted energy as a function of the interatomic distance.
# e_pairs is ordered pair by pair with ndists entries each, so the i-th pair
# occupies the slice [i*ndists, (i+1)*ndists). enumerate gives that position
# directly; looking it up with pairs.index() would return the first match and
# plot the wrong slice if a pair appeared more than once in the list.
for i, pair_to_plot in enumerate(pairs):
    fig, ax = plt.subplots()
    ax.plot(dists,e_pairs[i*ndists:(i+1)*ndists],'--xb',linewidth=1)
    ax.set_xlabel('Distance (A)')
    ax.set_ylabel('Predicted energy (eV)')
    # the title shows the atomic numbers of the two atoms
    ax.set_title('Bond energy between {} and {}'.format(*pair_to_plot))
    plt.tight_layout()
    plt.show()

# Use it to run a MD simulation

In [None]:
from ase.md import MDLogger
from ase.md.langevin import Langevin
from ase import units
from ase.io.trajectory import Trajectory
from ase.md.velocitydistribution import MaxwellBoltzmannDistribution

In [None]:
from rascal.models.asemd import ASEMLCalculator

In [None]:
# Use the model loaded above
# take the representation calculator from the model itself (this rebinds `soap`)
soap = model.get_representation_calculator()
# calculator object attached to the atoms in the MD cell below
calc = ASEMLCalculator(model, soap)

In [None]:
# output files of the MD run: text log and trajectory
log_fn = '/tmp/md.log'
filename = '/tmp/md.traj'

# simulation temperature; it is multiplied by units.kB where it is used,
# so presumably expressed in Kelvin
T = 200

In [None]:
%%time

atoms = read(structures_fn, 0)

MaxwellBoltzmannDistribution(atoms, T* units.kB)

atoms.set_calculator(calc)

traj = Trajectory(filename, mode='w', atoms=atoms, master=None)

dyn = Langevin(atoms, 0.5 * units.fs, units.kB * T, 0.002)

dyn.attach(MDLogger(dyn, atoms, log_fn, header=True, stress=False,
           peratom=False, mode="w"), interval=50)

dyn.attach(traj.write, interval=10)

dyn.run(200)

Uncomment and run the cell below to examine the trajectory using the ASE viewer:

In [None]:
#view(read(filename,':'))