# Cascade: serial prototype implementation

Here we use some of the classes we've written to create a serial prototype run of cascade

This is the minimum viable run, intended to inform upcoming design decisions before distributed runs.

No science is done here. 

In [40]:
from glob import glob
from pathlib import Path

from ase.io import read, write
from ase.io.trajectory import Trajectory
from ase import units
from ase.md import MDLogger, VelocityVerlet
import numpy as np
from mace.calculators import mace_mp


from cascade.utils import canonicalize
from cascade.auditor import RandomAuditor
from cascade.learning.torchani import TorchANI
from cascade.learning.torchani.build import make_output_nets, make_aev_computer

## Read in structure
We'll run these simulations on a 2x2x2 Si supercell containing a single vacancy.

In [2]:
# Load the starting structure from a VASP-format file; later cells reuse `atoms`.
atoms = read('../0_setup/initial-geometries/si-vacancy-2x2x2.vasp')

## Set up calculator

We'll use a small MACE model as our *target*.   
That is to say, MACE is our ground truth physics.   
(We want fast for this prototype)

In [3]:
# NOTE(review): `device` is assigned here but is not passed to `mace_mp` below,
# so this cell never uses it -- confirm whether it was meant to be forwarded.
device = 'cuda:0'
# Build the calculator from the 'small' MACE model; it acts as the target
# ("ground truth") potential for this prototype.
calc = mace_mp('small')

  torch.load(f=model_path, map_location=device)


Using Materials Project MACE for MACECalculator with /home/mike/.cache/mace/20231210mace128L0_energy_epoch249model
Using float32 for MACECalculator, which is faster but less accurate. Recommended for MD. Use float64 for geometry optimization.
Default dtype float32 does not match model dtype float64, converting models to float32.


## Set up learner

We'll fit two ANI models to MACE

In [5]:
# Instantiate the TorchANI learner with default arguments; presumably this is
# what fits the ANI models to the MACE target -- it is not used again in this view.
learner = TorchANI()

In [38]:
# Unique chemical symbols present in the structure. Sorted so the ordering is
# reproducible across kernel restarts: iteration order of a set of strings
# depends on per-process hash randomization.
species = sorted(set(atoms.symbols))
aev = make_aev_computer(species)

# Model is a 3-tuple: (AEV computer, output networks, per-species dict of zeros).
# The zeros are presumably initial reference energies -- TODO confirm.
model = aev, make_output_nets(species, aev), {s: 0. for s in species}

## Class for trajectories

In [7]:
class CascadeTrajectory:
    """A class to encapsulate a cascade trajectory

    This is useful for reading and auditing trajectories
    so we know where to start sampling from (e.g., after the last trusted timestep)

    Args:
        path: Location of the trajectory file on disk
        last_trusted_timestep: Index of the last frame that is considered trusted;
            every frame after it is treated as untrusted
    """

    def __init__(self,
                 path: str,
                 last_trusted_timestep: int = 0):
        self.path = path
        self.last_trusted_timestep = last_trusted_timestep

    def read(self, index=':', *args, **kwargs):
        """Read frames from the trajectory file

        Args:
            index: Frame selection passed to the module-level ``read`` (default ``':'``, all frames)
            *args: Extra positional arguments forwarded after ``index``
            **kwargs: Extra keyword arguments forwarded unchanged
        Returns:
            Whatever the module-level ``read`` returns for the selection
        """
        # Forward `index` positionally: passing it as a keyword *after* `*args`
        # would make any extra positional argument collide with `index`.
        return read(self.path, index, *args, **kwargs)

    def get_untrusted_segment(self):
        """Read every frame that comes after the last trusted timestep"""
        first_untrusted = self.last_trusted_timestep + 1
        return read(self.path, index=f'{first_untrusted}:')

    def trim_untrusted_segment(self):
        """Rewrite the file so it only holds frames up to and including the last trusted one"""
        # todo: is there a way to do this without loading into memory?
        # The trusted frames are read completely before `write` reopens the same path.
        trusted = read(self.path, index=f':{self.last_trusted_timestep+1}')
        write(self.path, trusted)
        

### tests 

#### Todo: move these into a test suite if we keep this class, and modify the coordinates (or similar) per frame so we can verify that the right frames are being deleted

In [8]:
# Write a two-frame file (the structure plus a copy) to exercise CascadeTrajectory.
write('test.traj', [atoms, atoms.copy()])

In [9]:
# Wrap the test file; last_trusted_timestep keeps its default of 0.
traj = CascadeTrajectory('test.traj')

In [10]:
# Read with the default index ':' -- both frames written above are expected.
traj.read()

[Atoms(symbols='Si63', pbc=True, cell=[10.86, 10.86, 10.86]),
 Atoms(symbols='Si63', pbc=True, cell=[10.86, 10.86, 10.86])]

In [11]:
# Frames after the last trusted timestep (index '1:' here, since it is 0).
traj.get_untrusted_segment()

[Atoms(symbols='Si63', pbc=True, cell=[10.86, 10.86, 10.86])]

In [12]:
# Direct read of the slice that trim_untrusted_segment keeps (':1' when the last trusted timestep is 0).
read('test.traj', index=':1')

[Atoms(symbols='Si63', pbc=True, cell=[10.86, 10.86, 10.86])]

In [13]:
# Overwrite test.traj in place, keeping only the trusted frames.
traj.trim_untrusted_segment()

In [14]:
# Re-read after trimming -- a single frame should remain.
traj.read()

[Atoms(symbols='Si63', pbc=True, cell=[10.86, 10.86, 10.86])]

## train initial models
I just can't stomach starting with completely untrained ANI models

In [15]:
class CanonicalWriter():
    """Callable that appends a canonicalized frame to a trajectory file

    Takes no arguments when called, so an instance can be attached to a
    dynamics object as an observer.

    Args:
        path: Trajectory file to append to
        atoms: Structure to record on each call. When omitted, the notebook-level
            ``atoms`` variable is looked up at call time, which is what this
            class did before the argument existed.
    """

    def __init__(self, path, atoms=None):
        self.path = path
        self.atoms = atoms

    def __call__(self):
        """Append the canonicalized current state of the structure to ``self.path``"""
        # Compare against None explicitly: truthiness of the structure object
        # must not decide which source is used.
        frame = self.atoms if self.atoms is not None else atoms

        # mode='a' appends, so re-running the notebook keeps adding to an existing file.
        with Trajectory(self.path, mode='a') as traj:
            traj.write(canonicalize(frame))

In [16]:
from time import perf_counter

In [17]:
# Passed to dynamics.run below as the number of MD steps used to generate training data.
n_training_frames = 128
# Attach the MACE calculator so the dynamics are driven by the target potential.
atoms.calc = calc
# Velocity-Verlet integrator with a 1 fs timestep.
dynamics = VelocityVerlet(atoms, timestep=1*units.fs)

In [18]:
# Log MD quantities (including stress) to train.log. MDLogger's first argument
# is the dynamics object, which it uses to report the simulation time; the
# numpy module was being passed there by mistake.
md_logger = MDLogger(dynamics, atoms, 'train.log', stress=True)
# Append each canonicalized frame to train.traj.
traj_writer = CanonicalWriter('train.traj')
# Register both observers on the integrator.
dynamics.attach(md_logger)
dynamics.attach(traj_writer)

In [19]:
%%time
# Run the target-driven MD; the attached observers write train.log and train.traj.
dynamics.run(n_training_frames)

CPU times: user 9.1 s, sys: 5.31 s, total: 14.4 s
Wall time: 7.42 s


True

## Set up prototype run

In [20]:
# Random seeds for the prototype run; presumably one per ANI model (the notebook
# says two are fit) -- TODO confirm, they are not used yet in this view.
seeds = [0, 1]

In [21]:
# Presumably the total number of MD steps the prototype run should reach -- TODO confirm.
total_steps = 128
# NOTE(review): the commented-out draft below refers to `chunk_size`, not
# `increment_steps` -- presumably the same quantity; reconcile the names.
increment_steps = 64

# while not done:
#     pass_ix = 1
    
#     # set up the directory to hold the trajectory for this pass
#     run_dir = Path(f'cascade-md') / name
#     pass_dir = run_dir / f'chunk={chunk_ix}-pass={pass_ix}'
#     pass_dir.mkdir(exist_ok=True, parents=True)

#     # pull in initial conidtions or last frame from the most recent trusted chunk
#     if chunk_ix == 1: 
#         atoms = initial_conditions[name]
#     else:
#         last_pass = chunk_passes[chunk_ix-1]
#         atoms = read(Path(run_dir)/name/f'chunk={chunk_ix-1}-{last_pass}', 
#                      index='-1')

#     # we save the trajectory in chunks, inluding every pass at simulating that chunk
#     logfile = str(pass_dir / 'md.log')
#     trajfile = str(pass_dir / 'md.traj')
    
#     # setup the ml-driven dynamics
#     atoms.calc = calc_ml
#     dyn = NPT(atoms,
#       timestep=0.5 * units.fs,
#       temperature_K=298,
#       ttime=100 * units.fs,
#       pfactor=0.01,
#       externalstress=0,
#       logfile=logfile,
#       trajectory=trajfile,
#       append_trajectory=False)
#     # timestep indexing
#     # start = (chunk_ix-1) * chunk_size # the actual starting timestep
#     # stop = min(chunk_size, chunk_size*chunk_ix)
#     # there is probably a nice mathy way to do this
#     resulting_steps = chunk_ix * chunk_size # how many total timesteps will be achieved
#     if resulting_steps < total_steps: 
#         chunk_steps = chunk_size
#     else: 
#         chunk_steps = total_steps - ((chunk_ix-1)*chunk_size)

#     # run the dynamics for this chunk
#     dyn.run(chunk_steps)

#     # read in the recent chunk
#     chunk = read(trajfile)
#     break

In [36]:
# Scratch check: draw one uniform sample from a RandomState seeded with None,
# so the value is not reproducible between runs.
np.random.RandomState(None).uniform(0, 1)

0.5192068632195513