In [1]:
import math
import random

import pymatgen as pmg
import numpy as np

import lammps

In [2]:
a = 4.2
alpha = 90
lattice = pmg.Lattice.from_parameters(a, a, a, alpha, alpha, alpha)
symbols = ['Mg', 'Mg', 'Mg', 'Mg', 'O', 'O', 'O', 'O']
positions = np.array([
    (0, 0, 0), (2.1, 2.1, 0), (2.1, 0, 2.1), (0, 2.1, 2.1), # Mg
    (2.1, 0, 0), (0, 2.1, 0), (0, 0, 2.1), (2.1, 2.1, 2.1)  # O
]).reshape(-1, 3)

structure = pmg.Structure(lattice, symbols, positions, coords_are_cartesian=True)

In [3]:
# to make a fair speed comparison lets precompute some values
elements = [pmg.Element(symbol) for symbol in set(symbols)]
elements

[Element O, Element Mg]

In [4]:
structure

Structure Summary
Lattice
    abc : 4.2 4.2 4.2
 angles : 90.0 90.0 90.0
 volume : 74.08800000000001
      A : 4.2 0.0 2.571758278209442e-16
      B : 6.754115128641874e-16 4.2 2.571758278209442e-16
      C : 0.0 0.0 4.2
PeriodicSite: Mg (0.0000, 0.0000, 0.0000) [0.0000, 0.0000, 0.0000]
PeriodicSite: Mg (2.1000, 2.1000, 0.0000) [0.5000, 0.5000, -0.0000]
PeriodicSite: Mg (2.1000, 0.0000, 2.1000) [0.5000, 0.0000, 0.5000]
PeriodicSite: Mg (0.0000, 2.1000, 2.1000) [-0.0000, 0.5000, 0.5000]
PeriodicSite: O (2.1000, 0.0000, 0.0000) [0.5000, 0.0000, -0.0000]
PeriodicSite: O (0.0000, 2.1000, 0.0000) [-0.0000, 0.5000, -0.0000]
PeriodicSite: O (0.0000, 0.0000, 2.1000) [0.0000, 0.0000, 0.5000]
PeriodicSite: O (2.1000, 2.1000, 2.1000) [0.5000, 0.5000, 0.5000]

In [5]:
%%timeit
lmp = lammps.Lammps()

215 µs ± 15.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [6]:
def initialize_lammps(structure, elements, atom_types):
    lmp = lammps.Lammps(units='metal', args=[
        '-log', 'none',
        '-screen', 'none'
    ])
    lmp.command('atom_modify map yes id yes sort 10000 5.0')
    
    # set unit cell
    lmp.box.from_lattice_const(len(elements), 
                               np.array(structure.lattice.abc), 
                               np.array(structure.lattice.angles) * (math.pi/180))
    
    # set element mass
    for element, atom_type in zip(elements, lmp.system.atom_types):
        atom_type.mass = element.atomic_mass
    
    velocities = structure.site_properties.get('velocities')
    lmp.system.create_atoms(atom_types, structure.cart_coords+1e-8, velocities)
    
    script = """
    pair_style  buck 10
    pair_coeff 1 1 1309362.2766468062 0.104 0.0
    pair_coeff 1 2 9892.357 0.20199 0.0
    pair_coeff 2 2 2145.7345 0.3 30.2222
    
    fix 1 all nve
    """
    for line in script.split('\n'):
        lmp.command(line)
    return lmp

# Lets test system constuction time from scratch

It linearly scales with the number of atoms. Majority amount of time is in setup. If you can avoid setting up each time the simluations will take very little time.

 - (1, 1, 1)     8 atoms - 333 us
 - (2, 2, 2)    64 atoms - 536 us
 - (3, 3, 3)   216 atoms -   1 ms
 - (4, 4, 4)   512 atoms -   2 ms
 - (5, 5, 5)  1000 atoms -   4 ms
 - (10,10,10) 8000 atoms -  32 ms
 - (20,20,20)64000 atoms - 261 ms

In [61]:
supercell_structure = structure * (20, 20, 20)
atom_types = np.array([elements.index(atom.specie)+1 for atom in supercell_structure], dtype=np.intc)

In [62]:
%%timeit
lmp = initialize_lammps(supercell_structure, elements, atom_types)

258 ms ± 11.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# How long does a run with 0 steps take

Could be a little less than twice as long. Seconds tests with laptop unplugged yielded different results.

 - (1, 1, 1)     8 atoms - 246 us
 - (2, 2, 2)    64 atoms -   1 ms
 - (3, 3, 3)   216 atoms -   4 ms
 - (4, 4, 4)   512 atoms -  12 ms
 - (5, 5, 5)  1000 atoms -  20 ms
 - (10,10,10) 8000 atoms - 162 ms
 - (20,20,20)64000 atoms -  1.5 s

In [65]:
supercell_structure = structure * (10, 10, 10)
atom_types = np.array([elements.index(atom.specie)+1 for atom in supercell_structure], dtype=np.intc)
lmp = initialize_lammps(supercell_structure, elements, atom_types)

In [66]:
%%timeit
lmp.run(0)

299 ms ± 11.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# What is the cost of successive runs vs single run

time: full run(100), run(1)x100 with pre post, run(1)x100 without pre post, run(10)x10 without pre post.

This demonstrates that we can get nearly the exact same performance when we turn of pre and post `<10%` maximum penalty.

 - (1, 1, 1)     8 atoms - 12 ms, 50 ms, 17 ms, 12 ms
 - (2, 2, 2)    64 atoms - 93 ms, 360 ms, 113 ms, 96 ms  
 - (3, 3, 3)   216 atoms - 325 ms, 1.13 s, 374 ms, 324 ms  
 - (4, 4, 4)   512 atoms - 742 ms, 3.15 s, 884 ms, 781 ms 
 - (5, 5, 5)  1000 atoms - 1.5 s, 5.6 s, 1.76 s, 1.54 s
 - (10,10,10) 8000 atoms - 12 s, 44 s, 14 s, 13 s
 - (20,20,20)64000 atoms - about 8x slower. you get the picture
 
Point is that you can get the same performance with single run steps vs combined

In [39]:
supercell_structure = structure * (20, 20, 20)
atom_types = np.array([elements.index(atom.specie)+1 for atom in supercell_structure], dtype=np.intc)
lmp = initialize_lammps(supercell_structure, elements, atom_types)

In [None]:
%%timeit

lmp.run(100)

In [None]:
%%timeit
for i in range(100):
    lmp.run(1)

In [None]:
%%timeit
# pre: dont recompute neighbor lists and forces before run
# post: dont print timing information on run
for i in range(100):
    lmp.run(1, pre=False, post=False)

In [None]:
%%timeit
for i in range(10):
    lmp.run(10, pre=False, post=False)

# How long does it take to get the forces from simluation

 - (1, 1, 1)     8 atoms -   3 us
 - (2, 2, 2)    64 atoms -   4 us
 - (3, 3, 3)   216 atoms -   4 us
 - (4, 4, 4)   512 atoms -   4 us
 - (5, 5, 5)  1000 atoms -   5 us
 - (10,10,10) 8000 atoms -  22 us
 - (20,20,20)64000 atoms - 160 us

even with the largest system of 64000 atoms it is negligible `157 us`. smallest 8 atoms `3 us`

In [67]:
supercell_structure = structure * (20, 20, 20)
atom_types = np.array([elements.index(atom.specie) for atom in supercell_structure], dtype=np.intc)
lmp = initialize_lammps(supercell_structure, elements, atom_types)
lmp.run(0)

In [68]:
%%timeit
lmp.system.forces

730 µs ± 52.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


# How long does it take to set the values vs the internal routine

We would like to compare setting velocities to zero in python vs `velocity all set 0 0 0`

timing: python routine, lammps routine 

 - (1, 1, 1)        8 atoms -   4 us,  2.75 us
 - (2, 2, 2)       64 atoms -   5 us,     3 us
 - (3, 3, 3)      216 atoms -   6.5 us, 3.2 us
 - (4, 4, 4)      512 atoms -   9.5 us,   4 us
 - (5, 5, 5)    1,000 atoms -   15 us,    5 us
 - (10,10,10)   8,000 atoms -   87 us,   22 us 
 - (20,20,20)  64,000 atoms -  693 us,  153 us
 - (50,50,50) 125,000 atoms -  11.3 ms, 3.65 ms
 
So python setting properties is about 4 times slower resetting all values. 

In [12]:
supercell_structure = structure * (50, 50, 50)
atom_types = np.array([elements.index(atom.specie) for atom in supercell_structure], dtype=np.intc)
lmp = initialize_lammps(supercell_structure, elements, atom_types)

In [13]:
%%timeit
lmp.system.velocities = np.zeros((lmp.system.total, 3), dtype=np.float)

11.3 ms ± 137 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [14]:
%%timeit
lmp.command('velocity all set 0 0 0')

3.65 ms ± 49 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# Does unordered vs ordered gather offer any benifits?

timing: ordered gather, unordered gather

 - (1, 1, 1)        8 atoms -   3.65 us,  3.81 us
 - (2, 2, 2)       64 atoms -   4.75 us,  4.09 us
 - (3, 3, 3)      216 atoms -   6.16 us,  4.47 us
 - (4, 4, 4)      512 atoms -   10 us,    6.25 us
 - (5, 5, 5)    1,000 atoms -   15 us,     7.5 us
 - (10,10,10)   8,000 atoms -   85 us,    27.8 us 
 - (20,20,20)  64,000 atoms -  823 us,   316.0 us
 - (50,50,50) 125,000 atoms - 14.2 ms,    5.19 ms
 
So we see that by gathering atoms unordered we can get significant speed increases with large systems.

You can gather the atom `id` beforehand to be able to associate an atom with an id.

In [36]:
n = 50
supercell_structure = structure * (n, n, n)
atom_types = np.array([elements.index(atom.specie) for atom in supercell_structure], dtype=np.intc)
lmp = initialize_lammps(supercell_structure, elements, atom_types)

In [37]:
%%timeit
x = lmp.system.global_gather_property_ordered('x')

13.8 ms ± 103 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [38]:
%%timeit
x = lmp.system.global_gather_property_unordered('x')

5.15 ms ± 19 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# Is is faster to get a subset of atoms?

**YES** it is orders of magnitude faster to get only a subset of atom

In [14]:
n = 20
supercell_structure = structure * (n, n, n)
atom_types = np.array([elements.index(atom.specie) for atom in supercell_structure], dtype=np.intc)
lmp = initialize_lammps(supercell_structure, elements, atom_types)

atom_ids = np.array([1, 2, 3], dtype=np.intc)

In [15]:
%%timeit
x = lmp.system.global_gather_property_subset('x', atom_ids)

4.85 µs ± 130 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [16]:
%%timeit
x = lmp.system.global_gather_property_ordered('x')[:3]

850 µs ± 8.81 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


# Is it faster to set a subset of the atoms?

**YES** not as much so

In [20]:
n = 20
supercell_structure = structure * (n, n, n)
atom_types = np.array([elements.index(atom.specie) for atom in supercell_structure], dtype=np.intc)
lmp = initialize_lammps(supercell_structure, elements, atom_types)

atom_ids = np.array([1, 2, 3], dtype=np.intc)
velocities = np.zeros((3, 3), dtype=np.double)

In [21]:
%%timeit
lmp.system.global_scatter_property_subset('v', atom_ids, velocities)

101 µs ± 805 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [22]:
velocities = np.zeros((lmp.system.total, 3), dtype=np.double)

In [23]:
%%timeit
lmp.system.global_scatter_property_ordered('v', velocities)

658 µs ± 7.09 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


# Benchmark to reflect DFTFIT workflow

Problem: (1, 1, 1) 64 atoms - buckingham potential

 - using `subprocess` - 205.151 ms (5 per second) 1X
 - using `pymatgen-lammps` - 6.8 ms (147 per second) 30X
 - using `lammps-cython` - 295 us (3389 per second) 690X
 
More Realistic Problem: (2, 2, 2) 64 atoms - buckingham potential

 - using `lammps-cython` - 1.55 ms
 
What does this mean? Means that calculations that took 20 hours will now take 50 minutes.

In [72]:
def initialize_lammps(structure, elements, atom_types):
    lmp = lammps.Lammps(units='metal', args=[
        '-log', 'none',
        '-screen', 'none'
    ])
    lmp.command('atom_modify map yes id yes sort 10000 5.0')
    
    # set unit cell
    lmp.box.from_lattice_const(len(elements), 
                               np.array(structure.lattice.abc), 
                               np.array(structure.lattice.angles) * (math.pi/180))
    
    # set element mass
    for element, atom_type in zip(elements, lmp.system.atom_types):
        atom_type.mass = element.atomic_mass
    
    velocities = structure.site_properties.get('velocities')
    lmp.system.create_atoms(atom_types, structure.cart_coords+1e-8, velocities)
    
    lmp.thermo.add('my_ke', 'ke', 'all')
    
    return lmp

def mimic_dftfit_algo(lmp):
    # update potential
    script = """
    pair_style  buck 10
    pair_coeff 1 1 1309362.2766468062 0.104 0.0
    pair_coeff 1 2 {} 0.20199 0.0
    pair_coeff 2 2 2145.7345 0.3 {}
    """
    a_2 = random.uniform(9000, 11000)
    c_3 = random.uniform(20, 40)
    for line in script.format(a_2, c_3).split('\n'):
        lmp.command(line)
        
    # Run static calculations
    lmp.run(0)
    
    # Get Forces, Energy, and Stresses
    S = lmp.thermo.computes['thermo_press'].vector
    values = {
        'forces': lmp.system.forces.copy(),
        'energy': lmp.thermo.computes['thermo_pe'].scalar + lmp.thermo.computes['my_ke'].scalar,
        'stress': np.array([
            [S[0], S[3], S[5]],
            [S[3], S[1], S[4]],
            [S[5], S[4], S[2]]
        ])
    }

In [73]:
supercell_structure = structure * (1, 1, 1)
atom_types = np.array([elements.index(atom.specie) for atom in supercell_structure], dtype=np.intc)
lmp = initialize_lammps(supercell_structure, elements, atom_types)

In [74]:
%%timeit
mimic_dftfit_algo(lmp)

295 µs ± 5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
