In [176]:
from argparse import ArgumentParser
import ase
from ase import Atoms
from ase.calculators.calculator import PropertyNotImplementedError
from colabfit.tools.database import MongoDatabase, load_data
from colabfit.tools.property_definitions import (
    atomic_forces_pd,
    cauchy_stress_pd,
)
import h5py
import numpy as np
from pathlib import Path
import sys


In [169]:
client = MongoDatabase('test2',drop_database=True)

In [167]:
DATASET_FP = Path('/Users/piper/Code/colabfit/data/structures_packed/structures/')

In [161]:
def reader(filepath: Path):
    atoms = []
    with h5py.File(filepath) as f:
        file_key = list(f.keys())[0]

        # Do not need indexing (i.e., one value per configuration)
        cells = np.array(f[file_key]['structures']['chunk_arrays']['cell'])
        pbcs = list(f[file_key]['structures']['chunk_arrays']['pbc'])
        names = [id.decode() for id in f[file_key]['structures']['chunk_arrays']['identifier']]
        stress = np.array( f[file_key]['structures']['chunk_arrays']['stress'])
        energy = np.array( f[file_key]['structures']['chunk_arrays']['energy'])
        start_index = np.array(f[file_key]['structures']['chunk_arrays']['start_index'])
        num_atoms = np.array(f[file_key]['structures']['chunk_arrays']['length'])
        
        
        # Need indexing (multiple rows per configuration)
        forces = np.array( f[file_key]['structures']['element_arrays']['forces'])
        coords = np.array( f[file_key]['structures']['element_arrays']['positions'])
        element = np.array(["Mg" for x in coords])

    # Remove first index to avoid blank array
    start_index = start_index[1:]
 
    forces = np.split(forces, start_index)
    coords = np.split(coords, start_index)
    element = np.split(element, start_index)

    for coords, element, pbcs, cells, stress, energy, forces, names in zip(
        coords, element, pbcs, cells, stress, energy, forces, names):
        atom = Atoms(positions=coords, symbols=element, pbc=pbcs, cell=cells)
        atom.info['stress'] = stress
        atom.info['energy'] = energy
        atom.info['forces'] = forces
        atom.info['name'] = f"{file_key}_{names}"
        atoms.append(atom)
    
    return atoms
    

In [168]:
configurations = load_data(
    file_path=DATASET_FP,
    file_format='folder', 
    name_field='name',  
    elements=["Mg"],
    reader=reader,
    glob_string='*.h5',
    generator=False,
)

9it [00:53,  5.97s/it]


In [178]:
client.insert_property_definition(atomic_forces_pd)
client.insert_property_definition(cauchy_stress_pd)

In [182]:
metadata = { 'software': {'value':'MLIP'},
            'method': {'value':'DFT'},
            # not clear what energy was measured
            'energy': {'field': 'energy'}
}

property_map = {
        
    'cauchy-stress': [{
        'stress':   {'field': 'stress',  'units': 'Unknown'},
            '_metadata': metadata

    }],
    'atomic-forces': [{
        'forces':   {'field': 'forces',  'units': 'Unknown'},
            '_metadata': metadata

    }],
}

In [183]:
ids = list(client.insert_data(
    configurations,
    property_map=property_map,
    generator=False,
    verbose=True
))

all_co_ids, all_do_ids = list(zip(*ids))

Preparing to add configurations to Database: 100%|██████████| 47253/47253 [03:20<00:00, 235.90it/s]


In [184]:
cs_regexes = [
    ['mg_edmonds_2022_Everything',
     'Everything*',
     'Configurations from mg_edmonds_2022 in Everything dataset (defined by dataset author)'
    ],
    ['mg_edmonds_2022_EverythingNoShear',
     'EverythingNoShear*',
     'Configurations from mg_edmonds_2022 in EverythingNoShear dataset'
    ],
    ['mg_edmonds_2022_Hydro',
     'Hydro*',
     'Configurations from mg_edmonds_2022 in Hydro dataset'
    ],
    ['mg_edmonds_2022_IntMin',
     'IntMin*',
     'Configurations from mg_edmonds_2022 in IntMin dataset'
    ],
    ['mg_edmonds_2022_RandSPG',
     'RandSPG*',
     'Configurations from mg_edmonds_2022 in RandSPG dataset'
    ],
    ['mg_edmonds_2022_Rattle',
     'Rattle*',
     'Configurations from mg_edmonds_2022 in Rattle dataset'
    ],
    ['mg_edmonds_2022_Shear',
     'Shear*',
     'Configurations from mg_edmonds_2022 in Shear dataset'
    ],
    ['mg_edmonds_2022_VolMin',
     'VolMin*',
     'Configurations from mg_edmonds_2022 in VolMin dataset'
    ],
]

cs_ids = []

for i, (name, regex, desc) in enumerate(cs_regexes):
    co_ids = client.get_data(
        'configurations',
        fields='hash',
        query={'hash': {'$in': all_co_ids}, 'names': {'$regex': regex}},
        ravel=True
    ).tolist()

    print(f'Configuration set {i}', f'({name}):'.rjust(22), f'{len(co_ids)}'.rjust(7))

    cs_id = client.insert_configuration_set(co_ids, description=desc,name=name)

    cs_ids.append(cs_id)


Configuration set 0 (mg_edmonds_2022_Everything):   16748
Configuration set 1 (mg_edmonds_2022_EverythingNoShear):   12371
Configuration set 2 (mg_edmonds_2022_Hydro):    4547
Configuration set 3 (mg_edmonds_2022_IntMin):     922
Configuration set 4 (mg_edmonds_2022_RandSPG):     927
Configuration set 5 (mg_edmonds_2022_Rattle):    4583
Configuration set 6 (mg_edmonds_2022_Shear):   16748
Configuration set 7 (mg_edmonds_2022_VolMin):     926


In [None]:
client.insert_dataset(
    cs_ids,
    all_do_ids,
    name='MG_edmonds_2022',
    authors=[
        "M. Poul"
    ],
    links=[
        "https://github.com/eisenforschung/magnesium-mtp-training-data",
        "doi:10.17617/3.A3MB7Z",
        "https://arxiv.org/abs/2207.04009"
    ],
    description = '16748 configurations of magnesium with gathered energy'
    ', stress and forces at the DFT level of theory.',
    verbose=True,
)