In [2]:
import os
from pathlib import Path

import ase
import numpy as np
from ase import Atoms

from colabfit.tools.configuration import AtomicConfiguration
from colabfit.tools.database import MongoDatabase, load_data
from colabfit.tools.property_definitions import potential_energy_pd, atomic_forces_pd, cauchy_stress_pd
from colabfit.tools.property_settings import PropertySettings

In [3]:
# Directory holding the dataset's ``.npz`` archives. Set the
# GFN_XTB_SI_NPZ_DIR environment variable to run this notebook on another
# machine; the original author's local path is kept as the fallback so
# existing setups behave exactly as before.
DATASET_FP = Path(
    os.environ.get(
        "GFN_XTB_SI_NPZ_DIR",
        "/Users/piper/Code/colabfit/data/gfn-xtb-si/npz/",
    )
)

# NOTE(review): drop_database=True is passed for the 'test' database --
# presumably this wipes its existing contents on every run; confirm before
# pointing the notebook at a database whose data should be kept.
client = MongoDatabase('test', drop_database=True)


In [4]:
def reader(file):
    """Read one ``.npz`` archive and return its frames as ASE ``Atoms`` objects.

    The archive must provide the keys ``"numbers"``, ``"xyz"``, ``"energy"``
    and ``"gradients"``. ``"numbers"`` is shared by every frame; the other
    three are iterated in lockstep, one entry per frame. If their lengths
    differ, ``zip`` silently stops at the shortest one.

    Args:
        file: Location of the ``.npz`` archive (``str`` or ``pathlib.Path``).

    Returns:
        list: One ``Atoms`` object per frame, built with ``pbc=False`` and an
        ``info`` dict holding ``"name"`` (the file name without its suffix),
        ``"potential_energy"`` and ``"nuclear_gradients"``.

    Raises:
        KeyError: If one of the required keys is missing from the archive.
    """
    file = Path(file)  # accept plain strings as well as Path objects
    name = file.stem

    # np.load returns a lazily-read NpzFile that keeps the file open; the
    # context manager closes it once all arrays have been pulled into memory.
    with np.load(file) as npz:
        # Each npz[...] access re-reads the member from the archive, so the
        # per-file atomic numbers are fetched once instead of once per frame.
        numbers = npz["numbers"]
        frames = zip(npz["xyz"], npz["energy"], npz["gradients"])
        return [
            Atoms(
                numbers=numbers,
                positions=xyz,
                pbc=False,
                info={
                    "name": name,
                    "potential_energy": energy,
                    "nuclear_gradients": gradients,
                },
            )
            for xyz, energy, gradients in frames
        ]

In [5]:
# Element symbols handed to load_data for this dataset.
element_symbols = ['O', 'Si', 'C', 'H', 'N', 'Cl', 'S', 'F', 'P', 'Br']

# Load every file matching '*.npz' under DATASET_FP through the custom
# `reader`. name_field='name' matches the "name" key that `reader` stores in
# each Atoms.info dict. generator=False presumably returns a fully
# materialized collection rather than a lazy generator -- confirm against
# the load_data documentation.
configurations = load_data(
    file_path=DATASET_FP,
    file_format='folder',
    glob_string='*.npz',
    reader=reader,
    name_field='name',
    elements=element_symbols,
    generator=False,
)

344it [00:05, 58.12it/s]


In [10]:
# Add the potential-energy property definition to the database. The
# 'potential-energy' key used in property_map further down presumably refers
# to this definition, so it is inserted before any data -- confirm the
# ordering requirement against the colabfit documentation.
client.insert_property_definition(potential_energy_pd)


In [11]:
# Provenance attached to the potential-energy entry below.
metadata = {
    'software': {'value': 'Amsterdam Modeling Suite'},
    'method': {'value': 'revPBE'},
}

# A single potential-energy entry: the energy is read from the
# "potential_energy" key that `reader` stores in Atoms.info and is declared
# in Hartree. 'per-atom' is a fixed False value with no units.
energy_entry = {
    'energy': {'field': 'potential_energy', 'units': 'Hartree'},
    'per-atom': {'value': False, 'units': None},
    '_metadata': metadata,
}

# Maps a property name to the list of entries describing how to fill it.
property_map = {'potential-energy': [energy_entry]}

In [12]:
# Insert the loaded configurations together with the properties described by
# property_map, collecting whatever insert_data yields into a list.
ids = list(
    client.insert_data(
        configurations,
        property_map=property_map,
        generator=False,
        verbose=True,
    )
)

# Transpose the list of pairs into two parallel tuples -- presumably
# (configuration ids, data-object ids), going by how they are used later.
all_co_ids, all_do_ids = zip(*ids)

Preparing to add configurations to Database: 100%|██████████| 4931/4931 [00:13<00:00, 376.52it/s]


In [13]:
# Fetch the 'hash' field of every stored configuration whose hash is among
# the ids returned by insert_data, as a flat Python list.
co_hash_query = {'hash': {'$in': all_co_ids}}
co_ids = client.get_data(
    'configurations',
    fields='hash',
    query=co_hash_query,
    ravel=True,
).tolist()

# Group all of those configurations into one configuration set and keep its
# id in a list, which is the form insert_dataset is given later.
desc = "All configurations from GFN-xTB dataset"
cs_ids = []
cs_id = client.insert_configuration_set(
    co_ids,
    description=desc,
    name='GFN-xTB',
)
cs_ids.append(cs_id)

In [14]:
# Create the dataset record from the configuration set(s) and the
# data-object ids gathered during insertion.
ds_id = client.insert_dataset(
    cs_ids,
    all_do_ids,
    name='GFN-xTB_jcim_2021',
    # One list entry per author: the two names were previously fused into a
    # single comma-separated string, which stored them as one author.
    authors=[
        'L. Komissarov',
        'T. Verstraelen',
    ],
    links=[
        'https://doi.org/10.24435/materialscloud:14-4m',
        'https://doi.org/10.1021/acs.jcim.1c01170'
    ],
    # NOTE(review): the progress output recorded in this notebook reports
    # 4893 configurations aggregated, while the text below says 10,000 --
    # confirm which figure is intended.
    description = '10,000 configurations of organosilicon compounds '
    'with energies predicted by an improved GFN-xTB Hamiltonian parameterization, '
    'using revPBE.',
    verbose=True,
)

Aggregating configuration info: 100%|██████████| 4893/4893 [00:01<00:00, 4857.70it/s]
Aggregating data_object info: 100%|██████████| 4893/4893 [00:00<00:00, 46017.56it/s]
Updating CA->DS relationships: 100%|██████████| 4893/4893 [00:00<00:00, 42151.60it/s]
