In [None]:
from colabfit.tools.database import MongoDatabase, load_data
from colabfit.tools.property_settings import PropertySettings

# Open the ColabFit MongoDatabase client for 'example' with nprocs=1.
# NOTE(review): `drop_database=True` was left commented out by the original
# author; presumably it wipes the existing database -- confirm before enabling.
client = MongoDatabase('example', nprocs=1)

In [None]:
import numpy as np
from colabfit.tools.configuration import Configuration

In [None]:
# For each ColabFit property name, the field it is read from and its units.
# Both fields below are the `info` keys that `reader` fills in.
_energy_forces_stress_fields = {
    'energy': dict(field='energy', units='eV'),
    'per-atom': dict(field='per-atom', units=None),
}

property_map = {'energy-forces-stress': _energy_forces_stress_fields}

In [None]:
def reader(file_path):
    """Yield one ``Configuration`` for every record stored in ``file_path``.

    A record is read line by line in this order:

    1. header -- the text before the first ``#`` becomes the configuration name
    2. scaling factor (ignored)
    3. three lines of whitespace-separated floats forming the 3x3 cell
    4. a line whose first token is the number of atoms
    5. coordinate type (ignored)
    6. one line per atom: ``x y z fx fy fz symbol``
    7. a line whose first token is the energy

    Whitespace-only lines found where a header is expected (for example a
    trailing blank line at the end of the file) are skipped instead of being
    parsed as a record.

    Args:
        file_path: path of the text file to parse.

    Yields:
        Configuration: built with ``pbc=True`` and carrying ``info['_name']``,
        ``info['energy']`` and ``info['per-atom'] = False``. The force columns
        are read from the file but not stored.
    """
    with open(file_path, 'r') as infile:
        while True:
            header = infile.readline()

            # readline() returns '' only once the end of the file is reached.
            if header == '':
                break

            # A blank line is a separator / trailing newline, not a record.
            if not header.strip():
                continue

            name = header.split('#')[0].strip()

            infile.readline()  # scaling factor; always 1

            # The three lattice vectors, one per line.
            cell = np.array([
                [float(value) for value in infile.readline().split()]
                for _ in range(3)
            ])

            natoms = int(infile.readline().split()[0])

            infile.readline()  # coordinates type; always 'C' for Cartesian

            positions = []
            symbols = []
            for _ in range(natoms):
                # Force components are present on each line but are not kept.
                x, y, z, _fx, _fy, _fz, symbol = infile.readline().split()

                positions.append([float(x), float(y), float(z)])
                symbols.append(symbol)

            energy = float(infile.readline().split()[0])

            # Assuming periodic in all directions
            c = Configuration(symbols=symbols, cell=cell, positions=np.array(positions), pbc=True)

            c.info['_name'] = name
            c.info['energy'] = energy
            c.info['per-atom'] = False

            yield c

In [None]:
# Parse every '*.dat' file in the data folder with the custom `reader` and
# collect the resulting configurations in a list.
load_options = dict(
    file_path='/colabfit/data/mishin/',
    file_format='folder',
    glob_string='*.dat',
    reader=reader,
    name_field='_name',
    default_name='Ta_PINN_2021',
    elements=['Ta'],
    generator=False,
    verbose=True,
)

configurations = list(load_data(**load_options))

In [None]:
# Settings attached to every 'energy-forces-stress' property inserted below.
pso_description = (
    'energies/forces. All supercell energies (per atom) '
    'were shifted by a constant such that the DFT energy of the '
    'equilibrium BCC structure is equal to the negative experimental '
    'cohesive energy of BCC Ta.'
)

pso = PropertySettings(
    method='VASP',
    description=pso_description,
    files=None,
    labels=['PBE', 'GGA'],
)

In [None]:
# Register the 'energy-forces-stress' property definition with the database.
# Every field entry gives its type, whether it carries units, its extent,
# whether it is required, and a human-readable description.
client.insert_property_definition({
    'property-id': 'energy-forces-stress',
    'property-title': 'Basic outputs from a static calculation',
    'property-description': (
        'Energy, forces, and stresses from a calculation of a '
        'static configuration. Energies must be specified to be '
        'per-atom or supercell. If a reference energy has been '
        'used, this must be specified as well.'
    ),

    'energy': {
        'type': 'float',
        'has-unit': True,
        'extent': [],
        'required': False,
        'description': 'The potential energy of the system.',
    },
    'forces': {
        'type': 'float',
        'has-unit': True,
        'extent': [":", 3],
        'required': False,
        'description': 'The [x,y,z] components of the force on each particle.',
    },
    'stress': {
        'type': 'float',
        'has-unit': True,
        'extent': [3, 3],
        'required': False,
        'description': 'The full Cauchy stress tensor of the simulation cell',
    },

    # NOTE(review): this description reads as inverted relative to the field
    # name ("If True ... has NOT been divided") -- confirm the intended meaning
    # before relying on it; the text is kept unchanged here.
    'per-atom': {
        'type': 'bool',
        'has-unit': False,
        'extent': [],
        'required': True,
        'description': (
            'If True, "energy" is the total energy of the system, '
            'and has NOT been divided by the number of atoms in the '
            'configuration.'
        ),
    },
    'reference-energy': {
        'type': 'float',
        'has-unit': True,
        'extent': [],
        'required': False,
        'description': (
            'If provided, then "energy" is the energy (either of '
            'the whole system, or per-atom) LESS the energy of '
            'a reference configuration (E = E_0 - E_reference). '
            'Note that "reference-energy" is just provided for '
            'documentation, and that "energy" should already have '
            'this value subtracted off. The reference energy must '
            'have the same units as "energy".'
        ),
    },
})

In [None]:
# Insert the configurations together with their mapped properties and keep
# the identifiers that come back.
ids = list(
    client.insert_data(
        configurations,
        generator=False,
        verbose=True,
        property_map=property_map,
        property_settings={'energy-forces-stress': pso},
    )
)

# Transpose the returned pairs into two parallel tuples: the first elements
# (used below as configuration IDs) and the second (used as property IDs).
all_co_ids, all_pr_ids = zip(*ids)

In [None]:
# Show only the first few ID pairs; displaying the whole list floods the
# cell output for a dataset of any real size.
ids[:10]

In [None]:
# Number of distinct configuration IDs and distinct property IDs, in that order.
tuple(len(set(id_group)) for id_group in (all_co_ids, all_pr_ids))

Four configurations in this dataset are each linked to more than one property. In every case, the linked properties differ only in their computed energies, by about 1e-4 eV or less. The next cell prints the names of those configurations together with the energy of each linked property.

In [None]:
# Select the inserted configurations whose 'relationships.properties' array
# has an element at index 1, i.e. that are linked to two or more properties.
multi_property_query = {
    '_id': {'$in': all_co_ids},
    'relationships.properties.1': {'$exists': True},
}

for co_doc in client.configurations.find(multi_property_query):
    print(co_doc['names'])

    # Every inserted property that points back at this configuration.
    linked_properties = client.properties.find({
        '_id': {'$in': all_pr_ids},
        'relationships.configurations': co_doc['_id'],
    })
    for pr_doc in linked_properties:
        print('\t', pr_doc['energy-forces-stress']['energy'])

In [None]:
# Maps a regular expression, matched against configuration names, to the
# description of the configuration set built from the matching configurations.
cs_regexes = {
    'bcc.small.strain':
        'BCC structures with small homogeneous strains',
    'eos_A15':
        'A15 structures with isotropic strains at 0K',
    'eos_bcc':
        'BCC structures with isotropic strains at 0K',
    'eos_beta-Ta-shifted':
        'beta-Ta structures with isotropic strains at 0K',
    'eos_diamond':
        'Diamond structures with isotropic strains at 0K',
    'eos_fcc':
        'FCC structures with isotropic strains at 0K',
    'eos_hcp':
        'HCP structures with isotropic strains at 0K',
    'eos_hex':
        'Simple hexagonal structures with isotropic strains at 0K',
    'eos_sc':
        'Simple cubic structures with isotropic strains at 0K',
    'eos_dimer':
        'Dimer structures with isotropic strains at 0K',
    'eos_trimer-linear':
        'Linear trimer structures with isotropic strains at 0K',
    'eos_trimer-triangle':
        'Triangular trimer structures with isotropic strains at 0K',
    'eos_tetrahedron':
        'Tetrahedron structures with isotropic strains at 0K',
    'twinpath':
        'Samples along the twinning-antitwinning deformation path',
    'defpath_hex':
        'Samples along the hexagonal deformation path from BCC to HCP',
    'defpath_ortho':
        'Samples along the orthorhombic deformation path from BCC to BCT',
    'defpath_tetra':
        'Samples along the tetragonal deformation path from BCC to FCC to BCT',
    'defpath_trigo':
        'Samples along the trigonal deformation path from BCC to SC to FCC',
    'gamma_110':
        'Samples of the gamma surface in the (110) plane',
    'gamma_112':
        'Samples of the gamma surface in the (112) plane',
    'liquid':
        'NVT-MD snapshots of liquid structures at various temperatures',
    'bcc.nvt':
        'NVT-MD snapshots of BCC structures at various temperatures',
    'shockwave_100':
        'BCC structures with large uniaxial strain along the [100] direction',
    'shockwave_110':
        'BCC structures with large uniaxial strain along the [110] direction',
    'shockwave_111':
        'BCC structures with large uniaxial strain along the [111] direction',
    'dislocation':
        'NEB images from a dislocation relaxation run',
    'cluster':
        'Spherical clusters up to 2nd-/3rd-/4th- nearest-neighbor distances',
    'gb_111':
        'Sigma3 (111) grain boundary structures',
    'gb_112':
        'Sigma3 (112) grain boundary structures',
    'gb_210':
        'Sigma5 (210) grain boundary structures',
    'gb_310':
        'Sigma5 (310) grain boundary structures',
    'vacancy':
        'Vacancy configurations from NVT-MD and NEB calculations',
    'interstitial':
        'Interstitial configurations from NVT-MD and NEB calculations',
    'surface':
        'Relaxed (100), (110), and (111) surface structures, plus NVT-MD samples at 2500K',
}

cs_ids = []

for index, (pattern, description) in enumerate(cs_regexes.items()):
    # IDs of the inserted configurations whose names match this pattern.
    name_query = {'_id': {'$in': all_co_ids}, 'names': {'$regex': pattern}}
    matched_ids = client.get_data(
        'configurations',
        fields='_id',
        query=name_query,
        ravel=True,
    ).tolist()

    # One summary line per set: index, right-aligned pattern, right-aligned count.
    pattern_tag = f'({pattern}):'
    print(f'Configuration set {index}', f'{pattern_tag:>22}', f'{len(matched_ids):>7}')

    cs_ids.append(
        client.insert_configuration_set(matched_ids, description=description)
    )

In [None]:
# Free-text description stored with the dataset.
# NOTE(review): the text contains a double space before "[111]" ("a  [111]
# screw dislocation"); it looks like a character was lost there -- confirm
# against the linked paper. The string is kept unchanged here.
dataset_description = (
    "A dataset consisting of the energies of supercells "
    "containing from 1 to 250 atoms. The supercells represent energy-volume relations for 8 "
    "crystal structures of Ta, 5 uniform deformation paths between pairs of structures, vacancies, "
    "interstitials, surfaces with low-index orientations, 4 symmetrical tilt grain boundaries, "
    "γ-surfaces on the (110) and (211) fault planes, a "
    " [111] screw dislocation, liquid Ta, and "
    "several isolated clusters containing from 2 to 51 atoms. Some of the supercells contain "
    "static atomic configurations. However, most are snapshots of ab initio MD simulations at "
    "different densities, and temperatures ranging from 293 K to 3300 K. The BCC structure "
    "was sampled in the greatest detail, including a wide range of isotropic and uniaxial deformations."
)

# Create the dataset from the configuration sets and properties inserted above.
ds_id = client.insert_dataset(
    cs_ids=cs_ids,
    pr_ids=all_pr_ids,
    name='Ta_PINN_2021',
    authors=['Yi-Shen Lin', 'Ganga P. Purja Pun', 'Yuri Mishin'],
    links=['https://arxiv.org/abs/2101.06540'],
    description=dataset_description,
    resync=True,
    verbose=True,
)
ds_id

In [None]:
# Maps a regular expression, matched against configuration names, to the
# labels applied to every matching configuration. Families that share a
# label pattern are generated; each entry still gets its own list object.
configuration_label_regexes = {
    'bcc.small.strain': ['bcc', 'strain'],
    # Equation-of-state scans: structure label followed by 'strain' and 'eos'.
    **{
        f'eos_{suffix}': [structure, 'strain', 'eos']
        for suffix, structure in (
            ('A15', 'a15'),
            ('bcc', 'bcc'),
            ('beta-Ta-shifted', 'beta-ta'),
            ('diamond', 'diamond'),
            ('fcc', 'fcc'),
            ('hcp', 'hcp'),
            ('hex', 'sh'),
            ('sc', 'sc'),
            ('dimer', 'dimer'),
            ('trimer-linear', 'trimer'),
            ('trimer-triangle', 'trimer'),
            ('tetrahedron', 'tetrahedron'),
        )
    },
    'twinpath': ['twinning', 'anti-twinning', 'deformation_path'],
    **{
        f'defpath_{kind}': ['deformation_path']
        for kind in ('hex', 'ortho', 'tetra', 'trigo')
    },
    **{f'gamma_{plane}': ['gamma_surface'] for plane in ('110', '112')},
    'liquid': ['md', 'liquid', 'nvt'],
    'bcc.nvt': ['md', 'bcc', 'nvt'],
    **{f'shockwave_{axis}': ['bcc', 'strain'] for axis in ('100', '110', '111')},
    'dislocation': ['dislocation'],
    'cluster': ['cluster'],
    **{
        f'gb_{plane}': ['grain_boundary', sigma]
        for plane, sigma in (
            ('111', 'sigma3'),
            ('112', 'sigma3'),
            ('210', 'sigma5'),
            ('310', 'sigma5'),
        )
    },
    'vacancy': ['vacancy', 'nvt', 'md'],
    'interstitial': ['interstitial', 'nvt', 'md'],
    'surface': ['surface', 'nvt', 'md'],
}

# Apply each label list to the dataset's configurations whose names match
# the corresponding regular expression.
for name_pattern, pattern_labels in configuration_label_regexes.items():
    name_query = {'names': {'$regex': name_pattern}}
    client.apply_labels(
        dataset_id=ds_id,
        collection_name='configurations',
        query=name_query,
        labels=pattern_labels,
        verbose=True,
    )

In [None]:
# Fetch the dataset through the `ds_id` returned by `client.insert_dataset`
# earlier in this notebook. A hard-coded ID literal is only valid for the
# database state it was copied from and would silently override the real one.
dataset = client.get_dataset(ds_id, resync=True)['dataset']

In [None]:
    dataset.aggregated_info['property_fields']

In [None]:
# Histogram every aggregated property field over the dataset's properties,
# using a logarithmic y-axis.
histogram_fields = dataset.aggregated_info['property_fields']

fig = client.plot_histograms(
    histogram_fields,
    ids=dataset.property_ids,
    yscale='log',
    verbose=True,
)

In [None]:
# Write the dataset summary as README.md into a folder named after the dataset.
markdown_folder = f'/colabfit/markdown/{dataset.name}'

client.dataset_to_markdown(
    ds_id=ds_id,
    base_folder=markdown_folder,
    html_file_name='README.md',
    data_format='mongo',
    data_file_name=None,
    yscale='log',
)