In [None]:
from colabfit.tools.database import MongoDatabase, load_data
from colabfit.tools.property_settings import PropertySettings

import numpy as np

# Database handle used by every later cell (inserting data, configuration
# sets, the dataset and labels, then querying and plotting).
# NOTE(review): 'colabfit_rebuild' is presumably the MongoDB database name and
# nprocs=1 presumably restricts the client to a single worker process -
# confirm against the colabfit-tools documentation.
client = MongoDatabase('colabfit_rebuild', nprocs=1)

In [None]:
# Name passed to the loader as `default_name`.
name = 'W_PRB2019'

# Load the tungsten training set from the XYZ file and materialise the result
# as a list so it can be handed to `client.insert_data` later.
# NOTE(review): `name_field='config_type'` presumably takes each
# configuration's name from its `config_type` entry, falling back to
# `default_name` when that entry is missing - confirm against `load_data`.
# NOTE(review): the path is absolute and machine-specific; the file must exist
# at exactly this location for the notebook to run.
configurations = list(load_data(
    file_path='/colabfit/data/acclab_helsinki/W/2019-05-24/training-data/db_W.xyz',
    file_format='xyz',
    name_field='config_type',
    elements=['W'],
    default_name=name,
    verbose=True,
))

In [None]:
# Register the property definition that `property_map` refers to by its
# 'property-id'. All three fields are optional ('required': False), so a
# configuration may carry any subset of energy, forces and stress.
# NOTE(review): 'stress' is declared with the same extent as 'forces'
# ([':', 3]); a 3x3 tensor satisfies it, but confirm whether [3, 3] was
# intended. The 'property-description' also mentions only energies and forces.
client.insert_property_definition({
    'property-id': 'energy-forces-stress',
    'property-title': 'A default property for storing energies, forces, and stress',
    'property-description': 'Energies and forces computed using DFT',

    'energy': {'type': 'float', 'has-unit': True, 'extent': [],      'required': False, 'description': 'Cohesive energy'},
    'forces': {'type': 'float', 'has-unit': True, 'extent': [':',3], 'required': False, 'description': 'Atomic forces'},
    'stress': {'type': 'float', 'has-unit': True, 'extent': [':',3], 'required': False, 'description': 'Stress'},
})

In [None]:
# Maps each field of the 'energy-forces-stress' property definition onto the
# field of the loaded configurations that holds its value, together with the
# units of that value.
property_map = {
    'energy-forces-stress': [{
        # ColabFit name: {'field': ASE field name, 'units': str}
        'energy': {'field': 'energy', 'units': 'eV'},
        'forces': {'field': 'force', 'units': 'eV/Ang'},
        # The stress is read from the 'virial' field, which `tform` rescales
        # in place before insertion; 'GPa' describes the rescaled value.
        'stress': {'field': 'virial', 'units': 'GPa'},
        # `tform` sets info['per-atom'] = False on every configuration.
        'per-atom': {'field': 'per-atom', 'units': None},

        # Settings attached to the inserted properties.
        # NOTE(review): presumably stored as a PropertySettings record by
        # `insert_data` - confirm against colabfit-tools.
        '_settings': {
            '_method': 'VASP',
            '_description': 'energy/forces/stresses',
            '_files': None,
            '_labels': ['PBE', 'GGA']
        }
    }]
}

In [None]:
def tform(c):
    """Prepare one configuration, in place, before it is inserted.

    Two things are written into ``c.info``:

    * ``'per-atom'`` is always set to ``False``.
    * If a ``'virial'`` entry exists it is replaced by
      ``-(virial / volume) * 160.21766208``, where ``volume`` is the absolute
      determinant of ``c.cell``. The factor 160.21766208 converts eV/Ang^3 to
      GPa, matching the 'GPa' units declared for this field in
      ``property_map``.

    The virial is coerced with ``np.asarray`` so that list-like values (for
    example nested lists) are handled as well as NumPy arrays.

    Parameters
    ----------
    c : object with ``info`` (dict) and ``cell`` (3x3 array-like) attributes
        Presumably an ASE ``Atoms``-like configuration - confirm against the
        caller (``client.insert_data(..., transform=tform)``).

    Returns
    -------
    None
        The configuration is modified in place.

    Notes
    -----
    The conversion is not idempotent: applying ``tform`` twice to the same
    configuration rescales the virial twice. A cell with zero volume leads to
    a division by zero (NumPy yields inf/nan rather than raising).
    """
    # 1 eV/Ang^3 expressed in GPa.
    ev_per_ang3_to_gpa = 160.21766208

    c.info['per-atom'] = False

    if 'virial' in c.info:
        # abs() keeps the volume positive for left-handed cell matrices.
        volume = np.abs(np.linalg.det(np.array(c.cell)))
        virial = np.asarray(c.info['virial'])
        # The leading minus sign flips the sign convention.
        # NOTE(review): presumably stress = -virial/volume for this data
        # source - confirm against the dataset's documentation.
        c.info['virial'] = (virial/volume)*-ev_per_ang3_to_gpa

In [None]:
# Insert every loaded configuration together with the properties described by
# `property_map`; `tform` is applied to each configuration as the transform.
ids = list(
    client.insert_data(
        configurations,
        property_map=property_map,
        generator=False,
        transform=tform,
        verbose=True,
    )
)

# Transpose the inserted entries into two parallel tuples: the first item of
# every entry goes to `all_co_ids`, the second to `all_pr_ids`.
all_co_ids, all_pr_ids = zip(*ids)

In [None]:
# Number of distinct entries in `all_co_ids` (duplicates collapse in the set).
len(set(all_co_ids))

In [None]:
# Number of distinct entries in `all_pr_ids` (duplicates collapse in the set).
len(set(all_pr_ids))

In [None]:
# Maps a regular expression, matched against configuration names, to the
# description of the configuration set built from the matching configurations.
# Insertion order matters: configuration sets are created in this order.
configuration_set_regexes = {
    '.*':
        'Configurations with a specific focus on properties relevant for '\
        'simulations of radiation-induced collision cascades and the damage '\
        'they produce, including a realistic repulsive potential for the '\
        'short-range many-body cascade dynamics and a good description of the '\
        'liquid phase',
    '^sc':
        'Simple cubic crystals with random lattice distortions',
    '^liquid':
        'Liquid W with densities around the experimental density of 17.6 g/cm^3',
    'short_range':
        'BCC crystals with random interstitial atom defects to capture '\
        'short-range many-body dynamics',
    '^vacancy':
        'Single-vacancy configurations',
    'di-vacancy':
        'Divacancy configurations',
    'phonon':
        'MD snapshots taken at 1000K for three different volumes',
    'slice_sample':
        "Randomly distorted primitive bcc unit cells drawn from Szlachta et "\
        "al.'s database",
    'fcc':
        'FCC crystals with random lattice distortions',
    'bcc_distorted':
        'BCC configurations with random strains up to +/- 30% to help train '\
        'the far-from-equilibrium elastic response',
    'dimer':
        'Dimers to fit to the full dissociation curve starting from 1.1 '\
        'angstrom',
    'surface_111':
        '(111) surface configurations',
    'C15':
        'C15 configurations with random lattice distortions',
    'dia':
        'Diamond configurations with random lattice distortions',
    'hcp':
        'HCP configurations with random lattice distortions',
    'surf_liquid':
        'Damaged and half-molten (110) and (100) surfaces',
    # Previously carried the self-interstitial description copied from '^sia';
    # now consistent with the other surface_* entries.
    'surface_100':
        '(100) surface configurations',
    '^sia':
        'Configurations with single self-interstitial defects',
    'surface_112':
        '(112) surface configurations',
    'surface_110':
        '(110) surface configurations',
    'tri-vacancy':
        'Trivacancy configurations',
    'A15':
        'A15 configurations with random lattice distortions',
    'isolated_atom':
        'Isolated W atom',
    'di-SIA':
        'Configurations with two self-interstitial defects',
}

# IDs of the configuration sets created below, in the iteration order of
# `configuration_set_regexes`.
cs_ids = []

for index, pattern in enumerate(configuration_set_regexes):
    set_description = configuration_set_regexes[pattern]

    # Only consider configurations inserted by this notebook whose names match
    # the current regular expression.
    selection = {'_id': {'$in': all_co_ids}, 'names': {'$regex': pattern}}
    matched_ids = client.get_data(
        'configurations', fields='_id', query=selection, ravel=True
    ).tolist()

    # One summary line per set: index, pattern, number of matches.
    header = f'Configuration set {index}'
    print(header, f'({pattern}):'.rjust(22), f'{len(matched_ids)}'.rjust(7))

    cs_ids.append(
        client.insert_configuration_set(matched_ids, description=set_description)
    )

In [None]:
# Create the dataset from the configuration sets and properties inserted
# above; the last line displays the returned dataset ID as the cell output.
ds_id = client.insert_dataset(
    cs_ids=cs_ids,
    pr_ids=all_pr_ids,
    name='W_PRB2019',
    authors=['J. Byggmästar', 'A. Hamedani', 'K. Nordlund', 'F. Djurabekova'],
    links=[
        'https://journals.aps.org/prb/abstract/10.1103/PhysRevB.100.144105',
        'https://gitlab.com/acclab/gap-data/-/tree/master/W/2019-05-24',
    ],
    description=(
        'This dataset was originally designed to fit a GAP '
        'potential with a specific focus on properties relevant for simulations '
        'of radiation-induced collision cascades and the damage they produce, '
        'including a realistic repulsive potential for the short-range many-body '
        'cascade dynamics and a good description of the liquid phase.'
    ),
    resync=True,
    verbose=True,
)
ds_id

In [None]:
# Maps a regular expression, matched against configuration names, to the
# label (a single string) or labels (a list of strings) to attach to every
# matching configuration.
configuration_label_regexes = {
    'phonon':        'aimd',
    'hcp':           'hcp',
    '^sc':           ['sc', 'strain'],
    '^liquid':       'liquid',
    'short_range':   ['bcc', 'interstitial', 'warning', 'large_forces', 'repulsive'],
    '^vacancy':      'vacancy',
    'di-vacancy':    ['vacancy', 'divacancy'],
    'fcc':           ['fcc', 'strain'],
    'bcc_distorted': ['bcc', 'strain'],
    'dimer':         ['dimer', 'warning', 'large_forces', 'repulsive'],
    'surface':       'surface',
    'C15':           ['c15', 'strain'],
    'dia':           ['diamond', 'strain'],
    'surf_liquid':   'surface',
    '^sia$':         'interstitial',
    'tri-vacancy':   ['vacancy', 'divacancy', 'trivacancy'],
    'A15':           ['a15', 'strain'],
    'isolated_atom': 'isolated_atom',
    'di-SIA':        'interstitial',
    'slice_sample':  ['bcc', 'strain'],
}

# Attach the labels of every pattern to the configurations of this dataset
# whose names match that pattern.
for pattern in configuration_label_regexes:
    name_filter = {'_id': {'$in': all_co_ids}, 'names': {'$regex': pattern}}
    client.apply_labels(
        dataset_id=ds_id,
        collection_name='configurations',
        query=name_filter,
        labels=configuration_label_regexes[pattern],
        verbose=True,
    )

In [None]:
# Fetch the dataset record again (with resync requested) and print each entry
# of its aggregated information on its own line.
dataset = client.get_dataset(ds_id, resync=True, verbose=True)['dataset']

for key, value in dataset.aggregated_info.items():
    print(key, value)

In [None]:
# Display the 'property_fields' entry of the aggregated info; the same value
# is passed to `client.plot_histograms` in the next cell.
dataset.aggregated_info['property_fields']

In [None]:
# Plot histograms for the dataset's property fields, restricted to the
# dataset's own property IDs, with a logarithmic y scale.
fig = client.plot_histograms(dataset.aggregated_info['property_fields'], ids=dataset.property_ids, yscale='log')