In [None]:
from colabfit.tools.database import MongoDatabase, load_data
from colabfit.tools.property_settings import PropertySettings

import numpy as np

# Handle to the 'colabfit_rebuild' database; every later cell reads and writes
# through this object.
# NOTE(review): nprocs=1 presumably restricts the client to one worker
# process -- confirm against the MongoDatabase documentation.
client = MongoDatabase('colabfit_rebuild', nprocs=1)

In [None]:
name = 'V_PRM2019'

# Options handed to load_data, collected in one place so the data source and
# the parsing choices can be reviewed at a glance.
load_options = {
    'file_path': '/colabfit/data/acclab_helsinki/V/training-data/db_V.xyz',
    'file_format': 'xyz',
    'name_field': 'config_type',
    'elements': ['V'],
    'default_name': name,
    'verbose': True,
}

# Materialise everything load_data yields into a list so it can be passed to
# the insertion cell as a whole.
configurations = list(load_data(**load_options))

In [None]:
# One row per property field: (ColabFit name, ASE field name, units).
_source_fields = (
    ('energy', 'energy', 'eV'),
    ('forces', 'force', 'eV/Ang'),
    ('stress', 'virial', 'GPa'),
    ('per-atom', 'per-atom', None),
)

# Expand the table into the {'field': ..., 'units': ...} form used by the map.
_efs_entry = {
    colabfit_name: {'field': ase_field, 'units': units}
    for colabfit_name, ase_field, units in _source_fields
}

# Calculation metadata stored alongside the fields under '_settings'.
_efs_entry['_settings'] = {
    '_method': 'VASP',
    '_description': 'energy/forces/stresses',
    '_files': None,
    '_labels': ['PBE', 'GGA'],
}

# Property definition name -> list of field maps for that property.
property_map = {'energy-forces-stress': [_efs_entry]}

In [None]:
def tform(c):
    """Adjust one configuration's ``info`` mapping in place.

    ``c.info['per-atom']`` is always set to ``False``. If ``c.info`` has a
    ``'virial'`` entry, it is overwritten with the virial divided by the
    absolute determinant of the cell matrix and multiplied by -160.21766208
    (160.21766208 is the eV/Ang^3 -> GPa conversion factor).

    Nothing is returned; the caller sees the changes on ``c`` itself.
    """
    info = c.info
    info['per-atom'] = False

    # Nothing to convert for configurations without a virial entry.
    if 'virial' not in info:
        return

    # |det(cell)| is the cell volume.
    cell_matrix = np.array(c.cell)
    volume = np.abs(np.linalg.det(cell_matrix))

    # NOTE(review): the negative factor presumably maps the file's virial sign
    # convention onto the stress convention -- confirm against the data source.
    info['virial'] = (info['virial'] / volume) * -160.21766208

In [None]:
# Insert every loaded configuration, applying tform to each one and mapping
# its fields through property_map.
inserted = client.insert_data(
    configurations,
    property_map=property_map,
    generator=False,
    transform=tform,
    verbose=True,
)
ids = list(inserted)

# Each element of ids is unpacked as a pair: first members go to all_co_ids,
# second members to all_pr_ids (two parallel tuples).
all_co_ids, all_pr_ids = zip(*ids)

In [None]:
# Number of distinct IDs in all_co_ids (duplicates collapse in the set).
len(set(all_co_ids))

In [None]:
# Number of distinct IDs in all_pr_ids (duplicates collapse in the set).
len(set(all_pr_ids))

In [None]:
# Maps a regular expression (matched against configuration names with a
# MongoDB $regex query) to the description of the configuration set built from
# the matching configurations. Insertion order matters: sets are created in
# this order.
#
# Patterns starting with '^' are anchored to the start of the name, presumably
# so that e.g. '^vacancy' does not also pick up 'di-vacancy'/'tri-vacancy'.
configuration_set_regexes = {
    '.*': (
        'Configurations designed to ensure machine-learning of elastic, '
        'thermal, and defect properties, as well as surface energetics, '
        'melting, and the structure of the liquid phase.'
    ),
    '^liquid':
        'Liquid configurations with densities around the experimental density',
    '^sia':
        'Configurations with single self-interstitial defects',
    '^vacancy':
        'Single-vacancy configurations',
    'A15':
        'A15 configurations with random lattice distortions',
    'bcc_distorted': (
        'BCC configurations with random strains up to +/- 30% to help train '
        'the far-from-equilibrium elastic response'
    ),
    'C15':
        'C15 configurations with random lattice distortions',
    'di-sia':
        'Configurations with two self-interstitial defects',
    'di-vacancy':
        'Divacancy configurations',
    'dia':
        'Diamond configurations with random lattice distortions',
    'dimer': (
        'Dimers to fit to the full dissociation curve starting from 1.1 '
        'angstrom'
    ),
    'fcc':
        'FCC crystals with random lattice distortions',
    'gamma_surface':
        'Configurations representing the full gamma surface',
    'hcp':
        'HCP configurations with random lattice distortions',
    # This is the vanadium dataset (elements=['V']); the description
    # previously said "W", a leftover from the tungsten notebook.
    'isolated_atom':
        'Isolated V atom',
    'phonon':
        'MD snapshots taken at 1000K for three different volumes',
    'sc':
        'Simple cubic crystals with random lattice distortions',
    'short_range': (
        'BCC crystals with random interstitial atom defects to capture '
        'short-range many-body dynamics'
    ),
    'slice_sample': (
        "Randomly distorted primitive bcc unit cells drawn from Szlachta et "
        "al.'s database"
    ),
    'surf_liquid':
        'Damaged and half-molten (110) and (100) surfaces',
    # Previously carried the self-interstitial description by mistake; now
    # follows the same wording as the other surface sets.
    'surface_100':
        '(100) surface configurations',
    'surface_110':
        '(110) surface configurations',
    'surface_111':
        '(111) surface configurations',
    'surface_112':
        '(112) surface configurations',
    'tri-vacancy':
        'Trivacancy configurations',
}

# IDs of the configuration sets created below, in creation order.
cs_ids = []

for index, pattern in enumerate(configuration_set_regexes):
    description = configuration_set_regexes[pattern]

    # Restrict the search to configurations inserted by this notebook whose
    # names match the current pattern.
    name_query = {'_id': {'$in': all_co_ids}, 'names': {'$regex': pattern}}
    matches = client.get_data(
        'configurations',
        fields='_id',
        query=name_query,
        ravel=True,
    )
    matching_ids = matches.tolist()

    # One progress line per set: index, pattern, number of matches.
    print(
        f'Configuration set {index}',
        f'({pattern}):'.rjust(22),
        f'{len(matching_ids)}'.rjust(7),
    )

    cs_ids.append(
        client.insert_configuration_set(matching_ids, description=description)
    )

In [None]:
# Register the dataset: ties the configuration sets and property IDs created
# above to a name, authors, reference links and a description.
ds_id = client.insert_dataset(
    cs_ids=cs_ids,
    pr_ids=all_pr_ids,
    name='V_PRM2019',
    authors=[
        'J. Byggmästar', 'K. Nordlund', 'F. Djurabekova',
    ],
    # Two separate links. The comma after the first entry is essential:
    # without it Python concatenates the adjacent string literals into a
    # single, invalid URL.
    links=[
        'https://journals.aps.org/prmaterials/abstract/10.1103/PhysRevMaterials.4.093802',
        'https://gitlab.com/acclab/gap-data/-/tree/master/V',
    ],
    description=(
        'This dataset was designed to ensure machine-learning '
        'of V elastic, thermal, and defect properties, as well as surface '
        'energetics, melting, and the structure of the liquid phase. The dataset '
        'was constructed by starting with the dataset from J. Byggmästar et al., '
        'Phys. Rev. B 100, 144105 (2019), then rescaling all of the '
        'configurations to the correct lattice spacing and adding in gamma '
        'surface configurations.'
    ),
    resync=True,
    verbose=True,
)
# Display the new dataset ID as the cell output.
ds_id

In [None]:
# Maps a regular expression (matched against configuration names) to the
# label, or list of labels, applied to every matching configuration.
#
# NOTE(review): none of these patterns is anchored, so they match anywhere in
# the name -- e.g. 'surface' also matches 'gamma_surface' and 'surf'-prefixed
# names are handled separately, 'vacancy' also matches 'di-vacancy' and
# 'tri-vacancy', 'sia' also matches 'di-sia'. Confirm this overlap is intended.
configuration_label_regexes = {
    'A15': ['a15', 'strain'],
    'bcc_distorted': ['bcc', 'strain'],
    'C15': ['c15', 'strain'],
    'di-sia': 'interstitial',
    'di-vacancy': ['vacancy', 'divacancy'],
    'dia': ['diamond', 'strain'],
    'dimer': ['dimer', 'warning', 'large_forces', 'repulsive'],
    'fcc': ['fcc', 'strain'],
    'gamma_surface': 'gamma_surface',
    'hcp': 'hcp',
    'isolated_atom': 'isolated_atom',
    'liquid': 'liquid',
    'phonon': 'aimd',
    'sc': ['sc', 'strain'],
    'short_range': [
        'bcc', 'interstitial', 'warning', 'large_forces', 'repulsive',
    ],
    'sia': 'interstitial',
    'slice_sample': ['bcc', 'strain'],
    'surf_liquid': 'surface',
    'surface': 'surface',
    'tri-vacancy': ['vacancy', 'divacancy', 'trivacancy'],
    'vacancy': 'vacancy',
}

# Apply each pattern's label(s) to the matching configurations of this dataset.
for pattern, pattern_labels in configuration_label_regexes.items():
    # Only configurations inserted by this notebook whose names match.
    name_query = {'_id': {'$in': all_co_ids}, 'names': {'$regex': pattern}}

    client.apply_labels(
        dataset_id=ds_id,
        collection_name='configurations',
        query=name_query,
        labels=pattern_labels,
        verbose=True,
    )

In [None]:
# Fetch the stored dataset record (with resync requested) and keep the object
# found under its 'dataset' key.
dataset_record = client.get_dataset(ds_id, resync=True, verbose=True)
dataset = dataset_record['dataset']

# Print every aggregated-info entry, one "key value" pair per line.
for key, value in dataset.aggregated_info.items():
    print(key, value)

In [None]:
# Display the 'property_fields' entry of the aggregated info; the same value
# is passed to the histogram plot in the next cell.
dataset.aggregated_info['property_fields']

In [None]:
# Histogram the dataset's property fields on a log-scaled y axis, restricted
# to this dataset's property IDs.
histogram_fields = dataset.aggregated_info['property_fields']
fig = client.plot_histograms(
    histogram_fields,
    ids=dataset.property_ids,
    yscale='log',
)