In [None]:
from colabfit.tools.database import MongoDatabase, load_data
from colabfit.tools.property_settings import PropertySettings

import numpy as np

# Handle to the 'colabfit_rebuild' database used by every later cell.
# nprocs=6 presumably sets the number of worker processes - confirm against
# the MongoDatabase documentation.
client = MongoDatabase(
    'colabfit_rebuild',
    nprocs=6,
)

In [None]:
# Dataset name; also passed to the loader as the default configuration name.
name = 'MoNbTaVW_PRB2021'

# Describe the load first, then materialise it into a list so that later
# cells can iterate over the configurations more than once.
configuration_loader = load_data(
    file_path='/colabfit/data/acclab_helsinki/tabGAP_2b+3b_MoNbTaVW/db_HEA_reduced.xyz',
    file_format='xyz',
    # presumably the per-configuration key that holds its name - TODO confirm
    name_field='config_type',
    elements=['Mo', 'Nb', 'Ta', 'V', 'W'],
    default_name=name,
    verbose=True,
)
configurations = list(configuration_loader)

In [None]:
# Calculation metadata stored under the '_settings' key of the property entry.
calculation_settings = {
    '_method': 'VASP',
    '_description': 'energy/forces/stresses',
    '_files': None,
    '_labels': ['PBE', 'GGA'],
}

# ColabFit name: {'field': ASE field name, 'units': str}
energy_forces_stress_fields = {
    'energy': {'field': 'energy', 'units': 'eV'},
    'forces': {'field': 'force', 'units': 'eV/Ang'},
    # The 'virial' field is declared in GPa because the `tform` transform
    # rescales it before the data is inserted.
    'stress': {'field': 'virial', 'units': 'GPa'},
    'per-atom': {'field': 'per-atom', 'units': None},
}

# A single 'energy-forces-stress' property definition: the field mappings
# followed by the shared calculation settings (key order is preserved).
property_map = {
    'energy-forces-stress': [
        {**energy_forces_stress_fields, '_settings': calculation_settings},
    ],
}

In [None]:
# Multiplier applied to (virial / cell volume). 1 eV/Ang^3 equals
# 160.21766208 GPa; the negative sign flips the sign convention.
VIRIAL_PER_VOLUME_TO_GPA = -160.21766208


def tform(c):
    """Prepare one configuration's ``info`` dict, in place, before insertion.

    Always sets ``info['per-atom']`` to ``False``. When a ``'virial'`` entry
    is present it is overwritten with the virial divided by the absolute
    determinant of the cell matrix (the cell volume) and multiplied by
    -160.21766208. Assumes the stored virial is in eV so that the result is
    a stress in GPa - TODO confirm against the source data.

    Parameters
    ----------
    c : object exposing ``info`` (dict) and ``cell`` (3x3 array-like)

    Returns
    -------
    None
    """
    info = c.info
    info['per-atom'] = False

    # Nothing to convert for configurations without a virial.
    if 'virial' not in info:
        return

    cell_volume = np.abs(np.linalg.det(np.array(c.cell)))
    info['virial'] = (info['virial'] / cell_volume) * VIRIAL_PER_VOLUME_TO_GPA

In [None]:
# Insert the loaded configurations and their properties, passing `tform` as
# the per-configuration transform.
# NOTE(review): `tform` rewrites info['virial'] in place; if insert_data
# applies it to the objects held in `configurations`, re-running this cell
# without reloading the data would rescale the virial a second time - confirm.
insert_results = client.insert_data(
    configurations,
    property_map=property_map,
    generator=False,
    transform=tform,
    verbose=True,
)
ids = list(insert_results)

# Transpose the inserted ID entries into one tuple of configuration IDs and
# one tuple of property IDs.
all_co_ids, all_pr_ids = zip(*ids)

In [None]:
# Number of distinct configuration IDs returned by the insert.
len(set(all_co_ids))

In [None]:
# Number of distinct property IDs returned by the insert.
len(set(all_pr_ids))

In [None]:
# Regex applied to configuration names -> description of the configuration
# set built from the matching configurations. Insertion order is the order in
# which the sets are created.
configuration_set_regexes = {
    '.*': 'A variety of Mo-Nb-Ta-V-W structures',
    '^composition': (
        'Ternary, quaternary, and quinary BCC alloys. 3 linearly spaced '
        'compositions were sampled, each with 3 different lattice constants. '
        'Atoms are randomly ordered and shifted slightly from their lattice '
        'positions.'
    ),
    '^liquid': 'Liquid configurations',
    '^sia': 'Configurations with single self-interstitial defects',
    '^vacancy': 'Single-vacancy configurations',
    'bcc_distorted': (
        'BCC configurations with random strains up to +/- 30% to help train '
        'the far-from-equilibrium elastic response'
    ),
    'binary_alloys': (
        'Binary BCC alloys sampling 10 different concentrations from '
        'A_0.05B_0.95 to A_0.95B_0.05 and 3 different lattice constants for '
        'every composition. Atoms are randomly ordered and shifted slightly '
        'from their lattice positions.'
    ),
    'di-sia': 'Configurations with two self-interstitial defects',
    'di-vacancy': 'Divacancy configurations',
    'dimer': (
        'Dimers to fit to the full dissociation curve starting from 1.1 '
        'angstrom'
    ),
    'gamma_surface': 'Configurations representing the full gamma surface',
    'hea_ints': (
        '1-5 interstitial atoms randomly inserted into HEA lattices and '
        'relaxed with a partially-trained tabGAP model'
    ),
    'hea_short_range': (
        'Randomly placed unrelaxed interstitial atom in HEAs to fit repulsion '
        'inside crystals, making sure that the closest interatomic distance '
        'is not too short for DFT to be unreliable (> 1.35 Ang)'
    ),
    'hea_small': (
        'Bulk equiatomic quinary HEAs. Atoms are randomly ordered and shifted '
        'slightly from their lattice positions. The lattice constant is '
        'randomised in the range 3-3.4 Angstrom'
    ),
    'hea_surface': (
        'Disordered HEA surfaces, including some of the damaged/molten '
        'surface configurations from an existing pure W dataset that were '
        'turned into HEAs'
    ),
    'hea_vacancies': (
        '1-5 vacancies randomly inserted into HEA lattices, then relaxed with '
        'a partially-trained tabGAP model'
    ),
    'isolated_atom': 'Isolated W atom',
    'liquid_composition': (
        'Liquid equiatomic binary, ternary, quaternary, and quinary alloys at '
        'different densities'
    ),
    'liquid_hea': 'Liquid HEA configurations',
    'mcmd': (
        'Equiatomic quinary alloys generated via active learning by running '
        'MCMD with a partially-trained tabGAP model.'
    ),
    'ordered_alloys': (
        'Ordered binary, ternary, and quaternary alloys (always as a BCC '
        'lattice, but with different crystal symmetries of the elemental '
        'sublattices)'
    ),
    'phonon': 'MD snapshots taken at 1000K for three different volumes',
    'short_range': (
        'BCC crystals with random interstitial atom defects to capture '
        'short-range many-body dynamics'
    ),
    'surf_liquid': 'Damaged and half-molten (110) and (100) surfaces',
    'tri-vacancy': 'Trivacancy configurations',
}

# Create one configuration set per regex (in dictionary order) and keep the
# returned IDs for the dataset insertion.
# NOTE(review): MongoDB `$regex` matches anywhere in the string unless
# anchored, so e.g. 'short_range' also selects names containing
# 'hea_short_range' - confirm that this overlap between sets is intended.
cs_ids = []

for i, (regex, desc) in enumerate(configuration_set_regexes.items()):
    # Only consider configurations inserted above whose names match the regex.
    name_query = {'_id': {'$in': all_co_ids}, 'names': {'$regex': regex}}
    matching_ids = client.get_data(
        'configurations',
        fields='_id',
        query=name_query,
        ravel=True,
    )
    co_ids = matching_ids.tolist()

    # Right-aligned columns: pattern in 22 characters, match count in 7.
    pattern_label = f'({regex}):'
    print(f'Configuration set {i}', f'{pattern_label:>22}', f'{len(co_ids):>7}')

    cs_id = client.insert_configuration_set(co_ids, description=desc)
    cs_ids.append(cs_id)

In [None]:
# Dataset-level metadata, collected up front so the insert call stays short.
dataset_authors = ['J. Byggmästar', 'K. Nordlund', 'F. Djurabekova']
dataset_links = [
    'https://journals.aps.org/prb/abstract/10.1103/PhysRevB.104.104101',
    'https://doi.org/10.23729/1b845398-5291-4447-b417-1345acdd2eae',
]
dataset_description = (
    'This dataset was originally designed to fit a GAP '
    'model for the Mo-Nb-Ta-V-W quinary system that was used to study '
    'segregation and defects in the body-centered-cubic refractory '
    'high-entropy alloy MoNbTaVW.'
)

# Register the dataset from the configuration sets and property IDs built in
# the earlier cells.
# NOTE(review): the name literal repeats the value of `name` assigned when the
# data was loaded; keep the two in sync.
ds_id = client.insert_dataset(
    cs_ids=cs_ids,
    pr_ids=all_pr_ids,
    name='MoNbTaVW_PRB2021',
    authors=dataset_authors,
    links=dataset_links,
    description=dataset_description,
    resync=True,
    verbose=True,
)
ds_id

In [None]:
# Regex applied to configuration names -> label (str) or labels (list of str)
# attached to every matching configuration. Patterns are deliberately left as
# written; several of them overlap (e.g. 'hea' and 'hea_ints').
configuration_label_regexes = {
    'alloys': 'bcc',
    'bcc': ['bcc', 'strain'],
    'bcc_distorted': ['bcc', 'strain'],
    'di-sia': 'interstitial',
    'di-vacancy': ['vacancy', 'divacancy'],
    'dimer': ['dimer', 'warning', 'large_forces', 'repulsive'],
    'gamma_surface': 'gamma_surface',
    'hea': 'hea',
    'hea_ints': ['hea', 'interstitial'],
    'isolated_atom': 'isolated_atom',
    'liquid': 'liquid',
    'mcmd': 'aimd',
    'phonon': 'aimd',
    'short_range': ['bcc', 'interstitial', 'warning', 'large_forces', 'repulsive'],
    'sia': 'interstitial',
    'surf_liquid': 'surface',
    'surface': 'surface',
    'tri-vacancy': ['vacancy', 'divacancy', 'trivacancy'],
    'vacancies': 'vacancy',
    'vacancy': 'vacancy',
}

# Attach each pattern's labels to the matching configurations of this dataset.
for regex, labels in configuration_label_regexes.items():
    # Same selection as for the configuration sets: inserted configurations
    # whose names match the regex.
    label_query = {'_id': {'$in': all_co_ids}, 'names': {'$regex': regex}}
    client.apply_labels(
        dataset_id=ds_id,
        collection_name='configurations',
        query=label_query,
        labels=labels,
        verbose=True,
    )

In [None]:
# Fetch the dataset record with resync=True and keep its 'dataset' entry.
dataset_record = client.get_dataset(ds_id, resync=True, verbose=True)
dataset = dataset_record['dataset']

# Print every aggregated field next to its value.
for field_name, field_value in dataset.aggregated_info.items():
    print(field_name, field_value)

In [None]:
# Show the aggregated 'property_fields' entry; the same value is passed to
# the histogram plot below.
dataset.aggregated_info['property_fields']

In [None]:
# Histogram each aggregated property field over the dataset's properties,
# with a logarithmic y-axis and the matplotlib backend.
histogram_fields = dataset.aggregated_info['property_fields']
fig = client.plot_histograms(
    histogram_fields,
    ids=dataset.property_ids,
    yscale='log',
    method='matplotlib',
)