In [None]:
from colabfit.tools.database import MongoDatabase, load_data
from colabfit.tools.property_settings import PropertySettings

# Handle on the 'colabfit_rebuild' database used by every cell below.
# nprocs=6 is handed straight to MongoDatabase (presumably the number of
# worker processes -- confirm against the colabfit documentation).
client = MongoDatabase(
    'colabfit_rebuild',
    nprocs=6,
)

In [None]:
import os
import json
import numpy as np
from ase import Atoms

def reader(file_path, **kwargs):
    """Parse one JSON data file and yield its content as a single ASE ``Atoms``.

    The file must contain a JSON document shaped like
    ``{'Dataset': {'Data': [{...}, ...]}}``, optionally preceded by a number
    of non-JSON header lines. Only the first entry of ``Data`` is read.

    Args:
        file_path: Path of the file to read. Its string form must contain
            the substring ``'JSON'``; the text after it (minus one leading
            separator character and the file extension) becomes the
            configuration name.
        **kwargs: ``header_lines`` (int, optional, default 0) is the number
            of leading lines to skip before the JSON payload starts. Other
            keywords are ignored.

    Yields:
        ase.Atoms: one fully periodic structure with ``info['name']``,
        ``info['energy']``, ``info['stress']``, ``info['per-atom']``
        (always ``False``) and ``arrays['forces']`` populated.

    Raises:
        json.JSONDecodeError: if the payload after the header is not JSON.
        KeyError: if an expected key is missing from the document.
        IndexError: if ``'JSON'`` does not occur in ``file_path``.
        Exception: whatever ``Atoms`` raises for data it cannot accept.
    """
    with open(file_path) as f:
        # Skip the header line by line, then let json parse the rest of the
        # stream directly (no need to re-join lines that already end in '\n').
        for _ in range(kwargs.get('header_lines', 0)):
            f.readline()
        data = json.load(f)

    # Every quantity of interest lives in the first 'Data' entry.
    frame = data['Dataset']['Data'][0]

    symbols = frame['AtomTypes']
    positions = np.array(frame['Positions'])
    box = np.array(frame['Lattice'])

    # Configuration name = path relative to the 'JSON' directory, without the
    # extension. [1:] drops the path separator that follows 'JSON'.
    at_name = os.path.splitext(str(file_path).split('JSON')[1][1:])[0]

    try:
        atoms = Atoms(symbols, positions=positions, cell=box, pbc=[1, 1, 1])
    except Exception:
        # Some files apparently list one extra leading entry in 'AtomTypes'.
        # Dropping it can only help when there is exactly one symbol too
        # many; in every other case re-raise the original, more informative
        # error instead of masking it with a length-mismatch failure.
        if len(symbols) != len(positions) + 1:
            raise
        atoms = Atoms(
            symbols[1:], positions=positions, cell=box, pbc=[1, 1, 1]
        )

    atoms.info['name'] = at_name
    atoms.info['energy'] = frame['Energy']
    atoms.arrays['forces'] = np.array(frame['Forces'])
    atoms.info['stress'] = np.array(frame['Stress'])

    # Constant flag stored next to the data; it is referenced by name as the
    # 'per-atom' field of the property map.
    atoms.info['per-atom'] = False

    yield atoms

In [None]:
# Dataset name; also used as the fallback configuration name below.
name = 'WBe_PRB2019'

# Keyword arguments for load_data, collected in one place. 'header_lines' is
# the keyword that `reader` looks up in its **kwargs (presumably load_data
# forwards unrecognised keywords to the reader -- confirm).
load_options = dict(
    file_path='/colabfit/data/FitSNAP/examples/WBe_PRB2019/JSON',
    file_format='folder',
    name_field='name',
    elements=['W', 'Be'],
    default_name=name,
    reader=reader,
    glob_string='*.json',
    verbose=True,
    header_lines=1,
)

# Exhaust whatever load_data returns so the configurations are held in a list.
configurations = list(load_data(**load_options))

In [None]:
# Settings block attached to the 'energy-forces-stress' property definition.
efs_settings = {
    '_method': 'VASP',
    '_description': 'energy/forces/stresses',
    '_files': None,
    '_labels': ['PBE', 'GGA'],
}

# (ColabFit name, units) pairs. Each ColabFit name is read from the ASE field
# of the same name.
# NOTE(review): the units are hard-coded here rather than taken from the data
# files -- confirm they match the source data.
efs_fields = (
    ('energy', 'eV'),
    ('forces', 'eV/Ang'),
    ('stress', 'GPa'),
    ('per-atom', None),
)

# ColabFit name -> {'field': ASE field name, 'units': str}
efs_definition = {
    colabfit_name: {'field': colabfit_name, 'units': units}
    for colabfit_name, units in efs_fields
}
efs_definition['_settings'] = efs_settings

property_map = {'energy-forces-stress': [efs_definition]}

In [None]:
# Insert the loaded configurations along with the properties described by
# property_map, keeping everything insert_data returns.
ids = list(
    client.insert_data(
        configurations,
        property_map=property_map,
        generator=False,
        verbose=True,
    )
)

# Each entry of `ids` is treated as a (configuration ID, property ID) pair;
# transpose the pairs into two parallel tuples.
all_co_ids, all_pr_ids = zip(*ids)

In [None]:
# Number of distinct configuration IDs among the inserted pairs.
len(set(all_co_ids))

In [None]:
# Number of distinct property IDs among the inserted pairs.
len(set(all_pr_ids))

Note: in this dataset, some configurations are referenced by more than one property, and those properties have different values.

In [None]:
# Count the inserted configurations whose 'relationships.properties' array has
# an element at index 1, i.e. configurations linked to at least two properties.
client.configurations.count_documents({'_id': {'$in': all_co_ids}, 'relationships.properties.1': {'$exists': True}})

In [None]:
# Select inserted configurations that have a second entry (index 1) in
# 'relationships.properties'.
multi_property_query = {
    '_id': {'$in': all_co_ids},
    'relationships.properties.1': {'$exists': True},
}

# Fetch one such configuration.
# NOTE(review): the second argument is a one-element *set* literal, not a
# dict; presumably it is accepted as a list of fields to project -- confirm.
example_configuration = client.configurations.find_one(
    multi_property_query,
    {'relationships.properties'},
)

# IDs of the properties linked to that configuration.
duplicate_pr_ids = example_configuration['relationships']['properties']
duplicate_pr_ids

In [None]:
# Load the first two properties linked to the example configuration.
pr0, pr1 = [
    client.properties.find_one({'_id': pr_id})
    for pr_id in (duplicate_pr_ids[0], duplicate_pr_ids[1])
]

In [None]:
# Energy 'source-value' stored on the first of the two properties.
pr0['energy-forces-stress']['energy']['source-value']

In [None]:
# Energy 'source-value' stored on the second property, for comparison with pr0.
pr1['energy-forces-stress']['energy']['source-value']

In [None]:
# Regex on configuration names -> description of the configuration set built
# from the matching configurations. Insertion order fixes the set numbering
# printed below.
# NOTE(review): '^gamma_surface/' and 'gamma_surface_vacancy' carry the same
# description -- confirm that this is intended.
configuration_set_regexes = {
    '.*': 'W-Be configurations for studying plasma material interactions in fusion reactors',
    '(001|010|100)FreeSurf': 'Be [001], [010], and [100] surfaces',
    'DFTMD_1000K': 'AIMD sampling of Be at 1000K',
    'DFTMD_300K': 'AIMD sampling of Be at 300K',
    'DFT_MD_1000K': 'AIMD sampling of W-Be at 1000K',
    'DFT_MD_300K': 'AIMD sampling of W-Be at 300K',
    'Divacancy': 'divacancy defects in pure W',
    '^EOS': 'Energy vs. volume EOS configurations for W-Be',
    'EOS_(BCC|FCC|HCP)': 'Energy vs. volume EOS configurations for Be in various crystal structures',
    'EOS_Data': 'Energy vs. volume configurations for W',
    'Elast_(BCC|FCC|HCP)_(Shear|Vol)': 'BCC, FCC, and HCP Be with shear or volumetric strains',
    'ElasticDeform_(Shear|Vol)': 'W-Be in various crystal structures with shear and volumetric strains',
    'Liquids': 'Liquid Be',
    'StackFaults': 'Be stacking faults',
    'WSurface_BeAdhesion': 'Be adhesion onto W surfaces',
    'dislocation_quadrupole': 'W dislocation quadrupole configurations',
    '^gamma_surface/': 'W gamma surface configurations',
    'gamma_surface_vacancy': 'W gamma surface configurations',
    'md_bulk': 'AIMD sampling of bulk W',
    '^surface': 'pure W surfaces',
    '^vacancy': 'bulk W with vacancy defects',
}

# IDs of the configuration sets, in the same order as the regexes above.
cs_ids = []

for cs_index, (name_pattern, description) in enumerate(
    configuration_set_regexes.items()
):
    # Restrict the search to the configurations inserted by this notebook.
    name_query = {
        '_id': {'$in': all_co_ids},
        'names': {'$regex': name_pattern},
    }
    matched = client.get_data(
        'configurations',
        fields='_id',
        query=name_query,
        ravel=True,
    )
    matched_co_ids = matched.tolist()

    # One aligned summary row per set: index, pattern, number of matches.
    index_col = f'Configuration set {cs_index}'
    pattern_col = f'({name_pattern}):'.rjust(35)
    count_col = f'{len(matched_co_ids)}'.rjust(7)
    print(index_col, pattern_col, count_col)

    cs_ids.append(
        client.insert_configuration_set(matched_co_ids, description=description)
    )

In [None]:
# Bibliographic metadata stored with the dataset.
dataset_authors = [
    'M. A. Wood',
    'M. A. Cusentino',
    'B. D. Wirth',
    'A. P. Thompson',
]
dataset_links = [
    'https://journals.aps.org/prb/abstract/10.1103/PhysRevB.99.184305',
    'https://github.com/FitSNAP/FitSNAP/tree/master/examples/WBe_PRB2019',
]
# Adjacent string literals are concatenated inside the parentheses.
dataset_description = (
    'This data set was originally used to generate a '
    'multi-component linear SNAP potential for tungsten and beryllium as '
    'published in Wood, M. A., et. al. Phys. Rev. B 99 (2019) 184305. This '
    'data set was developed for the purpose of studying plasma material '
    'interactions in fusion reactors.'
)

# Register the dataset from the configuration sets and properties created
# in the earlier cells.
ds_id = client.insert_dataset(
    cs_ids=cs_ids,
    pr_ids=all_pr_ids,
    name='WBe_PRB2019',
    authors=dataset_authors,
    links=dataset_links,
    description=dataset_description,
    resync=True,
    verbose=True,
)
ds_id

In [None]:
# Regex on configuration names -> label applied to the matching
# configurations. The patterns are unanchored, so e.g. 'surface' also matches
# names containing 'gamma_surface'.
configuration_label_regexes = {
    'FreeSurf': 'surface',
    'surface': 'surface',
    'BCC': 'bcc',
    'DFT(_)?MD': 'aimd',
    'Divacancy': 'divacancy',
    'EOS': 'eos',
    'Elast': 'elastic',
    'Liquids': 'liquid',
    'StackFaults': 'stacking_fault',
    'dislocation': 'dislocation',
    'gamma_surface': 'gamma_surface',
    'md_bulk': 'aimd',
}

for name_pattern, label in configuration_label_regexes.items():
    # Only configurations inserted by this notebook are candidates.
    label_query = {
        '_id': {'$in': all_co_ids},
        'names': {'$regex': name_pattern},
    }
    client.apply_labels(
        dataset_id=ds_id,
        collection_name='configurations',
        query=label_query,
        labels=label,
        verbose=True,
    )

In [None]:
# Read the dataset back (with resync requested) and keep the object stored
# under the 'dataset' key of the result.
fetched = client.get_dataset(ds_id, resync=True, verbose=True)
dataset = fetched['dataset']

# Print the aggregated information, one "key value" line per entry.
for info_key, info_value in dataset.aggregated_info.items():
    print(info_key, info_value)

In [None]:
# Show the 'property_fields' entry of the aggregated info; the same value is
# passed to plot_histograms in the next cell.
dataset.aggregated_info['property_fields']

In [None]:
# Histograms of the dataset's property fields over its properties, requested
# with yscale='log'.
fig = client.plot_histograms(
    dataset.aggregated_info['property_fields'],
    ids=dataset.property_ids,
    yscale='log',
)