In [None]:
from colabfit.tools.database import MongoDatabase, load_data
from colabfit.tools.property_settings import PropertySettings

# Shared ColabFit database handle used by every later cell; it is created for
# the 'colabfit_rebuild' database with nprocs=1.
client = MongoDatabase('colabfit_rebuild', nprocs=1)

In [None]:
import os
import json
import numpy as np
from ase import Atoms

def reader(file_path, **kwargs):
    """Parse one FitSNAP-style JSON file and yield a single ``ase.Atoms``.

    Only the first entry of ``data['Dataset']['Data']`` is read.

    Args:
        file_path: Path of the JSON file. It must contain the substring
            ``'JSON'``; the configuration name is the part of the path that
            follows it, minus one leading character (the path separator) and
            the file extension.
        **kwargs: ``header_lines`` (int, default 0) is the number of leading
            lines dropped before the remainder is parsed as JSON. Any other
            keyword is ignored.

    Yields:
        One ``Atoms`` object built with ``pbc=[1, 1, 1]`` and carrying
        ``info['name']``, ``info['energy']``, ``info['stress']``,
        ``info['per-atom']`` (always ``False``) and ``arrays['forces']``.

    Raises:
        IndexError: if ``'JSON'`` does not occur in ``file_path`` or the
            ``Data`` list is empty.
        KeyError: if an expected field is missing from the record.
        json.JSONDecodeError: if the text after the header is not valid JSON.
    """
    # Defaulting to 0 lets the reader handle header-less files without forcing
    # every caller to pass ``header_lines`` explicitly.
    header_lines = kwargs.get('header_lines', 0)

    with open(file_path) as f:
        # readlines() keeps each line terminator, so plain concatenation
        # reproduces the file body after the header exactly.
        data = json.loads(''.join(f.readlines()[header_lines:]))

    record = data['Dataset']['Data'][0]

    symbols = record['AtomTypes']
    positions = np.array(record['Positions'])
    box = np.array(record['Lattice'])

    # Name = path relative to the 'JSON' directory, without the extension.
    at_name = os.path.splitext(str(file_path).split('JSON')[1][1:])[0]

    try:
        atoms = Atoms(symbols, positions=positions, cell=box, pbc=[1, 1, 1])
    except Exception as e:
        # Best-effort recovery: report the offending file, drop the first
        # symbol and retry once. Presumably some files list one extra leading
        # AtomTypes entry -- TODO confirm against the raw data. A failure of
        # the retry propagates to the caller.
        print("Error on :", at_name, e, set(symbols))
        symbols = symbols[1:]
        atoms = Atoms(symbols, positions=positions, cell=box, pbc=[1, 1, 1])

    atoms.info['name'] = at_name
    atoms.info['energy'] = record['Energy']
    atoms.arrays['forces'] = np.array(record['Forces'])
    atoms.info['stress'] = np.array(record['Stress'])

    # Constant flag exposed so a property map can reference a 'per-atom' field.
    atoms.info['per-atom'] = False

    yield atoms

In [None]:
name = 'Ta_Linear_JCP2015'

# Run the custom `reader` over the '*.json' files of the FitSNAP example
# folder and keep everything load_data produces in a list.
configurations = list(
    load_data(
        # where to look and which files to pick up
        file_path='/colabfit/data/FitSNAP/examples/Ta_Linear_JCP2014/JSON',
        file_format='folder',
        glob_string='*.json',
        # how each file is parsed
        reader=reader,
        header_lines=1,
        elements=['Ta'],
        # how configurations are named
        name_field='name',
        default_name=name,
    )
)

In [None]:
# Maps each ColabFit property key to the ASE field (a key of atoms.info or
# atoms.arrays as filled in by `reader`) that holds its value, plus its units.
property_map = {
    'energy-forces-stress': [{
        # ColabFit name: {'field': ASE field name, 'units': str}
        'energy': {'field': 'energy', 'units': 'eV'},
        'forces': {'field': 'forces', 'units': 'eV/Ang'},
        # `reader` stores the stress tensor under atoms.info['stress'] and
        # never writes a 'virial' field, so the map has to point at 'stress'.
        'stress': {'field': 'stress', 'units': 'bar'},
        'per-atom': {'field': 'per-atom', 'units': None},

        # Settings attached to every property created through this map.
        '_settings': {
            '_method': 'VASP',
            '_description': 'energy/forces/stresses',
            '_files': None,
            '_labels': ['PBE', 'GGA']
        }
    }]
}

In [None]:
# Insert every configuration together with the properties described by
# `property_map`, and keep the returned entries in a list.
ids = list(
    client.insert_data(
        configurations,
        property_map=property_map,
        generator=False,
        verbose=True,
    )
)

# Each entry is unpacked as a (configuration ID, property ID) pair; transposing
# them gives two parallel tuples.
all_co_ids, all_pr_ids = zip(*ids)

In [None]:
# Number of distinct configuration IDs among the inserted pairs.
len(set(all_co_ids))

In [None]:
# Number of distinct property IDs among the inserted pairs.
len(set(all_pr_ids))

In [None]:
# Each key is a regular expression that the query below applies to the `names`
# field of a configuration; each value is the description stored with the
# configuration set built from the matches.
# '.*' matches any name, so the first set is expected to hold every
# configuration inserted above.
configuration_set_regexes = {
    '.*':
        'Solid and liquid tantalum',
    'Displaced_A15':
        'A15 configurations with random displacements on the atomic positions',
    'Displaced_BCC':
        'BCC configurations with random displacements on the atomic positions',
    'Displaced_FCC':
        'FCC configurations with random displacements on the atomic positions',
    'Elastic_BCC':
        'BCC primitive cells with random strains',
    'Elastic_FCC':
        'FCC primitive cells with random strains',
    'GSF_(110|112)':
        'Relaxed and unrelaxed generalized stacking faults along the [110] '\
        'and [112] crystallographic directions',
    'Liquid':
        'High-temperature AIMD sampling of molten tantalum',
    'Surface':
        'Relaxed and unrelaxed [100], [110], [111], and [112] BCC surfaces',
    'Volume_A15':
        'A15 primitive cells, compressed or expanded isotropically over a '\
        'wide range of densities',
    'Volume_BCC':
        'BCC primitive cells, compressed or expanded isotropically over a '\
        'wide range of densities',
    'Volume_FCC':
        'FCC primitive cells, compressed or expanded isotropically over a '\
        'wide range of densities',
}

# Configuration-set IDs, in the same order as the entries above (dicts keep
# insertion order); passed to insert_dataset in a later cell.
cs_ids = []

for i, (regex, desc) in enumerate(configuration_set_regexes.items()):
    # IDs of the configurations inserted above whose names match this pattern.
    # .tolist() turns the result into a plain list (presumably from a NumPy
    # array -- confirm against get_data).
    co_ids = client.get_data(
        'configurations',
        fields='_id',
        query={'_id': {'$in': all_co_ids}, 'names': {'$regex': regex}},
        ravel=True
    ).tolist()

    # One aligned summary row per set: index, pattern, number of matches.
    print(f'Configuration set {i}', f'({regex}):'.rjust(22), f'{len(co_ids)}'.rjust(7))

    cs_id = client.insert_configuration_set(co_ids, description=desc)

    cs_ids.append(cs_id)

In [None]:
# Register the dataset from the configuration sets and property IDs collected
# above, along with its bibliographic metadata.
ds_id = client.insert_dataset(
    cs_ids=cs_ids,
    pr_ids=all_pr_ids,
    name='Ta_Linear_JCP2015',
    authors=[
        'A. P. Thompson',
        'L. P. Swiler',
        'C. R. Trott',
        'S. M. Foiles',
        'G. J. Tucker',
    ],
    links=[
        'https://www.sciencedirect.com/science/article/pii/S0021999114008353',
        'https://github.com/FitSNAP/FitSNAP/tree/master/examples/Ta_Linear_JCP2014',
    ],
    description=(
        'This data set was originally used to generate a '
        'linear SNAP potential for solid and liquid tantalum as published in '
        'Thompson, A.P. et. al, J. Comp. Phys. 285 (2015) 316-330.'
    ),
    resync=True,
    verbose=True,
)

# Show the new dataset ID as the cell output.
ds_id

In [None]:
# Pattern applied to the configuration `names` field -> label attached to the
# configurations that match it.
configuration_label_regexes = {
    'A15': 'a15',
    'BCC': 'bcc',
    'FCC': 'fcc',
    'GSF': 'stacking_fault',
    'Liquid': 'liquid',
    'Surface': 'surface',
}

for regex, labels in configuration_label_regexes.items():
    # The query is limited to the configurations inserted by this notebook.
    client.apply_labels(
        dataset_id=ds_id,
        collection_name='configurations',
        query={'_id': {'$in': all_co_ids}, 'names': {'$regex': regex}},
        labels=labels,
        verbose=True,
    )

In [None]:
# Fetch the dataset by ID (resync=True) and take the 'dataset' entry of the
# returned mapping.
dataset = client.get_dataset(ds_id, resync=True, verbose=True)['dataset']

# Print every entry of the dataset's aggregated_info as "key value".
for k,v in dataset.aggregated_info.items():
    print(k,v)

In [None]:
# Display the aggregated 'property_fields' entry; the next cell passes the same
# value to plot_histograms.
dataset.aggregated_info['property_fields']

In [None]:
# Plot histograms for the dataset's aggregated property fields, restricted to
# this dataset's property IDs and requesting yscale='log'.
fig = client.plot_histograms(
    dataset.aggregated_info['property_fields'],
    ids=dataset.property_ids,
    yscale='log',
)