In [None]:
from colabfit.tools.database import MongoDatabase, load_data
from colabfit.tools.property_settings import PropertySettings

# Connection settings for the database handle shared by every cell below.
# NOTE(review): nprocs is presumably the worker-process count -- confirm
# against the colabfit documentation.
DATABASE_NAME = 'colabfit_rebuild'
N_PROCS = 6

client = MongoDatabase(DATABASE_NAME, nprocs=N_PROCS)

In [None]:
import os
import json
import numpy as np
from ase import Atoms

def reader(file_path, **kwargs):
    """Parse one JSON data file into a single-element list of ASE ``Atoms``.

    The file is expected to contain, after an optional run of header lines,
    a JSON document whose first frame lives at ``['Dataset']['Data'][0]`` and
    provides ``AtomTypes``, ``Positions``, ``Lattice``, ``Energy``, ``Forces``
    and ``Stress``.

    Args:
        file_path: Path to the JSON file (``str`` or path-like).
        **kwargs: Only ``header_lines`` is read: the number of leading lines
            to discard before JSON parsing. Defaults to 0 when omitted.

    Returns:
        A list holding one ``Atoms`` object with ``info['name']``,
        ``info['energy']``, ``info['stress']``, ``info['per-atom']`` and
        ``arrays['forces']`` populated.

    Raises:
        KeyError: If the JSON document lacks one of the expected keys.
        json.JSONDecodeError: If the remaining text is not valid JSON.
    """
    header_lines = kwargs.get('header_lines', 0)

    with open(file_path) as f:
        # readlines() keeps each line's own newline, so join without adding more.
        data = json.loads(''.join(f.readlines()[header_lines:]))

    # Only the first frame of the file is used.
    frame = data['Dataset']['Data'][0]

    symbols = frame['AtomTypes']
    positions = np.array(frame['Positions'])
    box = np.array(frame['Lattice'])

    # Name = path relative to the first 'JSON' component, without extension
    # (e.g. '.../JSON/Bulk/x.json' -> 'Bulk/x'). Fall back to the bare file
    # stem when the path has no 'JSON' component instead of raising IndexError.
    path_str = str(file_path)
    _, marker, tail = path_str.partition('JSON')
    relative = tail[1:] if marker else os.path.basename(path_str)
    at_name = os.path.splitext(relative)[0]

    try:
        atoms = Atoms(symbols, positions=positions, cell=box, pbc=[1, 1, 1])
    except Exception as e:
        # NOTE(review): the retry assumes the failure is caused by one extra
        # leading entry in AtomTypes -- confirm against the offending files.
        print("Error on :", at_name, e, set(symbols))
        symbols = symbols[1:]
        atoms = Atoms(symbols, positions=positions, cell=box, pbc=[1, 1, 1])

    atoms.info['name'] = at_name
    atoms.info['energy'] = frame['Energy']
    atoms.arrays['forces'] = np.array(frame['Forces'])
    atoms.info['stress'] = np.array(frame['Stress'])

    atoms.info['per-atom'] = False
    # atoms.info['reference-energy'] = 3.48

    return [atoms]

In [None]:
name = 'InP_JPCA2020'

# Options for load_data, gathered in one place so the call itself stays short.
# header_lines is the option that `reader` looks up in its **kwargs;
# presumably load_data forwards unrecognised keywords to the reader -- confirm.
load_data_options = dict(
    file_path='/colabfit/data/FitSNAP/examples/InP_JPCA2020/JSON',
    file_format='folder',
    name_field='name',
    elements=['In', 'P'],
    default_name=name,
    reader=reader,
    glob_string='*.json',
    # verbose=True,
    header_lines=2,
)

# Materialise the loader's output as a list.
configurations = list(load_data(**load_data_options))

In [None]:
# Maps ColabFit property fields onto the keys that `reader` stores on each
# Atoms object (atoms.info / atoms.arrays).
property_map = {
    'energy-forces-stress': [{
        # ColabFit name: {'field': ASE field name, 'units': str}
        'energy': {'field': 'energy', 'units': 'eV'},
        'forces': {'field': 'forces', 'units': 'eV/Ang'},
        # The reader writes the stress tensor to atoms.info['stress']; no
        # 'virial' key is ever set, so the field must be 'stress'.
        'stress': {'field': 'stress', 'units': 'kilobar'},
        'per-atom': {'field': 'per-atom', 'units': None},
        # NOTE(review): the reader leaves 'reference-energy' unset (its
        # assignment is commented out) -- confirm this entry is optional.
        'reference-energy': {'field': 'reference-energy', 'units': 'eV'},

        '_settings': {
            '_method': 'VASP',
            '_description': 'energies/forces/stresses',
            '_files': None,
            '_labels': ['PBE', 'LDA']
        }
    }]
}

In [None]:
# Insert the loaded configurations together with their mapped properties.
insert_options = dict(property_map=property_map, generator=False, verbose=True)
ids = list(client.insert_data(configurations, **insert_options))

# Transpose the inserted pairs into two parallel tuples: the first elements
# (configuration IDs) and the second elements (property IDs).
all_co_ids, all_pr_ids = zip(*ids)

In [None]:
# Number of distinct configuration IDs among the inserted pairs.
len(set(all_co_ids))

In [None]:
# Number of distinct property IDs among the inserted pairs.
len(set(all_pr_ids))

In [None]:
# Regex on configuration name -> description of the configuration set built
# from the matching configurations. Insertion order is preserved.
configuration_set_regexes = {
    # Catch-all set covering every configuration.
    '.*': (
        'Curated configurations for producing an interatomic potential for '
        'indium phosphide capable of capturing high-energy defects that '
        'result from radiation damage cascades'
    ),
    # Bulk and deformed-bulk groups.
    '^Bulk': 'Ground state configuration for bulk zinc blende',
    '^EOS': 'Bulk zinc blende with uniform expansion and compression',
    '^Shear': 'Bulk zincblende with random cell shape modifications',
    '^Strain': 'Uniaxially strained bulk zinc blende',
    # Point-defect groups.
    '^a(In|P)': 'Antisite defects in InP',
    '^aa': 'Diantisite defects',
    '^i(In|P)': 'Interstitial defects in InP',
    '^vP': 'Vacancy defects in InP',
    '^vv': 'Divacancy defects in InP',
    # 's_'-prefixed groups; no description is recorded for these.
    '^s_a(In|P|a)': 'No description',
    '^s_i(In|P)': 'No description',
    '^s_v(In|P)': 'No description',
    '^s_vv': 'No description',
}

cs_ids = []

# Build one configuration set per regex, restricted to the configurations
# inserted by this notebook, and collect the resulting set IDs in order.
for i, (regex, desc) in enumerate(configuration_set_regexes.items()):
    name_query = {'_id': {'$in': all_co_ids}, 'names': {'$regex': regex}}
    matches = client.get_data(
        'configurations', fields='_id', query=name_query, ravel=True
    )
    co_ids = matches.tolist()

    # Progress line: set index, regex, number of matching configurations.
    print(f'Configuration set {i}', f'({regex}):'.rjust(22), f'{len(co_ids)}'.rjust(7))

    cs_id = client.insert_configuration_set(
        co_ids,
        description=desc,
        verbose=True,
    )
    cs_ids.append(cs_id)

In [None]:
# Register the dataset: ties the configuration sets and property IDs created
# above to the publication metadata.
ds_id = client.insert_dataset(
    cs_ids=cs_ids,
    pr_ids=all_pr_ids,
    name='InP_JPCA2020',
    authors=[
        'M. A. Cusentino', 'M. A. Wood', 'A. P. Thompson'
    ],
    links=[
        'https://pubs.acs.org/doi/10.1021/acs.jpca.0c02450',
        'https://github.com/FitSNAP/FitSNAP/tree/master/examples/InP_JPCA2020',
    ],
    # Journal corrected to J. Phys. Chem. A, matching the DOI in `links`
    # (acs.jpca) and the dataset name; the text previously cited J. Chem. Phys.
    description=(
        'This data set was used to generate a multi-element '
        'linear SNAP potential for InP as published in Cusentino, M.A. et al., '
        'J. Phys. Chem. A (2020). Intended to produce an interatomic potential for '
        'indium phosphide capable of capturing high-energy defects that result '
        'from radiation damage cascades.'
    ),
    resync=True,
    verbose=True,
)
# Display the new dataset ID as the cell output.
ds_id

In [None]:
# Regex on configuration name -> label applied to every matching
# configuration. A name may match several patterns and so receive several
# labels (e.g. 'EOS' matches both of the first two patterns).
configuration_label_regexes = {
    'Bulk|EOS|Shear|Strain': 'zincblende',
    'EOS': 'eos',
    'Shear|Strain': 'strain',
    '^a(In|P)': 'antisite',
    '^aa': 'diantisite',
    '^i(In|P)': 'interstitial',
    '^v(In|P|v)': 'vacancy',
}

# Apply each label to the inserted configurations whose name matches its regex.
for regex, labels in configuration_label_regexes.items():
    label_query = {'_id': {'$in': all_co_ids}, 'names': {'$regex': regex}}
    client.apply_labels(
        dataset_id=ds_id,
        collection_name='configurations',
        query=label_query,
        labels=labels,
        verbose=True,
    )

In [None]:
# Fetch the stored dataset record (with resync requested) and keep its
# 'dataset' entry.
dataset_record = client.get_dataset(ds_id, resync=True, verbose=True)
dataset = dataset_record['dataset']

# Print every aggregated-info entry as "key value", one per line.
for key, value in dataset.aggregated_info.items():
    print(key, value)

In [None]:
# Show the 'property_fields' entry of the aggregated info; the same value is
# passed to the histogram plot in the next cell.
dataset.aggregated_info['property_fields']

In [None]:
# Histogram the dataset's property fields on a log-scaled y axis using the
# matplotlib backend; binding the result to `fig` keeps it out of the cell output.
fig = client.plot_histograms(
    dataset.aggregated_info['property_fields'],
    ids=dataset.property_ids,
    yscale='log',
    method='matplotlib',
)