In [None]:
from colabfit.tools.database import MongoDatabase, load_data
from colabfit.tools.property_settings import PropertySettings

# Connection settings for the ColabFit database client.
# NOTE(review): the first positional argument is presumably the MongoDB
# database name -- confirm against the MongoDatabase signature.
DATABASE_NAME = 'colabfit_rebuild'
N_PROCS = 6

client = MongoDatabase(DATABASE_NAME, nprocs=N_PROCS)

In [None]:
# Dataset name; also passed to the loader as `default_name`.
name = 'TiZrHfTa_APS2021'

# Arguments handed to `load_data`. The input is a single `.cfg` file and the
# chemical species are listed explicitly via `elements`.
load_kwargs = dict(
    file_path='/colabfit/data/gubaev/TiZrHfTa_APS2021/train.cfg',
    file_format='cfg',
    name_field=None,
    elements=['Ti', 'Zr', 'Hf', 'Ta'],
    default_name=name,
    verbose=True,
)

# Materialize everything the loader yields so it can be passed to the insert.
configurations = list(load_data(**load_kwargs))

In [None]:
# Settings block attached to the 'energy-forces-stress' property definition.
efs_settings = {
    '_method': 'VASP',
    '_description': 'energy/forces/stresses',
    '_files': None,
    '_labels': ['PBE', 'GGA'],
}

# Maps each ColabFit property field onto the field it is read from.
property_map = {
    'energy-forces-stress': [
        {
            # ColabFit name: {'field': ASE field name, 'units': str}
            'energy': {'field': 'energy', 'units': 'eV'},
            'forces': {'field': 'forces', 'units': 'eV/Ang'},
            # NOTE(review): 'stress' is read from the 'virial' field and
            # declared in GPa -- confirm the units stored in the source file.
            'stress': {'field': 'virial', 'units': 'GPa'},
            # Filled in per configuration by `tform` before insertion.
            'per-atom': {'field': 'per-atom', 'units': None},

            '_settings': efs_settings,
        },
    ],
}

In [None]:
def tform(c):
    """Flag a configuration's data as not per-atom.

    Sets ``c.info['per-atom']`` to ``False`` in place; this is the field the
    property map reads for its 'per-atom' entry. Returns ``None``.
    """
    info = c.info
    info['per-atom'] = False

In [None]:
# Insert every loaded configuration; `tform` is supplied as the per-item
# transform and `property_map` describes which properties to extract.
inserted = client.insert_data(
    configurations,
    property_map=property_map,
    generator=False,
    transform=tform,
    verbose=True
)
ids = list(inserted)

# Each entry of `ids` is a pair; transpose into two parallel tuples
# (configuration IDs, property IDs).
all_co_ids, all_pr_ids = zip(*ids)

In [None]:
# Number of distinct configuration IDs among the inserted pairs.
len(set(all_co_ids))

In [None]:
# Number of distinct property IDs among the inserted pairs.
len(set(all_pr_ids))

Note: this dataset contains many duplicate configurations.

In [None]:
# Count inserted configurations whose `names` array has a second element,
# i.e. configurations stored under more than one name.
multi_name_query = {
    '_id': {'$in': all_co_ids},
    'names.1': {'$exists': True},
}
client.configurations.count_documents(multi_name_query)

It also has two properties that point to the same configuration but differ in a single force component by 1e-6.

In [None]:
# Count inserted configurations linked to at least two properties
# (`relationships.properties` has a second element).
multi_property_query = {
    '_id': {'$in': all_co_ids},
    'relationships.properties.1': {'$exists': True},
}
client.configurations.count_documents(multi_property_query)

In [None]:
# Fetch one configuration that is linked to two or more properties and keep
# only its list of linked property IDs.
co_with_duplicates = client.configurations.find_one(
    {'_id': {'$in': all_co_ids}, 'relationships.properties.1': {'$exists': True}},
    {'relationships.properties'}
)
duplicate_pr_ids = co_with_duplicates['relationships']['properties']
duplicate_pr_ids

In [None]:
# Load the first two property documents linked to that configuration.
first_pr_id, second_pr_id = duplicate_pr_ids[0], duplicate_pr_ids[1]
pr0 = client.properties.find_one({'_id': first_pr_id})
pr1 = client.properties.find_one({'_id': second_pr_id})

In [None]:
import numpy as np

# Forces stored on each of the two property documents.
f1 = np.asarray(pr0['energy-forces-stress']['forces']['source-value'])
f2 = np.asarray(pr1['energy-forces-stress']['forces']['source-value'])

# Element-wise absolute difference; print the largest deviation.
diff = np.absolute(np.subtract(f1, f2))
print(diff.max())

# Indices of the components that differ at all.
np.nonzero(diff > 0)

In [None]:
# Maps a regex over configuration names to the description of the
# configuration set built from the matching configurations.
configuration_set_regexes = {
    '.*':
        'Configurations for the TiZrHfTa_x system with Ta concentrations '
        'between 0 and 33 percent generated via active learning.'
}

cs_ids = []

for i, (regex, desc) in enumerate(configuration_set_regexes.items()):
    # Restrict to the configurations inserted above whose names match.
    matching = client.get_data(
        'configurations',
        fields='_id',
        query={'_id': {'$in': all_co_ids}, 'names': {'$regex': regex}},
        ravel=True
    )
    co_ids = matching.tolist()

    # One summary line per set: index, regex, number of members.
    print(
        'Configuration set {}'.format(i),
        '({}):'.format(regex).rjust(22),
        str(len(co_ids)).rjust(7),
    )

    cs_id = client.insert_configuration_set(co_ids, description=desc)
    cs_ids.append(cs_id)

In [None]:
# Register the dataset, tying together the configuration sets and the
# property IDs inserted above.
#
# The authors/links/description must describe the TiZrHfTa dataset
# (Gubaev et al.), not the Ta SNAP dataset of Thompson et al.
# NOTE(review): citation details below were entered from the
# Phys. Rev. Materials 5, 073801 (2021) publication -- confirm them (and add
# a link to the raw data repository, if any) before publishing.
ds_id = client.insert_dataset(
    cs_ids=cs_ids,
    pr_ids=all_pr_ids,
    name='TiZrHfTa_APS2021',
    authors=[
        'K. Gubaev', 'Y. Ikeda', 'F. Tasnádi', 'J. Neugebauer',
        'A. V. Shapeev', 'B. Grabowski', 'F. Körmann'
    ],
    links=[
        'https://doi.org/10.1103/PhysRevMaterials.5.073801',
    ],
    description='A dataset used to train machine-learning interatomic '
    'potentials for the disordered bcc TiZrHfTa_x system with varying Ta '
    'concentration, as published in Gubaev, K. et al., '
    'Phys. Rev. Materials 5, 073801 (2021).',
    resync=True,
    verbose=True,
)

In [None]:
# Maps a regex over configuration names to the label(s) applied to matches.
configuration_label_regexes = {'.*': 'active_learning'}

for regex, labels in configuration_label_regexes.items():
    # Only touch configurations inserted by this notebook.
    label_query = {'_id': {'$in': all_co_ids}, 'names': {'$regex': regex}}
    client.apply_labels(
        dataset_id=ds_id,
        collection_name='configurations',
        query=label_query,
        labels=labels,
        verbose=True,
    )

In [None]:
# Re-read the dataset (with resync) and pull the dataset object out of the
# returned record.
ds_record = client.get_dataset(ds_id, resync=True, verbose=True)
dataset = ds_record['dataset']

# Dump every aggregated statistic, one key/value pair per line.
for k, v in dataset.aggregated_info.items():
    print(k, v)

In [None]:
# Show the 'property_fields' entry of the aggregated info; the same entry is
# passed to the histogram plot.
dataset.aggregated_info['property_fields']

In [None]:
# Histogram each aggregated property field over the dataset's properties,
# using a logarithmic y axis.
fig = client.plot_histograms(
    dataset.aggregated_info['property_fields'],
    ids=dataset.property_ids,
    yscale='log',
)