This notebook serves as an example of how to load and manipulate the [Si GAP dataset](https://www.repository.cam.ac.uk/handle/1810/317974) using a `Dataset` object.

# Uncomment for Google Colab

# Imports

In [None]:
import os
import numpy as np

# Initialize the database

In [None]:
from colabfit.tools.database import MongoDatabase, load_data

# Database handle used by every cell below. The first argument is the database
# name; nprocs=1 presumably keeps all operations in a single process (confirm).
client = MongoDatabase('colabfit_database', nprocs=1)

# Data loading

In [None]:
# Read every frame of the training file into memory up front
images = [
    atoms
    for atoms in load_data(
        file_path='../../../colabfit/data/gap_si/gp_iter6_sparse9k.xml.xyz',
        file_format='xyz',
        # key in Configuration.info to use as the Configuration name
        name_field='config_type',
        # order matters for CFG files, but not others
        elements=['Si'],
        # fallback name used when `name_field` is not found
        default_name='Si_PRX_GAP',
        verbose=True,
    )
]

In [None]:
# Data stored on atoms needs to be cleaned:
#   1. rename the upper-case DFT_* keys to the lower-case field names that the
#      property map reads ('dft_energy', 'dft_force', 'dft_virial');
#   2. convert settings stored as strings (presumably "<value> <unit>") into
#      plain floats by keeping only the first space-separated token.
for img in images:
    # (container holding the key, old key, new key)
    for store, old_key, new_key in (
        (img.info, 'DFT_energy', 'dft_energy'),
        (img.arrays, 'DFT_force', 'dft_force'),
        (img.info, 'DFT_virial', 'dft_virial'),
    ):
        if old_key in store:
            store[new_key] = store.pop(old_key)

    for k in [
        'md_temperature', 'md_cell_t', 'smearing_width', 'md_delta_t',
        'md_ion_t', 'cut_off_energy', 'elec_energy_tol',
        ]:
        if k not in img.info:
            continue

        value = img.info[k]
        if not isinstance(value, str):
            # Already numeric (or otherwise not a string): leave untouched
            continue

        try:
            img.info[k] = float(value.split(' ')[0])
        except ValueError:
            # Best effort: keep the original string when it does not start
            # with a parsable number
            pass

In [None]:
# Definition of the main property: energy, forces and (optionally) virial
base_definition = {
    'property-id': 'energy-forces-virial',
    'property-title': 'A default property for storing energies, forces, and virial',
    'property-description': 'Energies and forces computed using DFT',

    # Scalar
    'energy': {
        'type': 'float',
        'has-unit': True,
        'extent': [],
        'required': True,
        'description': 'Cohesive energy',
    },
    # One 3-vector per row; ':' leaves the number of rows open
    'forces': {
        'type': 'float',
        'has-unit': True,
        'extent': [':', 3],
        'required': True,
        'description': 'Atomic forces',
    },
    # Six components; the only optional field
    'virial': {
        'type': 'float',
        'has-unit': True,
        'extent': [6],
        'required': False,
        'description': 'Virial stress',
    },
}
    
# Definition of the catch-all property holding every additional field shipped
# with the dataset. Each row below is (field name, type, has-unit, extent);
# every field is optional and has an empty description.
extra_stuff_definition = {
    'property-id': 'si-prx-gap-data',
    'property-title': 'Si PRX GAP data',
    'property-description': 'A property for storing all of the additional information provided for the Si PRX GAP dataset',

    **{
        name: {
            'type': kind,
            'has-unit': has_unit,
            'extent': extent,
            'required': False,
            'description': '',
        }
        for name, kind, has_unit, extent in (
            ('mix_history_length',       'float',  False, []),
            ('castep_file_name',         'string', False, []),
            ('grid_scale',               'float',  False, []),
            ('popn_calculate',           'bool',   False, []),
            ('n_neighb',                 'int',    False, [":"]),
            ('oldpos',                   'float',  True,  [":", 3]),
            ('i_step',                   'int',    False, []),
            ('md_temperature',           'float',  True,  []),
            ('positions',                'float',  True,  [":", 3]),
            ('task',                     'string', False, []),
            ('data_distribution',        'string', False, []),
            ('avg_ke',                   'float',  True,  [":"]),
            ('force_nlpot',              'float',  True,  [":", 3]),
            ('continuation',             'string', False, []),
            ('castep_run_time',          'float',  True,  []),
            ('calculate_stress',         'bool',   False, []),
            ('Minim_Hydrostatic_Strain', 'bool',   False, []),
            ('avgpos',                   'float',  True,  [":", 3]),
            ('frac_pos',                 'float',  False, [":", 3]),
            ('hamiltonian',              'float',  False, []),
            ('md_cell_t',                'float',  True,  []),
            ('cutoff_factor',            'float',  False, []),
            ('momenta',                  'float',  False, [":", 3]),
            ('elec_energy_tol',          'float',  False, []),
            ('mixing_scheme',            'string', False, []),
            ('Minim_Lattice_Fix',        'float',  False, [9]),
            ('in_file',                  'string', False, []),
            ('travel',                   'float',  False, [":", 3]),
            ('thermostat_region',        'float',  False, [":"]),
            ('time',                     'float',  True,  []),
            ('temperature',              'float',  True,  []),
            ('kpoints_mp_grid',          'float',  False, [3]),
            ('gap_force',                'float',  True,  [":", 3]),
            ('gap_energy',               'float',  True,  []),
            ('cutoff',                   'float',  True,  []),
            ('xc_functional',            'string', False, []),
            ('smearing_width',           'float',  True,  []),
            ('pressure',                 'float',  True,  []),
            ('gap_virial',               'float',  True,  [9]),
            ('reuse',                    'string', False, []),
            ('fix_occupancy',            'bool',   False, []),
            ('map_shift',                'float',  False, [":", 3]),
            ('md_num_iter',              'int',    False, []),
            ('damp_mask',                'float',  False, [":"]),
            ('opt_strategy',             'string', False, []),
            ('spin_polarized',           'bool',   False, []),
            ('nextra_bands',             'int',    False, []),
            ('fine_grid_scale',          'float',  False, []),
            ('masses',                   'float',  True,  [":"]),
            ('iprint',                   'int',    False, []),
            ('finite_basis_corr',        'string', False, []),
            ('enthalpy',                 'float',  True,  []),
            ('opt_strategy_bias',        'int',    False, []),
            ('force_ewald',              'float',  True,  [":", 3]),
            ('num_dump_cycles',          'int',    False, []),
            ('velo',                     'float',  True,  [":", 3]),
            ('md_delta_t',               'float',  True,  []),
            ('md_ion_t',                 'float',  True,  []),
            ('force_locpot',             'float',  True,  [":", 3]),
            ('numbers',                  'int',    False, [":"]),
            ('max_scf_cycles',           'int',    False, []),
            ('mass',                     'float',  True,  [":"]),
            ('Minim_Constant_Volume',    'bool',   False, []),
            ('cut_off_energy',           'float',  True,  []),
            ('virial',                   'float',  True,  [3, 3]),
            ('nneightol',                'float',  False, []),
            ('max_charge_amp',           'float',  False, []),
            ('md_thermostat',            'string', False, []),
            ('md_ensemble',              'string', False, []),
            ('acc',                      'float',  False, [":", 3]),
        )
    },
}

# Units for every field declared with 'has-unit': True, keyed by the field name
# as it appears in the property definitions (before hyphenation). Each key is
# listed exactly once: a repeated key in a dict literal silently overrides the
# earlier entry.
units = {
    'energy': 'eV',
    'forces': 'eV/Ang',
    'virial': 'GPa',
    'oldpos': 'Ang',
    'md_temperature': 'K',
    'positions': 'Ang',
    'avg_ke': 'eV',
    'force_nlpot': 'eV/Ang',
    'castep_run_time': 's',
    'avgpos': 'Ang',
    'md_cell_t': 'ps',
    'time': 's',
    'temperature': 'K',
    'gap_force': 'eV/Ang',
    'gap_energy': 'eV',
    'cutoff': 'Ang',
    'smearing_width': 'eV',
    'pressure': 'GPa',
    'gap_virial': 'GPa',
    'masses': '_amu',
    'enthalpy': 'eV',
    'force_ewald': 'eV/Ang',
    'velo': 'Ang/s',
    'md_delta_t': 'fs',
    'md_ion_t': 'ps',
    'force_locpot': 'eV/Ang',
    'mass': 'g',
    'cut_off_energy': 'eV',
}

# Maps each property definition field to the ASE field that holds its data.
# This must run while `extra_stuff_definition` still has its underscore key
# names, because those names are the ASE fields.
property_map = {
    # Property Definition field: {'field': ASE field, 'units': ASE-readable units}
    'energy-forces-virial': {
        'energy': {'field': 'dft_energy', 'units': 'eV'},
        'forces': {'field': 'dft_force', 'units': 'eV/Ang'},
        'virial': {'field': 'dft_virial', 'units': 'GPa'},
    },
    # Hyphenated, lower-case definition key -> original field name;
    # fields without an entry in `units` get None
    'si-prx-gap-data': {
        name.replace('_', '-').lower(): {'field': name, 'units': units.get(name)}
        for name in extra_stuff_definition
        if name not in ('property-id', 'property-title', 'property-description')
    },
}

# Field names in a property definition can't contain underscores, so re-key
# the definition with hyphenated, lower-case names
extra_stuff_definition = dict(
    (name.replace('_', '-').lower(), spec)
    for name, spec in extra_stuff_definition.items()
)

In [None]:
# Register both property definitions with the database; the data inserted
# below refers to them through the ids 'energy-forces-virial' and
# 'si-prx-gap-data'
client.insert_property_definition(base_definition)
client.insert_property_definition(extra_stuff_definition)

In [None]:
from colabfit.tools.property_settings import PropertySettings

# A single settings object, shared by both properties
pso = PropertySettings(
    method='CASTEP',
    description='DFT calculations using the CASTEP software',
    files=None,
    labels=['Monkhorst-Pack'],
)

# `ids` is unpacked further down into configuration ids and property ids
ids = client.insert_data(
    images,
    property_map=property_map,
    # Keyed by property id: the keys must be the same ids that `property_map`
    # uses, otherwise the settings are not associated with that property
    property_settings={'energy-forces-virial': pso, 'si-prx-gap-data': pso},
    generator=False,
    verbose=True
)

Note: this dataset contains four pairs of duplicate configurations. This can be seen by counting the configurations that are linked to more properties than expected: each configuration should be linked to 2 properties, whereas a duplicated configuration is linked to twice as many.

In [None]:
# An entry at index 2 of the linked-property list means the configuration has
# more than the expected two properties
duplicate_filter = {'relationships.properties.2': {'$exists': True}}
client.configurations.count_documents(duplicate_filter)

# Building Configuration Sets

In [None]:
# Groups of configurations for easier analysis/exploration, given as
# (regex matched against the configuration names, description) pairs.
# NOTE(review): the patterns are passed to `$regex` unanchored, so e.g. 'sp'
# also matches 'sp2' names and 'vacancy' also matches 'divacancy' -- confirm
# that the overlap between sets is intended.
configuration_set_regexes = dict([
    ('isolated_atom', 'Reference atom'),
    ('bt', 'Beta-tin'),
    ('dia', 'Diamond'),
    ('sh', 'Simple hexagonal'),
    ('hex_diamond', 'Hexagonal diamond'),
    ('bcc', 'Body-centered-cubic'),
    ('bc8', 'BC8'),
    ('fcc', 'Face-centered-cubic'),
    ('hcp', 'Hexagonal-close-packed'),
    ('st12', 'ST12'),
    ('liq', 'Liquid'),
    ('amorph', 'Amorphous'),
    ('surface_001', 'Diamond surface (001)'),
    ('surface_110', 'Diamond surface (110)'),
    ('surface_111', 'Diamond surface (111)'),
    ('surface_111_pandey', 'Pandey reconstruction of diamond (111) surface'),
    ('surface_111_3x3_das', 'Dimer-adatom-stacking-fault (DAS) reconstruction'),
    ('111adatom', 'Configurations with adatom on (111) surface'),
    ('crack_110_1-10', 'Small (110) crack tip'),
    ('crack_111_1-10', 'Small (111) crack tip'),
    ('decohesion', 'Decohesion of diamond-structure Si along various directions'),
    ('divacancy', 'Diamond divacancy configurations'),
    ('interstitial', 'Diamond interstitial configurations'),
    ('screw_disloc', 'Si screw dislocation core'),
    ('sp', 'sp bonded configurations'),
    ('sp2', 'sp2 bonded configurations'),
    ('vacancy', 'Diamond vacancy configurations'),
])

In [None]:
# Ids of the inserted configuration sets, in the order of the regex table
cs_ids = []

for index, pattern in enumerate(configuration_set_regexes):
    # Ids of every configuration whose name matches the pattern
    matches = client.get_data(
        'configurations',
        fields='_id',
        query={'names': {'$regex': pattern}},
        ravel=True
    ).tolist()

    # Aligned one-line summary: set index, pattern, number of matches
    tag = f'({pattern}):'
    print(f'Configuration set {index} {tag:>22} {len(matches):>7}')

    cs_ids.append(
        client.insert_configuration_set(
            matches,
            description=configuration_set_regexes[pattern],
            verbose=True,
        )
    )

# Building the Dataset

In [None]:
# Transpose the entries returned by `insert_data` into one tuple of
# configuration ids and one tuple of property ids
all_co_ids, all_pr_ids = zip(*ids)
len(all_pr_ids)

In [None]:
# Combine the configuration sets and all inserted properties into one dataset
ds_id = client.insert_dataset(
    cs_ids=cs_ids,
    pr_ids=all_pr_ids,
    name='Si_PRX_GAP',
    authors=[
        'Albert P. Bartók',
        'James Kermode',
        'Noam Bernstein',
        'Gábor Csányi',
    ],
    links=[
        'https://journals.aps.org/prx/abstract/10.1103/PhysRevX.8.041048',
        'https://www.repository.cam.ac.uk/handle/1810/317974',
    ],
    # Adjacent string literals are joined into a single description
    description=(
        "The original DFT training data for the general-purpose silicon "
        "interatomic potential described in the associated publication."
        " The kinds of configuration that we include are chosen using "
        "intuition and past experience to guide what needs to be included "
        "to obtain good coverage pertaining to a range of properties."
    ),
    resync=True,
    verbose=True,
)
ds_id

# Adding labels

In [None]:
# Same unpacking as in the "Building the Dataset" section, repeated here:
# one tuple of configuration ids and one tuple of property ids
all_co_ids, all_pr_ids = zip(*ids)
len(all_pr_ids)

In [None]:
# Label every property with the XC functional recorded in its source data
client.apply_labels(
    dataset_id=ds_id,
    collection_name='properties',
    query={'si-prx-gap-data.xc-functional.source-value': 'PW91'},
    labels='PW91',
    verbose=True,
)
client.apply_labels(
    dataset_id=ds_id,
    collection_name='properties',
    query={'si-prx-gap-data.xc-functional.source-value': 'PBE'},
    labels='PBE',
    verbose=True,
)

In [None]:
# Used to apply metadata labels to configurations for future queries.
# Keys are regexes matched against the configuration names; values are either
# a single label or a list of labels.
configuration_label_regexes = {
    'isolated_atom': 'isolated_atom',
    'bt': 'a5',
    'dia': 'diamond',
    'sh': 'sh',
    'hex_diamond': 'lonsdaleite',  # hexagonal diamond
    'bcc': 'bcc',
    'bc8': 'bc8',
    'fcc': 'fcc',
    'hcp': 'hcp',
    'st12': 'st12',
    'liq': 'liquid',
    'amorph': 'amorphous',
    'surface_001': ['surface', '001'],
    'surface_110': ['surface', '110'],
    'surface_111': ['surface', '111'],
    'surface_111_pandey': ['surface', '111'],
    'surface_111_3x3_das': ['surface', '111', 'das'],
    '111adatom': ['surface', '111', 'adatom'],
    # Both crack families share the 'crack' label so one query finds them all
    'crack_110_1-10': ['crack', '110'],
    'crack_111_1-10': ['crack', '111'],
    'decohesion': ['diamond', 'decohesion'],
    'divacancy': ['diamond', 'vacancy', 'divacancy'],
    'interstitial': ['diamond', 'interstitial'],
    'screw_disloc': ['screw', 'dislocation'],
    'sp': 'sp',
    'sp2': 'sp2',
    'vacancy': ['diamond', 'vacancy']
}

In [None]:
# Apply each label (or list of labels) to every configuration whose name
# matches the corresponding regex
for pattern in configuration_label_regexes:
    client.apply_labels(
        dataset_id=ds_id,
        collection_name='configurations',
        query={'names': {'$regex': pattern}},
        labels=configuration_label_regexes[pattern],
        verbose=True,
    )

# Exploring

In [None]:
# `get_dataset` returns a mapping; the Dataset object is under its 'dataset' key
dataset_record = client.get_dataset(ds_id, resync=True)
dataset = dataset_record['dataset']

In [None]:
# One "name value" line per aggregated statistic
for entry in dataset.aggregated_info.items():
    print(*entry)

In [None]:
# Histograms of the three main quantities for the properties in this dataset
histogram_fields = [
    'energy-forces-virial.energy',
    'energy-forces-virial.forces',
    'energy-forces-virial.virial',
]
client.plot_histograms(
    histogram_fields,
    ids=dataset.property_ids,
    yscale='log',
    verbose=True,
)

In [None]:
# Write a README-style summary of the dataset.
# The output folder is given relative to the notebook, using the same
# repository-root prefix ('../../../colabfit/') as the data file loaded at the
# top, so the cell does not depend on one user's home directory.
client.dataset_to_markdown(
    ds_id=ds_id,
    base_folder='../../../colabfit/examples/'+dataset.name,
    html_file_name='README.md',
    data_format='mongo',
    data_file_name=None,
    histogram_fields=['energy-forces-virial.energy', 'energy-forces-virial.forces', 'energy-forces-virial.virial'],
    yscale='log'
)

In [None]:
# Convert to per-atom energies: each energy is divided by the number of sites
# of its configuration.
# NOTE(review): presumably this rewrites the stored values, in which case
# re-running the cell would divide a second time -- confirm before re-running.
client.apply_transformation(
    dataset_id=ds_id,
    configuration_ids=all_co_ids,
    property_ids=all_pr_ids,
    update_map={
        'energy-forces-virial.energy': (
            lambda f, doc: f/doc['configuration']['nsites']
        ),
    },
)

In [None]:
# Same histograms as before the transformation, for comparison
histogram_fields = [
    'energy-forces-virial.energy',
    'energy-forces-virial.forces',
    'energy-forces-virial.virial',
]
client.plot_histograms(
    histogram_fields,
    ids=dataset.property_ids,
    yscale='log',
    verbose=True,
)

# Filtering datasets based on XC functional

In [None]:
# Distinct values of the XC functional field across the stored properties
xc_values = client.get_data('properties', 'si-prx-gap-data.xc-functional', ravel=True)
set(xc_values)

In [None]:
# Subset of the dataset whose properties do not record an XC functional
no_xc_config_sets, no_xc_pr_ids = client.filter_on_properties(
    ds_id,
    query={'si-prx-gap-data.xc-functional.source-value': {'$exists': False}},
)

new_cs_ids = []
for cs in no_xc_config_sets:
    # Skip configuration sets left empty by the filter, as the PBE and PW91
    # subsets below do
    if cs.configuration_ids:
        new_cs_ids.append(client.insert_configuration_set(cs.configuration_ids, cs.description, verbose=True))

no_xc_ds_id = client.insert_dataset(
    cs_ids=new_cs_ids,
    pr_ids=no_xc_pr_ids,
    name='Si_PRX_GAP-no-xc',
    authors=dataset.authors,
    links=dataset.links,
    description="A subset of the Si_PRX_GAP dataset that only contains data without a specified XC functional",
    resync=True,
    verbose=True,
)
no_xc_ds_id

In [None]:
# Subset of the dataset computed with the PBE XC functional
pbe_config_sets, pbe_pr_ids = client.filter_on_properties(
    ds_id,
    query={'si-prx-gap-data.xc-functional.source-value': 'PBE'},
)

# Re-insert only the configuration sets that still contain configurations
new_cs_ids = [
    client.insert_configuration_set(cs.configuration_ids, cs.description, verbose=True)
    for cs in pbe_config_sets
    if cs.configuration_ids
]

pbe_ds_id = client.insert_dataset(
    cs_ids=new_cs_ids,
    pr_ids=pbe_pr_ids,
    name='Si_PRX_GAP-pbe',
    authors=dataset.authors,
    links=dataset.links,
    description="A subset of the Si_PRX_GAP dataset that only contains data computed using the PBE XC functional",
    resync=True,
    verbose=True,
)
pbe_ds_id

In [None]:
# Subset of the dataset computed with the PW91 XC functional
pw91_config_sets, pw91_pr_ids = client.filter_on_properties(
    ds_id,
    query={'si-prx-gap-data.xc-functional.source-value': 'PW91'},
)

# Re-insert only the configuration sets that still contain configurations
new_cs_ids = [
    client.insert_configuration_set(cs.configuration_ids, cs.description, verbose=True)
    for cs in pw91_config_sets
    if cs.configuration_ids
]

pw91_ds_id = client.insert_dataset(
    cs_ids=new_cs_ids,
    pr_ids=pw91_pr_ids,
    name='Si_PRX_GAP-pw91',
    authors=dataset.authors,
    links=dataset.links,
    description="A subset of the Si_PRX_GAP dataset that only contains data computed using the PW91 XC functional",
    resync=True,
    verbose=True,
)
pw91_ds_id