This notebook serves as an example of how to load and manipulate the [Si GAP dataset](https://www.repository.cam.ac.uk/handle/1810/317974) using a `Dataset` object.

# Setup: download the data and install `colabfit-tools` (needed on Google Colab)

In [None]:
!mkdir si_prx_gap
!cd si_prx_gap && wget -O Si_PRX_GAP.zip https://www.repository.cam.ac.uk/bitstream/handle/1810/317974/Si_PRX_GAP.zip?sequence=1&isAllowed=yield
!cd si_prx_gap && unzip Si_PRX_GAP.zip
!pip install git+https://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@github.com/colabfit/colabfit-tools.git

# Imports

In [None]:
import os
import numpy as np

from colabfit.tools.dataset import Dataset, load_data
from colabfit.tools.property_settings import PropertySettings

# Dataset preparation

In [None]:
# Create the dataset container and attach its bibliographic metadata.
dataset = Dataset('Si_PRX_GAP')

dataset.authors = [
    'Albert P. Bartók',
    'James Kermode',
    'Noam Bernstein',
    'Gábor Csányi',
]

# Associated publication first, then the data repository entry.
dataset.links = [
    'https://journals.aps.org/prx/abstract/10.1103/PhysRevX.8.041048',
    'https://www.repository.cam.ac.uk/handle/1810/317974',
]

# Adjacent string literals are concatenated into a single description string.
dataset.description = (
    "The original DFT training data for the general-purpose silicon "
    "interatomic potential described in the associated publication."
    " The kinds of configuration that we include are chosen using "
    "intuition and past experience to guide what needs to be included "
    "to obtain good coverage pertaining to a range of properties."
)

In [None]:
# Property map layout:
#   property definition field -> {'field': ASE field, 'units': ASE-readable units}
dataset.property_map = {
    'default': dict(
        energy=dict(field='dft_energy', units='eV'),
        forces=dict(field='dft_force', units='eV/Ang'),
        virial=dict(field='dft_virial', units='GPa'),
    )
}

In [None]:
# Read the silicon configurations from the XYZ file under ./si_prx_gap.
dataset.configurations = load_data(
    file_path='./si_prx_gap/gp_iter6_sparse9k.xml.xyz',
    file_format='xyz',
    elements=['Si'],
    # Key in Configuration.info to use as the Configuration name
    name_field='config_type',
    # Fallback name used when `name_field` is not found
    default_name=dataset.name,
    verbose=True,
)

In [None]:
# Groups of configurations, built for easier analysis/exploration.
# Each pair is (regex, human-readable description of the resulting group);
# the pairs are kept in their original order.
dataset.configuration_set_regexes = dict([
    ('isolated_atom', 'Reference atom'),
    # Bulk crystal phases
    ('bt', 'Beta-tin'),
    ('dia', 'Diamond'),
    ('sh', 'Simple hexagonal'),
    ('hex_diamond', 'Hexagonal diamond'),
    ('bcc', 'Body-centered-cubic'),
    ('bc8', 'BC8'),
    ('fcc', 'Face-centered-cubic'),
    ('hcp', 'Hexagonal-close-packed'),
    ('st12', 'ST12'),
    # Disordered phases
    ('liq', 'Liquid'),
    ('amorph', 'Amorphous'),
    # Surfaces and reconstructions
    ('surface_001', 'Diamond surface (001)'),
    ('surface_110', 'Diamond surface (110)'),
    ('surface_111', 'Diamond surface (111)'),
    ('surface_111_pandey', 'Pandey reconstruction of diamond (111) surface'),
    ('surface_111_3x3_das', 'Dimer-adatom-stacking-fault (DAS) reconstruction'),
    ('111adatom', 'Configurations with adatom on (111) surface'),
    # Cracks, defects and other special configurations
    ('crack_110_1-10', 'Small (110) crack tip'),
    ('crack_111_1-10', 'Small (111) crack tip'),
    ('decohesion', 'Decohesion of diamond-structure Si along various directions'),
    ('divacancy', 'Diamond divacancy configurations'),
    ('interstitial', 'Diamond interstitial configurations'),
    ('screw_disloc', 'Si screw dislocation core'),
    ('sp', 'sp bonded configurations'),
    ('sp2', 'sp2 bonded configurations'),
    ('vacancy', 'Diamond vacancy configurations'),
])

In [None]:
# Used to apply metadata labels to configurations for future queries.
# Each key is a regex; the value is the label (or list of labels) to attach.
dataset.configuration_label_regexes = {
    'isolated_atom': 'isolated_atom',
    'bt': 'a5',
    'dia': 'diamond',
    'sh': 'sh',
    # Hexagonal diamond is lonsdaleite (label spelling corrected).
    'hex_diamond': 'lonsdaleite',
    'bcc': 'bcc',
    'bc8': 'bc8',
    'fcc': 'fcc',
    'hcp': 'hcp',
    'st12': 'st12',
    'liq': 'liquid',
    'amorph': 'amorphous',
    'surface_001': ['surface', '001'],
    'surface_110': ['surface', '110'],
    'surface_111': ['surface', '111'],
    'surface_111_pandey': ['surface', '111'],
    'surface_111_3x3_das': ['surface', '111', 'das'],
    '111adatom': ['surface', '111', 'adatom'],
    # Both crack entries share the 'crack' label so one query finds them all.
    'crack_110_1-10': ['crack', '110'],
    'crack_111_1-10': ['crack', '111'],
    'decohesion': ['diamond', 'decohesion'],
    'divacancy': ['diamond', 'vacancy', 'divacancy'],
    'interstitial': ['diamond', 'interstitial'],
    'screw_disloc': ['screw', 'dislocation'],
    'sp': 'sp',
    'sp2': 'sp2',
    'vacancy': ['diamond', 'vacancy']
}

In [None]:
# Resync to build groups and apply labels, i.e. put the
# configuration_set_regexes and configuration_label_regexes assigned in the
# preceding cells into effect.
dataset.resync()

In [None]:
# Rename the upper-case DFT_* configuration fields to the lower-case names
# that dataset.property_map refers to.
for original_field, renamed_field in (
    ('DFT_energy', 'dft_energy'),
    ('DFT_force', 'dft_force'),
    ('DFT_virial', 'dft_virial'),
):
    dataset.rename_configuration_field(original_field, renamed_field)

In [None]:
# Show the configuration sets currently defined on the dataset.
dataset.print_configuration_sets()

In [None]:
# Same mapping as assigned earlier in the notebook; by this point the
# configuration fields have been renamed to these lower-case names.
#   property definition field -> {'field': ASE field, 'units': ASE-readable units}
dataset.property_map = {
    'default': dict(
        energy=dict(field='dft_energy', units='eV'),
        forces=dict(field='dft_force', units='eV/Ang'),
        virial=dict(field='dft_virial', units='GPa'),
    )
}

In [None]:
# Parse the data for the loaded configurations, with unit conversion disabled
# and progress output enabled.
# NOTE(review): presumably this reads the fields named in dataset.property_map
# — confirm against the colabfit-tools documentation.
dataset.parse_data(convert_units=False, verbose=True)

In [None]:
# Number of configurations held by the dataset.
len(dataset.configurations)

In [None]:
# Display the dataset's summary.
dataset.summary()

# Exploring data

In [None]:
# Histogram the listed properties on a logarithmic y-axis.
# NOTE(review): dataset.property_map defines 'energy', 'forces' and 'virial',
# but 'stress' is requested here — confirm 'stress' is a valid property name
# for plot_histograms, or whether 'virial' was intended.
dataset.plot_histograms(['energy', 'forces', 'stress'], yscale='log')

In [None]:
# Distinct values of the 'xc_functional' configuration field across the dataset.
set(dataset.get_configuration_field('xc_functional'))

In [None]:
def has_no_xc_functional(configuration):
    """Return True when the configuration's info has no 'xc_functional' value."""
    return configuration.info.get('xc_functional') is None


# Keep only the configurations that do not record an exchange-correlation
# functional.
no_xc_data = dataset.filter('configurations', has_no_xc_functional)

len(no_xc_data.data)

In [None]:
def uses_pbe(configuration):
    """Return True when the configuration's 'xc_functional' info value is 'PBE'."""
    return configuration.info.get('xc_functional') == 'PBE'


# Keep only the configurations tagged with the PBE functional.
pbe_data = dataset.filter('configurations', uses_pbe)

len(pbe_data.data)

In [None]:
# Keep only the configurations tagged with the PW91 functional.
pw91_data = dataset.filter(
    'configurations',
    lambda c: c.info.get('xc_functional', None) == 'PW91'
)

# Report the size of the PW91 subset. The previous code referenced an
# undefined name (`small`), which raises NameError on a fresh kernel.
len(pw91_data.data)