This notebook serves as an example of how to load and manipulate the [Si GAP dataset](https://www.repository.cam.ac.uk/handle/1810/317974) using a `Dataset` object.

# Imports

In [None]:
import os
import numpy as np

from ase import Atoms

from colabfit.tools.dataset import Dataset, load_data
from colabfit.tools.property_settings import PropertySettings

# Dataset preparation

In [None]:
# Create the `Dataset` object that the rest of the notebook fills in, then
# attach its metadata: authors, reference links and a free-text description.
dataset = Dataset('Si_PRX_GAP')

dataset.authors = [
    'Albert P. Bartók',
    'James Kermode',
    'Noam Bernstein',
    'Gábor Csányi',
]

# Journal article (PRX) and the Cambridge repository entry for the data.
dataset.links = [
    'https://journals.aps.org/prx/abstract/10.1103/PhysRevX.8.041048',
    'https://www.repository.cam.ac.uk/handle/1810/317974',
]

# Adjacent string literals are concatenated into a single description string.
dataset.description = (
    "The original DFT training data for the general-purpose silicon "
    "interatomic potential described in the associated publication. "
    "The kinds of configuration that we include are chosen using "
    "intuition and past experience to guide what needs to be included "
    "to obtain good coverage pertaining to a range of properties."
)

In [None]:
# For every property-definition field, record the ASE field that holds its
# values ('field') and the ASE-readable units of those values ('units').
# NOTE(review): 'GPa' is recorded for the virial field -- confirm that this
# matches the units actually used in the data file.
dataset.property_map = {
    'default': dict(
        energy=dict(field='dft_energy', units='eV'),
        forces=dict(field='dft_force', units='eV/Ang'),
        virial=dict(field='dft_virial', units='GPa'),
    )
}

In [None]:
# Relative path to the XYZ data file that holds the configurations.
xyz_path = '../../../colabfit/data/gap_si/gp_iter6_sparse9k.xml.xyz'

dataset.configurations = load_data(
    file_path=xyz_path,
    file_format='xyz',
    # Key in ase.Atoms.info to use as the Configuration name; None here, so
    # no such key is given.
    name_field=None,
    elements=['Si'],
    # Name to fall back on when `name_field` is not found.
    default_name=dataset.name,
    verbose=True,
)

In [None]:
# Number of configurations returned by `load_data`; as the last expression in
# the cell, the value is shown as the cell output.
len(dataset.configurations)

In [None]:
# Rename mismatched field names: some configurations may store their DFT data
# under upper-case 'DFT_*' keys, whereas `dataset.property_map` refers to the
# lower-case 'dft_*' keys.
field_renames = {
    'DFT_energy': 'dft_energy',
    'DFT_force': 'dft_force',
    'DFT_virial': 'dft_virial',
}

for conf in dataset.configurations:
    for old_name, new_name in field_renames.items():
        # `info` is checked first; `arrays` is only consulted when the key is
        # absent from `info`. `pop` moves the value and drops the old key in
        # one step.
        if old_name in conf.info:
            conf.info[new_name] = conf.info.pop(old_name)
        elif old_name in conf.arrays:
            conf.arrays[new_name] = conf.arrays.pop(old_name)

In [None]:
# Parse the loaded configurations with unit conversion switched off
# (`convert_units=False`) and verbose output enabled.
# NOTE(review): presumably this extracts the properties described by
# `dataset.property_map` -- confirm against the colabfit documentation.
dataset.parse_data(convert_units=False, verbose=True)

# Exploring data

In [None]:
# Plot histograms for the dataset. `yscale='log'` presumably requests a
# logarithmic y-axis -- confirm against the `plot_histograms` documentation.
dataset.plot_histograms(yscale='log')