# Install `colabfit-tools` (run this cell when using Google Colab)

In [None]:
!pip install git+https://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@github.com/colabfit/colabfit-tools.git

# Basic Example

## Creating a `Dataset` from scratch

In [None]:
from colabfit.tools.dataset import Dataset

In [None]:
# Create an empty dataset named 'example', then attach its descriptive
# metadata: the author list, reference links and a free-text description.
dataset = Dataset(name='example')
dataset.authors = ['J. E. Lennard-Jones']
dataset.links = ['https://en.wikipedia.org/wiki/John_Lennard-Jones']
dataset.description = "This is an example dataset"


## Adding configurations

### Manually

In [None]:
import numpy as np
from ase import Atoms

def _random_hydrogen_cluster(n_atoms):
    """Return an `Atoms` object holding `n_atoms` randomly placed H atoms.

    The object is tagged with a name ('configuration_<n_atoms>'), a dummy
    'dft_energy' equal to n_atoms squared, and random 'dft_forces' of shape
    (n_atoms, 3).
    """
    # Positions are drawn before forces so the random stream is consumed in
    # the same order for every cluster.
    cluster = Atoms('H' * n_atoms, positions=np.random.random((n_atoms, 3)))
    cluster.info['_name'] = f'configuration_{n_atoms}'
    cluster.info['dft_energy'] = n_atoms * n_atoms
    cluster.arrays['dft_forces'] = np.random.normal(size=(n_atoms, 3))
    return cluster


# One cluster for each size from 1 to 999 atoms.
images = [_random_hydrogen_cluster(size) for size in range(1, 1000)]

In [None]:
from colabfit.tools.configuration import Configuration

# Convert every ASE image into a Configuration, preserving the order of `images`.
dataset.configurations = list(map(Configuration.from_ase, images))

### Using `load_data()`

In [None]:
from ase.io import write

# Dump all images into a single file so it can be read back with load_data().
# NOTE(review): '/content' is presumably the Google Colab working directory;
# adjust the path when running elsewhere.
write('/content/example.extxyz', images)

In [None]:
from colabfit.tools.dataset import load_data

# Read the configurations back from the file written earlier in the notebook.
# `name_field` points at the '_name' key that was stored in each image's info
# dict, and only hydrogen is listed as an element.
load_options = {
    'file_path': '/content/example.extxyz',
    'file_format': 'xyz',
    'name_field': '_name',
    'elements': ['H'],
    'default_name': None,
}
dataset.configurations = load_data(**load_options)

In [None]:
# Print the first five configurations as a quick sanity check.
# A plain loop is used because printing is a side effect; building a throwaway
# list of None values and binding it to `_` would also shadow IPython's
# last-output variable.
for co in dataset.configurations[:5]:
    print(co)

## Applying labels to configurations

In [None]:
# Map regular expressions (matched against configuration names) to labels.
# Labels can be specified as lists or single strings (which will be wrapped in a list).
#
# NOTE: a character class such as `[1-10]` matches one character ('1' or '0'),
# not the numeric range 1-10. The range is therefore spelled out explicitly,
# and `(?!\d)` stops e.g. 'configuration_100' from matching as 'configuration_10'.
dataset.configuration_label_regexes = {
    r'configuration_(?:10|[1-9])(?!\d)': 'small',  # configurations 1 through 10
    '.*': 'random',                                 # every configuration
}

## Building configuration sets

In [None]:
# Map regular expressions (matched against configuration names) to the
# description of the configuration set they define.
#
# NOTE: `[1-499]` and `[500-999]` are character classes that match a single
# digit, not numeric ranges, so they do not split the names at 499/500.
# The ranges are spelled out digit by digit instead; `(?!\d)` rejects names
# whose number continues past the matched digits.
dataset.configuration_set_regexes = {
    # 1-99 -> [1-9]\d?   100-499 -> [1-4]\d{2}
    r'configuration_(?:[1-9]\d?|[1-4]\d{2})(?!\d)': "The first configuration set",
    # 500-999 -> [5-9]\d{2}
    r'configuration_[5-9]\d{2}(?!\d)':              "The second configuration set",
}

## Synchronizing the dataset

Call `Dataset.resync()` to make the dataset self-consistent: it applies the configuration labels, builds the configuration sets, etc.

In [None]:
# Show the first configuration's '_labels' entry and the current configuration
# sets; this cell comes before the resync() call, so it shows the "before" state.
print(dataset.configurations[0].info['_labels'])
dataset.print_configuration_sets()

In [None]:
# Synchronize the dataset so the label and configuration-set regexes defined
# earlier are applied.
dataset.resync()

In [None]:
# Show the first configuration's '_labels' entry and the configuration sets
# again, now that resync() has been called.
print(dataset.configurations[0].info['_labels'])
dataset.print_configuration_sets()

## Parsing the data

In [None]:
# Describe where each property lives on the loaded configurations: the source
# field name ('dft_energy' / 'dft_forces', as stored on the images) and its units.
energy_spec = {'field': 'dft_energy', 'units': 'eV'}
forces_spec = {'field': 'dft_forces', 'units': 'eV/Ang'}

dataset.property_map = {
    'default': {'energy': energy_spec, 'forces': forces_spec},
}

In [None]:
# Parse the data with unit conversion disabled and verbose output enabled.
# NOTE(review): presumably this uses the property_map assigned earlier — confirm.
dataset.parse_data(convert_units=False, verbose=True)

## Visualizing the data

In [None]:
# Plot histograms for the 'energy' and 'forces' properties.
dataset.plot_histograms(['energy', 'forces'])

## Providing calculation metadata

In [None]:
from colabfit.tools.property_settings import PropertySettings

# Calculation metadata shared by all properties in this example.
vasp_settings = PropertySettings(
    method='VASP',
    description='energy/force calculations',
    # files=['/path/to/INCAR'],
    labels=['PBE', 'GGA'],
)

# The catch-all pattern '.*' associates these settings with every match.
dataset.property_settings_regexes = {'.*': vasp_settings}

In [None]:
# Synchronize again after assigning property_settings_regexes.
# NOTE(review): presumably this is what attaches the settings to the parsed data — confirm.
dataset.resync()

In [None]:
# Display the settings of the first entry in dataset.data (a notebook cell
# shows the value of its last expression).
dataset.data[0].settings