# Overview

* Flexible enough to handle different types of data
* Able to quickly identify relationships between datasets
* Avoids data duplication

# Database

* The underlying database
* Example: MongoDB

In [2]:
from colabfit.tools.database import MongoDatabase

# Instantiate the MongoDatabase wrapper used throughout this walkthrough.
# 'test' is the only argument passed (presumably the database name — confirm).
database = MongoDatabase('test')
# A bare expression on the last line of a notebook cell displays its repr.
database

MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True)

# Configurations

* The inputs to a property calculation
* Example: atomic positions, cell vectors, PBCs, constraints

In [3]:
from colabfit.tools.configuration import Configuration

In [4]:
import numpy as np

# Build a three-atom H2O configuration with random positions inside a
# periodic unit cube.
atoms = Configuration(
    symbols='H2O',
    positions=np.random.random((3, 3)),  # one (x, y, z) row per atom
    cell=[[1, 0, 0], [0, 1, 0], [0, 0, 1]],
    pbc=True,
)
# Display the repr of the new configuration.
atoms

Configuration(symbols='H2O', pbc=True, cell=[1.0, 1.0, 1.0])

In [5]:
from ase import Atoms

# Build a plain ASE Atoms object so the conversion below has a genuine ASE
# input to work on.
ase_atoms = Atoms(
    symbols='H2O',
    positions=np.random.random((3, 3)),  # one (x, y, z) row per atom
    cell=[[1, 0, 0], [0, 1, 0], [0, 0, 1]],
    pbc=True,
)

# Convert the ASE object into a colabfit Configuration. The argument must be
# `ase_atoms`; passing the `atoms` Configuration from the earlier cell would
# leave `ase_atoms` unused and would not demonstrate the conversion.
colabfit_atoms = Configuration.from_ase(ase_atoms)
colabfit_atoms

Configuration(symbols='H2O', pbc=True, cell=[1.0, 1.0, 1.0])

In [6]:
# Build 100 copper configurations holding 1 through 100 atoms, each with
# random positions in a periodic unit cell.
configurations = [
    Configuration(
        symbols=f'Cu{i}',  # formula string 'Cu1', 'Cu2', ..., 'Cu100'
        positions=np.random.random((i, 3)),  # i rows of (x, y, z)
        cell=[[1, 0, 0], [0, 1, 0], [0, 0, 1]],
        pbc=True,
    )
    for i in range(1, 101)
]
# Preview the first five entries.
configurations[:5]

[Configuration(symbols='Cu', pbc=True, cell=[1.0, 1.0, 1.0]),
 Configuration(symbols='Cu2', pbc=True, cell=[1.0, 1.0, 1.0]),
 Configuration(symbols='Cu3', pbc=True, cell=[1.0, 1.0, 1.0]),
 Configuration(symbols='Cu4', pbc=True, cell=[1.0, 1.0, 1.0]),
 Configuration(symbols='Cu5', pbc=True, cell=[1.0, 1.0, 1.0])]

# Properties

* The outputs of a calculation
* Must point to at least one Configuration
* Should also point to a PropertySettings object
* Usually constructed using `database.insert_data()`
* Example: configuration energy, atomic forces

In [7]:
# Attach randomly generated stand-in results to every configuration: a single
# random float under info['energy'] and a (len(atoms), 3) random array under
# arrays['forces'].
# NOTE(review): the loop variable rebinds the `atoms` name assigned in an
# earlier cell.
for atoms in configurations:
    atoms.info['energy'] = np.random.random()
    atoms.arrays['forces'] = np.random.random((len(atoms), 3))

In [8]:
from colabfit.tools.property import Property

In [9]:
# Property definition for "energy-forces": three metadata entries followed by
# one specification dictionary per field.
definition = {
    "property-id": "energy-forces",
    "property-title": "Basic outputs from a static calculation",
    "property-description": "Supercell energy and atomic forces",
    # Empty extent list (presumably denotes a scalar; confirm against the
    # property-definition schema).
    "energy": {
        "type": "float",
        "has-unit": True,
        "extent": [],
        "required": False,
        "description": "Potential energy of the entire configuration",
    },
    # Extent of [":", 3] (presumably any number of rows by three columns;
    # confirm against the property-definition schema).
    "forces": {
        "type": "float",
        "has-unit": True,
        "extent": [":", 3],
        "required": False,
        "description": "x,y,z force components for each atom",
    },
}

In [10]:
# For each field of the property definition, name the key that holds its
# value (presumably looked up on the configuration; confirm) and the units
# of that value.
property_map = {
    'energy': {
        'field': 'energy',
        'units': 'eV',
    },
    'forces': {
        'field': 'forces',
        'units': 'eV/Ang',
    },
}

In [11]:
# Build a Property named 'energy-forces', passing the definition dictionary,
# the first configuration, and the field/units map. The result is not
# assigned, so the notebook only displays its repr.
Property.from_definition(
    'energy-forces',
    definition=definition,
    configuration=configurations[0],
    property_map=property_map
)

Property(instance_id=1, name='energy-forces')

# PropertySettings

* Additional metadata for setting up a calculation
* Example: xc-functional, k-point mesh, full INCAR file

In [12]:
from colabfit.tools.property_settings import PropertySettings

In [13]:
# Record how the calculation was set up: the method name, a free-text
# description, optional input files, and a list of labels.
ps = PropertySettings(
    method='VASP',
    description='A static VASP calculation',
    files=None,  # no files attached in this example; e.g. [/path/to/INCAR]
    labels=['PBE', 'GGA', 'Monkhorst-Pack']
)
# Display the repr of the settings object.
ps

PropertySettings(method='VASP', description='A static VASP calculation', labels={'Monkhorst-Pack', 'GGA', 'PBE'})

# ConfigurationSets

* Defines a group of configurations
* Aggregates configuration information (e.g., atom counts, labels, chemical systems, ...)
* Useful for improving discoverability and interpretability
* Example: "Snapshots from a molecular dynamics run at 1000K"

In [14]:
from colabfit.tools.configuration_set import ConfigurationSet

In [15]:
# Group every configuration built above into a single ConfigurationSet.
cs = ConfigurationSet(
    # One stand-in string ID per configuration; real IDs come from
    # `database.insert_data()`.
    configuration_ids=list(map(str, range(len(configurations)))),
    description='Randomly-generated Cu clusters',
    # Left unset here; comes from `database.aggregate_configuration_info()`.
    aggregated_info=None,
)
# Display the repr of the configuration set.
cs

ConfigurationSet(description='Randomly-generated Cu clusters', nconfigurations=100)

# Datasets

* A group of computed properties and their associated configurations
* Aggregates property and configuration information (e.g., property types, labels, configuration set info, ...)
* Example: QM9, Si PRX GAP, user-contributed datasets

In [16]:
from colabfit.tools.dataset import Dataset

In [17]:
# Bundle one configuration set and one property ID per configuration into a
# Dataset with its descriptive metadata.
ds = Dataset(
    # From `database.insert_configuration_set()`.
    configuration_set_ids=[1],
    # One stand-in string ID per configuration; real IDs come from
    # `database.insert_data()`.
    property_ids=list(map(str, range(len(configurations)))),
    name='example',
    authors=['colabfit'],
    links=['https://colabfit.org'],
    description='An example dataset',
    # Left unset here; comes from `database.aggregate_configuration_set_info()`
    # and `database.aggregate_property_info()`.
    aggregated_info=None,
)
# Display the repr of the dataset.
ds

Dataset(description='An example dataset', nconfiguration_sets=1, nproperties=100)

# Next up:

* Building a real dataset and adding it into the database