# Database

* The backend tool that supports data I/O and querying.
* **Example:** a MongoDB database on a local server

In [None]:
from colabfit.tools.database import MongoDatabase

In [None]:
# Instantiate the Mongo-backed database object used by every later cell.
# NOTE(review): 'example' is presumably the database name, and
# drop_database=True presumably discards any existing contents first —
# confirm before pointing this at real data.
database = MongoDatabase('example', drop_database=True)
# Bare trailing expression so the notebook echoes the object.
database

# Configurations (CO)

* The inputs to a calculation of a material property
* **Example:** atomic types/positions, cell vectors, PBCs, constraints
* **Additional information:** user-provided names/labels, chemical formula, element concentrations, ...
---
* Extension of `ase.Atoms` objects (https://wiki.fysik.dtu.dk/ase/ase/atoms.html), with some required fields
* Usually generated by `load_data()`

In [None]:
from colabfit.tools.configuration import Configuration

In [None]:
import numpy as np

# Build a Configuration directly from keyword arguments: three atoms
# (H, H, O) at uniformly random positions, an identity-matrix cell, and
# periodic boundary conditions switched on.
colabfit_atoms = Configuration(
    symbols=['H', 'H', 'O'],
    positions=np.random.random((3,3)),  # one random (x, y, z) row per atom
    cell=[[1, 0, 0], [0, 1, 0], [0, 0, 1]],
    pbc=True
)
# Bare trailing expression so the notebook echoes the object.
colabfit_atoms

In [None]:
from ase import Atoms

# Build an ordinary ASE structure: three atoms (H, H, O) at uniformly random
# positions, an identity-matrix cell, and periodic boundary conditions.
ase_atoms = Atoms(
    symbols=['H', 'H', 'O'],
    positions=np.random.random((3,3)),
    cell=[[1, 0, 0], [0, 1, 0], [0, 0, 1]],
    pbc=True
)
# A bare expression in the middle of a cell is evaluated but never shown
# (by default only a cell's final expression is echoed), so print explicitly.
print(ase_atoms)

# Convert the ASE object into a Configuration; this rebinds `colabfit_atoms`.
colabfit_atoms = Configuration.from_ase(ase_atoms)
colabfit_atoms

In [None]:
# Show which keys the Configuration carries in its `info` dictionary.
colabfit_atoms.info.keys()

In [None]:
# For comparison: the `info` keys on the plain ase.Atoms object that the
# Configuration was converted from.
ase_atoms.info.keys()

In [None]:
from colabfit.tools.database import load_data

# Placeholder call: `...` is a literal Ellipsis standing in for the real
# arguments. Replace it with actual load_data() arguments before running.
configurations = load_data(...)

# Properties (PR) and Definitions (PD)

* Property: the outputs of a calculation; a computed material property
    * **Example:** configuration energy, atomic forces
    * **Additional information:** units, pointers to configurations/definitions/settings

* Definition: explanation of the contents of a property
    * **Example:** property name, data types/shapes, human-readable descriptions of fields
---
* Properties must point to at least one Configuration
* Properties usually constructed using `database.insert_data()`
* Definitions usually found in the OpenKIM Properties List: https://openkim.org/properties

In [None]:
# Property definition: states explicitly which fields a
# "dft-energy-forces" property holds and how each one is typed.
definition = {
    # Identifier plus human-readable title and summary.
    'property-id': 'dft-energy-forces',
    'property-title': 'Basic outputs from a DFT calculation',
    'property-description': 'Supercell energy and atomic forces',

    # One value for the whole configuration.
    'energy': {
        'type': 'float',
        'has-unit': True,
        'extent': [],
        'required': False,
        'description': 'Potential energy of the entire configuration',
    },

    # Three components (x, y, z) per atom.
    'forces': {
        'type': 'float',
        'has-unit': True,
        'extent': [':', 3],
        'required': False,
        'description': 'x,y,z force components for each atom',
    },
}

In [None]:
from colabfit.tools.property import Property

In [None]:
import numpy as np

# Hand-build a single Property for illustration. The instance's
# 'property-id' matches the 'property-id' of `definition`, and each field
# is given as a source value plus the unit it is expressed in.
prop = Property(
    definition=definition,
    instance={
        'property-id': 'dft-energy-forces',
        'instance-id': 1,
        'energy': {'source-value': 1.23, 'source-unit': 'eV'},
        # 7 rows x 3 columns of random numbers, converted to nested lists.
        'forces': {'source-value': np.random.random((7, 3)).tolist(), 'source-unit': 'eV/Ang'},
    },
    # NOTE(review): [1] looks like a dummy configuration id for the demo —
    # no configuration has been inserted at this point in the notebook.
    configuration_ids=[1],
)
# Bare trailing expression so the notebook echoes the object.
prop

In [None]:
# Toy data: one copper configuration for every size from 1 to 100 atoms,
# each with uniformly random positions, an identity-matrix cell and
# periodic boundary conditions.
configurations = [
    Configuration(
        symbols=f'Cu{n_atoms}',
        positions=np.random.random((n_atoms, 3)),
        cell=[[1, 0, 0], [0, 1, 0], [0, 0, 1]],
        pbc=True,
    )
    for n_atoms in range(1, 101)
]

In [None]:
# Attach made-up "computed" values to every configuration: a random scalar
# under info['energy'] and a random (number of atoms, 3) array under
# arrays['forces']. These are the 'field' names that `property_map` refers to.
for atoms in configurations:
    atoms.info['energy'] = np.random.random()
    atoms.arrays['forces'] = np.random.random((len(atoms), 3))

In [None]:
# Register the property definition with the database; the insert_data()
# calls later in the notebook refer to it by its 'property-id'.
database.insert_property_definition(definition)
# Look up one stored definition to confirm the insert.
# NOTE(review): the second argument {'_id'} is presumably a pymongo-style
# projection restricting the result to the '_id' field — confirm.
database.property_definitions.find_one({}, {'_id'})

In [None]:
# Recipe for pulling each defined field out of a Configuration.
# Layout: definition key -> {'field': key on the Configuration,
#                            'units': units of the stored value}
property_map = {
    'energy': dict(field='energy', units='eV'),
    'forces': dict(field='forces', units='eV/Ang'),
}

In [None]:
# Insert the configurations together with the properties extracted from them.
# The outer property_map dict is keyed by property-id; its value is the
# per-field extraction recipe defined in `property_map`.
# NOTE(review): the return value is later unpacked as pairs via zip(*ids),
# presumably (configuration id, property id) — confirm.
ids = database.insert_data(
    configurations=configurations,
    property_map={'dft-energy-forces': property_map},
    verbose=True
)

In [None]:
# Inspect the first entry returned by insert_data().
ids[0]

In [None]:
# Fetch one stored configuration document.
# NOTE(review): {'_id'} is presumably a projection to the '_id' field — confirm.
database.configurations.find_one({}, {'_id'})

In [None]:
# Fetch one stored property document.
# NOTE(review): {'_id'} is presumably a projection to the '_id' field — confirm.
database.properties.find_one({}, {'_id'})

# PropertySettings (PS)

* Additional metadata for setting up a calculation
* **Example:** software package/version, xc-functional, k-point mesh, full input file(s)

In [None]:
from colabfit.tools.property_settings import PropertySettings

In [None]:
# Metadata describing how the property values were calculated.
ps = PropertySettings(
    method='VASP',
    description='A static VASP calculation',
    # No input files attached here; when given, the expected form is a list
    # of (path, contents) pairs.
    files=None,  # [(/path/to/file, <contents_of_file>)]
    labels=['PBE', 'GGA', 'Monkhorst-Pack']
)
# Bare trailing expression so the notebook echoes the object.
ps

In [None]:
# Same insert as before, now also attaching the PropertySettings to the
# 'dft-energy-forces' property. Rebinds `ids` to the new return value.
# NOTE(review): this passes the same `configurations` a second time; whether
# the database de-duplicates them is not visible here — confirm.
ids = database.insert_data(
    configurations=configurations,
    property_map={'dft-energy-forces': property_map},
    property_settings={'dft-energy-forces': ps},
    verbose=True
)

In [None]:
# Transpose the entries of `ids` into two parallel tuples
# (presumably configuration ids and property ids, going by the names).
all_co_ids, all_pr_ids = zip(*ids)

# Echo how many ids ended up in each tuple.
(len(all_co_ids), len(all_pr_ids))

# ConfigurationSets (CS)

* Defines a group of configurations
* Useful for improving discoverability and interpretability
* **Example:** "Snapshots from a molecular dynamics run at 1000K"
* **Additional information:** aggregates configuration information (e.g., atom counts, labels, chemical systems, ...)

In [None]:
from colabfit.tools.configuration_set import ConfigurationSet

In [None]:
# Build a ConfigurationSet object by hand, for illustration.
cs = ConfigurationSet(
    configuration_ids=all_co_ids,  # come from `database.insert_data()`
    description='Randomly-generated Cu clusters',
    aggregated_info=None,  # comes from `database.aggregate_configuration_info()`
)
# Bare trailing expression so the notebook echoes the object.
cs

In [None]:
# Let the database create the configuration set from the configuration ids.
# The returned id is what the Dataset cells below pass as a
# configuration-set id.
cs_id = database.insert_configuration_set(
    ids=all_co_ids,
    description='Randomly-generated Cu clusters',
)
# Bare trailing expression so the notebook echoes the id.
cs_id

# Datasets (DS)

* A group of computed properties and their associated configurations
* Pointers to CSs (instead of COs) to help keep data organized
* **Example:** QM9, Si PRX GAP, user-contributed datasets
* **Additional information:** aggregates property and configuration information (e.g., property types, labels, configuration set info, ...)

In [None]:
from colabfit.tools.dataset import Dataset

In [None]:
# Build a Dataset object by hand, for illustration: it points at
# configuration sets and properties by id and carries descriptive metadata.
ds = Dataset(
    configuration_set_ids=[cs_id],  # from `database.insert_configuration_set()`
    property_ids=all_pr_ids,  # from `database.insert_data()`
    name='example',
    authors=['colabfit'],
    links=['https://colabfit.org'],
    description='An example dataset',
    aggregated_info=None,  # from `database.aggregate_configuration_set_info()` and `database.aggregate_property_info()`
)
# Bare trailing expression so the notebook echoes the object.
ds

In [None]:
# Let the database create the dataset from the configuration-set id and the
# property ids, with the same metadata as the hand-built Dataset.
ds_id = database.insert_dataset(
    cs_ids=[cs_id],
    pr_ids=all_pr_ids,
    name='example',
    authors=['colabfit'],
    links=['https://colabfit.org'],
    description='An example dataset',
)
# Bare trailing expression so the notebook echoes the id.
ds_id

# Next up:

* Building a real dataset and adding it into the database