# Database

* **Definition:** the backend tool that supports data I/O and querying.
* **Example:** a MongoDB database running on the local machine

In [1]:
from colabfit.tools.database import MongoDatabase

In [2]:
# Open the Mongo-backed ColabFit database used throughout this notebook.
# NOTE(review): the first argument is presumably the database name, and
# drop_database=True presumably wipes any existing database of that name so the
# example starts from a clean slate -- confirm before pointing this at real data.
database = MongoDatabase('example', drop_database=True)
database

MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True)

# Configuration (CO)

* **Definition:** the inputs to a calculation of a material property
* **Example:** atomic types/positions, cell vectors, PBCs, constraints
* **Additional information:**
    * user-provided names/labels, chemical formula, element concentrations, ...
    * Extension of `ase.Atoms` objects (https://wiki.fysik.dtu.dk/ase/ase/atoms.html), with some required fields
    * Usually generated by `load_data()`

In [3]:
from colabfit.tools.configuration import Configuration

In [4]:
import numpy as np

# Build a three-atom (H, H, O) Configuration directly, using the same keyword
# arguments that are passed to ase.Atoms in the next cell. The positions are
# random placeholders, so the geometry is not physically meaningful.
colabfit_atoms = Configuration(
    symbols=['H', 'H', 'O'],
    positions=np.random.random((3,3)),  # one (x, y, z) row per atom
    cell=[[1, 0, 0], [0, 1, 0], [0, 0, 1]],  # identity cell vectors
    pbc=True
)
colabfit_atoms

Configuration(symbols='H2O', pbc=True, cell=[1.0, 1.0, 1.0])

In [5]:
from ase import Atoms

# Start from a plain ASE object: three atoms (H, H, O) at random placeholder
# positions inside a periodic cell with identity cell vectors.
ase_atoms = Atoms(
    symbols=['H', 'H', 'O'],
    positions=np.random.random((3,3)),  # one (x, y, z) row per atom
    cell=[[1, 0, 0], [0, 1, 0], [0, 0, 1]],
    pbc=True
)

# Convert the ASE object into a ColabFit Configuration. Only the last
# expression of a cell is displayed, so the converted object is the one shown.
colabfit_atoms = Configuration.from_ase(ase_atoms)
colabfit_atoms

Configuration(symbols='H2O', pbc=True, cell=[1.0, 1.0, 1.0])

In [6]:
# List the metadata keys stored in the Configuration's `info` dict.
colabfit_atoms.info.keys()

dict_keys(['_name', '_labels', '_constraints'])

In [7]:
# Same query on the plain ase.Atoms object, for comparison with the
# Configuration's `info` keys listed in the previous cell.
ase_atoms.info.keys()

dict_keys([])

# PropertyDefinition (PD)

* **Definition:**  explanation of the contents of a property
* **Example:** property name, data types/shapes, human-readable descriptions of fields
* **Additional information:**
    * Definitions usually found in the OpenKIM Properties List: https://openkim.org/properties

In [8]:
# Schema for the "dft-energy-forces" property. For every field it declares the
# data type, whether the value carries a unit, its shape ("extent"), whether
# the field is required, and a human-readable description.

# Empty extent: presumably a single scalar value per configuration.
energy_field = {
    "type": "float",
    "has-unit": True,
    "extent": [],
    "required": False,
    "description": "Potential energy of the entire configuration",
}

# Extent [":", 3]: presumably any number of rows with three columns each
# (one x,y,z row per atom, going by the description).
forces_field = {
    "type": "float",
    "has-unit": True,
    "extent": [":", 3],
    "required": False,
    "description": "x,y,z force components for each atom",
}

definition = {
    "property-id": "dft-energy-forces",
    "property-title": "Basic outputs from a DFT calculation",
    "property-description": "Supercell energy and atomic forces",
    "energy": energy_field,
    "forces": forces_field,
}

In [9]:
# Register the definition with the database, then read back only the `_id`
# field of one stored definition to confirm that the insert took effect.
database.insert_property_definition(definition)
database.property_definitions.find_one({}, {'_id': 1})



{'_id': 'dft-energy-forces'}

# Property Instance (PI)

* **Definition:** the outputs of a calculation; a computed material property
* **Example:** configuration energy, atomic forces
* **Additional information:** units, pointers to configurations/definitions/settings

In [10]:
from colabfit.tools.property import Property

In [11]:
import numpy as np

# Placeholder forces: random x,y,z components for seven atoms, converted to
# nested Python lists before being stored on the instance.
dummy_forces = np.random.random((7, 3)).tolist()

# The instance supplies concrete values (with their source units) for the
# fields declared in `definition`.
property_instance = {
    'property-id': 'dft-energy-forces',
    'instance-id': 1,
    'energy': {'source-value': 1.23, 'source-unit': 'eV'},
    'forces': {'source-value': dummy_forces, 'source-unit': 'eV/Ang'},
}

prop = Property(
    definition=definition,
    instance=property_instance,
    configuration_ids=['CO_XXXXXXXXXXXX_XXX'],  # dummy ID
)
prop

Property(instance_id=1, name='dft-energy-forces')

# PropertySettings (PS)

* **Definition:** additional metadata for setting up a calculation
* **Example:** software package/version, xc-functional, k-point mesh, full input file(s)

In [12]:
from colabfit.tools.property_settings import PropertySettings

In [13]:
# Describe how a property was calculated: the method, a free-text description,
# optional input files, and a list of labels.
ps = PropertySettings(
    method='VASP',
    description='A static VASP calculation',
    files=None,  # none attached here; format is [(/path/to/file, <contents_of_file>)]
    labels=['PBE', 'GGA', 'Monkhorst-Pack']  # passed as a list; the repr below shows them as a set
)
ps

PropertySettings(method='VASP', description='A static VASP calculation', labels={'PBE', 'Monkhorst-Pack', 'GGA'})

# Inserting data in practice

In [14]:
# Stand-in for configurations that would normally be loaded using load_data():
# 100 copper clusters, the i-th built from the formula string 'Cu<i>' with i
# random positions, each in a periodic cell with identity cell vectors.
# A seeded generator keeps the fabricated positions identical on every
# Restart & Run All.
position_rng = np.random.default_rng(0)

configurations = [
    Configuration(
        symbols=f'Cu{i}',
        positions=position_rng.random((i, 3)),  # one (x, y, z) row per atom
        cell=[[1, 0, 0], [0, 1, 0], [0, 0, 1]],
        pbc=True
    )
    for i in range(1, 101)
]

In [15]:
# Mimic data files where computed values are already attached to each
# configuration: a scalar energy in `info` and a per-atom forces array in
# `arrays`. The values are random placeholders drawn from a seeded generator,
# so re-running this cell (or the whole notebook) assigns the same numbers.
value_rng = np.random.default_rng(1)

for atoms in configurations:
    atoms.info['energy'] = value_rng.random()
    atoms.arrays['forces'] = value_rng.random((len(atoms), 3))

In [16]:
# A single insert_data() call inserts COs, PIs, and PSs at the same time.
# Covered in more detail in the Dataset Example video

# Calculation settings to record alongside every inserted property.
dft_settings = {
    '_method': 'VASP',
    '_description': 'A static VASP calculation',
    '_files': None,  # [(/path/to/file, <contents_of_file>)]
    '_labels': ['PBE', 'GGA', 'Monkhorst-Pack'],
}

# For each field of the 'dft-energy-forces' definition: the name under which
# the value was attached to the configurations, and the units it is given in.
dft_energy_forces_map = {
    'energy': {'field': 'energy', 'units': 'eV'},
    'forces': {'field': 'forces', 'units': 'eV/Ang'},
    '_settings': dft_settings,
}

ids = database.insert_data(
    configurations=configurations,
    property_map={'dft-energy-forces': [dft_energy_forces_map]},
)

Preparing to add configurations to Database: 100%|██████████████████████████| 100/100 [00:00<00:00, 643.47it/s]


In [17]:
# Transpose the entries returned by insert_data() (presumably one
# (configuration ID, property ID) pair each) into two parallel tuples.
all_co_ids, all_pi_ids = zip(*ids)

(len(all_co_ids), len(all_pi_ids))

(100, 100)

# ConfigurationSet (CS)

* **Definition:** a group of configurations
* **Example:** "Snapshots from a molecular dynamics run at 1000K"
* **Additional information:**
    * Aggregates configuration information (e.g., atom counts, labels, chemical systems, ...)
    * Useful for improving discoverability and interpretability

In [18]:
from colabfit.tools.configuration_set import ConfigurationSet

In [19]:
# Construct a ConfigurationSet object directly from the configuration IDs.
# NOTE(review): this appears to build only the Python object; the following
# cell is the one that calls the database to insert a set -- confirm.
cs = ConfigurationSet(
    configuration_ids=all_co_ids,  # come from `database.insert_data()`
    description='Randomly-generated Cu clusters',
    aggregated_info=None,  # comes from `database.aggregate_configuration_info()`
)
cs

ConfigurationSet(description='Randomly-generated Cu clusters', nconfigurations=100)

In [20]:
# Insert a configuration set covering all inserted configurations and keep the
# returned identifier (a 'CS_...' string, as shown in the output).
cs_id = database.insert_configuration_set(
    ids=all_co_ids,  # configuration IDs obtained from `database.insert_data()`
    description='Randomly-generated Cu clusters',
)
cs_id

'CS_53614896853034_000'

# Dataset (DS)

* **Definition:** a group of computed properties and their associated configurations
* **Example:** QM9, Si PRX GAP, user-contributed datasets
* **Additional information:**
    * Aggregates property and configuration information (e.g., property types, labels, configuration set info, ...)
    * Pointers to CSs (instead of COs) to help keep data organized

In [21]:
from colabfit.tools.dataset import Dataset

In [22]:
# Construct a Dataset object directly: it references configuration sets and
# properties by ID and carries descriptive metadata (name, authors, links).
# NOTE(review): this appears to build only the Python object; the following
# cell is the one that calls the database to insert a dataset -- confirm.
ds = Dataset(
    configuration_set_ids=[cs_id],  # from `database.insert_configuration_set()`
    property_ids=all_pi_ids,  # property IDs obtained from `database.insert_data()`
    name='example',
    authors=['colabfit'],
    links=['https://colabfit.org'],
    description='An example dataset',
    aggregated_info=None,  # from `database.aggregate_configuration_set_info()` and `database.aggregate_property_info()`
)
ds

Dataset(description='An example dataset', nconfiguration_sets=1, nproperties=100)

In [23]:
# Insert a dataset that ties the configuration set and the property IDs
# together, and keep the returned identifier (a 'DS_...' string, as shown in
# the output).
ds_id = database.insert_dataset(
    cs_ids=[cs_id],  # from `database.insert_configuration_set()`
    pr_ids=all_pi_ids,  # property IDs obtained from `database.insert_data()`
    name='example',
    authors=['colabfit'],
    links=['https://colabfit.org'],
    description='An example dataset',
)
ds_id

Updating PR->DS relationships: 100%|█████████████████████████████████████| 100/100 [00:00<00:00, 120422.16it/s]


'DS_89083579822367_000'