This notebook serves as an example of how to load and manipulate the [COMP6 dataset](https://github.com/isayev/COMP6) using a `Dataset` object.

In [1]:
import os
import numpy as np

from ase import Atoms

from colabfit.tools.dataset import Dataset, load_data
from colabfit.tools.property_settings import PropertySettings

# Setup

The COMP6 dataset is one of several datasets that use the [ANI-1 format](https://github.com/isayev/ANI1_dataset) for storage. Before running this example, make sure that [pyanitools.py](https://github.com/isayev/ANI1_dataset/blob/master/readers/lib/pyanitools.py) is on your Python import path (the next cell appends its folder to `sys.path`) so that it can be used to load the ANI-formatted HDF5 files.

In [2]:
import sys

# Folder that contains pyanitools.py; change this to match your own checkout.
my_path_to_pyanitools = '../../../svreg_data/AlZnMg/AL_Al/'

# Only append when the entry is missing, so that re-running this cell does not
# keep adding duplicate entries to sys.path.
if my_path_to_pyanitools not in sys.path:
    sys.path.append(my_path_to_pyanitools)

# Custom reader

Since COMP6 is not stored in one of the core file formats, a user-specified `reader` function must be provided to `load_data` in order to read the data.

In [3]:
def reader(path):
    """Read one ANI-formatted HDF5 file into a list of `ase.Atoms` objects.

    Each entry yielded by the pyanitools loader is expanded into one `Atoms`
    object per row along the first axis of its 'coordinates' array.

    Parameters
    ----------
    path : str
        Path of the HDF5 file, passed straight to `pyanitools.anidataloader`.

    Returns
    -------
    list of ase.Atoms
        One object per conformer. `info` holds 'name', 'energy', 'cm5',
        'hirdipole', 'hirshfeld' and 'spindensities'; `arrays` holds 'forces'.
    """
    # Kept local so that defining this function does not require pyanitools
    # to be importable yet (its folder is added to sys.path separately).
    import pyanitools as pya

    adl = pya.anidataloader(path)

    images = []
    try:
        for data in adl:
            n_conformers = data['coordinates'].shape[0]
            for i in range(n_conformers):
                atoms = Atoms(
                    symbols=data['species'],
                    positions=data['coordinates'][i],
                )

                # Name is the entry's path plus the conformer index.
                atoms.info['name'] = '{}_conformer_{}'.format(data['path'], i)

                atoms.info['energy'] = data['energies'][i]
                atoms.arrays['forces'] = data['forces'][i]

                atoms.info['cm5'] = data['cm5'][i]
                atoms.info['hirdipole'] = data['hirdipole'][i]
                atoms.info['hirshfeld'] = data['hirshfeld'][i]
                atoms.info['spindensities'] = data['spindensities'][i]

                images.append(atoms)
    finally:
        # Close the loader's file even if a record is malformed; otherwise a
        # handle is leaked for every file that is read.
        adl.cleanup()

    return images

# Loading configurations

In [4]:
# Create the dataset object, then attach its descriptive metadata.
dataset = Dataset('COMP6')

dataset.authors = [
    'Justin S. Smith',
    'Ben Nebgen',
    'Nicholas Lubbers',
    'Olexandr Isayev',
    'Adrian E. Roitberg',
]

dataset.links = [
    'https://aip.scitation.org/doi/full/10.1063/1.5023802',
    'https://github.com/isayev/COMP6',
    # Left disabled, as in the original notebook:
    # 'https://pubs.acs.org/doi/10.1021/acs.jctc.8b00524',
]

# Adjacent string literals are joined into a single description string.
dataset.description = (
    'This repository contains the COMP6 benchmark '
    'for evaluating the extensibility of machine-learning '
    'based molecular potentials.'
)

In [5]:
# Load the configurations, supplying the custom `reader` for the HDF5 files.
dataset.configurations = load_data(
    file_path='../../../colabfit/data/isayev/COMP6/COMP6v1/',
    file_format='folder',
    glob_string='*.h5',
    reader=reader,
    name_field='name',              # Configuration.info key used as the Configuration name
    default_name='comp6',           # fallback name when `name_field` is not found
    elements=['C', 'H', 'N', 'O'],  # order matters for CFG files, but not others
    verbose=True,
)

Loading data: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:42<00:00,  3.84s/it]


In [6]:
# Number of configurations returned by `load_data`.
len(dataset.configurations)

101352

# Parsing properties

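COMP6 includes three properties that don't exist in the currently available definitions: `cm5`, `hirshfeld`, and `hirdipole`. In order to load and properly document these, a custom property definition must be provided. (The `reader` above also stores `spindensities` on each configuration, but that field is not part of the definition below.)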

In [7]:
# Custom property definition for the COMP6 atomic charges and dipoles.
# The title and description list only the fields declared below;
# `spindensities` is not declared here, so it is not advertised either.
comp6_property_definition = {
    'property-id': 'comp6-charges',
    'property-title': 'cm5, hirdipole, hirshfeld',
    'property-description': 'Charges and dipoles',
    'cm5':          {'type': 'float', 'has-unit': True, 'extent': [":"], 'required': True, 'description': 'CM5 atomic charges'},
    'hirshfeld':    {'type': 'float', 'has-unit': True, 'extent': [":"], 'required': True, 'description': 'Hirshfeld atomic charges'},
    'hirdipole':    {'type': 'float', 'has-unit': True, 'extent': [":", 3], 'required': True, 'description': 'Hirshfeld atomic dipoles'},
}

Note that `energy` and `forces` can still be loaded by the default definition since they are known properties.

In [8]:
# Outer keys are property names. Each inner entry maps a property-definition
# field to the ASE field that holds it and to an ASE-readable units string.
property_map = {
    'default': {
        'energy': dict(field='energy', units='kcal/mol'),
        'forces': dict(field='forces', units='kcal/mol/Ang'),
    },
    'comp6-charges': {
        'cm5': dict(field='cm5', units='elementary_charge'),
        'hirshfeld': dict(field='hirshfeld', units='elementary_charge'),
        'hirdipole': dict(field='hirdipole', units='elementary_charge*Ang'),
    },
}

In [9]:
# Attach the field/units mapping to the dataset.
dataset.property_map = property_map

# Register the custom definition under 'comp6-charges', the same key that
# `property_map` uses for this property.
dataset.custom_definitions = {'comp6-charges': comp6_property_definition}

In [10]:
# Parse properties from the loaded configurations with unit conversion turned off.
dataset.parse_data(convert_units=False, verbose=True)

Parsing data: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 101352/101352 [01:23<00:00, 1218.92it/s]


In [11]:
# A single PropertySettings entry, keyed by the catch-all regex '.*'.
dataset.property_settings_regexes = {
    '.*': PropertySettings(
        method='Gaussian09',
        labels=['DFT', 'wb97x', '6-31G(d)'],
        description='COMP6 property settings calculation',
        files=[],
    ),
}

In [12]:
# NOTE(review): presumably refreshes the dataset's derived state after the
# assignments in the earlier cells; confirm against the colabfit docs.
dataset.resync()

# Saving the dataset

In [13]:
# Write the dataset out; the data file is named '<dataset.name>.extxyz'.
dataset.to_markdown(
    base_folder='../../../colabfit/data/formatted/COMP6v1/',
    data_format='xyz',
    data_file_name=dataset.name + '.extxyz',
    html_file_name='README.md',
)

# Exploration