This notebook serves as an example of how to load and manipulate the [ANI-1 dataset](https://github.com/isayev/ANI1_dataset) using a `Dataset` object.

In [1]:
import random
import numpy as np

from colabfit import SHORT_ID_STRING_NAME

from colabfit.tools.database import MongoDatabase, load_data
from colabfit.tools.property_settings import PropertySettings
from colabfit.tools.configuration import AtomicConfiguration

In [2]:
# Open the 'ani1_rebuild' database with four worker processes.
# NOTE(review): drop_database=True presumably discards any existing database
# of that name — confirm before running against data you want to keep.
client = MongoDatabase(
    'ani1_rebuild',
    configuration_type=AtomicConfiguration,
    nprocs=4,
    drop_database=True,
)

In [3]:
import os
import sys

my_path_to_pyanitools = '/home/jvita/scripts/colabfit/data/ANI-1_release/'
sys.path.append(my_path_to_pyanitools)

In [4]:
import pyanitools as pya

# To do:

* Merge all of the HDF5 files into a single file so that loading can be parallelized over it.

In [5]:
import os
import glob
import h5py
from tqdm import tqdm

# Merged file that will hold one external link per molecule group.
master_file_name = '/home/jvita/scripts/colabfit/data/ANI-1_release/merged.h5'

# Running total of images (regular + high-energy) seen while linking.
counter = 0

with h5py.File(master_file_name, 'w') as merged:
    for file_path in glob.glob('/home/jvita/scripts/colabfit/data/ANI-1_release/ani_*.h5'):
        with h5py.File(file_path, 'r') as hdf5:
            for group_name in hdf5:
                # Look the parent group up once instead of on every access.
                group = hdf5[group_name]

                for sub in tqdm(group):
                    # HDF5 object paths always use '/', so the link target is
                    # built explicitly rather than with os.path.join (which
                    # would insert the platform separator).
                    merged[sub] = h5py.ExternalLink(file_path, f'{group_name}/{sub}')

                    molecule = group[sub]
                    counter += molecule['coordinates'].shape[0]

                    # High-energy images are stored separately and are optional.
                    if 'coordinatesHE' in molecule:
                        counter += molecule['coordinatesHE'].shape[0]
        

100%|██████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 20.50it/s]
100%|█████████████████████████████████████████████████████████████████████████| 47932/47932 [01:53<00:00, 420.76it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 765.38it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 267/267 [00:00<00:00, 328.24it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 284.94it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 61/61 [00:00<00:00, 249.51it/s]
100%|███████████████████████████████████████████████████████████████████████████| 1406/1406 [00:04<00:00, 321.81it/s]
100%|███████████████████████████████████████████████████████████████████████████| 7760/7760 [00:24<00:00, 317.73it/s]


In [6]:
# Only the key names are read here, so open the merged file read-only;
# append mode would open it for writing and create it if it were missing.
with h5py.File(master_file_name, 'r') as merged:
    group_keys = list(merged.keys())

len(group_keys)

57462

In [7]:
# Shuffle with a private, seeded RNG so the assignment of groups to split
# files is reproducible and the global `random` state is left untouched.
random.Random(0).shuffle(group_keys)

# np.array_split tolerates a key count that is not a multiple of 6, whereas
# np.split raises ValueError in that case.
split_keys = [chunk.tolist() for chunk in np.array_split(np.array(group_keys), 6)]

In [8]:
# Write one HDF5 file per chunk of keys. Every entry is an external link that
# points back into the merged master file.
for i, chunk in enumerate(split_keys):
    split_file_name = f'/home/jvita/scripts/colabfit/data/ANI-1_release/split_{i}.h5'

    with h5py.File(split_file_name, 'w') as split_file:
        for key in tqdm(chunk):
            split_file[key] = h5py.ExternalLink(master_file_name, key)

100%|█████████████████████████████████████████████████████████████████████████| 9577/9577 [00:00<00:00, 21296.84it/s]
100%|█████████████████████████████████████████████████████████████████████████| 9577/9577 [00:00<00:00, 26150.91it/s]
100%|█████████████████████████████████████████████████████████████████████████| 9577/9577 [00:00<00:00, 26061.30it/s]
100%|█████████████████████████████████████████████████████████████████████████| 9577/9577 [00:00<00:00, 25939.50it/s]
100%|█████████████████████████████████████████████████████████████████████████| 9577/9577 [00:00<00:00, 25981.56it/s]
100%|█████████████████████████████████████████████████████████████████████████| 9577/9577 [00:00<00:00, 26087.58it/s]


In [9]:
counter

24687809

In [10]:
def reader(hdf5_path):
    """
    Yield one `AtomicConfiguration` per image stored in an HDF5 file.

    Every top-level key of the file is treated as one molecule group holding
    'species', 'smiles', 'coordinates' and 'energies' datasets. Groups that
    also hold 'coordinatesHE' / 'energiesHE' contribute additional
    configurations labelled 'high_energy'.

    Args:
        hdf5_path (str):
            Path of the HDF5 file to read.

    Yields:
        AtomicConfiguration: a configuration whose `info` dict carries
        '_name', 'energy' and 'smiles' (plus '_labels' for high-energy images).
    """

    def as_text(entries):
        # Entries read straight from HDF5 may be bytes; decode those so the
        # join works for both bytes and str.
        return ''.join(
            entry.decode('ascii') if isinstance(entry, bytes) else entry
            for entry in entries
        )

    with h5py.File(hdf5_path, 'r') as hdf5:
        # Only the outer loop carries a progress bar; a bar per group would
        # flood the output.
        for key in tqdm(hdf5):
            data = hdf5[key]

            # Per-group values are computed once rather than once per image.
            symbols = as_text(data['species'])
            smiles = as_text(data['smiles'])

            # The original referenced an undefined `file_name`; the name is
            # now built from the file path and the group key.
            # NOTE(review): confirm this naming scheme matches what downstream
            # queries on configuration names expect.
            name = f'{hdf5_path}/{key}'

            image_sets = [('coordinates', 'energies', None)]

            # High-energy structures were separated out, and not every group
            # has them.
            if 'coordinatesHE' in data:
                image_sets.append(('coordinatesHE', 'energiesHE', ['high_energy']))

            for coords_key, energy_key, labels in image_sets:
                coordinates = data[coords_key]
                energies = data[energy_key]

                for ni in range(coordinates.shape[0]):
                    atoms = AtomicConfiguration(
                        symbols=symbols,
                        positions=coordinates[ni]
                    )

                    atoms.info['_name'] = name

                    if labels is not None:
                        # Fresh list per configuration so they never share one.
                        atoms.info['_labels'] = list(labels)

                    atoms.info['energy'] = energies[ni]
                    atoms.info['smiles'] = smiles

                    yield atoms

In [None]:
# NOTE(review): `database`, `configurations` and `tform` are not defined in any
# cell above this one (the connection created earlier is named `client`), and
# `property_map` is only assigned further down. On a fresh kernel this cell
# raises NameError. It looks like a leftover from an earlier draft — confirm
# whether it should be removed or rewritten against `client`.
ids = list(database.insert_data(
    configurations,
    property_map=property_map,
    generator=False,
    transform=tform,
    verbose=True
))

# Split the (configuration id, property id) pairs into two parallel tuples.
all_co_ids, all_pr_ids = list(zip(*ids))

In [None]:
import os
import numpy as np

from ase import Atoms

# Initialize the database

In [None]:
from colabfit.tools.database import MongoDatabase, load_data

# NOTE(review): this rebinds `client`, replacing the 'ani1_rebuild' connection
# created at the top of the notebook, so every later cell talks to the 'test'
# database. Unlike the first call, no `configuration_type` is passed here —
# confirm that this is intended.
client = MongoDatabase('test', nprocs=1, drop_database=True)

# Setup

This dataset is stored in the [ANI-1 format](https://github.com/isayev/ANI1_dataset). Before running this example, make sure that [pyanitools.py](https://github.com/isayev/ANI1_dataset/blob/master/readers/lib/pyanitools.py) is on your `PYTHONPATH` so that it can be used to load the ANI-formatted HDF5 files.

In [None]:
import sys

my_path_to_pyanitools = '/colabfit/data/AL_Al'
sys.path.append(my_path_to_pyanitools)

# Custom reader

Since ANI-1 is not stored in one of the core file formats, a user-specified `reader` function must be provided to `load_data` in order to read the data.

In [None]:
import pyanitools as pya
import glob

In [None]:
len(all_paths)

In [None]:
# Break `all_paths` into consecutive chunks of at most 10000 entries.
# NOTE(review): `all_paths` is not assigned in any cell visible above —
# confirm where it is supposed to come from.
chunked_paths = [
    all_paths[start:start + 10000]
    for start in range(0, len(all_paths), 10000)
]
print(len(chunked_paths[0]))
sum(len(c) for c in chunked_paths)

# Data loading

In [None]:
adl = pya.anidataloader('/colabfit/data/ANI-1_release/ani_gdb_s01.h5')

In [None]:
# Print every key of each record yielded by the loader, together with the
# value's shape when it has one and the value itself otherwise.
for data in adl:
    print()
    for k, v in data.items():
        # An attribute fallback replaces the bare `except:`, which would also
        # have swallowed KeyboardInterrupt and unrelated errors.
        print(k, getattr(v, 'shape', v))

In [None]:
# `Configuration` is only imported in a later cell, so import it here to keep
# this cell runnable in top-to-bottom order.
from colabfit.tools.configuration import Configuration

ani_path = '/colabfit/data/ANI-1_release/ani_gdb_s01.h5'
adl = pya.anidataloader(ani_path)

# Build just the first configuration of the first record as a quick check of
# the naming scheme; both loops stop after one iteration.
for data in adl:
    for ai in tqdm(range(data['coordinates'].shape[0]), 'Loading configurations'):
        atoms = Configuration(symbols=data['species'], positions=data['coordinates'][ai])

        # The original used the undefined names `short_name` / `atoms_path`;
        # the file path and the record's own 'path' entry are used instead,
        # matching the `reader` function defined further down.
        # NOTE(review): `.asstr()` was dropped because that `reader` joins
        # data['smiles'] directly — confirm the loader yields decoded strings.
        atoms.info['_name'] = [
            '{}{}_configuration_{}'.format(ani_path, data['path'], ai),
            ''.join(data['smiles'])
        ]
        atoms.info['energy'] = data['energies'][ai]
        # atoms.info['smiles'] = ''.join(data['smiles'])

        break
    break

In [None]:
data['smiles']

In [None]:
from colabfit import ATOMS_NAME_FIELD, ATOMS_LABELS_FIELD
from tqdm import tqdm
from colabfit.tools.configuration import Configuration
import h5py

def reader(path):
    """
    Yield one `Configuration` per conformation in an ANI-formatted HDF5 file.

    NOTE(review): this definition shadows the earlier `reader(hdf5_path)` in
    this notebook; confirm which of the two is meant to be kept.

    Args:
        path (str):
            Path of the HDF5 file, opened through `pya.anidataloader`.

    Yields:
        Configuration: a configuration whose `info` dict carries '_name'
        (a two-element list: a unique identifier and the SMILES string),
        'energy' and 'smiles'.
    """

    adl = pya.anidataloader(path)

    try:
        for data in adl:
            # The SMILES string is the same for every conformation of a record.
            smiles = ''.join(data['smiles'])

            for ai in tqdm(range(data['coordinates'].shape[0]), 'Loading configurations'):
                atoms = Configuration(symbols=data['species'], positions=data['coordinates'][ai])

                atoms.info['_name'] = [
                    '{}{}_configuration_{}'.format(path, data['path'], ai),
                    smiles
                ]
                atoms.info['energy'] = data['energies'][ai]
                atoms.info['smiles'] = smiles

                yield atoms
    finally:
        # Close the loader's HDF5 handle even if the consumer stops early;
        # previously the file stayed open for the lifetime of the kernel.
        adl.cleanup()

In [None]:
def _float_field(extent, description):
    """Return an optional, unit-carrying float field entry for a property definition."""
    return {
        'type': 'float',
        'has-unit': True,
        'extent': extent,
        'required': False,
        'description': description,
    }


# Property definition covering energies, forces and stress.
base_definition = {
    'property-id': 'energy-forces-stress',
    'property-title': 'A default property for storing energies, forces, and stress',
    'property-description': 'Energies and forces computed using DFT',
    'energy': _float_field([], 'Cohesive energy'),
    'forces': _float_field([':', 3], 'Atomic forces'),
    'stress': _float_field([':', 3], 'Stress'),
}

# Property definition holding a single required SMILES string.
smiles_definition = {
    'property-id': 'smiles',
    'property-title': 'SMILES',
    'property-description': 'A SMILES string of a molecule',
    'smiles': {
        'type': 'string',
        'has-unit': False,
        'extent': [],
        'required': True,
        'description': 'SMILES string',
    },
}

In [None]:
client.insert_property_definition(base_definition)
client.insert_property_definition(smiles_definition)

In [None]:
# For each property definition, map its fields to the `info` key that supplies
# the value and to the units of that value.
energy_fields = {'energy': {'field': 'energy', 'units': 'Hartree'}}
smiles_fields = {'smiles': {'field': 'smiles', 'units': None}}

property_map = {
    'energy-forces-stress': energy_fields,
    'smiles': smiles_fields,
}

In [None]:
from colabfit.tools.property_settings import PropertySettings

# Settings attached to every inserted property: the calculation method, a
# free-text description, and labels for the level of theory.
pso_options = {
    'method': 'Gaussian09',
    'description': 'ANI-1 property settings calculation',
    'files': [],
    'labels': ['DFT', 'wb97x', '6-31G(d)'],
}

pso = PropertySettings(**pso_options)

In [None]:
# Options for loading every '*.h5' file in the release folder through the
# custom `reader`.
load_options = {
    'file_path': '/colabfit/data/ANI-1_release',
    'file_format': 'folder',
    'name_field': '_name',  # key in Configuration.info to use as the Configuration name
    'elements': ['C', 'H', 'N', 'O'],  # order matters for CFG files, but not others
    'default_name': 'ani-1',  # default name when `name_field` is not found
    'reader': reader,
    'glob_string': '*.h5',
    # 'verbose': True,
}

images = load_data(**load_options)

In [None]:
# Both properties share the same settings object.
shared_settings = dict.fromkeys(('energy-forces-stress', 'smiles'), pso)

id_stream = client.insert_data(
    images,
    property_map=property_map,
    property_settings=shared_settings,
    generator=True,
    verbose=True,
)

# Materialise the result so the ids are available as a list.
ids = list(id_stream)

In [None]:
# NOTE(review): `images` was already passed to `client.insert_data(...)` and
# consumed through `list(...)` in the previous cell. If `load_data` returns a
# one-shot generator this raises StopIteration — confirm, and re-create
# `images` first if a preview is wanted.
next(images)

In [None]:
# Load a single file of the release (glob_string selects ani_gdb_s02.h5 only).
images = load_data(
    file_path='/home/jvita/scripts/colabfit/data/ANI-1_release',
    file_format='folder',
    # `reader` stores the name under '_name'; the previous value 'name' is not
    # a key it ever sets, so every configuration would have fallen back to
    # `default_name`.
    name_field='_name',  # key in Configuration.info to use as the Configuration name
    elements=['C', 'H', 'N', 'O'],    # order matters for CFG files, but not others
    default_name='ani-1',  # default name when `name_field` is not found
    reader=reader,
    glob_string='ani_gdb_s02.h5',
#     verbose=True
)

# Wrapped in list(), as in the first insert cell of this notebook.
# NOTE(review): with generator=True the call presumably returns a lazy
# iterator, in which case nothing is inserted until it is consumed — confirm
# against the colabfit documentation.
# NOTE(review): the following cell rebinds `ids`, so the ids collected here
# are not kept unless they are saved under another name.
ids = list(client.insert_data(
    images,
    property_map=property_map,
    property_settings={'energy-forces-stress': pso, 'smiles': pso},
    generator=True,
    verbose=True
))

In [None]:
# Load a single file of the release (glob_string selects ani_gdb_s01.h5 only).
images = load_data(
    file_path='/home/jvita/scripts/colabfit/data/ANI-1_release',
    file_format='folder',
    # `reader` stores the name under '_name'; the previous value 'name' is not
    # a key it ever sets, so every configuration would have fallen back to
    # `default_name`.
    name_field='_name',  # key in Configuration.info to use as the Configuration name
    elements=['C', 'H', 'N', 'O'],    # order matters for CFG files, but not others
    default_name='ani-1',  # default name when `name_field` is not found
    reader=reader,
    glob_string='ani_gdb_s01.h5',
#     verbose=True
)

# Wrapped in list(), as in the first insert cell of this notebook.
# NOTE(review): with generator=True the call presumably returns a lazy
# iterator, in which case nothing is inserted until it is consumed — confirm
# against the colabfit documentation.
ids = list(client.insert_data(
    images,
    property_map=property_map,
    property_settings={'energy-forces-stress': pso, 'smiles': pso},
    generator=True,
    verbose=True
))

In [None]:
# Maps a regular expression (matched against configuration names) to the
# description of the configuration set it selects.
#
# Two kinds of fixes relative to the earlier version:
#   * 'gdb1[1,3]_1[0-3]' -> 'gdb1[13]_1[0-3]': inside a character class the
#     comma is a literal, so the old pattern also matched 'gdb1,_1x'.
#   * 'S66x850' -> 'S66x8' and 'D362' -> 'D3': citation numbers had been fused
#     into the words when the text was copied.
# Adjacent string literals are concatenated by Python, so no line-continuation
# backslashes are needed inside the braces.
configuration_set_regexes = {
    'ani_md_bench':
        'Forces from the ANI-1x potential are applied to run '
        '1 ns of vacuum molecular dynamics with a 0.25 fs time '
        'step at 300 K using the Langevin thermostat on 14 well-'
        'known drug molecules and two small proteins. System '
        'sizes range from 20 to 312 atoms. A random subsample '
        'of 128 frames from each 1 ns trajectory is selected, and '
        'reference DFT single point calculations are performed '
        'to obtain QM energies and forces.',
    'drugbank_testset':
        'This benchmark is developed '
        'through a subsampling of the DrugBank database '
        'of real drug molecules. 837 SMILES strings con'
        'taining C, N, and O are randomly selected. Like the '
        'GDB7to9 benchmark, the molecules are embedded in '
        '3D space, structurally optimized, and normal modes are '
        'computed. DNMS is utilized to generate random '
        'non-equilibrium conformations.',
    'gdb11_0[7-9]':
        'The GDB-11 subsets contain'
        'ing 7 to 9 heavy atoms (C, N, and O) are subsampled '
        'and randomly embedded in 3D space using RDKit '
        '[www.rdkit.org]. A total of 1500 molecule SMILES '
        '[opensmiles.org] strings are selected: 500 per 7, 8, '
        'and 9 heavy-atom sets. The resulting structures are '
        'optimized with tight convergence criteria, and nor'
        'mal modes/force constants are computed using the '
        'reference DFT model. Finally, diverse normal mode '
        'sampling (DNMS) is carried out to generate non-'
        'equilibrium conformations.',
    'gdb1[13]_1[0-3]':
        'GDB10to13 benchmark. Subsamples of 500 SMILES '
        'strings each from the 10 and 11 heavy-atom subsets '
        'of GDB-11 and 1000 SMILES strings from the 12 '
        'and 13 heavy-atom subsets of the GDB-13 database '
        'are randomly selected. DNMS is utilized to generate '
        'random non-equilibrium conformations.',
    'tripeptide_full':
        'Tripeptide benchmark. 248 random tripeptides contain'
        'ing H, C, N, and O are generated using FASTA strings '
        'and randomly embedded in 3D space using RDKit. As '
        'with GDB7to9, the molecules are optimized, and nor'
        'mal modes are computed. DNMS is utilized to generate '
        'random non-equilibrium conformations.',
    's66x8_wb97x6-31gd':
        'S66x8 benchmark. This dataset is built from the '
        'original S66x8 benchmark for comparing accuracy '
        'between different methods in describing noncovalent '
        'interactions common in biological molecules. S66x8 is '
        'developed from 66 dimeric systems involving hydro'
        'gen bonding, pi-pi stacking, London interactions, and '
        'mixed influence interactions. While the keen reader '
        'might question the use of this benchmark without dis'
        'persion corrections, since dispersion corrections such '
        'as the D3 correction by Grimme et al. are a posteriori '
        'additions to the produced energy, then a comparison '
        'without the correction is equivalent to a comparison '
        'with the same dispersion corrections applied to both '
        'models.'
}

In [None]:
# Create one configuration set per regex: collect the ids of all
# configurations whose names match, report the count, and register the set.
cs_ids = []

for i, regex in enumerate(configuration_set_regexes):
    matches = client.get_data(
        'configurations',
        fields='_id',
        query={'names': {'$regex': regex}},
        ravel=True
    )
    co_ids = matches.tolist()

    print(f'Configuration set {i}', f'({regex}):'.rjust(20), f'{len(co_ids)}'.rjust(7))

    cs_ids.append(
        client.insert_configuration_set(
            co_ids,
            description=configuration_set_regexes[regex],
            verbose=True,
        )
    )

In [None]:
# Transpose the (configuration id, property id) pairs into two tuples.
all_co_ids, all_pr_ids = zip(*ids)
len(all_pr_ids)

In [None]:
# Metadata for the dataset record.
dataset_authors = [
    'Justin S. Smith',
    'Ben Nebgen',
    'Nicholas Lubbers',
    'Olexandr Isayev',
    'Adrian E. Roitberg'
]
dataset_links = [
    'https://aip.scitation.org/doi/full/10.1063/1.5023802',
    'https://github.com/isayev/COMP6',
]
dataset_description = (
    'This repository contains the COMP6 benchmark '
    'for evaluating the extensibility of machine-learning '
    'based molecular potentials.'
)

# Register the dataset from the configuration sets and property ids gathered
# in the preceding cells.
ds_id = client.insert_dataset(
    cs_ids=cs_ids,
    pr_ids=all_pr_ids,
    name='COMP6',
    authors=dataset_authors,
    links=dataset_links,
    description=dataset_description,
    resync=True,
    verbose=True,
)
ds_id

In [None]:
ds = client.get_dataset(ds_id, verbose=True)['dataset']

In [None]:
for k,v in ds.aggregated_info.items():
    print(k, v)

# Exploration

In [None]:
ds.aggregated_info['property_fields']

In [None]:
from IPython.display import Image

In [None]:
fig = client.plot_histograms(ds.aggregated_info['property_fields'], ids=ds.property_ids, verbose=True)

In [None]:
Image(fig.to_image(format="png", width=800, height=500, scale=1))

# Filtering

In [None]:
# Keep only the properties whose energy source value is above -4000.
# The filter previously referenced a 'comp6-data' property, but the only
# energy-bearing property defined and mapped in this notebook is
# 'energy-forces-stress'.
# NOTE(review): confirm that the stored documents nest the value as
# <property-id> -> 'energy' -> 'source-value'.
clean_config_sets, clean_property_ids = client.filter_on_properties(
    ds_id=ds_id,
    filter_fxn=lambda p: p['energy-forces-stress']['energy']['source-value'] > -4000,
    fields=['energy-forces-stress.energy'],
    verbose=True
)

In [None]:
# Register each filtered configuration set and keep the returned ids in order.
clean_cs_ids = [
    client.insert_configuration_set(
        filtered.configuration_ids,
        description=filtered.description,
        verbose=True,
    )
    for filtered in clean_config_sets
]

In [None]:
filtered_description = (
    "A filtered version of the COMP6 dataset "
    "that removed all configurations with energies < -4000"
)

# Register the filtered dataset, reusing the authors and links of the original.
clean_ds_id = client.insert_dataset(
    cs_ids=clean_cs_ids,
    pr_ids=clean_property_ids,
    name='COMP6_filtered',
    authors=ds.authors,
    links=ds.links,
    description=filtered_description,
    resync=True,
    verbose=True,
)
clean_ds_id

In [None]:
clean_ds = client.get_dataset(clean_ds_id, verbose=True)['dataset']

In [None]:
fig = client.plot_histograms(clean_ds.aggregated_info['property_fields'], ids=clean_ds.property_ids, verbose=True)

In [None]:
Image(fig.to_image(format="png", width=800, height=500, scale=1))

## Extracting data from a single configuration set

In [None]:
# Fetch the first configuration set of the filtered dataset ...
first_cs_id = clean_ds.configuration_set_ids[0]
cs = client.get_configuration_set(first_cs_id)['configuration_set']

# ... and collect the property ids linked to its configurations.
linked_properties = client.get_data(
    'configurations',
    fields='relationships.properties',
    ids=cs.configuration_ids,
    ravel=True,
)
pr_ids = linked_properties.tolist()

len(pr_ids)

In [None]:
# Histogram the property fields for the selected property ids only.
property_fields = clean_ds.aggregated_info['property_fields']
fig = client.plot_histograms(property_fields, ids=pr_ids, verbose=True)

# Render to a static PNG so the figure is embedded in the notebook output.
png_bytes = fig.to_image(format="png", width=800, height=500, scale=1)
Image(png_bytes)