Install colabfit-tools with: pip install git+https://github.com/colabfit/colabfit-tools.git@Calculation

A MongoDB process must be running before executing this notebook; see: https://www.mongodb.com/docs/manual/administration/install-community/

In [171]:

import ase
from colabfit.tools.database import MongoDatabase, load_data
from colabfit.tools.property_settings import PropertySettings
from colabfit.tools.configuration import AtomicConfiguration
from colabfit.tools.property_definitions import potential_energy_pd, atomic_forces_pd
from colabfit.tools.converters import FolderConverter
from collections import defaultdict
import h5py


## Connect to MongoDB instance

In [2]:
# Handle to the 'test3_e2e' database used by every cell below.
# NOTE(review): drop_database=True presumably clears any existing database of
# this name on connect -- confirm before pointing this at data you want to keep.
client = MongoDatabase(
    'test3_e2e',
    drop_database=True,
)

## Define custom reader if necessary

In [149]:
# Each property in this dataset is stored at five scalings: 0.95, 0.975, 1.0, 1.05, and 1.1.
# The code below selects the non-scaled (1.0) entry, i.e. the one at position 2.
# https://www.research-collection.ethz.ch/bitstream/handle/20.500.11850/549359/README.md?sequence=1&isAllowed=y

def reader(file_path, scale_index=2):
    """Yield one ``ase.Atoms`` object per top-level group of an HDF5 file.

    For every group in the file, the 'elements' dataset is decoded from bytes
    to strings, and the entries at position ``scale_index`` of the
    'coordinates', 'cells' and 'total_energies' datasets are used as the
    positions, cell and energy of the structure.

    Parameters
    ----------
    file_path : str or path-like
        Path of the HDF5 file to read.
    scale_index : int, optional
        Position to select along the first axis of 'coordinates', 'cells' and
        'total_energies'. Defaults to 2, the position this notebook has always
        used.

    Yields
    ------
    ase.Atoms
        Structure with ``info['name']`` set to
        'qm_intermolecular_potential_crystals' and ``info['energy']`` set to
        the selected total energy.
    """
    with h5py.File(file_path, 'r') as f:
        # Iterating the file yields its top-level group names.
        for key in f:
            entry = f[key]
            # Element symbols are stored as bytes; ase expects str symbols.
            elements = [x.decode('utf-8') for x in entry['elements'][()]]
            # NOTE(review): pbc is not passed, so ase leaves the structure
            # non-periodic even though a cell is set -- confirm that this is
            # intended for crystal data.
            atoms = ase.Atoms(
                symbols=elements,
                positions=entry['coordinates'][scale_index],
                cell=entry['cells'][scale_index],
            )
            atoms.info['name'] = 'qm_intermolecular_potential_crystals'
            atoms.info['energy'] = entry['total_energies'][scale_index]
            yield atoms

In [150]:
# Calling reader() only builds a lazy generator (hence the generator repr shown
# as this cell's output); nothing is yielded until the generator is iterated.
reader('/Users/piper/Code/colabfit/data/qm_intermolecular_potential_crystals/crystal_dataset.hdf5')

<generator object reader at 0x7fc7fa8b1ac0>

## Load data from file(s)

In [151]:
# Parse every '*.hdf5' file in the data folder with the custom `reader`.
# Data source (README of the dataset):
# https://www.research-collection.ethz.ch/bitstream/handle/20.500.11850/549359/README.md?sequence=1&isAllowed=y
# NOTE(review): file_path is an absolute path on the author's machine; adjust
# it to wherever the dataset was downloaded.
configurations = load_data(
    file_path='/Users/piper/Code/colabfit/data/qm_intermolecular_potential_crystals/',
    file_format='folder',
    name_field='name',
    elements=['C', 'H', 'O', 'S', 'Cl', 'N', 'F'],
    reader=reader,
    glob_string='*.hdf5',
    generator=False,
)

1it [00:30, 30.41s/it]


In [152]:
# Sanity check: how many configurations load_data returned.
len(configurations)

11489

## Define properties and setup property mapping(s)

In [153]:
# Register the potential-energy property definition imported from
# colabfit.tools.property_definitions.
client.insert_property_definition(potential_energy_pd)
# The atomic-forces definition is left unregistered: the custom reader in this
# notebook only stores an energy on each configuration.
# client.insert_property_definition(atomic_forces_pd)

In [160]:
# Calculation metadata attached to every potential-energy entry mapped below.
metadata = {
    'software': {'value': 'Quantum Espresso'},
    'method': {'value': 'PBE-XDM'},
}

# Maps each property name to where its values live on a configuration.
# 'field' has to name the key the reader writes into atoms.info; the reader in
# this notebook stores the total energy under 'energy'.
property_map = {
    'potential-energy': [{
        'energy':   {'field': 'energy',  'units': 'kJ/mol'},
        'per-atom': {'value': False, 'units': None},
        '_metadata': metadata,
    }],
    # No 'atomic-forces' entry: the reader does not extract forces.
}

## Insert configurations and properties into database

In [161]:
# Push every loaded configuration, plus the properties selected by
# property_map, into the database and collect the ids that come back.
ids = list(
    client.insert_data(
        configurations,
        property_map=property_map,
        generator=False,
        verbose=True,
    )
)

# Split the returned pairs into two parallel tuples
# (presumably configuration ids and data-object ids, going by the names).
all_co_ids, all_do_ids = zip(*ids)

Preparing to add configurations to Database: 100%|██████████| 11489/11489 [00:21<00:00, 528.14it/s]


## Group configurations into convenient sets

In [173]:
# Maps a joined element string (e.g. 'CHO') to the list of hashes of the
# configurations made up of those elements; filled in by the next cell.
element_combos = defaultdict(list)


In [175]:
# Start from an empty mapping so that re-running this cell does not append
# every configuration hash a second time.
element_combos = defaultdict(list)

# Fetch the hash and element list of every stored configuration, then group
# the hashes by their joined element string (e.g. ['C', 'H', 'O'] -> 'CHO').
hash_elements = client.get_data('configurations', fields=['hash', 'elements'])
for config_hash, elements in zip(hash_elements['hash'], hash_elements['elements']):
    element_combos[''.join(elements)].append(config_hash)

In [178]:


# Create one configuration set per element combination and remember its id.
cs_ids = []
for count, (key, val) in enumerate(element_combos.items()):
    # Look up the stored configurations whose hash belongs to this group.
    co_ids = client.get_data(
        'configurations',
        fields='hash',
        query={'hash': {'$in': val}},
        ravel=True,
    ).tolist()

    # One aligned summary line per set: index, element string, member count.
    print(f'Configuration set {count}', f'({key}):'.rjust(22), f'{len(co_ids)}'.rjust(7))

    cs_id = client.insert_configuration_set(
        co_ids,
        description=f"Sets with elements {key}",
        name=key,
    )
    cs_ids.append(cs_id)


Configuration set 0                 (CHO):    2170
Configuration set 1              (CClHOS):      78
Configuration set 2                (CHNO):    4568
Configuration set 3                (CHNS):     371
Configuration set 4              (CClHNO):     786
Configuration set 5             (CClHNOS):     256
Configuration set 6               (CFNOS):       1
Configuration set 7               (CHNOS):    1587
Configuration set 8               (CClHO):     274
Configuration set 9               (CClHN):     143
Configuration set 10                (CClH):      36
Configuration set 11                 (CHN):     488
Configuration set 12                (CHOS):     446
Configuration set 13              (CClHNS):      47
Configuration set 14               (CFHNO):      15
Configuration set 15                 (CHS):      81
Configuration set 16                (CFOS):       1
Configuration set 17                  (CH):      91
Configuration set 18                (CClN):       5
Configuration set 19  

## Create a dataset

In [None]:
# Bundle the configuration sets and data objects into a single dataset.
# NOTE(review): the name, authors, links and description below describe a
# carbon-allotrope DFT dataset (VASP, energies and forces), whereas this
# notebook loads 'qm_intermolecular_potential_crystals' with Quantum Espresso /
# PBE-XDM metadata and energies only. Replace them with the correct citation
# for the loaded data before publishing.
ds_id = client.insert_dataset(
    cs_ids,
    all_do_ids,
    name='C_npj2020',
    authors=[
        'M. Wen', 'E. B. Tadmor'
    ],
    links=[
        'https://www.nature.com/articles/s41524-020-00390-8#Abs1',
        'https://figshare.com/articles/dataset/A_dataset_of_DFT_energies_and_forces_for_carbon_allotropes_of_monolayer_graphene_bilayer_graphene_graphite_and_diamond/12649811',
    ],
    # Adjacent string literals are concatenated as-is, so every fragment except
    # the last ends with an explicit space.
    description=(
        'The dataset consists of energies and forces for monolayer '
        'graphene, bilayer graphene, graphite, and diamond in various '
        'states, including strained static structures and configurations '
        'drawn from ab initio MD trajectories. A total number of 4788 '
        'configurations was generated from DFT calculations using the '
        'Vienna Ab initio Simulation Package (VASP).'
    ),
    verbose=True,
)
# Show a stored dataset document. No filter is passed, so this is presumably
# just the first document found rather than a lookup by ds_id.
client.datasets.find_one()

Aggregating configuration info: 100%|██████████| 3635/3635 [00:00<00:00, 7807.83it/s]
Aggregating data_object info: 100%|██████████| 3642/3642 [00:00<00:00, 79193.20it/s]
Updating CA->DS relationships: 100%|██████████| 3642/3642 [00:00<00:00, 46616.60it/s]


{'_id': ObjectId('63c0520a436308b0bd904658'),
 'hash': '8186678686281254914507701352425145601259760796485051574656071008468895114605171170971695732081778015085071440531003004048290678638412913751942422889006305',
 'aggregated_info': {'nconfigurations': 3635,
  'nsites': 191300,
  'nelements': 1,
  'chemical_systems': ['C'],
  'elements': ['C'],
  'individual_elements_ratios': {'C': [1.0]},
  'total_elements_ratios': {'C': 1.0},
  'chemical_formula_reduced': ['C'],
  'chemical_formula_anonymous': ['A'],
  'chemical_formula_hill': ['C52', 'C64', 'C32', 'C2', 'C72', 'C76'],
  'nperiodic_dimensions': [3],
  'dimension_types': [[1, 1, 1]],
  'property_types': ['potential-energy', 'atomic-forces'],
  'property_types_counts': [3642, 3642]},
 'authors': ['M. Wen', 'E. B. Tadmor'],
 'colabfit-id': 'DS_povkyynx8tl8_0',
 'description': 'The dataset consists of energies and forces for monolayer graphene, bilayer graphene, graphite, and diamond in variousstates, including strained static structures