In [None]:
from ase import Atoms
from colabfit.tools.database import MongoDatabase, load_data
from colabfit.tools.configuration import AtomicConfiguration
from colabfit.tools.property_definitions import atomic_forces_pd, potential_energy_pd
from collections import defaultdict
import numpy as np
from pathlib import Path
from pymongo.errors import OperationFailure, InvalidOperation
import re

In [None]:
# Folder that holds the raw .npy files for this dataset.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
DATASET_FP = Path('/Users/piper/Code/colabfit/data/alanine_h2o_2022/')

# Individual input files inside the dataset folder.
water_fp = DATASET_FP.joinpath('water.npy')
alani_fp = DATASET_FP.joinpath("alanine_dipeptide.npy")

# NOTE(review): drop_database=True is passed here; presumably this discards any
# existing database with the same name — confirm before pointing at shared data.
client = MongoDatabase("----", drop_database=True)

In [None]:
def water_reader(file_path):
    """Load a ``.npy`` file and return its contents as plain Python objects.

    Args:
        file_path: path of the ``.npy`` file to read.

    Returns:
        Whatever ``ndarray.tolist()`` yields for the loaded array: a nested
        list for ordinary arrays, or the stored object itself (e.g. a dict)
        when the file holds a single pickled object.
    """
    # allow_pickle=True lets numpy unpickle object arrays; unpickling can run
    # arbitrary code, so only use this on trusted files.
    return np.load(file_path, allow_pickle=True).tolist()

# Load a single file for interactive inspection. np.load needs a file, not the
# DATASET_FP folder, and the reader defined in this cell is `water_reader`.
# NOTE(review): alani_fp is used because the recorded outputs below (the key
# list and a 22-entry 'atomic_number') look like the alanine dipeptide file —
# confirm, or switch to water_fp to inspect the water trajectory instead.
data = water_reader(alani_fp)


In [17]:
# List the top-level entries available in the loaded `data` mapping.
data.keys()

dict_keys(['atomic_number', 'pos', 'force', 'lengths', 'angles'])

In [16]:
# Shape of the 'atomic_number' entry (the recorded output below shows (22,)).
data['atomic_number'].shape

(22,)

In [None]:


def water_reader(file_path, max_configs=1000):
    """Read a pickled ``.npy`` trajectory and build one ``Atoms`` per frame.

    The file is expected to hold a dict with the keys
    'wrapped_coords', 'unwrapped_coords', 'forces', 'velocities', 'lengths',
    'angles', 'raw_types', 'atom_types', 'bond_index', 'bond_types',
    'e_steps' and 'energy'.

    NOTE(review): a later cell in this notebook defines ``water_reader``
    again, so on a top-to-bottom run this definition is shadowed.

    Args:
        file_path: path of the ``.npy`` file to read.
        max_configs: maximum number of leading frames to convert
            (default 1000, the limit this reader has always applied).
            Pass ``None`` to convert every frame.

    Returns:
        list of ``Atoms``: one object per frame, with positions taken from
        'wrapped_coords' and the per-frame values below copied into
        ``atoms.info`` under the same key.
    """
    file_path = Path(file_path)
    # allow_pickle=True unpickles the stored object; only use on trusted files.
    data = np.load(file_path, allow_pickle=True).tolist()

    # Entries indexed by frame that are attached to each configuration.
    per_frame_keys = (
        'energy',
        'forces',
        'unwrapped_coords',
        'velocities',
        'lengths',
        'angles',
        'e_steps',
    )

    atoms = []
    # Slice before building so frames past the limit are never constructed.
    for i, coords in enumerate(data['wrapped_coords'][:max_configs]):
        # NOTE(review): 'atom_types' is passed as atomic numbers — confirm it
        # holds atomic numbers rather than force-field type indices.
        frame = Atoms(numbers=data['atom_types'], positions=coords)
        for key in per_frame_keys:
            frame.info[key] = data[key][i]
        atoms.append(frame)
    return atoms


In [None]:
def water_reader(file_path, max_configs=1000):
    """Read a pickled ``.npy`` trajectory and build one ``Atoms`` per frame.

    Two dict layouts are handled, selected by whether a 'pos' entry exists:

    * 'atomic_number', 'pos', 'force', 'lengths', 'angles'
    * 'atom_types', 'wrapped_coords', 'forces', 'unwrapped_coords',
      'velocities', 'lengths', 'angles', 'e_steps', 'energy'

    NOTE(review): the second layout is taken from the key list documented in
    this notebook's other reader cell — confirm against the actual file.

    Args:
        file_path: path of the ``.npy`` file to read.
        max_configs: maximum number of leading frames to convert
            (default 1000, the limit this reader has always applied).
            Pass ``None`` to convert every frame.

    Returns:
        list of ``Atoms``: one object per frame, with the per-frame values
        copied into ``atoms.info`` (forces are always stored as 'forces').
    """
    file_path = Path(file_path)
    # allow_pickle=True unpickles the stored object; only use on trusted files.
    data = np.load(file_path, allow_pickle=True).tolist()

    if 'pos' in data:
        # keys: 'atomic_number', 'pos', 'force', 'lengths', 'angles'
        numbers_key, positions_key = 'atomic_number', 'pos'
        # info key -> key in the file
        info_sources = {
            'forces': 'force',
            'lengths': 'lengths',
            'angles': 'angles',
        }
    else:
        numbers_key, positions_key = 'atom_types', 'wrapped_coords'
        info_sources = {
            key: key
            for key in (
                'energy',
                'forces',
                'unwrapped_coords',
                'velocities',
                'lengths',
                'angles',
                'e_steps',
            )
        }

    atoms = []
    # Slice before building so frames past the limit are never constructed.
    for i, coords in enumerate(data[positions_key][:max_configs]):
        frame = Atoms(numbers=data[numbers_key], positions=coords)
        for info_key, source_key in info_sources.items():
            frame.info[info_key] = data[source_key][i]
        atoms.append(frame)
    return atoms

In [None]:
# Load the water trajectory only. The alanine dipeptide file in the same
# folder is appended by the following cell, so a '*.npy' glob here would
# ingest it twice.
configurations = load_data(
    file_path=DATASET_FP,
    file_format='folder',
    name_field=None,
    elements=['H', 'O'],
    reader=water_reader,
    glob_string='water.npy',
    generator=False,
)

In [None]:
# Append the alanine dipeptide configurations to the list loaded above.
configurations.extend(
    load_data(
        file_path=DATASET_FP,
        file_format='folder',
        name_field=None,
        # Alanine dipeptide contains C and N in addition to H and O.
        elements=['C', 'H', 'N', 'O'],
        reader=water_reader,
        glob_string='alanine_dipeptide.npy',
        generator=False,
    )
)

In [None]:
# Sanity check: number of configurations collected so far.
len(configurations)

In [None]:
# Register the two property definitions referenced by `property_map`
# ('potential-energy' and 'atomic-forces') with the database client.
client.insert_property_definition(potential_energy_pd)
client.insert_property_definition(atomic_forces_pd)

In [None]:
# Metadata attached to every property instance: two constant values plus
# entries that are looked up by field name on each configuration.
metadata = {
    "software": {"value": "LAMMPS"},
    "method": {"value": "DFT"},
}
metadata.update(
    {
        field_name: {"field": field_name}
        for field_name in (
            "unwrapped_coords",
            "velocities",
            "lengths",
            "angles",
            "e_steps",
        )
    }
)

# Maps each property name to the configuration fields (and units) that
# supply its values. Both properties share the same `metadata` dict object.
property_map = {
    "potential-energy": [
        {
            "energy": {"field": "energy", "units": "eV"},
            "per-atom": {"value": False, "units": None},
            "_metadata": metadata,
        }
    ],
    "atomic-forces": [
        {
            "forces": {"field": "forces", "units": "eV/A"},
            "_metadata": metadata,
        }
    ],
}

In [None]:
# Insert every configuration together with the properties described by
# `property_map`, collecting whatever insert_data yields for each one.
ids = [
    id_pair
    for id_pair in client.insert_data(
        configurations,
        property_map=property_map,
        generator=False,
        verbose=True,
    )
]

# Transpose the collected pairs into two parallel tuples.
# presumably (configuration id, data-object id) pairs — verify against insert_data.
all_co_ids, all_do_ids = zip(*ids)

In [None]:

# Each entry is [set name, regex applied to configuration names, description].
# The patterns are regular expressions (used with MongoDB `$regex`), not shell
# globs: a trailing `*` only repeats the preceding character, so "PtH*" would
# match every name containing "Pt". Prefixes are therefore anchored with `^`.
# NOTE(review): these names and descriptions refer to an H2/Pt system, while
# the data loaded in this notebook comes from water / alanine dipeptide files
# — they look copied from another dataset; confirm or replace.
cs_regexes = [
    [
        "All_H2/Pt(III)",
        ".*",
        "All configurations from H/Pt(III)",
    ],
    [
        "H2_H2/Pt(III)",
        "^H2",
        "H2 configurations from H/Pt(III)",
    ],
    [
        "Pt-bulk_H2/Pt(III)",
        "^Pt-bulk",
        "Pt-bulk configurations from H/Pt(III)",
    ],
    [
        "Pt-surface_H2/Pt(III)",
        "^Pt-surface",
        "Pt-surface configurations from H/Pt(III)",
    ],
    [
        "PtH_H2/Pt(III)",
        "^PtH",
        "PtH configurations from H/Pt(III)",
    ],
]

# Ids of the configuration sets that were actually created.
cs_ids = []

for i, (name, regex, desc) in enumerate(cs_regexes):
    try:
        # Hashes of the inserted configurations whose names match the pattern.
        co_ids = client.get_data(
            "configurations",
            fields="hash",
            query={"hash": {"$in": all_co_ids}, "names": {"$regex": regex}},
            ravel=True,
        ).tolist()
    except OperationFailure:
        # NOTE(review): OperationFailure is raised for a failed database
        # operation, which is not necessarily "no match" — confirm the message.
        print(f"No match for regex: {regex}")
        continue

    print(
        f"Configuration set {i}",
        f"({name}):".rjust(25),
        f"{len(co_ids)}".rjust(7),
    )

    # Nothing matched: do not create an empty configuration set.
    if not co_ids:
        continue

    cs_id = client.insert_configuration_set(
        co_ids, description=desc, name=name
    )
    cs_ids.append(cs_id)


In [None]:
# Register the dataset from the configuration sets and data objects above.
# NOTE(review): the name, authors, links and description below describe other
# work (an H2/Pt(111) dataset), while the files loaded in this notebook live
# under 'alanine_h2o_2022' — confirm or replace before publishing.
client.insert_dataset(
    cs_ids,
    all_do_ids,
    name='HPt_nc_2022',
    # One list entry per author instead of a single comma-joined string.
    authors=[
        "S. Lee",
        "K. Ermanis",
        "J.M. Goodman",
    ],
    links=[
        "https://rdmc.nottingham.ac.uk/handle/internal/9356",
        "http://doi.org/10.17639/nott.7159",
        "https://doi.org/10.1039/D1SC06324C"
    ],
    description = 'A training dataset of 90,000 configurations'
    ' with interaction properties between H2 and Pt(111) surfaces.',
    verbose=True,
)