Can install colabfit-tools using: pip install git+https://github.com/colabfit/colabfit-tools.git@Calculation

MongoDB process should be running see: https://www.mongodb.com/docs/manual/administration/install-community/

In [7]:
from colabfit.tools.database import MongoDatabase, load_data
from colabfit.tools.property_settings import PropertySettings
from colabfit.tools.configuration import AtomicConfiguration
from colabfit.tools.property_definitions import potential_energy_pd, atomic_forces_pd
import ase

## Connect to MongoDB instance

In [3]:
client = MongoDatabase('test_e2e',drop_database=True)

## Define custom reader if necessary

In [4]:
def reader(file_path):
    file_name=file_path.stem
    atom=ase.io.read(file_path)
    atom.info['name'] = file_name
    yield atom

## Load data from file(s)

In [5]:
configurations = load_data(
    file_path='/Users/piper/Code/colabfit/data/carbon_energies_forces', #Data can be downloaded here: https://figshare.com/articles/dataset/A_dataset_of_DFT_energies_and_forces_for_carbon_allotropes_of_monolayer_graphene_bilayer_graphene_graphite_and_diamond/12649811
    file_format='folder', 
    name_field='name',  
    elements=['C'],
    reader=reader,
    glob_string='*.xyz',
    generator=False,
)

3648it [00:15, 240.84it/s]


In [6]:
configurations[0].info

{'_name': {'inst_cubic_diamond_temp600_strain0.0_step2871'},
 'PBC': array([1, 1, 1]),
 'Energy': -508.16423563,
 'name': 'inst_cubic_diamond_temp600_strain0.0_step2871',
 '_labels': set()}

## Define properties and setup property mapping(s)

In [1]:
#Load from colabfit's definitions
client.insert_property_definition(potential_energy_pd)
client.insert_property_definition(atomic_forces_pd)
potential_energy_pd

NameError: name 'client' is not defined

In [8]:
metadata = { 'software': {'value':'VASP'},
            'method': {'value':'DFT-PBE+MDB'}
}
property_map = {
    'potential-energy': [{
        'energy':   {'field': 'Energy',  'units': 'eV'},
        'per-atom': {'field': 'per-atom', 'units': None},
        '_metadata': metadata
    }],

    'atomic-forces': [{
        'forces':   {'field': 'force',  'units': 'eV/Ang'},
            '_metadata': metadata

    }],
}

## Insert configurations and properties into database

In [9]:
ids = list(client.insert_data(
    configurations,
    property_map=property_map,
    generator=False,
    verbose=True
))

all_co_ids, all_do_ids = list(zip(*ids))



RuntimeError: Property definition 'free-energy' does not exist. Use insert_property_definition() first

## Group configurations into convenient sets

In [10]:
cs_regexes = [
    ['carbon_allotropes',
     '.*',
     'All configurations'
    ],
    ['bilayer_graphene',
     'bilayer',
     'All bilayer graphene configurations'
    ],
    ['diamond',
     'diamond',
     'All diamond configurations'
    ],
    ['graphite',
     'graphite',
     'All graphite configurations'
    ],
    ['monolayer_graphene',
     'monolayer',
     'All monolayer graphene configurations'
    ]
]

cs_ids = []

for i, (name, regex, desc) in enumerate(cs_regexes):
    co_ids = client.get_data(
        'configurations',
        fields='hash',
        query={'hash': {'$in': all_co_ids}, 'names': {'$regex': regex}},
        ravel=True
    ).tolist()

    print(f'Configuration set {i}', f'({name}):'.rjust(22), f'{len(co_ids)}'.rjust(7))

    cs_id = client.insert_configuration_set(co_ids, description=desc,name=name)

    cs_ids.append(cs_id)


Configuration set 0   (carbon_allotropes):    3635
Configuration set 1    (bilayer_graphene):     833
Configuration set 2             (diamond):     841
Configuration set 3            (graphite):     655
Configuration set 4  (monolayer_graphene):    1306


## Create a dataset

In [11]:
ds_id = client.insert_dataset(
    cs_ids,
    all_do_ids,
    name='C_npj2020',
    authors=[
        'M. Wen', 'E. B. Tadmor'
    ],
    links=[
        'https://www.nature.com/articles/s41524-020-00390-8#Abs1',
        'https://figshare.com/articles/dataset/A_dataset_of_DFT_energies_and_forces_for_carbon_allotropes_of_monolayer_graphene_bilayer_graphene_graphite_and_diamond/12649811',
    ],
    description = 'The dataset consists of energies and forces for monolayer '\
        'graphene, bilayer graphene, graphite, and diamond in various' \
        'states, including strained static structures and configurations' \
        'drawn from ab initio MD trajectories. A total number of 4788' \
        'configurations was generated from DFT calculations using the' \
        'Vienna Ab initio Simulation Package (VASP).',
    verbose=True,
)
client.datasets.find_one()

Aggregating configuration info: 100%|██████████| 3635/3635 [00:00<00:00, 7807.83it/s]
Aggregating data_object info: 100%|██████████| 3642/3642 [00:00<00:00, 79193.20it/s]
Updating CA->DS relationships: 100%|██████████| 3642/3642 [00:00<00:00, 46616.60it/s]


{'_id': ObjectId('63c0520a436308b0bd904658'),
 'hash': '8186678686281254914507701352425145601259760796485051574656071008468895114605171170971695732081778015085071440531003004048290678638412913751942422889006305',
 'aggregated_info': {'nconfigurations': 3635,
  'nsites': 191300,
  'nelements': 1,
  'chemical_systems': ['C'],
  'elements': ['C'],
  'individual_elements_ratios': {'C': [1.0]},
  'total_elements_ratios': {'C': 1.0},
  'chemical_formula_reduced': ['C'],
  'chemical_formula_anonymous': ['A'],
  'chemical_formula_hill': ['C52', 'C64', 'C32', 'C2', 'C72', 'C76'],
  'nperiodic_dimensions': [3],
  'dimension_types': [[1, 1, 1]],
  'property_types': ['potential-energy', 'atomic-forces'],
  'property_types_counts': [3642, 3642]},
 'authors': ['M. Wen', 'E. B. Tadmor'],
 'colabfit-id': 'DS_povkyynx8tl8_0',
 'description': 'The dataset consists of energies and forces for monolayer graphene, bilayer graphene, graphite, and diamond in variousstates, including strained static structures

## Update data items

In [12]:
configurations_2 = load_data(
    file_path='/Users/piper/Code/colabfit/data/update_data/',
    file_format='folder', 
    name_field='name',  
    elements=['C'],
    reader=reader,
    glob_string='*.xyz',
    generator=False,
)
ids_2 = list(client.insert_data(
    configurations_2,
    property_map=property_map,
    generator=False,
    verbose=True
))

all_co_ids_2, all_do_ids_2 = list(zip(*ids_2))

1140it [00:02, 453.73it/s]
Preparing to add configurations to Database:  84%|████████▍ | 962/1140 [00:04<00:00, 257.46it/s]

KeyboardInterrupt: 

Preparing to add configurations to Database: 100%|██████████| 1140/1140 [00:05<00:00, 220.09it/s]


In [None]:
ids_2

[('771638729831922951', '905839713625822805'),
 ('788923460937636866', '500068402403752482'),
 ('694794636515712034', '9840288710134483'),
 ('2143862186015763127', '329752900776513319'),
 ('2121298364377492713', '551559687239468142'),
 ('1663976866748773139', '1815889576129623024'),
 ('245046170870521105', '1499992333530784143'),
 ('1989440475586154949', '1015891214816806050'),
 ('1183687723886596706', '2151609997701940080'),
 ('95475065203932800', '290007458858473770'),
 ('1189838681236946747', '1080992291541736969'),
 ('952573706810556671', '1424670388317218565'),
 ('419960913348364658', '2277291383624751037'),
 ('465091581386895146', '1449743946062597033'),
 ('711059787323784555', '497835280340165447'),
 ('2200480088215984818', '149956471345429415'),
 ('1146863625119734991', '277009645147078438'),
 ('505750329386425497', '1572478381012070195'),
 ('673822292629183142', '556163465827235313'),
 ('2224096112493951245', '2133388105136768851'),
 ('1075911906845622739', '311265281764476973

In [None]:
cs_id_2 = client.update_configuration_set(cs_id=cs_ids[3],add_ids=all_co_ids_2)

In [None]:
ds_id_2 = client.update_dataset(ds_id, add_cs_ids=cs_id_2, add_do_ids=all_do_ids_2)

Updating CA->DS relationships: 100%|██████████| 4776/4776 [00:00<00:00, 147375.36it/s]


# Model Training 
## Below likely won't work as it was a one-off test on custom code

In [None]:
from kliff import nn
from kliff.descriptors import SymmetryFunction
from kliff.calculators import CalculatorTorch
from kliff.dataset import Dataset
from kliff.models import NeuralNetwork
from kliff.loss import Loss
%load_ext autoreload
%autoreload 2

## Setup Model

In [None]:
descriptor = SymmetryFunction(
    cut_name="cos", cut_dists={"C-C": 5.0}, hyperparams="set51", normalize=True
)

In [None]:
model = NeuralNetwork(descriptor)
model.add_layers(
    # first hidden layer
    nn.Linear(descriptor.get_size(), 10),
    nn.Tanh(),
    # second hidden layer
    nn.Linear(10, 10),
    nn.Tanh(),
    # output layer
    nn.Linear(10, 1),
)

## Load data from database 

In [None]:
ds = Dataset(colabfit_database='test_e2e', colabfit_dataset=ds_id_2)
configs = ds.get_configs()[:300]

TypeError: __init__() got an unexpected keyword argument 'colabfit_database'

## Training

In [None]:
calc = CalculatorTorch(model, gpu=False)
_ = calc.create(configs, reuse=True,fingerprints_mean_stdev_filename='fingerprints_mean_and_stdev.pkl' )

In [None]:
loss = Loss(calc)
result = loss.minimize(method="Adam", num_epochs=100, batch_size=10, lr=0.01)

## Export model

In [None]:
model.write_kim_model()

# Using Model

In [None]:
from ase.lattice.cubic import Diamond
from ase.calculators.kim.kim import KIM

In [None]:
atoms = Diamond(symbol='C', latticeconstant=3.57,size=(1,1,1))
calc = KIM("NeuralNetwork_KLIFF__MO_000000111111_000")
atoms.calc = calc

energy = atoms.get_potential_energy()
energy

In [None]:
# Define KIM model and get Si diamond lattice parameter for this potential
kim_init          NeuralNetwork_KLIFF__MO_000000111111_000  metal
# Setup diamond crystal
boundary         p p p
lattice          diamond 3.57
region           simbox block 0 1 0 1 0 1 units lattice
create_box       1 simbox
create_atoms     1 box
mass             1 12.011
# Define atom type to species mapping
kim_interactions C
# Compute energy
run 0
