In [1]:
from colabfit.tools.database import MongoDatabase, load_data
from colabfit.tools.property_settings import PropertySettings

# Handle to the 'colabfit_database' database; the query cells below go through it.
client = MongoDatabase('colabfit_database', nprocs=1)

In [9]:
import os
import shutil

# Mirror each dataset's generated markdown assets into the API's static
# folder, keyed by dataset ID instead of dataset name.
markdown_source_root = '/colabfit/markdown'
markdown_static_root = '/home/josh/colabfit/colabfit/api/static/markdown'

for ds_doc in client.datasets.find({}, {'name'}):
    source_dir = os.path.join(markdown_source_root, ds_doc["name"])
    target_dir = os.path.join(markdown_static_root, str(ds_doc["_id"]))

    # Creates the folder (and any missing parents); no-op if it already exists.
    os.makedirs(target_dir, exist_ok=True)

    shutil.copyfile(
        os.path.join(source_dir, 'README.md'),
        os.path.join(target_dir, 'README.md'),
    )
    # Bug fix: the histogram was previously written to README.md, which
    # overwrote the README copied just above.
    shutil.copyfile(
        os.path.join(source_dir, 'histogram'),
        os.path.join(target_dir, 'histogram'),
    )
    print(ds_doc)

{'_id': '17896922278085018239', 'name': 'AlNiTi_CMS2019'}
{'_id': '12299200623732822270', 'name': 'COMP6'}
{'_id': '2149155401424133298', 'name': 'CoNbV_CMS2019'}
{'_id': '13344296987139791283', 'name': 'CuPd_CMS2019'}
{'_id': '3976715385933640855', 'name': 'InP_JPCA2020'}
{'_id': '11436130794767089172', 'name': 'Mo_PRM2019'}
{'_id': '10687395576408685395', 'name': 'MoNbTaVW_PRB2021'}
{'_id': '14772445951895794273', 'name': 'Nb_PRM2019'}
{'_id': '9615499379302979156', 'name': 'QM9'}
{'_id': '3071118515767946833', 'name': 'QM9_filtered'}
{'_id': '5133304285144226850', 'name': 'Si_PRX_GAP'}
{'_id': '16179299706373259656', 'name': 'Si_PRX_GAP-no-xc'}
{'_id': '8081947610244469710', 'name': 'Si_PRX_GAP-pbe'}
{'_id': '5869266345656656225', 'name': 'Si_PRX_GAP-pw91'}
{'_id': '14511208659355684642', 'name': 'Ta_Linear_JCP2015'}
{'_id': '10817583392748138384', 'name': 'Ta_PINN_2021'}
{'_id': '1113141511992115938', 'name': 'Ta_PRM2019'}
{'_id': '5154244796734251497', 'name': 'TiZrHfTa_APS2021'}


In [None]:
client.insert_property_definition('tag:staff@noreply.openkim.org,2014-04-15:property/bulk-modulus-isothermal-cubic-crystal-npt')

In [None]:
client.configurations.distinct('elements', {})

In [None]:
client.configurations.distinct('labels', {'nsites': {'$lt': 10}})

# Verifying dataset correctness

In [None]:
client.datasets.distinct('name')

In [None]:
# Resolve the dataset's ID from its name, then load the dataset object.
ds_id = client.datasets.find_one({'name': 'Mo_PRM2019'})['_id']
dataset = client.get_dataset(ds_id)['dataset']
# Echo both so the cell output shows which dataset was loaded.
dataset.name, ds_id

In [None]:
client.dataset_to_markdown(
    ds_id=ds_id,
    base_folder='/tmp',
    html_file_name='README',
    data_file_name=dataset.name+'.xyz',
    data_format='mongo',
    yscale='log'
)

In [None]:
# Rebuild a dataset from the file at /tmp/README.
# NOTE: this rebinds `dataset`. The cells that follow index the result with
# keys (`dataset['dataset']`, `dataset['_id']`) rather than using it as the
# dataset object directly.
dataset = client.dataset_from_markdown(
    html_file_path='/tmp/README',
    verbose=True
)

In [None]:
dataset['dataset'].name

In [None]:
client.dataset_to_markdown(
    ds_id=dataset['_id'],
    base_folder='/tmp',
    html_file_name='README-2',
    data_file_name=dataset['dataset'].name+'-2.xyz',
    data_format='mongo',
    yscale='log'
)

In [None]:
# `dataset` was rebound above to the result of `dataset_from_markdown`, which
# the preceding cells index as `dataset['dataset']`; the dataset object must
# be pulled out of it before reading its attributes.
restored_dataset = dataset['dataset']
client.plot_histograms(restored_dataset.aggregated_info['property_fields'], ids=restored_dataset.property_ids)

# Number of configurations

In [None]:
client.configurations.count_documents({})

# Number of properties

In [None]:
client.properties.count_documents({})

# Number of datasets

In [None]:
client.datasets.count_documents({})

# Get configurations

In [None]:
configs = client.get_configurations('all', attach_properties=True, generator=False)
configs[0]

In [None]:
configs[0].info

In [None]:
# Join property documents to the configuration documents they reference.
#
# '$unwind' emits one document per entry of `relationships.configurations`;
# '$lookup' then attaches the matching documents from the `configurations`
# collection under the `linked_co` field.
unwind_configuration_links = {'$unwind': '$relationships.configurations'}

# Optional filter stage (disabled), restricting the join to a list of IDs:
#     {'$match': {'relationships.configurations': {'$in': ids}}}

attach_linked_configurations = {
    '$lookup': {
        'from': 'configurations',
        'localField': 'relationships.configurations',
        'foreignField': '_id',
        'as': 'linked_co',
    }
}

cursor = client.properties.aggregate([
    unwind_configuration_links,
    attach_linked_configurations,
])

# Peek at the first joined document (this advances the cursor by one).
next(cursor)

In [None]:
# Counts what is left in `cursor`: the cell above already consumed one
# document with next(cursor), and this call consumes the rest.
len(list(cursor))

## Names of datasets

In [None]:
# Dataset {_id, name} documents, ordered case-insensitively by name.
name_docs = client.datasets.find({}, {'name'})
sorted(name_docs, key=lambda doc: doc['name'].lower())

# Number of configuration sets

In [None]:
client.configuration_sets.count_documents({})

## All configuration sets, and their linked datasets

In [None]:
# One output document per (configuration set, dataset ID) link; `ds_name`
# holds the name(s) of the dataset document(s) matched by the lookup.
cs_dataset_pipeline = [
    # Keep only the list of linked dataset IDs ...
    {'$project': {'relationships.datasets': 1}},
    # ... emit one document per linked ID ...
    {'$unwind': '$relationships.datasets'},
    {'$project': {'ds_id': '$relationships.datasets'}},
    # ... and join each ID against the `datasets` collection.
    {'$lookup': {
        'from': 'datasets',
        'localField': 'ds_id',
        'foreignField': '_id',
        'as': 'linked_ds',
    }},
    {'$project': {'ds_name': '$linked_ds.name'}},
]

cursor = client.configuration_sets.aggregate(cs_dataset_pipeline)

# Order by the first matched dataset name, ignoring case.
sorted(list(cursor), key=lambda link: link['ds_name'][0].lower())

## Configuration sets that are tied to more than one dataset

In [None]:
client.configuration_sets.count_documents({'relationships.datasets.1': {'$exists': True}})

In [None]:
# Same join as the "all configuration sets" cell, restricted to configuration
# sets whose `relationships.datasets` array has an element at index 1,
# i.e. at least two linked datasets.
has_second_dataset = {'$match': {'relationships.datasets.1': {'$exists': True}}}
join_dataset_names = [
    {'$project': {'relationships.datasets': 1}},
    {'$unwind': '$relationships.datasets'},
    {'$project': {'ds_id': '$relationships.datasets'}},
    {'$lookup': {
        'from': 'datasets',
        'localField': 'ds_id',
        'foreignField': '_id',
        'as': 'linked_ds',
    }},
    {'$project': {'ds_name': '$linked_ds.name'}},
]

cursor = client.configuration_sets.aggregate([has_second_dataset] + join_dataset_names)

list(cursor)

# Total number of atoms

In [None]:
# Group every configuration into a single bucket and sum its `nsites` field.
sum_nsites_pipeline = [
    {'$group': {'_id': None, 'sum': {'$sum': '$nsites'}}},
]
next(client.configurations.aggregate(sum_nsites_pipeline))

# Total number of distinct configuration labels

In [None]:
# Collect the distinct values found in the `labels` arrays.
#
# The previous pipeline pushed every configuration's label array into one
# grouped document and unioned them afterwards. That intermediate document
# grows with the collection and can exceed MongoDB's document-size and
# aggregation memory limits. `next()` on the pipeline also raised
# StopIteration on an empty collection.
#
# `distinct` computes the same set of values server-side and returns an
# empty list when nothing matches.
#
# To restrict to specific configurations, pass a filter as the second
# argument, e.g. {"_id": {"$in": [1, 2]}}.
labels = client.configurations.distinct('labels')
len(labels)

## Counts for each label

In [None]:
# Count how many configurations carry each label, most frequent first.
per_label_pipeline = [
    {'$unwind': '$labels'},
    {'$group': {'_id': '$labels', 'count': {'$sum': 1}}},
]
cursor = client.configurations.aggregate(per_label_pipeline)

label_counts = list(cursor)
label_counts.sort(key=lambda entry: entry['count'], reverse=True)
label_counts

# Property distributions

In [None]:
client.properties.find_one({})

In [None]:
client.properties.distinct('type')

In [None]:
all_energies = client.get_data('properties', ['energy-forces-stress.energy', 'energy-forces-virial.energy'], ravel=True, verbose=True)

In [None]:
import matplotlib.pyplot as plt

# Overlay one 100-bin histogram per entry of `all_energies` on a shared axis
# with a log-scaled y axis.
fig, ax = plt.subplots()

for field_name, energies in all_energies.items():
    _ = ax.hist(energies, bins=100, label=field_name)

ax.legend()
ax.set_yscale('log')

In [None]:
all_forces = client.get_data('properties', ['energy-forces-stress.forces', 'energy-forces-virial.forces'], ravel=True, verbose=True)

In [None]:
import matplotlib.pyplot as plt

# Overlay one 100-bin histogram per entry of `all_forces` on a shared axis
# with a log-scaled y axis.
fig, ax = plt.subplots()

for field_name, forces in all_forces.items():
    _ = ax.hist(forces, bins=100, label=field_name)

ax.legend()
ax.set_yscale('log')

In [None]:
all_stresses = client.get_data('properties', ['energy-forces-stress.stress', 'energy-forces-virial.virial'], ravel=True, verbose=True)

In [None]:
import matplotlib.pyplot as plt

# Overlay one 100-bin histogram per entry of `all_stresses` on a shared axis
# with a log-scaled y axis.
fig, ax = plt.subplots()

for field_name, stresses in all_stresses.items():
    _ = ax.hist(stresses, bins=100, label=field_name)

ax.legend()
ax.set_yscale('log')

In [None]:
client.properties.distinct('energy-forces-stress.stress.source-unit')

In [None]:
client.properties.distinct('energy-forces-virial.virial.source-unit')

In [None]:
client.datasets.find_one({'name': 'WBe_PRB2019'})

In [None]:
client.properties.find_one({'relationships.datasets': '-6889328512840717042'})

# Copying to another database

In [4]:
client.datasets.find_one({'name': 'Si_PRX_GAP-pbe'}, {'name'})

{'_id': '8081947610244469710', 'name': 'Si_PRX_GAP-pbe'}

In [7]:
dataset = client.get_dataset('8081947610244469710')['dataset']

In [15]:
# Gather the unique configuration IDs across all of the dataset's
# configuration sets.
configuration_ids = set()

for cs_id in dataset.configuration_set_ids:
    cs = client.get_configuration_set(cs_id)['configuration_set']
    configuration_ids.update(cs.configuration_ids)

In [36]:
dataset.aggregated_info['property_types']

['si-prx-gap-data']

In [33]:
client.properties.find_one({'type': 'energy-forces-stress', 'relationships.datasets': '8081947610244469710'})

In [102]:
# Second database handle; the cells below insert the copied data through it.
# NOTE(review): drop_database=True presumably wipes any existing
# 'mini_database' each time this cell runs — confirm before re-running.
client2 = MongoDatabase('mini_database', drop_database=True)

In [103]:
# Fetch the 'energy-forces-stress' definition from the source database and
# set its 'property-id' to that short name; the next cell inserts it into
# the new database.
definition = client.get_property_definition('energy-forces-stress')['definition']
definition['property-id'] = 'energy-forces-stress'

In [104]:
client2.insert_property_definition(definition)

In [105]:
configurations = client.get_configurations(list(configuration_ids), property_ids=dataset.property_ids, attach_properties=True)

In [106]:
def tform(c):
    """Strip the extra nesting from a configuration's property fields, in place.

    Args:
        c: object exposing dict-like ``info`` and ``arrays`` attributes that
            hold the 'energy-forces-stress.*' entries.

    Returns:
        None. ``c.info`` and ``c.arrays`` are modified in place:
        the energy entry becomes its ``[0][0]`` element, the per-atom entry
        becomes ``bool`` of its ``[0][0]`` element, and the forces entry
        becomes its ``[0]`` element.
    """
    nested_energy = c.info['energy-forces-stress.energy']
    c.info['energy-forces-stress.energy'] = nested_energy[0][0]

    nested_per_atom = c.info['energy-forces-stress.per-atom']
    c.info['energy-forces-stress.per-atom'] = bool(nested_per_atom[0][0])

    nested_forces = c.arrays['energy-forces-stress.forces']
    c.arrays['energy-forces-stress.forces'] = nested_forces[0]

In [107]:
# Maps each field of the 'energy-forces-stress' property definition to the
# key it is read from on a configuration, plus the units of that value.
# The energy, forces and per-atom keys are the ones `tform` rewrites.
property_map = {
    'energy-forces-stress': {
        # Property Definition field: {'field': ASE field, 'units': ASE-readable units}
        'energy': {'field': 'energy-forces-stress.energy', 'units': 'eV'},
        'forces': {'field': 'energy-forces-stress.forces', 'units': 'eV/Ang'},
        'stress': {'field': 'energy-forces-stress.stress', 'units': 'GPa'},
        'per-atom': {'field': 'energy-forces-stress.per-atom', 'units': None},
    },
}

In [108]:
from colabfit.tools.property_settings import PropertySettings

# Settings object handed to insert_data for the 'energy-forces-stress' property.
pso = PropertySettings(
    method='CASTEP', description='DFT calculations using the CASTEP software',
    files=None, labels=['Monkhorst-Pack'],
)
settings_by_property = {'energy-forces-stress': pso}

# Insert the fetched configurations into the new database, applying `tform`
# to each one; the returned IDs are unpacked in a later cell.
ids = client2.insert_data(
    configurations,
    property_map=property_map, property_settings=settings_by_property,
    generator=False, transform=tform, verbose=True,
)

Preparing to add configurations to Database: 100%|██████████████████████████████████████████████████████| 50/50 [00:00<00:00, 437.71it/s]


In [111]:
# Transpose the entries of `ids` into two parallel tuples: the first element
# of each entry goes to `all_co_ids`, the second to `all_pr_ids`.
id_columns = list(zip(*ids))
all_co_ids, all_pr_ids = id_columns

len(all_co_ids)

50

In [115]:
help(client2.insert_dataset)

Help on method insert_dataset in module colabfit.tools.database:

insert_dataset(cs_ids, pr_ids, name, authors=None, links=None, description='', resync=False, verbose=False) method of colabfit.tools.database.MongoDatabase instance
    Inserts a dataset into the database.
    
    Args:
    
        cs_ids (list or str):
            The IDs of the configuration sets to link to the dataset.
    
        pr_ids (list or str):
            The IDs of the properties to link to the dataset
    
        name (str):
            The name of the dataset
    
        authors (list or str or None):
            The names of the authors of the dataset. If None, then no
            authors are added.
    
        links (list or str or None):
            External links (e.g., journal articles, Git repositories, ...)
            to be associated with the dataset. If None, then no links are
            added.
    
        description (str or None):
            A human-readable description of the dataset.

In [114]:
new_cs_id = client2.insert_configuration_set(all_co_ids, description='An example configuration set that contains all of the configurations')

In [116]:
dataset.name

'Si_PRX_GAP-pbe'

In [117]:
# Recreate the dataset in the new database: link it to the single
# configuration set and the inserted properties, reusing the source
# dataset's name, authors, links and description.
new_ds_id = client2.insert_dataset(
    cs_ids=[new_cs_id],
    pr_ids=all_pr_ids,
    name=dataset.name,
    authors=dataset.authors,
    links=dataset.links,
    description=dataset.description,
)
# Echo the new dataset's ID as the cell output.
new_ds_id

Updating PR->DS relationships: 100%|█████████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 129453.83it/s]


'9569257247871315731'

In [120]:
dataset2 = client2.get_dataset(new_ds_id, resync=True, verbose=True)['dataset']

Aggregating configuration info: 100%|███████████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 313.97it/s]
Aggregating property info: 100%|██████████████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 12647.16it/s]


In [121]:
# Dump every aggregated statistic of the copied dataset, one per line.
for info_key, info_value in dataset2.aggregated_info.items():
    print(info_key, info_value)

nconfigurations 50
nsites 4800
nelements 1
chemical_systems ['Si']
elements ['Si']
individual_elements_ratios {'Si': [1.0]}
total_elements_ratios {'Si': 1.0}
configuration_labels ['surface', '111']
configuration_labels_counts [50, 50]
chemical_formula_reduced ['Si']
chemical_formula_anonymous ['A']
chemical_formula_hill ['Si96']
nperiodic_dimensions [3]
dimension_types [[1, 1, 1]]
property_types ['energy-forces-stress']
property_types_counts [50]
property_fields ['energy-forces-stress.energy', 'energy-forces-stress.forces', 'energy-forces-stress.per-atom']
property_fields_counts [50, 50, 50]
methods ['CASTEP']
methods_counts [50]
property_labels ['Monkhorst-Pack']
property_labels_counts [50]
