In [None]:
# The commented-out output in each cell is from the notebook run on 11 May 2023.
# That run did not include the OC20 or OC22 datasets.

In [45]:
from collections import defaultdict
from pymongo import MongoClient
from pprint import pprint

In [1]:
# Create a MongoDB client for the server at localhost:5000 and select the
# 'colabfit-2023-4-27' database; the cells below issue their queries through `db`.
# NOTE(review): host, port, and database name are hardcoded — adjust them for
# your own deployment before running.
client = MongoClient('mongodb://localhost:5000/')
db = client['colabfit-2023-4-27']

### Getting aggregate statistics

In [19]:
# Collect every distinct chemical system listed across all datasets.
# Each dataset's aggregated_info['chemical_systems'] is handled as either a
# list of system strings or a single string; any other value is reported
# (with its type) and skipped.
chem_sys = set()
for ds in db.datasets.find():
    systems = ds['aggregated_info']['chemical_systems']
    # isinstance() is the idiomatic type check (PEP 8) and also accepts subclasses.
    if isinstance(systems, list):
        chem_sys.update(systems)
    elif isinstance(systems, str):
        chem_sys.add(systems)
    else:
        print("null_set? ", type(systems))
print(f"Number of distinct chemical systems: {len(chem_sys)}")
# Number of distinct chemical systems: 68474


Number of distinct chemical systems: 68474


In [40]:
# Histogram of chemical systems keyed by the number of uppercase letters in
# the system string.
# NOTE(review): presumably each uppercase letter starts one element symbol, so
# the key is the number of elements in the system rather than a count of
# atoms — confirm against the chemical_systems string format.
atoms_per_sys = defaultdict(int)
for system_name in chem_sys:
    n_upper = sum(1 for char in system_name if char.isupper())
    atoms_per_sys[n_upper] += 1
print(atoms_per_sys)


defaultdict(<class 'int'>, {20: 844, 4: 27662, 2: 3160, 5: 7769, 3: 21028, 13: 882, 15: 339, 6: 1883, 1: 89, 7: 1023, 14: 766, 18: 206, 16: 70, 10: 62, 8: 1006, 11: 200, 17: 58, 23: 73, 12: 569, 19: 500, 24: 44, 21: 108, 26: 5, 22: 104, 25: 16, 9: 8})


In [46]:
# Show the histogram ordered by key. Dict keys are unique, so sorting the
# (key, count) pairs directly orders them by key.
pprint(dict(sorted(atoms_per_sys.items())))
# {1: 89,
#  2: 3160,
#  3: 21028,
#  4: 27662,
#  5: 7769,
#  6: 1883,
#  7: 1023,
#  8: 1006,
#  9: 8,
#  10: 62,
#  11: 200,
#  12: 569,
#  13: 882,
#  14: 766,
#  15: 339,
#  16: 70,
#  17: 58,
#  18: 206,
#  19: 500,
#  20: 844,
#  21: 108,
#  22: 104,
#  23: 73,
#  24: 44,
#  25: 16,
#  26: 5}


{1: 89,
 2: 3160,
 3: 21028,
 4: 27662,
 5: 7769,
 6: 1883,
 7: 1023,
 8: 1006,
 9: 8,
 10: 62,
 11: 200,
 12: 569,
 13: 882,
 14: 766,
 15: 339,
 16: 70,
 17: 58,
 18: 206,
 19: 500,
 20: 844,
 21: 108,
 22: 104,
 23: 73,
 24: 44,
 25: 16,
 26: 5}


In [55]:

# Histogram: number of data objects linked to a configuration -> number of
# configurations with that many links.
do_per_config = defaultdict(int)
for cfg in db.configurations.find():
    n_linked = len(cfg["relationships"]["data_objects"])
    do_per_config[n_linked] += 1

# Save the histogram's string form so it can be inspected outside the notebook.
with open("data_objects_per_config.txt", "w") as out_file:
    out_file.write(str(do_per_config))

In [54]:
# Dump every configuration document that is referenced by more than 10 data
# objects, one document per line, to large_do_per_config.txt.
with open("large_do_per_config.txt", "w") as out_file:
    for cfg in db.configurations.find():
        linked_objects = cfg["relationships"]["data_objects"]
        if len(linked_objects) <= 10:
            continue
        out_file.write(f"{cfg}\n")


In [52]:
# Whole-collection document counts for configurations, data objects, and datasets.
# As the method name indicates, these are estimates rather than exact counts.
total_configs = db.configurations.estimated_document_count()
total_data_obj = db.data_objects.estimated_document_count()
total_datasets = db.datasets.estimated_document_count()

In [30]:
# Property counts: number of property_instances documents of each property type.
#
# count_documents() is used because estimated_document_count() does not accept
# a query filter: it only reports a whole-collection estimate, so a dict passed
# to it is not applied as a filter. count_documents() runs the filter and
# returns an exact count, which can be slower on large collections.
# NOTE(review): the filter assumes each property instance stores its property
# name under the top-level "type" field — confirm against the schema.
pot_energy = db.property_instances.count_documents({"type": "potential-energy"})
free_energy = db.property_instances.count_documents({"type": "free-energy"})
atomic_forces = db.property_instances.count_documents({"type": "atomic-forces"})
formation_energy = db.property_instances.count_documents({"type": "formation-energy"})
atomization_energy = db.property_instances.count_documents({"type": "atomization-energy"})
band_gap = db.property_instances.count_documents({"type": "band-gap"})
cauchy_stress = db.property_instances.count_documents({"type": "cauchy-stress"})

In [37]:
# Atoms per dataset: the aggregated_info['nsites'] value of every dataset,
# in the order the datasets are returned by the query.
atoms_per_ds = [
    dataset["aggregated_info"]["nsites"] for dataset in db.datasets.find()
]
print(atoms_per_ds)
# [376978, 218820, 14180, 4650, 24851, 363129, 31437, 116380, 598356, 17964, 115805, 96736, 163746, 228396, 23041200, 1294832, 2812, 2450, 17982, 1483936, 304896, 5706023, 106761, 13500, 127913, 45667, 185070, 1857722, 45641, 1072856, 2407753, 4224, 135956, 45439, 3996, 165108, 165108, 223930, 22470, 46466, 524332, 42068, 35403, 1911177, 152130, 1186590, 2951584, 721113, 161580, 2838238, 1294832, 1467492, 4719, 307435, 247744, 26234002, 297397, 4224, 106761, 44480, 524332, 23735083, 2407753, 4022098, 1523988, 2736768, 576000, 694042, 698086, 248410, 12176245, 1479000, 504350, 2100100, 313150, 449000, 711378, 78249, 11938, 11938, 11938, 11938, 11938, 11938, 103804, 103804, 103804, 103804, 103804, 103804, 1000, 529214, 33240, 7629879, 2944000, 321600, 112218, 4513386, 887799, 383955, 15599712, 113804, 171018, 600000, 111768, 1975372, 143856, 117, 158304, 1841742, 137254445, 200038614]


[376978, 218820, 14180, 4650, 24851, 363129, 31437, 116380, 598356, 17964, 115805, 96736, 163746, 228396, 23041200, 1294832, 2812, 2450, 17982, 1483936, 304896, 5706023, 106761, 13500, 127913, 45667, 185070, 1857722, 45641, 1072856, 2407753, 4224, 135956, 45439, 3996, 165108, 165108, 223930, 22470, 46466, 524332, 42068, 35403, 1911177, 152130, 1186590, 2951584, 721113, 161580, 2838238, 1294832, 1467492, 4719, 307435, 247744, 26234002, 297397, 4224, 106761, 44480, 524332, 23735083, 2407753, 4022098, 1523988, 2736768, 576000, 694042, 698086, 248410, 12176245, 1479000, 504350, 2100100, 313150, 449000, 711378, 78249, 11938, 11938, 11938, 11938, 11938, 11938, 103804, 103804, 103804, 103804, 103804, 103804, 1000, 529214, 33240, 7629879, 2944000, 321600, 112218, 4513386, 887799, 383955, 15599712, 113804, 171018, 600000, 111768, 1975372, 143856, 117, 158304, 1841742, 137254445, 200038614]


In [41]:
# Total num atoms in database
# Sum of the per-dataset 'nsites' values held in atoms_per_ds, so the cell that
# builds atoms_per_ds must be run first.
total_atoms = sum(atoms_per_ds)
print(total_atoms)
# 512471967

512471967


In [36]:
# Configs per dataset
configs_per_ds = []
for ds in db.datasets.find():
    configs_per_ds.append(ds['aggregated_info']['nconfigurations'])