In [1]:
import json
import copy
import numpy as np
from pathlib import Path
from collections import defaultdict

def log(msg):
    print(msg)
    with open("./analysis.log", 'a') as f:
        f.write(str(msg) + '\n')


Check consistency of the Campi-v1 mc2d archive entry files.

The primary file to check and update is `structure_2d.json`

The primary source to check against is the full MC2D AiiDA database (on prnmarvelsrv3), but also supplementary metadata for exists in the discover-mc2d-data github repository, which contains:

* `materials_cloud_2D_EE_less6atoms.json` - Mounet18 data;
* `MC_olddb_merged_noph.json` - Campi23 data without phonons on structures found in Mounet18;
* `MC_olddb_merged.json` - this should just have the phonons extra, but there are other modifications as well (e.g. updated source databases, in multiple cases, ICSD is replaced by COD, without the id being changed. I manually checked some cases and this current file seems to have fixed these entries.). This file is newer as well, so just use data from here and ignore `MC_olddb_merged_noph.json`.
* `MC_newdb_merged.json` - new 2d materials from Campi23.

## Load data

In [2]:
with open(Path("./campi-v1/structure_2d_indented.json")) as fh:
    ARCHIVE_DATA = json.load(fh)
print("campi-v1-archive", len(ARCHIVE_DATA))


MC_DATA_DIR = Path("./discover-mc2d-data-files")

# initial/old data from first paper (Mounet 2018)
MC_MOUNET_FILE = MC_DATA_DIR / 'materials_cloud_2D_EE_less6atoms.json'
# data of Campi paper that analysed materials proposed/found in Mounet (needs both citations)
# NOTE: this dataset includes phonons!
MC_CAMPI_FILE_1 = MC_DATA_DIR / 'MC_olddb_merged.json'
# New materials/properties of Campi (only Campi citation)
MC_CAMPI_FILE_2 = MC_DATA_DIR / 'MC_newdb_merged.json'

# treat mounet and full campi data separately. Load as dictionaries based on optimized uuids
MC_MOUNET_DATA = {}
MC_CAMPI_DATA = {}

with open(MC_MOUNET_FILE, 'r') as fh:
    for entry in json.load(fh)['compounds']:
        uuid = entry['structure_2D']
        MC_MOUNET_DATA[uuid] = entry
        
for campi_file in [MC_CAMPI_FILE_1, MC_CAMPI_FILE_2]:
    with open(campi_file, 'r') as fh:
        for entry in json.load(fh)['compounds']:
            uuid = entry['structure_2D']
            entry["_file"] = campi_file.name
            MC_CAMPI_DATA[uuid] = entry

print("mc mounet entries:", len(MC_MOUNET_DATA))
print("mc campi entries:", len(MC_CAMPI_DATA))

campi-v1-archive 3078
mc mounet entries: 258
mc campi entries: 2501


## Set up aiida, load pydantic models and functions

In [3]:
import aiida
from aiida import orm
from aiida.orm import load_node

aiida.load_profile("mat2D")

Profile<uuid='635980d2973c41ff9ec9d7e82bbae1e2' name='mat2D'>

In [4]:
%load_ext autoreload
%autoreload 2
# import all the pydantic models and custom functions:
import analysis_tools as at

## Check Mounet MC data

This dataset is uniform and consistent, so there's not too much to do. Just

* it's missing the extracted uuids, which can be queries from aiida. In principle, they are not needed for MC2D, but it's they're good to have, just in case. And then the full mc2d dataset is more consistent, as all Campi structures have these uuids.
* adapt to the pydantic model (including the removal of the top-level-parent, etc) 

In [5]:
log("----")
log("Parse MC Mounet data")
log("----")

cleaned_mc_mounet_data = []

for opt_uuid, entry in MC_MOUNET_DATA.items():
    log(entry)
    
    opt_structure_uuid = entry["structure_2D"]
    opt_structure_node = load_node(opt_structure_uuid)
    aiida_opt_formula = opt_structure_node.get_formula(mode="hill")
    
    extracted_structure_uuid = at.query_for_extracted_2d_structure(opt_structure_uuid)
    extracted_structure_node = load_node(extracted_structure_uuid)
    
    # make sure formulas are compatible.
    assert at.get_element_set(aiida_opt_formula) == at.get_element_set(extracted_structure_node.get_formula(mode="hill"))
    assert at.get_element_set(aiida_opt_formula) == at.get_element_set(entry["formula"])
    
    try:
        # The top level binding energy props can be from multiple different parents for this data.
        # just ignore them and taken them consistely from the first entry in the parents list
        parents_3d = at.validate_and_return_parents(entry, aiida_opt_formula, log, ignore_top_binding_props=True)
    except at.ParentsException as e:
        log(f"    Error: {e}")
        continue
    
    if entry["band_gap"]["value"] is None:
        # this is set for conductors. But it basically means zero gap.
        entry["band_gap"]["value"] = 0.0
    
    # get pydantic fields
    formatted_entry = {
        "formula": aiida_opt_formula,
        "as_extracted_2D_structure_uuid": extracted_structure_uuid,
        "all_3D_parents": parents_3d,
        "space_group_number": at.get_space_group_number(entry.get("space_group")),
        "prototype": entry.get("prototype"),
        "abundance": entry.get("abundance"),
        "optimized_2D_structure_uuid": opt_uuid,
        "bands_uuid": entry.get("bands_2D"),
        "band_gap": at.check_and_return_aiida_prop(entry.get("band_gap")),
        "fermi_energy": at.check_and_return_aiida_prop(entry.get("fermi_energy")),
        "magnetic_state": entry.get("magnetic_state_long"),
        "absolute_magnetization": at.check_and_return_aiida_prop(entry.get("absolute_magnetization")),
        "total_magnetization": at.check_and_return_aiida_prop(entry.get("total_magnetization")),
        "phonon_bands_uuid": entry.get("phonons_2D"),
        "citations": ["Mounet18"],
    }
    
    try:
        # create the pydantic object for validation
        pydantic_entry = at.StructureEntry(**formatted_entry)
        # turn it back into a python dictionary when appending
        # note: probably good to not exclude unset or None, just to have a better overview of the data structure
        cleaned_mc_mounet_data.append(pydantic_entry.model_dump())
    except at.ValidationError as e:
        log(f"    Error - pydantic: {e.json()}")
        continue
    

----
Parse MC Mounet data
----
{'abundance': 7.9e-08, 'all_3D_parents': [{'binding_energy_per_substructure_per_unit_area_df2': {'key': 'binding_energy_per_substructure_per_unit_area', 'uuid': '30bb4b62-7c3a-47ab-ade7-a5629d21ebff', 'value': 0.0078432520982447}, 'binding_energy_per_substructure_per_unit_area_rvv10': {'key': 'binding_energy_per_substructure_per_unit_area', 'uuid': '579f959e-32ba-40e7-838f-97c5f92e3f62', 'value': 0.0143165661721327}, 'db_id': '33913', 'delta_df2': {'key': 'delta', 'uuid': 'a87c3753-3976-42a4-8984-f92abd52f8b0', 'value': 0.107809821166031}, 'delta_rvv10': {'key': 'delta', 'uuid': '531ed4b5-0dec-4360-ba1f-c507c08cd0fa', 'value': 0.133133853742913}, 'formula': 'Ag2Pb2O2Br2', 'source_db': 'ICSD', 'spg': 'P4/nmm', 'uuid': '12d8ac54-c98e-4261-b8c3-07de5258cc4f'}], 'band_gap': {'key': 'band_gap', 'uuid': '89315c33-2f9b-41ab-b7d4-22aff0ae75f4', 'value': 1.25790023795923}, 'bands_2D': '7fbb633d-a224-46b7-84b1-74a3ceca81e0', 'bands_2D_y_max_lim': 5.41838796033316, 

## Clean up Campi archive data

In [6]:
log("----")
log("Parse Archive Campi data")
log("----")

cleaned_archive_data_1 = []
extracted_uuids = set()

for entry in ARCHIVE_DATA:
    log(entry)
    
    extracted_uuid = entry["as_extracted_2D_structure_uuid"]
    extracted_node = load_node(extracted_uuid)
    extracted_formula = extracted_node.get_formula(mode="hill")
    
    if extracted_uuid in extracted_uuids:
        log("    Error: DUPLICATE!")
        continue
    else:
        extracted_uuids.add(extracted_uuid)
        
    if "formula" in entry:
        json_elements = at.get_element_set(entry["formula"])
        if at.get_element_set(extracted_formula) != json_elements:
            log(f"    Error: formula conflict - {entry['formula']}, AiiDA: {extracted_formula}")
            continue
    
    try:
        parents_3d = at.validate_and_return_parents(entry, extracted_formula, log)
    except at.ParentsException as e:
        log(f"    Error: {e}")
        continue
    
    cleaned_formula = extracted_formula
    optimized_uuid = entry.get("optimized_2D_structure_uuid")
    
    if optimized_uuid:
        optimized_node = load_node(optimized_uuid)
        optimized_formula = optimized_node.get_formula(mode="hill")
        cleaned_formula = optimized_formula
        if at.get_element_set(optimized_formula) != at.get_element_set(extracted_formula):
            log(f"    Error: formula conflict - {optimized_formula}, {extracted_formula}")
            continue
    
    if "band_gap" in entry and entry["band_gap"]["value"] is None:
        # this is set for conductors. But it basically means zero gap.
        entry["band_gap"]["value"] = 0.0
    
    # get pydantic fields
    formatted_entry = {
        "formula": cleaned_formula,
        "as_extracted_2D_structure_uuid": extracted_uuid,
        "all_3D_parents": parents_3d,
        "space_group_number": at.get_space_group_number(entry.get("space_group")),
        "prototype": entry.get("prototype"),
        "abundance": entry.get("abundance"),
        "optimized_2D_structure_uuid": optimized_uuid,
        "bands_uuid": entry.get("bands_uuid"),
        "band_gap": at.check_and_return_aiida_prop(entry.get("band_gap")),
    }
    
    try:
        # create the pydantic object for validation
        pydantic_entry = at.StructureEntry(**formatted_entry)
        # turn it back into a python dictionary when appending
        # note: probably good to not exclude unset or None, just to have a better overview of the data structure
        cleaned_archive_data_1.append(pydantic_entry.model_dump())
    except at.ValidationError as e:
        log(f"    Error - pydantic: {e.json()}")
        continue
    
            

----
Parse Archive Campi data
----
{'abundance': 5.0000000000000004e-08, 'initial_3D_spg': 'P2_1/c', 'band_gap': {'key': 'band_gap', 'value': 4.006325670648691, 'uuid': 'f14bb2a8-9d7c-4727-9570-ee071f6e7531'}, 'number_of_atoms': 38, 'bands_uuid': '019ef4b9-b184-4643-9fb7-fa21c8d3a058', 'initial_3D_bulk_structure_uuid': 'bea3c00d-bb9d-4c81-a06d-e18145efe657', 'initial_3D_formula': 'Zn2In4Se8O24', 'point_group': 'm', 'binding_energy_per_substructure_per_unit_area_df2': {'value': 0.0149816350662265, 'key': 'binding_energy_per_substructure_per_unit_area', 'uuid': 'bd666977-a210-457f-a860-2b1a9211b8a3'}, 'formula': 'ZnIn2Se4O12', 'all_3D_parents': [{'db_id': '263061', 'uuid': 'bea3c00d-bb9d-4c81-a06d-e18145efe657', 'binding_energy_per_substructure_per_unit_area_df2': {'value': 0.0149816350662265, 'key': 'binding_energy_per_substructure_per_unit_area', 'uuid': 'bd666977-a210-457f-a860-2b1a9211b8a3'}, 'source_db': 'ICSD', 'formula': 'Zn2In4Se8O24', 'spg': 'P2_1/c'}], 'initial_3D_source_db': '

In [7]:
len(cleaned_archive_data_1)

2936

## Check Archive and MC Campi consistency

The MC Campi data only contains the optimized structures, but it also contains extra information:

* if the 2d structure originates from Mounet, or is a completely new structure;
* phonons, fermi energy

Check that the data is consistent, and include this extra information in the archive data.

In [8]:
# BANDS consistency - preliminary checks show that the mc data has mismatching bands uuids
# in many cases. To investigate which one should be preferred, pick two conflicting ones:
bands_mc = load_node("2f281790-2db6-4ca5-b1f5-ab27694dd993")
bands_archive = load_node("964070b7-9065-4552-a693-597cc29818b8")

# opt structure and band gap uuids are the same, load these as well:
band_gap = load_node("41f7622d-19cd-4385-95ba-ab133ba5ac8a")
opt_struct = load_node("2031007b-e65b-475a-bb27-58abb1f55b47")

# check ctimes
print(bands_mc.ctime)
print(bands_archive.ctime)
print(band_gap.ctime)
print(opt_struct.ctime)
# everything but bands_archive seems to be calculated together.

# check input structures
print(opt_struct.cell)
print(bands_mc.creator.inputs.structure.cell)
print(bands_archive.creator.inputs.structure.cell)

# bands_mc uses the correct optimized structure, while bands_archive seems to use a
# completely different one
# as a result - just replace the Archive band uuids with the MC ones.


2018-11-26 14:53:27.381997+00:00
2019-04-13 09:08:19.306371+00:00
2018-11-26 14:55:21.955119+00:00
2018-11-26 13:10:49.309084+00:00
[[4.69390559772059, 0.0, 0.0], [0.0, 10.1623902135053, 0.0], [0.0, 0.0, 28.4138875414]]
[[4.69390559772059, 0.0, 0.0], [0.0, 10.1623902135053, 0.0], [0.0, 0.0, 28.4138875414]]
[[5.34529441092952, 0.0, 0.0], [0.0, 8.89053102039853, 0.0], [0.0, 0.0, 28.4138875413998]]


In [9]:

log("----\nArchive and MC consistency check\n----")

# 2nd iteration of the data cleaning
cleaned_archive_data_2 = []

for _entry in cleaned_archive_data_1:
    
    entry = copy.deepcopy(_entry)
    
    opt_structure_uuid = str(entry.get("optimized_2D_structure_uuid"))
    
    mc_entry = MC_CAMPI_DATA.get(opt_structure_uuid)
    
    if mc_entry:
        log(opt_structure_uuid)
        
        mc_extracted_uuid = mc_entry.get("structure_unrelaxed_2D")
        if mc_extracted_uuid:
            if str(entry.get("as_extracted_2D_structure_uuid")) != mc_extracted_uuid:
                log("    Error: extracted uuid mismatch!")
                continue
        
        # ignore parents for now, as it's cumbersome and they should already match
        
        mc_bands_uuid = mc_entry.get("bands_2D")
        if mc_bands_uuid:
            bands_uuid = str(entry.get("bands_uuid"))
            if mc_bands_uuid != bands_uuid:
                entry["bands_uuid"] = mc_bands_uuid
                log(f"    Warning, replaced bands_uuid {bands_uuid} with {mc_bands_uuid}")
        
        mc_band_gap = mc_entry.get("band_gap")
        if mc_band_gap:
            band_gap = entry.get("band_gap")
            if (band_gap is None
                or mc_band_gap['uuid'] != str(band_gap.get('uuid'))
                or mc_band_gap['value'] != band_gap.get('value')
            ):
                entry["band_gap"] = {
                    'uuid': mc_band_gap['uuid'],
                    'value': mc_band_gap['value']
                }
                log(f"    Warning, replaced band_gap {band_gap} with {entry['band_gap']}")

        if "fermi_energy" in mc_entry:
            # this doesn't currently comply with pydantic, but we'll fix it later
            entry["fermi_energy"] = mc_entry["fermi_energy"]
            
        if "phonons_2D" in mc_entry:
            entry["phonon_bands_uuid"] = mc_entry["phonons_2D"]
        
        if mc_entry["_file"] == "MC_olddb_merged.json":
            # exfoliated in Mounet18, rest of calculations in Campi23
            entry["citations"] = ['Mounet18', "Campi23"]
        else:
            # new structure in Campi23
            entry["citations"] = ["Campi23"]
            
    cleaned_archive_data_2.append(entry)


----
Archive and MC consistency check
----
47df4485-5c81-4597-921b-70f06df6a848
aeaa0d21-ef60-427a-acaa-8f038f40057c
692253ed-571b-478e-b209-f47a34e292e2
a888c559-95fb-4119-b70e-0545ed88f830
42f78dcd-0a4b-4ce6-b71c-a02f514169b9
8c4493b4-d4c5-4d17-bbeb-326693ad696f
e599eb61-9be6-400e-ad74-6c2ac7fe2895
6f3e35e9-55fd-43f7-b08d-f070ad249cbc
b965f47e-0bbc-424f-8cc6-f84c21f9df8a
5b16acf7-9ee2-43d0-92d9-85d12957cdea
04f3a93a-01aa-47e8-8e3e-abcdaad9488d
2031007b-e65b-475a-bb27-58abb1f55b47
c4c41fbb-baf0-4460-9ccd-af222d8965a2
0125375f-3558-4fa3-860d-92ac9897b276
659c0e6a-371a-4117-837b-c5447379e222
4b379010-addf-44ef-ba24-d32b563d6801
8dfca540-30f4-4205-8c12-b225badde670
cbff6d3d-3032-43a5-a0a6-7d9c253882b5
e8646c2e-1114-4b71-bab2-75513f37d016
bb65b0e8-e91e-48f8-b4b8-4498345ed953
f522528a-7d82-40bd-b18f-12e20d4001dd
d81d41a1-17f2-47e2-a18d-faee79407195
8aa6ce8f-87a5-4336-aea5-bef22e72bb9e
82c7652b-f893-42b1-b4eb-ffeb7ff7f2be
31b8575d-11d9-4879-9741-c26cf5d484e6
17742011-ac92-4711-87f1-381187e8

In [10]:
log("----")
log("Does every entry with a bands_uuid have a fermi_energy and band_gap?")
log("----")

for entry in cleaned_archive_data_2:
    
    bands_uuid = entry.get("bands_uuid")
    if bands_uuid:
        band_gap_missing = entry.get("band_gap") is None
        fermi_energy_missing = entry.get("fermi_energy") is None
        if band_gap_missing or fermi_energy_missing:
            log(entry)
        if band_gap_missing:
            log("    Missing band_gap!")
        if fermi_energy_missing:
            log(f"    Missing fermi_energy!")


----
Does every entry with a bands_uuid have a fermi_energy and band_gap?
----
{'formula': 'GaO8S2', 'as_extracted_2D_structure_uuid': UUID('98cdcfb9-5f3d-4013-b5de-620ed4001861'), 'all_3D_parents': [{'formula': 'GaNO8S2', 'source_db': 'MPDS', 'source_db_id': 'S307100', 'space_group_number': 150, 'initial_structure_uuid': UUID('89c2642b-491f-4b1d-865f-7fbfc18c6697'), 'binding_energy_df2': {'value': 0.09236007954646058, 'uuid': UUID('7da310c8-f0d4-485c-a42a-a187540766a0')}, 'binding_energy_rvv10': None, 'delta_df2': None, 'delta_rvv10': None, 'opt_structure_df2_uuid': UUID('61d7be45-f8b4-4ceb-af7d-e13f778e0f47'), 'opt_structure_revpbe_uuid': None, 'opt_structure_rvv10_uuid': None}], 'space_group_number': 150, 'prototype': 'Al2H8Mg', 'abundance': 1.9e-05, 'optimized_2D_structure_uuid': UUID('7e8f17e9-41d9-4ddc-b43e-b46b52b2655b'), 'citations': ['Campi23'], 'bands_uuid': UUID('e15083fd-d6a9-452d-8c0d-0e6d36b74ed5'), 'band_gap': None, 'fermi_energy': None, 'magnetic_state': None, 'absolute

{'formula': 'CS2Ta2', 'as_extracted_2D_structure_uuid': UUID('b8472668-1670-409c-849c-d68851c782d8'), 'all_3D_parents': [{'formula': 'CS2Ta2', 'source_db': 'ICSD', 'source_db_id': '23791', 'space_group_number': 166, 'initial_structure_uuid': UUID('a08472f3-2783-46b3-8848-736d7367ad19'), 'binding_energy_df2': {'value': 0.0235296754264423, 'uuid': UUID('1b956c81-baed-4c02-957b-d6cf53d85fd5')}, 'binding_energy_rvv10': {'value': 0.0310283053352772, 'uuid': UUID('2f5243de-d926-41bf-bb05-c743e2fe2ce4')}, 'delta_df2': {'value': 0.188492836654579, 'uuid': UUID('fe7c1033-ed81-4c25-86d8-2d767c112552')}, 'delta_rvv10': {'value': 0.177834387914273, 'uuid': UUID('1bbcac7d-601d-4dba-a16b-10d1267db9ba')}, 'opt_structure_df2_uuid': UUID('5164fd8b-2135-4eb0-852d-0fd8e561ad52'), 'opt_structure_revpbe_uuid': None, 'opt_structure_rvv10_uuid': UUID('3dafbec4-285c-4242-81bc-98dcc5b8a43c')}, {'formula': 'CS2Ta2', 'source_db': 'ICSD', 'source_db_id': '23790', 'space_group_number': 164, 'initial_structure_uuid

In [11]:
# a lot of missing data. query from AiiDA

# test
at. query_band_properties("019ef4b9-b184-4643-9fb7-fa21c8d3a058")

{'structure': <StructureData: uuid: fa30cda5-6396-45ee-8f5d-d68a6994c474 (pk: 10381100)>,
 'band_gap': {'uuid': 'f552ef2c-c5f5-4f64-a3e5-f647c5bec09b',
  'value': 4.00632567064869},
 'fermi_energy': {'uuid': '1619cf74-82da-4222-943c-547008bee6cb',
  'value': [-4.65514997227876]}}

In [12]:
class BandsException(Exception):
    pass

log("----")
log("Querying fermi energy and band gap from Aiida for existing band uuids.")
log("----")

# 3rd data cleaning iteration
cleaned_archive_data_3 = []

for _entry in cleaned_archive_data_2:
    entry = copy.deepcopy(_entry)
    
    bands_uuid = entry.get("bands_uuid")
    
    try:
        if bands_uuid:
            log(bands_uuid)
            
            queried_bands_props = at.query_band_properties(bands_uuid)
            
            if queried_bands_props is None:
                raise BandsException(f"Couldn't find band properties: {bands_uuid}")
                
            # make sure queried structure is equivalent to the optimized one one
            q_struct = queried_bands_props["structure"]
            opt_struct = load_node(entry.get("optimized_2D_structure_uuid"))
            if q_struct.uuid != opt_struct.uuid:
                # not the same structure node, check formula and cell
                if q_struct.get_formula() != opt_struct.get_formula():
                    raise BandsException("different formula")
                if not np.allclose(q_struct.cell, opt_struct.cell, atol=0.1, rtol=0.1): # allow 0.1 angstrom and 10% diff
                    raise BandsException("bands were calculated for a structure with a very different cell.")
            
            fermi = entry.get("fermi_energy")
            q_fermi = queried_bands_props["fermi_energy"]
            if fermi:
                if [fermi] != q_fermi["value"]:
                    log(f"   Warning: fermi energy different, overwriting. {fermi} vs {q_fermi['value']}")
            
            entry["fermi_energy"] = q_fermi
            
            band_gap = entry.get("band_gap")
            qbg = queried_bands_props["band_gap"]
            if band_gap:
                if not np.isclose(band_gap['value'], qbg['value']):
                    # i think the old notation linked the calcfunction uuid here, don't check that 
                    log(f"   Warning: band gaps different, overwriting. {band_gap} vs {qbg}")
            entry["band_gap"] = qbg
    except BandsException as e:
        log(f"    Error: {str(e)}")
        log(f"    Removing bands data.")
        entry.pop("bands_uuid", None)
        entry.pop("band_gap", None)
        entry.pop("fermi_energy", None)
        
    
    cleaned_archive_data_3.append(entry)

----
Querying fermi energy and band gap from Aiida for existing band uuids.
----
019ef4b9-b184-4643-9fb7-fa21c8d3a058


c85e8621-9638-4d91-9f28-f791c1139b1a
cdd40b78-fe58-46eb-a8df-d804e0e9cae2
57961ee1-fd5e-4a47-972f-53589aff8a80
c796dd33-2233-4f95-b38e-d563aabb34e6
94836484-bee0-427f-a15d-015e5cd02000
1d033941-8285-4c24-8dc4-d2fa03de5777
272a24d5-7b2d-46ce-a1b3-db73fd5c40c2
6f6ca5e1-d157-4831-9727-eee0019cf128
9fef6fd3-4f61-4638-8868-dfbe26e37a04
95e6d001-2f20-4504-b2b5-707135adc43f
2f281790-2db6-4ca5-b1f5-ab27694dd993
b300079f-cd3e-4e7b-b9ea-d2f396181f16
48219629-0618-446d-9dd5-f86dae6fbeb3
90c9c97a-1fea-4a91-ae03-69fe7fbd2e72
5106776d-a323-488d-b340-88231c006dbf
750072ba-8c4b-4d3f-a595-febd5b05ba4b
48886bd6-87dc-49dc-993f-c241b44340c4
9ac2506b-f933-4952-bb35-ef5416e334bc
b13e8d9e-32b8-4513-8dd3-66c342a4b5ff
da11eda9-3172-4057-841d-4b6331a2270a
03befb56-ab8f-4f4d-b744-cf81e06fc0a4
b73a66d6-1857-4f4f-ae41-fd25b6b8e238
e98e5ead-b267-45cb-abf9-8de93486ee86
d81ee782-bb4c-4db1-99ae-9f6ac5a329f6
6bcbc0aa-6b49-45bb-a1fa-713d302393b1
6f860547-2a47-4763-817a-9836821b1c67
baab8956-f80a-495b-a7ed-f2cd53d61465
3

### How to determine if a structure was exfoliated in Mounet18 or Campi23

The Mounet18 archive entry (https://archive.materialscloud.org/record/2024.157) says that
* 258 are exfoliated + computationally investigated;
* another 1567 are just exfoliated.

The file `MC_olddb_merged.json` contains 1556 materials, which i assume covers the 2nd group. These are already included in the "Archive data".

For the first group, i can use the Mounet dataset, but i need to compare structures directly to see if they match.

In [13]:
from pymatgen.analysis.structure_matcher import StructureMatcher

matcher = StructureMatcher() # default settings
# looser settings
#matcher = StructureMatcher(ltol=0.4, stol=0.6, angle_tol=10)
#matcher = StructureMatcher(ltol=0.8, stol=1.2, angle_tol=20)

def get_pymatgen_structure(aiida_structure_node):
    c = copy.deepcopy(aiida_structure_node)
    c.pbc = (True, True, True)
    return c.get_pymatgen_structure()

In [14]:
collected_mounet_data = defaultdict(lambda: [])

for entry in cleaned_mc_mounet_data:
    key = entry["formula"]
    aiida_struct_opt = load_node(entry["optimized_2D_structure_uuid"])
    aiida_struct_ext = load_node(entry["as_extracted_2D_structure_uuid"])
    collected_mounet_data[key].append((entry, aiida_struct_opt, aiida_struct_ext))
    if len(collected_mounet_data[key]) > 1:
        print(entry["formula"], len(collected_mounet_data[key]))

Bi2STe2 2
Br2Hf2N2 2
Br2N2Zr2 2
Cl2Zn 2
Cu2I2 2
Ga2S2 2
GeI2 2
I2N2Zr2 2
In2Se2 2
MoS2 2
NbS2 2
NbSe2 2
O2Sn2 2
S2Ta 2
Sb2Se2Te 2
Se2Ta 2


In [15]:
# apply [Mounet18, Campi23] for the matches, as they still were studied again with Campi23 methods.
# apply also a label "_mounet_duplicate" = True, which allows later to filter these out for the MC dataset

count = 0
for entry in cleaned_archive_data_3:
    key = entry["formula"]
    mounet_entries = collected_mounet_data.get(key)
    
    if mounet_entries:
        opt_structure_uuid = entry.get("optimized_2D_structure_uuid")
        ext_structure_uuid = entry.get("as_extracted_2D_structure_uuid")
        
        # prefer the optimized structure
        if opt_structure_uuid:
            campi_aiida_node = load_node(opt_structure_uuid)
        else:
            campi_aiida_node = load_node(ext_structure_uuid)
        
        campi_pmg = get_pymatgen_structure(campi_aiida_node)
        
        for mounet_entry in mounet_entries:
            if opt_structure_uuid:
                mounet_aiida_node = mounet_entry[1]
            else: 
                mounet_aiida_node = mounet_entry[2]
            mounet_pmg = get_pymatgen_structure(mounet_aiida_node)
            match = matcher.fit(mounet_pmg, campi_pmg)
            if match:
                count += 1
                log(str(entry))
                log(f"    matches with Mounet's: {mounet_entry[0]['optimized_2D_structure_uuid']}")
                entry["citations"] = ['Mounet18', 'Campi23']
                entry["_mounet_duplicate"] = True
print("total maches: ", count)

{'formula': 'Cu2Te2', 'as_extracted_2D_structure_uuid': UUID('eeff5a28-9b37-4c5a-80c6-3905105bba2e'), 'all_3D_parents': [{'formula': 'Bi2Cu2O2Te2', 'source_db': 'ICSD', 'source_db_id': '187824', 'space_group_number': 129, 'initial_structure_uuid': UUID('519c7e52-5281-433e-adcc-185ede9130f1'), 'binding_energy_df2': {'value': 0.107867810351554, 'uuid': UUID('0ceaa24e-343f-47ef-8dcf-70d6594a202e')}, 'binding_energy_rvv10': {'value': 0.102130602284505, 'uuid': UUID('fc030cfc-5c09-4fdd-bc02-b0b938c407b6')}, 'delta_df2': None, 'delta_rvv10': None, 'opt_structure_df2_uuid': UUID('ec4cce42-2102-404b-a6f3-18faa2ab5af0'), 'opt_structure_revpbe_uuid': None, 'opt_structure_rvv10_uuid': UUID('19115510-6d98-41cd-ba30-495b70c032d6')}], 'space_group_number': 129, 'prototype': 'FeSe', 'abundance': 9.9e-10, 'optimized_2D_structure_uuid': UUID('0e405da8-2f58-4d3a-bbf5-e81f62b4243b'), 'citations': ['Mounet18', 'Campi23'], 'bands_uuid': UUID('c828d6f1-14ee-4a0b-a984-8107bd2222a5'), 'band_gap': {'uuid': 'ac

In [16]:
# for the rest, add just "Campi23"
count2 = 0
for entry in cleaned_archive_data_3:
    citations = entry.get("citations")
    if citations is None:
        entry["citations"] = ["Campi23"]
        count2 += 1
print("added: ", count2)

added:  267


## Export the cleaned data files for a new Campi archive entry

In [17]:
# final pydantic validation (everything should be good)
# don't include None values, as that can cause confusion
# (e.g. Campi data should not have any phonon or magnetization keys)

# for the archive entry json file, skip some fields.
skip_fields = [
    "fermi_energy",
    "citations",
    "phonon_bands_uuid",
    "_mounet_duplicate",
]

export_entries = []

for entry in cleaned_archive_data_3:
    try:
        pydantic_entry = at.StructureEntry(**entry)
        tmp_entry = pydantic_entry.model_dump(exclude_unset=True, exclude_none=True)
        for skip_field in skip_fields:
            tmp_entry.pop(skip_field, None)
        export_entries.append(tmp_entry)
    except at.ValidationError as e:
        log(f"    Error - pydantic: {e.json()}")

In [18]:
# Export the cleaned json file

with open("./campi-v2/structure_2d_new.json", "w") as fh:
    json.dump(export_entries, fh, default=str, indent=2, sort_keys=True)

def rm_icsd_mpds_uuids(entries_list):
    entries_updated = copy.deepcopy(entries_list)
    for entry in entries_updated:
        for parent in entry["all_3D_parents"]:
            if parent["source_db"] in ["ICSD", "MPDS"]:
                parent["initial_structure_uuid"] = None
    return entries_updated

with open("./campi-v2/structure_2d_new_rm-icsd-mpds-uuids.json", "w") as fh:
    json.dump(rm_icsd_mpds_uuids(export_entries), fh, default=str, indent=2, sort_keys=True)

In [19]:
# Export also the structures as cif/xsf files

extracted_files_path = Path("./campi-v2/as_extracted_2d_structures")
extracted_files_path.mkdir(exist_ok=True)
if len(list(extracted_files_path.iterdir())) != 0:
    print(f"{extracted_files_path} not empty, skipping export")
else:
    for entry in export_entries:
        extracted_uuid = entry["as_extracted_2D_structure_uuid"]
        sd_node = load_node(extracted_uuid)
        sd_node.export(extracted_files_path / f'{extracted_uuid}.xsf')
        sd_node.export(extracted_files_path / f'{extracted_uuid}.cif')

campi-v2/as_extracted_2d_structures not empty, skipping export


### Export data for Materials Cloud

Merge the Mounet and Campi sets together (only optimized structures). Remove the structures from Campi dataset that match with a Mounet one.

In [20]:
all_mc_data = list(cleaned_mc_mounet_data)

# add entries from Campi data, skipping duplicate and unoptimized ones
for entry in cleaned_archive_data_3:
    if entry.get("optimized_2D_structure_uuid") is None:
        continue
    if entry.get("_mounet_duplicate"):
        continue
    all_mc_data.append(entry)

In [21]:
# make sure these entries have some fields that are specified as optional in pydantic
required_fields = [
    "space_group_number",
    "abundance",
    "optimized_2D_structure_uuid",
    "citations",
]

for entry in all_mc_data:
    if any([field not in entry for field in required_fields]):
        log(f"Error, missing fields for {entry}")
# all is fine.

In [22]:
with open("./new-mc-data.json", "w") as fh:
    json.dump(rm_icsd_mpds_uuids(all_mc_data), fh, default=str, indent=2, sort_keys=True)

### Prepare a new AiiDA archive

* create a group in main aiida database with all the uuids (from cleaned json and mounet data);
* export the group into an aiida archive;
* import the aiida archive into a new profile;
* remove commercial 3D structures;
* set up final groups;
* create a final export.

In [23]:
# collect all uuids by type.

TOP_LEVEL_UUID_KEYS = [
    "as_extracted_2D_structure_uuid",
    "optimized_2D_structure_uuid",
    "bands_uuid",
    "band_gap",
    "fermi_energy",
    "phonon_bands_uuid",
]

PARENTS_UUID_KEYS = [
    "initial_structure_uuid",
    "binding_energy_df2",
    "binding_energy_rvv10",
    "delta_df2",
    "delta_rvv10",
    "opt_structure_df2_uuid",
    "opt_structure_revpbe_uuid",
    "opt_structure_rvv10_uuid",
]

campi_uuid_collections = defaultdict(lambda: set())
mounet_uuid_collections = defaultdict(lambda: set())

def collect_uuids(entries):
    collections = defaultdict(lambda: set())
    for entry in entries:
        for key in TOP_LEVEL_UUID_KEYS:
            if key in entry:
                if isinstance(entry[key], dict):
                    uuid = entry[key]["uuid"]
                else:
                    uuid = entry[key]
                if uuid:
                    collections[key].add(uuid)
        for key in PARENTS_UUID_KEYS:
            for parent in entry["all_3D_parents"]:
                if key in parent:
                    if isinstance(parent[key], dict):
                        uuid = parent[key]["uuid"]
                    else:
                        uuid = parent[key]
                    if uuid:
                        collections[f'parents.{key}'].add(uuid)
    return collections

campi_uuid_collections = collect_uuids(cleaned_archive_data_3)
mounet_uuid_collections = collect_uuids(cleaned_mc_mounet_data)

# print collections and counts:
print("campi:")
for col in campi_uuid_collections:
    print("    ", col, len(campi_uuid_collections[col]))
print("mounet:")
for col in mounet_uuid_collections:
    print("    ", col, len(mounet_uuid_collections[col]))
    

campi:
     as_extracted_2D_structure_uuid 2936
     optimized_2D_structure_uuid 2683
     bands_uuid 2437
     band_gap 2437
     fermi_energy 2437
     parents.initial_structure_uuid 3013
     parents.binding_energy_df2 2918
     parents.opt_structure_df2_uuid 2671
     phonon_bands_uuid 407
     parents.binding_energy_rvv10 1642
     parents.opt_structure_rvv10_uuid 1542
     parents.delta_df2 1101
     parents.delta_rvv10 1071
mounet:
     as_extracted_2D_structure_uuid 258
     optimized_2D_structure_uuid 258
     bands_uuid 258
     band_gap 258
     fermi_energy 258
     phonon_bands_uuid 245
     parents.initial_structure_uuid 438
     parents.binding_energy_df2 431
     parents.binding_energy_rvv10 418
     parents.delta_df2 311
     parents.delta_rvv10 308
     parents.opt_structure_df2_uuid 255
     parents.opt_structure_revpbe_uuid 228
     parents.opt_structure_rvv10_uuid 250


In [24]:
# Make a group and add all relevant UUIDs to it

# group = orm.Group(label='mc2d-2024-10-24')
# group.store()

# group = load_group("mc2d_kristjan_2")

# for coll in [mounet_uuid_collections, campi_uuid_collections]:
#     for key in coll:
#         uuid_set = coll[key]
#         for uuid in uuid_set:
#             group.add_nodes(load_node(uuid))

In [25]:
# Make the .aiida archive and load it into a new profile

# ! verdi -p mat2D archive create --groups "mc2d-2024-10-24" -- mc2d-2024-10-24_prelim.aiida

# set up a new profile or just import to an existing one

# ! verdi archive import mc2d-2024-10-24_prelim.aiida

In [26]:
aiida.load_profile("mc2d_kristjan_3", allow_switch=True)

Profile<uuid='c77f1981872c4db5b8878ff54c1af023' name='mc2d_kristjan_3'>

In [38]:
from aiida.common.exceptions import NotExistent

# Find all parent uuids and pks (needed for delete_nodes) by source db:

parent_ids = {"ICSD": set(), "MPDS": set(), "COD": set()}

for entry in cleaned_mc_mounet_data + cleaned_archive_data_3:
    for parent in entry["all_3D_parents"]:
        uuid = parent["initial_structure_uuid"]
        pk = None
        try:
            pk = load_node(uuid).pk
        except NotExistent:
            pass
        parent_ids[parent["source_db"]].add((uuid, pk))
        
print(len(parent_ids["ICSD"]))
print(len(parent_ids["MPDS"]))
print(len(parent_ids["COD"]))

1489
873
770


In [39]:
commercial_uuids = [n[0] for n in parent_ids["ICSD"]] + [n[0] for n in parent_ids["MPDS"]]

# strangely, there seems to be a duplicate.
# could ICSD and MPDS have the same structure? ignore for now...
commercial_uuids_set = set()
for uuid in commercial_uuids:
    if uuid in commercial_uuids_set:
        print(f"Duplicate {uuid}")
    commercial_uuids_set.add(uuid)

print(len(commercial_uuids))
print(len(commercial_uuids_set))

Duplicate d4ab5770-d136-4623-9b35-96d0a8029eed
2362
2361


In [40]:
# Commercial StructureData nodes also have ancestor nodes that contain commercial info.
# for each uuid, also delete any ancestor Process and CifData node

from aiida.tools import delete_nodes

def get_ancestor_pks(pk, node_type):
    qb = orm.QueryBuilder().append(
        orm.StructureData, filters = {"id": pk}, tag="root"
    ).append(
        node_type, with_descendants = "root", project="*"
    )
    return [n[0].pk for n in qb.all()]

for uuid, pk in list(parent_ids["ICSD"]) + list(parent_ids["MPDS"]):
    
    ancestor_process_pks = get_ancestor_pks(pk, orm.ProcessNode)
    ancestor_cif_pks = get_ancestor_pks(pk, orm.CifData)
    
    deleted = delete_nodes(
        [pk] + ancestor_process_pks + ancestor_cif_pks,
        dry_run=False, create_forward=False, call_calc_forward=False, call_work_forward=False
    )


TypeError: one of the starting_pks is not of type int:
 [None]

In [33]:
# make groups based on the collected UUIDs

from aiida.orm import load_group

def set_up_groups(group_collection_mapping, uuid_collections):
    for key, group_name in group_collection_mapping.items():
        uuid_set = uuid_collections[key]
        try:
            group = load_group(group_name)
        except NotExistent:
            group = orm.Group(label=group_name).store()
        for uuid in uuid_set:
            try:
                node = load_node(uuid)
                group.add_nodes(node)
            except NotExistent:
                print(f"Warning: {key} node does not exist: {uuid}")

mounet_group_mapping = {
    "optimized_2D_structure_uuid":  "mounet18_optimized_2d_structures",
    "bands_uuid":                   "mounet18_bands",
    "phonon_bands_uuid":            "mounet18_phonon_bands",
}

set_up_groups(mounet_group_mapping, mounet_uuid_collections)

In [37]:
campi_group_mapping = {
    "as_extracted_2D_structure_uuid": "campi23_extracted_2d_structures",
    "optimized_2D_structure_uuid": "campi23_optimized_2d_structures",
    "band_gap": "campi23_band_gap",
    "bands_uuid": "campi23_bands",
    "parents.initial_structure_uuid": "campi23_3d_parents",
    "parents.opt_structure_df2_uuid": "campi23_optimized_3d_parents_df2",
    "parents.opt_structure_rvv10_uuid": "campi23_optimized_3d_parents_rvv10",
    "parents.binding_energy_df2": "campi23_binding_energy_df2",
    "parents.binding_energy_rvv10": "campi23_binding_energy_rvv10",
    "parents.delta_df2": "campi23_delta_df2",
    "parents.delta_rvv10": "campi23_delta_rvv10",
}

set_up_groups(campi_group_mapping, campi_uuid_collections)



In [None]:
# make the groups that are mentioned in readme.txt
# expanded_two_dimensional_database_initial_2D_structures_as_extracted_from_3D_parent - just alias of campi23_extracted_2d_structures
# expanded_two_dimensional_database_bands_relaxed_2D_structures_pbe - alias of campi23_bands
# expanded_two_dimensional_database_binding_energies_from_all_3D_parents - merge of campi23_binding_energy_df2 and campi23_binding_energy_rvv10

# ! verdi group copy campi23_extracted_2d_structures expanded_two_dimensional_database_initial_2D_structures_as_extracted_from_3D_parent
# ! verdi group copy campi23_bands expanded_two_dimensional_database_bands_relaxed_2D_structures_pbe
# ! verdi group copy campi23_binding_energy_df2 expanded_two_dimensional_database_binding_energies_from_all_3D_parents
# ! verdi group copy campi23_binding_energy_rvv10 tmp_group
# ! verdi group move-nodes -a -s tmp_group -t expanded_two_dimensional_database_binding_energies_from_all_3D_parents
# ! verdi group delete tmp_group

In [None]:
# Create the final archive

# nohup verdi archive create -a mc2d_20241024.aiida &