# Preparing hMOFX-DB benchmark set:

### Package requirements: mofdb_client, tqdm, pandas, numpy, ase, rdkit, h5py

**Notes:**
- Atomic point clouds: Stored in a single .h5 (HDF5) file, one dataset per structure. Each dataset is keyed by the structure ID (converted to a string) and holds that structure's atomic point cloud array.
- Target values and adsorption conditions: Stored as .csv files following this column scheme: | ID | P | T | Gas | Forcefield | Uptake(mol/kg) |. It is imperative to maintain these column names for proper integration with the model training loop.
- Documentation for the mofdb_client API can be found in the official GitHub repo (https://github.com/n8ta/mofdb-client).

- **Run the below code cells in a sequential manner to construct the benchmark set**.
- **When performing train/val/test splits, ensure you are splitting according to structural IDs.** The same structure (under different adsorption conditions) must not appear in more than one split. This can be done by splitting on the unique values of the ID column.

### Definitions

In [1]:
# Paths for saving generated data:
point_cloud_h5_save_path = "datasets/hMOFXDB_structures.h5"
target_and_conditions_csv_path = "datasets/hMOFXDB_target_and_adsorption_conditions.csv"

# CAUTION: MOFX-DB stores structural information files for MOFs in JSON format. However, to parse these structures, ASE requires actual files with a .cif extension. To this end, we select a directory which will be used to temporarily write and delete .cif files.
# CAUTION: This notebook will perform 130k+ writes and deletes throughout its run. Ensure you run it in an environment which supports writing many files.
cif_working_dir = "datasets/working_dir"

# Number of MOF unit cells (supercell transformation matrix passed to ASE's make_supercell):
supercell_matrix = [[1,0,0],[0,1,0],[0,0,1]] # Identity matrix: ensures the generation of a single unit cell (recommended).

### Load Libraries

In [2]:
from mofdb_client import fetch
from tqdm import tqdm
import pandas as pd
import os
import numpy as np
import h5py

from ase.io import read, write
from ase.build import make_supercell
from ase.data import atomic_numbers
from rdkit import Chem
from rdkit.Chem import AllChem

### Definitions for Atomic Properties

In [3]:
# Per-element electronegativity feature, keyed by element symbol (entries for H through Og).
# Passed to featureize_atomic_point_cloud, which looks symbols up with .get(el, 0.0),
# so any symbol without an entry here would be encoded as 0.0.
# NOTE(review): the values are not raw Pauling electronegativities; they appear to be
# rescaled so that carbon == 1.0. The source and normalisation are not shown in this
# notebook - confirm before changing or extending the table.
pauling_en = {
    "H": 0.8492077756084468, "He": 0.9234948886510139, "Li": 0.47493569014597764, "Be": 0.6642111641550714, 
    "B": 0.840168050416806, "C": 1.0, "N": 1.1258799375612023, "O": 1.2909944487358056,
    "F": 1.3693063937629155, "Ne": 1.4194806702221516, "Na": 0.5723140958578757, "Mg": 0.6878662178008077, 
    "Al": 0.8006407690254357, "Si": 0.908623583766197, "P": 0.9989599581169395, "S": 1.1050160600213705, 
    "Cl": 1.1924392738883696, "Ar": 1.2738116634126702, "K": 0.5089466451388751, "Ca": 0.6201736729460421, 
    "Sc": 0.6839411288813299, "Ti": 0.731096616353433, "V": 0.7538648981947593, "Cr": 0.7469990402967646, 
    "Mn": 0.835538990118209, "Fe": 0.8637251994466477, "Co": 0.9004503377814961, "Ni": 0.9217648016985401, 
    "Cu": 0.8731338026686615, "Zn": 0.9223432547217107, "Ga": 0.9646352117828865, "Ge": 1.0380552995055379, 
    "As": 1.0961412988206825, "Se": 1.1758511788040868, "Br": 1.2403473458920846, "Kr": 1.2756249193674616, 
    "Rb": 0.49168917189444195, "Sr": 0.596246052824969, "Y": 0.6517120879556719, "Zr": 0.6870429186215166, 
    "Nb": 0.6629935441317955, "Mo": 0.7023610444702197, "Tc": 0.8056292332943622, "Ru": 0.7745966692414834, 
    "Rh": 0.7922703501282296, "Pd": 1.1477402547212905, "Ag": 0.816741885599305, "Cd": 0.8591403680107819, 
    "In": 0.9014253790393105, "Sn": 0.9650485383226495, "Sb": 1.0190493307301367, "Te": 1.085955175195522, 
    "I": 1.1483385035264293, "Xe": 1.2055362602143995, "Cs": 0.46779577942376166, "Ba": 0.5792730788177665, 
    "La": 0.6201736729460422, "Ce": 0.6517120879556719, "Pr": 0.6113009170521597, "Nd": 0.6148041028495473, 
    "Pm": 0.6165784329539861, "Sm": 0.6183682144631293, "Eu": 0.6256864362310539, "Gd": 0.6400386879521874, 
    "Tb": 0.6256864362310539, "Dy": 0.6275569529190723, "Ho": 0.6294443465127104, "Er": 0.631348872337157, 
    "Tm": 0.6332707911583599, "Yb": 0.6219950386090711, "Lu": 0.6537204504606134, "Hf": 0.691548166360639, 
    "Ta": 0.7222199706358509, "W": 0.7623215819277851, "Re": 0.7963510440251993, "Os": 0.8190487086369509, 
    "Ir": 0.8588975014708029, "Pt": 0.8161135189404553, "Au": 0.8298105855798943, "Hg": 0.8687758883885015, 
    "Tl": 0.8951435925492911, "Pb": 0.9515506912134104, "Bi": 0.9812298519789698, "Po": 1.0517132668916793, 
    "At": 1.0922877925089394, "Rn": 1.1579018646071315, "Fr": 0.4771422345272372, "Ra": 0.5720228171406342, 
    "Ac": 0.6100888760865631, "Th": 0.6445033866354902, "Pa": 0.6400386879521874, "U": 0.6381534447172755, 
    "Np": 0.6362847629757779, "Pu": 0.6183682144631301, "Am": 0.6294443465127104, "Cm": 0.6457962733644264, 
    "Bk": 0.6256864362310547, "Cf": 0.6256864362310547, "Es": 0.631348872337157, "Fm": 0.6275569529190731, 
    "Md": 0.616578432953987, "No": 0.6113009170521605, "Lr": 0.6557474954819612, "Rf": 0.6804471538998222, 
    "Db": 0.7149123293218679, "Sg": 0.7461574507879195, "Bh": 0.7675923631762812, "Hs": 0.8036226816523603, 
    "Mt": 0.8352690695845574, "Ds": 0.8000150238973879, "Rg": 0.8400345102346997, "Cn": 0.907096861568455, 
    "Nh": 0.921095157729515, "Fl": 0.9548719951727265, "Mc": 0.9473309334313417, "Lv": 0.9573314559145735, 
    "Ts": 1.0309883912717273, "Og": 1.101199737705377
}

# Per-element hardness feature, keyed by element symbol.
# This table is NOT complete: there is no entry for Ne, Mg, Kr, Pm, Sm, Gd, Ho, Er,
# Hg, Rn, or any element after Ac. featureize_atomic_point_cloud looks symbols up with
# .get(el, 0.0), so atoms of those elements (e.g. Mg) are encoded with hardness 0.0.
# NOTE(review): units and data source are not stated in this notebook; the magnitudes
# look like chemical hardness in eV - confirm.
hardness = {
    "H": 6.422119502568, "He": 22.143693968, "Li": 2.3868328805, "Be": 5.8613495, 
    "B": 4.009148, "C": 4.9990885, "N": 7.967065, "O": 6.0784702500000005, 
    "F": 7.010815150000001, "Na": 2.29557535, "Al": 2.7764692, "Si": 3.3810809500000003, 
    "P": 4.870039500000001, "S": 4.141452985000001, "Cl": 4.677452499999999, "Ar": 13.629805600000001,
    "K": 1.9195967700000003, "Ca": 3.0443026, "Sc": 3.186745, "Ti": 3.3745600000000002, 
    "V": 3.1105935, "Cr": 3.050255, "Mn": 3.717009, "Fe": 3.8757339, 
    "Co": 3.60937677, "Ni": 3.2419385000000003, "Cu": 3.2456899999999997, "Zn": 4.6970995, 
    "Ga": 2.7846509, "Ge": 3.3333615, "As": 4.4925, "Se": 3.865861, 
    "Br": 4.2251109, "Rb": 1.8456039999999998, "Sr": 2.8234336, "Y": 2.9551299999999996, 
    "Zr": 3.1039499999999998, "Nb": 2.920722, "Mo": 3.172215, "Tc": 3.2846905, 
    "Ru": 3.15525, "Rh": 3.1609499999999997, "Pd": 3.8874299999999997, "Ag": 3.1371170000000004, 
    "Cd": 4.496911, "In": 2.7431776, "Sn": 3.1159250000000003, "Sb": 3.7811945000000002, 
    "Te": 3.5193920000000003, "I": 3.6961116, "Xe": 6.09292155, "Cs": 1.711139774, 
    "Ba": 2.533522, "La": 2.55345, "Ce": 2.4442999999999997, "Pr": 2.2555, 
    "Nd": 1.8045000000000002, "Eu": 2.4031925, "Tb": 2.3494, "Dy": 2.793525, 
    "Tm": 2.577655, "Yb": 3.1370799999999996, "Lu": 2.5429355, "Hf": 3.405535, 
    "Ta": 3.613785, "W": 3.523885, "Re": 3.84176, "Os": 3.6691150000000006, 
    "Ir": 3.7016099999999996, "Pt": 3.4154150000000003, "Au": 3.4584615, "Tl": 2.8656435, 
    "Pb": 3.52996822, "Bi": 3.171577, "Po": 3.2569999999999997, "At": 3.2587550000000003, 
    "Fr": 1.7933704500000003, "Ra": 2.5892120000000003, "Ac": 2.5151130000000004
}

# Per-element polarizability feature, keyed by element symbol.
# Covers H through Og except "Lv", which has no entry; featureize_atomic_point_cloud
# looks symbols up with .get(el, 0.0), so a missing symbol is encoded as 0.0.
# NOTE(review): the values appear to be expressed relative to carbon (C == 1.0) rather
# than in absolute units; the source and normalisation are not shown here - confirm.
polarizability = {
    "H": 0.3988592920353982, "He": 0.12245575221238937, "Li": 14.523230088495575, "Be": 3.3398230088495575, 
    "B": 1.8141592920353982, "C": 1.0, "N": 0.6548672566371682, "O": 0.4690265486725663, 
    "F": 0.3309734513274336, "Ne": 0.2354955752212389, "Na": 14.398230088495573, "Mg": 6.300884955752212, 
    "Al": 5.11504424778761, "Si": 3.300884955752212, "P": 2.2123893805309733, "S": 1.7168141592920352, 
    "Cl": 1.2920353982300883, "Ar": 0.9807964601769911, "K": 25.637168141592916, "Ca": 14.230088495575222, 
    "Sc": 8.584070796460177, "Ti": 8.849557522123893, "V": 7.699115044247787, "Cr": 7.345132743362831, 
    "Mn": 6.017699115044247, "Fe": 5.486725663716814, "Co": 4.867256637168142, "Ni": 4.336283185840708, 
    "Cu": 4.11504424778761, "Zn": 3.42212389380531, "Ga": 4.424778761061947, "Ge": 3.5398230088495573, 
    "As": 2.654867256637168, "Se": 2.557522123893805, "Br": 1.8584070796460175, "Kr": 1.4849557522123893, 
    "Rb": 28.300884955752213, "Sr": 17.451327433628318, "Y": 14.336283185840706, "Zr": 9.91150442477876, 
    "Nb": 8.672566371681416, "Mo": 7.699115044247787, "Tc": 6.991150442477876, "Ru": 6.371681415929203, 
    "Rh": 5.840707964601769, "Pd": 2.3132743362831856, "Ag": 4.867256637168142, "Cd": 4.070796460176991, 
    "In": 5.752212389380531, "Sn": 4.6902654867256635, "Sb": 3.805309734513274, "Te": 3.3628318584070795, 
    "I": 2.911504424778761, "Xe": 2.4176991150442477, "Cs": 35.477876106194685, "Ba": 24.07079646017699, 
    "La": 19.02654867256637, "Ce": 18.141592920353983, "Pr": 19.11504424778761, "Nd": 18.4070796460177, 
    "Pm": 17.699115044247787, "Sm": 16.991150442477874, "Eu": 16.283185840707965, "Gd": 13.982300884955752, 
    "Tb": 15.044247787610619, "Dy": 14.424778761061946, "Ho": 13.805309734513273, "Er": 13.27433628318584, 
    "Tm": 12.743362831858406, "Yb": 12.300884955752212, "Lu": 12.123893805309734, "Hf": 9.11504424778761, 
    "Ta": 6.548672566371681, "W": 6.017699115044247, "Re": 5.486725663716814, "Os": 5.044247787610619, 
    "Ir": 4.778761061946902, "Pt": 4.2477876106194685, "Au": 3.1858407079646014, "Hg": 3.0008849557522117, 
    "Tl": 4.424778761061947, "Pb": 4.15929203539823, "Bi": 4.2477876106194685, "Po": 3.893805309734513, 
    "At": 3.716814159292035, "Rn": 3.0973451327433628, "Fr": 28.123893805309734, "Ra": 21.769911504424776, 
    "Ac": 17.964601769911503, "Th": 19.20353982300885, "Pa": 13.628318584070795, "U": 11.415929203539822, 
    "Np": 13.362831858407079, "Pu": 11.681415929203538, "Am": 11.5929203539823, "Cm": 12.743362831858406, 
    "Bk": 11.061946902654867, "Cf": 10.79646017699115, "Es": 10.442477876106194, "Fm": 10.0, 
    "Md": 9.646017699115044, "No": 9.734513274336283, "Lr": 28.318584070796458, "Rf": 9.91150442477876, 
    "Db": 3.716814159292035, "Sg": 3.5398230088495573, "Bh": 3.3628318584070795, "Hs": 3.1858407079646014, 
    "Mt": 3.0088495575221237, "Ds": 2.831858407079646, "Rg": 2.831858407079646, "Cn": 2.47787610619469, 
    "Nh": 2.566371681415929, "Fl": 2.743362831858407, "Mc": 6.283185840707964, "Ts": 6.725663716814159, 
    "Og": 5.132743362831858
}

### Convert CIF to Atomic Point Cloud

In [4]:
def extract_coord_and_element_from_cif(
    structure_id,                              # Structural ID assigned to each MOF in dataset
    cif_path,                                  # Path assigned to temporarily written CIF file 
    supercell_matrix=[[1,0,0],[0,1,0],[0,0,1]] # Ensures generation of a single unit cell
):
    """Load a CIF file and return its atoms as an origin-centred point cloud.

    Parameters
    ----------
    structure_id
        Structural ID of the MOF. Accepted for call-site symmetry; it is not
        used inside this function.
    cif_path
        Path of the CIF file to load with ``ase.io.read``.
    supercell_matrix
        Transformation matrix handed to ``ase.build.make_supercell``. The
        default (identity) keeps a single unit cell.

    Returns
    -------
    dict
        ``"x"``, ``"y"``, ``"z"``: 1-D arrays holding the columns of the
        positions reported by ``get_positions()``, after subtracting the
        arithmetic mean position of all atoms.
        ``"element"``: list of chemical symbols, one per atom.
    """
    # Read the structure and expand it according to the supercell matrix
    atoms = make_supercell(read(cif_path), supercell_matrix, order='cell-major')

    # Shift the cloud so that the unweighted mean of the atomic positions sits at the origin
    raw_xyz = atoms.get_positions()
    centred_xyz = raw_xyz - raw_xyz.mean(axis=0)

    # One entry per coordinate axis, followed by the element symbols
    point_cloud = {axis: centred_xyz[:, col] for col, axis in enumerate(("x", "y", "z"))}
    point_cloud["element"] = atoms.get_chemical_symbols()
    return point_cloud
    
def featureize_atomic_point_cloud(
    coord_and_atom_type,                                # Point cloud containing (xyz + element)
    pauling_en_dict, hardness_dict, polarizability_dict # Pre-defined atomic property dictionaries
):
    """Attach per-atom descriptors to a point cloud and return it as a 2-D array.

    Parameters
    ----------
    coord_and_atom_type
        Mapping with ``"x"``, ``"y"``, ``"z"`` coordinate sequences and an
        ``"element"`` sequence of chemical symbols (one per atom).
    pauling_en_dict, hardness_dict, polarizability_dict
        Mappings from element symbol to a numeric property. A symbol that is
        absent from a mapping contributes ``0.0`` for that property.

    Returns
    -------
    numpy.ndarray
        One row per atom with columns, in order: x, y, z, electronegativity,
        hardness, polarizability, van der Waals radius, covalent radius,
        atomic weight, atomic number, number of outer-shell electrons.
        The radii, weight and outer-electron count come from RDKit's
        periodic table; the atomic number from ASE's ``atomic_numbers``.

    Raises
    ------
    KeyError
        If an element symbol is not present in ASE's ``atomic_numbers``.
    """
    periodic_table = Chem.GetPeriodicTable()
    symbols = coord_and_atom_type["element"]

    # (n_atoms, 3) coordinate block
    xyz = np.column_stack([coord_and_atom_type[axis] for axis in ("x", "y", "z")])

    # Atomic numbers drive every RDKit periodic-table lookup below
    z_numbers = [atomic_numbers[symbol] for symbol in symbols]

    def from_dict(property_table):
        # Tabulated property per atom; symbols without an entry fall back to 0.0
        return [property_table.get(symbol, 0.0) for symbol in symbols]

    def from_periodic_table(getter):
        # RDKit periodic-table property per atom, looked up by atomic number
        return [getter(z) for z in z_numbers]

    # Column order is part of the output contract
    return np.column_stack((
        xyz,
        from_dict(pauling_en_dict),
        from_dict(hardness_dict),
        from_dict(polarizability_dict),
        from_periodic_table(periodic_table.GetRvdw),
        from_periodic_table(periodic_table.GetRcovalent),
        from_periodic_table(periodic_table.GetAtomicWeight),
        z_numbers,
        from_periodic_table(periodic_table.GetNOuterElecs),
    ))

### Save HDF5 File

In [5]:
def save_h5df(h5df_path, point_cloud_data):
    """Write every point cloud to a single HDF5 file, one dataset per structure.

    Parameters
    ----------
    h5df_path
        Destination file (string or path-like). An existing file at this
        location is overwritten because the file is opened in ``"w"`` mode.
        Missing parent directories are created.
    point_cloud_data
        Mapping of structure ID -> array. Each ID is converted with ``str()``
        and used as the dataset name; each array is stored gzip-compressed.
    """
    # h5py does not create missing directories; make sure the target folder
    # exists so a long fetch run is not lost at the very last step.
    parent_dir = os.path.dirname(os.fspath(h5df_path))
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with h5py.File(h5df_path, "w") as h5f:
        for structure_id, array in point_cloud_data.items():
            h5f.create_dataset(str(structure_id), data=array, compression="gzip")

    # f-string (rather than "+") so path-like arguments print correctly as well
    print(f"Point cloud data saved at {h5df_path}")

### Fetch Loop

In [6]:
target_data = [] # Each row will store a single regression sample: | ID | P | T | Gas | Forcefield | Uptake(mol/kg)

point_cloud_data = {} # structure ID -> featurized atomic point cloud; used to generate the HDF5 file

# Upper bound on the number of structures processed in this run.
# Set to None to process every structure yielded by fetch() (~130k for hMOF).
max_structures = 1000

# Adsorbates whose isotherm points are kept as regression targets
target_gases = ("CO2", "N2", "CH4", "H2")

# open(..., "w") does not create missing directories, so make sure the scratch
# directory for the temporary .cif files exists before the loop starts.
os.makedirs(cif_working_dir, exist_ok=True)

# Fetch function in mofdb_client package
mof_stream = fetch(database="hMOF", loading_unit="mol/kg", pressure_unit="Pa")
for n_processed, mof in enumerate(tqdm(mof_stream, desc="Fetching hMOFs (total ~130k): ")):
    if max_structures is not None and n_processed >= max_structures:
        break

    # Extract ID and CIF File
    structure_id = mof.id
    cif_txt = mof.cif

    # Extract Adsorption Data for the target gases - following the MOFX-DB JSON structure
    # NOTE(review): if an isotherm lists more than one adsorbate, every matching adsorbate
    # gets a row carrying the same jdata.total_adsorption - confirm that hMOF isotherms
    # are single-component.
    for iso in mof.isotherms:
        for adsorbate in iso.adsorbates:
            if adsorbate.formula in target_gases:
                for jdata in iso.isotherm_data:
                    target_data.append((
                        structure_id,
                        jdata.pressure,
                        iso.temperature,
                        adsorbate.formula,
                        iso.molecule_forcefield,
                        jdata.total_adsorption
                    ))

    # Write CIF file into working directory (cif content needs to be stored as a ".cif" for processing)
    cif_path = os.path.join(cif_working_dir, f"{structure_id}.cif")
    try:
        with open(cif_path, "w") as f:
            f.write(cif_txt)

        # Generate atomic point cloud from CIF file
        atomic_pc = featureize_atomic_point_cloud(
            extract_coord_and_element_from_cif(structure_id, cif_path, supercell_matrix), pauling_en, hardness, polarizability
        )
    finally:
        # Remove the temporary CIF file even when parsing/featurization raises,
        # so failed structures do not leave files behind in the working directory.
        if os.path.exists(cif_path):
            os.remove(cif_path)

    # Atomic point cloud data is referenced with unique database IDs.
    point_cloud_data[structure_id] = atomic_pc

Fetching hMOFs (total ~130k): : 1000it [05:07,  3.25it/s]


### Save Data:

In [7]:
# Persist the featurized point clouds to the HDF5 file:
save_h5df(point_cloud_h5_save_path, point_cloud_data)

# Build the table of targets + global descriptors (adsorption conditions) and write it as CSV.
# These column names are the scheme the model training loop relies on; keep them unchanged.
csv_columns = ["ID", "P", "T", "Gas", "Forcefield", "Uptake(mol/kg)"]
target_and_conditions_df = pd.DataFrame(target_data, columns=csv_columns)
target_and_conditions_df.to_csv(target_and_conditions_csv_path, index=False)
print("Target and conditions CSV file saved at " + target_and_conditions_csv_path)

# Preview the first rows as the cell output
target_and_conditions_df.head()

Point cloud data saved at 0_final_submission_bench/datasets/hMOFXDB_structures.h5
Target and conditions CSV file saved at 0_final_submission_bench/datasets/hMOFXDB_target_and_adsorption_conditions.csv


Unnamed: 0,ID,P,T,Gas,Forcefield,Uptake(mol/kg)
0,15338,1000.0,298.0,CO2,TraPPE,0.107653
1,15338,10000.0,298.0,CO2,TraPPE,1.19017
2,15338,250000.0,298.0,CO2,TraPPE,8.49693
3,15338,5000.0,298.0,CO2,TraPPE,0.702453
4,15338,50000.0,298.0,CO2,TraPPE,2.80141
