In [33]:
from __future__ import annotations
from collections import defaultdict
from monty.serialization import loadfn

from typing import Literal, TYPE_CHECKING
if TYPE_CHECKING:
    from pathlib import Path
    from typing import Sequence

In [16]:
def get_geometry_for_single_dataset(
    dataset : Literal["BEGDB_H2O","WATER27","H2O_alkali_clusters","H2O_halide_clusters"],
    geometries_filename : str | Path = "geometries.json.gz",
    return_type : Literal["pmg","ase"] = "pmg"
) -> dict:
    """
    Get the geometries for a single dataset.

    The stored entries include charge and spin multiplicity info.

    Args:
        dataset : Literal["BEGDB_H2O", "WATER27", "H2O_alkali_clusters", "H2O_halide_clusters"]
            Name of the dataset; used as the top-level key of the loaded file.
        geometries_filename : str | Path
            Name of the file containing the geometries. Don't recommend changing this.
        return_type : Literal["pmg", "ase"] = "pmg"
            Whether to return a dict of pymatgen.core.Molecule objects ("pmg",
            i.e. the entries exactly as loaded) or ase.Atoms objects ("ase",
            obtained by calling ``to_ase_atoms()`` on every entry).
    Returns:
        dict, a dict of molecules with charge and spin info, keyed as in the file.
    Raises:
        ValueError: if ``return_type`` is neither "pmg" nor "ase".
        KeyError: if ``dataset`` is not a key of the loaded file.
    """
    # Fail fast on an unsupported return type instead of silently handing back
    # pymatgen objects when the caller asked for something else (e.g. "ASE").
    if return_type not in ("pmg", "ase"):
        raise ValueError(
            f"Unknown return_type {return_type!r}; expected 'pmg' or 'ase'."
        )

    geometries = loadfn(geometries_filename)[dataset]
    if return_type == "ase":
        # Convert each stored molecule; the keys are left untouched.
        return {name: mol.to_ase_atoms() for name, mol in geometries.items()}
    return geometries

In [55]:
def _list_functional_labels(dataset_energies: dict) -> list:
    """Build the "DFA" / "DFA@density" label for every entry of one dataset."""
    return [
        dfa if dfa == at_dfa else f"{dfa}@{at_dfa}"
        for dfa, by_density in dataset_energies.items()
        for at_dfa in by_density
    ]


def get_total_energies_by_dataset(
    dataset : Literal["BEGDB_H2O","WATER27","H2O_alkali_clusters","H2O_halide_clusters"],
    functional : str | Sequence[str] | None = None,
    energies_filename : str | Path = "total_energies.json.gz",
) -> dict:
    """
    Get total energies for a single dataset.

    Optionally filter by functional or a list of functionals.
    Accepts input such as r2SCAN@HF or SCAN-FLOSIC.

    The energies dict is structured as:

    ```
    {
        dataset : {
            functional_for_energy : {
                functional_for_density : {
                    molecules in that dataset
                }
            }
        }
    }
    ```

    Thus requesting dataset=WATER27 and functional = r2SCAN@HF corresponds to
        functional_for_energy = r2SCAN
        functional_for_density = HF

    A label without "@" (e.g. r2SCAN) uses the same name for both levels.

    In the BEGDB dataset, the "*dmono*" entries correspond to the distorted monomers contained within
    each oligomer.

    Args:
        dataset : Literal["BEGDB_H2O", "WATER27", "H2O_alkali_clusters", "H2O_halide_clusters"]
            Name of the dataset
        functional : str | Sequence[str] | None = None,
            If None, returns all entries in a given dataset.
            If a str, returns a single functional.
            If a Sequence (list, tuple, etc.) of str's, returns that subset of functionals
        energies_filename : str | Path
            Name of the file containing the total energies. Don't recommend changing this.
    Returns:
        dict, mapping each functional label (e.g. "r2SCAN" or "r2SCAN@HF") to the
        energies stored for it in the file.
    Raises:
        KeyError: if ``dataset`` is not in the file, or if a requested functional
            label has no entry in the dataset.
    """
    dataset_energies = loadfn(energies_filename)[dataset]

    if functional is None:
        labels = _list_functional_labels(dataset_energies)
    elif isinstance(functional, str):
        # A bare string is one label, not a sequence of characters.
        labels = [functional]
    else:
        labels = list(functional)

    energies = {}
    for label in labels:
        # "A@B" -> energy functional A on density B; "A" -> A on its own density.
        pieces = label.split("@")
        dfa, at_dfa = pieces[0], pieces[-1]
        try:
            energies[label] = dataset_energies[dfa][at_dfa]
        except KeyError:
            # Same exception type as a raw lookup failure, but say what was
            # requested and what the dataset actually offers.
            raise KeyError(
                f"Functional {label!r} not found in dataset {dataset!r}. "
                f"Available: {_list_functional_labels(dataset_energies)}"
            ) from None

    return energies
