gcpy/core.py

""" Core utilities for handling GEOS-Chem data """

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import yaml
import shutil
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import xarray as xr
import xbpch
import cartopy.crs as ccrs
import gcpy.constants as gcon
from .plot import WhGrYlRd
from .grid.horiz import make_grid_LL, make_grid_CS
from .grid.gc_vertical import GEOS_72L_grid, GEOS_47L_grid
from cartopy.mpl.geoaxes import GeoAxes


# Names of the YAML files read by functions in this module.
# Both names are joined onto the directory that contains this file
# (os.path.dirname(__file__)) wherever they are used below.
lumped_spc = "lumped_species.yml"
bpch_to_nc_names = "bpch_to_nc_names.yml"


def open_dataset(filename, **kwargs):
    """
    Open a single GEOS-Chem output file as an xarray Dataset.

    The reader is chosen from the file name alone.  A ".bpch" extension
    selects xbpch; a name that carries ".nc" either in its extension or
    earlier in the name (e.g. ".nc4.Ref", ".nc4.Dev") selects xarray.
    Because two different libraries are involved, extra keyword
    arguments may be needed depending on the file type.

    Args:
    -----
        filename : str
            Path to a GEOS-Chem output file (netCDF or BPCH format).

    Keyword Args (optional):
    ------------------------
        Any additional keyword arguments are forwarded unchanged to
        `xarray.open_dataset` or `xbpch.open_bpchdataset`.  The
        `drop_variables` keyword is always set by this function to
        `gcpy.constants.skip_these_vars`.

    Returns
    -------
        dataset : xarray.Dataset
            The dataset loaded from the referenced filename.

    Raises
    ------
        ValueError
            If the file name matches neither the BPCH nor the netCDF
            naming pattern.

    See Also
    --------
        xarray.open_dataset
        xbpch.open_bpchdataset
        open_mfdataset
    """

    root, ext = os.path.splitext(filename)

    # The netCDF marker may sit in the extension itself (".nc", ".nc4")
    # or in the part before it when a suffix such as ".Ref" follows.
    if ext == ".bpch":
        reader = xbpch.open_bpchdataset
    elif ".nc" in ext or ".nc" in root:
        reader = xr.open_dataset
    else:
        message = (
            "Found unknown file extension ({}); please "
            "pass a BPCH or netCDF file with extension "
            '"bpch" or "nc"!'
        ).format(ext)
        raise ValueError(message)

    return reader(filename, drop_variables=gcon.skip_these_vars, **kwargs)


def open_mfdataset(
    filenames,
    concat_dim="time",
    compat="no_conflicts",
    preprocess=None,
    lock=None,
    **kwargs
):
    """
    Load and decode multiple GEOS-Chem output files as a single Dataset.

    The reader is chosen from the name of the FIRST file only: a ".bpch"
    extension selects `xbpch.open_mfbpchdataset`; a name carrying ".nc"
    in its extension or earlier in the name (e.g. ".nc4.Ref") selects
    `xarray.open_mfdataset`.

    Parameters
    ----------

        filenames : str or list of str
            Paths to GEOS-Chem output files to load. Must have the same
            extension and be able to be concatenated along some common
            axis.  A single string is treated as a list of length 1.

        concat_dim : str, default='time'
            Dimension to concatenate Datasets over. We default to "time"
            since this is how GEOS-Chem splits output files.

        compat : {'identical', 'equals', 'broadcast_equals',
                  'no_conflicts'}, optional
            String indicating how to compare variables of the same name for
            potential conflicts when merging:
                - 'broadcast_equals': all values must be equal when
                   variables are broadcast against each other to ensure
                   common dimensions.
                - 'equals': all values and dimensions must be the same.
                - 'identical': all values, dimensions and attributes
                   must be the same.
                - 'no_conflicts': only values which are not null in
                   both datasets must be equal. The returned dataset
                   then contains the combination of all non-null values.

    Keyword Arguments (optional):
    -----------------------------
        preprocess : callable (optional)
            A pre-processing function to apply to each Dataset prior to
            concatenation

        lock : False, True, or threading.Lock (optional)
            Passed to :py:func:`dask.array.from_array`. By default,
            xarray employs a per-variable lock when reading data from
            NetCDF files, but this model has not yet been extended or
            implemented for bpch files and so this is not actually used.
            However, it is likely necessary before dask's multi-threaded
            backend can be used.

        **kwargs
            Additional keyword arguments to be passed directly to
            `xbpch.open_mfbpchdataset` or `xarray.open_mfdataset`.
            The `drop_variables` keyword is always set by this function
            to `gcpy.constants.skip_these_vars`.

    Returns
    -------
        dataset : xarray.Dataset
            A dataset containing the data in the specified input filenames.

    Raises
    ------
        ValueError
            If filenames is not a str or list, if the list is empty, or
            if the first file name matches neither the BPCH nor the
            netCDF naming pattern.

    See Also
    --------
        xarray.open_mfdataset
        xbpch.open_mfbpchdataset
        open_dataset
    """

    # If filenames is a single string, then make it a list of length 1
    if isinstance(filenames, str):
        filenames = [filenames]

    # Make sure that filenames is a list before proceeding
    if not isinstance(filenames, list):
        raise ValueError("The filenames argument must be a list of str!")

    # An empty list has no first element to inspect; fail with a clear
    # message instead of an IndexError from filenames[0] below.
    if not filenames:
        raise ValueError(
            "The filenames argument must contain at least one file path!"
        )

    # The first file name decides which reader is used for all files
    basename, file_extension = os.path.splitext(filenames[0])

    # Account for file names such as ".nc4.Ref", ".nc4.Dev", etc., where
    # the ".nc" marker ends up in the part before the final extension.
    if file_extension == ".bpch":
        _opener = xbpch.open_mfbpchdataset
    elif ".nc" in file_extension or ".nc" in basename:
        _opener = xr.open_mfdataset
    else:
        raise ValueError(
            "Found unknown file extension ({}); please ".format(file_extension)
            + "pass a BPCH or netCDF file with extension "
            + '"bpch" or "nc" or "nc4"'
        )

    return _opener(
        filenames,
        concat_dim=concat_dim,
        compat=compat,
        preprocess=preprocess,
        lock=lock,
        drop_variables=gcon.skip_these_vars,
        **kwargs
    )


def check_paths(refpath, devpath):
    """
    Reports on stdout whether each of two data paths exists.

    Nothing is returned and no exception is raised for a missing path;
    one line is printed per path ("Path 1" is refpath, "Path 2" is
    devpath).

    Args:
    -----
        refpath : str
            Path to the "Reference" data.

        devpath : str
            Path to the "Development" data.
    """

    # (path, message when found, message when missing)
    reports = (
        (refpath, "Path 1 exists: {}", "ERROR! Path 1 does not exist: {}"),
        (devpath, "Path 2 exists: {}", "ERROR! Path 2 does not exist: {}"),
    )

    for path, found_msg, missing_msg in reports:
        template = found_msg if os.path.exists(path) else missing_msg
        print(template.format(path))


def compare_varnames(refdata, devdata, refonly=None, devonly=None, quiet=False):
    """
    Finds variables that are common to two xarray Dataset objects.

    Args:
    -----
        refdata : xarray Dataset
            The first Dataset to be compared.
            (This is often referred to as the "Reference" Dataset.)

        devdata : xarray Dataset
            The second Dataset to be compared.
            (This is often referred to as the "Development" Dataset.)

    Keyword Args (optional):
    ------------------------
        refonly, devonly : ignored
            Retained only so that existing calls keep working.  Any
            value passed in is discarded; both lists are always
            recomputed from refdata and devdata.

        quiet : boolean
            Set this flag to True if you wish to suppress printing
            informational output to stdout.
            Default value: False

    Returns:
    --------
        vardict : dict of lists of str
            Dictionary containing several lists of variable names.
            Every common variable appears in exactly one of
            commonvarsOther, commonvars2D and commonvars3D; the
            classification uses the dimensions found in refdata.

            Key              Value
            -----            -----
            commonvars       Sorted list of variables that are common
                             to both refdata and devdata

            commonvarsOther  List of variables that are common to
                             both refdata and devdata, but lack a
                             horizontal dimension pair (e.g. index
                             variables).

            commonvars2D     List of variables that are common to
                             refdata and devdata, and that have
                             horizontal dimensions, but not "lev".

            commonvars3D     List of variables that are common to
                             refdata and devdata, and that have
                             horizontal dimensions and "lev".

            refonly          List of 2D or 3D variables that are only
                             present in refdata.

            devonly          List of 2D or 3D variables that are only
                             present in devdata

    Remarks:
    --------
        A variable is considered to have horizontal dimensions when it
        has ("lat" or "Xdim") and ("lon" or "Ydim") among its dims.
        The printed report lists ALL ref-only and dev-only variables;
        the returned refonly/devonly lists keep only those that have
        horizontal dimensions.
    """

    def _has_horizontal_dims(dims):
        # Same pairing as used throughout this function: the first name
        # of each pair is the lat/lon name, the second the Xdim/Ydim one.
        return ("lat" in dims or "Xdim" in dims) and (
            "lon" in dims or "Ydim" in dims
        )

    refvars = list(refdata.data_vars.keys())
    devvars = list(devdata.data_vars.keys())
    refset = set(refvars)
    devset = set(devvars)

    commonvars = sorted(refset & devset)
    refonly = [v for v in refvars if v not in devset]
    devonly = [v for v in devvars if v not in refset]
    dimmismatch = [v for v in commonvars if refdata[v].ndim != devdata[v].ndim]

    # Sort each common variable into exactly one bucket.  (The previous
    # "Other" test was true for nearly every variable without "lev", so
    # 2-D variables were reported both as 2D and as Other.)
    commonvarsOther = []
    commonvars2D = []
    commonvars3D = []
    for v in commonvars:
        dims = refdata[v].dims
        if not _has_horizontal_dims(dims):
            commonvarsOther.append(v)
        elif "lev" in dims:
            commonvars3D.append(v)
        else:
            commonvars2D.append(v)

    # Print information on common and mismatching variables, as well as
    # dimensions.  The three sections are independent of one another, so
    # each is always reported.
    if not quiet:
        print("\nComparing variable names in compare_varnames")
        print("{} common variables".format(len(commonvars)))

        if len(refonly) > 0:
            print("{} variables in ref only (skip)".format(len(refonly)))
            print("   Variable names: {}".format(refonly))
        else:
            print("0 variables in ref only")

        if len(devonly) > 0:
            print("{} variables in dev only (skip)".format(len(devonly)))
            print("   Variable names: {}".format(devonly))
        else:
            print("0 variables in dev only")

        if len(dimmismatch) > 0:
            print(
                "{} common variables have different dimensions".format(
                    len(dimmismatch)
                )
            )
            print("   Variable names: {}".format(dimmismatch))
        else:
            print("All variables have same dimensions in ref and dev")

    # For safety's sake, remove the variables without horizontal
    # dimensions from refonly and devonly, so that they only contain
    # variables that can be plotted.  Each list is checked against its
    # own dataset, since these variables exist in only one of the two.
    refonly = [v for v in refonly if _has_horizontal_dims(refdata[v].dims)]
    devonly = [v for v in devonly if _has_horizontal_dims(devdata[v].dims)]

    return {
        "commonvars": commonvars,
        "commonvarsOther": commonvarsOther,
        "commonvars2D": commonvars2D,
        "commonvars3D": commonvars3D,
        "refonly": refonly,
        "devonly": devonly,
    }


def compare_stats(refdata, refstr, devdata, devstr, varname):
    """
    Prints out global statistics (array sizes, mean, min, max, sum)
    from two xarray Dataset objects.

    Args:
    ----
        refdata : xarray Dataset
            The first Dataset to be compared.
            (This is often referred to as the "Reference" Dataset.)

        refstr : str
            Label for refdata to be used in the printout

        devdata : xarray Dataset
            The second Dataset to be compared.
            (This is often referred to as the "Development" Dataset.)

        devstr : str
            Label for devdata to be used in the printout

        varname : str
            Variable name for which global statistics will be printed out.
    """

    refvar = refdata[varname]
    devvar = devdata[varname]
    units = refdata[varname].units
    print("Data units:")
    print("    {}:  {}".format(refstr, units))
    print("    {}:  {}".format(devstr, units))
    print("Array sizes:")
    print("    {}:  {}".format(refstr, refvar.shape))
    print("    {}:  {}".format(devstr, devvar.shape))
    print("Global stats:")
    print("  Mean:")
    print("    {}:  {}".format(refstr, np.round(refvar.values.mean(), 20)))
    print("    {}:  {}".format(devstr, np.round(devvar.values.mean(), 20)))
    print("  Min:")
    print("    {}:  {}".format(refstr, np.round(refvar.values.min(), 20)))
    print("    {}:  {}".format(devstr, np.round(devvar.values.min(), 20)))
    print("  Max:")
    print("    {}:  {}".format(refstr, np.round(refvar.values.max(), 20)))
    print("    {}:  {}".format(devstr, np.round(devvar.values.max(), 20)))
    print("  Sum:")
    print("    {}:  {}".format(refstr, np.round(refvar.values.sum(), 20)))
    print("    {}:  {}".format(devstr, np.round(devvar.values.sum(), 20)))


def get_collection_data(datadir, collection, day, time):
    """
    Opens the file for one diagnostic collection as an xarray Dataset.

    The file path is obtained from get_gcc_filepath(datadir, collection,
    day, time).  If that path does not exist an error line is printed,
    but the open is still attempted, so xarray.open_dataset will raise
    in that case.

    NOTE(review): get_gcc_filepath is not defined in the visible part
    of this module -- confirm where it comes from.

    Returns:
    --------
        data_ds : xarray Dataset
            The dataset returned by xarray.open_dataset.
    """
    path = get_gcc_filepath(datadir, collection, day, time)

    file_is_missing = not os.path.exists(path)
    if file_is_missing:
        print("ERROR! File does not exist: {}".format(path))

    return xr.open_dataset(path)


def get_gchp_collection_data(datadir, collection, day, time):
    """
    Opens the file for one GCHP diagnostic collection as an xarray
    Dataset.

    The file path is obtained from get_gchp_filepath(datadir,
    collection, day, time) and handed straight to xarray.open_dataset;
    no existence check is made here.

    NOTE(review): get_gchp_filepath is not defined in the visible part
    of this module -- confirm where it comes from.

    Returns:
    --------
        data_ds : xarray Dataset
            The dataset returned by xarray.open_dataset.
    """
    return xr.open_dataset(
        get_gchp_filepath(datadir, collection, day, time)
    )


def convert_bpch_names_to_netcdf_names(ds, verbose=False):

    """
    Function to convert the non-standard bpch diagnostic names
    to names used in the GEOS-Chem netCDF diagnostic outputs.

    Args:
    -----
        ds : xarray Dataset
            The xarray Dataset object whose names are to be replaced.

    Keyword Args (optional):
    ------------------------
        verbose : boolean
            Set this flag to True to print informational output.
            Default value: False


    Returns:
    --------
        ds_new : xarray Dataset
            The Dataset returned by ds.rename, with all recognised
            bpch-style diagnostic names replaced by GEOS-Chem netCDF
            names.  Variables with no matching definition keep their
            original names.

    Remarks:
    --------
        To add more diagnostic names, edit the dictionary contained
        in the bpch_to_nc_names.yml.

        For each variable, the FIRST key of the YAML dictionary that
        occurs as a substring of the variable name decides what happens
        to it; later keys are not consulted.
    """

    # Names dictionary (key = bpch id, value[0] = netcdf id,
    # value[1] = action to create full name using id), read from the
    # YAML file located next to this module.
    # The file is opened in a "with" block so the handle is always
    # closed, and parsed with the safe loader: calling yaml.load without
    # a Loader argument is deprecated in PyYAML and an error in newer
    # releases.
    yamlfile = os.path.join(os.path.dirname(__file__), bpch_to_nc_names)
    with open(yamlfile, "r") as f:
        names = yaml.safe_load(f)

    # Some special variable names that overwrite the generated ones
    special_vars = {
        "Met_AIRNUMDE": "Met_AIRNUMDEN",
        "Met_UWND": "Met_U",
        "Met_VWND": "Met_V",
        "Met_CLDTOP": "Met_CLDTOPS",
        "Met_GWET": "Met_GWETTOP",
        "Met_PRECON": "Met_PRECCON",
        "Met_PREACC": "Met_PRECTOT",
        "Met_PBL": "Met_PBLH",
    }

    # Tags for the UVFlux* diagnostics
    uvflux_tags = [
        "187nm",
        "191nm",
        "193nm",
        "196nm",
        "202nm",
        "208nm",
        "211nm",
        "214nm",
        "261nm",
        "267nm",
        "277nm",
        "295nm",
        "303nm",
        "310nm",
        "316nm",
        "333nm",
        "380nm",
        "574nm",
    ]

    # Categories whose new name is "<netcdf id>_<last name token>"
    append_categories = [
        "IJ_AVG_S_",
        "RN_DECAY_",
        "WETDCV_S_",
        "WETDLS_S_",
        "BXHGHT_S_",
        "DAO_3D_S_",
        "PL_SUL_",
        "CV_FLX_S_",
        "EW_FLX_S_",
        "NS_FLX_S_",
        "UP_FLX_S_",
        "MC_FRC_S_",
    ]

    # Python dictionary for variable name replacement
    old_to_new = {}

    # Loop over all variable names in the data set
    for variable_name in ds.data_vars.keys():

        # Save the original variable name, since this is the name
        # that we actually need to replace in the dataset.
        original_variable_name = variable_name

        # Replace "__" with "_" in the variable name (which will get
        # tested against the names in the YAML file).  This will allow
        # us to replace variable names in files created with BPCH2COARDS.
        if "__" in variable_name:
            variable_name = variable_name.replace("__", "_")

        # Check if name matches anything in dictionary.
        # The search stops at the first matching key, even when that
        # key is marked "skip".
        oldid = ""
        newid = ""
        idaction = ""
        for key in names:
            if key in variable_name:
                if names[key][1] == "skip":
                    # Verbose output
                    if verbose:
                        print("WARNING: skipping {}".format(key))
                else:
                    oldid = key
                    newid = names[key][0]
                    idaction = names[key][1]
                break

        # Go to the next variable if no definition was found
        if oldid == "" or newid == "" or idaction == "":
            continue

        # If fullname replacement:
        if idaction == "replace":
            newvar = newid

            # Update the dictionary of names with this pair
            # Use the original variable name.
            old_to_new.update({original_variable_name: newvar})

        # For all the rest:
        else:
            # The last "_"-separated token of the name is what gets
            # attached to the netCDF id below.
            linearr = variable_name.split("_")
            varstr = linearr[-1]

            # These categories use append
            if oldid in append_categories:
                newvar = newid + "_" + varstr

            # DAO_FLDS
            # Skip certain fields that will cause conflicts w/ netCDF
            # NOTE(review): 'oldid in "DAO_FLDS_"' is a SUBSTRING test
            # (true for any oldid contained in "DAO_FLDS_"), and such an
            # oldid can never equal either entry of the list below, so
            # the "Skipping" branch looks unreachable.  Behaviour is
            # left unchanged here; confirm the intended test (perhaps on
            # variable_name) before altering it.
            elif oldid in "DAO_FLDS_":
                if oldid in ["DAO_FLDS_PS_PBL", "DAO_FLDS_TROPPRAW"]:

                    # Verbose output
                    if verbose:
                        print("Skipping: {}".format(oldid))
                else:
                    newvar = newid + "_" + varstr

            # Special handling for J-values: The bpch variable names all
            # begin with "J" (e.g. JNO, JACET), so we need to strip the first
            # character of the variable name manually (bmy, 4/8/19)
            elif oldid == "JV_MAP_S_":
                newvar = newid + "_" + varstr[1:]

            # IJ_SOA_S_
            elif oldid == "IJ_SOA_S_":
                newvar = newid + varstr

            # DRYD_FLX_, DRYD_VEL_
            # (the last two characters of the token are dropped)
            elif "DRYD_" in oldid:
                newvar = newid + "_" + varstr[:-2]

            # BIOBSRCE_, BIOFSRCE_, BIOGSRCE_. ANTHSRCE_
            elif oldid in ["BIOBSRCE_", "BIOFSRCE_", "BIOGSRCE_", "ANTHSRCE_"]:
                newvar = "Emis" + varstr + "_" + newid

            # Special handling for UV radiative flux diagnostics:
            # We need to append the bin descriptor to the new name.
            # The last two characters of the original name are read as
            # a 1-based index into uvflux_tags.
            elif "FJX_FLXS" in oldid:
                uvind = int(original_variable_name[-2:]) - 1
                newvar = newid + "_" + uvflux_tags[uvind]

            # If nothing found...
            else:

                # Verbose output
                if verbose:
                    print("WARNING: Nothing defined for: {}".format(variable_name))
                continue

            # Overwrite certain variable names
            if newvar in special_vars:
                newvar = special_vars[newvar]

            # Update the dictionary of names with this pair
            old_to_new.update({original_variable_name: newvar})

    # Verbose output
    if verbose:
        print("\nList of bpch names and netCDF names")
        for key in old_to_new:
            print("{} ==> {}".format(key.ljust(25), old_to_new[key].ljust(40)))

    # Rename the variables in the dataset
    if verbose:
        print("\nRenaming variables in the data...")
    with xr.set_options(keep_attrs=True):
        ds = ds.rename(name_dict=old_to_new)

    # Return the dataset
    return ds


def get_lumped_species_definitions():
    """
    Reads the lumped species definitions from the YAML file
    (lumped_species.yml) located in the same directory as this module.

    Returns:
    --------
        lumped_spc_dict : object
            Whatever the YAML file parses to.
            NOTE(review): callers in this module index it as
            dict[lumped species][constituent species] -> scale factor;
            confirm against the YAML file.
    """
    yamlfile = os.path.join(os.path.dirname(__file__), lumped_spc)

    # Use the safe loader: calling yaml.load without a Loader argument
    # is deprecated in PyYAML and an error in newer releases.
    with open(yamlfile, "r") as f:
        lumped_spc_dict = yaml.safe_load(f)
    return lumped_spc_dict


def archive_lumped_species_definitions(dst):
    """
    Copies the lumped species YAML file that sits next to this module
    into the directory dst, keeping the same file name, and prints a
    one-line notice before copying.

    Args:
    -----
        dst : str
            Path of the destination directory.
    """
    module_dir = os.path.dirname(__file__)
    source_file = os.path.join(module_dir, lumped_spc)
    target_file = os.path.join(dst, lumped_spc)

    print("Archiving {} in {}".format(lumped_spc, dst))
    shutil.copyfile(source_file, target_file)


def add_lumped_species_to_dataset(
    ds,
    lspc_dict=None,
    lspc_yaml="",
    verbose=True,
    overwrite=False,
    prefix="SpeciesConc_",
):

    """
    Function to calculate lumped species concentrations and add
    them to an xarray Dataset. Lumped species definitions may be passed
    as a dictionary or a path to a yaml file. If neither is passed then
    the lumped species yaml file stored in gcpy is used. This file is
    customized for use with benchmark simulation SpeciesConc diagnostic
    collection output.

    Args:
    -----
        ds : xarray Dataset
            An xarray Dataset object prior to adding lumped species.

    Keyword Args (optional):
    ------------------------

        lspc_dict : dictionary
            Dictionary of the form
            {lumped species: {constituent species: scale factor}}.
            None or an empty dictionary means "not provided".
            Default value: None

        lspc_yaml : str
            Path to a YAML file containing a dictionary of the same
            form as lspc_dict.  An empty string means "not provided".
            Default value: ""

        verbose : boolean
            Whether to print informational output.
            Default value: True

        overwrite : boolean
            Whether to overwrite an existing species dataarray in a dataset
            if it has the same name as a new lumped species. If False and
            overlapping names are found then the function will raise an error.
            Default value: False

        prefix : str
            Prefix to prepend to new lumped species names. This argument is
            also used to extract an existing dataarray in the dataset with
            the correct size and dimensions to use during initialization of
            new lumped species dataarrays.
            Default value: SpeciesConc_

    Returns:
    --------
        ds_new : xarray Dataset
            A new xarray Dataset object containing all of the original
            species plus new lumped species.

    Raises:
    -------
        ValueError
            If both lspc_dict and lspc_yaml are passed, if no variable
            in ds contains prefix, or if a lumped species name already
            exists in the dataset and overwrite is False.

    Remarks:
    --------
        Each new lumped species DataArray is a copy of the first
        variable in ds whose name contains prefix, so it carries that
        variable's dimensions, coordinates and attributes.  A lumped
        species none of whose constituents are present in the dataset
        is filled with NaN.
    """

    # Default is to add all benchmark lumped species.
    # Can overwrite by passing a dictionary
    # or a yaml file path containing one.
    # (Explicit raise rather than assert, so that the check is not
    # stripped when Python runs with -O.)
    if lspc_dict and lspc_yaml:
        raise ValueError(
            "Cannot pass both lspc_dict and lspc_yaml. Choose one only."
        )
    if not lspc_dict:
        if lspc_yaml:
            # Safe loader: yaml.load without a Loader argument is
            # deprecated in PyYAML and an error in newer releases.
            with open(lspc_yaml, "r") as f:
                lspc_dict = yaml.safe_load(f)
        else:
            lspc_dict = get_lumped_species_definitions()

    # Get a template array to use for initialization
    dummy_darr = None
    for var in ds.data_vars:
        if prefix in var:
            dummy_darr = ds[var]
            break
    if dummy_darr is None:
        raise ValueError(
            "No variable containing the prefix {} was found in the "
            "dataset; cannot initialize lumped species.".format(prefix)
        )

    # xr.merge below returns a new Dataset each time, so the input
    # dataset object itself is never rebound by this function.
    ds_new = ds

    for lspc in lspc_dict:

        # Assemble lumped species variable name
        varname_new = prefix + lspc

        # Check if overlap with existing species
        if varname_new in ds_new.data_vars:
            if not overwrite:
                raise ValueError(
                    "{} already in dataset. To overwrite pass "
                    "overwrite=True.".format(varname_new)
                )
            # drop() returns a new Dataset; keep the result, otherwise
            # the old variable is still present when merging below.
            ds_new = ds_new.drop(varname_new)

        # Verbose prints
        if verbose:
            print("Creating {}".format(varname_new))

        # Loop over and sum constituent species values
        total = np.full(dummy_darr.shape, 0.0)
        num_spc = 0
        for spc in lspc_dict[lspc]:
            varname = prefix + spc
            if varname not in ds_new.data_vars:
                print(
                    "Warning: {} needed for {} not in dataset.".format(
                        spc, lspc
                    )
                )
                continue
            if verbose:
                print(
                    " -> adding {} with scale {}".format(
                        spc, lspc_dict[lspc][spc]
                    )
                )
            total = total + ds_new[varname].values * lspc_dict[lspc][spc]
            num_spc = num_spc + 1

        # Replace values with NaN if no species found in dataset
        if num_spc == 0:
            print('No constituent species found in file. Setting to NaN.')
            total = np.full(dummy_darr.shape, np.nan)

        # Build the new DataArray from a COPY of the template.  Renaming
        # and overwriting the template itself would write through to the
        # variable it was taken from (and to previously created lumped
        # species that share it).
        darr = dummy_darr.copy(deep=False)
        darr.name = varname_new
        darr.values = total

        # Merge new variable into dataset
        ds_new = xr.merge([ds_new, darr])

    return ds_new


def filter_names(names, text=""):
    """
    Selects the entries of a list that contain a given substring.
    Can be used in conjunction with compare_varnames to return a subset
    of variable names pertaining to a given diagnostic type or species.

    Args:
    -----
        names: list of str
            Input list of names.

        text: str
            Substring that an entry must contain to be kept.

    Returns:
    --------
        filtered_names: list of str
            All elements of names that contain the substring given by
            the "text" argument, in their original order.  If "text" is
            omitted (empty string), every element of names that is
            truthy is returned, i.e. empty entries are dropped.
    """

    # No search text: keep everything except falsy entries
    if text == "":
        return [name for name in names if name]

    return [name for name in names if text in name]


def divide_dataset_by_dataarray(ds, dr, varlist=None):
    """
    Divides variables of an xarray Dataset by a single DataArray.
    The division is carried out with xarray's keep_attrs option
    switched on.

    This method can be useful for certain types of model diagnostics
    that have to be divided by a counter array.  For example, local
    noontime J-value variables in a Dataset can be divided by the
    fraction of time it was local noon in each grid box, etc.

    Args:
    -----
        ds: xarray Dataset
            The Dataset object containing variables to be divided.
            Its variables are reassigned in place.

        dr: xarray DataArray
            The DataArray object that will be used to divide the
            variables of ds.

    Keyword Args (optional):
    ------------------------
        varlist: list of str
            If passed, then only those variables of ds that are listed
            in varlist will be divided by dr.  Otherwise, all data
            variables of ds will be divided by dr.

    Returns:
    --------
        ds_new : xarray Dataset
            The same Dataset object that was passed in as ds, with the
            selected variables replaced by their quotients.

    Raises:
    -------
        TypeError
            If ds is not an xarray Dataset or dr is not an xarray
            DataArray.
    """

    # Validate the arguments (ds is checked first)
    if not isinstance(ds, xr.Dataset):
        raise TypeError("The ds argument must be of type xarray.Dataset!")
    if not isinstance(dr, xr.DataArray):
        raise TypeError("The dr argument must be of type xarray.DataArray!")

    # Default to every data variable in the dataset
    targets = ds.data_vars.keys() if varlist is None else varlist

    # Do the division with attribute preservation enabled
    with xr.set_options(keep_attrs=True):
        for name in targets:
            quotient = ds[name] / dr
            ds[name] = quotient

    return ds


def get_shape_of_data(data, vertical_dim="lev", return_dims=False):
    """
    Convenience routine to return the shape (and dimension names, if
    requested) of an xarray Dataset or xarray DataArray.  A dictionary
    of sizes (i.e. {'time': 1, 'lev': 72, ...}) is accepted as well.

    Args:
    -----
    data : xarray Dataset, xarray DataArray, or dict
        The data for which the size is requested.  For xarray objects
        the sizes are read from data.sizes; a dict is used as given.

    Keyword Args (optional):
    -------------------------
    vertical_dim : str
        Specify the vertical dimension that you wish to
        return: lev or ilev.
        Default value: 'lev'

    return_dims : bool
        Set this switch to True if you also wish to return a list of
        dimensions in the same order as the tuple of dimension sizes.
        Default value: False

    Returns:
    --------
    shape : tuple of int
        Sizes of the dimensions that are present in data, always in
        this fixed order:
        (time, <vertical_dim>, lat, nf, Ydim, lon, Xdim).
        Dimensions not listed here are ignored.

    dims : list of str
        Only returned if return_dims is True: the dimension names
        matching the entries of shape, in the same order.

    Raises:
    -------
    ValueError
        If data is none of the accepted types.
    """

    # Obtain a name -> size mapping from the argument
    if isinstance(data, (xr.Dataset, xr.DataArray)):
        sizes = data.sizes
    elif isinstance(data, dict):
        sizes = data
    else:
        raise ValueError(
            'The "dataset" argument must be either an xarray Dataset, '
            + " xarray DataArray, or a dictionary!"
        )

    # Dimensions of interest, in output order
    ordered_names = ("time", vertical_dim, "lat", "nf", "Ydim", "lon", "Xdim")

    # Keep only those that are actually present
    dims = [name for name in ordered_names if name in sizes]
    shape = tuple(sizes[name] for name in dims)

    return (shape, dims) if return_dims else shape


def get_area_from_dataset(ds):
    """
    Convenience routine to return the area variable (which is
    usually called "AREA" for GEOS-Chem "Classic" or "Met_AREAM2"
    for GCHP) from an xarray Dataset object.  If both are present,
    "Met_AREAM2" takes precedence.

    Args:
    -----
        ds : xarray Dataset
            The input dataset.

    Returns:
    --------
        area_m2 : xarray DataArray
            The surface area in m2, as found in ds.

    Raises:
    -------
        ValueError
            If ds contains neither "Met_AREAM2" nor "AREA".
    """

    # Candidate names in order of preference
    for candidate in ("Met_AREAM2", "AREA"):
        if candidate in ds.data_vars.keys():
            return ds[candidate]

    raise ValueError(
        'An area variable ("AREA" or "Met_AREAM2" is missing'
        + " from this dataset!"
    )


def get_variables_from_dataset(ds, varlist):
    """
    Convenience routine to return multiple selected DataArray
    variables from an xarray Dataset.  All variables must be
    found in the Dataset, or else an error will be raised.

    Args:
    -----
        ds : xarray Dataset
            The input dataset.

        varlist : list of str
            List of DataArray variables to extract from ds.

    Returns:
    --------
        ds_subset : xarray Dataset
            A new data set containing only the variables
            that were requested, built up by merging them one at a
            time in the order given by varlist.

    Raises:
    -------
        ValueError
            As soon as a name in varlist is not a data variable of ds.

    Remarks:
    -------
    Use this routine if you absolutely need all of the requested
    variables to be returned.
    """

    subset = xr.Dataset()

    for name in varlist:
        # Stop at the first requested variable that is missing
        if name not in ds.data_vars.keys():
            raise ValueError("{} was not found in this dataset!".format(name))
        subset = xr.merge([subset, ds[name]])

    return subset


def create_dataarray_of_nan(name, sizes, coords, attrs, vertical_dim="lev"):
    """
    Returns a DataArray object with the given name, dimensions,
    coordinates and attributes, but with its data set to missing
    values (NaN) everywhere.

    This is useful if you need to plot or compare two DataArray
    variables, and need to represent one as missing or undefined.

    Args:
    -----
    name : str
        The name for the DataArray object that will contain NaNs.

    sizes : dict of int
        Dictionary of the dimension names and their sizes (e.g.
        {'time' : 1, 'lev': 72, ...}) that will be used to create
        the DataArray of NaNs.  This can be obtained from an
        xarray Dataset as ds.sizes.

    coords : dict of lists of float
        Dictionary containing the coordinate variables that will
        be used to create the DataArray of NaNs.  This can be obtained
        from an xarray Dataset with ds.coords.

    attrs : dict of str
        Dictionary containing the DataArray variable attributes
        (such as "units", "long_name", etc.).  This can be obtained
        from an xarray DataArray with dr.attrs.

    Keyword Args (optional):
    ------------------------
    vertical_dim : str
        Which vertical dimension to keep: "lev" or "ilev".  The other
        one is removed from both sizes and coords before the
        DataArray is built.
        Default value: "lev"

    Returns:
    --------
    dr : xarray DataArray
        The output DataArray object, which will contain NaN values
        everywhere.  This will denote missing data.

    Raises:
    -------
    ValueError
        If vertical_dim is neither "lev" nor "ilev".
    """

    # Work on copies so that the caller's mappings are left untouched
    # (and so that keys can be removed)
    new_sizes = dict(sizes)
    new_coords = dict(coords)

    # Only keep one of the vertical dimensions (lev or ilev)
    if vertical_dim == "lev":
        discarded_dim = "ilev"
    elif vertical_dim == "ilev":
        discarded_dim = "lev"
    else:
        msg = 'The "vertical_dim" argument must be either "lev" or "ilev"!'
        raise ValueError(msg)

    # pop() with a default tolerates the discarded dimension being
    # present in only one of the two mappings
    new_sizes.pop(discarded_dim, None)
    new_coords.pop(discarded_dim, None)

    # Get the names and sizes of the dimensions
    # after discarding one of "lev" or "ilev"
    [new_shape, new_dims] = get_shape_of_data(new_sizes, return_dims=True)

    # Create an array full of NaNs of the required size.
    # NOTE: the np.float alias was removed from NumPy (1.24), so the
    # dtype is spelled explicitly.
    nan_arr = np.full(new_shape, np.nan, dtype=np.float64)

    # Create a DataArray of NaN's
    return xr.DataArray(
        nan_arr, name=name, dims=new_dims, coords=new_coords, attrs=attrs
    )


def normalize_colors(vmin, vmax, is_difference=False, log_color_scale=False, ratio_log=False):
    """
    Builds the matplotlib Norm object that maps a data range onto
    the colormap range used by matplotlib plotting functions.

    Data that is zero everywhere or NaN everywhere gets a fixed
    linear range, which also avoids taking the log of all-zero data
    when a log color scale is requested.

    Args:
    -----
        vmin : float
            Minimum value of the data range.

        vmax : float
            Maximum value of the data range.

    Keyword Args:
    -------------
        is_difference : boolean
            Set this switch to denote that we are using a difference
            color scale (i.e. with zero in the middle of the range).
            Default value: False

        log_color_scale : boolean
            Logical flag to denote that we are using a logarithmic
            color scale instead of a linear color scale.
            Default value: False

        ratio_log : boolean
            Only used when log_color_scale is True.  Selects a log
            scale spanning vmin..vmax in which the value 1 is mapped
            to the middle of the colormap.
            Default value: False

    Returns:
    --------
        norm : matplotlib Norm
            The normalized matplotlib color range, stored in
            a matplotlib Norm object.

    Remarks:
    --------
         For log color scales without ratio_log, a range of 3 orders
         of magnitude is used (i.e. from vmax/1e3 to vmax).
    """

    class MidpointLogNorm(mcolors.LogNorm):
        """Log norm whose chosen midpoint lands at the colormap center."""

        def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
            mcolors.LogNorm.__init__(self, vmin=vmin, vmax=vmax, clip=clip)
            self.midpoint = midpoint

        def __call__(self, value, clip=None):
            processed, _ = self.process_value(value)
            # Piecewise-linear interpolation in log space through
            # (vmin -> 0), (midpoint -> 0.5) and (vmax -> 1)
            log_anchors = [
                np.log(self.vmin),
                np.log(self.midpoint),
                np.log(self.vmax),
            ]
            color_anchors = [0, 0.5, 1]
            mapped = np.interp(np.log(value), log_anchors, color_anchors)
            return np.ma.array(mapped, mask=processed.mask, copy = False)

    # Data that is zero everywhere (vmin=vmax=0) or undefined
    # everywhere (vmin=vmax=NaN)
    is_degenerate = (abs(vmin) == 0 and abs(vmax) == 0) or (
        np.isnan(vmin) and np.isnan(vmax)
    )

    if is_degenerate:
        # Fixed range, chosen so that the color corresponding to zero
        # (white) sits where the single colorbar tick will be added
        lower_bound = -1.0 if is_difference else 0.0
        return mcolors.Normalize(vmin=lower_bound, vmax=1.0)

    if not log_color_scale:
        return mcolors.Normalize(vmin=vmin, vmax=vmax)

    if ratio_log:
        return MidpointLogNorm(vmin=vmin, vmax=vmax, midpoint=1)

    # Plain log scale: 3 orders of magnitude below the maximum value
    return mcolors.LogNorm(vmin=vmax / 1e3, vmax=vmax)


def check_for_area(ds, gcc_area_name="AREA", gchp_area_name="Met_AREAM2"):
    """
    Verifies that a dataset contains a surface area variable.

    The dataset must hold a data variable named either gcc_area_name
    (the GEOS-Chem "Classic" name) or gchp_area_name (the GCHP name).
    To simplify comparisons, when the GCHP variable is present it is
    also stored in the dataset under the GEOS-Chem "Classic" name.

    Args:
    -----
        ds : xarray Dataset
            The Dataset object that will be checked.

    Keyword Args (optional):
    ------------------------
        gcc_area_name : str
            Specifies the name of the GEOS-Chem "Classic" surface
            area variable.
            Default value: "AREA"

        gchp_area_name : str
            Specifies the name of the GCHP surface area variable.
            Default value: "Met_AREAM2"

    Returns:
    --------
        ds : xarray Dataset
            The same Dataset object, modified in place when the GCHP
            area variable was found.

    Raises:
    -------
        ValueError
            If neither area variable is present in ds.
    """

    data_var_names = ds.data_vars.keys()
    has_gcc_area = gcc_area_name in data_var_names
    has_gchp_area = gchp_area_name in data_var_names

    if not (has_gcc_area or has_gchp_area):
        raise ValueError(
            "Could not find {} or {} in the dataset!".format(
                gcc_area_name, gchp_area_name
            )
        )

    # Expose the GCHP area under the "Classic" name as well
    if has_gchp_area:
        ds[gcc_area_name] = ds[gchp_area_name]

    return ds


def get_filepaths(outputdir, collections, dates, is_gcc=False, is_gchp=False):
    """
    Routine to return filepaths for a given GEOS-Chem "Classic"
    (aka "GCC") or GCHP diagnostic collection.

    Args:
    -----
        outputdir : str
            Path name of the directory containing GCC or GCHP data
            files.

        collections : list of str
            Names of collections (e.g. Emissions, SpeciesConc, etc.)
            for which file paths will be returned.  A value that is
            not a list is treated as a single collection.

        dates : array of numpy.datetime64
            Array of dates for which file paths are requested.

    Keyword Args (optional):
    ------------------------
        is_gcc : bool
            Set this switch to True to obtain file pathnames to
            GEOS-Chem "Classic" diagnostic data files.

        is_gchp : bool
            Set this switch to True to obtain file pathnames to
            GCHP diagnostic data files.

    Returns:
    --------
        paths : list of str
            A list of pathnames for each specified collection and date,
            ordered by collection and then by date.

    Raises:
    -------
        ValueError
            If is_gcc and is_gchp are equal (both set or both unset).
    """

    # ==================================================================
    # Initialization
    # ==================================================================

    # Error check input flags: exactly one of them must be selected.
    # NOTE: .format() is applied to the whole message; attaching it to
    # the last fragment only would leave the "{}" placeholders unfilled.
    if is_gcc == is_gchp:
        msg = (
            "Both is_gcc={} and is_gchp={}!  At present, "
            + "get_filepaths returns either GCC or GCHP data! "
            + "but not both!"
        ).format(is_gcc, is_gchp)
        raise ValueError(msg)

    # If collections is passed as a scalar
    # make it a list so that we can iterate
    if not isinstance(collections, list):
        collections = [collections]

    # Create the return variable
    paths = []

    # Alias for the join function
    join = os.path.join

    # ==================================================================
    # Create the file list
    # ==================================================================
    for collection in collections:

        # Time stamps are written to minute precision, except for
        # GCHP restart files, which use second precision
        time_unit = "m"

        if is_gcc:
            # ---------------------------------------
            # Get the file path template for GCC
            # ---------------------------------------
            if "Emissions" in collection:
                file_tmpl = join(outputdir, "HEMCO_diagnostics.")
                separator = ""
                extension = ".nc"
            else:
                file_tmpl = join(outputdir, "GEOSChem.{}.".format(collection))
                separator = "_"
                extension = "z.nc4"

        else:
            # ---------------------------------------
            # Get the file path template for GCHP
            # ---------------------------------------
            if "Restart" in collection:
                file_tmpl = join(outputdir,
                                 "gcchem_internal_checkpoint.restart.")
                separator = "_"
                extension = ".nc4"
                time_unit = "s"
            else:
                file_tmpl = join(outputdir, "GCHP.{}.".format(collection))
                separator = "_"
                extension = "z.nc4"

        # --------------------------------------------
        # Create a list of files for each date/time
        # --------------------------------------------
        for date in dates:
            # ISO string (e.g. 2016-07-01T00:00) -> YYYYMMDD<sep>hhmm
            date_time = np.datetime_as_string(date, unit=time_unit)
            date_time = date_time.replace("T", separator)
            date_time = date_time.replace("-", "")
            date_time = date_time.replace(":", "")
            paths.append(file_tmpl + date_time + extension)

    return paths


def extract_pathnames_from_log(filename, prefix_filter=""):
    """
    Returns a list of pathnames from a GEOS-Chem log file.
    This can be used to get a list of files that should be
    downloaded from gcgrid or from Amazon S3.

    A line contributes a path when it contains ": OPENING" or
    ": READING" (case-insensitive); the path is taken to be the
    last whitespace-separated token of that line.

    Args:
    -----
        filename : str
            GEOS-Chem standard log file

        prefix_filter : str
            Restricts the output to file paths starting with
            this prefix (e.g. "/home/ubuntu/ExtData/HEMCO/").
            The prefix is stripped from the returned paths.
            Default value: ''

    Returns:
    --------
        data list : list of str
            Sorted list of the unique data file paths found in
            the log file, with prefix_filter removed.

    Raises:
    -------
        FileNotFoundError
            If the log file does not exist.

    Author:
    -------
        Jiawei Zhuang (jiaweizhuang@g.harvard.edu)
    """

    # Initialization
    prefix_len = len(prefix_filter)
    data_list = set()  # only keep unique files

    # Open file (or die with error)
    try:
        log_file = open(filename, "r")
    except FileNotFoundError as err:
        raise FileNotFoundError(
            "Could not find file {}".format(filename)
        ) from err

    # The context manager closes the file even if parsing fails
    with log_file:
        for line in log_file:
            upcaseline = line.upper()
            if (": OPENING" in upcaseline) or (": READING" in upcaseline):
                data_path = line.split()[-1]
                # remove common prefix
                if data_path.startswith(prefix_filter):
                    data_list.add(data_path[prefix_len:])

    return sorted(data_list)

def get_gcc_filepath(outputdir, collection, day, time):
    """
    Builds the path of a single GEOS-Chem "Classic" diagnostic file.

    Args:
    -----
        outputdir : str
            Directory containing the diagnostic files.

        collection : str
            Name of the diagnostic collection.  "Emissions" selects
            the HEMCO diagnostics file naming; any other name selects
            the "GEOSChem.<collection>" naming.

        day : str
            Date part of the file name.

        time : str
            Time part of the file name.

    Returns:
    --------
        filepath : str
            outputdir joined with the file name for this collection.
    """
    if collection == "Emissions":
        basename = "HEMCO_diagnostics.{}{}.nc".format(day, time)
    else:
        basename = "GEOSChem.{}.{}_{}z.nc4".format(collection, day, time)
    return os.path.join(outputdir, basename)


def get_gchp_filepath(outputdir, collection, day, time):
    """
    Builds the path of a single GCHP diagnostic file.

    Args:
    -----
        outputdir : str
            Directory containing the diagnostic files.

        collection : str
            Name of the diagnostic collection.

        day : str
            Date part of the file name.

        time : str
            Time part of the file name.

    Returns:
    --------
        filepath : str
            outputdir joined with "GCHP.<collection>.<day>_<time>z.nc4".
    """
    basename = "GCHP.{}.{}_{}z.nc4".format(collection, day, time)
    return os.path.join(outputdir, basename)

def _gcplot_is_unset(value):
    """
    Return True when an optional gcplot argument was left unset,
    i.e. it is None or an empty list/dict.
    """
    return value is None or (
        isinstance(value, (list, dict)) and len(value) == 0
    )


def gcplot(plot_vals,
           ax=None,
           plot_type="single_level",
           grid={},
           gridtype="",
           title="fill",
           comap=WhGrYlRd,
           norm=[],
           unit="",
           extent=(None, None, None, None),
           masked_data=None,
           use_cmap_RdBu=False,
           log_color_scale=False,
           add_cb=True,
           pres_range = [0, 2000],
           pedge=np.full((1, 1), -1),
           pedge_ind=np.full((1,1), -1),
           log_yaxis=False,
           xtick_positions=np.arange(-90,91,30),
           xticklabels = []
):
    """
    Core plotting routine -- creates a single plot panel.

    Args:
    -----

        plot_vals : xarray DataArray or numpy array
            Single data variable GEOS-Chem output to plot

    Keyword Args (Optional):
    -----

        ax : matplotlib axes
            Axes object to plot information. Will create a new axes
            if None is passed.

        plot_type : str
            Either "single_level" or "zonal_mean"

        grid : dict
            Dictionary mapping plot_vals to plottable coordinates.
            If empty (or None), the grid is generated from the
            resolution of plot_vals, and gridtype is set accordingly.

        gridtype : str
            "ll" for lat/lon or "cs" for cubed-sphere

        title : str
            Title to put at top of plot.  The default value "fill"
            uses the name of plot_vals when it is a DataArray.

        comap : matplotlib Colormap
            Colormap for plotting data values

        norm : matplotlib Norm
            Norm object mapping data values to the colormap range.
            If empty (or None), it is computed from the minimum and
            maximum of plot_vals with normalize_colors.

        unit : str
            Units of plotted data.  If empty, the "units" attribute
            of plot_vals is used when it is a DataArray.

        extent : tuple (minlon, maxlon, minlat, maxlat)
            Describes minimum and maximum latitude and longitude of input data

        masked_data : numpy array
            Masked area for cubed-sphere plotting

        use_cmap_RdBu : boolean
            Set this flag to True to use a blue-white-red colormap

        log_color_scale : boolean
            Set this flag to True to use a log-scale colormap

        add_cb : boolean
            Set this flag to True to add a colorbar to the plot

        pres_range : list(int)
            Range from minimum to maximum pressure for zonal mean plotting

        pedge : numpy array
            Edge pressures of vertical grid cells in plot_vals.
            The default (an array of -1) selects the GEOS 72-level
            edge pressures.

        pedge_ind : numpy array
            Index of edge pressure values within pressure range in
            plot_vals.  The default (an array of -1) selects the
            edges of pedge that fall within pres_range, padded by one
            edge on either side where possible.

        log_yaxis : boolean
            Set this flag to True to enable log scaling of pressure in zonal mean plots

        xtick_positions : list(float)
            Locations of lat/lon or lon ticks on plot

        xticklabels: list(str)
            Labels for lat/lon ticks.  If empty (or None), labels are
            built from xtick_positions.

    Returns:
    -----

    plot : matplotlib plot
        Plot object created from input
    """

    # NOTE: the list/dict defaults in the signature are kept for
    # backward compatibility; they are only ever rebound, never mutated.
    data_is_xr = isinstance(plot_vals, xr.DataArray)

    #Generate grid if not passed
    if _gcplot_is_unset(grid):
        res, gridtype = get_input_res(plot_vals)
        [grid, _] = call_make_grid(res, gridtype, False, False)

    # Normalize colors (put into range [0..1] for matplotlib methods)
    if _gcplot_is_unset(norm):
        if data_is_xr:
            vmin = plot_vals.data.min()
            vmax = plot_vals.data.max()
        else:
            vmin = np.min(plot_vals)
            vmax = np.max(plot_vals)
        norm = normalize_colors(
            vmin, vmax, is_difference=use_cmap_RdBu, log_color_scale=log_color_scale
        )

    if _gcplot_is_unset(xticklabels):
        # Raw string: "\d" is not a valid escape sequence
        xticklabels = [r"{}$\degree$".format(x) for x in xtick_positions]

    if unit == "" and data_is_xr:
        # Tolerate DataArrays that carry no "units" attribute
        unit = str(plot_vals.attrs.get("units", "")).strip()

    if ax is None:
        if plot_type == "zonal_mean":
            ax = plt.axes()
        else:
            ax = plt.axes(projection = ccrs.PlateCarree())

    if title == "fill" and data_is_xr:
        title = plot_vals.name

    # Create plot
    ax.set_title(title)
    if plot_type == "zonal_mean":
        # NOTE: the "not supplied" sentinels are arrays filled with -1.
        # They must be tested element-wise; "arr.all() == -1" compares a
        # boolean with -1 and is therefore never True.
        if np.all(pedge == -1):
            pedge = GEOS_72L_grid.p_edge()
        if np.all(pedge_ind == -1):
            pedge_ind = np.where(
                (pedge <= np.max(pres_range)) & (pedge >= np.min(pres_range))
            )[0]
            # Pad edges if subset does not include surface or TOA so data spans entire subrange
            top_ind = len(pedge) - 1
            if min(pedge_ind) != 0:
                pedge_ind = np.append(min(pedge_ind) - 1, pedge_ind)
            if max(pedge_ind) != top_ind:
                pedge_ind = np.append(pedge_ind, max(pedge_ind) + 1)
        # Zonal mean plot
        plot = ax.pcolormesh(
            grid["lat_b"], pedge[pedge_ind], plot_vals, cmap=comap, norm=norm
        )
        ax.set_aspect("auto")
        ax.set_ylabel("Pressure (hPa)")
        if log_yaxis:
            ax.set_yscale("log")
            ax.yaxis.set_major_formatter(
                mticker.FuncFormatter(lambda y, _: "{:g}".format(y))
            )
        ax.invert_yaxis()
        ax.set_xticks(xtick_positions)
        ax.set_xticklabels(xticklabels)

    elif gridtype == "ll":
        #Lat/Lon single level
        if all(bound is None for bound in extent):
            extent = get_grid_extents(grid)
        elif data_is_xr:
            [minlon, maxlon, minlat, maxlat] = extent
            #filter data by bounds of extent
            plot_vals = plot_vals.where(plot_vals.lon >= minlon, drop=True)
            plot_vals = plot_vals.where(plot_vals.lon <= maxlon, drop=True)
            plot_vals = plot_vals.where(plot_vals.lat >= minlat, drop=True)
            plot_vals = plot_vals.where(plot_vals.lat <= maxlat, drop=True)
        else:
            #for numpy arrays
            [minlon, maxlon, minlat, maxlat] = extent
            minlon_ind = np.where(grid["lon"] >= minlon)[0][0]
            maxlon_ind = np.where(grid["lon"] <= maxlon)[0][-1]
            minlat_ind = np.where(grid["lat"] >= minlat)[0][0]
            maxlat_ind = np.where(grid["lat"] <= maxlat)[0][-1]
            #assume lat comes first in indexing
            plot_vals = plot_vals[minlat_ind:maxlat_ind+1,minlon_ind:maxlon_ind+1].squeeze()
        # Create a lon/lat plot
        plot = ax.imshow(
            plot_vals, extent=extent, transform=ccrs.PlateCarree(), cmap=comap, norm=norm
        )
        ax.coastlines()
        ax.set_xticks(xtick_positions)
        ax.set_xticklabels(xticklabels)

    else:
        #Cubed-sphere single level
        ax.coastlines()
        if masked_data is None:
            # The mask and the data must have the same shape, so the
            # data is reshaped to the shape of the grid centers.  This
            # also works when the grid was supplied by the caller.
            cs_lon = np.asarray(grid["lon"])
            cs_vals = plot_vals.data if data_is_xr else np.asarray(plot_vals)
            masked_data = np.ma.masked_where(
                np.abs(cs_lon - 180) < 2, cs_vals.reshape(cs_lon.shape)
            )
        [minlon,maxlon,minlat,maxlat] = extent

        #Catch issue with plots extending into both the western and eastern hemisphere
        # (work on a local so the caller's grid dict is not modified)
        lon_b = grid["lon_b"]
        if np.any(lon_b > 180):
            lon_b = (((lon_b+180)%360)-180)
        for j in range(6):
            plot = ax.pcolormesh(
                lon_b[j, :, :],
                grid["lat_b"][j, :, :],
                masked_data[j, :, :],
                transform=ccrs.PlateCarree(),
                cmap=comap,
                norm=norm
            )
        ax.set_xlim(minlon, maxlon)
        ax.set_ylim(minlat, maxlat)
        ax.set_xticks(xtick_positions)
        ax.set_xticklabels(xticklabels)

    if add_cb:
        cb = plt.colorbar(plot, ax=ax, orientation="horizontal", pad=0.10)
        cb.mappable.set_norm(norm)
        cb_vals = plot_vals.values if data_is_xr else np.asarray(plot_vals)
        all_zero, all_nan = all_zero_or_nan(cb_vals)
        if all_zero or all_nan:
            if use_cmap_RdBu:
                cb.set_ticks([0.0])
            else:
                cb.set_ticks([0.5])
            if all_nan:
                cb.set_ticklabels(["Undefined throughout domain"])
            else:
                cb.set_ticklabels(["Zero throughout domain"])
        elif log_color_scale:
            cb.formatter = mticker.LogFormatter(base=10)
        else:
            # Use the range of the norm, which is also defined when
            # the norm was supplied by the caller
            cb_min = getattr(norm, "vmin", None)
            cb_max = getattr(norm, "vmax", None)
            if cb_min is not None and cb_max is not None:
                cb_range = cb_max - cb_min
                if cb_range < 0.1 or cb_range > 100:
                    cb.locator = mticker.MaxNLocator(nbins=4)

        try:
            cb.formatter.set_useOffset(False)
        except AttributeError:
            #not all automatically chosen colorbar formatters provide the above method
            pass
        cb.update_ticks()
        cb.set_label(unit)

    return plot


def get_input_res(data):
    """
    Returns resolution of dataset passed to compare_single_level or compare_zonal_means

    Args:
    -----

        data : xarray Dataset or DataArray
            Input GEOS-Chem data

    Returns:
    -----

        res : str or int
            Lat/lon res of the form 'latresxlonres' or cubed-sphere resolution

        gridtype : str
            'll' for lat/lon or 'cs' for cubed-sphere

    """
    vdims = data.dims
    if "lat" in vdims and "lon" in vdims:
        lat = data["lat"].values
        lon = data["lon"].values
        if lat.size / 6 == lon.size:
            return lon.size, "cs"

        # np.sort returns sorted copies; an in-place sort could
        # reorder the coordinate values held by the caller's data
        lat = np.sort(lat)
        lon = np.sort(lon)
        #use increment of second and third coordinates
        # to avoid polar mischief
        lat_res = np.abs(lat[2]-lat[1])
        lon_res = np.abs(lon[2]-lon[1])
        return str(lat_res) + "x" + str(lon_res), "ll"

    # GCHP data using MAPL v1.0.0+ has dims time, lev, nf, Ydim, and Xdim
    # NOTE: .sizes is a mapping for both Dataset and DataArray objects,
    # whereas DataArray.dims is a tuple and cannot be indexed by name
    return data.sizes["Xdim"], "cs"

def get_nan_mask(data):
    """
    Create a masked array in which the NaN values of an input
    array are masked out

    Args:
    -----

        data : numpy array
            Input array possibly containing NaNs

    Returns:
    -----
        new_data : numpy masked array
            Copy of the original array with its NaN values masked
    """

    # Mask the NaN positions directly.  Comparing the original data
    # against a substituted fill value never matches (the NaNs are
    # still NaN there), which would leave every element unmasked.
    return np.ma.masked_where(np.isnan(data), data)


def call_make_grid(res, gridtype, zonal_mean, comparison, in_extent=[-180,180,-90,90],
                   out_extent=[-180,180,-90,90]):
    """
    Create a grid by dispatching to the lat/lon or the cubed-sphere
    grid builder

    Args:
    -----

        res : str or int
            Resolution of grid (format 'latxlon' or csres)

        gridtype : str
            'll' for lat/lon or 'cs' for cubed-sphere

        zonal_mean : boolean
            Set to True if the output grid is for a zonal mean plot

        comparison : boolean
            Set to True if the output grid is a comparison grid

        in_extent : list (minlon, maxlon, minlat, maxlat)
            Describes minimum and maximum latitude and longitude of input data.
            Only passed on when a lat/lon grid is built.

        out_extent : list (minlon, maxlon, minlat, maxlat)
            Desired minimum and maximum latitude and longitude of output grid.
            Only passed on when a lat/lon grid is built.

    Returns:
    -----
        [grid, grid_list] : list(dict, list(dict))
            Returns the created grid. A lat/lon grid is built when
            gridtype is 'll', or when both zonal_mean and comparison
            are set; grid_list is then None. Otherwise the result of
            make_grid_CS is returned unchanged (presumably the grid
            together with a list of grids -- confirm against
            make_grid_CS).
    """

    # Zonal-mean comparison grids are lat/lon even for cubed-sphere input
    wants_latlon = gridtype == "ll" or (zonal_mean and comparison)
    if not wants_latlon:
        return make_grid_CS(res)

    latlon_grid = make_grid_LL(res, in_extent, out_extent)
    return [latlon_grid, None]

def all_zero_or_nan(ds):
    """
    Report whether an array holds only zeros, or only NaNs

    Args:
    -----
        ds : numpy array
            Input GEOS-Chem data
    Returns:
    -----
        all_zero, all_nan : boolean, boolean
            all_zero is True when no element of ds is nonzero,
            all_nan is True when every element of ds is NaN
    """

    has_nonzero = np.any(ds)
    only_nans = np.isnan(ds).all()
    return not has_nonzero, only_nans

def get_grid_extents(data, edges=True):
    """
    Get min and max lat and lon from an input GEOS-Chem xarray dataset or grid dict

    Args:
    -----
        data : xarray Dataset or dict
            A GEOS-Chem dataset or a grid dict

    Keyword Args (optional):
    ------------------------
        edges : boolean
            Only used when data is a grid dict.  If True, the extents
            are taken from the grid edges ("lon_b"/"lat_b") when the
            dict provides them, and the global extent is returned
            otherwise.  If False, the extents are taken from the grid
            centers ("lon"/"lat").
            Default value: True

    Returns:
    -----
        minlon : float
            Minimum longitude of data grid

        maxlon : float
            Maximum longitude of data grid

        minlat : float
            Minimum latitude of data grid

        maxlat : float
            Maximum latitude of data grid
    """

    if isinstance(data, dict):
        if "lon_b" in data and edges:
            return np.min(data["lon_b"]), np.max(data["lon_b"]), np.min(data["lat_b"]), np.max(data["lat_b"])
        elif not edges:
            return np.min(data["lon"]), np.max(data["lon"]), np.min(data["lat"]), np.max(data["lat"])
        else:
            return -180, 180, -90, 90

    if "lat" in data.dims and "lon" in data.dims:
        # np.sort returns sorted copies of the coordinate values
        lat = np.sort(data["lat"].values)
        lon = np.sort(data["lon"].values)
        if lat.size / 6 == lon.size:
            #No extents for CS plots right now
            return -180, 180, -90, 90

        minlat = np.min(lat)
        maxlat = np.max(lat)
        # A first/last spacing that differs from its neighbor marks a
        # cut-off polar cell.  The values are sorted, so the plain
        # differences are the spacings (taking abs() of each latitude
        # first gives wrong spacings for cells that straddle the equator).
        if lat.size >= 3:
            if (lat[1] - lat[0]) != (lat[2] - lat[1]):
                #pole is cutoff
                minlat = minlat - 1
            if (lat[-1] - lat[-2]) != (lat[-2] - lat[-3]):
                maxlat = maxlat + 1

        #add longitude res to max longitude
        minlon = np.min(lon)
        maxlon = np.max(lon)
        if lon.size >= 2:
            maxlon = maxlon + abs(lon[-1] - lon[-2])
        return minlon, maxlon, minlat, maxlat

    # GCHP data using MAPL v1.0.0+ has dims time, lev, nf, Ydim, and Xdim
    return -180, 180, -90, 90


def get_vert_grid(dataset):
    """
    Select the vertical grid that matches the number of levels
    of the input dataset

    Args:
    -----
        dataset : xarray Dataset
            A GEOS-Chem output dataset
    Returns:
    -----
        p_edge : numpy array
            Edge pressure values for vertical grid

        p_mid  : numpy array
            Midpoint pressure values for vertical grid

        nlev : int
            Number of levels in vertical grid (72 or 47)

    Raises:
    -----
        ValueError
            If the "lev" dimension has a size other than 72, 73, 47 or 48
    """

    n_lev_in = dataset.sizes["lev"]

    if n_lev_in not in (72, 73, 47, 48):
        raise ValueError("Only 72/73 or 47/48 level vertical grids are supported")

    # 72/73 levels -> 72-level grid, 47/48 levels -> 47-level grid
    if n_lev_in in (72, 73):
        vert_grid, nlev = GEOS_72L_grid, 72
    else:
        vert_grid, nlev = GEOS_47L_grid, 47

    return vert_grid.p_edge(), vert_grid.p_mid(), nlev

def get_pressure_indices(pedge, pres_range):
    """
    Find the indices of the edge pressures that lie within a
    given pressure range (bounds included)

    Args:
    -----
        pedge : numpy array
            Edge pressure values

        pres_range : list(float, float)
            Contains minimum and maximum pressure

    Returns:
    -----
        numpy array
            Indices where edge pressure values are within a given pressure range
    """

    lowest_pres = np.min(pres_range)
    highest_pres = np.max(pres_range)
    in_range = (pedge <= highest_pres) & (pedge >= lowest_pres)
    return np.where(in_range)[0]

def pad_pressure_edges(pedge_ind, max_ind):
    """
    Extend a list of edge pressure indices by one index on either
    side, unless the list already reaches the first or last index

    Args:
    -----
        pedge_ind : list
            List of edge pressure indices

        max_ind : int
            Maximum index.  The values 48 and 73 are reduced by one
            before being compared with the largest index.

    Returns:
    -----
        pedge_ind : list
            List of edge pressure indices, possibly with new minimum and maximum indices
    """

    top_ind = max_ind - 1 if max_ind in (48, 73) else max_ind

    # Prepend the index just below the current minimum
    lowest = min(pedge_ind)
    if lowest != 0:
        pedge_ind = np.append(lowest - 1, pedge_ind)

    # Append the index just above the current maximum
    highest = max(pedge_ind)
    if highest != top_ind:
        pedge_ind = np.append(pedge_ind, highest + 1)

    return pedge_ind
    
def convert_lev_to_pres(dataset, pmid, pedge):
    """
    Replace the "lev" values of a GEOS-Chem dataset with pressures

    Args:
    -----
        dataset : xarray Dataset
            GEOS-Chem dataset

        pmid : np.array
            Midpoint pressure values, used when "lev" has 72 or 47 entries

        pedge : np.array
            Edge pressure values, used when "lev" has 73 or 48 entries

    Returns:
    -----
        dataset : xarray Dataset
            Input dataset with "lev" dimension values replaced with pressure values

    Raises:
    -----
        ValueError
            If "lev" has a size other than 72, 73, 47 or 48
    """

    n_lev = dataset.sizes["lev"]

    if n_lev in (72, 47):
        pressures = pmid
    elif n_lev in (73, 48):
        pressures = pedge
    else:
        raise ValueError(
            "compare_zonal_mean implemented for 72, 73, 47, or 48 levels only. "
            "Other values found in Ref."
        )

    dataset["lev"] = pressures
    # Describe the new coordinate values
    dataset["lev"].attrs["unit"] = "hPa"
    dataset["lev"].attrs["long_name"] = "level pressure"
    return dataset