## Cleaning up Atlas data - UoR CALL
**Function**      : Preprocess netCDF files and restructure the dataset<br>
**Description**   : This notebook cleans up the Atlas data, which is provided in netCDF format, and aggregates the per-percentile files into one file per variable and sub-method.<br>
**Return Values** : .nc files<br>
**Note**          : All the data is saved to netCDF4 format. Note that data from different models may vary concerning the resolution and coordinates.<br>

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import xarray as xr

### Path
Specify the path to the dataset and the place to save the outputs. <br>

In [2]:
# Directory the raw input files are read from (edit to match your setup).
datapath = Path("./AtlasData/raw")

# Directory the processed files are written to; it is created, together with
# any missing parent directories, when it does not exist yet.
output_path = Path("./AtlasData/preprocess")
output_path.mkdir(parents=True, exist_ok=True)

Components used to create the output file names. Here, only `institution_id` and `cmor_var` are based on the CMIP DRS conventions.

In [3]:
# Components of the output file name. The values are joined with "_" in the
# order given here, so the key order matters.
output_file_name = {
    "prefix": "atlas",
    # project name e.g. EUCP
    "activity": "EUCP",
    # UoR
    "institution_id": "UoR",
    # e.g. CMIP6 or CMIP5
    "source": "CMIP5",
    # e.g. CALL
    "method": "CALL",
    # e.g. cons or uncons
    "sub_method": "cons",
    # e.g. tas or pr
    "cmor_var": "tas",
}

### Load and process raw data

Define a preprocessor that adds the percentile dimension to each file, and a loader that combines the files along multiple dimensions.

In [4]:
# Institution and method labels, read once from the file-name components.
INSTITUTION_ID, METHOD = (
    output_file_name[key] for key in ("institution_id", "method")
)


def add_percentile(ds):
    """Label a dataset with the percentile encoded in its source file name.

    The second-to-last ``_``-separated token of the source path (e.g.
    ``10perc`` in ``tas_djf_10perc_CONST.nc``) is expected to start with the
    percentile. All leading digits are used, so one- and three-digit
    percentiles are parsed correctly as well.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset whose ``encoding["source"]`` holds the path it was read from.

    Returns
    -------
    xarray.Dataset
        ``ds`` with a scalar ``percentile`` coordinate expanded into a new
        length-1 ``percentile`` dimension.

    Raises
    ------
    ValueError
        If the token does not start with a digit.
    """
    filename = ds.encoding["source"]
    token = filename.split("_")[-2]

    # Length of the leading run of digits: whatever lstrip() removed.
    n_digits = len(token) - len(token.lstrip("0123456789"))
    if n_digits == 0:
        raise ValueError(
            f"Cannot read a percentile from {token!r} in file name {filename!r}"
        )
    percentile = int(token[:n_digits])

    return ds.assign_coords(percentile=percentile).expand_dims("percentile")


# data loader and batch processing
def load_data(project, season, variable):
    """Load the constrained and unconstrained percentile files of one variable.

    Parameters
    ----------
    project : str
        Accepted for the caller's convenience; not used inside this function.
    season : str
        Season tag as it appears in the file names (e.g. ``"djf"``).
    variable : str
        Variable tag as it appears in the file names (e.g. ``"tas"``); also
        the name given to the returned array.

    Returns
    -------
    xarray.DataArray
        The ``VARchange`` field of both file sets, renamed to ``variable`` and
        concatenated along a new ``constrained`` dimension
        (1 = ``CONST`` files, 0 = ``UNCONST`` files).
    """

    def _open_labelled(file_pattern, flag):
        # Open every percentile file matching the pattern as one dataset, then
        # keep only the change field and tag it with the constrained flag.
        pattern = Path(datapath, f"{INSTITUTION_ID}_{METHOD}", file_pattern)
        dataset = xr.open_mfdataset(str(pattern), preprocess=add_percentile)
        field = dataset["VARchange"].rename(variable)
        return field.assign_coords(constrained=flag).expand_dims("constrained")

    weighted = _open_labelled(f"{variable}_{season}_*perc_CONST.nc", 1)
    unweighted = _open_labelled(f"{variable}_{season}_*perc_UNCONST.nc", 0)
    return xr.concat([weighted, unweighted], dim="constrained")

Call functions

In [5]:
project = output_file_name["source"].lower()
seasons = []
for season in ("djf", "jja"):
    # load both variables of this season and merge them into one dataset
    tas, pr = (load_data(project, season, name) for name in ("tas", "pr"))
    ds = xr.merge([tas, pr]).assign_coords(season=season.upper())
    seasons.append(ds)
# stack the seasons, then re-arrange the trailing dimensions
# from (lon, lat) to (lat, lon)
uor_call_ds = xr.concat(seasons, dim="season").transpose(..., "lat", "lon")
uor_call_ds

Unnamed: 0,Array,Chunk
Bytes,53.12 kiB,2.66 kiB
Shape,"(2, 2, 5, 17, 20)","(1, 1, 1, 17, 20)"
Count,180 Tasks,20 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 53.12 kiB 2.66 kiB Shape (2, 2, 5, 17, 20) (1, 1, 1, 17, 20) Count 180 Tasks 20 Chunks Type float64 numpy.ndarray",2  2  20  17  5,

Unnamed: 0,Array,Chunk
Bytes,53.12 kiB,2.66 kiB
Shape,"(2, 2, 5, 17, 20)","(1, 1, 1, 17, 20)"
Count,180 Tasks,20 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,53.12 kiB,2.66 kiB
Shape,"(2, 2, 5, 17, 20)","(1, 1, 1, 17, 20)"
Count,180 Tasks,20 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 53.12 kiB 2.66 kiB Shape (2, 2, 5, 17, 20) (1, 1, 1, 17, 20) Count 180 Tasks 20 Chunks Type float64 numpy.ndarray",2  2  20  17  5,

Unnamed: 0,Array,Chunk
Bytes,53.12 kiB,2.66 kiB
Shape,"(2, 2, 5, 17, 20)","(1, 1, 1, 17, 20)"
Count,180 Tasks,20 Chunks
Type,float64,numpy.ndarray


Make some metadata. Here, we follow CF-conventions as much as possible.

In [6]:
# Both variables describe the same kind of change, so they share this text.
_CHANGE_CELL_METHODS = "time: mean changes over 20 years 2041-2060 vs 1995-2014"

# Per-variable attributes, keyed by the variable/coordinate name they belong to.
attrs = {
    "tas": {
        "description": "Change in Air Temperature",
        "standard_name": "Change in Air Temperature",
        "long_name": "Change in Near-Surface Air Temperature",
        # units are in line with raw data
        "units": "K",
        "cell_methods": _CHANGE_CELL_METHODS,
    },
    "pr": {
        "description": "Relative precipitation",
        "standard_name": "Relative precipitation",
        "long_name": "Relative precipitation",
        # units are in line with raw data
        "units": "%",
        "cell_methods": _CHANGE_CELL_METHODS,
    },
    "latitude": {
        "units": "degrees_north",
        "long_name": "latitude",
        "axis": "Y",
    },
    "longitude": {
        "units": "degrees_east",
        "long_name": "longitude",
        "axis": "X",
    },
    "time": {
        "climatology": "climatology_bounds",
        "long_name": "time",
        "axis": "T",
        "climatology_bounds": ["2050-6-1", "2050-9-1", "2050-12-1", "2051-3-1"],
        "description": "mean changes over 20 years 2041-2060 vs 1995-2014. The mid point 2050 is chosen as the representative time.",
    },
    "percentile": {
        "units": "%",
        "long_name": "percentile",
        "axis": "Z",
    },
}

### Assemble data and save to netcdf

Make a function to assemble the data into the target dataset structure; saving happens in the next cell.

In [7]:
# Representative date for each season, in the order used along the time axis.
# (MAM and SON are not part of this dataset.)
TIMES = {"JJA": "2050-7-16", "DJF": "2051-1-16"}

# Percentile values expected along the percentile axis.
PERCENTILES = [10, 25, 50, 75, 90]

# Spatial coordinates, taken from the loaded dataset.
LAT, LON = (uor_call_ds.coords[name] for name in ("lat", "lon"))


def assembly(ds_original, var, cons):
    """
    Build the target dataset for one variable and fill it from the loaded data.

    A NaN-filled array with dimensions (time, latitude, longitude, percentile)
    is created and then filled, slice by slice, with the fields selected from
    ``ds_original``.

    Parameters
    ----------
    ds_original : dataset indexable by ``var``, with ``percentile``,
        ``season`` and ``constrained`` coordinates to select on.
    var : name of the variable to extract; also the name of the output variable.
    cons : value of the ``constrained`` coordinate to select.

    Returns
    -------
    The assembled ``xr.Dataset``.
    """
    dims = ("time", "latitude", "longitude", "percentile")
    shape = [len(TIMES), len(LAT), len(LON), len(PERCENTILES)]

    data_vars = {
        var: (dims, np.full(shape, np.nan)),
        # NOTE(review): passed as a bare DatetimeIndex without dimension names;
        # confirm that xarray stores it the way the `climatology` attribute of
        # the time axis expects.
        "climatology_bounds": (
            pd.to_datetime(["2050-6-1", "2050-9-1", "2050-12-1", "2051-3-1"])
        ),
    }
    coordinates = {
        "time": pd.to_datetime(list(TIMES.values())),
        "latitude": LAT.values,
        "longitude": LON.values,
        "percentile": PERCENTILES,
    }
    global_attrs = {
        "description": f"Contains modified {INSTITUTION_ID} {METHOD} data used for Atlas in EUCP project.",
        "history": (
            f"original {INSTITUTION_ID} {METHOD} data files pr_djf_*perc_CONST.nc, pr_jja_*perc_CONST.nc,"
            "pr_djf_*perc_UNCONST.nc, pr_jja_*perc_UNCONST.nc,"
            "tas_djf_*perc_CONST.nc, tas_jja_*perc_CONST.nc,"
            "tas_djf_*perc_UNCONST.nc, tas_jja_*perc_UNCONST.nc"
        ),
    }
    ds_target = xr.Dataset(data_vars, coords=coordinates, attrs=global_attrs)

    time_order = list(TIMES)
    for season in ("JJA", "DJF"):
        # position of this season along the time axis
        t_idx = time_order.index(season)
        for p_idx, pct in enumerate(PERCENTILES):
            selected = ds_original[var].sel(
                percentile=pct, season=season, constrained=cons
            )
            ds_target[var].values[t_idx, :, :, p_idx] = selected.values
    return ds_target

Call the function

In [8]:
# The position in this list doubles as the value of the `constrained`
# coordinate (0 = uncons, 1 = cons).
PROJECTIONS = ["uncons", "cons"]
for VAR_NAME in ["tas", "pr"]:
    output_file_name["cmor_var"] = VAR_NAME
    for cons_flag, projection in enumerate(PROJECTIONS):
        output_file_name["sub_method"] = projection
        new_ds = assembly(uor_call_ds, VAR_NAME, cons_flag)

        # Fix attributes
        # NOTE(review): iterating a Dataset appears to yield data variables
        # only, so the coordinate entries of `attrs` (latitude, longitude,
        # time, percentile) may never be applied here - confirm against the
        # xarray version in use.
        for key in new_ds:
            new_ds[key].attrs = attrs[key]

        file_name = "_".join(output_file_name.values()) + ".nc"
        print(f"one dataset is saved to {file_name}")
        new_ds.to_netcdf(output_path / file_name)

one dataset is saved to atlas_EUCP_UoR_CMIP5_CALL_uncons_tas.nc
one dataset is saved to atlas_EUCP_UoR_CMIP5_CALL_cons_tas.nc
one dataset is saved to atlas_EUCP_UoR_CMIP5_CALL_uncons_pr.nc
one dataset is saved to atlas_EUCP_UoR_CMIP5_CALL_cons_pr.nc


### Check output

Load one of the saved data.

In [10]:
# Re-open one of the files written above and display it as a sanity check.
ds = xr.open_dataset(output_path.joinpath("atlas_EUCP_UoR_CMIP5_CALL_cons_tas.nc"))
ds