## Cleaning up Atlas data - UKMO UKCP
**Function**      : Preprocess netCDF files and restructure the dataset<br>
**Author**        : Team BETA<br>
**First Built**   : 2021.09.15<br>
**Last Update**   : 2021.09.30<br>
**Library**       : os, numpy, netcdf4, xarray<br>
**Description**   : This notebook cleans up Atlas data, which is provided in netCDF format, and aggregates it into a single file.<br>
**Return Values** : .nc files<br>
**Note**          : All the data is saved to netCDF4 format. Note that data from different models may vary concerning the resolution and coordinates.<br>

In [1]:
import os
import numpy as np
import xarray as xr

### Path
Specify the path to the dataset and the place to save the outputs. <br>

In [2]:
# directory that is searched for the input netCDF files (adjust to your setup)
datapath = '/mnt/d/NLeSC/BETA/EUCP/Atlas/UKMO_UKCP/Data'
# directory that receives the preprocessed output (adjust to your setup)
output_path = '/mnt/d/NLeSC/BETA/EUCP/Atlas/preprocess'
# make sure the output directory exists; no error if it is already there
os.makedirs(output_path, exist_ok=True)

### Extract data
Extract weather/climate data from given netCDF files.

In [3]:
# UKMO UKCP
# Open one file (the 'Wall' DJF prAnom file) for a first look at the
# structure of the data; the last line displays it in the notebook.
sample_file = 'prAnom_rcp85_eu_300km_Wall-N600000-P21_cdf_b9514_20y_djf_20401201-20601130.nc'
sample_path = os.path.join(datapath, 'prAnom', 'djf', sample_file)
dataset_pr_djf = xr.open_dataset(sample_path)
dataset_pr_djf

In [4]:
# Print the percentile value stored at each candidate index so it can be
# compared against the target percentiles.
# NOTE(review): in the recorded output index 75 prints 70.0, not 75.0.
percentile_axis = dataset_pr_djf.coords['percentile'].values
for idx in (15, 30, 55, 75, 95):
    print(percentile_axis[idx])

10.0
25.0
50.0
70.0
90.0


In [5]:
# Walk the data directory and map '<var>_<season>_<cons|uncons>' to the full
# path of every netCDF file found (only paths are collected here; the files
# themselves are opened later).
nc_files = {}
nc_files_namelist = []
for files_root, files_dirs, files in os.walk(datapath):
    for ncfile in (name for name in files if name.endswith(".nc")):
        tokens = ncfile.split('_')
        var = tokens[0][:-4]   # first token minus its last 4 characters, e.g. 'prAnom' -> 'pr'
        season = tokens[-2]    # second-to-last token, e.g. 'djf'
        cons = tokens[4][:2]   # first two characters of the fifth token
        # a fifth token starting with 'Wa' marks the constrained files
        suffix = 'cons' if cons == 'Wa' else 'uncons'
        nc_files[f'{var}_{season}_{suffix}'] = os.path.join(files_root, ncfile)
print(nc_files)

{'pr_djf_cons': '/mnt/d/NLeSC/BETA/EUCP/Atlas/UKMO_UKCP/Data/prAnom/djf/prAnom_rcp85_eu_300km_Wall-N600000-P21_cdf_b9514_20y_djf_20401201-20601130.nc', 'pr_djf_uncons': '/mnt/d/NLeSC/BETA/EUCP/Atlas/UKMO_UKCP/Data/prAnom/djf/prAnom_rcp85_eu_300km_Wprior-N600000-P21_cdf_b9514_20y_djf_20401201-20601130.nc', 'pr_jja_cons': '/mnt/d/NLeSC/BETA/EUCP/Atlas/UKMO_UKCP/Data/prAnom/jja/prAnom_rcp85_eu_300km_Wall-N600000-P21_cdf_b9514_20y_jja_20401201-20601130.nc', 'pr_jja_uncons': '/mnt/d/NLeSC/BETA/EUCP/Atlas/UKMO_UKCP/Data/prAnom/jja/prAnom_rcp85_eu_300km_Wprior-N600000-P21_cdf_b9514_20y_jja_20401201-20601130.nc', 'tas_djf_cons': '/mnt/d/NLeSC/BETA/EUCP/Atlas/UKMO_UKCP/Data/tasAnom/djf/tasAnom_rcp85_eu_300km_Wall-N600000-P21_cdf_b9514_20y_djf_20401201-20601130.nc', 'tas_djf_uncons': '/mnt/d/NLeSC/BETA/EUCP/Atlas/UKMO_UKCP/Data/tasAnom/djf/tasAnom_rcp85_eu_300km_Wprior-N600000-P21_cdf_b9514_20y_djf_20401201-20601130.nc', 'tas_jja_cons': '/mnt/d/NLeSC/BETA/EUCP/Atlas/UKMO_UKCP/Data/tasAnom/jja/ta

In [6]:
# Create an empty dataset to host the processed data.
# The variables are pre-filled with NaN (instead of random numbers) so that
# any slot that is never filled afterwards shows up as missing rather than as
# plausible-looking values.
ds_dims = ("season", "constrained", "percentile", "lat", "lon")
ds_coords = {
    "season": ["DJF", "JJA"],
    "constrained": [1, 0],
    "percentile": [10, 25, 50, 75, 90],
    # latitude is reversed relative to the input file
    "lat": dataset_pr_djf.coords["latitude"].values[::-1],
    "lon": dataset_pr_djf.coords["longitude"].values[:],
}
# derive the array shape from the coordinates instead of hard-coding it
ds_shape = tuple(len(ds_coords[dim]) for dim in ds_dims)
ds = xr.Dataset(
    {
        # one separate array per variable, so that in-place writes to one
        # variable can never affect the other
        "tas": (ds_dims, np.full(ds_shape, np.nan)),
        "pr": (ds_dims, np.full(ds_shape, np.nan)),
    },
    coords=ds_coords,
    attrs={"description": "UKMO UKCP data."},
)
ds

In [7]:
# Assemble the data: copy the target percentiles of every input file into the
# matching (season, constrained) slot of `ds`, then save `ds` to netCDF.
constrained = ["cons", "uncons"]
seasons = ["DJF", "JJA"]
percentile = [10, 25, 50, 75, 90]
key_c = dict(zip(constrained, range(len(constrained))))
key_s = dict(zip(seasons, range(len(seasons))))
# values above this threshold are treated as missing
missing_threshold = 1.1e+10

for f in nc_files:
    # string operations: keys look like '<var>_<season>_<cons|uncons>'
    var, season, cons = f.split('_')[:3]
    # load dataset via xarray; the context manager closes the file afterwards
    with xr.open_dataset(nc_files[f]) as dataset:
        anom = dataset[f'{var}Anom'].values
        percentile_axis = dataset.coords['percentile'].values
    # set all missing values to nan (on a copy, the loaded array is not modified)
    anom = np.where(anom > missing_threshold, np.nan, anom)
    # Find the position of each target percentile from the coordinate values
    # rather than from hard-coded indices: the earlier check cell shows that
    # index 75 holds the 70.0 percentile, so a fixed index list silently
    # stored the wrong data under the "75" label.
    key_p_sel = {}
    for p in percentile:
        hits = np.flatnonzero(np.isclose(percentile_axis, p))
        if hits.size != 1:
            raise ValueError(
                f"percentile {p} was found {hits.size} times in {nc_files[f]}"
            )
        key_p_sel[p] = int(hits[0])
    # save values to the target array; the first axis is reversed to match the
    # reversed latitude coordinate of `ds`
    for i, p in enumerate(percentile):
        ds[f'{var}'].values[key_s[season.upper()], key_c[cons], i, :, :] = anom[::-1, :, key_p_sel[p]]

# save to netcdf
ds.to_netcdf(os.path.join(output_path, 'cleaned_UKMO_UKCP_CMIP6.nc'))

### Check output
Preview saved data via hvplot. <br>

In [8]:
# Read the saved file back completely into memory and close it right away,
# so that no open handle is left on the output file (an open handle may get
# in the way when the cell that writes this file is run again).
ds = xr.load_dataset(os.path.join(output_path, 'cleaned_UKMO_UKCP_CMIP6.nc'))
ds