## Cleaning up Atlas data - CNRM HistC
**Function**      : Preprocess netCDF files and restructure the dataset<br>
**Author          : Team BETA**<br>
**First Built**   : 2021.09.14<br>
**Last Update     : 2021.10.01**<br>
**Library**       : os, numpy, netcdf4, xarray<br>
**Description**   : This notebook cleans up Atlas data, which is provided in netCDF format, and aggregates the data into a single file.<br>
**Return Values   : .nc files**<br>
**Note**          : All the data is saved to netCDF4 format. Note that data from different models may vary concerning the resolution and coordinates.<br>

In [1]:
import os
import numpy as np
import xarray as xr

### Path
Specify the path to the dataset and the place to save the outputs. <br>

In [2]:
# Input location: directory that holds the raw Atlas netCDF files (edit as needed).
datapath = "/home/sarah/GitHub/atlas/AtlasData/raw"
# Output location: directory that receives the preprocessed files (edit as needed).
output_path = "/home/sarah/GitHub/atlas/AtlasData/preprocess"
# Create the output directory up front; an already existing directory is fine.
os.makedirs(output_path, exist_ok=True)

Only `institution_id` and `cmor_var` are based on the CMIP DRS.

In [3]:
# Components of the output file name, in the order in which they are joined.
output_file_name = dict(
    prefix="atlas",
    activity="EUCP",        # project name e.g. EUCP
    institution_id="CNRM",  # CNRM
    source="CMIP6",         # e.g. CMIP6 or CMIP5
    method="HistC",         # e.g. HistC
    sub_method="cons",      # e.g. cons or uncons
    cmor_var="tas",         # e.g. tas or pr
)

### Extract data
Extract weather/climate data from given netCDF files.

In [4]:
# CNRM HistC -- winter (DJF) file
# Opened first and displayed for a quick look at its contents.
dataset_tas_djf = xr.open_dataset(
    os.path.join(
        datapath,
        'CNRS_HistC',
        'CNRM_atlas_tas_CMIP6_histssp585_DJF_latlon.nc',
    )
)
dataset_tas_djf

In [5]:
# CNRM HistC -- summer (JJA) file, loaded from the same directory as the DJF file
dataset_tas_jja = xr.open_dataset(
    os.path.join(
        datapath,
        'CNRS_HistC',
        'CNRM_atlas_tas_CMIP6_histssp585_JJA_latlon.nc',
    )
)
dataset_tas_jja

In [6]:
# Inspect the coordinate values that cover the target region:
# latitude entries 48..66, the first 16 longitudes, and the last 4 longitudes
# shifted by -360.
print(
    dataset_tas_djf["lat"][48:67],
    dataset_tas_djf["lon"][:16],
    dataset_tas_djf["lon"][-4:] - 360,
    sep="\n",
)

<xarray.DataArray 'lat' (lat: 19)>
array([31.25, 33.75, 36.25, 38.75, 41.25, 43.75, 46.25, 48.75, 51.25, 53.75,
       56.25, 58.75, 61.25, 63.75, 66.25, 68.75, 71.25, 73.75, 76.25])
Coordinates:
  * lat      (lat) float64 31.25 33.75 36.25 38.75 ... 68.75 71.25 73.75 76.25
Attributes:
    units:      degrees_north
    long_name:  lat
    axis:       Y
<xarray.DataArray 'lon' (lon: 16)>
array([ 1.25,  3.75,  6.25,  8.75, 11.25, 13.75, 16.25, 18.75, 21.25, 23.75,
       26.25, 28.75, 31.25, 33.75, 36.25, 38.75])
Coordinates:
  * lon      (lon) float64 1.25 3.75 6.25 8.75 11.25 ... 31.25 33.75 36.25 38.75
Attributes:
    units:      degrees_east
    long_name:  lon
    axis:       X
<xarray.DataArray 'lon' (lon: 4)>
array([-8.75, -6.25, -3.75, -1.25])
Coordinates:
  * lon      (lon) float64 351.2 353.8 356.2 358.8


In [6]:
# Convert the longitudes from the 0..360 convention to -180..180 and re-sort
# each dataset along "lon", so the slice-based longitude selection used
# further down works on increasing coordinate values.
dataset_tas_djf.coords['lon'] = (dataset_tas_djf.coords['lon'] + 180) % 360 - 180
dataset_tas_djf = dataset_tas_djf.sortby('lon')

dataset_tas_jja.coords['lon'] = (dataset_tas_jja.coords['lon'] + 180) % 360 - 180
# Sort the JJA dataset itself: sorting the DJF dataset here would silently
# replace the summer data with the winter data.
dataset_tas_jja = dataset_tas_jja.sortby('lon')

In [26]:
VAR_NAME = "tas"
# The dataset-construction and export cells refer to the variable name as
# CMOR_VAR; define it here so they do not raise NameError on a fresh kernel.
CMOR_VAR = VAR_NAME
SEASON = ["DJF", "JJA"]
PERCENTILE = [10, 25, 75, 90] # TODO add 50
# select Europe: coordinate values inside the lat/lon window of interest
LAT = dataset_tas_djf.coords['lat'].sel(lat=slice(30, 77))
LON = dataset_tas_djf.coords['lon'].sel(lon=slice(-9, 39))

In [37]:
# Create the target dataset that hosts the processed fields.
# The data variable is pre-filled with NaN (not random numbers) so that any
# cell that is not overwritten later is visibly missing in the output.
ds = xr.Dataset(
    {
        VAR_NAME: (
            ("longitude", "latitude", "season", "percentile"),
            np.full((len(LON), len(LAT), len(SEASON), len(PERCENTILE)), np.nan),
        )
    },
    coords={
        "longitude": LON.values,
        "latitude": LAT.values,
        "season": SEASON,  # TODO use Climatological seasons
        "percentile": PERCENTILE,
    },
    attrs={
        "description": "Contains modified CNRM/S HistC data used for Atlas in EUCP project.",
        # Provenance: both input file names as they are read by the loader cells.
        "history": "original CNRM/S HistC CNRM_atlas_tas_CMIP6_histssp585_JJA_latlon.nc, CNRM_atlas_tas_CMIP6_histssp585_DJF_latlon.nc",
    },
)
ds

Fix the attributes of each variable.

In [41]:
# Metadata for the data variable and for every coordinate of `ds`,
# keyed by the variable/coordinate name.
attrs = {
    "tas": {
        "description": "Change in Air Temperature.",
        "standard_name": "Air Temperature",
        "long_name": "Near-Surface Air Temperature",
        "units": "degC",
    },
    "longitude": {"units": "degrees_east", "long_name": "longitude", "axis": "X"},
    "latitude": {"units": "degrees_north", "long_name": "latitude", "axis": "Y"},
    "season": {"units": "", "long_name": "season", "axis": "T"},
    "percentile": {"units": "%", "long_name": "percentile", "axis": "Z"},
}

# Attach each metadata dictionary to the entry of `ds` with the same name.
for name, metadata in attrs.items():
    ds[name].attrs = metadata

ds

In [19]:
# assemble data from the original files into the target dataset (in memory)
def assembly(ds_original, ds_target, var, season, cons, percentile,
             lon_bounds=(-9, 39), lat_bounds=(30, 77)):
    """
    Copy the cropped percentile fields of one season into the target dataset.

    For every requested percentile ``p`` the variable ``q{p}_{cons}`` is read
    from ``ds_original``, cropped to the given lon/lat window, transposed to
    (lon, lat) order and written in place into ``ds_target[var]`` at the
    positions of ``season`` and ``p`` along the target's "season" and
    "percentile" coordinates. Nothing is written to disk here.

    Parameters
    ----------
    ds_original : dataset providing the ``q{p}_{cons}`` variables on
        "lat"/"lon" coordinates.
    ds_target : dataset whose variable ``var`` has the dimension order
        (longitude, latitude, season, percentile); modified in place.
    var : name of the variable in ``ds_target`` to fill.
    season : season label, must be present in ``ds_target["season"]``.
    cons : suffix of the source variables, e.g. "cons" or "uncons".
    percentile : iterable of percentiles to copy; each must be present in
        ``ds_target["percentile"]``.
    lon_bounds, lat_bounds : (min, max) pairs used for the slice selection;
        the defaults are the European window used in this notebook.

    Raises
    ------
    ValueError
        If ``season`` or one of the percentiles is not found in the
        corresponding coordinate of ``ds_target``.
    """
    # Resolve positions from the target's own coordinates instead of relying
    # on notebook-level globals.
    season_idx = list(ds_target["season"].values).index(season)
    target_percentiles = list(ds_target["percentile"].values)
    lon_window = slice(*lon_bounds)
    lat_window = slice(*lat_bounds)

    for p in percentile:
        pct_idx = target_percentiles.index(p)

        # select the region of interest
        ds_eu = ds_original[f"q{p}_{cons}"].sel(lon=lon_window, lat=lat_window)

        # the target stores longitude first, so reorder the source axes
        ds_eu_reshaped = ds_eu.transpose("lon", "lat")

        ds_target[var].values[:, :, season_idx, pct_idx] = ds_eu_reshaped.values
        

In [20]:
# Fill the target dataset for each projection variant and export it as netCDF.
# VAR_NAME is used because it is the variable-name constant this notebook
# actually defines.
output_file_name["cmor_var"] = VAR_NAME
for projection in ["uncons", "cons"]:
    output_file_name["sub_method"] = projection
    # DJF
    assembly(dataset_tas_djf, ds, VAR_NAME, "DJF", projection, ds.coords["percentile"].values)

    # JJA
    assembly(dataset_tas_jja, ds, VAR_NAME, "JJA", projection, ds.coords["percentile"].values)

    file_name = f"{'_'.join(output_file_name.values())}.nc"
    ds.to_netcdf(os.path.join(output_path, file_name))
    # Report only after the file has actually been written.
    print(f"one dataset is saved to {file_name}")

one dataset is saved to atlas_EUCP_CNRM_CMIP6_HistC_uncons_tas.nc
one dataset is saved to atlas_EUCP_CNRM_CMIP6_HistC_cons_tas.nc


### Check output
Preview saved data via hvplot. <br>

In [21]:
# Re-open the constrained ("cons") output file from disk to inspect what was stored.
ds = xr.open_dataset(
    os.path.join(output_path, "atlas_EUCP_CNRM_CMIP6_HistC_cons_tas.nc")
)
ds