## **CORDEX data pre-processing for East Africa drought study**

In [1]:
import glob, re, regionmask, geopandas as gpd, numpy as np, xarray as xr
xr.set_options(keep_attrs = True)

from xclim.core.units import convert_units_to
from xclim.core.calendar import convert_calendar

from xclim.indices._conversion import potential_evapotranspiration

# load shapefile of study region
sf = gpd.read_file("sf_gha")

ERROR 1: PROJ: proj_create_from_database: Open of /rds/general/user/cb2714/home/anaconda3/envs/wwa/share/proj failed


---
### **Precipitation**
- OND accumulated precipitation averaged over the study region
- MAM accumulated precipitation averaged over the study region
- 2-year accumulated precipitation (Jan-Dec) averaged over the study region
- seasonal cycle over the study region
- spatial pattern of OND and MAM precip over the region from 28E-54E and 15S-18N

In [6]:
for fnm in sorted(glob.glob("../synda/data/CORDEX/AFR-*/*/*/*/day/pr")):
    
    mdl_fl = sorted(glob.glob(fnm+"/*.nc"))
    new_fnm = re.sub("_[0-9]{8}-", "_"+mdl_fl[0][-20:-12]+"-", mdl_fl[-1].split("/")[-1])
    print(new_fnm)
    
    # skip if file already exists
#     if len(glob.glob(new_fnm)) > 0: continue
    if len(glob.glob(new_fnm)) > 0: continue
      
    # also skip if not both historical & RCP runs available
    if not all([any([s in fnm for fnm in mdl_fl]) for s in ["historical", "rcp85"]]): continue
    
    # otherwise - carry on
    rcp = xr.open_mfdataset([fnm for fnm in mdl_fl if "rcp85" in fnm]).sel(time = slice(None, "2050"))
    hist = xr.open_mfdataset([fnm for fnm in mdl_fl if "historical" in fnm])
    
    if "rlon" in rcp.dims:
        hist = hist.assign_coords(rlon = rcp.rlon, rlat = rcp.rlat)
        xy_dims = ["rlon", "rlat"]
    elif "x" in rcp.dims:
        hist = hist.assign_coords(x = rcp.x, y = rcp.y)
        xy_dims = ["x", "y"]
    else:
        print(hist.dims)
        continue
    
    # combine historical & rcp into single array, create regionmask & larger rectangular mask
    da = xr.concat([hist, rcp], "time").pr
    rm = regionmask.mask_3D_geopandas(sf, da.lon, da.lat, drop = False).squeeze(drop = True)
    
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                  
    # daily time series over study region (will compute seasonal/annual indices separately)
    ts = convert_units_to(da.where(rm == 1).mean(xy_dims), "mm/day")
    ts.to_netcdf(new_fnm)
                  
    # seasonal cycle over the study region (obs period only)
    sc = ts.sel(time = slice("1980", "2022")).groupby("time.dayofyear").mean()
    sc.to_netcdf("sc_"+re.sub("[0-9]{8}-[0-9]{8}", "19800101-20221231", new_fnm))
    
    # spatial pattern over larger rectangular region (obs period only)
    sp_mask = np.logical_and(np.logical_and(da.lat > -15, da.lat < 18), np.logical_and(da.lon > 28, da.lon < 54))
    sp = da.where(sp_mask == 1).dropna(xy_dims[0], "all").dropna(xy_dims[1], "all").sel(time = slice("1980", "2022"))
    
    # check spatial patterns for short & long rains separately
    sp.sel(time = [m in [10,11,12] for m in sp.time.dt.month]).mean("time").to_netcdf("sp-ond-"+re.sub("[0-9]{8}-[0-9]{8}", "19800101-20221231", new_fnm))
    sp.sel(time = [m in [3,4,5] for m in sp.time.dt.month]).mean("time").to_netcdf("sp-mam-"+re.sub("[0-9]{8}-[0-9]{8}", "19800101-20221231", new_fnm))

pr_AFR-22_CCCma-CanESM2_rcp85_r1i1p1_CCCma-CanRCM4_r2_day_19500101-21001231.nc
pr_AFR-22_MOHC-HadGEM2-ES_rcp85_r1i1p1_CLMcom-KIT-CCLM5-0-15_v1_day_19500101-20991230.nc
pr_AFR-22_MOHC-HadGEM2-ES_rcp85_r1i1p1_GERICS-REMO2015_v1_day_19700101-20991230.nc
pr_AFR-22_MOHC-HadGEM2-ES_rcp85_r1i1p1_ICTP-RegCM4-7_v0_day_19700101-20991230.nc
pr_AFR-22_MPI-M-MPI-ESM-LR_rcp85_r1i1p1_CLMcom-KIT-CCLM5-0-15_v1_day_19500101-21001231.nc
pr_AFR-22_MPI-M-MPI-ESM-LR_rcp85_r1i1p1_GERICS-REMO2015_v1_day_19700101-21001231.nc
pr_AFR-22_MPI-M-MPI-ESM-MR_rcp85_r1i1p1_ICTP-RegCM4-7_v0_day_19700101-20991231.nc
pr_AFR-22_NCC-NorESM1-M_rcp85_r1i1p1_CLMcom-KIT-CCLM5-0-15_v1_day_19500101-21001231.nc
pr_AFR-22_NCC-NorESM1-M_rcp85_r1i1p1_GERICS-REMO2015_v1_day_19700101-21001231.nc
pr_AFR-22_NCC-NorESM1-M_rcp85_r1i1p1_ICTP-RegCM4-7_v0_day_19700101-20991231.nc
pr_AFR-44_CCCma-CanESM2_historical_r1i1p1_UQAM-CRCM5_v1_day_19500101-20051231.nc
pr_AFR-44_CCCma-CanESM2_rcp85_r1i1p1_CCCma-CanRCM4_r2_day_19500101-21001231.nc
pr_AF

---
### **Temperature**
 - gridded PET for the study region
 - seasonal cycle over the study region
 - spatial pattern of annual mean temperatures over the region from 28E-54E and 15S-18N

In [2]:
for fpath in glob.glob("/rds/general/user/cb2714/home/synda/data/CORDEX/AFR-*/*/*/*/day/tas")[:2]:
    
    mdl_fl = sorted(glob.glob(fpath+"/*.nc"))
    new_fnm = re.sub("_[0-9]{8}-", "_"+mdl_fl[0][-20:-12]+"-", mdl_fl[-1].split("/")[-1])
    print(new_fnm)
    
    # skip if file already exists
    if len(glob.glob(new_fnm)) > 0: continue

    # also skip if not both historical & RCP runs available
    if not all([any([s in fnm for fnm in mdl_fl]) for s in ["historical", "rcp85"]]): continue
        
    # skip if file already exists
#     if len(glob.glob(new_fnm)) > 0: continue

    # otherwise - carry on
    rcp = convert_calendar(xr.open_mfdataset([fnm for fnm in mdl_fl if "rcp85" in fnm]).sel(time = slice(None, "2050")), "default", align_on = "year")
    hist = convert_calendar(xr.open_mfdataset([fnm for fnm in mdl_fl if "historical" in fnm]), "default", align_on = "year")

    if "rlon" in rcp.dims:
        hist = hist.assign_coords(rlon = rcp.rlon, rlat = rcp.rlat)
        xy_dims = ["rlon", "rlat"]
    elif "x" in rcp.dims:
        hist = hist.assign_coords(x = rcp.x, y = rcp.y)
        xy_dims = ["x", "y"]
    else:
        print(hist.dims)
        continue
    
    # combine historical & rcp into single array, create regionmask & larger rectangular mask
    da = convert_units_to(xr.concat([hist, rcp], "time").tas, "degC")
    rm = regionmask.mask_3D_geopandas(sf, da.lon, da.lat, drop = False).squeeze(drop = True)
    
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    
    # define region over which to compute PET & spatial pattern
    sp_mask = np.logical_and(np.logical_and(da.lat > -15, da.lat < 18), np.logical_and(da.lon > 28, da.lon < 54))
    da = da.where(sp_mask == 1).dropna(xy_dims[0], "all").dropna(xy_dims[1], "all")
    
#     # PET averaged over the study region - note that the method returns monthly PET, even from daily temperatures
#     pet = convert_units_to(potential_evapotranspiration(tas = da, method = "thornthwaite48"), "mm/month", context = "hydro").rename("pet")
#     pet_ts = pet.where(rm == 1).mean(xy_dims)
#     pet_ts.to_netcdf(re.sub("tas", "pet", new_fnm))

    # spatial pattern over larger rectangular region (obs period only)
    sp = da.sel(time = slice("1980", "2022")).mean("time")
    sp.to_netcdf("sp-"+re.sub("[0-9]{8}-[0-9]{8}", "19800101-20221231", new_fnm))
    
    # seasonal cycle over the study region (obs period only)
    sc = da.where(rm == 1).mean(xy_dims).sel(time = slice("1980", "2022")).groupby("time.dayofyear").mean()
    sc.to_netcdf("sc_"+re.sub("[0-9]{8}-[0-9]{8}", "19800101-20221231", new_fnm))
    

tas_AFR-22_NCC-NorESM1-M_rcp85_r1i1p1_CLMcom-KIT-CCLM5-0-15_v1_day_19500101-21001231.nc
tas_AFR-22_NCC-NorESM1-M_rcp85_r1i1p1_GERICS-REMO2015_v1_day_19700101-21001231.nc
tas_AFR-22_NCC-NorESM1-M_rcp85_r1i1p1_ICTP-RegCM4-7_v0_day_19700101-20991231.nc


---
### **Potential evapotranspiration**
- if available, 2-year evspsblpot averaged over the study region
- retain full daily time series for comparison with values computed from temperatures

In [4]:
for fpath in glob.glob("/rds/general/user/cb2714/home/synda/data/CORDEX/AFR-*/*/*/*/day/evspsblpot"):
    
    mdl_fl = sorted(glob.glob(fpath+"/*.nc"))
    new_fnm = re.sub("_[0-9]{8}-", "_"+mdl_fl[0][-20:-12]+"-", mdl_fl[-1].split("/")[-1])
    print(new_fnm)
    
    # also skip if not both historical & RCP runs available
    if not all([any([s in fnm for fnm in mdl_fl]) for s in ["historical", "rcp85"]]): continue
        
    # skip if file already exists
    if len(glob.glob(new_fnm)) > 0: continue
        
    # otherwise - carry on
    rcp = convert_calendar(xr.open_mfdataset([fnm for fnm in mdl_fl if "rcp85" in fnm]).sel(time = slice(None, "2050")), "default", align_on = "year")
    hist = convert_calendar(xr.open_mfdataset([fnm for fnm in mdl_fl if "historical" in fnm]), "default", align_on = "year")

    if "rlon" in rcp.dims:
        hist = hist.assign_coords(rlon = rcp.rlon, rlat = rcp.rlat)
        xy_dims = ["rlon", "rlat"]
    elif "x" in rcp.dims:
        hist = hist.assign_coords(x = rcp.x, y = rcp.y)
        xy_dims = ["x", "y"]
    else:
        print(hist.dims)
        continue
    
    # combine historical & rcp into single array, create regionmask
    da = convert_units_to(xr.concat([hist, rcp], "time").evspsblpot, "mm/day", context = "hydro")
    rm = regionmask.mask_3D_geopandas(sf, da.lon, da.lat, drop = False).squeeze(drop = True)
    
    # get daily time series over study region
    ts = da.where(rm == 1).mean(xy_dims)
    ts.to_netcdf(new_fnm)

evspsblpot_AFR-44_CCCma-CanESM2_rcp85_r1i1p1_SMHI-RCA4_v1_day_19510101-21001231.nc
evspsblpot_AFR-44_ICHEC-EC-EARTH_rcp85_r1i1p1_SMHI-RCA4_v1_day_19510101-21001231.nc
evspsblpot_AFR-44_ICHEC-EC-EARTH_rcp85_r3i1p1_SMHI-RCA4_v1_day_19510101-21001231.nc
evspsblpot_AFR-44_ICHEC-EC-EARTH_rcp85_r12i1p1_SMHI-RCA4_v1_day_19510101-21001231.nc
evspsblpot_AFR-44_CNRM-CERFACS-CNRM-CM5_rcp85_r1i1p1_SMHI-RCA4_v1_day_19510101-21001231.nc
evspsblpot_AFR-44_IPSL-IPSL-CM5A-MR_rcp85_r1i1p1_SMHI-RCA4_v1_day_19510101-21001231.nc
evspsblpot_AFR-44_MPI-M-MPI-ESM-LR_rcp85_r1i1p1_SMHI-RCA4_v1_day_19510101-21001231.nc
evspsblpot_AFR-44_MPI-M-MPI-ESM-LR_rcp85_r3i1p1_SMHI-RCA4_v1_day_19510101-21001231.nc
evspsblpot_AFR-44_MPI-M-MPI-ESM-LR_rcp85_r2i1p1_SMHI-RCA4_v1_day_19510101-21001231.nc
evspsblpot_AFR-44_CSIRO-QCCCE-CSIRO-Mk3-6-0_rcp85_r1i1p1_SMHI-RCA4_v1_day_19510101-21001231.nc
evspsblpot_AFR-44_MIROC-MIROC5_rcp85_r1i1p1_SMHI-RCA4_v1_day_19510101-21001231.nc
evspsblpot_AFR-44_MOHC-HadGEM2-ES_rcp85_r1i1p1_SMHI