In [2]:
from pyesgf.search import SearchConnection
from pyesgf.logon import LogonManager

import xarray as xr
import glob
import re
import numpy as np

import warnings
warnings.filterwarnings("ignore", message = ".+encoding does not have units specified.+")
import os; os.environ ["ESGF_PYCLIENT_NO_FACETS_STAR_WARNING"] = ""
from os.path import exists

# create connection to ESGF servers

lm = LogonManager()
lm.logoff()
lm.logon(hostname="esg-dn1.nsc.liu.se", bootstrap=True, interactive=True)
lm.is_logged_on()

conn = SearchConnection('https://esgf-data.dkrz.de/esg-search')

KeyboardInterrupt: 

---
### **CORDEX data**

_About 1.5Tb of EUR-11 precip data if whole files are downloaded using Synda. Instead, use WPS to extract only the required area & save files_



In [2]:
gcms = ['NCC-NorESM1-M', 'MPI-M-MPI-ESM-LR', 'MOHC-HadGEM2-ES', 'IPSL-IPSL-CM5A-MR', 'ICHEC-EC-EARTH', 'CNRM-CERFACS-CNRM-CM5']
rcms = ['WRF381P', 'WRF361H', 'RegCM4-6', 'REMO2015', 'REMO2009', 'RCA4', 'RACMO22E', 'HadREM3-GA7-05', 'HIRHAM5', 'COSMO-crCLIM-v1-1', 'CCLM4-8-17', 'ALARO-0', 'ALADIN63', 'ALADIN53']

problem_nodes = ["esgf1.dkrz.de", "esg-dn1.nsc.liu.se", "cordexesg.dmi.dk", "esgf.ceda.ac.uk"]
problem_nodes = []

# lat-lon bounds for selection of data
xn = 5; xx = 20; yn = 35; yx = 50

In [3]:
conn = SearchConnection('https://esgf-data.dkrz.de/esg-search', distrib=True)

for gcm in gcms:
    for rcm in rcms:
        
        skip_mdl = False
        
        # all manner of things going wrong with this particular run (Aladin hopefully a temporary glitch)
        if gcm+" "+rcm in ["MPI-M-MPI-ESM-LR WRF381P", "MPI-M-MPI-ESM-LR ALADIN63"]: continue
        
        # find available model data (skip if none)
        ctx = conn.new_context(facets = "ensemble, data_node", project = 'CORDEX', domain = 'EUR-11', time_frequency = 'day', experiment = 'historical, rcp85', variable = 'pr',
                               driving_model = gcm, rcm_name = rcm) 
        if ctx.hit_count == 0: 
#             print(" - no runs found")
            continue
        print(gcm+" "+rcm, end = "")
        
        # check data node - these can be temporarily unavailable, would need to check again
        data_node = list(ctx.facet_counts["data_node"].keys())[0]
        if data_node in problem_nodes:
            print(" - unreliable node ("+data_node+")")
            continue
        
        # if more than one realisation, pick the first
        if len(ctx.facet_counts["ensemble"]) > 1:
            ens = sorted(list(ctx.facet_counts["ensemble"].keys()))[0]
            ctx = conn.new_context(facets = "ensemble, data_node", project = 'CORDEX', domain = 'EUR-11', time_frequency = 'day', experiment = 'rcp85', variable = 'pr',
                                   driving_model = gcm, rcm_name = rcm, ensemble = ens) 
        
        # get list of urls for this model pair & flatten into a single list (also drop any starting after 2050, won't be used)
        all_urls = [[file.opendap_url for file in ctx_search.file_context().search(facets = None)] for ctx_search in ctx.search()]
        all_urls = [item for sublist in all_urls for item in sublist if not int(item[-20:-16]) > 2050]
        
        if len(glob.glob("*"+gcm+"*"+rcm+"*.nc")) >= len(all_urls):
            print(" complete")
            continue
        
        for url in all_urls:
            
            # flag to avoid trying every URL when one fails
            if skip_mdl: continue
            
            fnm = url.split("/")[-1]
            if os.path.exists(fnm): continue
            
            # load data
            try:
                ds = xr.open_dataset(url)
            except:
                print(" - no data found: "+list(ctx.facet_counts["data_node"].keys())[0])
                skip_mdl = True
                continue
            
            if "x" in ds.dims:
                x_dim, y_dim = ["x", "y"]
            elif "rlon" in ds.dims:
                x_dim, y_dim = ["rlon", "rlat"]
            
            # create mask of area required, trim data to enclose area
            sp_mask = np.logical_and(np.logical_and(ds.lat > yn, ds.lat < yx), np.logical_and(ds.lon > xn, ds.lon < xx))
            ds = ds.where(sp_mask == 1).dropna(x_dim, "all").dropna(y_dim, "all")
            
            # save & continue
            ds.to_netcdf(fnm)
            print(".", end = "")
        
        print("")

NCC-NorESM1-M WRF381P complete
NCC-NorESM1-M RegCM4-6 complete
NCC-NorESM1-M REMO2015

syntax error, unexpected $end, expecting SCAN_ATTR or SCAN_DATASET or SCAN_ERROR
context: ^


 - no data found: esgf1.dkrz.de

NCC-NorESM1-M RCA4 - no data found: esg-dn1.nsc.liu.se

NCC-NorESM1-M RACMO22E

syntax error, unexpected $end, expecting SCAN_ATTR or SCAN_DATASET or SCAN_ERROR
context: ^


 - no data found: esgf1.dkrz.de

NCC-NorESM1-M HadREM3-GA7-05

syntax error, unexpected $end, expecting SCAN_ATTR or SCAN_DATASET or SCAN_ERROR
context: ^


 - no data found: esgf.ceda.ac.uk

NCC-NorESM1-M HIRHAM5

syntax error, unexpected $end, expecting SCAN_ATTR or SCAN_DATASET or SCAN_ERROR
context: ^


 - no data found: cordexesg.dmi.dk

NCC-NorESM1-M COSMO-crCLIM-v1-1

syntax error, unexpected $end, expecting SCAN_ATTR or SCAN_DATASET or SCAN_ERROR
context: ^


 - no data found: esgf1.dkrz.de



syntax error, unexpected $end, expecting SCAN_ATTR or SCAN_DATASET or SCAN_ERROR
context: ^


NCC-NorESM1-M ALADIN63 complete
MPI-M-MPI-ESM-LR WRF361H - no data found: esgf1.dkrz.de

MPI-M-MPI-ESM-LR RegCM4-6

syntax error, unexpected $end, expecting SCAN_ATTR or SCAN_DATASET or SCAN_ERROR
context: ^


 complete
MPI-M-MPI-ESM-LR REMO2015 - no data found: esgf1.dkrz.de

MPI-M-MPI-ESM-LR REMO2009

syntax error, unexpected $end, expecting SCAN_ATTR or SCAN_DATASET or SCAN_ERROR
context: ^


 - no data found: esgf1.dkrz.de

MPI-M-MPI-ESM-LR RCA4

syntax error, unexpected $end, expecting SCAN_ATTR or SCAN_DATASET or SCAN_ERROR
context: ^


 - no data found: esg-dn1.nsc.liu.se

MPI-M-MPI-ESM-LR RACMO22E

syntax error, unexpected $end, expecting SCAN_ATTR or SCAN_DATASET or SCAN_ERROR
context: ^


 - no data found: esgf1.dkrz.de

MPI-M-MPI-ESM-LR HadREM3-GA7-05

syntax error, unexpected $end, expecting SCAN_ATTR or SCAN_DATASET or SCAN_ERROR
context: ^


 - no data found: esgf.ceda.ac.uk

MPI-M-MPI-ESM-LR HIRHAM5

syntax error, unexpected $end, expecting SCAN_ATTR or SCAN_DATASET or SCAN_ERROR
context: ^


 - no data found: cordexesg.dmi.dk

MPI-M-MPI-ESM-LR COSMO-crCLIM-v1-1

syntax error, unexpected $end, expecting SCAN_ATTR or SCAN_DATASET or SCAN_ERROR
context: ^


 - no data found: esgf1.dkrz.de

MPI-M-MPI-ESM-LR CCLM4-8-17

syntax error, unexpected $end, expecting SCAN_ATTR or SCAN_DATASET or SCAN_ERROR
context: ^


 - no data found: esgf1.dkrz.de



syntax error, unexpected $end, expecting SCAN_ATTR or SCAN_DATASET or SCAN_ERROR
context: ^


MOHC-HadGEM2-ES WRF381P

RuntimeError: NetCDF: Authorization failure

In [8]:
import glob
import re

In [56]:
for fpath in glob.glob("../synda/data/CORDEX/EUR-11/*/*/*/day/pr/"):
    
    synda_fl = glob.glob(fpath+"*.nc")
    if len(synda_fl) == 0: continue
        
    synda_fl = [fnm for fnm in synda_fl if int(fnm[-20:-16]) <= 2050]
    fnm_patn = re.sub("rcp85", "*", re.sub("historical", "*", "_".join(synda_fl[0].split("/")[-1].split("_")[:-1])+"*.nc"))
    
    if len(glob.glob(fnm_patn)) == len(synda_fl):
        print(fpath, len(glob.glob(fnm_patn)), len(synda_fl))

IndexError: list index out of range

In [59]:
varnm="pr"
# get list of available files, check against files already processed
synda_fl = glob.glob("../synda/data/CORDEX/EUR-11/*/*/*/day/"+varnm+"/*.nc")
synda_fl = [fnm for fnm in synda_fl if int(fnm[-20:-16]) <= 2050]
fl = glob.glob("*.nc")

len([fnm for fnm in synda_fl if not fnm.split("/")[-1] in fl])

1

In [60]:
synda_fl

['../synda/data/CORDEX/EUR-11/ICHEC-EC-EARTH/r1i1p1/HIRHAM5/day/pr/pr_EUR-11_ICHEC-EC-EARTH_rcp85_r1i1p1_DMI-HIRHAM5_v1_day_20260101-20301231.nc']

In [None]:
[fnm for fnm in glob.glob("../synda/data/CORDEX/EUR-11/*/*/*/day/"+varnm+"/*.nc")] 

In [30]:
(set([re.sub("historical", "rcp85", "_".join(fnm.split("_")[:-1])) for fnm in glob.glob("*.nc")]))

{'pr_EUR-11_CNRM-CERFACS-CNRM-CM5_rcp85_r1i1p1_CLMcom-CCLM4-8-17_v1_day',
 'pr_EUR-11_CNRM-CERFACS-CNRM-CM5_rcp85_r1i1p1_CLMcom-ETH-COSMO-crCLIM-v1-1_v1_day',
 'pr_EUR-11_CNRM-CERFACS-CNRM-CM5_rcp85_r1i1p1_CNRM-ALADIN53_v1_day',
 'pr_EUR-11_CNRM-CERFACS-CNRM-CM5_rcp85_r1i1p1_CNRM-ALADIN63_v2_day',
 'pr_EUR-11_CNRM-CERFACS-CNRM-CM5_rcp85_r1i1p1_GERICS-REMO2015_v2_day',
 'pr_EUR-11_CNRM-CERFACS-CNRM-CM5_rcp85_r1i1p1_ICTP-RegCM4-6_v2_day',
 'pr_EUR-11_CNRM-CERFACS-CNRM-CM5_rcp85_r1i1p1_IPSL-WRF381P_v2_day',
 'pr_EUR-11_CNRM-CERFACS-CNRM-CM5_rcp85_r1i1p1_KNMI-RACMO22E_v2_day',
 'pr_EUR-11_CNRM-CERFACS-CNRM-CM5_rcp85_r1i1p1_MOHC-HadREM3-GA7-05_v2_day',
 'pr_EUR-11_CNRM-CERFACS-CNRM-CM5_rcp85_r1i1p1_RMIB-UGent-ALARO-0_v1_day',
 'pr_EUR-11_CNRM-CERFACS-CNRM-CM5_rcp85_r1i1p1_SMHI-RCA4_v1_day',
 'pr_EUR-11_ICHEC-EC-EARTH_rcp85_r12i1p1_CLMcom-CCLM4-8-17_v1_day',
 'pr_EUR-11_ICHEC-EC-EARTH_rcp85_r12i1p1_CLMcom-ETH-COSMO-crCLIM-v1-1_v1_day',
 'pr_EUR-11_ICHEC-EC-EARTH_rcp85_r12i1p1_ICTP-RegCM4-6_v