In [10]:
import numpy as np
import xarray as xr
from glob import glob

### Preliminaries

In [11]:
###################
# Models (CMIP5)
###################

# nex models with all RCPs and variables (tas, pr)
nex_models = glob('/gpfs/group/kaf26/default/public/NEX-GDDP/raw/*2006*')

nex_models = [x.replace('/gpfs/group/kaf26/default/public/NEX-GDDP/raw/pr_day_BCSD_rcp45_r1i1p1_','') for x in nex_models]
nex_models = [x.replace('/gpfs/group/kaf26/default/public/NEX-GDDP/raw/pr_day_BCSD_rcp85_r1i1p1_','') for x in nex_models]
nex_models = [x.replace('/gpfs/group/kaf26/default/public/NEX-GDDP/raw/tasmin_day_BCSD_rcp45_r1i1p1_','') for x in nex_models]
nex_models = [x.replace('/gpfs/group/kaf26/default/public/NEX-GDDP/raw/tasmin_day_BCSD_rcp85_r1i1p1_','') for x in nex_models]
nex_models = [x.replace('/gpfs/group/kaf26/default/public/NEX-GDDP/raw/tasmax_day_BCSD_rcp45_r1i1p1_','') for x in nex_models]
nex_models = [x.replace('/gpfs/group/kaf26/default/public/NEX-GDDP/raw/tasmax_day_BCSD_rcp85_r1i1p1_','') for x in nex_models]
nex_models = np.asarray([x[:-8] for x in nex_models])

complete_nex_models = np.unique([x for x in nex_models if (nex_models == x).sum() == 6]) # 2 RCPs x 3 variables

# loca models with all RCPs and variables
complete_loca_models = [x[42:] for x in glob('/gpfs/group/kaf26/default/public/LOCA/raw/*')]

loca_members = {}
for model in complete_loca_models:
    member = glob('/gpfs/group/kaf26/default/public/LOCA/raw/' + model + '/16th/rcp45/*')[0][-6:]
    loca_members.update({model : member})

# maca models with all RCPs and variables
maca_models_pr45 = glob('/gpfs/group/kzk10/default/public/MACAv2-METDATA/raw/macav2metdata_pr_*_rcp45_2006_2010_CONUS_daily.nc')
maca_models_pr45 = [x.replace('/gpfs/group/kzk10/default/public/MACAv2-METDATA/raw/macav2metdata_pr_','')[:-31] for x in maca_models_pr45]

maca_models_pr85 = glob('/gpfs/group/kzk10/default/public/MACAv2-METDATA/raw/macav2metdata_pr_*_rcp85_2006_2010_CONUS_daily.nc')
maca_models_pr85 = [x.replace('/gpfs/group/kzk10/default/public/MACAv2-METDATA/raw/macav2metdata_pr_','')[:-31] for x in maca_models_pr85]

maca_models_tmax45 = glob('/gpfs/group/kzk10/default/public/MACAv2-METDATA/raw/macav2metdata_tasmax_*_rcp45_2006_2010_CONUS_daily.nc')
maca_models_tmax45 = [x.replace('/gpfs/group/kzk10/default/public/MACAv2-METDATA/raw/macav2metdata_tasmax_','')[:-31] for x in maca_models_tmax45]

maca_models_tmax85 = glob('/gpfs/group/kzk10/default/public/MACAv2-METDATA/raw/macav2metdata_tasmax_*_rcp85_2006_2010_CONUS_daily.nc')
maca_models_tmax85 = [x.replace('/gpfs/group/kzk10/default/public/MACAv2-METDATA/raw/macav2metdata_tasmax_','')[:-31] for x in maca_models_tmax85]

maca_models_tmin45 = glob('/gpfs/group/kzk10/default/public/MACAv2-METDATA/raw/macav2metdata_tasmin_*_rcp45_2006_2010_CONUS_daily.nc')
maca_models_tmin45 = [x.replace('/gpfs/group/kzk10/default/public/MACAv2-METDATA/raw/macav2metdata_tasmin_','')[:-31] for x in maca_models_tmin45]

maca_models_tmin85 = glob('/gpfs/group/kzk10/default/public/MACAv2-METDATA/raw/macav2metdata_tasmin_*_rcp85_2006_2010_CONUS_daily.nc')
maca_models_tmin85 = [x.replace('/gpfs/group/kzk10/default/public/MACAv2-METDATA/raw/macav2metdata_tasmin_','')[:-31] for x in maca_models_tmin85]

complete_maca_models = np.unique(maca_models_pr45 + maca_models_pr85 + maca_models_tmax45 + maca_models_tmax85 + maca_models_tmin45 + maca_models_tmax85)

maca_members = {}
for model in complete_maca_models:
    member = model[-6:]
    maca_members.update({model[:-7] : member})
    
complete_maca_models = [x[:-7] for x in complete_maca_models]

# intersection of models
models = np.intersect1d(np.intersect1d(complete_nex_models, complete_loca_models), complete_maca_models)

# check if any final loca/maca models not r1i1p1 (all nex are r1i1p1)
bad_models = []
for model in models:
    if loca_members[model] != 'r1i1p1' or maca_members[model] != 'r1i1p1':
        models = np.setdiff1d(models, [model])

In [14]:
#################
# Data access
#################

# function to grab variables and SSPs for singe model
def grab_model(model_id, include_temp, include_prcp):
    # set up list of files to load
    rcp45_paths = []
    rcp85_paths = []
    
    if include_temp:
        rcp45_paths_tmp = glob('/gpfs/group/kaf26/default/public/LOCA/raw/' + model + '/16th/rcp45/' + loca_members[model] + '/tasmin/*')
        rcp45_paths += rcp45_paths_tmp
        rcp45_paths_tmp = glob('/gpfs/group/kaf26/default/public/LOCA/raw/' + model + '/16th/rcp45/' + loca_members[model] + '/tasmax/*')
        rcp45_paths += rcp45_paths_tmp
        
        rcp85_paths_tmp = glob('/gpfs/group/kaf26/default/public/LOCA/raw/' + model + '/16th/rcp85/' + loca_members[model] + '/tasmin/*')
        rcp85_paths += rcp85_paths_tmp
        rcp85_paths_tmp = glob('/gpfs/group/kaf26/default/public/LOCA/raw/' + model + '/16th/rcp85/' + loca_members[model] + '/tasmax/*')
        rcp85_paths += rcp85_paths_tmp
    
    if include_prcp:
        rcp45_paths_tmp = glob('/gpfs/group/kaf26/default/public/LOCA/raw/' + model + '/16th/rcp45/' + loca_members[model] + '/pr/*')
        rcp45_paths += rcp45_paths_tmp
        
        rcp85_paths_tmp = glob('/gpfs/group/kaf26/default/public/LOCA/raw/' + model + '/16th/rcp85/' + loca_members[model] + '/pr/*')
        rcp85_paths += rcp85_paths_tmp
    
    # load
    ds_rcp45 = xr.open_mfdataset(rcp45_paths, parallel=True, chunks='auto')
    ds_rcp45 = ds_rcp45.assign_coords(rcp = 'rcp45')

    ds_rcp85 = xr.open_mfdataset(rcp85_paths, parallel=True, chunks='auto')
    ds_rcp85 = ds_rcp85.assign_coords(rcp = 'rcp85')

    # concat along ssp dimension
    ds_out = xr.concat([ds_rcp45, ds_rcp85], dim='rcp')
    
    return ds_out

In [20]:
ds = grab_model(models[0], False, True)

['pr_day_inmcm4_rcp45_r1i1p1_21000101-21001231.LOCA_2016-04-02.16th.nc', 'pr_day_inmcm4_rcp45_r1i1p1_20860101-20861231.LOCA_2016-04-02.16th.nc']


FileNotFoundError: [Errno 2] No such file or directory: b'/storage/home/d/dcl5300/pr_day_inmcm4_rcp45_r1i1p1_20290101-20291231.LOCA_2016-04-02.16th.nc'

In [None]:
ds

In [4]:
############
# Dask
############
from dask_jobqueue import PBSCluster
cluster = PBSCluster(resource_spec = 'pmem=15gb', env_extra= ['#PBS -l feature=rhel7'], walltime = '00:59:00')

cluster.scale(jobs=20)  # ask for jobs

from dask.distributed import Client
client = Client(cluster)

client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.PBSCluster
Dashboard: /proxy/8787/status,

0,1
Dashboard: /proxy/8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://10.102.201.240:46204,Workers: 0
Dashboard: /proxy/8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


## Annual averages

In [None]:
# loop through models: RUNTIME IS AROUND 10 MINS PER MODEL WITH 30 DASK WORKERS
for model in models:
    # load data (lazy)
    ds = grab_model(model, include_temp=True, include_prcp=True)
    
    # storage options    
    store_prefix = '/gpfs/group/kaf26/default/dcl5300/lafferty-sriver_inprep_tbh_DATA/cmip5/loca/annual_avgs/' + model
    
    # compute and store
    ds['tasavg'] = (ds['tasmax'] + ds['tasmin']) / 2.
    ds['pr'] = ds['pr'] * 86400 # kg m-2 s-1 -> mm day-1
    
    ds_final = ds.resample(time='1Y').mean()
    
    ds_final.to_netcdf(store_prefix + '.nc')
    print(model)

CNRM-CM5


## Annual maxima

In [None]:
# loop through models: RUNTIME IS AROUND 10 MINS PER MODEL WITH 30 DASK WORKERS
for model in models:
    # load data (lazy)
    ds = grab_model(model, include_temp=True, include_prcp=True)
    
    # storage options    
    store_prefix = '/gpfs/group/kaf26/default/dcl5300/lafferty-sriver_inprep_tbh_DATA/cmip5/nex-gddp/annual_maxs/' + model
    
    # compute and store
    ds['pr'] = ds['pr'] * 86400 # kg m-2 s-1 -> mm day-1
    
    ds = ds.chunk({'rcp':1, 'time':365, 'lat':-1, 'lon':-1})
    ds_final = ds.resample(time='1Y').max()
    
    ds_final.to_netcdf(store_prefix + '.nc')
    print(model)

## Annual minima (temperature only)

In [None]:
# loop through models: RUNTIME IS AROUND 10 MINS PER MODEL WITH 30 DASK WORKERS
for model in models:
    # load data (lazy)
    ds = grab_model(model, include_temp=True, include_prcp=False)
    
    # storage options    
    store_prefix = '/gpfs/group/kaf26/default/dcl5300/lafferty-sriver_inprep_tbh_DATA/cmip5/nex-gddp/annual_mins/' + model
    
    # compute and store
    ds = ds.chunk({'rcp':1, 'time':365, 'lat':-1, 'lon':-1})
    ds_final = ds.resample(time='1Y').min()
    
    ds_final.to_netcdf(store_prefix + '.nc')
    print(model)

## Precipitation indices

In [None]:
# loop through models: RUNTIME IS AROUND 10 MINS PER MODEL WITH 30 DASK WORKERS
for model in models:
    # load data (lazy)
    ds = grab_model(model, include_temp=False, include_prcp=True)
    
    # storage options    
    store_prefix = '/gpfs/group/kaf26/default/dcl5300/lafferty-sriver_inprep_tbh_DATA/cmip5/nex-gddp/precip_inds/' + model
    
    # compute and store
    ds = ds.chunk({'rcp':1, 'time':365, 'lat':-1, 'lon':-1})
    ds['pr'] = ds['pr'] * 86400 # kg m-2 s-1 -> mm day-1
    
    prcp_sdii = ds.where(ds.pr >= 1.).resample(time='1Y').mean()
    prcp_r20mm = ds.where(ds.pr >= 20.).resample(time='1Y').count()
    
    ds_final = xr.combine_by_coords([prcp_sdii.rename({'pr': 'SDII'}),
                                     prcp_r20mm.rename({'pr': 'R20mm'})])
    
    ds_final.to_netcdf(store_prefix + '.nc')
    print(model)