In [1]:
import numpy as np
import xarray as xr
import dask
import os
from glob import glob

### Preliminaries

In [2]:
###############################
# Set paths
# UPDATE THIS FOR REPRODUCTION
###############################
# Input directories holding the raw downscaled ensembles. Every path ends with
# a trailing '/' because later cells build file names by plain string
# concatenation (e.g. nex_in + 'pr_day_BCSD_...').
nex_in = '/gpfs/group/kaf26/default/public/NEX-GDDP/raw/'
loca_in = '/gpfs/group/kaf26/default/public/LOCA/raw/'
maca_in = '/gpfs/group/kzk10/default/public/MACAv2-METDATA/raw/'

# Output root for the NEX-GDDP results; the processing loops below write into
# the sub-directories annual_avgs/, annual_maxs/, annual_mins/ and prcp_inds/.
nex_out = '/gpfs/group/kaf26/default/dcl5300/lafferty-sriver_inprep_tbh_DATA/cmip5/nex-gddp/'

In [3]:
###################
# Models (CMIP5)
###################

# --- NEX-GDDP ---------------------------------------------------------------
# nex models with all RCPs and variables (tas, pr)
# File names have the form <var>_day_BCSD_<rcp>_r1i1p1_<model>_2006.nc, so
# removing the known prefix and the trailing '_2006.nc' (8 characters) leaves
# the model name. Files with any other prefix keep their full path and can
# therefore never reach the required count of 6 below.
nex_models = glob(nex_in + '*2006*')

for nex_var in ('pr', 'tasmin', 'tasmax'):
    for nex_rcp in ('rcp45', 'rcp85'):
        nex_prefix = nex_in + nex_var + '_day_BCSD_' + nex_rcp + '_r1i1p1_'
        nex_models = [x.replace(nex_prefix, '') for x in nex_models]
nex_models = np.asarray([x[:-8] for x in nex_models])

# a model is complete when all 6 files exist for 2006: 2 RCPs x 3 variables
nex_names, nex_counts = np.unique(nex_models, return_counts=True)
complete_nex_models = nex_names[nex_counts == 6]

# --- LOCA -------------------------------------------------------------------
# One entry per item directly under loca_in; the name is whatever follows the
# input path. len(loca_in) is used instead of a hard-coded offset so that
# editing the path in the configuration cell keeps this working.
# NOTE(review): completeness (all RCPs/variables) is not checked here, it is
# presumably guaranteed by the directory layout - confirm.
complete_loca_models = [x[len(loca_in):] for x in glob(loca_in + '*')]

# ensemble member of each LOCA model = last 6 characters of the first entry
# found under <model>/16th/rcp45/ (raises IndexError if that folder is empty)
loca_members = {}
for model in complete_loca_models:
    member = glob(loca_in + model + '/16th/rcp45/*')[0][-6:]
    loca_members[model] = member

# --- MACA -------------------------------------------------------------------
# maca models with all RCPs and variables
def _maca_model_ids(variable, rcp):
    """Return '<model>_<member>' ids that have a 2006-2010 MACA file for variable/rcp."""
    prefix = maca_in + 'macav2metdata_' + variable + '_'
    suffix = '_' + rcp + '_2006_2010_CONUS_daily.nc'
    return [x[len(prefix):-len(suffix)] for x in glob(prefix + '*' + suffix)]

maca_models_pr45 = _maca_model_ids('pr', 'rcp45')
maca_models_pr85 = _maca_model_ids('pr', 'rcp85')

maca_models_tmax45 = _maca_model_ids('tasmax', 'rcp45')
maca_models_tmax85 = _maca_model_ids('tasmax', 'rcp85')

maca_models_tmin45 = _maca_model_ids('tasmin', 'rcp45')
maca_models_tmin85 = _maca_model_ids('tasmin', 'rcp85')

# Keep only ids present in ALL six variable/RCP combinations. The previous
# version took the union of the lists and listed tasmax/rcp85 twice, so
# tasmin/rcp85 was never consulted and incomplete models could slip through.
complete_maca_models = np.unique(maca_models_pr45)
for maca_subset in (maca_models_pr85,
                    maca_models_tmax45, maca_models_tmax85,
                    maca_models_tmin45, maca_models_tmin85):
    complete_maca_models = np.intersect1d(complete_maca_models, maca_subset)

# ids look like '<model>_<member>': the member is the last 6 characters and
# the model name is everything before the separating underscore
maca_members = {}
for model in complete_maca_models:
    member = model[-6:]
    maca_members[model[:-7]] = member

complete_maca_models = [x[:-7] for x in complete_maca_models]

# --- Common set -------------------------------------------------------------
# intersection of models
models = np.intersect1d(np.intersect1d(complete_nex_models, complete_loca_models), complete_maca_models)

# check if any final loca/maca models not r1i1p1 (all nex are r1i1p1)
# A boolean mask is used so that `models` is not rebound while it is iterated.
is_r1i1p1 = np.array([loca_members[m] == 'r1i1p1' and maca_members[m] == 'r1i1p1'
                      for m in models], dtype=bool)
bad_models = list(models[~is_r1i1p1])  # dropped models, kept for inspection
models = models[is_r1i1p1]

In [3]:
# def subset_us(ds):
#     return ds.sel(
#         lon=slice(230, 310),
#         lat=slice(25, 55)
#     )

In [4]:
############
# Dask
############
from dask.distributed import Client
from dask_jobqueue import PBSCluster

# Describe a single PBS job: one core, the resource/feature directives below
# and a two hour wall-time limit.
cluster = PBSCluster(
    cores=1,
    resource_spec='pmem=15GB',
    env_extra=['#PBS -l feature=rhel7'],
    walltime='02:00:00',
)

# ask the scheduler for 30 such jobs
cluster.scale(jobs=30)

# attach a client to the cluster and display its summary as the cell output
client = Client(cluster)

client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.PBSCluster
Dashboard: /proxy/8787/status,

0,1
Dashboard: /proxy/8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://10.102.201.237:40968,Workers: 0
Dashboard: /proxy/8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


## Annual averages

In [6]:
# calculate annual means for single model-year over all RCPs and variables
def model_year_means(model_id, year, path):
    """Annual means of tasmin, tasmax and pr for one model and one year.

    Reads the six daily files
    ``<path><var>_day_BCSD_<rcp>_r1i1p1_<model_id>_<year>.nc`` for
    var in (tasmin, tasmax, pr) and rcp in (rcp45, rcp85).

    Parameters
    ----------
    model_id : str
        Model name as it appears in the file names.
    year : int
        Year to process.
    path : str
        Input directory, including the trailing '/'.

    Returns
    -------
    xarray.Dataset
        Variables ``tasmin``/``tasmax`` (converted K -> C) and ``pr``
        (converted kg m-2 s-1 -> mm day-1) with a leading ``rcp`` dimension
        holding 'rcp45' and 'rcp85'.
    """
    def annual_mean(var_id, rcp):
        # Open one file, reduce it, and close it again right away so that file
        # handles do not pile up on long-lived dask workers.
        fname = path + var_id + '_day_BCSD_' + rcp + '_r1i1p1_' + model_id + '_' + str(year) + '.nc'
        with xr.open_dataset(fname) as ds:
            # .load() forces the values into memory before the file is closed
            return ds.resample(time='1Y').mean().load()

    # calculate means and merge the three variables for each RCP
    per_rcp = []
    for rcp in ('rcp45', 'rcp85'):
        ds_rcp = xr.merge([annual_mean(var_id, rcp) for var_id in ('tasmin', 'tasmax', 'pr')])
        # label with the scenario so the two can be stacked along 'rcp'
        per_rcp.append(ds_rcp.assign_coords(rcp=rcp))

    ds_out = xr.concat(per_rcp, dim='rcp')

    # unit conversions
    ds_out['tasmax'] = ds_out['tasmax'] - 273.15 # K -> C
    ds_out['tasmin'] = ds_out['tasmin'] - 273.15 # K -> C
    ds_out['pr'] = ds_out['pr'] * 86400 # kg m-2 s-1 -> mm day-1

    return ds_out

In [11]:
# loop through models: RUNTIME IS ~5 MINS PER MODEL WITH 30 DASK WORKERS
# Make sure the target folder exists up front; otherwise the write at the end
# of the first model would fail only after all of its years were computed.
os.makedirs(nex_out + 'annual_avgs/', exist_ok=True)

for model in models:
    # check if already exists
    if os.path.isfile(nex_out + 'annual_avgs/' + model + '.nc'):
        print(model + ' already done')
        continue

    # Parallelize with dask over years.
    # The input directory is nex_in (the previous code passed an undefined
    # name `path`, which raises NameError on a fresh kernel).
    delayed_res = [dask.delayed(model_year_means)(model, year, nex_in)
                   for year in range(2006, 2100)]

    # some models are missing 2100
    if os.path.isfile(nex_in + 'tasmax_day_BCSD_rcp45_r1i1p1_' + model + '_2100.nc'):
        delayed_res.append(dask.delayed(model_year_means)(model, 2100, nex_in))

    # Run
    res = dask.compute(*delayed_res)

    # Store: stitch the single-year datasets together along their coordinates
    df_final = xr.combine_by_coords(res)
    df_final.to_netcdf(nex_out + 'annual_avgs/' + model + '.nc')

    print(model)

CNRM-CM5 already done
CSIRO-Mk3-6-0 already done
CanESM2 already done
GFDL-ESM2G already done
GFDL-ESM2M already done
IPSL-CM5A-LR already done
IPSL-CM5A-MR already done
MIROC-ESM already done
MIROC-ESM-CHEM already done
MIROC5 already done
MRI-CGCM3 already done
NorESM1-M already done
bcc-csm1-1 already done
inmcm4 already done


## Annual maxima

In [12]:
# calculate annual maxima for single model-year over all RCPs and variables
def model_year_maxima(model_id, year, path):
    """Annual maxima of tasmin, tasmax and pr for one model and one year.

    Reads the six daily files
    ``<path><var>_day_BCSD_<rcp>_r1i1p1_<model_id>_<year>.nc`` for
    var in (tasmin, tasmax, pr) and rcp in (rcp45, rcp85).

    Parameters
    ----------
    model_id : str
        Model name as it appears in the file names.
    year : int
        Year to process.
    path : str
        Input directory, including the trailing '/'.

    Returns
    -------
    xarray.Dataset
        Variables ``tasmin``/``tasmax`` (converted K -> C) and ``pr``
        (converted kg m-2 s-1 -> mm day-1) with a leading ``rcp`` dimension
        holding 'rcp45' and 'rcp85'.
    """
    def annual_max(var_id, rcp):
        # Open one file, reduce it, and close it again right away so that file
        # handles do not pile up on long-lived dask workers.
        fname = path + var_id + '_day_BCSD_' + rcp + '_r1i1p1_' + model_id + '_' + str(year) + '.nc'
        with xr.open_dataset(fname) as ds:
            # .load() forces the values into memory before the file is closed
            return ds.resample(time='1Y').max().load()

    # calculate maxima and merge the three variables for each RCP
    per_rcp = []
    for rcp in ('rcp45', 'rcp85'):
        ds_rcp = xr.merge([annual_max(var_id, rcp) for var_id in ('tasmin', 'tasmax', 'pr')])
        # label with the scenario so the two can be stacked along 'rcp'
        per_rcp.append(ds_rcp.assign_coords(rcp=rcp))

    ds_out = xr.concat(per_rcp, dim='rcp')

    # unit conversions
    ds_out['tasmax'] = ds_out['tasmax'] - 273.15 # K -> C
    ds_out['tasmin'] = ds_out['tasmin'] - 273.15 # K -> C
    ds_out['pr'] = ds_out['pr'] * 86400 # kg m-2 s-1 -> mm day-1

    return ds_out

In [14]:
# loop through models: RUNTIME IS ~5 MINS PER MODEL WITH 30 DASK WORKERS
# Make sure the target folder exists up front; otherwise the write at the end
# of the first model would fail only after all of its years were computed.
os.makedirs(nex_out + 'annual_maxs/', exist_ok=True)

for model in models:
    # check if already exists
    if os.path.isfile(nex_out + 'annual_maxs/' + model + '.nc'):
        print(model + ' already done')
        continue

    # Parallelize with dask over years.
    # The input directory is nex_in (the previous code passed an undefined
    # name `path`, which raises NameError on a fresh kernel).
    delayed_res = [dask.delayed(model_year_maxima)(model, year, nex_in)
                   for year in range(2006, 2100)]

    # some models are missing 2100
    if os.path.isfile(nex_in + 'tasmax_day_BCSD_rcp45_r1i1p1_' + model + '_2100.nc'):
        delayed_res.append(dask.delayed(model_year_maxima)(model, 2100, nex_in))

    # Run
    res = dask.compute(*delayed_res)

    # Store: stitch the single-year datasets together along their coordinates
    df_final = xr.combine_by_coords(res)
    df_final.to_netcdf(nex_out + 'annual_maxs/' + model + '.nc')

    print(model)

CNRM-CM5 already done
CSIRO-Mk3-6-0 already done
CanESM2 already done
GFDL-ESM2G already done
GFDL-ESM2M already done
IPSL-CM5A-LR already done
IPSL-CM5A-MR already done
MIROC-ESM already done
MIROC-ESM-CHEM already done
MIROC5 already done
MRI-CGCM3 already done
NorESM1-M already done
bcc-csm1-1 already done
inmcm4 already done


## Annual minima (temperature only)

In [15]:
# calculate annual minima for single model-year over all RCPs and temperature variables
def model_year_minima(model_id, year, path):
    """Annual minima of tasmin and tasmax for one model and one year.

    Reads the four daily files
    ``<path><var>_day_BCSD_<rcp>_r1i1p1_<model_id>_<year>.nc`` for
    var in (tasmin, tasmax) and rcp in (rcp45, rcp85). Precipitation is
    deliberately not processed here.

    Parameters
    ----------
    model_id : str
        Model name as it appears in the file names.
    year : int
        Year to process.
    path : str
        Input directory, including the trailing '/'.

    Returns
    -------
    xarray.Dataset
        Variables ``tasmin`` and ``tasmax`` (converted K -> C) with a leading
        ``rcp`` dimension holding 'rcp45' and 'rcp85'.
    """
    def annual_min(var_id, rcp):
        # Open one file, reduce it, and close it again right away so that file
        # handles do not pile up on long-lived dask workers.
        fname = path + var_id + '_day_BCSD_' + rcp + '_r1i1p1_' + model_id + '_' + str(year) + '.nc'
        with xr.open_dataset(fname) as ds:
            # .load() forces the values into memory before the file is closed
            return ds.resample(time='1Y').min().load()

    # calculate minima and merge the two temperature variables for each RCP
    per_rcp = []
    for rcp in ('rcp45', 'rcp85'):
        ds_rcp = xr.merge([annual_min(var_id, rcp) for var_id in ('tasmin', 'tasmax')])
        # label with the scenario so the two can be stacked along 'rcp'
        per_rcp.append(ds_rcp.assign_coords(rcp=rcp))

    ds_out = xr.concat(per_rcp, dim='rcp')

    # K -> C
    ds_out['tasmax'] = ds_out['tasmax'] - 273.15
    ds_out['tasmin'] = ds_out['tasmin'] - 273.15

    return ds_out

In [16]:
# loop through models: RUNTIME IS ~5 MINS PER MODEL WITH 30 DASK WORKERS
# Make sure the target folder exists up front; otherwise the write at the end
# of the first model would fail only after all of its years were computed.
os.makedirs(nex_out + 'annual_mins/', exist_ok=True)

for model in models:
    # check if already exists
    if os.path.isfile(nex_out + 'annual_mins/' + model + '.nc'):
        print(model + ' already done')
        continue

    # Parallelize with dask over years.
    # The input directory is nex_in (the previous code passed an undefined
    # name `path`, which raises NameError on a fresh kernel).
    delayed_res = [dask.delayed(model_year_minima)(model, year, nex_in)
                   for year in range(2006, 2100)]

    # some models are missing 2100
    if os.path.isfile(nex_in + 'tasmax_day_BCSD_rcp45_r1i1p1_' + model + '_2100.nc'):
        delayed_res.append(dask.delayed(model_year_minima)(model, 2100, nex_in))

    # Run
    res = dask.compute(*delayed_res)

    # Store: stitch the single-year datasets together along their coordinates
    df_final = xr.combine_by_coords(res)
    df_final.to_netcdf(nex_out + 'annual_mins/' + model + '.nc')

    print(model)

CNRM-CM5 already done
CSIRO-Mk3-6-0 already done
CanESM2 already done
GFDL-ESM2G already done
GFDL-ESM2M already done
IPSL-CM5A-LR already done
IPSL-CM5A-MR already done
MIROC-ESM already done
MIROC-ESM-CHEM already done
MIROC5 already done
MRI-CGCM3 already done
NorESM1-M already done
bcc-csm1-1 already done
inmcm4 already done


## Precipitation indices

In [17]:
# calculate annual precipitation indices for single model-year over all RCPs
def model_year_prcp_inds(model_id, year, path):
    """Annual precipitation indices (SDII, R20mm) for one model and one year.

    Reads ``<path>pr_day_BCSD_<rcp>_r1i1p1_<model_id>_<year>.nc`` for
    rcp in (rcp45, rcp85). Thresholds are written as mm day-1 divided by
    86400 because the files store pr in kg m-2 s-1.

    Parameters
    ----------
    model_id : str
        Model name as it appears in the file names.
    year : int
        Year to process.
    path : str
        Input directory, including the trailing '/'.

    Returns
    -------
    xarray.Dataset
        ``SDII``: mean pr over days with pr >= 1 mm day-1, in mm day-1.
        ``R20mm``: number of days with pr >= 20 mm day-1.
        Both carry a leading ``rcp`` dimension holding 'rcp45' and 'rcp85'.
    """
    def rcp_indices(rcp):
        fname = path + 'pr_day_BCSD_' + rcp + '_r1i1p1_' + model_id + '_' + str(year) + '.nc'
        # Open, reduce and close right away so that file handles do not pile
        # up on long-lived dask workers; .load() pulls the values into memory
        # before the file is closed.
        with xr.open_dataset(fname) as ds_pr:
            # calculate indices: days below the threshold are masked to NaN, so
            # mean() averages wet days only and count() counts heavy-rain days
            ds_sdii = ds_pr.where(ds_pr.pr >= 1./86400).resample(time='1Y').mean().load()
            # NOTE(review): count() yields 0 (not NaN) for cells that are NaN
            # on every day of the input - confirm such cells are masked
            # downstream if the source data contains them.
            ds_r20mm = ds_pr.where(ds_pr.pr >= 20./86400).resample(time='1Y').count().load()

        # merge both indices into one dataset and label it with the scenario
        ds_rcp = xr.combine_by_coords([ds_sdii.rename({'pr': 'SDII'}),
                                       ds_r20mm.rename({'pr': 'R20mm'})])
        return ds_rcp.assign_coords(rcp=rcp)

    # concat along the rcp dimension
    ds_out = xr.concat([rcp_indices(rcp) for rcp in ('rcp45', 'rcp85')], dim='rcp')

    # unit conversions
    ds_out['SDII'] = ds_out['SDII'] * 86400 # kg m-2 s-1 -> mm day-1

    return ds_out

In [18]:
# loop through models: RUNTIME IS ~5 MINS PER MODEL WITH 30 DASK WORKERS
# Make sure the target folder exists up front; otherwise the write at the end
# of the first model would fail only after all of its years were computed.
os.makedirs(nex_out + 'prcp_inds/', exist_ok=True)

for model in models:
    # check if already exists
    if os.path.isfile(nex_out + 'prcp_inds/' + model + '.nc'):
        print(model + ' already done')
        continue

    # Parallelize with dask over years.
    # The input directory is nex_in (the previous code passed an undefined
    # name `path`, which raises NameError on a fresh kernel).
    delayed_res = [dask.delayed(model_year_prcp_inds)(model, year, nex_in)
                   for year in range(2006, 2100)]

    # some models are missing 2100
    if os.path.isfile(nex_in + 'pr_day_BCSD_rcp45_r1i1p1_' + model + '_2100.nc'):
        delayed_res.append(dask.delayed(model_year_prcp_inds)(model, 2100, nex_in))

    # Run
    res = dask.compute(*delayed_res)

    # Store: stitch the single-year datasets together along their coordinates
    df_final = xr.combine_by_coords(res)
    df_final.to_netcdf(nex_out + 'prcp_inds/' + model + '.nc')

    print(model)

CNRM-CM5 already done
CSIRO-Mk3-6-0 already done
CanESM2 already done
GFDL-ESM2G already done
GFDL-ESM2M already done
IPSL-CM5A-LR already done
IPSL-CM5A-MR already done
MIROC-ESM already done
MIROC-ESM-CHEM already done
MIROC5 already done
MRI-CGCM3 already done
NorESM1-M already done
bcc-csm1-1 already done
inmcm4 already done
