In [1]:
import numpy as np
import xarray as xr
import dask
import os
from glob import glob

### Preliminaries

In [2]:
###############################
# Set paths
# UPDATE THIS FOR REPRODUCTION
###############################
# Every path keeps its trailing slash: the cells below build file and
# directory names by plain string concatenation (e.g. `loca_in + model + ...`).

# Raw input directories, one per downscaled ensemble. Only the LOCA files are
# read in this notebook; the NEX and MACA directories are globbed to find the
# models common to all three ensembles.
nex_in = '/gpfs/group/kaf26/default/public/NEX-GDDP/raw/'
loca_in = '/gpfs/group/kaf26/default/public/LOCA/raw/'
maca_in = '/gpfs/group/kzk10/default/public/MACAv2-METDATA/raw/'

# Root of the output tree; the loops below write one netCDF per model into
# sub-directories of it (e.g. 'annual_avgs/').
loca_out = '/gpfs/group/kaf26/default/dcl5300/lafferty-sriver_inprep_tbh_DATA/cmip5/loca/'

In [3]:
###################
# Models (CMIP5)
###################
# Builds `models`: the models available in NEX-GDDP, LOCA and MACA for both
# RCPs and all variables, restricted to the r1i1p1 ensemble member.

# --- NEX-GDDP: models with all RCPs and variables (pr, tasmin, tasmax) ---
# Strip the '<var>_day_BCSD_<rcp>_r1i1p1_' prefix and the last 8 characters
# (presumably '_2006.nc') from every 2006 file to leave the model name.
nex_prefixes = [nex_in + var + '_day_BCSD_' + rcp + '_r1i1p1_'
                for var in ('pr', 'tasmin', 'tasmax')
                for rcp in ('rcp45', 'rcp85')]

nex_models = []
for nex_file in glob(nex_in + '*2006*'):
    for prefix in nex_prefixes:
        nex_file = nex_file.replace(prefix, '')
    nex_models.append(nex_file[:-8])
nex_models = np.asarray(nex_models)

# a complete model shows up exactly 6 times: 2 RCPs x 3 variables
nex_names, nex_counts = np.unique(nex_models, return_counts=True)
complete_nex_models = nex_names[nex_counts == 6]

# --- LOCA: one sub-directory per model ---
# basename() instead of a hard-coded slice offset, so this keeps working when
# `loca_in` is changed for reproduction.
complete_loca_models = [os.path.basename(x) for x in glob(loca_in + '*')]

# Ensemble member available for each LOCA model. The processing functions
# below hard-code 'r1i1p1', so that member is preferred whenever it exists;
# otherwise the first entry (sorted, for determinism) is recorded, or None
# when the rcp45 directory is empty/missing (such models are dropped below).
loca_members = {}
for model in complete_loca_models:
    members = sorted(os.path.basename(x) for x in glob(loca_in + model + '/16th/rcp45/*'))
    if 'r1i1p1' in members:
        loca_members[model] = 'r1i1p1'
    elif members:
        loca_members[model] = members[0]
    else:
        loca_members[model] = None


# --- MACA: models with all RCPs and variables ---
def _maca_model_ids(var, rcp):
    """Return the '<model>_<member>' ids that have a 2006-2010 MACA file for var/rcp."""
    prefix = maca_in + 'macav2metdata_' + var + '_'
    files = glob(prefix + '*_' + rcp + '_2006_2010_CONUS_daily.nc')
    # [:-31] removes the trailing '_<rcp>_2006_2010_CONUS_daily.nc'
    return [x.replace(prefix, '')[:-31] for x in files]


maca_models_pr45 = _maca_model_ids('pr', 'rcp45')
maca_models_pr85 = _maca_model_ids('pr', 'rcp85')
maca_models_tmax45 = _maca_model_ids('tasmax', 'rcp45')
maca_models_tmax85 = _maca_model_ids('tasmax', 'rcp85')
maca_models_tmin45 = _maca_model_ids('tasmin', 'rcp45')
maca_models_tmin85 = _maca_model_ids('tasmin', 'rcp85')

# Keep only ids present in ALL six lists. The previous version took the union
# of the lists and listed tasmax/rcp85 twice while omitting tasmin/rcp85, so
# incomplete models were not filtered out.
complete_maca_models = sorted(
    set(maca_models_pr45)
    & set(maca_models_pr85)
    & set(maca_models_tmax45)
    & set(maca_models_tmax85)
    & set(maca_models_tmin45)
    & set(maca_models_tmin85)
)

# ids are '<model>_<member>' with a 6-character member (e.g. 'r1i1p1')
maca_members = {model_id[:-7]: model_id[-6:] for model_id in complete_maca_models}
complete_maca_models = [model_id[:-7] for model_id in complete_maca_models]

# intersection of models
models = np.intersect1d(np.intersect1d(complete_nex_models, complete_loca_models), complete_maca_models)

# drop any final loca/maca models that are not r1i1p1 (all nex are r1i1p1)
is_r1i1p1 = np.array([loca_members[model] == 'r1i1p1' and maca_members[model] == 'r1i1p1'
                      for model in models], dtype=bool)
bad_models = list(models[~is_r1i1p1])  # models excluded because of their ensemble member
models = models[is_r1i1p1]

In [4]:
############
# Dask
############
from dask.distributed import Client
from dask_jobqueue import PBSCluster

# Each PBS job provides a single-core worker with 20GB of memory and a
# two-hour walltime limit.
cluster = PBSCluster(
    cores=1,
    memory='20GB',
    resource_spec='pmem=20GB',
    walltime='02:00:00',
    env_extra=['#PBS -l feature=rhel7'],
)

# ask for jobs
cluster.scale(jobs=30)

# Connect this notebook to the cluster; dask.compute calls below go through it
client = Client(cluster)

# Bare last expression: display the client summary
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 38408 instead


0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.PBSCluster
Dashboard: /proxy/38408/status,

0,1
Dashboard: /proxy/38408/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://10.102.201.223:43788,Workers: 0
Dashboard: /proxy/38408/status,Total threads: 0
Started: Just now,Total memory: 0 B


## Annual averages

In [18]:
# calculate annual means for single model-year over all RCPs and variables
def model_year_means(model_id, year, path):
    """Compute annual means of tas, tasmin, tasmax and pr for one LOCA model-year.

    Reads the daily tasmax, tasmin and pr files of `model_id` for `year`
    (member r1i1p1) under both rcp45 and rcp85, derives daily tas as the
    average of tasmax and tasmin, and averages everything over the year.

    Parameters
    ----------
    model_id : str
        Model name as it appears in the LOCA directory and file names.
    year : int
        Year to process; one input file covers <year>0101-<year>1231.
    path : str
        Root directory of the raw LOCA data, including the trailing slash.

    Returns
    -------
    xarray.Dataset
        Variables 'tas', 'tasmin', 'tasmax' (after the K -> C conversion) and
        'pr' (after the kg m-2 s-1 -> mm day-1 conversion), concatenated along
        a new 'rcp' dimension with labels 'rcp45' and 'rcp85'.

    Raises
    ------
    FileNotFoundError
        If any of the six input files is missing (raised by xarray).
    """
    # constant tail of every file name
    netcdf_const = '1231.LOCA_2016-04-02.16th.nc'

    def _open(var, rcp):
        # <path><model>/16th/<rcp>/r1i1p1/<var>/<var>_day_<model>_<rcp>_r1i1p1_<year>0101-<year>1231...
        return xr.open_dataset(
            path + model_id + '/16th/' + rcp + '/r1i1p1/' + var + '/' + var + '_day_'
            + model_id + '_' + rcp + '_r1i1p1_' + str(year) + '0101-' + str(year) + netcdf_const
        )

    per_rcp = []
    for rcp in ('rcp45', 'rcp85'):
        # Context managers close the netCDF handles as soon as the annual
        # values are in memory, instead of leaving that to garbage collection.
        with _open('tasmax', rcp) as ds_tmax, \
             _open('tasmin', rcp) as ds_tmin, \
             _open('pr', rcp) as ds_pr:
            # daily mean temperature approximated as the tasmax/tasmin midpoint
            da_tas = (ds_tmax['tasmax'] + ds_tmin['tasmin']) / 2.

            # calculate avgs and merge; .load() guarantees nothing still
            # points at the files once they are closed
            ds_rcp = xr.merge([
                da_tas.resample(time='1Y').mean().to_dataset(name='tas'),
                ds_tmin.resample(time='1Y').mean(),
                ds_tmax.resample(time='1Y').mean(),
                ds_pr.resample(time='1Y').mean(),
            ]).load()

        # label with the scenario so the two can be stacked along 'rcp'
        per_rcp.append(ds_rcp.assign_coords(rcp=rcp))

    ds_out = xr.concat(per_rcp, dim='rcp')

    # unit conversions
    ds_out['tas'] = ds_out['tas'] - 273.15 # K -> C
    ds_out['tasmax'] = ds_out['tasmax'] - 273.15 # K -> C
    ds_out['tasmin'] = ds_out['tasmin'] - 273.15 # K -> C
    ds_out['pr'] = ds_out['pr'] * 86400 # kg m-2 s-1 -> mm day-1

    return ds_out

In [19]:
# loop through models: RUNTIME IS ~5 MINS PER MODEL WITH 30 DASK WORKERS
for model in models:
    out_file = loca_out + 'annual_avgs/' + model + '.nc'

    # nothing to do when the output is already on disk
    if os.path.isfile(out_file):
        print(model + ' already done')
        continue

    # 2006-2099 are always processed; some models are missing 2100, so it is
    # added only when its rcp45 precipitation file exists
    years = list(range(2006, 2100))
    file_2100 = (loca_in + model + '/16th/rcp45/r1i1p1/pr/pr_day_' + model
                 + '_rcp45_r1i1p1_21000101-21001231.LOCA_2016-04-02.16th.nc')
    if os.path.isfile(file_2100):
        years.append(2100)

    # Parallelize with dask over years: one delayed task per year
    delayed_res = [dask.delayed(model_year_means)(model, year, loca_in) for year in years]

    # Run
    yearly = dask.compute(*delayed_res)

    # Store: stitch the yearly datasets together along their coordinates
    ds_final = xr.combine_by_coords(yearly)
    ds_final.to_netcdf(out_file)

    print(model)

CNRM-CM5
CSIRO-Mk3-6-0
CanESM2
GFDL-ESM2G
GFDL-ESM2M
IPSL-CM5A-LR
IPSL-CM5A-MR
MIROC-ESM
MIROC-ESM-CHEM
MIROC5
MRI-CGCM3
NorESM1-M
bcc-csm1-1
inmcm4


In [20]:
# NOTE(review): in the saved cell order this runs before the annual maxima,
# annual minima and precipitation-index cells below, so a top-to-bottom run
# closes the distributed client before those cells call dask.compute.
# Consider moving this cell to the end of the notebook.
client.close()

## Annual maxima

In [8]:
# calculate annual maxima for single model-year over all RCPs and variables
def model_year_maxima(model_id, year, path):
    """Compute annual maxima of tasmin, tasmax and pr for one LOCA model-year.

    Reads the daily tasmax, tasmin and pr files of `model_id` for `year`
    (member r1i1p1) under both rcp45 and rcp85 and takes the maximum of each
    variable over the year.

    Parameters
    ----------
    model_id : str
        Model name as it appears in the LOCA directory and file names.
    year : int
        Year to process; one input file covers <year>0101-<year>1231.
    path : str
        Root directory of the raw LOCA data, including the trailing slash.

    Returns
    -------
    xarray.Dataset
        Variables 'tasmin', 'tasmax' (after the K -> C conversion) and 'pr'
        (after the kg m-2 s-1 -> mm day-1 conversion), concatenated along a
        new 'rcp' dimension with labels 'rcp45' and 'rcp85'.

    Raises
    ------
    FileNotFoundError
        If any of the six input files is missing (raised by xarray).
    """
    # constant tail of every file name
    netcdf_const = '1231.LOCA_2016-04-02.16th.nc'

    def _open(var, rcp):
        # <path><model>/16th/<rcp>/r1i1p1/<var>/<var>_day_<model>_<rcp>_r1i1p1_<year>0101-<year>1231...
        return xr.open_dataset(
            path + model_id + '/16th/' + rcp + '/r1i1p1/' + var + '/' + var + '_day_'
            + model_id + '_' + rcp + '_r1i1p1_' + str(year) + '0101-' + str(year) + netcdf_const
        )

    per_rcp = []
    for rcp in ('rcp45', 'rcp85'):
        # Context managers close the netCDF handles as soon as the annual
        # values are in memory, instead of leaving that to garbage collection.
        with _open('tasmax', rcp) as ds_tmax, \
             _open('tasmin', rcp) as ds_tmin, \
             _open('pr', rcp) as ds_pr:
            # calculate maxima and merge; .load() guarantees nothing still
            # points at the files once they are closed
            ds_rcp = xr.merge([
                ds_tmin.resample(time='1Y').max(),
                ds_tmax.resample(time='1Y').max(),
                ds_pr.resample(time='1Y').max(),
            ]).load()

        # label with the scenario so the two can be stacked along 'rcp'
        per_rcp.append(ds_rcp.assign_coords(rcp=rcp))

    ds_out = xr.concat(per_rcp, dim='rcp')

    # unit conversions
    ds_out['tasmax'] = ds_out['tasmax'] - 273.15 # K -> C
    ds_out['tasmin'] = ds_out['tasmin'] - 273.15 # K -> C
    ds_out['pr'] = ds_out['pr'] * 86400 # kg m-2 s-1 -> mm day-1

    return ds_out

In [9]:
# loop through models: RUNTIME IS ~5 MINS PER MODEL WITH 30 DASK WORKERS
for model in models:
    out_file = loca_out + 'annual_maxs/' + model + '.nc'

    # nothing to do when the output is already on disk
    if os.path.isfile(out_file):
        print(model + ' already done')
        continue

    # 2006-2099 are always processed; some models are missing 2100, so it is
    # added only when its rcp45 precipitation file exists
    years = list(range(2006, 2100))
    file_2100 = (loca_in + model + '/16th/rcp45/r1i1p1/pr/pr_day_' + model
                 + '_rcp45_r1i1p1_21000101-21001231.LOCA_2016-04-02.16th.nc')
    if os.path.isfile(file_2100):
        years.append(2100)

    # Parallelize with dask over years: one delayed task per year
    delayed_res = [dask.delayed(model_year_maxima)(model, year, loca_in) for year in years]

    # Run
    yearly = dask.compute(*delayed_res)

    # Store: stitch the yearly datasets together along their coordinates
    ds_final = xr.combine_by_coords(yearly)
    ds_final.to_netcdf(out_file)

    print(model)

CNRM-CM5 already done
CSIRO-Mk3-6-0 already done
CanESM2 already done
GFDL-ESM2G already done
GFDL-ESM2M already done
IPSL-CM5A-LR already done
IPSL-CM5A-MR already done
MIROC-ESM already done
MIROC-ESM-CHEM already done
MIROC5 already done
MRI-CGCM3 already done
NorESM1-M already done
bcc-csm1-1 already done
inmcm4 already done


## Annual minima (temperature only)

In [10]:
# calculate annual minima for single model-year over all RCPs and temperature variables
def model_year_minima(model_id, year, path):
    """Compute annual minima of tasmin and tasmax for one LOCA model-year.

    Reads the daily tasmax and tasmin files of `model_id` for `year`
    (member r1i1p1) under both rcp45 and rcp85 and takes the minimum of each
    variable over the year.

    Parameters
    ----------
    model_id : str
        Model name as it appears in the LOCA directory and file names.
    year : int
        Year to process; one input file covers <year>0101-<year>1231.
    path : str
        Root directory of the raw LOCA data, including the trailing slash.

    Returns
    -------
    xarray.Dataset
        Variables 'tasmin' and 'tasmax' (after the K -> C conversion),
        concatenated along a new 'rcp' dimension with labels 'rcp45' and
        'rcp85'.

    Raises
    ------
    FileNotFoundError
        If any of the four input files is missing (raised by xarray).
    """
    # constant tail of every file name
    netcdf_const = '1231.LOCA_2016-04-02.16th.nc'

    def _open(var, rcp):
        # <path><model>/16th/<rcp>/r1i1p1/<var>/<var>_day_<model>_<rcp>_r1i1p1_<year>0101-<year>1231...
        return xr.open_dataset(
            path + model_id + '/16th/' + rcp + '/r1i1p1/' + var + '/' + var + '_day_'
            + model_id + '_' + rcp + '_r1i1p1_' + str(year) + '0101-' + str(year) + netcdf_const
        )

    per_rcp = []
    for rcp in ('rcp45', 'rcp85'):
        # Context managers close the netCDF handles as soon as the annual
        # values are in memory, instead of leaving that to garbage collection.
        with _open('tasmax', rcp) as ds_tmax, _open('tasmin', rcp) as ds_tmin:
            # calculate minima and merge; .load() guarantees nothing still
            # points at the files once they are closed
            ds_rcp = xr.merge([
                ds_tmin.resample(time='1Y').min(),
                ds_tmax.resample(time='1Y').min(),
            ]).load()

        # label with the scenario so the two can be stacked along 'rcp'
        per_rcp.append(ds_rcp.assign_coords(rcp=rcp))

    ds_out = xr.concat(per_rcp, dim='rcp')

    # K -> C
    ds_out['tasmax'] = ds_out['tasmax'] - 273.15
    ds_out['tasmin'] = ds_out['tasmin'] - 273.15

    return ds_out

In [11]:
# loop through models: RUNTIME IS ~5 MINS PER MODEL WITH 30 DASK WORKERS
for model in models:
    out_file = loca_out + 'annual_mins/' + model + '.nc'

    # nothing to do when the output is already on disk
    if os.path.isfile(out_file):
        print(model + ' already done')
        continue

    # 2006-2099 are always processed; some models are missing 2100, so it is
    # added only when its rcp45 precipitation file exists
    years = list(range(2006, 2100))
    file_2100 = (loca_in + model + '/16th/rcp45/r1i1p1/pr/pr_day_' + model
                 + '_rcp45_r1i1p1_21000101-21001231.LOCA_2016-04-02.16th.nc')
    if os.path.isfile(file_2100):
        years.append(2100)

    # Parallelize with dask over years: one delayed task per year
    delayed_res = [dask.delayed(model_year_minima)(model, year, loca_in) for year in years]

    # Run
    yearly = dask.compute(*delayed_res)

    # Store: stitch the yearly datasets together along their coordinates
    ds_final = xr.combine_by_coords(yearly)
    ds_final.to_netcdf(out_file)

    print(model)

CNRM-CM5 already done
CSIRO-Mk3-6-0 already done
CanESM2 already done
GFDL-ESM2G already done
GFDL-ESM2M already done
IPSL-CM5A-LR already done
IPSL-CM5A-MR already done
MIROC-ESM already done
MIROC-ESM-CHEM already done
MIROC5 already done
MRI-CGCM3 already done
NorESM1-M already done
bcc-csm1-1 already done
inmcm4 already done


## Precipitation indices

In [12]:
# calculate annual precipitation indices for single model-year over all RCPs
def model_year_prcp_inds(model_id, year, path):
    """Compute the SDII and R20mm precipitation indices for one LOCA model-year.

    Reads the daily pr file of `model_id` for `year` (member r1i1p1) under
    both rcp45 and rcp85 and derives, per grid cell:

    * SDII  - mean precipitation over days with pr >= 1/86400 (1 mm day-1
      when pr is in kg m-2 s-1)
    * R20mm - number of days with pr >= 20/86400 (20 mm day-1)

    Parameters
    ----------
    model_id : str
        Model name as it appears in the LOCA directory and file names.
    year : int
        Year to process; one input file covers <year>0101-<year>1231.
    path : str
        Root directory of the raw LOCA data, including the trailing slash.

    Returns
    -------
    xarray.Dataset
        Variables 'SDII' (after the kg m-2 s-1 -> mm day-1 conversion) and
        'R20mm' (a day count), concatenated along a new 'rcp' dimension with
        labels 'rcp45' and 'rcp85'.

    Raises
    ------
    FileNotFoundError
        If either input file is missing (raised by xarray).
    """
    # constant across all files
    netcdf_const = '1231.LOCA_2016-04-02.16th.nc'

    per_rcp = []
    for rcp in ('rcp45', 'rcp85'):
        pr_file = (path + model_id + '/16th/' + rcp + '/r1i1p1/pr/pr_day_' + model_id
                   + '_' + rcp + '_r1i1p1_' + str(year) + '0101-' + str(year) + netcdf_const)

        # Context manager closes the netCDF handle as soon as the indices are
        # in memory, instead of leaving that to garbage collection.
        with xr.open_dataset(pr_file) as ds_pr:
            # days below the threshold are masked to NaN, so mean() averages
            # over qualifying days only and count() counts them
            ds_sdii = ds_pr.where(ds_pr.pr >= 1./86400).resample(time='1Y').mean()
            # NOTE(review): count() returns 0 (not NaN) for cells that are NaN
            # on every day - confirm that is acceptable for masked cells
            ds_r20mm = ds_pr.where(ds_pr.pr >= 20./86400).resample(time='1Y').count()

            # merge; .load() guarantees nothing still points at the file once
            # it is closed
            ds_rcp = xr.combine_by_coords([ds_sdii.rename({'pr': 'SDII'})['SDII'],
                                           ds_r20mm.rename({'pr': 'R20mm'})['R20mm']]).load()

        # label with the scenario so the two can be stacked along 'rcp'
        per_rcp.append(ds_rcp.assign_coords(rcp=rcp))

    ds_out = xr.concat(per_rcp, dim='rcp')

    # unit conversions
    ds_out['SDII'] = ds_out['SDII'] * 86400 # kg m-2 s-1 -> mm day-1

    return ds_out

In [13]:
# loop through models: RUNTIME IS ~5 MINS PER MODEL WITH 30 DASK WORKERS
for model in models:
    # Single source of truth for the output location. The existence check used
    # to look in 'precip_inds/' while the file was written to 'prcp_inds/', so
    # finished models were never detected and were recomputed on every run.
    # The write location ('prcp_inds/') is kept so existing outputs stay valid.
    out_file = loca_out + 'prcp_inds/' + model + '.nc'

    # check if already exists
    if os.path.isfile(out_file):
        print(model + ' already done')
        continue

    # 2006-2099 are always processed; some models are missing 2100, so it is
    # added only when its rcp45 precipitation file exists
    years = list(range(2006, 2100))
    file_2100 = (loca_in + model + '/16th/rcp45/r1i1p1/pr/pr_day_' + model
                 + '_rcp45_r1i1p1_21000101-21001231.LOCA_2016-04-02.16th.nc')
    if os.path.isfile(file_2100):
        years.append(2100)

    # Parallelize with dask over years: one delayed task per year
    delayed_res = [dask.delayed(model_year_prcp_inds)(model, year, loca_in) for year in years]

    # Run
    res = dask.compute(*delayed_res)

    # Store: stitch the yearly datasets together along their coordinates
    df_final = xr.combine_by_coords(res)
    df_final.to_netcdf(out_file)

    print(model)

CNRM-CM5 already done
CSIRO-Mk3-6-0
CanESM2
GFDL-ESM2G
GFDL-ESM2M
IPSL-CM5A-LR
IPSL-CM5A-MR
MIROC-ESM
MIROC-ESM-CHEM
MIROC5
MRI-CGCM3
NorESM1-M
bcc-csm1-1
inmcm4
