In [1]:
import numpy as np
import pandas as pd
import xarray as xr
import glob
import matplotlib
import matplotlib.pyplot as plt
import cftime
import dask
import statsmodels.api as sm

In [2]:
# Updated for PBS scheduler
# this could go into utils.
# By default gets 1 core w/ 25 GB memory
def get_ClusterClient(ncores=1, nmem='25GB'):
    import dask
    from dask_jobqueue import PBSCluster
    from dask.distributed import Client
    ncores=ncores
    nmem = nmem

    cluster = PBSCluster(
        cores=ncores, # The number of cores you want
        memory=nmem, # Amount of memory
        processes=ncores, # How many processes
        queue='casper', # The type of queue to utilize (/glade/u/apps/dav/opt/usr/bin/execcasper)
        resource_spec='select=1:ncpus='+str(ncores)+':mem='+nmem, # Specify resources
        project='P93300641', # Input your project ID here
        walltime='4:00:00', # Amount of wall time
        interface='ib0', # Interface to use
    )

    client = Client(cluster)
    return cluster, client

In [3]:
cluster, client = get_ClusterClient()
cluster.scale(20)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 38692 instead
  http_address["port"], self.http_server.port


In [195]:
client

0,1
Client  Scheduler: tcp://10.12.206.54:35426  Dashboard: https://https://jupyterhub.hpc.ucar.edu/stable/user/djk2120/proxy/{port}/status,Cluster  Workers: 20  Cores: 20  Memory: 500.00 GB


In [5]:
def get_params(keys,paramkey):
    params=[]
    minmaxs=[]
    for key in keys:
        ix     = paramkey.key==key
        params.append(paramkey.param[ix].values[0])
        minmaxs.append(paramkey.minmax[ix].values[0])
    return params,minmaxs

In [6]:
def month_wts(nyears):
    days_pm  = [31,28,31,30,31,30,31,31,30,31,30,31]
    return xr.DataArray(np.tile(days_pm,nyears),dims='time')

In [193]:
def get_ensemble(name,data_vars,htape,keys,paramkey):
    '''
    Returns a dataset comprising the given ensemble
    name  = 'CTL2020' or 'C285' or 'AF1905'
    data_vars, e.g. ['GPP','HR','AR']
    htape, e.g. 'h0' 0/1/2/3/4/5/7 available
    '''
    
    #define the directory structure and find files
    topdir     = '/glade/scratch/djk2120/PPEn11/hist/' 
    thisdir    = topdir+name+'/'
    files      = [glob.glob(thisdir+'*'+key+'*'+htape+'*.nc')[0] for key in keys]

    def preprocess(ds):
        return ds[data_vars]

    #read in the dataset
    ds = xr.open_mfdataset(files,combine='nested',concat_dim='ens',
                           parallel=True,preprocess=preprocess)

    #fix up time dimension
    ds['time'] = xr.cftime_range(str(2005),periods=120,freq='MS') #fix time bug
    
    #add in some extra variables
    ds0 = xr.open_dataset(files[0])
    extras = ['grid1d_lat','grid1d_lon']
    for extra in extras:
        ds[extra]=ds0[extra]

    #append some info about key/param/minmax
    params,minmaxs = get_params(keys,paramkey) 
    ds['key']    = xr.DataArray(keys,dims='ens')
    ds['param']  = xr.DataArray(params,dims='ens')
    ds['minmax'] = xr.DataArray(minmaxs,dims='ens')
    
    return ds

In [226]:
import warnings
warnings.filterwarnings("ignore")

In [9]:
#fetch the paraminfo
csv = '/glade/scratch/djk2120/PPEn11/firstpass.csv' 
paramkey = pd.read_csv(csv)

#fetch the sparsegrid landarea
la_file = '/glade/scratch/djk2120/PPEn08/sparsegrid_landarea.nc'
la = xr.open_dataset(la_file).landarea  #km2

### CTL2010

In [251]:
#choose your subset of variables
data_vars = ['GPP','AR','HR','EFLX_LH_TOT','FCTR','FAREA_BURNED',
             'TWS','SOILWATER_10CM','SNOWDP','TV','TSOI_10CM','TLAI','FSR']
keys = paramkey.key
#read in the dataset
ds = get_ensemble('CTL2010',data_vars,'h0',keys,paramkey)

In [24]:
uniques = np.unique(ds.param)
ix = uniques=='default'
uniques = uniques[~ix]

In [199]:
def find_pair(da,params,p):
    ix = params==p
    if ix.sum().values<2:
        ix = np.logical_or(ix,params=='default')
    return da.isel(ens=ix)
    

In [217]:
def param_effect(pair):
    delt  = pair.isel(ens=0)-pair.isel(ens=1)
    sigma = np.std(delt.mean(dim='year'))
    if sigma>0:
        pe_mean = abs((la*delt).sum(dim='gridcell').mean(dim='year').values)
        iav     = (la*pair).sum(dim='gridcell').std(dim='year').values
        pe_iav  = abs(iav[0]-iav[1])
    else:
        pe_mean = 0
        pe_iav  = 0

    
    return pe_mean,pe_iav
    

In [None]:
def param_effect2(pair1,pair2):
    

In [226]:
def spatial(da,params,uniques):
    #lower scores have more distinct spatial signature
    nx = len(uniques)
    
    #calculate delta annual mean for each param-pair
    delts = np.ndarray([nx,400])
    for i,u in zip(range(nx),uniques):
        pair = find_pair(da,params,u).mean(dim='year')
        delt = pair.isel(ens=0) - pair.isel(ens=1)
        delts[i,:] = delt
        
    #calculate cross-correlations
    #  skip calc if sigma==0
    sigs = np.std(delts,axis=1)
    rvals = np.zeros([nx,nx])
    for i in range(nx):
        if sigs[i]==0:
            rvals[i,:]=1
        else:
            for j in range(i,nx,1):
                if sigs[j]==0:
                    rvals[i,j] = 1
                elif i==j:
                    rvals[i,j] = 1
                else:
                    x=delts[[i,j],:]
                    r2= np.corrcoef(x)[0,1]**2
                    rvals[i,j] =r2
                    rvals[j,i] =r2
                    
    return rvals.sum(axis=0)

In [252]:
pes=dict()
for datavar in data_vars:
    print(datavar)
    da = (month_wts(10)*ds[datavar]).groupby('time.year').sum().compute()
    s1 = []; s2 = []
    for u in uniques:
        pair = find_pair(da,ds.param,u)
        m,i  = param_effect(pair)
        s1.append(m)
        s2.append(i)
    s3 = spatial(da,ds.param,uniques)
    pes[datavar]={'s1':s1,'s2':s2,'s3':s3}

GPP
AR
HR
EFLX_LH_TOT
FCTR
FAREA_BURNED
TWS
SOILWATER_10CM
SNOWDP
TV
TSOI_10CM
TLAI
FSR


In [255]:
top50 = []
for datavar in data_vars:
    cats  = ['s1','s2','s3']
    nxs   = [5,3,1]
    flips = [False,False,True]
    for cat,nx,flip in zip(cats,nxs,flips):
        s = pes[datavar][cat]
        ranks = np.argsort(s)   
        if flip:
            ranks = np.flipud(ranks)
        for i in ranks[-nx:]:
            u = uniques[i]
            if u not in top50:
                print(datavar,cat,u)
                top50.append(u)
    
    

GPP s1 lmrhd
GPP s1 kmax
GPP s1 medlynintercept
GPP s1 jmaxb0
GPP s1 lmrse
GPP s2 wc2wjb0
GPP s2 leafcn
GPP s2 sucsat_sf
GPP s3 aq_sp_yield_min
AR s1 vcmaxse_sf
HR s2 FUN_fracfixers
HR s3 occur_hi_gdp_tree
EFLX_LH_TOT s1 medlynslope
EFLX_LH_TOT s2 fff
EFLX_LH_TOT s2 frac_sat_soil_dsl_init
FCTR s3 non_boreal_peatfire_c
FAREA_BURNED s1 jmaxse_sf
TWS s1 hksat_sf
TWS s1 bsw_sf
TWS s1 watsat_sf
TWS s2 baseflow_scalar
SNOWDP s1 ceta
SNOWDP s1 snw_rds_refrz
SNOWDP s1 zsno
SNOWDP s1 zetamaxstable
SNOWDP s1 upplim_destruct_metamorph
SNOWDP s2 wind_snowcompact_fact
SNOWDP s3 cropfire_a1
TV s2 kcha
TV s2 nstem
TLAI s2 theta_cj
FSR s1 rhosnir
FSR s1 rholnir
FSR s1 taulnir


In [281]:
top50

['lmrhd',
 'kmax',
 'medlynintercept',
 'jmaxb0',
 'lmrse',
 'wc2wjb0',
 'leafcn',
 'sucsat_sf',
 'aq_sp_yield_min',
 'vcmaxse_sf',
 'FUN_fracfixers',
 'occur_hi_gdp_tree',
 'medlynslope',
 'fff',
 'frac_sat_soil_dsl_init',
 'non_boreal_peatfire_c',
 'jmaxse_sf',
 'hksat_sf',
 'bsw_sf',
 'watsat_sf',
 'baseflow_scalar',
 'ceta',
 'snw_rds_refrz',
 'zsno',
 'zetamaxstable',
 'upplim_destruct_metamorph',
 'wind_snowcompact_fact',
 'cropfire_a1',
 'kcha',
 'nstem',
 'theta_cj',
 'rhosnir',
 'rholnir',
 'taulnir']

In [256]:
len(top50)

34

In [258]:
#choose your subset of variables
data_vars = ['GPP','AR','HR','EFLX_LH_TOT','FCTR','FAREA_BURNED',
             'TWS','SOILWATER_10CM','SNOWDP','TV','TSOI_10CM','TLAI','FSR']
keys = paramkey.key
#read in the dataset
c285 = get_ensemble('C285',data_vars,'h0',keys,paramkey)

In [263]:
datavar = 'GPP'
s4 = []
da1 = (month_wts(10)*c285[datavar]).groupby('time.year').sum().compute()
da2 = (month_wts(10)*ds[datavar]).groupby('time.year').sum().compute()

In [277]:
s4=[]
for u in uniques:

    pair1 = find_pair(da1,ds.param,u)
    pair2 = find_pair(da2,ds.param,u)

    x1=(la*pair1).sum(dim='gridcell').mean(dim='year')
    x2=(la*pair2).sum(dim='gridcell').mean(dim='year')
    fx = (x1/x2).values
    dx = abs(fx[1]-fx[0])

    s4.append(dx)

In [280]:
ranks = np.argsort(s4)
for i in ranks[-5:]:
    u = uniques[i]
    print(u)

lmrhd
vcmaxha
wc2wjb0
medlynintercept
jmaxb0


### C285