In [2]:
import os

from pyDOE import *  # star import first so the explicit names below always win
import numpy as np
import pandas as pd
import xarray as xr
from scipy.io import netcdf as nc

### Download latest version of params file from Google Drive
* requires 'publishing' the Google Drive spreadsheet
* File > Publish to web
* the spreadsheet can then be published continuously to a stable URL (with some latency, maybe 1-2 minutes)
* note that the first tab must be the sheet where the relevant information is located

In [3]:
# Published (read-only) CSV export of the parameter spreadsheet
data_url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vQs413GtLXtHVDCqEPgAwn4BbDjoWmV7uFqOAWH4mgpxXoVfN6ijnJdhyRgLkV-n2eU-sSQush4CzYU/pub?output=csv'

# Build the download command:
#  * the URL is double-quoted so the shell cannot interpret '?' (a glob character)
#    or any other special character it contains
#  * --fail makes curl exit non-zero on an HTTP error instead of saving the error page
#  * --location follows redirects, if the server issues any
cmd = 'curl --fail --location "'+data_url+'" > params.csv'
status = os.system(cmd)

# Stop here if the download failed, rather than letting later cells read a
# missing, empty or stale params.csv
if status != 0:
    raise RuntimeError('download of params.csv failed (exit status '+str(status)+')')

# display the exit status (0 means success)
status

0

### Read in csv data, filtering by the "include" column

TO DO: include information about the default value? Useful to keep track of this, especially for namelist params

In [4]:
# The spreadsheet header spans two rows: row 0 holds the column names and
# row 1 is skipped (a plain pd.read_csv('params.csv') would treat it as data).
data = pd.read_csv('params.csv', header=0, skiprows=[1])

# columns carried forward into the sampling step
keep_columns = ['name', 'location', 'min', 'max', 'pft_mins', 'pft_maxs']

# keep only the rows flagged with include == 1
included = data['include'].eq(1)
params_full = data.loc[included, keep_columns]

# renumber rows from zero, discarding the original spreadsheet row numbers
params = params_full.reset_index(drop=True)

params

Unnamed: 0,name,location,min,max,pft_mins,pft_maxs
0,displar,P,0.4,0.95,,
1,dleaf,P,pft,pft,"0,0.000216,0.000216,0.00072,0.0081,0.0081,0.00...","0,0.00108,0.00108,0.0036,0.0567,0.0567,0.243,0..."
2,baseflow_scalar,N,0.0005,0.1,,
3,maximum_leaf_wetted_fraction,N,0.01,0.5,,
4,fff,P,0.02,5,,
5,medlynslope,P,pft,pft,"9,1.29,1.29,1.29,1.63,1.63,3.19,3.19,3.19,2.25...","9,4.7,4.7,4.7,4.59,4.59,5.11,5.11,5.11,9.27,9...."
6,kmax,P,pft,pft,"0,1.00E-08,1.00E-08,1.00E-08,1.00E-08,1.00E-08...","0,3.00E-08,3.00E-08,3.00E-08,3.00E-08,3.00E-08..."


In [5]:
# Example: parse the comma-separated pft-specific minimum values of the first
# pft-dependent parameter into a numpy array of floats
pftfirstind = params.index[params['min'].eq('pft')][0]
np.fromstring(params.loc[pftfirstind, 'pft_mins'], dtype=float, sep=',')

array([0.      , 0.000216, 0.000216, 0.00072 , 0.0081  , 0.0081  ,
       0.0081  , 0.0081  , 0.0081  , 0.0081  , 0.000405, 0.000162,
       0.000144, 0.000144, 0.000144, 0.000162, 0.000162, 0.000162,
       0.000162, 0.000162, 0.000162, 0.000162, 0.000162, 0.000162,
       0.000162, 0.000162, 0.000162, 0.000162, 0.000162, 0.000162,
       0.000162, 0.000162, 0.000162, 0.000162, 0.000162, 0.000162,
       0.000162, 0.000162, 0.000162, 0.000162, 0.000162, 0.000162,
       0.000162, 0.000162, 0.000162, 0.000162, 0.000162, 0.000162,
       0.000162, 0.000162, 0.000162, 0.000162, 0.000162, 0.000162,
       0.000162, 0.000162, 0.000162, 0.000162, 0.000162, 0.000162,
       0.000162, 0.000162, 0.000162, 0.000162, 0.000162, 0.000162,
       0.000162, 0.000162, 0.000162, 0.000162, 0.000162, 0.000162,
       0.000162, 0.000162, 0.000162, 0.000162, 0.000162, 0.000162,
       0.000162])

In [6]:
# Example: names of the parameters whose 'min' entry is the 'pft' flag
# (row mask and column label selected in a single .loc call)
params.loc[params['min'].eq('pft'), 'name']

1          dleaf
5    medlynslope
6           kmax
Name: name, dtype: object

In [8]:
# Example: dataframe indices of the pft-dependent parameters, as a numpy array
np.asarray(params.index[params['min'].eq('pft')])

array([1, 5, 6])

### Generate parameter sampling
 * option available for latin hypercube (LHC) or one-at-a-time (OAAT)
 * careful, each time you run LHC you get a new random draw

In [27]:
# ---- configuration -------------------------------------------------------
#sampling_protocol = 'OAAT'
sampling_protocol = 'LHC'
prefix = sampling_protocol
nparam = len(params['name'])  # number of parameters

# Optional seed for a reproducible LHC draw. None (the default) leaves the
# random state untouched, i.e. every run gives a new random draw.
# NOTE(review): assumes pyDOE's lhs draws from NumPy's global random state --
# confirm for the installed pyDOE version before relying on it.
lhc_seed = None

# Fail early on a mistyped protocol. Otherwise neither branch below would run
# and param_array would be undefined (or silently reused from an earlier run).
if sampling_protocol not in ('LHC', 'OAAT'):
    raise ValueError("sampling_protocol must be 'LHC' or 'OAAT', got "+repr(sampling_protocol))

# ---- pft bookkeeping shared by both protocols ------------------------------
# boolean mask / dataframe indices of the parameters flagged as pft-dependent
is_pft = (params['min']=='pft').values
pft_inds = params.index[is_pft].values

# figure out how many pft-dependent params there are in this sample
npftparam = len(pft_inds)

# parse the comma-separated per-pft bounds once, in pft_inds order
pft_min_values = [np.fromstring(params['pft_mins'][k],dtype='float',sep=',') for k in pft_inds]
pft_max_values = [np.fromstring(params['pft_maxs'][k],dtype='float',sep=',') for k in pft_inds]

if npftparam>0:
    # dataframe index of the first pft param
    pftfirstind = pft_inds[0]

    # number of pfts, taken from the first pft param
    npft = len(pft_min_values[0])

# ---- sampling ---------------------------------------------------------------
if sampling_protocol == 'LHC':
    # define sample size (number of ensemble members)
    nsamp = 10

    if lhc_seed is not None:
        np.random.seed(lhc_seed)

    # Generate the latin hypercube sample
    # lhd is a 2D array indexed by ensemble member x parameter
    lhd = lhs(nparam, samples=int(nsamp))

    if npftparam>0:
        # pft-specific values, indexed by (pft param, pft, ensemble member)
        # can't store pft_array as a pandas dataframe because it's 3D
        pft_array = np.nan*np.ones([npftparam,npft,nsamp])

        for j,pftind in enumerate(pft_inds):
            pft_range = pft_max_values[j] - pft_min_values[j]
            # broadcast (npft,1) against the (nsamp,) column of lhd for this
            # parameter, scaling every pft's range by each member's draw
            pft_array[j] = pft_range[:,np.newaxis]*lhd[:,pftind] + pft_min_values[j][:,np.newaxis]

    # min/max arrays for params without pft-variation; pft params stay NaN
    min_array = np.nan*np.ones(nparam)
    max_array = np.nan*np.ones(nparam)

    for i in range(nparam):
        if is_pft[i]:
            # TO DO: what's a good placeholder, to denote need to reference pft_array?
            # for now, just print a message
            print('skipping '+params['name'].values[i]+'...this parameter varies with PFT')
        else:
            # assign min/max values
            min_array[i] = float(params['min'].values[i])
            max_array[i] = float(params['max'].values[i])

    # calculate parameter values; pft params come out as NaN columns
    param_array = (max_array - min_array)*lhd + min_array

else:  # 'OAAT'
    # number of samples is twice the number of parameters (min and max perturbations)
    nsamp = 2*nparam

    # set up parameter array
    # NaN is code for keep the default value
    param_array = np.nan*np.ones([nsamp,nparam])

    # get the min and max indices (even/odd rows)
    mins_index = (np.arange(0,nsamp,2),np.arange(0,nparam,1))
    maxs_index = (np.arange(1,nsamp,2),np.arange(0,nparam,1))

    if npftparam>0:
        # pft-specific values; third dimension holds the min (0) / max (1) values
        # can't store pft_array as a pandas dataframe because it's 3D
        pft_array = np.nan*np.ones([npftparam,npft,2])

        for j in range(npftparam):
            pft_array[j,:,0] = pft_min_values[j]
            pft_array[j,:,1] = pft_max_values[j]

    # assign values to the parameter array
    for i in range(nparam):
        if is_pft[i]:
            # TO DO: what's a good placeholder, to denote need to reference pft_array?
            # for now, just print a message
            print('skipping '+params['name'].values[i]+'...this parameter varies with PFT')
        else:
            # min perturbation on the even row, max perturbation on the odd row;
            # explicit float() conversion, as in the LHC branch
            param_array[mins_index[0][i], i] = float(params['min'].values[i])
            param_array[maxs_index[0][i], i] = float(params['max'].values[i])

# store psets in a pandas dataframe
psets = pd.DataFrame(data=param_array, index=None, columns=params['name'])
psets

skipping dleaf...this parameter varies with PFT
skipping medlynslope...this parameter varies with PFT
skipping kmax...this parameter varies with PFT


name,displar,dleaf,baseflow_scalar,maximum_leaf_wetted_fraction,fff,medlynslope,kmax
0,0.661122,,0.095538,0.318094,4.375023,,
1,0.708461,,0.085824,0.090537,4.99027,,
2,0.419637,,0.061623,0.027388,1.952172,,
3,0.847889,,0.03993,0.233262,0.256467,,
4,0.77151,,0.020472,0.132389,2.798645,,
5,0.491213,,0.009822,0.191629,0.930343,,
6,0.801693,,0.014856,0.479127,3.182888,,
7,0.91602,,0.071426,0.401369,3.789981,,
8,0.619673,,0.051099,0.292112,1.41702,,
9,0.554503,,0.041077,0.42562,2.352854,,


### Modify psets dataframe to include pft flag

In [28]:
# Replace the all-NaN columns of psets (pft-specific params) with a 'pft' flag.
#
# Series.isna() is used instead of np.isnan() so the check also works on
# columns that already hold the string flag, which makes the cell safe to re-run.
if sampling_protocol == 'LHC':
    for ind,name in enumerate(params['name']):
        # check for NaNs in the whole column (denotes PFT-specific param)
        if psets[name].isna().all():
            print('adding pft flag for '+name)
            # every ensemble member perturbs this param, so flag the whole column
            psets[name] = 'pft'

elif sampling_protocol == 'OAAT':
    for ind,name in enumerate(params['name']):
        # check for NaNs in the whole column (denotes PFT-specific param)
        if psets[name].isna().all():
            print('adding pft flag for '+name)
            # make the column object-typed first so it can hold strings next to NaN
            psets[name] = psets[name].astype(object)
            # flag only the min/max rows of this param; a single .loc assignment
            # writes to psets itself (no chained indexing, no pandas warning)
            flag_rows = [mins_index[0][ind], maxs_index[0][ind]]
            psets.loc[flag_rows, name] = 'pft'

psets

adding pft flag for dleaf
adding pft flag for medlynslope
adding pft flag for kmax


name,displar,dleaf,baseflow_scalar,maximum_leaf_wetted_fraction,fff,medlynslope,kmax
0,0.661122,pft,0.095538,0.318094,4.375023,pft,pft
1,0.708461,pft,0.085824,0.090537,4.99027,pft,pft
2,0.419637,pft,0.061623,0.027388,1.952172,pft,pft
3,0.847889,pft,0.03993,0.233262,0.256467,pft,pft
4,0.77151,pft,0.020472,0.132389,2.798645,pft,pft
5,0.491213,pft,0.009822,0.191629,0.930343,pft,pft
6,0.801693,pft,0.014856,0.479127,3.182888,pft,pft
7,0.91602,pft,0.071426,0.401369,3.789981,pft,pft
8,0.619673,pft,0.051099,0.292112,1.41702,pft,pft
9,0.554503,pft,0.041077,0.42562,2.352854,pft,pft


### Check out pft_array, the numpy array that stores pft-specific values

In [29]:
# Dimensions of the pft-specific value array:
#   OAAT -> (npftparam, npft, 2), where the last dim represents min/max perturbations
#   LHC  -> (npftparam, npft, nsamp)
np.shape(pft_array)

(3, 79, 10)

## Generate parameter files
* ### this will overwrite parameter files!!
* ### proceed with caution

In [18]:
# Write one parameter file per ensemble member, starting from the base file.
# Requires params, psets, pft_array, npft, nsamp, prefix and sampling_protocol
# from the cells above.

# assign the basepftfile
basepftfile = "../basecase/clm5_params.c200425.nc"

# names of the pft-dependent parameters, in the same order as the first axis
# of pft_array (both are built from the rows where params['min']=='pft')
pft_param_names = list(params['name'][params['min']=='pft'])

# loop over nsamp and modify the parameter values accordingly
for i in range(nsamp):
    # generate name for this param file
    pftfile = "../paramfiles/"+prefix+str(i+1).zfill(4)+".nc"
    print('working on '+pftfile)

    # open a fresh copy of the default file for every ensemble member; the
    # context manager closes it again once this member has been written
    with xr.open_dataset(basepftfile) as tmp:

        # loop over parameters
        for name,loc in zip(params['name'],params['location']):

            # select parameters located in the params file only
            if loc!='P':
                continue

            # NaN means "keep the default value" for this ensemble member
            value = psets[name][i]
            if pd.isna(value):
                continue

            print(name+' modified')
            var = tmp[name]

            # check to see if there is pft variation
            # NOTE: may want to use only first 16 indices for this ensemble (no crop), in which case indexing changes
            if isinstance(value,str) and value=='pft':

                # position of this parameter along the first axis of pft_array
                ipftparam = pft_param_names.index(name)
                print('npftparam='+str(ipftparam))

                if sampling_protocol == 'OAAT':
                    # last axis is (min, max): even members are the min
                    # perturbation, odd members the max perturbation
                    tmp[name][:] = pft_array[ipftparam,:,i%2]
                else:
                    # LHC: last axis is the ensemble member
                    tmp[name][:] = pft_array[ipftparam,:,i]

            else: # no pft variation, assign the same number across all PFTs (as applicable)

                # check for indexing by pft
                # NOTE: this logic might get tripped up by froz_q10 and q10_mr which are currently indexed by a placeholder dim "allpfts" (should be removed soon)
                if var.shape:

                    # check for indexing by segment or variants, which will be the first dimension
                    # skip the first index, don't want to overwrite non-vegetated values
                    if var.shape[0] != npft:
                        tmp[name][:,1:] = value
                    else: # indexed by pft only
                        tmp[name][1:] = value

                else: # single value, no indexing by pft
                    tmp[name] = value

        # write changes (if any) to file
        tmp.to_netcdf(pftfile,'w')

working on ../paramfiles/OAAT0001.nc
displar modified
working on ../paramfiles/OAAT0002.nc
displar modified
working on ../paramfiles/OAAT0003.nc
dleaf modified
npftparam=0
working on ../paramfiles/OAAT0004.nc
dleaf modified
npftparam=0
working on ../paramfiles/OAAT0005.nc
working on ../paramfiles/OAAT0006.nc
working on ../paramfiles/OAAT0007.nc
working on ../paramfiles/OAAT0008.nc
working on ../paramfiles/OAAT0009.nc
fff modified
working on ../paramfiles/OAAT0010.nc
fff modified
working on ../paramfiles/OAAT0011.nc
medlynslope modified
npftparam=1
working on ../paramfiles/OAAT0012.nc
medlynslope modified
npftparam=1
working on ../paramfiles/OAAT0013.nc
kmax modified
npftparam=2
working on ../paramfiles/OAAT0014.nc
kmax modified
npftparam=2


## Generate namelist files

Bash script will generate the namelist mod for pointing to the right params file

In [19]:
# Step 1: (re)create one namelist mod file per ensemble member, containing
# only the header line
header = "! user_nl_clm namelist options written by generate_params:\n"
for member in range(nsamp):
    nlfile = "../namelist_mods/"+prefix+str(member+1).zfill(4)+".txt"
    with open(nlfile,"w") as handle:
        handle.write(header)

# Step 2: append a "name=value" line for every namelist parameter that is
# perturbed in a given ensemble member
# (no pft-variation to worry about here because namelist params won't have that)
namelist_names = [pname for pname,ploc in zip(params['name'],params['location']) if ploc=='N']
for name in namelist_names:
    for member in range(nsamp):
        value = psets[name][member]

        # NaN means this member keeps the default value, so nothing is written
        if np.isnan(value):
            continue

        nlfile = "../namelist_mods/"+prefix+str(member+1).zfill(4)+".txt"
        print('working on '+nlfile)
        with open(nlfile,"a") as handle: # "a" appends below the header
            print(name+' modified')
            handle.write("%s=%s\n" % (name, value)) #round??

working on ../namelist_mods/OAAT0005.txt
baseflow_scalar modified
working on ../namelist_mods/OAAT0006.txt
baseflow_scalar modified
working on ../namelist_mods/OAAT0007.txt
maximum_leaf_wetted_fraction modified
working on ../namelist_mods/OAAT0008.txt
maximum_leaf_wetted_fraction modified


## Save off the parameter sets

In [25]:
# label for this particular ensemble
ensemble_name = "test0001"

# 1) the psets dataframe goes to a csv named after the ensemble type (prefix)
#    and the ensemble label
psetsfile = "../parameter_sets/%s_%s.csv" % (prefix, ensemble_name)
psets.to_csv(psetsfile)

# 2) the pft array (if applicable) is saved as a numpy array, which is the
#    easiest solution for a 3D array for now; np.save appends ".npy"
pftarrayfile = "../parameter_sets/%s_%s_pftvals" % (prefix, ensemble_name)
np.save(pftarrayfile, pft_array)
# example of how to load it back in
# test = np.load(pftarrayfile+".npy")

../parameter_sets/OAAT_test0001.csv
../parameter_sets/OAAT_test0001_pftvals
