In [None]:
import flopy
import pyemu
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import herebedragons as hbd
import shutil

# set the seed of NumPy's global random number generator so that code relying
# on it gives the same results each time the notebook is run from the top
np.random.seed(123456)

# Getting started

These notebooks will be less verbose than other GMDSI tutorial notebooks. We assume the reader is already familiar with many of the topics. We focus specifically on aspects that relate to:
- `PstFrom` with a `mf6` `disv` grid
- use of hyper parameters
- passing in prior knowledge to hyperparameters
- (subsequent notebooks) use of DSI

As usual, let's make a copy of our model folder for safety...

In [None]:
# org_d: folder with the original model files (never modified)
# tmp_d: scratch copy of those files that the notebook works on
org_d, tmp_d = 'model', 'tmp'

# throw away any copy left over from a previous run, then copy afresh
if os.path.exists(tmp_d):
    shutil.rmtree(tmp_d)
shutil.copytree(src=org_d, dst=tmp_d)

# get executables (helper from herebedragons.py), targeting the scratch copy
hbd.get_bins(tmp_d)

In [None]:
# load the MODFLOW 6 simulation from the scratch copy of the model folder
sim = flopy.mf6.MFSimulation.load(sim_ws=tmp_d)
# get the model from the simulation; no name is passed, so this relies on
# flopy's default choice — presumably the (only) groundwater flow model
gwf = sim.get_model()

# optional sanity check, disabled here: uncomment to run the model once
# and make sure it works
#pyemu.os_utils.run("mf6",cwd=tmp_d)

This model uses a `DISV` grid. `PstFrom` is going to require the `flopy` model grid object to set up pilot points and spatially varying covariance:

In [None]:
# the flopy model grid object; passed to PstFrom below as its spatial reference
sr = gwf.modelgrid
# display the grid object
sr

Apart from that, everything else is the same...magic...

In [None]:
# the PstFrom working folder, i.e. the PEST template directory
template_ws = "pst_template"

# simulation start date/time as stored in the TDIS package
start_datetime = sim.tdis.start_date_time.get_data()

# collect the PstFrom settings in one place, then instantiate
pstfrom_settings = dict(
    original_d=tmp_d,               # where the model is stored
    new_d=template_ws,              # the PEST template folder
    remove_existing=True,           # ensures a clean start
    longnames=True,                 # set False if using PEST/PEST_HP
    spatial_reference=sr,           # the flopy model grid obtained earlier
    zero_based=False,               # does the MODEL use zero based indices? MODFLOW does NOT
    start_datetime=start_datetime,  # required when specifying temporal correlation between parameters
    echo=False,                     # set True to see PstFrom's verbose output; useful for troubleshooting
)
pf = pyemu.utils.PstFrom(**pstfrom_settings)

We are going to keep things super simple for this tutorial. We are only going to parameterize hydraulic conductivity. As we are all sophisticated, we understand that in a real-world application other parameters and boundary conditions would likely also be important aspects to consider for data assimilation and uncertainty analysis...

In [None]:
# file that contains K values
f = "gwf.npf_k.txt"

# full path to the copy of that file inside the PstFrom template folder
fpath = os.path.join(template_ws,f)

# read the array as written by flopy and check its shape
k = np.loadtxt(fpath)
k.shape

Unfortunately `flopy` doesn't write tidy model input files...so we need to fix them...

In [None]:
# import helper function
# (later cells in this notebook call tidy_array() by this bare name)
from herebedragons import tidy_array
# rewrite the K array file in place — presumably into a regular layout; see herebedragons.py
tidy_array(fpath)
# re-read the tidied file and check its shape again
k = np.loadtxt(fpath)
k.shape

In [None]:
# idomain as held by the discretization package of the loaded model
ib = gwf.dis.idomain.get_data()
# the notebook relies on no idomain having been specified; stop here if one exists
assert ib is None

No one specified the `idomain` in the original model setup, so let's just create a "zone array". `PstFrom` expects this when we set up pilot points and so on later. Note that the shape of `ib` is the same as the shape of `k`:

In [None]:
# zone array: a single zone (value 1) with sr.ncpl entries
ib = np.ones(sr.ncpl, dtype=int)
# the zone array must line up with the K array read from file
assert ib.shape == k.shape
ib.shape

Now, we have a set of pilot point locations (and values) already prepared from our conceptual modelling:

In [None]:
# read the whitespace-delimited file of conceptual pilot points into a DataFrame
ppdf = pd.read_csv(os.path.join("data","conceptual_kh.pts"), sep=r'\s+')
# preview the first rows
ppdf.head()

However, `PstFrom` and `pypestutils` expect a strict format in terms of column names and information. So first we need to make that. A pilot point file must have the following columns: `['name','zone','x','y','parval1']`

In [None]:
# make every column numeric, then adopt the column names PstFrom expects
ppdf = ppdf.astype(float).rename(columns={'point':'name','easting':"x","northing":"y"})
# pilot point names: 'pp' followed by the integer point id
ppdf['name'] = ppdf['name'].apply(lambda point_id: f'pp{int(point_id)}')
# all pilot points belong to a single zone
ppdf['zone'] = 1
ppdf.head()

We have them all, except for `parval1`. We will be specifying that case by case below. Let's start with the "mean" value of K pilot points:

In [None]:
# set parval1 to the conceptual mean value of K at each pilot point
ppdf['parval1'] = ppdf['mean']

# take a look at the columns that make up the final pilot point file
ppdf[['name','zone','x','y','parval1']].head()

We have spatially varying mean values of K in the conceptual pilot points...

In [None]:
# distinct pilot point values currently held in parval1
ppdf.parval1.unique()

But the value of K is uniform in the model..

In [None]:
# distinct K values in the model's NPF package
np.unique(gwf.npf.k.get_data())

Lets fix that:

In [None]:
# Express the conceptual mean K at each pilot point as a multiplier on the K
# value in the model. The first unique value is used as the model K.
# The multiplier is derived from the 'mean' column (which parval1 was set to
# above) rather than from parval1 itself, so re-running this cell does not
# divide by K a second time.
k_model = np.unique(gwf.npf.k.get_data())[0]
ppdf['parval1'] = ppdf['mean'] / k_model
ppdf[['name','zone','x','y','parval1']].head()

In [None]:
# write the K pilot points (only the columns PstFrom needs) into the template folder
ppfname = "ppoints.k.csv"
pp_path = os.path.join(template_ws,ppfname)
pp_columns = ['name','zone','x','y','parval1']
ppdf[pp_columns].to_csv(pp_path, index=False)

Now, set up a geostatistical structure to pass to `PstFrom`. This is the geostatistics for the "hyper parameter". Inception much...

Let's just use the median of the variogram properties (range, anisotropy and bearing) from the conceptual pilot points:

In [None]:
# summary statistics of the pilot point table; the '50%' row (median) is used below
ppdf.describe()

In [None]:
# medians (the '50%' row of describe()) of the conceptual variogram properties
pp_medians = ppdf.describe().loc['50%']
a = pp_medians['a']              # range of correlation; length units of the model. In our case 'meters'
anisotropy = pp_medians['hanis'] # anisotropy ratio
bearing = pp_medians['bearing']  # angle in degrees East of North corresponding to anisotropy ellipse
a,anisotropy,bearing

In [None]:
# display the median anisotropy on its own
anisotropy

In [None]:
# exponential variogram for spatially varying parameters, built from the
# median properties computed above:
#   contribution - the sill
#   a            - range of correlation; length units of the model ('meters' here)
#   anisotropy   - anisotropy ratio
#   bearing      - angle in degrees East of North of the anisotropy ellipse
v_pp = pyemu.geostats.ExpVario(contribution=1.0, a=a, anisotropy=anisotropy, bearing=bearing)

# geostatistical structure (log transform) wrapping that variogram
pp_gs = pyemu.geostats.GeoStruct(variograms=v_pp, transform='log')

Now we can set up pilot point multiplier parameters. 

Key aspects here are:

```
    initial_value=ppdf.parval1.values,
```

Where we have passed in the initial values for the K multipliers at each pilot point. And,

```
    pp_options={"prep_hyperpars":True, "pp_space":ppfname}
```

Where we have specified that `PstFrom` should set up hyper parameters, and use the `ppfname` file for pilot point locations.

In [None]:
# conceptual prior at each pilot point
m = ppdf['mean'].values       # means (linear)
v10 = ppdf['var'].values      # variance of log10

# standard deviation converted from log10 to natural-log units
s10 = np.sqrt(v10)
ln10 = np.log(10.0)
sigma = s10 * ln10

# standard normal quantiles for the 5th and 95th percentiles
z95 = 1.6448536269514722
z05 = -z95

# the -sigma^2/2 term moves from the linear mean to the natural-log location;
# the bounds are then the 5th/95th percentiles under a lognormal assumption
ln_shift = -0.5 * sigma**2
lb = m * np.exp(ln_shift + z05 * sigma)
ub = m * np.exp(ln_shift + z95 * sigma)

# convert the absolute bounds to multipliers on the K value in the model
k_model = np.unique(gwf.npf.k.get_data())[0]
lb = lb / k_model
ub = ub / k_model
lb,ub

In [None]:
# display the conceptual mean K values (linear space) at the pilot points
m

In [None]:
# display the pilot point values that are passed as initial values below
ppdf.parval1

In [None]:
# pilot points for the "mean" hyperparameter
# name base / group derived from the file name: second dot-separated token,
# underscores removed, with a 'pp' suffix
k_pp_tag = f.split('.')[1].replace("_","") + "pp"
k_pp_options = {"prep_hyperpars":True,  # have PstFrom prepare the hyper parameter files
                "pp_space":ppfname}     # pilot point locations come from this file
df_pp = pf.add_parameters(
    f,
    zone_array=ib,
    par_type="pilotpoints",
    geostruct=pp_gs,
    par_name_base=k_pp_tag,
    pargp=k_pp_tag,
    lower_bound=lb,
    upper_bound=ub,
    initial_value=ppdf.parval1.values,  # K multipliers at each pilot point
    pp_options=k_pp_options,
)

To make our lives easier and help with checking & postprocessing, lets add observations of K in every model cell. This will allow us to see the outcomes of the interpolation of the pilot point values to the grid...

In [None]:
# track the K array file as observations (prefix and group 'k') so the
# interpolated field can be inspected after a run
df_obs = pf.add_observations(f, prefix="k", obsgp="k")

In [None]:
# map view of the model grid with the pilot points coloured by their initial value
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))
mv = flopy.plot.PlotMapView(model=gwf, ax=ax)
mv.plot_grid(lw=0.5, alpha=0.5)
ax.set_aspect("equal")

# pilot points drawn on top of the grid lines
ax.scatter(df_pp.x, df_pp.y, s=10, c=df_pp.parval1, marker='o', zorder=3)
fig.tight_layout();

Let's check out what `pp_options={"prep_hyperpars":True}` did. It created a bunch of hyper parameter pilot point files for bearing, corrlength and anisotropy.

In [None]:
# name tag shared by the files created for the K pilot points
tag = "npfkpp"

# every file in the template folder whose name contains that tag
hyperpar_files = [fname for fname in os.listdir(pf.new_d) if tag in fname]
hyperpar_files

OK, so let's go through each of the hyper parameters and parameterize them using pilot points...same as before...

Let's start with anisotropy. We are going to make the anisotropy and bearing hyperparameters "additive" type parameters (correlation length, further below, is set up as a multiplier instead).

In [None]:
# we don't want to log transform anisotropy, so we set up a geostatistical
# structure with transform 'none'; it reuses the same variogram (v_pp) as the
# K pilot points
pp_gs_none = pyemu.geostats.GeoStruct(variograms=v_pp, transform='none') 

In [None]:
# pilot point file for anisotropy: parval1 now carries the conceptual 'hanis' values
# (note: this overwrites the parval1 column and the ppfname variable used for K)
ppfname = "ppoints.aniso.csv"
ppdf['parval1'] = ppdf['hanis']
aniso_pp_path = os.path.join(template_ws,ppfname)
ppdf[['name','zone','x','y','parval1']].to_csv(aniso_pp_path, index=False)
ppdf

In [None]:

# anisotropy array file (one of the hyper parameter files listed above); tidy it first
afile = tag+'.aniso.dat'
tidy_array(os.path.join(template_ws,afile))

# name tag for parameters and observations, derived from the file name
atag = "{0}-aniso".format(afile.split('.')[0].replace("_","-"))

# additive parameters start at the conceptual value minus the median anisotropy
aniso_initial = ppdf['parval1'].values - anisotropy
aniso_pp_options = {"prep_hyperpars":False,
                    "try_use_ppu":True,
                    "pp_space":ppfname}

_df = pf.add_parameters(
    afile,
    par_type="pilotpoints",
    zone_array=ib.flatten(),
    geostruct=pp_gs_none,
    par_name_base=atag,
    pargp=atag,
    lower_bound=-2.5,
    upper_bound=2.5,
    ult_ubound=8.,
    ult_lbound=0.,
    apply_order=1,
    par_style="a",
    transform="none",
    initial_value=aniso_initial,
    pp_options=aniso_pp_options,
)

# also track the resulting anisotropy array as observations
_ = pf.add_observations(afile, prefix=atag, obsgp=atag)

Now bearing...

In [None]:
# pilot point file for bearing: parval1 now carries the conceptual 'bearing' values
ppfname = "ppoints.bearing.csv"
ppdf['parval1'] = ppdf['bearing']
bearing_pp_path = os.path.join(template_ws,ppfname)
ppdf[['name','zone','x','y','parval1']].to_csv(bearing_pp_path, index=False)
ppdf

In [None]:
# bearing array file (one of the hyper parameter files listed above); tidy it first
afile = tag+'.bearing.dat'
tidy_array(os.path.join(template_ws,afile))

# name tag for parameters and observations, derived from the file name
atag = "{0}-bearing".format(afile.split('.')[0].replace("_","-"))

# additive parameters start at the conceptual value minus the median bearing
bearing_initial = ppdf['parval1'].values - bearing
bearing_pp_options = {"prep_hyperpars":False,
                      "try_use_ppu":True,
                      "pp_space":ppfname}

_df = pf.add_parameters(
    afile,
    par_type="pilotpoints",
    zone_array=ib.flatten(),
    geostruct=pp_gs_none,
    par_name_base=atag,
    pargp=atag,
    lower_bound=-45,
    upper_bound=45,
    apply_order=1,
    par_style="a",
    transform="none",
    initial_value=bearing_initial,
    pp_options=bearing_pp_options,
)

# also track the resulting bearing array as observations
_ = pf.add_observations(afile, prefix=atag, obsgp=atag)

Now correlation length...

In [None]:
# pilot point file for correlation length: parval1 now carries the conceptual 'a' values
ppfname = "ppoints.corrlen.csv"
ppdf['parval1'] = ppdf['a']
corrlen_pp_path = os.path.join(template_ws,ppfname)
ppdf[['name','zone','x','y','parval1']].to_csv(corrlen_pp_path, index=False)
ppdf

In [None]:



# correlation length array file (one of the hyper parameter files listed above); tidy it first
afile = tag+'.corrlen.dat'
tidy_array(os.path.join(template_ws,afile))

# name tag for parameters and observations, derived from the file name
atag = "{0}-corrlen".format(afile.split('.')[0].replace("_","-"))

# multiplier parameters start at the conceptual value relative to the median range
corrlen_initial = ppdf['parval1'].values / a
corrlen_pp_options = {"prep_hyperpars":False,
                      "try_use_ppu":True,
                      "pp_space":ppfname}

_df = pf.add_parameters(
    afile,
    par_type="pilotpoints",
    zone_array=ib.flatten(),
    geostruct=pp_gs_none,
    par_name_base=atag,
    pargp=atag,
    lower_bound=0.5,
    upper_bound=2.0,
    ult_lbound=200,
    ult_ubound=1200,
    apply_order=1,
    par_style="m",
    transform="none",
    initial_value=corrlen_initial,
    pp_options=corrlen_pp_options,
)

# also track the resulting correlation length array as observations
_ = pf.add_observations(afile, prefix=atag, obsgp=atag)

### Add postprocess functions
To make sure that interpolated model input files are nice and tidy for `PEST` to read them as observations, lets add the `tidy_array()` function to the forward run:

In [None]:
# extra import for the generated forward run script
# NOTE: this appends every time the cell is run, so run it only once per PstFrom instance
pf.extra_py_imports.append("numpy as np")
# register the helper from herebedragons.py with the given call string;
# is_pre_cmd=False — presumably the call is placed after the model command(s)
# rather than before (confirm against pyemu docs)
pf.add_py_function("herebedragons.py","tidy_array('gwf.npf_k.txt')",is_pre_cmd=False)

# Check the prior K fields

An extremely useful check can be done now. Note that we have not yet added a model run to the `PstFrom`. All that is in the forward run workflow up to now is the interpolation from pilot points to the model grid. In other words, the interpolation is our "forward run" at the moment. 

Build the pest control file and forward run .py file to see:

In [None]:
# build the PEST control file (and the forward run script) from everything added so far
pst = pf.build_pst()

In [None]:
# Print the generated forward run script line by line. The context manager
# makes sure the file handle is closed again, and a plain loop replaces the
# list comprehension that was used only for its side effects.
with open(os.path.join(template_ws,"forward_run.py")) as fwd_script:
    for line in fwd_script:
        print(line.rstrip())

This is super powerful. We can generate a prior ensemble of pilot point values, run that ensemble and collate all the generated model input parameter fields. We can then check them and make sure they make sense and look pretty :) 

All we need to do is generate the ensemble and run it once. Because we are not running modflow, this will be super fast!

Let's just make sure everything is working first:

In [None]:
# write the control file into the template folder (version=2 control file format)
pst.write(os.path.join(template_ws, 'pest.pst'),version=2)

In [None]:
# run pestpp-ies on the control file, from inside the template folder
pyemu.os_utils.run('pestpp-ies pest.pst', cwd=template_ws)

In [None]:
# re-load the control file from disk.
# NOTE: from here on `pst` is a new Pst object, separate from `pf.pst`
pst = pyemu.Pst(os.path.join(template_ws, 'pest.pst'))
# objective function value — presumably computed from the run results found
# next to the control file (confirm against pyemu docs)
pst.phi

Cool, so this is our base model run:

In [None]:
# re-load the simulation, this time from the template folder and only the NPF
# package. NOTE: this rebinds `sim` and `gwf`, which until now referred to the
# model in the tmp folder
sim = flopy.mf6.MFSimulation.load(sim_ws=template_ws,load_only=['npf'],verbosity_level=0)
gwf = sim.get_model()
# plot the K array as it now stands in the template folder
gwf.npf.k.plot(colorbar=True)

Generate the prior parameter ensemble:

In [None]:
# build the prior covariance matrix and store it as a compressed binary file (otherwise it can get huge!)
# depending on your machine, this may take a while...
if pf.pst.npar < 35000:  # if you have more than about 35K pars, the cov matrix becomes hard to handle
    cov = pf.build_prior(fmt='coo', filename=os.path.join(template_ws,"prior_cov.jcb"))
    # Take a peek at the matrix. The plot is best-effort only, so a failure
    # must not stop the notebook - but it is reported rather than silently
    # swallowed (the old bare `except: pass` hid every error).
    try:
        x = cov.x.copy()
        # blank out the zero entries; `np.nan` is used because the `np.NaN`
        # alias was removed in NumPy 2.0
        x[x==0] = np.nan
        plt.imshow(x)
    except Exception as err:
        print(f"could not plot the prior covariance matrix: {err!r}")
    # NOTE(review): this option is set on `pf.pst`; the `pst` variable re-loaded
    # from disk earlier is a different object - confirm which one should carry it
    pf.pst.pestpp_options["parcov"] = "prior_cov.jcb"

In [None]:
# draw parameters from the prior distribution and enforce parameter bounds
pe = pf.draw(num_reals=1000, use_specsim=False)
pe.enforce()

# write the parameter ensemble to a binary file in the template folder
pe_path = os.path.join(template_ws,"prior_pe.jcb")
pe.to_binary(pe_path)

# point pestpp-ies at that ensemble, limit the number of realizations it
# uses, and re-write the control file
pst.pestpp_options["ies_par_en"] = "prior_pe.jcb"
pst.pestpp_options["ies_num_reals"] = 50
pst.write(os.path.join(template_ws,"pest.pst"),version=2)

# sanity check: one ensemble column per adjustable parameter
print(pe.shape,pf.pst.npar,pf.pst.npar_adj)
assert pe.shape[1] == pf.pst.npar_adj

Re-write the control file:

In [None]:
# run-management and output options for pestpp-ies
run_options = {
    "overdue_giveup_fac": 10,
    "overdue_giveup_minutes": 100,
    "save_binary": True,
}
for option_name, option_value in run_options.items():
    pst.pestpp_options[option_name] = option_value

# noptmax = -1: used here to run the prior ensemble (the prior Monte Carlo below)
pst.control_data.noptmax = -1

# re-write the control file with the new settings
pst.write(os.path.join(template_ws, 'pest.pst'),version=2)

And run `pestpp-ies`!

### Warning: set number of workers to equal or less than the amount of cores you have available

In [None]:
# number of parallel workers: at most 10, and never more than the number of
# cores on this machine (os.cpu_count() can return None, hence the fallback to 1)
num_workers = min(10, os.cpu_count() or 1)

In [None]:

# manager (master) directory and the template folder handed to each worker
m_d = "master_prior_cond"
t_d = template_ws

# run pestpp-ies in parallel
pyemu.os_utils.start_workers(
    t_d,                      # the folder which contains the "template" PEST dataset
    'pestpp-ies',             # the PEST software version we want to run
    'pest.pst',               # the control file to use with PEST
    num_workers=num_workers,  # how many agents to deploy
    worker_root='.',          # where to deploy the agent directories; relative to where python is running
    master_dir=m_d,           # the manager directory
)

Read in the results of the `pestpp-ies` prior Monte Carlo. We can use some of the `Pst` inbuilt helpers for this. Start by reading the .pst file from the master dir:

In [None]:
# load the control file from the manager directory, where the run results were written
pst = pyemu.Pst(os.path.join(m_d, 'pest.pst'))

Now, if parameter or observation ensemble files are available in the folder, `pyemu` will try and load those:

In [None]:
# parameter ensemble as loaded through the Pst `ies` helper; work on a copy
pe = pst.ies.paren.copy()
pe.head()

In [None]:
# observation table of the control file
obs = pst.observation_data

# names of the observations that track the K array (oname 'k')
is_k_obs = obs.oname == "k"
obsnmes = obs.loc[is_k_obs, "obsnme"].tolist()

# every distinct observation type in the control file
onames = obs.oname.unique()


The same for the observation ensemble:

In [None]:
# observation ensemble as loaded through the Pst `ies` helper; work on a copy
oe = pst.ies.obsen.copy()
# preview the K observations only
oe.loc[:,obsnmes].head()

Lets plot a couple of those parameter fields...good thing we tracked all the arrays as observations!

In [None]:
# Plot the last five realizations of the observation ensemble, one figure per
# realization with one panel per tracked array type (oname).
# Changes from the earlier version of this cell: the dead `i = 'base'`
# assignment is gone, the number of panels follows `onames` instead of being
# hard-coded to 4, the notebook-level `obsnmes` list is no longer overwritten
# inside the loop, and each figure is closed explicitly.
n_panels = len(onames)
for real_name in oe.index.values[-5:]:
    # squeeze=False keeps `axs` two-dimensional even for a single panel
    fig, axs = plt.subplots(1, n_panels, figsize=(4 * n_panels, 4), squeeze=False)

    for ax, oname in zip(axs.ravel(), onames):
        ax.set_aspect("equal")
        pm = flopy.plot.PlotMapView(model=gwf, ax=ax)

        # order this group's observations by their integer index "i" so the
        # values line up with the order expected by plot_array
        group_obs = obs.loc[obs.oname == oname].copy()
        group_obs["i"] = group_obs["i"].astype(int)
        group_obs = group_obs.sort_values("i")
        arr = oe.loc[real_name, group_obs.obsnme.tolist()].values
        if oname == 'k':
            # K is shown in log10 space
            arr = np.log10(arr)

        pa = pm.plot_array(arr)
        plt.colorbar(pa, ax=ax, shrink=0.5)

        ax.set_title(oname)
        ax.set_xticks([])
        ax.set_yticks([])

    fig.tight_layout()
    plt.show()
    plt.close(fig)

This provides a practical way of checking that all the plumbing works...that there aren't silly mistakes with parameter values and bounds etc...and that you are happy with how the prior knowledge is being expressed through parameterisation. It can be quite helpful to show results at this stage to stakeholders for example, to ensure that everyone agrees on the "reasonableness" of parameter values and distributions.

# Finishing up the PEST setup

Once we are happy with that, we can go through the process of adding in the rest of the PEST setup. Such as observations, other parameters...and importantly the mf6 model run.

To make our lives easier later on when we use DSI, we are simply going to track the model simulated heads and temperatures in all cells and all stress periods. We have prepared a utility function that processes `mf6` output files and writes observations to clean .txt files (see `herebedragons.py` for details; it's pretty simple). 

Lets just call it and run here:

In [None]:
# run the post-processing helper once in the template folder, so that the
# output files referenced by the add_observations() calls below exist
# NOTE(review): presumably relies on model output files already being present
# in the template folder — confirm
hbd.post_model_outputs(template_ws=template_ws)

And add it to the `PstFrom`:

In [None]:
# register the same helper with the forward run script; is_pre_cmd=False —
# presumably it is called after the model command(s) (confirm against pyemu docs)
pf.add_py_function("herebedragons.py","post_model_outputs()",is_pre_cmd=False)

Now lets add observations from each of those files:

In [None]:
# observations from the 'riv.0.txt' output file; prefix and group are both 'riv'
afile = "riv.0.txt"
prefix ='riv'
_ = pf.add_observations(afile, prefix=prefix, obsgp=prefix)
# display what add_observations returned
_

In [None]:
# observations from the 'heads.0.txt' output file, placed in group 'heads0_history'
afile = "heads.0.txt"
prefix ='heads0'
_ = pf.add_observations(afile, prefix=prefix, obsgp=prefix+'_history')

In [None]:
# observations from the 'heads.1.txt' output file, placed in group 'heads1_future'
afile = "heads.1.txt"
prefix ='heads1'
_ = pf.add_observations(afile, prefix=prefix, obsgp=prefix+"_future")

In [None]:
# observations from the 'temp.max.txt' output file; prefix and group are both 'temp_max'
afile = "temp.max.txt"
prefix ='temp_max'
_ = pf.add_observations(afile, prefix=prefix, obsgp=prefix)

Eazy as...

And last but not least, update the forward run script to call `mf6`:

In [None]:
# Add the mf6 call to the forward run script. The membership check makes the
# cell safe to re-run: a plain append would add a second mf6 call each time.
if "mf6" not in pf.mod_sys_cmds:
    pf.mod_sys_cmds.append("mf6")
pf.mod_sys_cmds

Boom! good to go - lets build the pest setup:

In [None]:
# re-build the control file, now including the new observations and the mf6 model run
pst = pf.build_pst()

...just because...

In [None]:
# set the weight of every observation to zero
obs = pst.observation_data
obs["weight"] = 0.0

Now we can draw the prior. Lets draw a large number of reals so we can play around with DSI later. 

And that is it...pest setup ready to run a prior monte carlo.

In [None]:
# draw parameters from the prior distribution and enforce parameter bounds
pe = pf.draw(num_reals=1000, use_specsim=False)
pe.enforce()

# write the parameter ensemble to a binary file in the template folder
pe_path = os.path.join(template_ws,"prior_pe.jcb")
pe.to_binary(pe_path)

# point pestpp-ies at that ensemble, limit the number of realizations it
# uses, and write the control file
pst.pestpp_options["ies_par_en"] = "prior_pe.jcb"
pst.pestpp_options["ies_num_reals"] = 100
pst.write(os.path.join(template_ws,"pest.pst"),version=2)

# sanity check: one ensemble column per adjustable parameter
print(pe.shape,pf.pst.npar,pf.pst.npar_adj)
assert pe.shape[1] == pf.pst.npar_adj