# Discrete Disjunctive Kriging

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import gstlearn as gl
import gstlearn.plot as gp

# Initialization: two boolean switches. They are not read in the cells shown
# here; presumably they gate the kriging / modeling sections later in the
# notebook (to be confirmed).
flagKriging  = True
flagModeling = True

# Fix NumPy's global seed so that the np.random-based sample selection done
# further down is reproducible from one run to the next.
np.random.seed(123)

## Simulation of a reference data set
We create a reference data set (lognormal distribution) based on a model that we define, using *simtub* (based on Turning Bands).

In [None]:
# Parameters of the transform applied to the simulated variable Y (see Z below)
m   = 1.
sig = 0.5

# Initialization of the grid: 100 x 100 nodes, mesh 0.01, origin (0, 0)
grd = gl.DbGrid.create(x0=(0.0,0.0), dx=(0.01,0.01), nx=(100,100))

# Construct a model with a single exponential structure.
# NOTE(review): the positional arguments 0.2, 0, 1. are presumably
# (range, third parameter, sill) -- confirm against the CovAniso signature.
model = gl.Model.createFromDb(grd)
model.addCov(gl.CovAniso(gl.ECov.EXPONENTIAL,0.2, 0, 1., model.getContext()))

# Simulation of the model on the grid (non conditional: no input Db is given)
gl.simtub(dbin = None, dbout = grd, model = model, nbsimu = 1)
# Rename the simulated column "Simu" into "Y"
grd.setName("Simu", "Y")
# Transform Y into Z = m * exp(sig * Y - sig^2 / 2); if Y is standard Gaussian
# this is a lognormal variable with mean m.
Z = m * np.exp(sig * grd["Y"].squeeze() - sig**2 / 2) # transform the variable
grd["Z"] = Z

# Side by side: map of Z on the grid (left) and its histogram (right)
fig, (ax1,ax2) = plt.subplots(1,2, figsize=(15,6))
grd.plot(name = "Z", ax=ax1)
grd.plot_hist('Z', xlab = "Raw variable", bins = 25, color="orange", ax=ax2)

In [None]:
# Extract the data set: keep Np grid nodes chosen at random
Np = 1000 # number of samples wanted in the data set
nech = grd.getSampleNumber()
# Boolean mask over all the grid nodes, True for the Np retained ones
sel = np.zeros(nech, dtype=bool)
sel[np.random.choice(nech, size = Np, replace = False)] = True # draw Np distinct indices (no replacement)
data = gl.Db(grd) # copy of grd into a Db (not gridded)
# The mask is stored as a selection named "sel" (converted to doubles first)
data.addSelection(np.double(sel), "sel")

# Overlay the data set (yellow symbols, size driven by Z) on the map of Z
ax = grd.plot("Z", figsize=(6,6))
ax = data.plot(size_name="Z", ax=ax, color='yellow')
ax.set_aspect("equal")

#### Elementary statistics

We define cutoff values corresponding to the quantiles 0\%, 30\%, 50\%, 70\% and 90\%.

In [None]:
# Cutoff values: quantiles of Z at 0%, 30%, 50%, 70% and 90%.
# NOTE(review): whether data['Z'] honours the "sel" selection or returns all
# the samples depends on the gstlearn item accessor -- confirm.
zcut = np.quantile(data['Z'], q = [0, 0.3, 0.5, 0.7, 0.9])

# Print a small centred table: heading (French for "Cutoffs on variable Z"),
# then the quantile orders, then the corresponding cutoff values.
print("\n{:^40}".format("Coupures sur la variable Z"))
print("{:^10}{:^10}{:^10}{:^10}{:^10}".format(0., 0.3, 0.5, 0.7, 0.9))
print("{:^10.3f}{:^10.3f}{:^10.3f}{:^10.3f}{:^10.3f}".format(*zcut),'\n')

mylimits = gl.Limits(zcut) #defines limits based on the cutoff values (4 intervals delimited by the 5 cutoff values)
mylimits.display()

## Discretization of the variable on cutoff values

From the cutoff values defined above, the variable *Z* is discretized on the four intervals delimited by the cutoff values. The indicators of the intervals are $\mathbb{1}(z_i \le Z < z_{i+1})$. The fifth indicator ($\mathbb{1}(Z \ge z_{5})$) is not computed because it can be deduced from the four other ones, as the five indicators sum to one.

In [None]:
# Locate "Z" variable: drop any existing Z locator, then set it on "Z" only
data.clearLocators(gl.ELoc.Z)
data.setLocator("Z", gl.ELoc.Z)

# Compute the indicators
mylimits.toIndicator(data, name='Z', OptionIndicator=1) # create 4 new indicator variables

# Compute the discretized version of the variable.
# The Z locator is reset first; presumably the previous call moved it onto the
# newly created indicator columns (to be confirmed).
data.clearLocators(gl.ELoc.Z)
data.setLocator("Z", gl.ELoc.Z)
mylimits.toIndicator(data, name='Z', OptionIndicator=0) # create the discretized variable

# Statistics on the indicators: mean of each "Indicator.Z.Class*" column,
# i.e. the proportion of each class.
# NOTE(review): the meaning of the three trailing booleans is not visible
# here -- check the Db.statistics signature.
w = data.statistics(["Indicator.Z.Class*"], ["mean"], True, True, False)
# Append the proportion of the last class as the complement to one
w = list(w) + [1 - np.sum(w)]
print("\nProportions:",w)

# Problem
function *Limits.toIndicator* does not work as expected

In [None]:
# Print the contents summary of the data base
print(data)

# One panel per indicator on a 2 x 2 figure: the pattern "I*<k>" is meant to
# match the column whose name starts with "I" and ends with the class rank k.
fig, axs = plt.subplots(2,2)
for i,ax in enumerate(axs.flatten()):
    data.plot(f"I*{i+1}", ax=ax)

In [None]:
# Remove every column whose name starts with "Indicator" (the ones produced by
# Limits.toIndicator) and rebuild them by hand with NumPy.
data.deleteColumn("Indicator*")

means = np.empty(4)               # mean per class
DZ = np.empty(len(Z))             # discretized variable
I5 = np.ones(len(Z), dtype=bool)  # fifth indicator, deduced from the four first ones
for i in range(4):
    # Product of two boolean arrays = logical AND: zcut[i] <= Z < zcut[i+1]
    I = (Z>=zcut[i])*(Z<zcut[i+1])         # indicator
    data[f"Indicator.Z.Class.{i+1}"] = I   # stored under the name used by toIndicator
    means[i] = np.mean(Z[I])               # mean of the class
    DZ[I] = means[i]                       # fill the discretized variable for this class
    I5[I] = False                          # I5 is 0 where other indicators are 1

# Whatever belongs to none of the four classes forms the fifth class
DZ[I5] = np.mean(Z[I5])        # fill the discretized variable for the fifth indicator
data["Indicator.Z.Mean"] = DZ  # each sample carries the mean of Z over its class

# plot indicators
fig, axs = plt.subplots(2,2)
for i,ax in enumerate(axs.flatten()):
    data.plot(f"I*{i+1}", ax=ax)
# plot discretized variable
data.plot("*Mean", title='Discretized variable')

# Statistics on the indicators: mean of each of the four indicator columns,
# completed by the proportion of the fifth class (complement to one)
w = data.statistics(["Indicator.Z.Class*"], ["mean"], True, True, False)
w = list(w) + [1 - np.sum(w)]
print("\nProportions:",w)

## Variography (omnidirectional)

#### Variogram of the raw variable *Z*

In [None]:
# Locate Z: make the raw variable the only one carrying the Z locator
data.clearLocators(gl.ELoc.Z)
data.setLocator("Z", gl.ELoc.Z)

# Variogram parameters: a single direction added to an otherwise default
# VarioParam (no direction angle is specified, hence omnidirectional as per
# the section title)
dirParam = gl.DirParam(2, 10, 0.05) #ndim, nlags, lag
varioParam = gl.VarioParam()
varioParam.addDir(dirParam)

# Experimental variogram of the data base; the returned code is kept in 'err'
# but not checked
var_Z = gl.Vario(varioParam, data)
err = var_Z.compute()

# Fit a model made of a nugget effect and two exponential structures
mod_Z = gl.Model()
opt=gl.Option_AutoFit()
opt.setWmode(2) # weighted proportional to the number of pairs and inverse proportional to the distance
mod_Z.fit(var_Z, [gl.ECov.NUGGET, gl.ECov.EXPONENTIAL, gl.ECov.EXPONENTIAL], mauto = opt)

# Plot the experimental variogram together with the fitted model
ax = gp.varmod(var_Z, mod_Z, title = "Z (initial)", flagLegend=True)

#### Variogram of the discretized variable

In [None]:
# Make the discretized variable (class means) the only one carrying the Z
# locator, so that the variogram below is computed on it.
data.clearLocators(gl.ELoc.Z)
data.setLocator("Indicator.Z.Mean", gl.ELoc.Z)

# Experimental variogram, same calculation parameters as for the raw variable;
# the returned code is kept in 'err' but not checked
var_Z = gl.Vario(varioParam, data)
err = var_Z.compute()

# Fit a nugget effect plus two exponential structures, same fitting options
mod_Z = gl.Model()
mod_Z.fit(var_Z, [gl.ECov.NUGGET, gl.ECov.EXPONENTIAL, gl.ECov.EXPONENTIAL], mauto = opt)

# The title names the variable actually displayed: it used to read
# "Z (initial)", a leftover from the raw-variable cell.
ax = gp.varmod(var_Z, mod_Z, title = "Z (discretized)", flagLegend=True)

#### Variograms of the Indicator variables

In [None]:
# Give the Z locator to all the indicator columns at once (multivariate case)
data.clearLocators(gl.ELoc.Z)
data.setLocator("Indicator.Z.Class*", gl.ELoc.Z)

# Experimental simple and cross variograms of the indicators; the returned
# code is kept in 'err' but not checked
var_Z = gl.Vario(varioParam, data)
err = var_Z.compute()

# Fit a nugget effect plus two exponential structures, same fitting options
mod_Z = gl.Model()
mod_Z.fit(var_Z, [gl.ECov.NUGGET, gl.ECov.EXPONENTIAL, gl.ECov.EXPONENTIAL], mauto = opt)

ax = gp.varmod(var_Z, mod_Z, flagLegend=False, figsize=(10,10))
# The figure title names what is displayed: it used to read "Z (initial)",
# a leftover from the raw-variable cell.
plt.suptitle("Indicators of Z", fontsize=30)

The indicators are spatially correlated. A method for disjunctive kriging would consist in cokriging all indicators. Instead, we will decompose them into factors that are not correlated spatially, so that they can all be estimated separately by kriging.

## MAF : Min/Max Autocorrelation Factors

Indicators are decomposed on factors called MAF (Min/Max Autocorrelation Factors). MAFs are not correlated spatially, and the first MAFs represent the spatial structures with the most continuity.

Definition of interval function *model.maf.index*:

- the first MAF cannot be visualized in order to check that it is monotonic.
- the pca object does have a pca.plot function, but no specific function to represent the factors as functions? (to be defined)

In [None]:
# Compute the MAFs as a function of the class index
def model_maf_index(maf, flag_interval = True):
    """Return the value of every factor on each of the N+1 index classes.

    Parameters
    ----------
    maf : object
        Any object exposing ``mean`` (sequence of the N indicator means) and
        ``pcaz2f`` (N x N matrix turning normalized indicators into factors).
    flag_interval : bool, default True
        True when the indicators are interval indicators (one class each);
        False when they are cumulative cutoff indicators.

    Returns
    -------
    (N+1) x N array (or matrix, if ``maf.pcaz2f`` is a ``numpy.matrix``):
    row k holds the factor values for index class k.
    """
    means = np.array(list(maf.mean), dtype=float)

    if flag_interval:
        # Probabilities of the N+1 classes; the last one is the complement
        fw = np.append(means, 1.0 - means.sum())
        # Row k = values taken by the interval indicators on class k
        i_norm_val = np.eye(fw.size)
        first = 0
    else:
        # Means of the N+1 cutoff indicators; the first one is always 1
        fw = np.concatenate(([1.0], means))
        # Lower triangular matrix of ones: row k = cumulative indicators
        i_norm_val = np.tril(np.ones((fw.size, fw.size)))
        # Column 0 has mean 1 (zero variance) and is left unnormalized
        first = 1

    # Centre and scale each indicator column by its mean and standard deviation
    w = fw[first:]
    i_norm_val[:, first:] = (i_norm_val[:, first:] - w) / np.sqrt(w * (1.0 - w))

    # Turn the normalized indicators into factor values. This must be a matrix
    # product: an element-wise '*' cannot broadcast (N+1, N) against (N, N).
    # NOTE(review): in the cutoff case the selection below keeps the constant
    # column 0 and drops the last cutoff; columns 1..N may have been intended.
    # Behaviour is left unchanged pending confirmation.
    maf_index = i_norm_val[:, :fw.size - 1] @ maf.pcaz2f
    return maf_index