<a href="https://colab.research.google.com/github/ContiPaolo/Multifidelity-Tutorial/blob/main/MF_POD_Burger's1D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Creation of the Multilevel dataset

This notebook is made to ensure consistency among the users of this library.

Before computing the solutions for all `n_samples` samples, let us ensure that the libraries and the solver are consistent with those used to develop this library.

### Check that the libraries in the environment are the same as this:

In [1]:
conda list 

# packages in environment at /Users/lucacaroselli/miniconda3/envs/fenics:
#
# Name                    Version                   Build  Channel
absl-py                   2.1.0              pyhd8ed1ab_0    conda-forge
anyio                     4.3.0              pyhd8ed1ab_0    conda-forge
appnope                   0.1.4              pyhd8ed1ab_0    conda-forge
argon2-cffi               23.1.0             pyhd8ed1ab_0    conda-forge
argon2-cffi-bindings      21.2.0          py311h2725bcf_4    conda-forge
arrow                     1.3.0              pyhd8ed1ab_0    conda-forge
arviz                     0.18.0                   pypi_0    pypi
asttokens                 2.4.1              pyhd8ed1ab_0    conda-forge
astunparse                1.6.3              pyhd8ed1ab_0    conda-forge
async-lru                 2.0.4              pyhd8ed1ab_0    conda-forge
attrs                     23.2.0             pyh71513ae_0    conda-forge
babel                     2.14.0             pyhd8ed1ab_0    

#### Import libraries

In [1]:
import os
# These variables are set before NumPy (and everything else below) is
# imported. Presumably they pin OpenBLAS/OpenMP to a single thread so runs are
# comparable across machines -- TODO confirm.
os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ['OMP_NUM_THREADS'] = '1'
# NOTE(review): Keras is not imported anywhere in this cell; confirm this
# setting is still required.
os.environ['KERAS_BACKEND'] = 'tensorflow'

import time

# Latin hypercube sampler and normal distribution, used further down to draw
# the Gaussian input samples.
from pyDOE import lhs as lhcs
from scipy.stats.distributions import norm as norm_dist

import numpy as np
import matplotlib.pyplot as plt
plt.style.use(['dark_background'])

from itertools import product

import sys
# Make ./solver importable so that the `model` module can be resolved from it.
sys.path.append('./solver')
# NOTE(review): star import; the `Model` class used in the next cell
# presumably comes from here.
from model import *
# Seed NumPy's global RNG. The seed check further down compares the generated
# samples against hard-coded reference values.
np.random.seed(123)

In [2]:
# Resolutions handed to the fine and the coarse model, respectively.
resolution, resolution_coarse = (50, 50), (25, 25)

# Random-field settings shared by both models.
field_mean, field_stdev = 1, 1
lamb_cov = 0.1  # presumably the covariance length scale -- TODO confirm in solver/model
mkl_fine = 64   # presumably the number of retained random-field modes -- TODO confirm

# Both models receive identical random-field arguments; only the resolution
# differs. The fine model is constructed first, then the coarse one.
_field_args = (field_mean, field_stdev, mkl_fine, lamb_cov)
model_fine = Model(resolution, *_field_args)
model_coarse = Model(resolution_coarse, *_field_args)

In [3]:
# Identify which fine-mesh nodes are also coarse-mesh nodes, then hand the
# coarse model the fine model's random-process eigenpairs restricted to those
# nodes, so that both models use the same random-field basis.

# Node coordinates of both meshes as C-contiguous arrays of a common dtype;
# the structured `.view` below requires contiguous rows and matching item size.
fine_coords = np.ascontiguousarray(model_fine.solver.mesh.coordinates())
coarse_coords = np.ascontiguousarray(
    model_coarse.solver.mesh.coordinates(), dtype=fine_coords.dtype
)

# Re-interpret every coordinate row as one structured element so that rows are
# compared as a whole rather than coordinate by coordinate.
n_dims = fine_coords.shape[1]
row_dtype = {
    'names': ['f{}'.format(k) for k in range(n_dims)],
    'formats': [fine_coords.dtype] * n_dims,
}
fine_rows = fine_coords.view(row_dtype).ravel()
coarse_rows = coarse_coords.view(row_dtype).ravel()

# True where a fine-mesh node coincides exactly with a coarse-mesh node.
# np.isin is used instead of np.in1d, which is deprecated since NumPy 2.0;
# the inputs are flattened above so the mask stays one-dimensional.
bool_vector = np.isin(fine_rows, coarse_rows)

# Print the boolean vector
print(bool_vector)

# Every coarse node must have been found among the fine nodes; otherwise the
# restricted eigenvectors would not match the coarse mesh and the coarse model
# would silently use a wrong basis.
n_coarse_nodes = coarse_coords.shape[0]
n_matched = int(bool_vector.sum())
if n_matched != n_coarse_nodes:
    raise ValueError(
        "Only {} of {} coarse-mesh nodes were found in the fine mesh; "
        "the coarse mesh is not nested in the fine mesh.".format(n_matched, n_coarse_nodes)
    )

# Share the eigenvalues and keep only the eigenvector rows at the matched
# nodes. NOTE(review): this assumes axis 0 of `eigenvectors` follows the
# fine-mesh node order and that the coarse mesh numbers the shared nodes in
# the same relative order -- confirm against the solver.
model_coarse.random_process.eigenvalues = model_fine.random_process.eigenvalues
model_coarse.random_process.eigenvectors = model_fine.random_process.eigenvectors[bool_vector]

[ True False  True ...  True False  True]


In [4]:
# Coordinates along each axis at which the solution is sampled; the same
# array object serves as both the x and the y coordinates.
x_data = np.array([0.1, 0.3, 0.5, 0.7, 0.9])
y_data = x_data

# All (x, y) combinations, with x varying slowest: a (25, 2) array.
datapoints = np.array([(x, y) for x in x_data for y in y_data])

Check that the following code produces the same output as the cell below.

In [5]:
# Solve the coarse model for an all-ones parameter vector of length 64 and
# display the solution at the sampling points as the cell output.
unit_parameters = np.ones(64)
model_coarse.solve(unit_parameters)
model_coarse.get_data(datapoints)

array([0.95298313, 0.93511747, 0.93376781, 0.96958524, 0.97444054,
       0.71472909, 0.69129481, 0.71240044, 0.8061805 , 0.85216712,
       0.56621954, 0.55445767, 0.51410139, 0.50800318, 0.58644353,
       0.32871738, 0.33691461, 0.38946967, 0.43303401, 0.2922576 ,
       0.13825812, 0.17735146, 0.20420585, 0.14543473, 0.13773947])

In [6]:
# Solve the fine model for an all-ones parameter vector of length 64 and
# display the solution at the sampling points as the cell output.
unit_parameters = np.ones(64)
model_fine.solve(unit_parameters)
model_fine.get_data(datapoints)

array([0.95464728, 0.93776413, 0.93109258, 0.96899473, 0.97489174,
       0.72064782, 0.68868939, 0.71549097, 0.80617574, 0.85157427,
       0.57053728, 0.56263489, 0.51127896, 0.50627652, 0.58754885,
       0.32579065, 0.33349834, 0.38978423, 0.43724719, 0.28198924,
       0.13979754, 0.17877788, 0.20639778, 0.14169928, 0.13629775])

array([0.95464728, 0.93776413, 0.93109258, 0.96899473, 0.97489174, \
$\quad\quad$   0.72064782, 0.68868939, 0.71549097, 0.80617574, 0.85157427, \
$\quad\quad$   0.57053728, 0.56263489, 0.51127896, 0.50627652, 0.58754885, \
$\quad\quad$   0.32579065, 0.33349834, 0.38978423, 0.43724719, 0.28198924, \
$\quad\quad$   0.13979754, 0.17877788, 0.20639778, 0.14169928, 0.13629775])

#### Problem setup
Consider steady groundwater flow in a confined, inhomogeneous aquifer which occupies the domain $\Omega$ with boundary $\Gamma$. 
Assuming that water is incompressible, the governing equations for groundwater flow can be written as the scalar elliptic partial differential equation:
$$
\nabla \cdot(-T(\mathbf{x}) \nabla h(\mathbf{x}))=g(\mathbf{x}) \quad \text { for all } \quad \mathbf{x} \in \Omega
$$
subject to boundary conditions on $\Gamma=\Gamma_N \cup \Gamma_D$ defined by the constraint equations
$$
h(\mathbf{x})=h_D(\mathbf{x}) \quad \text { on } \Gamma_D \quad \text { and } \quad(-T(\mathbf{x}) \nabla h(\mathbf{x})) \cdot \boldsymbol{n}=q_N(\mathbf{x}) \quad \text { on } \Gamma_N .
$$

Here $T(\mathbf{x})$ is the heterogeneous, depth-integrated transmissivity, $h(\mathbf{x})$ is the hydraulic head, $h_D(\mathbf{x})$ is the fixed hydraulic head at boundaries with Dirichlet constraints, $g(\mathbf{x})$ represents fluid sources and sinks, $q(\mathbf{x})=-T(\mathbf{x}) \nabla h(\mathbf{x})$ is the Darcy velocity, $q_N(\mathbf{x})$ is the Darcy velocity across boundaries with Neumann constraints, and $\Gamma_D \subset \partial \Omega$ and $\Gamma_N \subset \partial \Omega$ denote the parts of the boundary on which Dirichlet and Neumann conditions are imposed, respectively. Following standard FEM practice (see e.g. [19]), the governing equation above is converted into weak form by multiplying by an appropriate test function $w \in H^1(\Omega)$ and integrating by parts, so that
$$
\int_{\Omega} \nabla w \cdot(T(\mathbf{x}) \nabla h) d \mathbf{x}+\int_{\Gamma_N} w q_N(\mathbf{x}) d s=\int_{\Omega} w g(\mathbf{x}) d \mathbf{x}, \quad \forall w \in H^1(\Omega),
$$
where $H^1(\Omega)$ is the Hilbert space of weakly differentiable functions on $\Omega$. To approximate the hydraulic head solution $h(\mathbf{x})$, we introduce a finite element space $V_\tau \subset H^1(\Omega)$ on a finite element mesh $\mathcal{Q}_\tau(\Omega)$. This space is defined by a basis of piecewise linear Lagrange polynomials $\left\{\phi_i(\mathbf{x})\right\}_{i=1}^M$, associated with each of the $M$ finite element nodes. As a result, the weak form above can be rewritten as a system of sparse linear equations
$$
\begin{aligned}
\mathbf{A h}=\mathbf{b} \quad \text { where } \quad A_{i j} & =\int_{\Omega} \nabla \phi_i \cdot T(\mathbf{x}) \nabla \phi_j(\mathbf{x}) d \mathbf{x} \text { and } \\
b_i & =\int_{\Omega} \phi_i(\mathbf{x}) g(\mathbf{x}) d \mathbf{x}-\int_{\Gamma_N} \phi_i(\mathbf{x}) q_N(\mathbf{x}) d s
\end{aligned}
$$
where $\mathbf{A} \in \mathbb{R}^{M \times M}$ and $\mathbf{b} \in \mathbb{R}^M$ are the global stiffness matrix and load vector, respectively. The vector $\mathbf{h}:=\left[h_1, h_2, \ldots, h_M\right] \in \mathbb{R}^M$ is the solution vector of hydraulic head at each node within the finite element mesh so that $h(\mathbf{x})=\sum_{i=1}^M h_i \phi_i(\mathbf{x})$. In our numerical experiments, these equations are solved using the open source general-purpose FEM framework FEniCS. While there are well-established groundwater simulation software packages available, such as MODFLOW  and FEFLOW [19], FEniCS was chosen because of its flexibility and ease of integration with other software and analysis codes.

# Creation of fine dataset N = 64000

In [7]:
# Number of fine-model evaluations to generate.
n_samples = 64000

# Draw a Latin hypercube with mkl_fine columns on the unit cube and map it
# through the inverse CDF of the standard normal distribution.
standard_normal = norm_dist(loc=0, scale=1)
samples = standard_normal.ppf(lhcs(mkl_fine, samples=n_samples))

# One row of model output per sample, one column per sampling point.
data = np.zeros((n_samples, len(datapoints)))
for output_row, parameters in zip(data, samples):
    model_fine.solve(parameters)
    # `output_row` is a view into `data`, so this fills the matrix in place.
    output_row[:] = model_fine.get_data(datapoints)

# First 90% of the rows form the training set, the remainder the test set
# (the rows are not shuffled).
n_train = int(0.9 * n_samples)
X_train, X_test = samples[:n_train, :], samples[n_train:, :]
y_train, y_test = data[:n_train, :], data[n_train:, :]

### Check that the seed has been set correctly

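Check that the training sample at index 1 and its corresponding fine-model solution match the reference values below (both checks should print `True`).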

In [8]:
# Hard-coded reference values for the training sample at index 1 and for the
# corresponding model output, together with the absolute tolerance of the
# comparison.
TOLERANCE = 1e-07

expected_X_train_1 = np.array([
    -0.47836734,  0.28648697, -0.6303336 , -0.73718135, -1.28329923,
    -1.01340338, -0.72873664, -0.47809715,  0.7151521 ,  0.67099511,
     0.43188719,  0.03632936, -0.8649311 , -0.8647808 ,  0.34179245,
     0.24767509, -0.01252372,  2.15128095, -0.09757678, -1.21028494,
    -1.84696239, -0.39120485,  1.84236875,  1.38900885,  0.24818345,
     1.7394803 , -1.39512217, -1.57186453, -0.48234839, -0.51473032,
    -0.84504164,  0.01514964, -0.01444682, -0.0601214 , -0.35200387,
    -1.93785236, -0.22041721,  0.64776277,  1.07086038,  0.28086706,
     1.08468014, -0.31092276,  0.17420309, -0.75637504,  1.80047032,
     1.13038324,  0.45958537,  2.13109204, -0.88060009,  1.11485691,
     0.84702747, -0.86375104,  0.44718326,  0.35841146,  0.88659583,
     1.82694493, -1.20593774, -1.30671999, -1.31755625,  0.0846923 ,
    -0.8921337 , -0.03485612, -1.13209765,  0.98190528,
])

expected_y_train_1 = np.array([
    0.88111212, 0.9237418 , 0.96635079, 0.88511486, 0.907395  ,
    0.77208539, 0.72286761, 0.59835583, 0.58188088, 0.55799164,
    0.52192965, 0.52471177, 0.4320843 , 0.40017836, 0.30377345,
    0.32830741, 0.32617682, 0.28403402, 0.23154346, 0.25103854,
    0.26009132, 0.1807072 , 0.10643022, 0.06946826, 0.10587686,
])

# Each line prints True only if every entry differs from its reference value
# by strictly less than the tolerance.
print((np.abs(X_train[1] - expected_X_train_1) < TOLERANCE).all())
print((np.abs(y_train[1] - expected_y_train_1) < TOLERANCE).all())


True
True


In [9]:
# Print the shape of each array, one per line: test inputs/outputs, training
# inputs/outputs, then the sampling points.
for array in (X_test, y_test, X_train, y_train, datapoints):
    print(array.shape)

(6400, 64)
(6400, 25)
(57600, 64)
(57600, 25)
(25, 2)


In [17]:
# Write the fine dataset to comma-separated files under ./data.
# np.savetxt does not create missing directories, so make sure the target
# directory exists first (no-op if it is already there).
os.makedirs("./data", exist_ok=True)

for path, array in (
    ("./data/X_test_64000.csv", X_test),
    ("./data/y_test_64000.csv", y_test),
    ("./data/X_train_64000.csv", X_train),
    ("./data/y_train_64000.csv", y_train),
):
    np.savetxt(path, array, delimiter=",")

# Creation of coarse dataset N = 32000

In [10]:
# Number of coarse-model evaluations to generate.
n_samples = 32000

# Draw a fresh Latin hypercube with mkl_fine columns on the unit cube and map
# it through the inverse CDF of the standard normal distribution.
standard_normal = norm_dist(loc=0, scale=1)
samples = standard_normal.ppf(lhcs(mkl_fine, samples=n_samples))

# One row of model output per sample, one column per sampling point.
data = np.zeros((n_samples, len(datapoints)))
for output_row, parameters in zip(data, samples):
    model_coarse.solve(parameters)
    # `output_row` is a view into `data`, so this fills the matrix in place.
    output_row[:] = model_coarse.get_data(datapoints)

# First 90% of the rows form the training set, the remainder the test set
# (the rows are not shuffled).
n_train = int(0.9 * n_samples)
X_train_coarse, X_test_coarse = samples[:n_train, :], samples[n_train:, :]
y_train_coarse, y_test_coarse = data[:n_train, :], data[n_train:, :]

In [13]:
# Write the coarse dataset to comma-separated files under ./data.
# np.savetxt does not create missing directories, so make sure the target
# directory exists first (no-op if it is already there).
os.makedirs("./data", exist_ok=True)

for path, array in (
    ("./data/X_test_coarse_32000.csv", X_test_coarse),
    ("./data/y_test_coarse_32000.csv", y_test_coarse),
    ("./data/X_train_coarse_32000.csv", X_train_coarse),
    ("./data/y_train_coarse_32000.csv", y_train_coarse),
):
    np.savetxt(path, array, delimiter=",")