## MLMC Estimator Pretest - Proof of Concept

This notebook investigates the behaviour of different multilevel Monte Carlo (MLMC) extensions of the Harrell-Davis quantile estimator, specifically regarding their accuracy and variance.

To understand the characteristics of the proposed estimators, the quantity of interest (QoI) is chosen as a quantile of a well-known distribution - the normal distribution $N(\mu, \sigma^2)$.

### 1. Set the parameters

In [67]:
# Target distribution N(mu, sd^2): mean and standard deviation
mu, sd = -10, 200

# Probability level of the quantile to be estimated
p = 0.005

# Cost of a single model evaluation: fine model, then coarse model
c_f = 1
c_c = c_f / 100

# Sample sizes for the fine and the coarse model
n_f, n_c = 10, 50_000

# Number of fine-model evaluations that cost as much as n_f fine plus
# n_c coarse evaluations together (truncated to an integer)
n_std_mc = int((c_f * n_f + c_c * n_c) / c_f)

# Number of repetitions of the whole sampling experiment
n_bootstrap = 1_000

### 2. Define the sampling functions

In [73]:
import numpy as np

def SampleLevel1(mu, sd, n_f, eps_scale=50, eps_sd=0.6):
    """Draw paired fine/coarse model samples.

    The fine model is sampled from N(mu, sd^2). Each coarse sample is the
    matching fine sample shifted by a non-negative perturbation
    eps = |eps_scale * N(0, eps_sd^2)|, so both sample sets share the same
    underlying random draws.

    Parameters
    ----------
    mu, sd : float
        Mean and standard deviation of the fine-model distribution.
    n_f : int
        Number of paired samples to draw.
    eps_scale : float, optional
        Multiplier applied to the raw perturbation draw (default 50).
    eps_sd : float, optional
        Standard deviation of the raw perturbation draw (default 0.6).

    Returns
    -------
    tuple of np.ndarray
        (fine samples, coarse samples, perturbations), each of length n_f.
    """
    # Draw order matters for reproducibility under a fixed seed:
    # fine-model samples first, perturbations second.
    fineModelSamples = np.random.normal(mu, sd, n_f)
    eps = abs(eps_scale * np.random.normal(0, eps_sd, n_f))
    coarseModelSamples_lvl1 = fineModelSamples + eps
    return fineModelSamples, coarseModelSamples_lvl1, eps

def SampleLevel2(mu, sd, n_c, eps_scale=50, eps_sd=0.6):
    """Draw coarse-model samples only.

    Each coarse sample is an N(mu, sd^2) draw shifted by a non-negative
    perturbation eps = |eps_scale * N(0, eps_sd^2)|. The perturbations are
    returned as well, so a caller can subtract them again.

    Parameters
    ----------
    mu, sd : float
        Mean and standard deviation of the unperturbed distribution.
    n_c : int
        Number of coarse samples to draw.
    eps_scale : float, optional
        Multiplier applied to the raw perturbation draw (default 50).
    eps_sd : float, optional
        Standard deviation of the raw perturbation draw (default 0.6).

    Returns
    -------
    tuple of np.ndarray
        (coarse samples, perturbations), each of length n_c.
    """
    # Draw order matters for reproducibility under a fixed seed:
    # perturbations first, unperturbed samples second.
    eps = abs(eps_scale * np.random.normal(0, eps_sd, n_c))
    coarseModelSamples_lvl2 = np.random.normal(mu, sd, n_c) + eps
    return coarseModelSamples_lvl2, eps

### 3. Define Harrell-Davis weighting function

In [74]:
import scipy as scp

def HarrellDavisWeighting(p, xSorted):
    """Apply the Harrell-Davis weights for probability level p to a vector.

    The i-th weight is the increment of the Beta((n+1)p, (n+1)(1-p)) CDF
    over [(i-1)/n, i/n]; the result is the dot product of these weights
    with ``xSorted``. For sorted input this is the Harrell-Davis quantile
    estimate. The input is NOT sorted here, so the weights are applied
    position by position to whatever order the caller passes.

    Parameters
    ----------
    p : float
        Probability level in [0, 1].
    xSorted : array-like
        One-dimensional values, normally order statistics.

    Returns
    -------
    np.float64
        Weighted sum; NaN when fewer than two values are given.
    """
    xSorted = np.asarray(xSorted)
    n = xSorted.size
    if n < 2:
        return np.float64(np.nan)

    # The Beta shape parameters degenerate to 0 at p = 0 or p = 1, which
    # would yield NaN weights; use the extreme values instead, as
    # scipy.stats.mstats.hdquantiles does.
    if p == 0:
        return np.float64(xSorted[0])
    if p == 1:
        return np.float64(xSorted[-1])

    grid = np.arange(n + 1) / float(n)
    betaCdfOnGrid = scp.stats.beta.cdf(grid, (n + 1) * p, (n + 1) * (1 - p))
    weights = np.diff(betaCdfOnGrid)
    return np.float64(np.dot(weights, xSorted))

### 4. Define Quantile Estimators

In [75]:
from scipy.stats import norm
import pandas as pd

def CalcQuantileEstimates(p, mu, sd, fineModelSamples, coarseModelSamples_lvl1, coarseModelSamples_lvl2, eps2, numStdMCSamples):
    """Compute all quantile estimates of one experiment run.

    Parameters
    ----------
    p : float
        Probability level of the quantile.
    mu, sd : float
        Mean and standard deviation of the normal distribution used for
        the reference ("actual") quantile.
    fineModelSamples, coarseModelSamples_lvl1 : np.ndarray
        Paired fine/coarse samples of equal length (correction level).
    coarseModelSamples_lvl2 : np.ndarray
        Coarse-model samples of the base level.
    eps2 : np.ndarray
        Perturbations contained in ``coarseModelSamples_lvl2``; they are
        subtracted again to obtain the samples for the standard MC estimate.
    numStdMCSamples : int
        Number of samples used by the standard MC estimate. If it exceeds
        the length of ``coarseModelSamples_lvl2``, all samples are used.

    Returns
    -------
    pd.DataFrame
        Single column "Value" with eight rows, in this order: fine HD,
        coarse level-1 HD, coarse level-2 HD, MLMC (separate terms),
        MLMC (difference of order statistics), MLMC (order statistics of
        differences), standard MC HD, actual quantile.
    """
    def hdQuantile(samples):
        # Reference Harrell-Davis estimate from scipy, reduced to a scalar.
        return scp.stats.mstats.hdquantiles(data=samples, prob=p, var=False).item()

    # Plain HD estimators on each sample set. The level-2 value is reused
    # below instead of being recomputed on the large sample.
    hdEstimatorFineModel = hdQuantile(fineModelSamples)
    hdEstimatorCoarseModel_lvl1 = hdQuantile(coarseModelSamples_lvl1)
    hdEstimatorCoarseModel_lvl2 = hdQuantile(coarseModelSamples_lvl2)

    # Order statistics
    fineModelOrderStats = np.sort(fineModelSamples)
    coarseLvl1OrderStats = np.sort(coarseModelSamples_lvl1)
    coarseLvl2OrderStats = np.sort(coarseModelSamples_lvl2)

    # MLMC, telescoping sum of three separately weighted terms
    mlmcEstimator = (
        HarrellDavisWeighting(p, fineModelOrderStats)
        - HarrellDavisWeighting(p, coarseLvl1OrderStats)
        + HarrellDavisWeighting(p, coarseLvl2OrderStats)
    )

    # MLMC, weights applied to the difference of the order statistics.
    # The weighting is a dot product, hence linear, so this agrees with the
    # separate-terms estimator up to floating-point rounding.
    diffsOfOrderStats = fineModelOrderStats - coarseLvl1OrderStats
    mlmcEstimatorComb = HarrellDavisWeighting(p, diffsOfOrderStats) + hdEstimatorCoarseModel_lvl2

    # MLMC, weights applied to the order statistics of the pairwise differences
    orderStatsOfDiffs = np.sort(fineModelSamples - coarseModelSamples_lvl1)
    mlmcEstimatorApprox = HarrellDavisWeighting(p, orderStatsOfDiffs) + hdEstimatorCoarseModel_lvl2

    # Standard MC estimator on exactly numStdMCSamples samples.
    # (The previous slice [0:numStdMCSamples-1] dropped one sample.)
    stdMCSamples = coarseModelSamples_lvl2[:numStdMCSamples] - eps2[:numStdMCSamples]
    hdEstimatorStandardMC = hdQuantile(stdMCSamples)

    # Actual quantile of N(mu, sd^2)
    actualQuantile = norm.ppf(p, loc=mu, scale=sd)

    return pd.DataFrame({
        "Value": [
            hdEstimatorFineModel,
            hdEstimatorCoarseModel_lvl1,
            hdEstimatorCoarseModel_lvl2,
            mlmcEstimator,
            mlmcEstimatorComb,
            mlmcEstimatorApprox,
            hdEstimatorStandardMC,
            actualQuantile
        ]
    })

### 5. Apply bootstrapping

In [76]:
# Row labels, listed in the same order as the rows returned by
# CalcQuantileEstimates.
estimators = [
    f"Fine Model HD, {n_f} samples",
    f"Coarse Model HD, {n_f} samples",
    f"Coarse Model HD, {n_c} samples",
    "MLMC (separate terms)",
    "MLMC (diff. of order stats)",
    "MLMC (order stats of diff.)",
    f"Standard MC-HD, {n_std_mc} samples",
    "Actual Quantile"
]

df = pd.DataFrame({"Estimator": estimators})

# Seed NumPy's global generator (the one the sampling functions draw from)
# so that re-running this cell reproduces exactly the same runs.
np.random.seed(12345)

# One result column per run; they are joined once after the loop instead of
# growing the DataFrame inside it.
cols = []

for i in range(1, n_bootstrap + 1):

    # Fresh, independent samples for every run
    fineModelSamples, coarseModelSamples_lvl1, eps1 = SampleLevel1(mu, sd, n_f)
    coarseModelSamples_lvl2, eps2 = SampleLevel2(mu, sd, n_c)

    col = CalcQuantileEstimates(
        p, mu, sd,
        fineModelSamples,
        coarseModelSamples_lvl1,
        coarseModelSamples_lvl2,
        eps2,
        n_std_mc
    )["Value"]

    col.name = f"Run_{i}"
    cols.append(col)

new_cols = pd.concat(cols, axis=1)
df = pd.concat([df, new_cols], axis=1)

### 6. Compute bootstrapped statistics for the estimators

In [72]:
# Per-estimator mean over all runs
run_cols = [f"Run_{run}" for run in range(1, n_bootstrap + 1)]
df["Mean"] = df.loc[:, run_cols].mean(axis="columns")

# Reference value: the mean of the "Actual Quantile" row
actual_value = df.loc[df["Estimator"].eq("Actual Quantile"), "Mean"].to_numpy()[0]

# Absolute distance between each estimator's mean and the reference value
df["DevActVal"] = (df["Mean"] - actual_value).abs()

# Per-estimator sample standard deviation over all runs
df["StdDev"] = df.loc[:, run_cols].std(axis="columns")

# Re-derive the run columns from the frame itself (every column named "Run_*")
run_cols = list(df.filter(regex="^Run_").columns)

# Root-mean-square deviation of the individual runs from the reference value
df["RMSDevActVal"] = (
    df[run_cols]
    .sub(actual_value)
    .pow(2)
    .mean(axis="columns")
    .pow(0.5)
)

# Print the summary table with three decimals
pd.set_option("display.precision", 3)
print(df[["Estimator", "Mean", "DevActVal", "StdDev", "RMSDevActVal"]])

                        Estimator     Mean  DevActVal     StdDev  RMSDevActVal
0       Fine Model HD, 10 samples -317.978    207.188  1.104e+02     2.347e+02
1     Coarse Model HD, 10 samples -322.487    202.679  1.114e+02     2.313e+02
2  Coarse Model HD, 50000 samples -531.298      6.132  4.455e+00     7.578e+00
3           MLMC (separate terms) -526.789      1.623  2.816e+01     2.819e+01
4     MLMC (diff. of order stats) -526.789      1.623  2.816e+01     2.819e+01
5     MLMC (order stats of diff.) -577.430     52.264  1.822e+01     5.535e+01
6     Standard MC-HD, 510 samples -533.672      8.506  4.059e+01     4.145e+01
7                 Actual Quantile -525.166      0.000  2.275e-13     8.640e-12
