# [Multilevel Modeling](https://en.wikipedia.org/wiki/Multilevel_model)
Idea: Decompose the sum of random numbers into its contributions

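For a given set of $x_i$ and $a_{ijk}$ with
$$x_i = \sum_{j=0}^{n} \sum_{k=1}^{m_j} a_{ijk}\, y_{jk}$$
where
* $m_j$ is the number of contributors of layer $j$
* $y_{jk}$ is the contribution of the $k$-th contributor of layer $j$
* $m_0 = 1$ by default (layer 0 is the baseline shared by all samples)
* $a_{ijk} \in \{0, 1\}$ indicates whether contributor $k$ of layer $j$ contributes to sample $i$
* $\sum_{k=1}^{m_j} a_{ijk} = 1$ for every $i$ and $j$, i.e. exactly one contributor per layer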

calculate the distributions $y_{jk} \sim N(\mu_{jk}, \sigma_{jk})$.

Boundary conditions:
* $\sum_{k=1}^{m_j} \mu_{jk} = 0$ within each layer $j \geq 1$, enforced via $\mu_{j m_j} = -\sum_{k=1}^{m_j-1} \mu_{jk}$


The goal of this workbook is to explore [Linear Models](https://scikit-learn.org/stable/modules/linear_model.html); see also the [examples](https://scikit-learn.org/stable/auto_examples/linear_model/index.html).


In [1]:
import numpy as np
import sklearn
from sklearn import linear_model
from sklearn.preprocessing import OneHotEncoder

In [2]:
def align_coefficients(coefs):
    """Shift the means of every level so that they average to zero.

    Input:
    * coefs: iterable with one dictionary {contributer: (mean, sig)} per level

    Output:
    List with one dictionary per level; every mean is reduced by the average
    mean of its level, the standard deviations are passed through unchanged.
    """
    def _centre(level):
        # average of the means of this level only
        offset = np.mean([mean for mean, _ in level.values()])
        return {key: (mean - offset, sigma) for key, (mean, sigma) in level.items()}

    return [_centre(level) for level in coefs]

def gen_contributer_coefficients(n_contributers, contributer_avg, contributer_sig):
    """Draw a random (mu, sigma) pair for every contributer of every level.

    Input:
    * n_contributers: list with the number of contributers for each level;
      the first entry has to be 1
    * contributer_avg: centre of the normal distribution the mus are drawn
      from (numpy's default scale is used)
    * contributer_sig: `mean` argument of the lognormal distribution the
      sigmas are drawn from (numpy's default sigma is used)

    Output:
    List with one dictionary {contributer: (mu, sigma)} per level, where the
    contributers of a level are numbered 0 .. num-1.

    The drawn means are returned as they are, i.e. they are NOT centred per
    level. (An unreachable second return statement that centred them via
    align_coefficients was removed.)
    """
    assert n_contributers[0] == 1, "First level is allowed to have one contributer"
    return [
        # mu is drawn before sigma so a seeded random stream stays reproducible
        {i: (np.random.normal(contributer_avg), np.random.lognormal(contributer_sig))
         for i in range(num)}
        for num in n_contributers
    ]

def print_coefficients(contributer_coefficients):
    """Print one line with mu and sig for every contributer of every stage.

    Input:
    * contributer_coefficients: iterable with one dictionary
      {contributer: (mu, sig)} per stage
    """
    lines = (
        f"Stage {stage_no}: Contributer {name} mu={mu:.2f}, sig={sig:.2f}"
        for stage_no, stage in enumerate(contributer_coefficients)
        for name, (mu, sig) in stage.items()
    )
    # the generator is lazy, so every line is printed as soon as it is built
    for line in lines:
        print(line)

def gen_data(contributer_coefficients, n_samples):
    """Generate random samples as the sum of one contribution per level.

    For every level each sample picks one contributer at random
    (np.random.randint) and a normally distributed value drawn with that
    contributer's (mean, sig) is added to the sample. Level 0 has exactly one
    contributer and therefore defines the baseline that holds for all samples.

    Inputs:
    * contributer_coefficients: list with one dictionary
      {contributer: (mean, sig)} per level; the keys are used as column
      indices, so they have to be 0 .. len-1
    * n_samples: number of samples

    Outputs:
    * data: array of shape (n_samples,) with the summed values
    * contributers: integer array of shape (n_samples, n_levels - 1) holding
      the contributer chosen for each sample and level. The column of level 0
      is dropped because it is always 0.
    """
    assert len(contributer_coefficients[0]) == 1, "Level 0 defines the baseline. It should have exactly one contributer"
    data = np.zeros((n_samples, ))
    contributers = np.zeros((n_samples, len(contributer_coefficients)))
    rows = np.arange(n_samples)
    for lvl, cdict in enumerate(contributer_coefficients):
        lvl_influencers = len(cdict)  # number of influencers in this level

        # one column of candidate values per influencer of this level
        lvldata = np.zeros((n_samples, lvl_influencers))
        for i, (mu, sig) in cdict.items():
            lvldata[:, i] = np.random.normal(mu, sig, n_samples)

        # choose one influencer per sample
        selection = np.random.randint(low=0,
                                      high=lvl_influencers,
                                      size=n_samples)
        contributers[:, lvl] = selection

        # pick, per row, the value of the selected column (vectorised)
        data += lvldata[rows, selection]
    return data, contributers[:, 1:].astype(int)

def generate(n_samples, contributer_coefficients):
    """Print the coefficients, generate the data and report the level sizes.

    Inputs:
    * n_samples: number of samples
    * contributer_coefficients: list with one dictionary
      {contributer: (mean, sig)} per level, level 0 being the baseline

    Outputs:
    * data: the values returned by gen_data
    * contributers: the contributer matrix returned by gen_data
    * n_contributers: list with the number of contributers of every level
      except level 0
    """
    # The original test `contributer_coefficients == 2` compared the list with
    # an int and was therefore never true; the number of levels is meant.
    if len(contributer_coefficients) == 2:
        print("ATTENTION: just for one level")

    # Always a list (also for a single level) so that callers can iterate it.
    n_contributers = [len(level) for level in contributer_coefficients[1:]]

    print("generating Data for")
    print_coefficients(contributer_coefficients)

    data, contributers = gen_data(n_samples=n_samples,
                                  contributer_coefficients=contributer_coefficients)

    return data, contributers, n_contributers

In [80]:
n_samples = 100_000

# Hand-written coefficient sets: one dictionary {contributer: (mu, sig)} per
# level, the first dictionary being the single-contributer baseline.
cc_27 = [
    {0: (2, 5)},
    {0: (2, 1), 1: (-1, 1), 2: (-1, 1)},
    {0: (3, 1), 1: (-1, 1), 2: (-1, 1), 3: (-1, 1)},
]

cc_27b = [
    {0: (2, 5)},
    {0: (2, 1), 1: (-1, 1), 2: (-1, 3)},
    {0: (2, 1), 1: (-1, 1), 2: (-1, 1), 3: (0, 1)},
]

cc_2x = [
    {0: (2, 5)},
    {0: (2, 1), 1: (1, 1), 2: (-1, 1), 3: (0, 5), 4: (-2, 2)},
    {0: (3, 1), 1: (-1, 1), 2: (1, 1), 3: (0, 1), 4: (-3, 3)},
]

# Generate the samples for the selected coefficient set.
data, contributers, n_contributers = generate(n_samples, contributer_coefficients=cc_2x)

# One-hot encode the chosen contributer of every level.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(contributers)
contributers_ohe = enc.transform(contributers).toarray()

# Total number of contributers over all non-baseline levels.
n_features = np.sum(n_contributers)


generating Data for
Stage 0: Contributer 0 mu=2.00, sig=5.00
Stage 1: Contributer 0 mu=2.00, sig=1.00
Stage 1: Contributer 1 mu=1.00, sig=1.00
Stage 1: Contributer 2 mu=-1.00, sig=1.00
Stage 1: Contributer 3 mu=0.00, sig=5.00
Stage 1: Contributer 4 mu=-2.00, sig=2.00
Stage 2: Contributer 0 mu=3.00, sig=1.00
Stage 2: Contributer 1 mu=-1.00, sig=1.00
Stage 2: Contributer 2 mu=1.00, sig=1.00
Stage 2: Contributer 3 mu=0.00, sig=1.00
Stage 2: Contributer 4 mu=-3.00, sig=3.00


7

In [84]:
def show(coefficients, text="", group_sizes=None):
    """Print a flat sequence of numbers grouped by level.

    Inputs:
    * coefficients: flat iterable of numbers, ordered level by level
    * text: optional headline, printed first unless it is empty
    * group_sizes: number of values per level; when omitted the notebook-level
      `n_contributers` is used

    One line "<level>: [<values with 3 decimals>]" is printed per level, with
    levels counted from 1. An IndexError is raised when `coefficients` holds
    fewer values than the group sizes require.
    """
    if group_sizes is None:
        group_sizes = n_contributers  # looked up in the notebook namespace at call time

    values = list(coefficients)
    if text != "":
        print(text)

    start = 0
    for level, size in enumerate(group_sizes, start=1):
        # index instead of list.pop(0): same result without shifting the list
        chunk = [f"{values[start + k]:.3f}" for k in range(size)]
        print(f"{level}: {chunk}")
        start += size


# NOTE: this name shadows the builtin `eval`; it is kept because the cells
# below call it under this name.
def eval(regressor, n_contributers=None):
    """Print intercept and coefficients of a fitted regressor.

    Inputs:
    * regressor: object providing `intercept_` and `coef_`
    * n_contributers: number of coefficients per level; when omitted the
      notebook-level `n_contributers` is used (resolved at call time).
      Previously this argument was accepted but ignored.
    """
    print(f"Intercept: {regressor.intercept_}")
    show(regressor.coef_, group_sizes=n_contributers)

# [Ridge Regression](https://scikit-learn.org/stable/modules/linear_model.html#regression)

In [91]:
# Ridge regression with alpha=1 on the one-hot encoded contributers;
# fit() returns the estimator itself, so the result can be bound directly.
reg = linear_model.Ridge(alpha=1, fit_intercept=True).fit(contributers_ohe, data)
eval(reg)

Intercept: 2.016177169262657
1: ['1.987', '0.996', '-1.036', '0.029', '-1.976']
2: ['2.993', '-1.017', '0.992', '0.005', '-2.974']


# [Lasso](https://scikit-learn.org/stable/modules/linear_model.html#lasso)
--> [LassoLarsCV](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoLarsCV.html#sklearn.linear_model.LassoLarsCV)

In [16]:
# Cross-validated (5 folds) LassoLars on the one-hot encoded contributers.
reg = linear_model.LassoLarsCV(cv=5)
reg.fit(contributers_ohe, data)
# NOTE(review): the score is computed here but its value is neither stored nor shown.
reg.score(contributers_ohe, data)
eval(reg)

Intercept: 0.886657158656976
0: ['2.821', '0.000', '-0.023']
1: ['2.301', '-0.866', '-0.838', '0.000']


# [Elastic-Net](https://scikit-learn.org/stable/modules/linear_model.html#elastic-net)

In [17]:
# Cross-validated (5 folds) Elastic-Net on the one-hot encoded contributers.
reg = linear_model.ElasticNetCV(cv=5)
reg.fit(contributers_ohe, data)
eval(reg)

Intercept: 1.4958976611423003
0: ['2.250', '-0.581', '-0.625']
1: ['2.294', '-0.913', '-0.884', '0.000']


# [LARS-Lasso](https://scikit-learn.org/stable/modules/linear_model.html#lars-lasso)
--> finds the biggest contributors

In [18]:
# LassoLars with a fixed alpha of 0.1 on the one-hot encoded contributers.
reg = linear_model.LassoLars(alpha=0.1)
reg.fit(contributers_ohe, data)
eval(reg)

Intercept: 0.6458608897258027
0: ['2.419', '0.000', '0.000']
1: ['2.295', '-0.108', '-0.117', '0.000']


# [Orthogonal Matching Pursuit](https://scikit-learn.org/stable/modules/linear_model.html#orthogonal-matching-pursuit-omp)

In [89]:
# Orthogonal Matching Pursuit with its default settings.
reg = linear_model.OrthogonalMatchingPursuit()
reg.fit(contributers_ohe, data)
eval(reg)

Intercept: 1.2624933616660958
1: ['0.000', '0.000', '0.000', '0.000', '0.000']
2: ['3.732', '0.000', '0.000', '0.000', '0.000']


# [Bayesian Ridge Regression](https://scikit-learn.org/stable/modules/linear_model.html#bayesian-ridge-regression)

In [90]:
# Bayesian ridge regression; verbose=True reports the fitting progress.
reg = linear_model.BayesianRidge(verbose=True)
reg.fit(contributers_ohe, data)
eval(reg)

# Attributes of the fitted model.
print(f"precision of the noise: {reg.alpha_}")
print(f"Estimated precision of the weights: {reg.lambda_}")
print(f"Estimated variance-covariance matrix of the weights: {reg.sigma_}")
print()  # trailing empty line

Convergence after  2  iterations
Intercept: 2.0161731641321925
1: ['1.986', '0.996', '-1.035', '0.029', '-1.976']
2: ['2.992', '-1.016', '0.992', '0.005', '-2.973']
precision of the noise: 0.029124163203749852
Estimated precision of the weights: 0.26908443276033245
Estimated variance-covariance matrix of the weights: [[ 7.44645911e-01  7.42914013e-01  7.42914033e-01  7.42918419e-01
   7.42913283e-01  7.71031155e-06  3.73987373e-06  1.60548706e-06
  -7.67053611e-06 -5.38513623e-06]
 [ 7.42914013e-01  7.44634015e-01  7.42918014e-01  7.42922360e-01
   7.42917257e-01 -1.02929811e-06 -3.68008544e-06  9.36535070e-06
  -3.07135053e-06 -1.58461663e-06]
 [ 7.42914033e-01  7.42918014e-01  7.44633765e-01  7.42922494e-01
   7.42917353e-01 -2.59102211e-06 -8.70947029e-07 -6.21151916e-07
   5.33840871e-06 -1.25528766e-06]
 [ 7.42918419e-01  7.42922360e-01  7.42922494e-01  7.44620637e-01
   7.42921748e-01 -3.00640547e-06  3.95433712e-06 -5.72555274e-06
   4.65725645e-06  1.20364648e-07]
 [ 7.42913283

# [Automatic Relevance Determination](https://scikit-learn.org/stable/modules/linear_model.html#automatic-relevance-determination-ard)

In [85]:
# ARD regression with default thresholds; the scores are requested because
# they are printed at the end of this cell.
reg = linear_model.ARDRegression(verbose=True, compute_score=True)
reg.fit(contributers_ohe, data)

# Intercept and per-level coefficients.
eval(reg)

# Attributes of the fitted model.
print(f"precision of the noise: {reg.alpha_}")
show(reg.lambda_, "Estimated precision of the weights:")
print(f"Estimated variance-covariance matrix of the weights:\n {reg.sigma_}")

print("Scores")
print(reg.scores_)

Converged after 3 iterations
Intercept: 2.050897629052473
1: ['1.957', '0.965', '-1.064', '-0.000', '-2.005']
2: ['2.987', '-1.020', '0.985', '0.000', '-2.979']
precision of the noise: 0.029124165499521043
Estimated precision of the weights:
1: ['0.261', '1.069', '0.881', '1035.977', '0.248']
2: ['0.112', '0.957', '1.027', '26583.736', '0.113']
Estimated variance-covariance matrix of the weights:
 [[ 4.37612738e-03  2.63662054e-03  2.63736014e-03  9.58574720e-04
   2.64021314e-03  2.29445227e-05  1.20456452e-05  1.94821707e-05
   6.73949701e-06]
 [ 2.63662054e-03  4.34669136e-03  2.63376147e-03  9.57252047e-04
   2.63660331e-03  9.60550841e-06  5.72142817e-08  2.26242858e-05
   5.94922187e-06]
 [ 2.63736014e-03  2.63376147e-03  4.34843253e-03  9.57561749e-04
   2.63741908e-03 -3.10165997e-07 -5.48608312e-06  4.32341613e-06
  -2.07806092e-06]
 [ 9.58574720e-04  9.57252047e-04  9.57561749e-04  9.62999254e-04
   9.58599317e-04 -1.68120308e-08  2.33471621e-09 -3.31027367e-08
  -8.91592115e

In [86]:
# Print the (mu, sig) values of cc_2x, presumably to compare them with the
# fitted coefficients printed by the cells above.
print_coefficients(cc_2x)

Stage 0: Contributer 0 mu=2.00, sig=5.00
Stage 1: Contributer 0 mu=2.00, sig=1.00
Stage 1: Contributer 1 mu=1.00, sig=1.00
Stage 1: Contributer 2 mu=-1.00, sig=1.00
Stage 1: Contributer 3 mu=0.00, sig=5.00
Stage 1: Contributer 4 mu=-2.00, sig=2.00
Stage 2: Contributer 0 mu=3.00, sig=1.00
Stage 2: Contributer 1 mu=-1.00, sig=1.00
Stage 2: Contributer 2 mu=1.00, sig=1.00
Stage 2: Contributer 3 mu=0.00, sig=1.00
Stage 2: Contributer 4 mu=-3.00, sig=3.00


In [88]:
# Same ARD regression as before, but with an explicit threshold_lambda and a
# tighter tolerance.
ard_options = dict(verbose=True,
                   compute_score=True,
                   threshold_lambda=1e6,
                   tol=1e-8)
reg = linear_model.ARDRegression(**ard_options)
reg.fit(contributers_ohe, data)

# Intercept and per-level coefficients.
eval(reg)

# Attributes of the fitted model.
print(f"precision of the noise: {reg.alpha_}")
show(reg.lambda_, "Estimated precision of the weights:")
print(f"Estimated variance-covariance matrix of the weights:\n {reg.sigma_}")

print("Scores")
print(reg.scores_)

Intercept: 2.0509461800468456
1: ['1.957', '0.965', '-1.064', '-0.000', '-2.005']
2: ['2.987', '-1.020', '0.985', '-0.000', '-2.979']
precision of the noise: 0.02912416549742874
Estimated precision of the weights:
1: ['0.261', '1.069', '0.881', '1100.300', '0.248']
2: ['0.112', '0.957', '1.026', '614.879', '0.113']
Estimated variance-covariance matrix of the weights:
 [[ 4.32047135e-03  2.58104150e-03  2.58176312e-03  9.02661311e-04
   2.58455568e-03  2.28892773e-05  1.19893781e-05  1.94279797e-05
  -5.64771312e-08  6.68379226e-06]
 [ 2.58104150e-03  4.29118935e-03  2.57824137e-03  9.01415861e-04
   2.58102286e-03  9.56611947e-06  1.67705507e-08  2.25859292e-05
  -4.05483189e-08  5.90937406e-06]
 [ 2.58176312e-03  2.57824137e-03  4.29289463e-03  9.01707501e-04
   2.58182065e-03 -3.07448950e-07 -5.48448668e-06  4.32708195e-06
   1.74867525e-09 -2.07580273e-06]
 [ 9.02661311e-04  9.01415861e-04  9.01707501e-04  9.06827600e-04
   9.02684477e-04 -1.57783163e-08  2.25082112e-09 -3.11181339e

In [72]:
# Identity matrix: row i has only the i-th one-hot feature set.
D = np.eye(n_features)

In [76]:
# Predict for every row of D, i.e. for each one-hot feature on its own.
# return_std=True additionally requests the standard deviation of the
# predictions (second array of the returned tuple).
reg.predict(D, return_std=True)

(array([ 2.98313636, -0.02101741, -0.02347963,  3.02407272, -0.02733895,
        -0.01941585,  0.97741452]),
 array([5.45582635, 5.45570748, 5.45570759, 5.45587363, 5.45572356,
        5.45571468, 5.45587398]))

# [Quantile Regressor](https://scikit-learn.org/stable/auto_examples/linear_model/plot_quantile_regression.html)

In [14]:
from sklearn.linear_model import QuantileRegressor
from sklearn.utils.fixes import parse_version, sp_version

X = contributers_ohe
y = data

# This line avoids an incompatibility with older SciPy versions.
# You should use `solver="highs"` with a recent version of SciPy.
solver = "highs" if sp_version >= parse_version("1.6.0") else "interior-point"

# Fit one unregularised (alpha=0) quantile regressor per quantile and keep its
# predictions on the training data.
quantiles = [0.05, 0.5, 0.95]
predictions = {}
for quantile in quantiles:
    qr = QuantileRegressor(quantile=quantile, alpha=0, solver=solver)
    y_pred = qr.fit(X, y).predict(X)
    predictions[quantile] = y_pred

# The out-of-bounds bookkeeping that was kept here inside a string literal was
# removed: it referenced names that are never defined in this notebook
# (y_true_mean, y_normal) and was only echoed as cell output.

'\n    if quantile == min(quantiles):\n        out_bounds_predictions = np.logical_or(\n            out_bounds_predictions, y_pred >= y_normal\n        )\n    elif quantile == max(quantiles):\n        out_bounds_predictions = np.logical_or(\n            out_bounds_predictions, y_pred <= y_normal\n        )\n        '

--> don't use the quantile regressor for this purpose

# Conclusion

If all standard deviations within one level are the same, the best results are obtained with Ridge Regression and Bayesian Ridge Regression. If the standard deviations differ, the estimates show artefacts.

If the means and the standard deviations are more widely spread, the ARDRegressor delivers the best estimates. The estimated precision of the weights correlates (a bit) with the variance.