# [Bias Variance Tradeoff](https://en.wikipedia.org/wiki/Bias%E2%80%93variance_tradeoff)
Try it out; see also the [mlxtend user guide on bias-variance decomposition](https://rasbt.github.io/mlxtend/user_guide/evaluate/bias_variance_decomp/).

In [1]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import ARDRegression
from sklearn.pipeline import Pipeline
import pandas as pd


In [46]:
def align_coefficients(coefs):
    """Center the means of every level so that they average to zero.

    Input:
    * coefs: list with one dictionary {contributer: (mu, sigma)} per level

    Output:
    New list of dictionaries in which the level average of all mu values has
    been subtracted from every mu; the sigma values are passed on unchanged.
    """
    aligned = []
    for level in coefs:
        level_mean = np.mean([mu for mu, _ in level.values()])
        shifted = {}
        for key, (mu, sigma) in level.items():
            shifted[key] = (mu - level_mean, sigma)
        aligned.append(shifted)
    return aligned


def gen_contributer_coefficients(n_contributers, contributer_avg, contributer_sig):
    """Draw a random (mu, sigma) pair for every contributer of every level.

    Input:
    * n_contributers: list with the number of contributers for each level;
      the first entry has to be 1
    * contributer_avg: location handed to np.random.normal when drawing mu
    * contributer_sig: mean of the underlying normal distribution handed to
      np.random.lognormal when drawing sigma

    Output:
    List with one dictionary {contributer: (mu, sigma)} per level, after the
    mu values have been centered per level by align_coefficients.
    """
    assert n_contributers[0] == 1, "First level is allowed to have one contributer"

    levels = []
    for count in n_contributers:
        level = {}
        for idx in range(count):
            # mu is drawn before sigma, contributer by contributer, so the
            # sequence of random numbers is consumed in a fixed order.
            mu = np.random.normal(contributer_avg)
            sigma = np.random.lognormal(contributer_sig)
            level[idx] = (mu, sigma)
        levels.append(level)

    return align_coefficients(levels)


def print_coefficients(contributer_coefficients):
    """Print one line with mu and sig for every contributer of every stage.

    Input:
    * contributer_coefficients: list with one dictionary
      {contributer: (mu, sig)} per stage
    """
    for stage_no, stage in enumerate(contributer_coefficients):
        for contributer in stage:
            mu, sig = stage[contributer]
            print(
                f"Stage {stage_no}: Contributer {contributer} mu={mu:.2f}, sig={sig:.2f}"
            )


def gen_data(contributer_coefficients, n_samples):
    """Generate random samples as the sum of one random draw per level.

    For every level one contributer is picked uniformly at random per sample,
    and the value drawn from that contributer's normal distribution is added
    to the sample. Level 0 holds exactly one contributer and therefore acts as
    the baseline shared by all samples.

    Inputs:
    * contributer_coefficients: list with one dictionary
      {contributer: (mean, sig)} per level; the contributer keys of a level
      have to be 0 .. len(level) - 1 because they are used as column indices
    * n_samples: number of samples

    Outputs:
    * data: array of shape (n_samples,) with the summed values
    * contributers: integer array of shape (n_samples, number of levels - 1)
      with the contributer picked per sample for every level except level 0
    """

    assert (
        len(contributer_coefficients[0]) == 1
    ), "Level 0 defines the baseline. It should have exactly one contributer"

    data = np.zeros((n_samples,))
    contributers = np.zeros((n_samples, len(contributer_coefficients)), dtype=int)
    rows = np.arange(n_samples)  # row index used to pick one column per sample

    for lvl, cdict in enumerate(contributer_coefficients):
        lvl_influencers = len(cdict)  # number of influencers in this level
        lvldata = np.zeros((n_samples, lvl_influencers))

        # One full column of draws per influencer; the draws happen in the
        # iteration order of the dictionary and before the selection below,
        # which keeps seeded runs reproducible.
        for i, (mu, sig) in cdict.items():
            lvldata[:, i] = np.random.normal(mu, sig, n_samples)

        selection = np.random.randint(low=0, high=lvl_influencers, size=n_samples)
        contributers[:, lvl] = selection

        # Pick lvldata[row, selection[row]] for all rows at once.
        data += lvldata[rows, selection]

    # Level 0 always selects contributer 0, so its column carries no information.
    return data, contributers[:, 1:]


def generate(n_samples, contributer_coefficients):
    """Print the coefficients and generate random data for them.

    Inputs:
    * n_samples: number of samples
    * contributer_coefficients: list with one dictionary
      {contributer: (mean, sig)} per level, level 0 being the baseline

    Outputs:
    * data: array with the generated values as returned by gen_data
    * contributers: matrix with the contributer chosen per sample and level
      as returned by gen_data
    * n_contributers: list with the number of contributers of every level
      except level 0
    """
    # The former check `contributer_coefficients == 2` compared the list
    # itself with 2 and was therefore never true; the number of levels is
    # what has to be compared. n_contributers stays a list in both cases so
    # that callers keep receiving the type they have always received.
    if len(contributer_coefficients) == 2:
        print("ATTENTION: just for one level")
    n_contributers = [len(level) for level in contributer_coefficients[1:]]

    print("generating Data for")
    print_coefficients(contributer_coefficients)

    data, contributers = gen_data(
        n_samples=n_samples, contributer_coefficients=contributer_coefficients
    )

    return data, contributers, n_contributers


def to_df(array, y_data):
    """Combine the contributer matrix and the target values in one DataFrame.

    Inputs:
    * array: 2-d array with one column per group
    * y_data: values stored in the additional column "sum"

    Output:
    DataFrame with the columns group_0, group_1, ... followed by "sum"
    """
    n_groups = array.shape[1]
    column_names = [f"group_{idx}" for idx in range(n_groups)]
    frame = pd.DataFrame(array, columns=column_names)
    frame["sum"] = y_data
    return frame


def get_coefs(pipe, coef_type, general=None):
    """Collect the fitted coefficients of a two-step pipeline in a DataFrame.

    Inputs:
    * pipe: fitted pipeline; step 0 has to offer get_feature_names_out(),
      step 1 has to offer coef_ and intercept_
    * coef_type: name of the column that receives the coefficient values
    * general: optional dictionary {column name: value}; every entry is added
      as an additional column of the result

    Output:
    DataFrame with the column "parameter" (the feature names followed by
    "intercept"), the column coef_type (the coefficients followed by the
    intercept) and one column per key of general
    """
    encoder, regressor = pipe[0], pipe[1]

    # The intercept is reported as one more parameter after the features.
    feature_names = [*encoder.get_feature_names_out(), "intercept"]
    coefs = [*regressor.coef_, regressor.intercept_]

    df = pd.DataFrame({"parameter": feature_names, coef_type: coefs})
    if general is not None:
        # isinstance instead of an exact type comparison, so that dict
        # subclasses such as OrderedDict are accepted as well.
        assert isinstance(general, dict)
        for key, val in general.items():
            df[key] = val

    return df

In [51]:
# number of samples to generate
n_samples = 10000

# Every cc_* configuration is a list with one dictionary per level that maps
# the contributer index to (mu, sig). Level 0 holds a single contributer,
# the baseline, as gen_data requires.
cc_27 = [
    {0: (2, 5)},
    {0: (2, 1), 1: (-1, 1), 2: (-1, 1)},
    {0: (3, 1), 1: (-1, 1), 2: (-1, 1), 3: (-1, 1)},
]

cc_27b = [
    {0: (2, 5)},
    {0: (2, 1), 1: (-1, 1), 2: (-1, 3)},
    {0: (2, 1), 1: (-1, 1), 2: (-1, 1), 3: (0, 1)},
]

cc_2x = [
    {0: (2, 5)},
    {
        0: (2, 1),
        1: (1, 1),
        2: (-1, 1),
        3: (0, 5),
        4: (-2, 2),
    },
    {0: (3, 1), 1: (-1, 1), 2: (1, 1), 3: (0, 1), 4: (-3, 3)},
]
# configuration used for the rest of the notebook
contributer_coefficients = cc_2x

# y_data: generated values, x_data: chosen contributer per sample and level
y_data, x_data, n_contributers = generate(
    n_samples, contributer_coefficients=contributer_coefficients
)

# one row per sample: columns group_0, group_1, ... plus "sum"
data = to_df(x_data, y_data)

# NOTE(review): the string literal below is disabled code, not documentation.
# It refers to `contributers`, which is not defined at module level in the
# visible cells, so it would have to be adapted before being re-enabled.
"""
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(contributers)
X = enc.transform(contributers).toarray()
n_features = np.sum(n_contributers)"""

generating Data for
Stage 0: Contributer 0 mu=2.00, sig=5.00
Stage 1: Contributer 0 mu=2.00, sig=1.00
Stage 1: Contributer 1 mu=1.00, sig=1.00
Stage 1: Contributer 2 mu=-1.00, sig=1.00
Stage 1: Contributer 3 mu=0.00, sig=5.00
Stage 1: Contributer 4 mu=-2.00, sig=2.00
Stage 2: Contributer 0 mu=3.00, sig=1.00
Stage 2: Contributer 1 mu=-1.00, sig=1.00
Stage 2: Contributer 2 mu=1.00, sig=1.00
Stage 2: Contributer 3 mu=0.00, sig=1.00
Stage 2: Contributer 4 mu=-3.00, sig=3.00


"\nenc = OneHotEncoder(handle_unknown='ignore')\nenc.fit(contributers)\nX = enc.transform(contributers).toarray()\nn_features = np.sum(n_contributers)"

In [52]:
# show the generated samples
data

Unnamed: 0,group_0,group_1,sum
0,2,1,0.927193
1,1,1,8.301104
2,3,3,4.093627
3,4,2,-2.266321
4,2,4,-3.902551
...,...,...,...
9995,3,4,7.249486
9996,2,3,11.196188
9997,0,3,4.499292
9998,0,4,2.512233


In [53]:
# Two-step model: one-hot encode the group columns (dense output), then fit
# an ARD regression with intercept on the encoded columns.
pipe = Pipeline(
    [
        ("ohe", OneHotEncoder(sparse_output=False)),
        ("regressor", ARDRegression(fit_intercept=True)),
    ]
)

# names of the group columns: one per level except the baseline level 0
groups = [f"group_{i}" for i in range(len(contributer_coefficients) - 1)]

In [57]:
# Step 1: regress the generated values on the one-hot encoded groups; the
# coefficients are stored in the column "mean".
pipe.fit(data[groups], data["sum"])

df_mean = get_coefs(pipe, coef_type="mean")

# squared residuals of the step-1 model, used as the target of step 2
data["pred"] = pipe.predict(data[groups])
data["var"] = np.power(data["sum"] - data["pred"], 2)

# Step 2: the same pipeline object is fitted again, now on the squared
# residuals. This replaces the step-1 fit, whose coefficients were already
# saved in df_mean.
pipe.fit(data[groups], data["var"])

df_var = get_coefs(pipe, coef_type="var")

# one row per parameter with the columns "mean" and "var"
df = df_mean.merge(df_var, on="parameter")
df

Unnamed: 0,parameter,mean,var
0,group_0_0,2.007802,-0.001563
1,group_0_1,1.000887,0.000909
2,group_0_2,-0.767909,-0.001447
3,group_0_3,0.00021,25.939345
4,group_0_4,-2.07243,0.453258
5,group_1_0,2.832775,0.000272
6,group_1_1,-1.037208,0.000531
7,group_1_2,0.828589,-1.632554
8,group_1_3,-0.000155,-0.000249
9,group_1_4,-3.088206,7.511155


In [39]:
# show the configuration the data was generated from, for comparison with df
contributer_coefficients

[{0: (2, 5)},
 {0: (2, 1), 1: (1, 1), 2: (-1, 1), 3: (0, 5), 4: (-2, 2)},
 {0: (3, 1), 1: (-1, 1), 2: (1, 1), 3: (0, 1), 4: (-3, 3)}]