# [Random Effects Model](https://en.wikipedia.org/wiki/Random_effects_model)
Try out [statsmodels' mixed linear model](https://www.statsmodels.org/stable/mixed_linear.html).

In [3]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

import statsmodels.api as sm
import statsmodels.formula.api as smf

In [37]:
def align_coefficients(coefs):
    """Shift the means of every level so that they average to zero.

    Input:
    * coefs: iterable with one dictionary {contributor: (mu, sigma)} per level

    Output:
    List with one new dictionary per level. The level's average mu has been
    subtracted from each mu; the sigmas are passed through unchanged.
    """
    aligned = []
    for level in coefs:
        level_mean = np.mean([mu for mu, _ in level.values()])
        centred = {}
        for key, (mu, sigma) in level.items():
            centred[key] = (mu - level_mean, sigma)
        aligned.append(centred)
    return aligned


def gen_contributer_coefficients(n_contributers, contributer_avg, contributer_sig):
    """Draw a random (mu, sigma) pair for every contributor of every level.

    Input:
    * n_contributers: list with the number of contributors for each level;
      the first entry must be 1
    * contributer_avg: location passed to np.random.normal when drawing each mu
    * contributer_sig: mean passed to np.random.lognormal when drawing each sigma

    Output:
    List with one dictionary {contributor: (mu, sigma)} per level, where the
    contributors of a level are numbered 0 .. num - 1.

    The pairs are returned exactly as drawn. They are NOT passed through
    align_coefficients: the original function carried a second, unreachable
    return statement that did so, which has been removed as dead code.
    """
    assert n_contributers[0] == 1, "First level must have exactly one contributer"
    return [
        {
            # mu is drawn first, then sigma, for every contributor in turn.
            i: (np.random.normal(contributer_avg), np.random.lognormal(contributer_sig))
            for i in range(num)
        }
        for num in n_contributers
    ]


def print_coefficients(contributer_coefficients):
    """Print one line per contributor showing its stage, id, mu and sigma.

    Input:
    * contributer_coefficients: sequence with one dictionary
      {contributor: (mu, sigma)} per stage
    """
    stage_no = 0
    for stage in contributer_coefficients:
        for contributer in stage:
            mu, sig = stage[contributer]
            print(f"Stage {stage_no}: Contributer {contributer} mu={mu:.2f}, sig={sig:.2f}")
        stage_no += 1


def gen_data(contributer_coefficients, n_samples):
    """Generate random data as a sum of one random draw per level.

    For every level one contributor is picked uniformly at random for each
    sample, and a normal draw with that contributor's (mean, sig) is added to
    the sample. The first level defines the baseline that holds for all samples.

    Inputs:
    * contributer_coefficients: sequence with one dictionary
      {contributor: (mean, sig)} per level; the contributor keys are used as
      column indices, so they must be 0 .. len(level) - 1
    * n_samples: number of samples

    Outputs:
    * data: array of shape (n_samples,) with the summed values
    * contributers: integer matrix with the contributor picked per sample for
      every level AFTER the baseline (the baseline column is dropped), i.e.
      shape (n_samples, number of levels - 1)
    """

    assert (
        len(contributer_coefficients[0]) == 1
    ), "Level 0 defines the baseline. It should have exactly one contributer"
    data = np.zeros((n_samples,))
    contributers = np.zeros((n_samples, len(contributer_coefficients)))
    rows = np.arange(n_samples)
    for lvl, cdict in enumerate(contributer_coefficients):
        lvl_influencers = len(cdict)  # number of influencers in this level

        # One column of candidate draws per influencer of this level.
        lvldata = np.zeros((n_samples, lvl_influencers))
        for i, (mu, sig) in cdict.items():
            lvldata[:, i] = np.random.normal(mu, sig, n_samples)

        # Pick which influencer contributes to each sample.
        selection = np.random.randint(low=0, high=lvl_influencers, size=n_samples)
        contributers[:, lvl] = selection

        # Gather lvldata[row, selection[row]] for all rows in one indexing step
        # instead of a Python-level loop over the samples.
        data += lvldata[rows, selection]

    # Column 0 is the baseline level, which has a single contributor, so it
    # carries no information and is not returned.
    return data, contributers[:, 1:].astype(int)


def generate(n_samples, contributer_coefficients):
    """Print the coefficients, generate the data and report the level sizes.

    Inputs:
    * n_samples: number of samples
    * contributer_coefficients: sequence with one dictionary
      {contributor: (mean, sig)} per level; level 0 is the baseline

    Outputs:
    * data: array with the generated values (see gen_data)
    * contributers: matrix with the contributor picked per sample for every
      level after the baseline (see gen_data)
    * n_contributers: list with the number of contributors of every level
      after the baseline
    """
    # Level 0 is the baseline and is not counted.
    n_contributers = [len(level) for level in contributer_coefficients[1:]]

    # The original check compared the list itself with 2, which is never true,
    # so this hint was unreachable. Compare the number of levels instead. The
    # counts stay a list in both cases so callers keep getting the same shape.
    if len(contributer_coefficients) == 2:
        print("ATTENTION: just for one level")

    print("generating Data for")
    print_coefficients(contributer_coefficients)

    data, contributers = gen_data(
        n_samples=n_samples, contributer_coefficients=contributer_coefficients
    )

    return data, contributers, n_contributers


def to_df(array):
    """Wrap a 2-D array in a DataFrame with columns group_0, group_1, ..."""
    names = []
    for idx in range(array.shape[1]):
        names.append(f"group_{idx}")
    return pd.DataFrame(array, columns=names)

In [19]:
# Number of samples to generate.
n_samples = 10000

# Each cc_* list is one coefficient set: one dictionary per level that maps a
# contributor id to its (mu, sig) pair. Level 0 is the baseline and holds
# exactly one contributor, as gen_data requires.

# Baseline plus levels with 3 and 4 contributors.
cc_27 = [
    {0: (2, 5)},
    {0: (2, 1), 1: (-1, 1), 2: (-1, 1)},
    {0: (3, 1), 1: (-1, 1), 2: (-1, 1), 3: (-1, 1)},
]

# Same layout as cc_27 with some (mu, sig) values changed.
cc_27b = [
    {0: (2, 5)},
    {0: (2, 1), 1: (-1, 1), 2: (-1, 3)},
    {0: (2, 1), 1: (-1, 1), 2: (-1, 1), 3: (0, 1)},
]

# Baseline plus two levels with 5 contributors each.
cc_2x = [
    {0: (2, 5)},
    {
        0: (2, 1),
        1: (1, 1),
        2: (-1, 1),
        3: (0, 5),
        4: (-2, 2),
    },
    {0: (3, 1), 1: (-1, 1), 2: (1, 1), 3: (0, 1), 4: (-3, 3)},
]


# y_data: generated values; x_data: contributor picked per sample and level;
# n_contributers: number of contributors per non-baseline level.
y_data, x_data, n_contributers = generate(n_samples, contributer_coefficients=cc_2x)
# The string below is disabled one-hot-encoding code kept as a bare string
# literal, so it is only displayed as the cell's value and never executed.
# NOTE: it refers to `contributers`, which is not defined in this cell; the
# matrix returned by generate is bound to `x_data` here.
"""
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(contributers)
X = enc.transform(contributers).toarray()
n_features = np.sum(n_contributers)"""

generating Data for
Stage 0: Contributer 0 mu=2.00, sig=5.00
Stage 1: Contributer 0 mu=2.00, sig=1.00
Stage 1: Contributer 1 mu=1.00, sig=1.00
Stage 1: Contributer 2 mu=-1.00, sig=1.00
Stage 1: Contributer 3 mu=0.00, sig=5.00
Stage 1: Contributer 4 mu=-2.00, sig=2.00
Stage 2: Contributer 0 mu=3.00, sig=1.00
Stage 2: Contributer 1 mu=-1.00, sig=1.00
Stage 2: Contributer 2 mu=1.00, sig=1.00
Stage 2: Contributer 3 mu=0.00, sig=1.00
Stage 2: Contributer 4 mu=-3.00, sig=3.00


"\nenc = OneHotEncoder(handle_unknown='ignore')\nenc.fit(contributers)\nX = enc.transform(contributers).toarray()\nn_features = np.sum(n_contributers)"

In [31]:
# Load the "dietox" dataset of the R package "geepack" through statsmodels as a
# reference example, and check the dtype of the "Pig" column that is used as
# the grouping variable further down.
d = sm.datasets.get_rdataset("dietox", "geepack").data
d["Pig"].dtype

dtype('int64')

In [38]:
# Build the modelling frame: one group_<i> column per non-baseline level,
# the generated value in "sum" and a constant "baseline" column of ones.
data = to_df(x_data)
data["sum"] = y_data
data["baseline"] = 1
data

Unnamed: 0,group_0,group_1,sum,baseline
0,1,2,4.195875,1
1,3,0,23.066601,1
2,3,3,4.749189,1
3,3,3,5.158638,1
4,4,0,-12.311043,1
...,...,...,...,...
9995,4,1,-7.570059,1
9996,0,0,12.584144,1
9997,3,2,12.088368,1
9998,3,0,14.089029,1


In [34]:
# Reference call on the dietox data: Weight explained by Time, grouped by Pig.
# The model is only constructed here, not fitted.
smf.mixedlm("Weight ~ Time", d, groups=d["Pig"])

<statsmodels.regression.mixed_linear_model.MixedLM at 0x2165fd679e0>

In [48]:
# Disabled attempt: pass both grouping columns at once as a 2-D array.
# md = smf.mixedlm("sum ~ baseline", data, groups=data[["group_0", "group_1"]].to_numpy())
# Intercept-only model for "sum", grouped by the first non-baseline level only.
md = smf.mixedlm("sum ~ 1", data, groups=data["group_0"])
mdf = md.fit()
print(mdf.summary())


          Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: sum        
No. Observations: 10000   Method:             REML       
No. Groups:       5       Scale:              37.7636    
Min. group size:  1906    Log-Likelihood:     -32356.9757
Max. group size:  2090    Converged:          Yes        
Mean group size:  2000.0                                 
----------------------------------------------------------
             Coef.  Std.Err.    z    P>|z|  [0.025  0.975]
----------------------------------------------------------
Intercept    1.902     0.659  2.887  0.004   0.611   3.193
Group Var    2.151     0.250                              



# Conclusion
Statsmodels' `mixedlm` only allows a single level of grouping.

In [46]:
# Show the two grouping columns as the 2-D array used in the disabled
# multi-level attempt.
data[["group_0", "group_1"]].to_numpy()

array([[1, 2],
       [3, 0],
       [3, 3],
       ...,
       [3, 2],
       [3, 0],
       [3, 2]])