In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
from tqdm.notebook import tqdm

# Silence every warning for the rest of the session: no category, module or
# message filter is given, so the "ignore" action applies to all of them.
warnings.filterwarnings("ignore")
# plt.style.use('dark_background')

In [None]:
from functools import cache
import numpy as np
import random

@cache
def harmonic_number(n: int) -> float:
    """Return the n-th harmonic number H_n = 1 + 1/2 + ... + 1/n.

    The terms are accumulated iteratively, in ascending order of the
    denominator (the same order the recursive definition
    ``H_n = 1/n + H_(n-1)`` evaluates them), so the result is unchanged while
    large ``n`` no longer exhausts Python's recursion limit. Results are
    memoised per argument by ``functools.cache``.

    Args:
        n: Number of terms to sum.

    Returns:
        The partial sum of the harmonic series up to ``1/n``; ``0`` for any
        ``n <= 0``.
    """
    if n <= 0:
        return 0

    total = 0
    for k in range(1, n + 1):
        # Keep the operand order of the recursive form: 1/k + H_(k-1).
        total = 1 / k + total
    return total

In [None]:
from dataclasses import dataclass

# Starting value handed to sim() below (2**10 = 1024).
k0 = 2**10

@dataclass(order=True)
class Sequence:
    """Wrapper around a list of integers with dataclass-generated ordering.

    ``order=True`` asks ``dataclass`` to generate the rich comparison methods,
    which compare instances field by field. ``elements`` is the only field, so
    two instances compare exactly as their lists do, which is what allows a
    collection of ``Sequence`` objects to be sorted.
    """
    # The wrapped values; being the sole field, it alone decides equality and ordering.
    elements: list[int]

def sim(k0: int) -> Sequence:
    """Simulate one random descending chain that starts at ``k0``.

    The chain takes ``int(log2(k0))`` steps. At each step the next value is a
    uniformly random integer between 1 and the current value minus one
    (both inclusive); once the current value is 1 or less, the chain stays
    at 1.

    Args:
        k0: Starting value of the chain.

    Returns:
        A ``Sequence`` whose ``elements`` hold the chain in reverse order of
        generation, so ``k0`` is the last element.
    """
    n_steps = int(np.log2(k0))
    chain = [k0]
    for _ in range(n_steps):
        current = chain[-1]
        if current <= 1:
            successor = 1
        else:
            successor = random.randint(1, current - 1)
        chain.append(successor)

    # The chain was built from k0 downwards; hand it back the other way round.
    chain.reverse()
    return Sequence(chain)

# Run 100000 independent simulations starting from k0 (tqdm shows progress)
# and collect the resulting Sequence objects in a NumPy array.
sims = np.array([sim(k0) for _ in tqdm(range(100000))])

In [None]:
# Order the simulated chains using the comparison methods generated for Sequence.
# NOTE(review): the name `sorted` shadows Python's built-in sorted() for every
# cell executed after this one; later cells read this variable, so a rename
# has to be applied to them at the same time.
sorted = np.sort(sims)

In [None]:
# Show the first, the last and the middle chain of the ordered simulations.
for position in (0, -1, int(len(sorted) * 0.5)):
    print(sorted[position])

In [None]:
from scipy.stats import beta
# Three example Beta densities, each specified by a mean and a "size"
# (alpha + beta), drawn together with a fixed reference point r.
x = np.linspace(0, 1, 1000)

means = [0.55, 0.3, 0.8]
sizes = [30, 10, 10]
alphas = [mean * size for mean, size in zip(means, sizes)]
betas = [(1 - mean) * size for mean, size in zip(means, sizes)]
distrs = [beta(a, b) for a, b in zip(alphas, betas)]

r = 0.5
curve_colors = ["tab:blue", "tab:orange", "tab:green"]

plt.figure(figsize=(8, 4), dpi=160)
for i, (a, b, distr) in enumerate(zip(alphas, betas, distrs)):
    plt.plot(x, distr.pdf(x), c=curve_colors[i], label=f"$G_{{x_{i + 1}}}(z)$")

    # Density at the mode versus density at r; their gap is the margin.
    mode = (a - 1) / (a + b - 2)
    pdf_at_mode = distr.pdf(mode)
    pdf_at_r = distr.pdf(r)
    margin = pdf_at_mode - pdf_at_r

    # Dotted horizontal guides between r and the mode at both density levels.
    for level in (pdf_at_mode, pdf_at_r):
        plt.plot([r, mode], [level, level], ":k", alpha=0.7)

    # errorbar with lolims=True serves as an upward arrow starting at pdf(r);
    # presumably the 0.2 subtracted from the margin leaves room for the
    # arrow head (TODO confirm).
    plt.errorbar(mode, pdf_at_r, yerr=margin - 0.2, lolims=True, ecolor="k")

# Vertical reference line at r; it extends beyond the visible y-range set below.
plt.plot([r, r], [-1, 100], "tab:red", label="r")
plt.xlabel("z")
plt.xlim(0, 1)
plt.ylim(-0.5, 5)
plt.legend()

# priors

In [None]:
from scipy.stats import beta

# One figure per base prediction 0.1, 0.2, ..., 0.9: a Beta density whose
# parameters are scaled so that the smaller of the two equals 1 (before
# rounding), with a red vertical line at the base prediction.
x = np.linspace(0, 1, 1000)
for base_pred in np.arange(0.1, 1, 0.1):
    size = 1.0 / min(base_pred, 1 - base_pred)
    a = base_pred * size
    b = (1 - base_pred) * size
    # Round all three to one decimal; the rounded a and b are also the ones
    # used to build the distribution, not only the ones shown in the title.
    base_pred, a, b = (np.round(value, 1) for value in (base_pred, a, b))

    distr = beta(a, b)
    density = distr.pdf(x)

    plt.figure()
    plt.title(f"{base_pred=} {a=} {b=}")
    plt.plot(x, density, label="mean=<mean>")
    plt.vlines(base_pred, np.min(density), np.max(density), "r")

# ensembles

In [None]:
def simulate(metrics, runs=100000, trees=100):
    """Monte-Carlo simulate aggregates of per-tree Beta-distributed rates.

    Each of the ``trees`` ensemble members gets its own Beta distribution,
    with parameters drawn once up front: ``alpha = 1/2 + Gamma(shape=2)`` and
    ``beta = 1/2 + Gamma(shape=0.2)``. Every run then samples one rate per
    tree from these distributions and reduces the ``trees`` rates to a single
    number per requested metric.

    Args:
        metrics: Iterable of metric names. Supported names are ``"mean"``
            (arithmetic mean) and ``"gmean"`` (geometric mean, computed as
            ``exp(mean(log(rates)))``).
        runs: Number of Monte-Carlo repetitions.
        trees: Number of ensemble members, i.e. rates sampled per run.

    Returns:
        A tuple ``(alphas, betas, res)``: the two parameter arrays of length
        ``trees`` and a dict mapping each metric name to the list of its
        ``runs`` aggregated values.

    Raises:
        ValueError: If ``metrics`` contains an unsupported name. Such names
            used to be accepted silently and left with an empty result list.
    """
    aggregators = {
        "mean": np.mean,
        "gmean": lambda rates: np.exp(np.mean(np.log(rates))),
    }

    # Materialise once so any iterable can be validated and then reused.
    metrics = list(metrics)
    unknown = [m for m in metrics if m not in aggregators]
    if unknown:
        raise ValueError(
            f"unsupported metric(s) {unknown}; supported: {list(aggregators)}"
        )

    alphas = 1/2 + np.random.gamma(2, size=trees)
    betas = 1/2 + np.random.gamma(0.2, size=trees)

    res = {m: [] for m in metrics}
    for _ in tqdm(range(runs)):
        # One broadcast call draws a rate per tree, element by element in
        # order, instead of one Python-level call per tree.
        anom_rates = np.random.beta(alphas, betas)
        for m in metrics:
            res[m].append(aggregators[m](anom_rates))

    return alphas, betas, res

In [None]:
from scipy.stats import beta

# Run the ensemble simulation and derive, per tree, the usual summaries of
# its Beta(alpha, beta) distribution.
metrics = ["mean", "gmean"]
x = np.linspace(0, 1, 1000)
alphas, betas, res = simulate(metrics)

sampsizes = alphas + betas            # alpha + beta
means = alphas / sampsizes            # alpha / (alpha + beta)
concs = alphas + betas - 2            # alpha + beta - 2
# NOTE(review): (alpha - 1) / (alpha + beta - 2) is the Beta mode only when
# both parameters exceed 1; simulate() draws them as 1/2 + gamma, so values
# below 1 look possible. TODO confirm this is intended.
modes = (alphas - 1) / concs

In [None]:
# Histogram of the simulated aggregate for every metric.
for m, realizations in res.items():
    plt.hist(realizations, bins=100, density=True, alpha=0.7, label=m)

# Pooled summaries used by the Beta approximations below.
mode = np.mean(modes)              # average of the per-tree modes
conc = np.sum(concs)               # summed concentrations
mean = np.mean(means)              # average of the per-tree means
# NOTE(review): this adds 1 per tree before summing, whereas the approach is
# described as "samplesize = sum(samplesize)". TODO confirm which is meant.
sampsize = np.sum(sampsizes + 1)

# (legend label, a, b) of each candidate Beta distribution:
#   "ML"          - a, b from the summed (alpha - 1) and (beta - 1)
#   "mode=<mode>" - pooled mode with summed concentration
#   "mean=<mean>" - pooled mean with summed sample size
approximations = [
    ("ML", 1 + np.sum(alphas - 1), 1 + np.sum(betas - 1)),
    ("mode=<mode>", 1 + conc * mode, 1 + conc * (1 - mode)),
    ("mean=<mean>", sampsize * mean, sampsize * (1 - mean)),
]
for label, a, b in approximations:
    distr = beta(a, b)
    plt.plot(x, distr.pdf(x), label=label)

plt.legend()