In [1]:
using Distributions
using StatsPlots
default(fmt=:png, titlefontsize=8, legendfontsize=7, guidefontsize=8, tickfontsize=6)
using Random
using KernelDensity
using QuadGK
using StatsBase

"""
    safediv(x, y)

Return `x / y`, or zero when `y` is zero (instead of `Inf`/`NaN`).

The zero returned in the guarded branch has the same type as the quotient, so
the result type does not depend on the *value* of `y` (e.g. `safediv(1, 0)`
is `0.0`, not the integer `0`).
"""
safediv(x, y) = iszero(y) ? zero(x / oneunit(y)) : x / y

"""
    myquantile(y, Y, Z, p; δ=log2(y)/4)

Return the `p`-quantile of those entries of `Z` whose matching entry of `Y`
lies strictly inside the window `(y - δ, y + δ)`.

Returns `NaN` when no entry of `Y` falls inside the window.
"""
function myquantile(y, Y, Z, p; δ=log2(y)/4)
    lower, upper = y - δ, y + δ
    # Elementwise membership test for the open interval (lower, upper).
    inwindow = (lower .< Y) .& (Y .< upper)
    selected = Z[inwindow]
    return isempty(selected) ? NaN : quantile(selected, p)
end

# Skewness of `dist`. The generic method defers to `skewness`; mixture models
# are routed to the numerical-integration fallback `_myskewness`.
myskewness(dist) = skewness(dist)
myskewness(dist::MixtureModel) = _myskewness(dist)

# Integrate the standardized third power ((x - μ)/σ)^3 against the pdf of
# `dist` over `extrema(dist)` using `quadgk`, and return the integral estimate.
function _myskewness(dist)
    center = mean(dist)
    scale = std(dist)
    lo, hi = extrema(dist)
    integrand = x -> ((x - center)/scale)^3 * pdf(dist, x)
    integral, _ = quadgk(integrand, lo, hi)
    return integral
end

# Kurtosis of `dist`. The generic method defers to `kurtosis`; mixture models
# are routed to the numerical-integration fallback `_mykurtosis`.
mykurtosis(dist) = kurtosis(dist)
mykurtosis(dist::MixtureModel) = _mykurtosis(dist)

# Integrate the standardized fourth power ((x - μ)/σ)^4 against the pdf of
# `dist` over `extrema(dist)` using `quadgk`, then subtract 3.
function _mykurtosis(dist)
    center = mean(dist)
    scale = std(dist)
    lo, hi = extrema(dist)
    integrand = x -> ((x - center)/scale)^4 * pdf(dist, x)
    integral, _ = quadgk(integrand, lo, hi)
    return integral - 3
end

"""
    rd(x; digits=4)

Round `x` to `digits` decimal places (4 by default).
"""
function rd(x; digits=4)
    return round(x; digits=digits)
end

"""
    name(dist)

Return `string(dist)` with every `{...}` span removed (the match is greedy, so
it runs from the first `{` to the last `}` on a line).
"""
function name(dist)
    label = string(dist)
    return replace(label, r"\{.*\}" => "")
end
# Label for a mixture model: each term is the weight from `probs(dist)`
# immediately followed by the `name` of the matching entry of
# `components(dist)`; the terms are joined with "+".
function name(dist::MixtureModel)
    parts = components(dist)
    weights = probs(dist)
    term(k) = string(weights[k]) * name(parts[k])
    # Start from the first term and append the remaining ones left to right.
    return foldl((acc, k) -> acc * "+" * term(k), 2:length(weights); init=term(1))
end

"""
    mcsim(; dist = Beta(0.2, 0.3), n = 20, L = 10^6)

Monte Carlo simulation of standardized sample statistics.

Draws `L` samples of size `n` from `dist` and computes, for each sample with
sample mean `X̄` and unbiased sample variance `S²`,

- `Z  = √n (X̄ - μ)/σ`
- `X² = (n - 1) S²/σ²`
- `T  = Z / √(X²/(n - 1))`
- `T² = T^2`

where `μ = mean(dist)` and `σ = std(dist)`.

# Keywords
- `dist`: population distribution; must support `mean`, `std` and `rand!`.
- `n`: size of each sample.
- `L`: number of simulated samples.

# Returns
The named tuple `(; dist, n, Z, X², T, T²)`; `Z`, `X²`, `T`, `T²` are vectors
of length `L`.
"""
function mcsim(;
        dist = Beta(0.2, 0.3),
        n = 20,
        L = 10^6,
    )
    μ, σ = mean(dist), std(dist)
    Z = Vector{Float64}(undef, L) # expected to follow Normal(0,1)
    X² = similar(Z)               # expected to follow Chisq(n-1)
    # Split 1:L into contiguous chunks and give every task its own sample
    # buffer. Indexing a shared buffer array by `Threads.threadid()` is not
    # safe: tasks may migrate between threads, and thread ids can exceed
    # `Threads.nthreads()` when interactive threads are present.
    chunksize = max(1, cld(L, Threads.nthreads()))
    @sync for chunk in Iterators.partition(1:L, chunksize)
        Threads.@spawn begin
            X = Vector{Float64}(undef, n) # task-local buffer
            for i in chunk
                rand!(dist, X)
                X̄ = mean(X) # sample mean
                S² = var(X) # sample unbiased variance
                Z[i] = √n * (X̄ - μ) / σ
                X²[i] = (n - 1) * S²/σ^2
            end
        end
    end
    T = @. Z / √(X²/(n - 1)) # expected to follow TDist(n-1)
    T² = @. T^2              # expected to follow FDist(1, n-1)
    return (; dist, n, Z, X², T, T²)
end

"""
    plot_samplestats(; dist, n, L, scattermax, T²span, Zspan, X²span, kwargs...)

Simulate sample statistics with `mcsim`, print a short summary, and draw four
panels that are combined into one figure:

- `P`: scatter plot of `(Z, X²)`,
- `Q`: heatmap of a kernel estimate of the conditional density of `Z` given `X²`,
- `R`: histogram of `X²` against the density of `Chisq(n-1)`,
- `S`: histogram of the upper part of `T²` against the density of `FDist(1, n-1)`.

# Keywords
- `dist`, `n`, `L`: passed on to `mcsim`.
- `scattermax`: maximum number of points drawn in the scatter plot.
- `T²span`: quantile levels of `T²` that bound the x-axis of panel `S`.
- `Zspan`, `X²span`: quantile levels of `Z` and `X²` that bound the axes of
  panels `P` and `Q`.
- `kwargs...`: forwarded to the final `plot!` call.

The value of the final `plot!` call is returned.
"""
function plot_samplestats(;
        dist = Normal(1, 2),
        n = 10,
        L = 10^6,
        scattermax = 10^5,
        T²span = (0.75, 0.999),
        Zspan = (0.001, 0.999), 
        X²span = (0.001, 0.999), 
        kwargs...
    )
    (; dist, n, Z, X², T, T²) = mcsim(; dist, n, L)
    distname = name(dist)
    sk = myskewness(dist)
    ku = mykurtosis(dist)
    fdist = FDist(1, n-1)
    
    # Text summary: empirical 95% point of T² (as a square root) and the
    # empirical probability that T² exceeds the 95% point of FDist(1, n-1).
    println(L, " samples with size n = ", n, " of ", distname)
    println("skewness, kurtosis = ", rd(sk), ", ", rd(ku))
    for p in (0.95,)# 0.99)
        println("√quantile(T², $p) = ", rd(√quantile(T², p)), ",  ")
        println("P(|T| > √quantile(FDist(1, $(n-1)), $p) = ", rd(√quantile(fdist, p)), ") = ",  rd(1 - ecdf(T²)(quantile(fdist, p))))
    end
    
    # Axis limits: empirical quantiles of Z and X² at the levels in Zspan / X²span.
    Zlim = quantile.(Ref(Z), Zspan)
    X²lim = quantile.(Ref(X²), X²span)
    
    # Kernel density estimates. g is the joint density of (Z, X²) divided by the
    # marginal density of X²; safediv yields zero where the denominator is zero.
    kdeX² = InterpKDE(kde(X²))
    kdeZX² = InterpKDE(kde((Z, X²)))
    g(z, x²) = safediv(pdf(kdeZX², z, x²), pdf(kdeX², x²))
    kdeT² = InterpKDE(kde(T²))
    h(t²) = pdf(kdeT², t²)
    
    # Panel P: scatter of at most `scattermax` (Z, X²) pairs, with dotted
    # reference lines at Z = 0 and X² = n - 1.
    P = plot(; colorbar=false)
    plot!(; xlabel="Z = √n(X̄ - μ)/σ", ylabel="X² = (n - 1)S²/σ²", xlim=Zlim, ylim=X²lim)
    scatter!(Z[1:min(end, scattermax)], X²[1:min(end, scattermax)]; alpha=0.3, msw=0, ms=1, label="")
    vline!([0]; label="", ls=:dot, c=:red)
    hline!([n-1]; label="", ls=:dot, c=:red)
    title!("$distname, n=$n")
    
    # Panel Q: heatmap of g on a 200×200 grid; the cyan curves are the 5% and
    # 95% quantiles of Z among the samples whose X² is close to each grid value
    # (see myquantile).
    z = range(Zlim..., 200)
    x² = range(X²lim..., 200)
    z05 = (x² -> myquantile(x², X², Z, 0.05)).(x²)
    z95 = (x² -> myquantile(x², X², Z, 0.95)).(x²)
    Q = plot(; colorbar=false)
    plot!(; xlabel="Z = √n(X̄ - μ)/σ conditioned by X²", ylabel="X² = (n - 1)S²/σ²", xlim=Zlim, ylim=X²lim)
    heatmap!(z, x², g)
    plot!(z05, x²; label="", c=:cyan)
    plot!(z95, x²; label="", c=:cyan)
    vline!([0]; label="", ls=:dot, c=:pink)
    hline!([n-1]; label="", ls=:dot, c=:pink)
    title!("p(z|x²)")
    
    # Panel R: normalized histogram of X² against the density of Chisq(n-1).
    chisqdist = Chisq(n-1)
    xlim = (max(0.0, quantile(X², 0.005) - 10), max(quantile(X², 0.995), quantile(chisqdist, 0.999)))
    R = plot(; xlabel="X² = (n - 1)S²/σ²", ylabel="density", xlim)
    histogram!(X²; norm=true, alpha=0.3, bin=range(xlim..., 100), label="X²")
    plot!(chisqdist, xlim...; label="Chisq($(n-1))", lw=1.5)
    vline!([n-1]; label="", ls=:dot, c=:black)
    title!("sample of X² = (n - 1)S²/σ²")
    
    # Panel S: normalized histogram of T² restricted to the T²span window,
    # against the density of FDist(1, n-1). The bin edges start at 0 and their
    # number is scaled so that about 100 of them fall inside the window.
    xlim = quantile.(Ref(T²), T²span)
    bin = range(0, last(xlim), round(Int, 100last(xlim)/(last(xlim) - first(xlim))))
    ymax = maximum(x -> max(pdf(fdist, x), h(x)), range(xlim..., 100))
    ylim = (-0.03ymax, 1.05ymax)
    S = plot(; xlabel="T² where T = √n(X̄ - μ)/S", ylabel="density", xlim, ylim)
    histogram!(T²; norm=true, alpha=0.3, bin=bin, label="T²")
    plot!(fdist, xlim...; label="FDist(1, $(n-1))", lw=1.5)
    vline!([quantile(T², 0.95)]; label="95% line of T²", c=1, ls=:dot)
    vline!([quantile(fdist, 0.95)]; label="95% line of FDist", c=2, ls=:dot)
    title!("tail (> $(100first(T²span))%) of sample of T²")
    
    # Combine the four panels and apply margins plus any caller-supplied options.
    plot(P, Q, R, S; size=(800, 600))
    plot!(leftmargin=3Plots.mm, bottommargin=3Plots.mm, kwargs...)
end

plot_samplestats (generic function with 1 method)

In [2]:
"""
    mcsimZW(; dist = Beta(0.2, 0.3), n = 20, L = 10^6)

Monte Carlo simulation of the standardized sample mean and sample variance.

Draws `L` samples of size `n` from `dist` and computes, for each sample with
sample mean `X̄` and unbiased sample variance `S²`,

- `Z = √n (X̄ - μ)/σ`
- `W = √n (S²/σ² - 1)`

where `μ = mean(dist)` and `σ = std(dist)`.

# Keywords
- `dist`: population distribution; must support `mean`, `std` and `rand!`.
- `n`: size of each sample.
- `L`: number of simulated samples.

# Returns
The named tuple `(; dist, n, Z, W)`; `Z` and `W` are vectors of length `L`.
"""
function mcsimZW(;
        dist = Beta(0.2, 0.3),
        n = 20,
        L = 10^6,
    )
    μ, σ = mean(dist), std(dist)
    Z = Vector{Float64}(undef, L) # expected to follow Normal(0,1)
    W = similar(Z)
    # Split 1:L into contiguous chunks and give every task its own sample
    # buffer. Indexing a shared buffer array by `Threads.threadid()` is not
    # safe: tasks may migrate between threads, and thread ids can exceed
    # `Threads.nthreads()` when interactive threads are present.
    chunksize = max(1, cld(L, Threads.nthreads()))
    @sync for chunk in Iterators.partition(1:L, chunksize)
        Threads.@spawn begin
            X = Vector{Float64}(undef, n) # task-local buffer
            for i in chunk
                rand!(dist, X)
                X̄ = mean(X) # sample mean
                S² = var(X) # sample unbiased variance
                Z[i] = √n * (X̄ - μ) / σ
                W[i] = √n * (S²/σ^2 - 1)
            end
        end
    end
    return (; dist, n, Z, W)
end

"""
    check_varcov(; dist = Beta(0.2, 0.3), n = 5, L = 10^8)

Compare Monte Carlo estimates of means, variances and covariances of `Z`, `W`
and `Z²` (simulated by `mcsimZW`) with closed-form expressions written in
terms of `κ₃ = myskewness(dist)` and `κ₄ = mykurtosis(dist)`.

Prints `dist`, `κ₃` and `κ₄`, and returns a 7×4 matrix with one row per
quantity: label, simulated value, exact value, simulated minus exact.
"""
function check_varcov(;
        dist = Beta(0.2, 0.3),
        n = 5,
        L = 10^8,
    )
    sim = mcsimZW(; dist, n, L)
    Z, W = sim.Z, sim.W
    Z² = Z .^ 2
    κ₃ = myskewness(dist)
    κ₄ = mykurtosis(dist)

    @show dist
    @show κ₃
    @show κ₄

    # (label, simulated value, closed-form value)
    rows = (
        (:meanZ,  mean(Z),    0.0),
        (:meanW,  mean(W),    0.0),
        (:varZ,   var(Z),     1.0),
        (:covZW,  cov(Z, W),  κ₃),
        (:varW,   var(W),     κ₄ + 2n/(n-1)),
        (:covZ²W, cov(Z², W), κ₄/√n),
        (:varZ²,  var(Z²),    κ₄/n + 2),
    )
    result = Matrix{Any}(undef, length(rows), 4)
    for (i, (label, simulated, exact)) in enumerate(rows)
        result[i, 1] = label
        result[i, 2] = simulated
        result[i, 3] = exact
        result[i, 4] = simulated - exact
    end
    return result
end

check_varcov (generic function with 1 method)

In [3]:
# Baseline: normal population (κ₃ = κ₄ = 0 in the output below).
check_varcov(dist = Normal(1, 2), n = 5)

dist = Normal{Float64}(μ=1.0, σ=2.0)
κ₃ = 0.0
κ₄ = 0.0


7×4 Matrix{Any}:
 :meanZ    9.45488e-5   0.0   9.45488e-5
 :meanW   -0.000109494  0.0  -0.000109494
 :varZ     1.00006      1.0   6.0009e-5
 :covZW    0.0002197    0.0   0.0002197
 :varW     2.49948      2.5  -0.00051887
 :covZ²W  -0.000424565  0.0  -0.000424565
 :varZ²    2.00084      2.0   0.000841716

In [4]:
# Skewed population with negative κ₄ (see the printed κ₃, κ₄ below).
check_varcov(dist = Beta(0.2, 0.3), n = 5)

dist = Beta{Float64}(α=0.2, β=0.3)
κ₃ = 0.3999999999999999
κ₄ = -1.542857142857143


7×4 Matrix{Any}:
 :meanZ   -7.138e-5     0.0       -7.138e-5
 :meanW   -2.91878e-5   0.0       -2.91878e-5
 :varZ     0.999876     1.0       -0.000124479
 :covZW    0.399923     0.4       -7.72633e-5
 :varW     0.956972     0.957143  -0.000170479
 :covZ²W  -0.689719    -0.689987   0.000268091
 :varZ²    1.69082      1.69143   -0.000606334

In [5]:
# Heavy-tailed population (κ₄ = 12). The simulated varW, covZ²W and varZ² are
# visibly off from the exact values in the output below.
# NOTE(review): presumably slow Monte Carlo convergence due to the heavy
# tails — confirm.
check_varcov(dist = TDist(4.5), n = 5)

dist = TDist{Float64}(ν=4.5)
κ₃ = 0.0
κ₄ = 12.0


7×4 Matrix{Any}:
 :meanZ   -0.000129484   0.0      -0.000129484
 :meanW    7.95634e-5    0.0       7.95634e-5
 :varZ     1.00004       1.0       4.24114e-5
 :covZW    0.0177433     0.0       0.0177433
 :varW    15.4681       14.5       0.968053
 :covZ²W   5.81034       5.36656   0.443778
 :varZ²    4.60618       4.4       0.206183

In [6]:
# Heavy-tailed population (κ₄ ≈ 10). The simulated varW, covZ²W and varZ² are
# again visibly off from the exact values in the output below.
# NOTE(review): presumably slow Monte Carlo convergence due to the heavy
# tails — confirm.
check_varcov(dist = TDist(4.6), n = 5)

dist = TDist{Float64}(ν=4.6)
κ₃ = 0.0
κ₄ = 10.000000000000005


7×4 Matrix{Any}:
 :meanZ   -6.41269e-5   0.0      -6.41269e-5
 :meanW    0.00021099   0.0       0.00021099
 :varZ     1.00003      1.0       3.20309e-5
 :covZW    0.0019016    0.0       0.0019016
 :varW    11.2027      12.5      -1.29734
 :covZ²W   3.89695      4.47214  -0.575189
 :varZ²    3.7425       4.0      -0.257498

In [7]:
# Student t with ν = 5 (κ₄ = 6): the discrepancies in varW, covZ²W and varZ²
# are smaller than for ν = 4.5 and 4.6 but still larger than in the
# light-tailed cases.
check_varcov(dist = TDist(5.0), n = 5)

dist = TDist{Float64}(ν=5.0)
κ₃ = 0.0
κ₄ = 6.0


7×4 Matrix{Any}:
 :meanZ   -9.17194e-5   0.0      -9.17194e-5
 :meanW   -0.000155134  0.0      -0.000155134
 :varZ     1.00001      1.0       8.79713e-6
 :covZW   -0.00182944   0.0      -0.00182944
 :varW     8.27791      8.5      -0.222085
 :covZ²W   2.58411      2.68328  -0.0991733
 :varZ²    3.15515      3.2      -0.0448527

In [8]:
# Symmetric population with positive κ₄ (κ₃ = 0, κ₄ = 3 in the output below).
check_varcov(dist = Laplace(), n = 5)

dist = Laplace{Float64}(μ=0.0, θ=1.0)
κ₃ = 0.0
κ₄ = 3.0


7×4 Matrix{Any}:
 :meanZ    5.4649e-5    0.0       5.4649e-5
 :meanW   -0.000332864  0.0      -0.000332864
 :varZ     0.999768     1.0      -0.00023169
 :covZW    0.000223478  0.0       0.000223478
 :varW     5.49636      5.5      -0.00363861
 :covZ²W   1.34063      1.34164  -0.0010089
 :varZ²    2.59846      2.6      -0.00154342

In [9]:
# Skewed population (κ₃ = 2, κ₄ = 6 in the output below).
check_varcov(dist = Exponential(), n = 5)

dist = Exponential{Float64}(θ=1.0)
κ₃ = 2.0
κ₄ = 6.0


7×4 Matrix{Any}:
 :meanZ   4.54152e-5  0.0       4.54152e-5
 :meanW   4.53059e-5  0.0       4.53059e-5
 :varZ    1.00009     1.0       9.35006e-5
 :covZW   1.9999      2.0      -0.000101841
 :varW    8.49786     8.5      -0.00213706
 :covZ²W  2.68244     2.68328  -0.000842399
 :varZ²   3.20008     3.2       7.87109e-5

In [10]:
# Skewed population (κ₃ ≈ 1.414, κ₄ = 3 in the output below).
check_varcov(dist = Gamma(2, 1), n = 5)

dist = Gamma{Float64}(α=2.0, θ=1.0)
κ₃ = 1.414213562373095
κ₄ = 3.0


7×4 Matrix{Any}:
 :meanZ   -0.000119964  0.0      -0.000119964
 :meanW    4.13685e-5   0.0       4.13685e-5
 :varZ     0.999975     1.0      -2.49944e-5
 :covZW    1.41412      1.41421  -9.7001e-5
 :varW     5.50124      5.5       0.00124094
 :covZ²W   1.34056      1.34164  -0.00107967
 :varZ²    2.59918      2.6      -0.000817381

In [11]:
# Strongly skewed population with very large κ₄ (κ₃ ≈ 6.18, κ₄ ≈ 110.9 in the
# output below).
check_varcov(dist = LogNormal(), n = 5)

dist = LogNormal{Float64}(μ=0.0, σ=1.0)
κ₃ = 6.184877138632554
κ₄ = 110.9363921763115


7×4 Matrix{Any}:
 :meanZ    -2.14904e-5     0.0      -2.14904e-5
 :meanW    -0.000114033    0.0      -0.000114033
 :varZ      0.999681       1.0      -0.000319468
 :covZW     6.18873        6.18488   0.00385054
 :varW    112.772        113.436    -0.664572
 :covZ²W   49.2161        49.6123   -0.396145
 :varZ²    23.9617        24.1873   -0.22557