In [None]:
using Plots


include("distributions.jl")
include("distances/w_distance.jl") # NOTE(review): possibly no longer needed? — confirm before removing
include("distances/new_distance.jl")
include("distances/distance_Wasserstein.jl")


In [None]:
# Given two laws of random probability measures, sample distances between the
# corresponding hierarchical empirical measures.

function sample_distances(q_1::PPM, q_2::PPM, n::Int, m::Int, s::Int)
    # Draw `s` independent distances between hierarchical empirical measures.
    #   q_1, q_2 : laws from which the empirical measures are generated
    #   n        : number of probability measures from which observations are generated
    #   m        : number of observations from each probability measure
    #   s        : number of distances to sample
    # Returns a Vector{Float64} of length `s`.

    distances = Vector{Float64}(undef, s)
    for rep in eachindex(distances)
        # A fresh pair of empirical measures is generated for every replicate,
        # first from q_1 and then from q_2.
        first_emp = generate_emp(q_1, n, m)
        second_emp = generate_emp(q_2, n, m)
        distances[rep] = dlip(first_emp, second_emp)
    end
    return distances
end

Now let us write the threshold function obtained via Rademacher complexity for the HIPM.

In [None]:
"""
    c_1(n, θ, k)

Return the sum of the four terms `a + b + c + d` computed below; it is one of the
two building blocks of `rademacher_threshold`.

# Arguments
- `n`: number of probability measures from which observations are generated.
- `θ`: probability level for controlling type I error.
- `k`: diameter of the sample.

Any `Integer` `n` and any `Real` `θ`, `k` are accepted (previously only `Int` and
`Float64`); for `Float64` inputs the result is unchanged.
"""
function c_1(n::Integer, θ::Real, k::Real)
    # Both quantities appear in several terms, so compute them once.
    log_inv_θ = log(1 / θ)
    root_log2 = sqrt(log(2))

    a = 1280 * k * root_log2 / sqrt(n)
    b = sqrt(4 * k^2 * log_inv_θ / n)
    c = 4 * k / (3 * n) * log_inv_θ
    d = 32 * sqrt(10 * k^2 * log_inv_θ * root_log2) / (n^(3 / 4))
    return a + b + c + d
end

"""
    c_2(n, m, θ, k)

Return the sum of the two terms `a + b` computed below; it is one of the two
building blocks of `rademacher_threshold`.

# Arguments
- `n`: number of probability measures from which observations are generated.
- `m`: number of observations from each probability measure.
- `θ`: probability level for controlling type I error.
- `k`: diameter of the sample.

Any `Integer` `n`, `m` and any `Real` `θ`, `k` are accepted (previously only `Int`
and `Float64`); for `Float64` inputs the result is unchanged.
"""
function c_2(n::Integer, m::Integer, θ::Real, k::Real)
    a = 256 * k * sqrt(log(2)) / sqrt(m)  # term depending on m only
    b = sqrt(k^2 * log(1 / θ) / (2 * n))  # term depending on θ and n
    return a + b
end


"""
    rademacher_threshold(n, m, θ, k)

Return `2 * (2 * c_2(n, m, θ/2, k) + c_1(n, θ/2, k))`.

# Arguments
- `n`: number of probability measures from which observations are generated.
- `m`: number of observations from each probability measure.
- `θ`: probability level for controlling type I error; `c_1` and `c_2` are each
  evaluated at `θ/2`.
- `k`: diameter of the sample.

Any `Integer` `n`, `m` and any `Real` `θ`, `k` are accepted. They are converted to
`Int` / `Float64` before delegating, so the result is computed in `Float64`; for
`Int` / `Float64` inputs the result is unchanged.
"""
function rademacher_threshold(n::Integer, m::Integer, θ::Real, k::Real)
    # Convert up front so that c_1 / c_2 always receive Int and Float64 arguments.
    n_int, m_int = Int(n), Int(m)
    half_level = Float64(θ) / 2
    diameter = Float64(k)

    observation_term = c_2(n_int, m_int, half_level, diameter)
    measure_term = c_1(n_int, half_level, diameter)
    return 2 * (2 * observation_term + measure_term)
end


Let us compare the behaviour of the empirical and theoretical thresholds as $n, m$ increase, for a fixed probability level $\theta$.

In [None]:
"""
    emp_vs_theor_fixed_theta(q_1, q_2, θ, s; ns_emp = 50:200:5000, ns_radem = 50:50:5000)

Compare empirical and Rademacher thresholds as `n = m` grows, for a fixed level `θ`.

For every `n` in `ns_emp`, `s` distances are drawn with
`sample_distances(q_1, q_2, n, n, s)` and the empirical threshold is their
`1 - θ` quantile. For every `n` in `ns_radem`, the theoretical threshold is
`rademacher_threshold(n, n, θ, k)` with `k = q_1.b - q_1.a`. The logarithms of both
are drawn on one plot.

# Arguments
- `q_1`, `q_2`: laws of RPM; their `a` and `b` fields must coincide.
- `θ`: probability level for type I error.
- `s`: number of times to sample the distance between hierarchical measures.

# Keywords
- `ns_emp`: values of `n = m` at which empirical thresholds are computed.
- `ns_radem`: values of `n = m` at which Rademacher thresholds are computed.

# Returns
`(emp_thresholds, radem_thresholds, emp_radem_plot)`.
"""
function emp_vs_theor_fixed_theta(
    q_1::PPM,
    q_2::PPM,
    θ::Float64,
    s::Int;
    ns_emp::AbstractVector{Int} = 50:200:5000,
    ns_radem::AbstractVector{Int} = 50:50:5000,
)
    # Check the sample spaces before deriving anything from them.
    @assert (q_1.b == q_2.b) && (q_1.a == q_2.a) "sample space should be same"
    k = q_1.b - q_1.a

    # Empirical thresholds: this is the expensive part, so report progress per n.
    emp_thresholds = zeros(length(ns_emp))
    for (i, n) in enumerate(ns_emp)
        println("getting empirical threshold for n = m = $n")
        d_lips = sample_distances(q_1, q_2, n, n, s)
        emp_thresholds[i] = quantile(d_lips, 1 - θ)
    end

    # Rademacher thresholds are closed-form, so a finer grid is cheap.
    radem_thresholds = Float64[rademacher_threshold(n, n, θ, k) for n in ns_radem]

    emp_radem_plot = plot(xlabel = "n = m", ylabel ="log thresholds", title = "Empirical vs Rademacher thresholds, θ = $θ", legend = :topright)
    plot!(emp_radem_plot, ns_radem, log.(radem_thresholds), label = "Rademacher thresholds", lw = 2)
    scatter!(emp_radem_plot, ns_emp, log.(emp_thresholds), label = "Empirical thresholds")
    return emp_thresholds, radem_thresholds, emp_radem_plot
end



In [None]:
"""
    probability(baseMeasure::String)

Draw one observation from the base measure selected by `baseMeasure`.

- `"same"`: uniform on `[-1/2, 1/2)`.
- `"splitting"`: with equal probability, uniform on `[-1, -0.75)` or on
  `[0.75, 1)`, i.e. a sample close to -1 or close to 1.

# Throws
- `ArgumentError` if `baseMeasure` is neither `"same"` nor `"splitting"`
  (previously such a call silently returned `nothing`).
"""
function probability(baseMeasure::String)
    if baseMeasure == "same"
        return rand() - 0.5
    elseif baseMeasure == "splitting"
        # Draw order (atom, then mixture) is kept so seeded runs are reproducible.
        atom = rand()
        mixture = rand((0, 1))  # 1 selects the branch near -1, 0 the branch near 1
        return mixture * (-1.0 + 0.25 * atom) + (1 - mixture) * (0.75 + 0.25 * atom)
    end
    throw(ArgumentError(
        "unknown baseMeasure \"$baseMeasure\"; expected \"same\" or \"splitting\"",
    ))
end

# Simulations for the Dirichlet case: compare empirical and Rademacher thresholds
# over a grid of n = m at a fixed level θ.

# Seed the global RNG so that repeated runs of this cell draw the same samples.
Random.seed!(123456)


# First argument of the two DP(...) laws built below.
α_1, α_2 = 1.0, 2.0
# Zero-argument samplers wrapping `probability`.
# NOTE(review): P_0_1 is defined but not used in this cell; both laws below use
# P_0_2 — confirm this is intended.
P_0_1 = ()->probability("same")
P_0_2 = ()->probability("splitting")

# Passed to DP as its last two arguments; presumably stored as the `a` / `b`
# fields that emp_vs_theor_fixed_theta reads — TODO confirm.
a, b = -1.0, 1.0

# The two laws differ only in their first argument (α_1 vs α_2).
q_1 = DP(α_1, P_0_2, a, b)
q_2 = DP(α_2, P_0_2, a, b)

# s: number of distances sampled per n. With s = 1 each empirical threshold is a
# quantile of a single sampled distance.
s = 1
# θ: probability level for type I error.
θ = 0.15

emp_thresholds, radem_thresholds, emp_radem_plot_fixed_theta = emp_vs_theor_fixed_theta(q_1, q_2, θ, s)

In [None]:
# Show the plot returned by emp_vs_theor_fixed_theta as this cell's result.
emp_radem_plot_fixed_theta

In [None]:
# Save the fixed-θ comparison plot under ./ch4 relative to the working directory.
filepath = joinpath(pwd(), "ch4")
mkpath(filepath)  # create the output directory if missing; no-op when it already exists
savefig(emp_radem_plot_fixed_theta, joinpath(filepath, "emp_radem_plot_fixed_theta.png"))


Next, we compare the empirical and theoretical thresholds as functions of $\theta$, for fixed $n$ and $m$.

In [None]:
"""
    emp_vs_theor_vary_theta(q_1, q_2, n, m, s; θs = 0.01:0.01:1.0)

Compare empirical and Rademacher thresholds across probability levels `θ`, for
fixed `n` and `m`.

`s` distances are drawn once with `sample_distances(q_1, q_2, n, m, s)`; the
empirical threshold at level `θ` is their `1 - θ` quantile. The theoretical
threshold is `rademacher_threshold(n, m, θ, k)` with `k = q_1.b - q_1.a`. The
logarithms of both are drawn on one plot.

# Arguments
- `q_1`, `q_2`: laws of RPM; their `a` and `b` fields must coincide.
- `n`: number of probability measures from which observations are generated.
- `m`: number of observations from each probability measure.
- `s`: number of times to sample the distance between hierarchical measures.

# Keywords
- `θs`: probability levels at which both thresholds are evaluated.

# Returns
`(emp_thresholds, radem_thresholds, emp_radem_plot)`.
"""
function emp_vs_theor_vary_theta(
    q_1::PPM,
    q_2::PPM,
    n::Int,
    m::Int,
    s::Int;
    θs::AbstractVector{Float64} = 0.01:0.01:1.0,
)
    # Check the sample spaces before deriving anything from them.
    @assert (q_1.b == q_2.b) && (q_1.a == q_2.a) "sample space should be same"
    k = q_1.b - q_1.a

    # Materialise the levels so `1 .- levels` is computed element by element.
    levels = collect(θs)

    d_lips = sample_distances(q_1, q_2, n, m, s)
    emp_thresholds = quantile(d_lips, 1 .- levels)
    radem_thresholds = rademacher_threshold.(n, m, levels, k)

    # The title used to claim "n = m" unconditionally; report both sizes when they differ.
    sizes_label = n == m ? "n = m = $n" : "n = $n, m = $m"

    emp_radem_plot = plot(xlabel = "θ", ylabel ="log thresholds", title = "Empirical vs Rademacher thresholds, $sizes_label", legend = :topright)
    plot!(emp_radem_plot, levels, log.(emp_thresholds), label = "Empirical thresholds", lw = 2)
    plot!(emp_radem_plot, levels, log.(radem_thresholds), label = "Rademacher thresholds", lw = 2)
    return emp_thresholds, radem_thresholds, emp_radem_plot
end


In [None]:
"""
    probability(baseMeasure::String)

Draw one observation from the base measure selected by `baseMeasure`.

- `"same"`: uniform on `[-1/2, 1/2)`.
- `"splitting"`: with equal probability, uniform on `[-1, -0.75)` or on
  `[0.75, 1)`, i.e. a sample close to -1 or close to 1.

# Throws
- `ArgumentError` if `baseMeasure` is neither `"same"` nor `"splitting"`
  (previously such a call silently returned `nothing`).
"""
function probability(baseMeasure::String)
    if baseMeasure == "same"
        return rand() - 0.5
    elseif baseMeasure == "splitting"
        # Draw order (atom, then mixture) is kept so seeded runs are reproducible.
        atom = rand()
        mixture = rand((0, 1))  # 1 selects the branch near -1, 0 the branch near 1
        return mixture * (-1.0 + 0.25 * atom) + (1 - mixture) * (0.75 + 0.25 * atom)
    end
    throw(ArgumentError(
        "unknown baseMeasure \"$baseMeasure\"; expected \"same\" or \"splitting\"",
    ))
end

# Simulations for the Dirichlet case: compare empirical and Rademacher thresholds
# across levels θ at fixed n and m.

# Seed the global RNG so that repeated runs of this cell draw the same samples.
Random.seed!(123456)


# First argument of the two DP(...) laws built below.
α_1, α_2 = 1.0, 2.0
# Zero-argument samplers wrapping `probability`.
# NOTE(review): P_0_1 is defined but not used in this cell; both laws below use
# P_0_2 — confirm this is intended.
P_0_1 = ()->probability("same")
P_0_2 = ()->probability("splitting")

# Passed to DP as its last two arguments; presumably stored as the `a` / `b`
# fields that emp_vs_theor_vary_theta reads — TODO confirm.
a, b = -1.0, 1.0

# The two laws differ only in their first argument (α_1 vs α_2).
q_1 = DP(α_1, P_0_2, a, b)
q_2 = DP(α_2, P_0_2, a, b)


# n: number of probability measures; m: observations per measure.
n = 500
m = 500
# s: number of distances sampled. With s = 1 every empirical quantile is taken
# over a single sampled distance.
s = 1


emp_th, rad_th, emp_radem_plot_vary_theta = emp_vs_theor_vary_theta(q_1, q_2, n, m, s)

In [None]:
# Show the plot returned by emp_vs_theor_vary_theta as this cell's result.
emp_radem_plot_vary_theta

In [None]:
# Save the varying-θ comparison plot under ./ch4 relative to the working directory.
filepath = joinpath(pwd(), "ch4")
mkpath(filepath)  # create the output directory if missing; no-op when it already exists
savefig(emp_radem_plot_vary_theta, joinpath(filepath, "emp_radem_plot_vary_theta.png"))
