In [1]:
include("methods_for_simulations.jl")





savefig_for_rejections_hipm_wow (generic function with 3 methods)

# How many permutations?

# Permutation or Bootstrap?


We compare the bootstrap and permutation approaches for HIPM and WoW. In particular, we compare them by looking at their Type I and Type II errors.


Let us start with Type I error

In [2]:
function decide_boost(dist::Function, hier_sample_1::emp_ppm, hier_sample_2::emp_ppm, n_boostrap::Int, θ::Float64)
    # Two-sample test decision with a bootstrap threshold.
    #
    # Input:
    #   dist: distance between two hierarchical samples, called as dist(sample_1, sample_2)
    #   hier_sample_1, hier_sample_2: hierarchical samples; both must have the same number of rows
    #   n_boostrap: number of bootstrap replicates used to estimate the threshold
    #   θ: significance level; the threshold is the (1 - θ) quantile of the bootstrap distances
    #
    # Output:
    #   1.0 if the observed distance is strictly above the bootstrap threshold (reject), 0.0 otherwise
    distance_observed = dist(hier_sample_1, hier_sample_2)

    # endpoints of the space of observations might differ between the two samples,
    # so take the smallest interval containing both
    a = min(hier_sample_1.a, hier_sample_2.a)
    b = max(hier_sample_1.b, hier_sample_2.b)

    n_top = hier_sample_1.n
    @assert n_top == hier_sample_2.n "number of rows for both hierarchical samples should be the same"

    # The pooled sample does not change between replicates, so build it once.
    pooled_rows = vcat(hier_sample_1.atoms, hier_sample_2.atoms)

    # Previously `n_bottom` was read from an undeclared global. Take it from the data instead.
    # NOTE(review): assumes the atoms matrix has one column per bottom-level observation — confirm
    # against the definition of emp_ppm.
    n_bottom = size(pooled_rows, 2)

    boostrap_samples = zeros(n_boostrap)
    for i in 1:n_boostrap
        # draw two sets of n_top rows from the pooled sample, with replacement
        indices_1 = sample(1:2*n_top, n_top; replace = true)
        indices_2 = sample(1:2*n_top, n_top; replace = true)

        # define new hierarchical samples from the selected rows
        hier_sample_1_boostrap = emp_ppm(pooled_rows[indices_1, :], n_top, n_bottom, a, b)
        hier_sample_2_boostrap = emp_ppm(pooled_rows[indices_2, :], n_top, n_bottom, a, b)

        boostrap_samples[i] = dist(hier_sample_1_boostrap, hier_sample_2_boostrap)
    end

    threshold_boostrap = quantile(boostrap_samples, 1 - θ)
    return 1.0*(distance_observed > threshold_boostrap)
end



function decide_perm(dist::Function, hier_sample_1::emp_ppm, hier_sample_2::emp_ppm, n_permutations::Int, θ::Float64)
    # Two-sample test decision with a permutation threshold.
    #
    # Input:
    #   dist: distance between two hierarchical samples, called as dist(sample_1, sample_2)
    #   hier_sample_1, hier_sample_2: hierarchical samples; both must have the same number of rows
    #   n_permutations: number of random permutations used to estimate the threshold
    #   θ: significance level; the threshold is the (1 - θ) quantile of the permuted distances
    #
    # Output:
    #   1.0 if the observed distance is strictly above the permutation threshold (reject), 0.0 otherwise
    distance_observed = dist(hier_sample_1, hier_sample_2)

    # endpoints of the space of observations might differ between the two samples,
    # so take the smallest interval containing both
    a = min(hier_sample_1.a, hier_sample_2.a)
    b = max(hier_sample_1.b, hier_sample_2.b)

    n_top = hier_sample_1.n
    @assert n_top == hier_sample_2.n "number of rows for both hierarchical samples should be the same"

    # The pooled sample does not change between permutations, so build it once.
    pooled_rows = vcat(hier_sample_1.atoms, hier_sample_2.atoms)

    # Previously `n_bottom` was read from an undeclared global. Take it from the data instead.
    # NOTE(review): assumes the atoms matrix has one column per bottom-level observation — confirm
    # against the definition of emp_ppm.
    n_bottom = size(pooled_rows, 2)

    permuted_samples = zeros(n_permutations)
    for i in 1:n_permutations
        # shuffle the pooled row indices: the first n_top rows go to the first permuted
        # hierarchical sample, the remaining rows to the second
        random_indices = randperm(2*n_top)
        atoms_1 = pooled_rows[random_indices[1:n_top], :]
        atoms_2 = pooled_rows[random_indices[n_top+1:end], :]

        # define new hierarchical samples
        hier_sample_1_permuted = emp_ppm(atoms_1, n_top, n_bottom, a, b)
        hier_sample_2_permuted = emp_ppm(atoms_2, n_top, n_bottom, a, b)

        permuted_samples[i] = dist(hier_sample_1_permuted, hier_sample_2_permuted)
    end

    threshold_permutation = quantile(permuted_samples, 1 - θ)
    return 1.0*(distance_observed > threshold_permutation)
end



decide_perm (generic function with 1 method)

In [3]:
function rejection_rate_boost_vs_perm(p::PPM, n::Int, m::Int, n_boostrap::Int, n_permutations::Int, θ::Float64, S::Int64)
    # Estimates rejection rates of the bootstrap and permutation tests for the `ww` and `hipm`
    # distances. Both hierarchical samples are generated from the same law p, so each rate is an
    # estimate of the Type I error.
    #
    # Input:
    #   p: law of RPM
    #   n: number rows in hierarchical sample
    #   m: number of columns in hierarchical sample
    #   n_boostrap: number of bootstrap replicates per test
    #   n_permutations: number of permutations per test
    #   θ: significance level
    #   S: number of times we simulate hierarchical samples to estimate Type I error
    #
    # Output (in this order):
    #   rejection rate of ww/bootstrap, ww/permutation, hipm/bootstrap, hipm/permutation
    rej_rates_wow_boost = 0.0
    rej_rates_wow_perm = 0.0
    rej_rates_hipm_boost = 0.0
    rej_rates_hipm_perm = 0.0

    for s in 1:S
        # generate hierarchical samples
        hier_sample_1, hier_sample_2 = generate_emp(p, n, m), generate_emp(p, n, m)

        # Record decisions (each decision is 1.0 for reject, 0.0 otherwise)
        rej_rates_wow_boost += decide_boost(ww, hier_sample_1, hier_sample_2, n_boostrap, θ)
        rej_rates_wow_perm += decide_perm(ww, hier_sample_1, hier_sample_2, n_permutations, θ)

        rej_rates_hipm_boost += decide_boost(hipm, hier_sample_1, hier_sample_2, n_boostrap, θ)
        rej_rates_hipm_perm += decide_perm(hipm, hier_sample_1, hier_sample_2, n_permutations, θ)
    end

    # Every count is normalised by the number of simulations S. The loop index `s` is not
    # visible outside the loop, so it must not be used here.
    return rej_rates_wow_boost/S, rej_rates_wow_perm/S, rej_rates_hipm_boost/S, rej_rates_hipm_perm/S
end

rejection_rate_boost_vs_perm (generic function with 1 method)

We consider several laws of random probability measures, so we will have several Type I errors for each combination of distance function and permutation/bootstrap procedure.

Then we can plot 4 curves for each law of random probability measures.

In [None]:
# Define law of RPMs

q = ...  # TODO: placeholder — fill in the law of the RPM before running this cell


n = 1
m = 1


θ = 0.05

S = 1 # number of simulations to estimate rejection rate

# Convenience method: fix every setting except the law of the RPM.
# NOTE(review): `n_boostrap` and `n_permutation` are not defined in any cell shown here;
# they must be defined before this method is called — confirm their names and values.
rejection_rate_boost_vs_perm(q::PPM) = rejection_rate_boost_vs_perm(q, n, m, n_boostrap, n_permutation, θ, S)

# Define several law of RPMs
qs = [DP() for ...]  # TODO: placeholder — fill in the DP parameters and the iteration range
# Broadcast the one-argument method defined above over all laws
# (the previous call used the misspelled name `rejection_rates_boost_vs_perm`).
all_rejections = rejection_rate_boost_vs_perm.(qs)
