In [None]:
include("methods_for_simulations.jl")





# How many permutations?

Here we decide how many permutations to use for the dlip-based test.

We consider values {25,50,75,100,200,500,1000}

We look at two plots:

1) We fix two different but close laws of RPMs (Dirichlet processes) and look at the rejection rate for each number of permutations.

2) We fix two identical laws of RPMs (still Dirichlet processes) and look at the Type I error.

In [None]:

function decide_perm_dlip(hier_sample_1::emp_ppm, hier_sample_2::emp_ppm, n_permutations::Int, θ::Float64)
    # Decision rule of the two-sample test based on dlip, with the rejection threshold obtained by permutation.
    #
    # Input:
    #      hier_sample_1, hier_sample_2 - two hierarchical samples of type emp_ppm
    #      n_permutations - number of permutation samples to generate
    #      θ - significance level
    #
    # Output:
    #      1.0 if the observed dlip distance is strictly larger than the empirical (1 - θ)-quantile of the
    #      permuted distances (reject), 0.0 otherwise.

    n = hier_sample_1.n
    m = hier_sample_1.m
    # The rows of both samples are pooled with vcat and split back into two groups of n rows,
    # so both samples must have the same shape.
    @assert n == hier_sample_2.n "number of rows for both hierarchical samples should be the same"
    @assert m == hier_sample_2.m "number of columns for both hierarchical samples should be the same"

    distance_observed = dlip(hier_sample_1, hier_sample_2)

    # The endpoints of the space of observations might differ between the two samples,
    # so the permuted samples use the smallest interval containing both.
    a = min(hier_sample_1.a, hier_sample_2.a)
    b = max(hier_sample_1.b, hier_sample_2.b)

    total_rows = vcat(hier_sample_1.atoms, hier_sample_2.atoms) # collect all rows

    # now we obtain the threshold via permutation
    permuted_samples = zeros(n_permutations)
    for i in 1:n_permutations
        random_indices = randperm(2*n) # indices distributing the pooled rows to the new hierarchical samples

        # the first n shuffled rows go to the first permuted sample, the remaining n rows to the second
        atoms_1 = total_rows[random_indices[1:n], :]
        atoms_2 = total_rows[random_indices[n+1:end], :]

        # define new hierarchical samples
        hier_sample_1_permuted = emp_ppm(atoms_1, n, m, a, b)
        hier_sample_2_permuted = emp_ppm(atoms_2, n, m, a, b)

        permuted_samples[i] = dlip(hier_sample_1_permuted, hier_sample_2_permuted)
    end

    threshold_permutation = quantile(permuted_samples, 1 - θ)
    return Float64(distance_observed > threshold_permutation)
end




function rejection_rates_permutations(q_1::PPM, q_2::PPM, n::Int, m::Int, n_permutations::Vector{Int}, θ::Float64, S::Int)
    # Estimate the rejection rate of the dlip permutation test for every permutation budget.
    #
    # Input:
    #      q_1, q_2 - two laws of random probability measures of type PPM
    #      n, m - hierarchical sample sizes
    #      n_permutations - vector with the numbers of permutations to consider
    #      θ - significance level
    #      S - number of simulated pairs of hierarchical samples per entry of n_permutations
    #
    # Output:
    #      vector of the same length as n_permutations; entry k is the fraction of the S simulated
    #      pairs for which decide_perm_dlip rejected when using n_permutations[k] permutations.

    rates = zeros(length(n_permutations))

    for idx in eachindex(n_permutations)
        budget = n_permutations[idx]
        rejections = 0.0

        # fresh hierarchical samples are drawn for every simulation
        for _ in 1:S
            sample_a = generate_emp(q_1, n, m)
            sample_b = generate_emp(q_2, n, m)
            rejections += decide_perm_dlip(sample_a, sample_b, budget, θ)
        end

        rates[idx] = rejections / S
    end

    return rates
end

In [None]:
n = 7 # passed below as the `n` argument of rejection_rates_permutations
m = 1 # NOTE(review): not used by the call below, which passes the literal 2 as `m` - confirm which value is intended

n_permutations = [1] # numbers of permutations to evaluate (a single value here)

θ = 0.05 # significance level
S = 1 # number of simulated pairs of hierarchical samples per number of permutations

# Define Dirichlet processes

α = 5.0 # first argument of DP - presumably the concentration parameter, TODO confirm
P_0 = () -> rand(Beta(1,1)) # zero-argument sampler returning one draw from Beta(1,1)
a,b = 0.0, 1.0 # endpoints passed to DP
q_1 = DP(α, P_0, a,b)
q_2 = DP(α, P_0, a,b) # built with the same arguments as q_1, so both laws coincide
Random.seed!(1234) # fix the seed of the global RNG before simulating
rejections_permutations = rejection_rates_permutations(q_1, q_2, n, 2, n_permutations, θ, S)

In [None]:
rejections_permutations

# Permutation or Bootstrap?


We compare the bootstrap and permutation approaches for dlip and WoW, in particular by looking at their Type I and Type II errors.


In [None]:
function decide_boost_wow_dlip(hier_sample_1::emp_ppm, hier_sample_2::emp_ppm, n_boostrap::Int, θ::Float64)
    # Bootstrap decision rule evaluated for both distances (WoW and dlip) on one pair of hierarchical samples.
    #
    # Input:
    #      hier_sample_1, hier_sample_2 - two hierarchical samples of type emp_ppm
    #      n_boostrap - number of bootstrap samples to generate
    #      θ - significance level
    #
    # Output:
    #      tuple (WoW decision, dlip decision); each entry is 1.0 when the observed distance is strictly
    #      larger than the empirical (1 - θ)-quantile of its bootstrap replicates and 0.0 otherwise.
    #
    # The WoW computation (`ww`) is currently switched off: the observed value and every replicate
    # stay at 0.0, so the WoW decision is always 0.0.

    observed_wow = 0.0
    #observed_wow = ww(hier_sample_1, hier_sample_2)
    observed_dlip = dlip(hier_sample_1, hier_sample_2)

    # storage for the bootstrap replicates of each distance
    replicates_wow = zeros(n_boostrap)
    replicates_dlip = zeros(n_boostrap)

    # the endpoints of the observation space may differ between the samples; take the widest interval
    lower = min(hier_sample_1.a, hier_sample_2.a)
    upper = max(hier_sample_1.b, hier_sample_2.b)

    n_rows = hier_sample_1.n
    n_cols = hier_sample_1.m
    @assert n_rows == hier_sample_2.n "number of rows for both hierarchical samples should be the same"
    @assert n_cols == hier_sample_2.m "number of columns for both hierarchical samples should be the same"

    pooled = vcat(hier_sample_1.atoms, hier_sample_2.atoms) # all rows of both samples stacked

    for rep in eachindex(replicates_dlip)
        # two independent draws of n row indices, with replacement, from the pooled rows
        first_draw = sample(1:2*n_rows, n_rows; replace = true)
        second_draw = sample(1:2*n_rows, n_rows; replace = true)

        resampled_1 = emp_ppm(pooled[first_draw, :], n_rows, n_cols, lower, upper)
        resampled_2 = emp_ppm(pooled[second_draw, :], n_rows, n_cols, lower, upper)

        #replicates_wow[rep] = ww(resampled_1, resampled_2)
        replicates_dlip[rep] = dlip(resampled_1, resampled_2)
    end

    cutoff_wow = quantile(replicates_wow, 1 - θ)
    cutoff_dlip = quantile(replicates_dlip, 1 - θ)

    reject_wow = Float64(observed_wow > cutoff_wow)
    reject_dlip = Float64(observed_dlip > cutoff_dlip)
    return reject_wow, reject_dlip
end



function decide_perm_wow_dlip(hier_sample_1::emp_ppm, hier_sample_2::emp_ppm, n_permutations::Int, θ::Float64)
    # Permutation decision rule evaluated for both distances (WoW and dlip) on one pair of hierarchical samples.
    #
    # Input:
    #      hier_sample_1, hier_sample_2 - two hierarchical samples of type emp_ppm
    #      n_permutations - number of permutation samples to generate
    #      θ - significance level
    #
    # Output:
    #      tuple (WoW decision, dlip decision); each entry is 1.0 when the observed distance is strictly
    #      larger than the empirical (1 - θ)-quantile of its permuted replicates and 0.0 otherwise.
    #
    # The WoW computation (`ww`) is currently switched off: the observed value and every replicate
    # stay at 0.0, so the WoW decision is always 0.0.

    n = hier_sample_1.n
    m = hier_sample_1.m
    # The rows of both samples are pooled with vcat and split back into two groups of n rows,
    # so both samples must have the same shape (same checks as in decide_boost_wow_dlip).
    @assert n == hier_sample_2.n "number of rows for both hierarchical samples should be the same"
    @assert m == hier_sample_2.m "number of columns for both hierarchical samples should be the same"

    wow_observed = 0.0
    #wow_observed = ww(hier_sample_1, hier_sample_2)
    dlip_observed = dlip(hier_sample_1, hier_sample_2)

    # The endpoints of the space of observations might differ between the two samples,
    # so the permuted samples use the smallest interval containing both.
    a = min(hier_sample_1.a, hier_sample_2.a)
    b = max(hier_sample_1.b, hier_sample_2.b)

    total_rows = vcat(hier_sample_1.atoms, hier_sample_2.atoms) # collect all rows

    # now we obtain the thresholds via permutation
    permuted_samples_wow = zeros(n_permutations)
    permuted_samples_dlip = zeros(n_permutations)

    for i in 1:n_permutations
        random_indices = randperm(2*n) # indices distributing the pooled rows to the new hierarchical samples

        # the first n shuffled rows go to the first permuted sample, the remaining n rows to the second
        atoms_1 = total_rows[random_indices[1:n], :]
        atoms_2 = total_rows[random_indices[n+1:end], :]

        # define new hierarchical samples
        hier_sample_1_permuted = emp_ppm(atoms_1, n, m, a, b)
        hier_sample_2_permuted = emp_ppm(atoms_2, n, m, a, b)

        #permuted_samples_wow[i] = ww(hier_sample_1_permuted, hier_sample_2_permuted)
        permuted_samples_dlip[i] = dlip(hier_sample_1_permuted, hier_sample_2_permuted)
    end

    threshold_permutation_wow = quantile(permuted_samples_wow, 1 - θ)
    threshold_permutation_dlip = quantile(permuted_samples_dlip, 1 - θ)
    return Float64(wow_observed > threshold_permutation_wow), Float64(dlip_observed > threshold_permutation_dlip)
end



In [None]:
function rejection_rate_boost_vs_perm(q_1::PPM, q_2::PPM, n::Int, m::Int, n_boostrap::Int, n_permutations::Int, θ::Float64, S::Int64)
    # Estimate rejection rates for one pair of laws of random probability measures, for every combination
    # of distance function (WoW, dlip) and threshold method (bootstrap, permutation).
    # A rejection rate is the number of simulated pairs of hierarchical samples on which H_0 was rejected,
    # divided by the number S of simulated pairs.
    #
    # Input:
    #   q_1: law of RPM
    #   q_2: law of RPM
    #   n: number of rows in a hierarchical sample
    #   m: number of columns in a hierarchical sample
    #   n_boostrap: number of bootstrap samples used for the bootstrap thresholds
    #   n_permutations: number of permutations used for the permutation thresholds
    #   θ: significance level
    #   S: number of simulated pairs of hierarchical samples
    #
    # Output:
    #   4-element vector ordered as [WoW/bootstrap, WoW/permutation, dlip/bootstrap, dlip/permutation].

    # running rejection counts, in the same order as the returned vector
    reject_counts = zeros(4)

    for _ in 1:S
        # generate hierarchical samples; both methods are applied to the same pair
        sample_a = generate_emp(q_1, n, m)
        sample_b = generate_emp(q_2, n, m)

        wow_boot, dlip_boot = decide_boost_wow_dlip(sample_a, sample_b, n_boostrap, θ)
        wow_perm, dlip_perm = decide_perm_wow_dlip(sample_a, sample_b, n_permutations, θ)

        reject_counts[1] += wow_boot
        reject_counts[2] += wow_perm
        reject_counts[3] += dlip_boot
        reject_counts[4] += dlip_perm
    end

    return reject_counts ./ S
end

We consider several pairs of laws of random probability measures, so for each pair we get one rejection rate per distance function and per method (permutation/bootstrap).

Then we can plot the 4 resulting curves (one per distance/method combination) across the pairs of laws.

In [None]:
# Define parameters for hierarchical sample, methods
n = 7 # number of rows in each hierarchical sample
m = 1 # number of columns in each hierarchical sample


n_boostrap = 1 # number of bootstrap samples per test
n_permutation = n_boostrap # the permutation approach uses the same number of resamples


θ = 0.05 # significance level

S = 1 # number of simulations to estimate rejection rate

# Convenience method taking a pair of laws as a tuple. The remaining arguments are the global
# variables defined above, read at the time the method is called.
rejection_rate_boost_vs_perm(pair_of_laws::Tuple{PPM,PPM}) = rejection_rate_boost_vs_perm(pair_of_laws[1], pair_of_laws[2], n, m, n_boostrap, n_permutation, θ, S)




In [None]:
# Let us define several pairs of Dirichlet Processes.
# Within a pair, the first law uses the sampler P_0 (Beta(1,1)) and the second uses a Beta(1,β) sampler.


#βs = [1, 1.25, 1.5, 1.75, 2.0]
βs = [1] # with β = 1 the second sampler is also Beta(1,1), i.e. both laws of the single pair coincide
P_0 = () -> rand(Beta(1,1)) # zero-argument sampler returning one draw from Beta(1,1)
a, b = 0.0, 1.0 # endpoints passed to DP
α = 5.0 # first argument of DP - presumably the concentration parameter, TODO confirm

n_laws = length(βs)

laws = [] # untyped container; filled with one (q_1, q_2) tuple per value in βs
for i in 1:n_laws
    q_1 = DP(α, P_0, a, b)

    # sampler for the second law; βs[i] is looked up each time the closure is called
    P_2 = ()->rand(Beta(1,βs[i]))
    q_2 = DP(α, P_2, a, b)

    push!(laws, (q_1,q_2))
end






In [None]:
laws

In [None]:
# One row per pair of laws; the columns follow the order returned by rejection_rate_boost_vs_perm:
# WoW/bootstrap, WoW/permutation, dlip/bootstrap, dlip/permutation.
rejections = zeros(n_laws, 4)
Random.seed!(1234) # fix the seed of the global RNG before simulating

for i in 1:n_laws
    rejections[i,:] = rejection_rate_boost_vs_perm(laws[i])
end







Note: this should normally take about an hour or a little more.

In [None]:
rejections

In [None]:
# We plot rejection rates separately for WoW and dlip; the x-axis holds the values in βs
# (second parameter of the Beta sampler of the second law).



# WoW: columns 1 (bootstrap) and 2 (permutation) of `rejections`.
# While the `ww` calls in decide_boost_wow_dlip / decide_perm_wow_dlip are commented out,
# these two columns only contain zeros.
fig_wow = plot(title = "Rejections for WoW", xlabel = "b", ylabel = "Rej Rate", ylims = (-0.1, 1.1))
plot!(fig_wow, βs, rejections[:,1], label = "boostrap", color = "brown",linestyle = :solid)
plot!(fig_wow, βs, rejections[:,2], label = "permutation", color = "brown",linestyle = :dash)


# dlip: columns 3 (bootstrap) and 4 (permutation) of `rejections`, plus a horizontal line at the significance level θ.
fig_dlip = plot(title = "Rejections for dlip", xlabel = "b", ylabel = "Rej Rate", ylims = (-0.1, 1.1))
plot!(fig_dlip, βs, rejections[:,3], label = "boostrap", color = "red",linestyle = :solid)
plot!(fig_dlip, βs, rejections[:,4], label = "permutation", color = "red",linestyle = :dash)
hline!(fig_dlip, [θ], label = "significance level", color = "black")





In [None]:
fig_wow

In [None]:
fig_dlip

# Times for permutation and bootstrap?


The only difference between them is how the row indices of the matrix are chosen, plus a slight difference in indexing (see code).

In particular, let each hierarchical sample be an n×m matrix, so that the pooled matrix B has 2n rows.

The permutation approach draws a random permutation of {1,...,2n}, then takes the first n indices and the remaining n indices.

The bootstrap approach draws n i.i.d. uniform samples from {1,...,2n}, twice; these are the row indices.

So we only need to compare which is faster:
    
                Drawing a permutation of {1,...,2n}, or drawing n i.i.d. uniform samples from {1,...,2n} twice.
    


In [None]:
# Small illustration of the bootstrap index draws: for each of 3 replicates, a pair of index vectors,
# each holding 2 draws with replacement from 1:4. The pair has to be wrapped in a tuple to be
# a valid comprehension element.
[(sample(1:2*2, 2; replace = true), sample(1:2*2, 2; replace = true)) for i in 1:3]

In [None]:
n = 5
B = rand(n,100) # NOTE(review): not used by the timings below

s = 5 # number of timed replicates per scheme

# Warm-up calls, so that one-off compilation of randperm / sample is not counted in the timings.
randperm(2n)
sample(1:2*n, n; replace = true)

# Permutation scheme: per replicate, one permutation of 1:2n split into two halves of n indices
# (the same work decide_perm_dlip does to choose rows).
time_permutation = @elapsed begin
    for i in 1:s
        local shuffled = randperm(2n)
        shuffled[1:n]
        shuffled[n+1:end]
    end
end
time_permutation = time_permutation / s


# Bootstrap scheme: per replicate, two independent draws of n indices with replacement from 1:2n
# (the same work decide_boostrap_dlip does to choose rows).
time_boostrap = @elapsed begin
    for i in 1:s
        sample(1:2*n, n; replace = true)
        sample(1:2*n, n; replace = true)
    end
end
time_boostrap = time_boostrap / s

In [None]:
println("Average time for permutation: ", time_permutation)
println("Average time for boostrap: ", time_boostrap)

In [None]:
time_permutation > time_boostrap

As we see, the time for permutation is less than the time for bootstrap.


In [None]:

function decide_boostrap_dlip(hier_sample_1::emp_ppm, hier_sample_2::emp_ppm, n_boostrap::Int, θ::Float64)
    # Decision rule of the two-sample test based on dlip, with the rejection threshold obtained by bootstrap.
    #
    # Input:
    #      hier_sample_1, hier_sample_2 - two hierarchical samples of type emp_ppm
    #      n_boostrap - number of bootstrap samples to generate
    #      θ - significance level
    #
    # Output:
    #      1.0 if the observed dlip distance is strictly larger than the empirical (1 - θ)-quantile of the
    #      bootstrap distances (reject), 0.0 otherwise.

    n = hier_sample_1.n
    m = hier_sample_1.m
    # The rows of both samples are pooled with vcat and resampled into two groups of n rows,
    # so both samples must have the same shape.
    @assert n == hier_sample_2.n "number of rows for both hierarchical samples should be the same"
    @assert m == hier_sample_2.m "number of columns for both hierarchical samples should be the same"

    distance_observed = dlip(hier_sample_1, hier_sample_2)

    # The endpoints of the space of observations might differ between the two samples,
    # so the bootstrap samples use the smallest interval containing both.
    a = min(hier_sample_1.a, hier_sample_2.a)
    b = max(hier_sample_1.b, hier_sample_2.b)

    total_rows = vcat(hier_sample_1.atoms, hier_sample_2.atoms) # collect all rows

    # now we obtain the threshold via bootstrap
    boostrap_samples = zeros(n_boostrap)
    for i in 1:n_boostrap
        # two independent draws of n row indices, with replacement, from the pooled sample
        indices_1 = sample(1:2*n, n; replace = true)
        indices_2 = sample(1:2*n, n; replace = true)
        atoms_1 = total_rows[indices_1, :] # select the rows associated with the random indices
        atoms_2 = total_rows[indices_2, :]

        # define new hierarchical samples
        hier_sample_1_boostrap = emp_ppm(atoms_1, n, m, a, b)
        hier_sample_2_boostrap = emp_ppm(atoms_2, n, m, a, b)

        boostrap_samples[i] = dlip(hier_sample_1_boostrap, hier_sample_2_boostrap)
    end

    threshold_boostrap = quantile(boostrap_samples, 1 - θ)
    return Float64(distance_observed > threshold_boostrap)
end




function rejection_rates_boostrap(q_1::PPM, q_2::PPM, n::Int, m::Int, n_boostraps::Vector{Int}, θ::Float64, S::Int)
    # This function computes the rejection rate of the dlip bootstrap test for each number of bootstrap samples.
    #
    # Input:
    #      q_1, q_2 - two laws of random probability measures of type PPM
    #      n, m - hierarchical sample sizes
    #      n_boostraps - vector with the numbers of bootstrap samples to consider
    #      θ - significance level
    #      S - number of simulations to estimate each rejection rate
    #
    # Output:
    #      vector of the same length as n_boostraps; entry k is the fraction of the S simulated pairs
    #      for which decide_boostrap_dlip rejected when using n_boostraps[k] bootstrap samples.

    # Everything below reads the `n_boostraps` argument (not a global variable of a similar name).
    rej_rates = zeros(length(n_boostraps))

    for (i, n_boot) in enumerate(n_boostraps)
        # for each number of bootstrap samples we compute the rejection rate
        for s in 1:S
            hier_emp_1, hier_emp_2 = generate_emp(q_1, n, m), generate_emp(q_2, n, m)
            rej_rates[i] += decide_boostrap_dlip(hier_emp_1, hier_emp_2, n_boot, θ)
        end

        rej_rates[i] /= S
    end
    return rej_rates
end

In [None]:
n = 100
m = 1

n_permutations = [50]
n_boostrap = n_permutations # same number of resamples for both schemes

θ = 0.05
S = 1

# Define Dirichlet processes

α = 5.0
P_0 = () -> rand(Beta(1,1))
a,b = 0.0, 1.0
q_1 = DP(α, P_0, a,b)
q_2 = DP(α, P_0, a,b)

n_timing_runs = 5 # number of timed repetitions per scheme

# Warm-up: run each pipeline once before timing, so that one-off compilation is not attributed
# to whichever scheme happens to trigger it. Done before seeding, so the timed runs start from the seed.
rejection_rates_permutations(q_1, q_2, n, m, [1], θ, 1)
rejection_rates_boostrap(q_1, q_2, n, m, [1], θ, 1)

Random.seed!(1234)
times_permutations = @elapsed begin
    [rejection_rates_permutations(q_1, q_2, n, m, n_permutations, θ, S) for i in 1:n_timing_runs]
end
times_boostraps = @elapsed begin
    [rejection_rates_boostrap(q_1,q_2,n,m,n_boostrap,θ,S) for i in 1:n_timing_runs]
end
times_permutations = times_permutations / n_timing_runs
times_boostraps = times_boostraps / n_timing_runs

In [None]:
println("Average time for permutation: ", times_permutations)
println("Average time for boostrap: ", times_boostraps)

In [None]:
16.   
14
31
32