https://discourse.julialang.org/t/julia-beginner-from-python-numba-outperforms-julia-in-rewrite-any-tips-to-improve-performance/66414

In [1]:
using BenchmarkTools
#using Distributed
#using Plots
#using Profile
#using ProfileVega
#using PyCall
using ScikitLearn
#using Traceur
using TimerOutputs

In [2]:
@sk_import datasets: (fetch_covtype)

PyObject <function fetch_covtype at 0x000000006B25AC10>

In [3]:
digits_data = fetch_covtype();

In [4]:
X_digits = abs.(digits_data["data"]);
X_digits = transpose(X_digits);

In [5]:
d, n = size(X_digits)

(54, 581012)

In [6]:
struct NaiveGreedy
    X::Matrix{Float64}
end

struct Result
    ranking::Vector{Int32}
    gains::Vector{Float64}
end

In [7]:
function lexsort(a, b, rev=false) 
    idxs = sortperm(a, alg=MergeSort, rev=rev)
    return idxs[sortperm(b[idxs], alg=MergeSort, rev=rev)]
end

function get_gains!(X, current_values, idxs, gains)
    @inbounds Threads.@threads for i in eachindex(idxs)
        s = 0.0
        for j in eachindex(current_values)
            s += @fastmath sqrt(current_values[j] + X[j, idxs[i]])
        end
        gains[i] = s
    end
end

function calculate_gains!(X, gains, current_values, idxs, current_concave_value_sum)
    get_gains!(X, current_values, idxs, gains)
    
    gains .-= current_concave_value_sum
    return gains
end

calculate_gains! (generic function with 1 method)

In [8]:
function fit(optimizer::NaiveGreedy, k, sample_cost)
    @timeit to "intro" begin
        d, n = size(optimizer.X)
        
        cost = 0.0

        ranking = Int32[]
        total_gains = Float64[]

        mask = zeros(Int8, n)
        current_values = zeros(Float64, d)
        current_concave_values = sqrt.(current_values)
        current_concave_values_sum = sum(current_concave_values)

        idxs = 1:n
    end

    gains = zeros(Float64, size(idxs)[1])
    @timeit to "while loop" begin
        while cost < k
            gains = @timeit to "calc_gains" calculate_gains!(optimizer.X, gains, current_values, idxs, current_concave_values_sum)
            
            if sample_cost != nothing
                gains ./= sample_cost[idxs]
                idx_idxs = @timeit to "lexsort" lexsort(gains, 1:size(gains)[1], true)

                @timeit to "select_idx" begin
                    for i in 1:size(idx_idxs)[1]
                        global idx = idx_idxs[i]
                        global best_idx = idxs[idx]
                        if cost + sample_cost[best_idx] <= k
                            break
                        end
                    end
                end
                curr_cost = sample_cost[best_idx]
            else
                global idx = argmax(gains)
                global best_idx = idxs[idx]
                curr_cost = 1.
            end
            
            if cost + curr_cost > k
                break
            end
                
            @timeit to "select_idx" begin
                cost += curr_cost
                # Calculate gains
                gain = gains[idx] * curr_cost
            end
            
            @timeit to "select_next" begin
                # Select next
                current_values += view(optimizer.X, :, best_idx)
                current_concave_values .= sqrt.(current_values)
                current_concave_values_sum = sum(current_concave_values)

                push!(ranking, best_idx)
                push!(total_gains, gain)

                mask[best_idx] = 1
                idxs = findall(==(0), mask)
            end
        end
    end
    return Result(ranking, total_gains)
end

fit (generic function with 1 method)

In [9]:
const to = TimerOutput()
k = 1000
opt1 = NaiveGreedy(X_digits)
res1 = fit(opt1, k, nothing)
to

[0m[1m ────────────────────────────────────────────────────────────────────────[22m
[0m[1m                         [22m        Time                   Allocations      
                         ──────────────────────   ───────────────────────
    Tot / % measured:         20.2s / 94.6%           9.19GiB / 96.1%    

 Section         ncalls     time   %tot     avg     alloc   %tot      avg
 ────────────────────────────────────────────────────────────────────────
 while loop           1    19.1s   100%   19.1s   8.83GiB  100%   8.83GiB
   calc_gains     1.00k    12.0s  62.9%  12.0ms   13.3MiB  0.15%  13.6KiB
   select_next    1.00k    5.76s  30.2%  5.76ms   8.82GiB  100%   9.03MiB
   select_idx     1.00k   7.49ms  0.04%  7.49μs   33.5KiB  0.00%    34.3B
 intro                1   77.2μs  0.00%  77.2μs    569KiB  0.01%   569KiB
[0m[1m ────────────────────────────────────────────────────────────────────────[22m

In [10]:
function fit(optimizer::NaiveGreedy, k, sample_cost)
    @timeit to "intro" begin
        d, n = size(optimizer.X)
        
        cost = 0.0

        ranking = Int32[]
        total_gains = Float64[]

        #mask = zeros(Int8, n)
        current_values = zeros(Float64, d)
        current_concave_values = sqrt.(current_values)
        current_concave_values_sum = sum(current_concave_values)

        #idxs = 1:n
        idxs = collect(1:n)
    end

    gains = zeros(Float64, size(idxs)[1])
    @timeit to "while loop" begin
        while cost < k
            gains = @timeit to "calc_gains" calculate_gains!(optimizer.X, gains, current_values, idxs, current_concave_values_sum)
            
            if sample_cost != nothing
                gains ./= sample_cost[idxs]
                idx_idxs = @timeit to "lexsort" lexsort(gains, 1:size(gains)[1], true)

                @timeit to "select_idx" begin
                    for i in 1:size(idx_idxs)[1]
                        global idx = idx_idxs[i]
                        global best_idx = idxs[idx]
                        if cost + sample_cost[best_idx] <= k
                            break
                        end
                    end
                end
                curr_cost = sample_cost[best_idx]
            else
                global idx = argmax(gains)
                global best_idx = idxs[idx]
                curr_cost = 1.
            end
            
            if cost + curr_cost > k
                break
            end
                
            @timeit to "select_idx" begin
                cost += curr_cost
                # Calculate gains
                gain = gains[idx] * curr_cost
            end
            
            @timeit to "select_next" begin
                # Select next
                #current_values += view(optimizer.X, :, best_idx)
                #current_concave_values .= sqrt.(current_values)
                #current_concave_values_sum = sum(current_concave_values)
                current_values .+= view(optimizer.X, :, best_idx)
                current_concave_values_sum = sum(sqrt, current_values)

                push!(ranking, best_idx)
                push!(total_gains, gain)

                #mask[best_idx] = 1
                #idxs = findall(==(0), mask)
                popat!(idxs, findfirst(==(best_idx), idxs))
            end
        end
    end
    return Result(ranking, total_gains)
end

fit (generic function with 1 method)

In [11]:
const to = TimerOutput()
k = 1000
opt1rev = NaiveGreedy(X_digits)
res1rev = fit(opt1rev, k, nothing)
to



[0m[1m ────────────────────────────────────────────────────────────────────────[22m
[0m[1m                         [22m        Time                   Allocations      
                         ──────────────────────   ───────────────────────
    Tot / % measured:         13.3s / 98.1%            274MiB / 5.31%    

 Section         ncalls     time   %tot     avg     alloc   %tot      avg
 ────────────────────────────────────────────────────────────────────────
 while loop           1    13.0s   100%   13.0s   10.1MiB  69.5%  10.1MiB
   calc_gains     1.00k    11.5s  88.7%  11.5ms   6.10MiB  42.0%  6.25KiB
   select_next    1.00k    184ms  1.41%   184μs   3.93MiB  27.0%  4.02KiB
   select_idx     1.00k   1.29ms  0.01%  1.29μs   31.2KiB  0.21%    32.0B
 intro                1    980μs  0.01%   980μs   4.43MiB  30.5%  4.43MiB
[0m[1m ────────────────────────────────────────────────────────────────────────[22m

In [13]:
@show res1rev.ranking == res1.ranking
@show res1rev.gains ≈ res1.gains;

res1rev.ranking == res1.ranking = true
res1rev.gains ≈ res1.gains = true
