In [1]:
using CSV

In [2]:
words_nltk = CSV.read("words_nltk.csv");
words_nltk = Array(words_nltk[:2]);

In [3]:
function edit_distance(X::String,Y::String)
    len_x = length(X)
    len_y = length(Y)
    D = zeros(Int, len_x+1,len_y+1)

    @inbounds for i in 1:(len_x+1)
        for j in 1:(len_y+1)
            if i==1
                D[i,j] = j
            elseif j==1
                D[i,j] = i
            elseif X[i-1] == Y[j-1]
                D[i,j] = D[i-1,j-1]
            else
                D[i,j] = 1+min(D[i,j-1], D[i-1,j], D[i-1,j-1])
            end
        end
    end
    return D[len_x,len_y]
end

edit_distance (generic function with 1 method)

In [4]:
edit_distance("lik","cat")

3

In [5]:
using BenchmarkTools

In [6]:
@benchmark edit_distance("exponential", "polynomial")

BenchmarkTools.Trial: 
  memory estimate:  1.14 KiB
  allocs estimate:  1
  --------------
  minimum time:     529.132 ns (0.00% GC)
  median time:      544.116 ns (0.00% GC)
  mean time:        792.770 ns (28.40% GC)
  maximum time:     441.130 μs (99.82% GC)
  --------------
  samples:          10000
  evals/sample:     190

In [35]:
function compute_distances(mistake,words) 
    cy_distances = []
    for word in words
        #ed = edit_distance(mistake, word)
        push!(cy_distances,edit_distance(mistake, word))
    end
    return cy_distances
end

compute_distances (generic function with 1 method)

In [100]:
mistake = "drauing"
@time distances = compute_distances(mistake,words_nltk); closest_word = words_nltk[argmin(distances)]

  0.188767 seconds (236.76 k allocations: 181.195 MiB)


"draine"

In [101]:
@benchmark compute_distances(mistake,words_nltk)

BenchmarkTools.Trial: 
  memory estimate:  181.19 MiB
  allocs estimate:  236754
  --------------
  minimum time:     149.058 ms (0.00% GC)
  median time:      157.181 ms (0.00% GC)
  mean time:        188.015 ms (13.60% GC)
  maximum time:     842.787 ms (81.91% GC)
  --------------
  samples:          27
  evals/sample:     1

In [45]:
function compute_distances2(mistake,words) 
    cy_distances = zeros(Int64,length(words))
    for (i,word) in enumerate(words)
        cy_distances[i] = edit_distance(mistake, word)
    end
    return cy_distances
end

compute_distances2 (generic function with 1 method)

In [46]:
using BenchmarkTools

In [47]:
mistake = "drauing"
@time distances = compute_distances2(mistake,words_nltk); closest_word = words_nltk[argmin(distances)]

  0.179847 seconds (250.14 k allocations: 180.639 MiB)


"draine"

In [52]:
@benchmark compute_distances2($mistake, $words_nltk)

BenchmarkTools.Trial: 
  memory estimate:  180.00 MiB
  allocs estimate:  236738
  --------------
  minimum time:     146.714 ms (0.00% GC)
  median time:      150.443 ms (0.00% GC)
  mean time:        174.335 ms (13.18% GC)
  maximum time:     814.746 ms (81.81% GC)
  --------------
  samples:          29
  evals/sample:     1

In [60]:
@time cy_distances = zeros(Int64,length(words_nltk));

  0.000272 seconds (7 allocations: 1.806 MiB)


In [61]:
### reuse

In [58]:
function compute_distances3(mistake,words) 
    cy_distances = zeros(Int64,length(words))
    for i in 1:length(words)
        cy_distances[i] = edit_distance(mistake, words[i])
    end
    return cy_distances
end

compute_distances3 (generic function with 1 method)

In [59]:
@benchmark compute_distances3($mistake, $words_nltk)

BenchmarkTools.Trial: 
  memory estimate:  180.00 MiB
  allocs estimate:  236738
  --------------
  minimum time:     148.139 ms (0.00% GC)
  median time:      164.628 ms (0.00% GC)
  mean time:        196.631 ms (13.96% GC)
  maximum time:     883.741 ms (80.78% GC)
  --------------
  samples:          26
  evals/sample:     1

### Multithreading version

In [19]:
Base.Threads.nthreads()

4

In [29]:
function compute_distances4(mistake,words) 
     cy_distances = zeros(Int64,length(words))
     Threads.@threads for i in 1:length(words)
        cy_distances[i] = edit_distance(mistake, words[i])
    end
    return cy_distances
end

compute_distances2 (generic function with 1 method)

In [30]:
mistake = "drauing"
@time distances = compute_distances4(mistake,words_nltk); closest_word =  words_nltk[argmin(distances)]

  0.320212 seconds (269.37 k allocations: 172.245 MiB, 64.83% gc time)


"draine"

In [65]:
using StaticArrays

┌ Info: Precompiling StaticArrays [90137ffa-7385-5640-81b9-e52037218182]
└ @ Base loading.jl:1186


In [66]:
function edit_distance_opt(X::String,Y::String)
    len_x = length(X)
    len_y = length(Y)
    D = StaticArrays.zeros(Int, len_x+1, len_y+1)

    @inbounds for i in 1:(len_x+1)
        for j in 1:(len_y+1)
            if i==1
                D[i,j] = j
            elseif j==1
                D[i,j] = i
            elseif X[i-1] == Y[j-1]
                D[i,j] = D[i-1,j-1]
            else
                D[i,j] = 1+min(D[i,j-1], D[i-1,j], D[i-1,j-1])
            end
        end
    end
    return D[len_x,len_y]
end

edit_distance_opt (generic function with 1 method)

In [68]:
@benchmark edit_distance_opt("lik","cat")

BenchmarkTools.Trial: 
  memory estimate:  208 bytes
  allocs estimate:  1
  --------------
  minimum time:     129.366 ns (0.00% GC)
  median time:      144.351 ns (0.00% GC)
  mean time:        150.494 ns (0.00% GC)
  maximum time:     418.453 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     949

In [69]:
function compute_distances_static(mistake,words) 
     cy_distances = zeros(Int64,length(words))
     Threads.@threads for i in 1:length(words)
        cy_distances[i] = edit_distance_opt(mistake, words[i])
    end
    return cy_distances
end

compute_distances_static (generic function with 1 method)

In [79]:
mistake = "drauing"
@time distances = compute_distances_static(mistake,words_nltk);

  0.104975 seconds (229.90 k allocations: 174.522 MiB)


In [81]:
@benchmark distances = compute_distances_static(mistake, words_nltk)

BenchmarkTools.Trial: 
  memory estimate:  170.10 MiB
  allocs estimate:  224438
  --------------
  minimum time:     68.646 ms (0.00% GC)
  median time:      71.546 ms (0.00% GC)
  mean time:        109.932 ms (33.80% GC)
  maximum time:     1.779 s (96.07% GC)
  --------------
  samples:          46
  evals/sample:     1

### String distances


In [98]:
using StringDistances

In [99]:
evaluate(Levenshtein(), "New York", "New Yorks")

1

In [96]:
function compute_distances_stringdistances(mistake,words) 
     measure = Levenshtein()
     cy_distances = zeros(Int64,length(words))
     Threads.@threads for i in 1:length(words)
        cy_distances[i] = evaluate(measure, mistake, words[i])
    end
    return cy_distances
end

compute_distances_stringdistances (generic function with 1 method)

In [97]:
@benchmark distances = compute_distances_stringdistances(mistake, words_nltk)

BenchmarkTools.Trial: 
  memory estimate:  35.51 MiB
  allocs estimate:  218527
  --------------
  minimum time:     45.755 ms (0.00% GC)
  median time:      48.419 ms (0.00% GC)
  mean time:        53.220 ms (0.00% GC)
  maximum time:     95.974 ms (0.00% GC)
  --------------
  samples:          94
  evals/sample:     1