In [34]:
using Revise
include("ngrams.jl")
include("datasetloader.jl")
# Language codes handled in this notebook. Each code is also the stem of the
# n-gram file loaded for it below (ngrams/<code>.txt).
LANGS = [
    "ara", "bel", "ben", "bul", "cat", "ces", "dan", "deu", "ell", "eng",
    "epo", "fas", "fin", "fra", "hau", "hbs", "heb", "hin", "hun", "ido",
    "ina", "isl", "ita", "jpn", "kab", "kor", "kur", "lat", "lit", "mar",
    "mkd", "msa", "nds", "nld", "nor", "pol", "por", "ron", "rus", "slk",
    "spa", "swa", "swe", "tat", "tgl", "tur", "ukr", "vie", "yid", "zho",
]

# One `code => table` pair per language, in the same order as LANGS.
G = map(LANGS) do lang
    lang => load_ngrams(joinpath("ngrams", lang * ".txt"))
end;

# Rank the languages by the sum of their table values, largest first.
sort([first(entry) => sum(values(last(entry))) for entry in G]; by=last, rev=true)


In [36]:
"""
    norm_table!(t)

Normalize, in place, the table stored in `last(t)`.

Every value is divided by the sum of all values in the table and then replaced
by the natural logarithm of that ratio. In this notebook `t` is one of the
`code => table` pairs held in `G`.

Returns `nothing`. Calling it twice on the same table takes the logarithm of
already-logged values, so it must run only once per freshly loaded table.
"""
function norm_table!(t)
    table = last(t)
    total = sum(values(table))
    # Only existing keys are overwritten, so the set of keys never changes
    # while we iterate.
    for key in keys(table)
        table[key] = log(table[key] / total)
    end
    return nothing
end
"""
    likelihood(t, logt, default_logp=DEFAULT_LOGP)

Score the n-gram profile `t` against the log table `logt`.

`t` is iterated as `(code, weight)` pairs. Each weight is multiplied by
`logt[code]`, or by `default_logp` when `code` is absent from `logt`, and the
products are added up starting from `0.0`.

Returns the accumulated score (`0.0` for an empty `t`).
"""
function likelihood(t, logt, default_logp=DEFAULT_LOGP)
    # Sequential left fold keeps the same summation order as a plain loop.
    return foldl(t; init=0.0) do acc, (code, weight)
        acc + weight * get(logt, code, default_logp)
    end
end
"""
    detector_lh(text; ngram=5, languages=LANGS, logtable=LOG_T)

Pick the language whose log table gives `text` the highest likelihood score.

The text is turned into an n-gram profile with `merged_ngrams(text, ngram)`,
scored against every table in `logtable` with `likelihood`, and the entry of
`languages` at the position of the best score is returned.

# Keywords
- `ngram`: value passed through to `merged_ngrams`.
- `languages`: labels, indexed by the position of the winning table.
- `logtable`: collection of log tables; must line up with `languages`.
"""
function detector_lh(text; ngram=5, languages=LANGS, logtable=LOG_T)
    profile = merged_ngrams(text, ngram)
    scores = map(table -> likelihood(profile, table), logtable)
    # findmax returns (value, index); on ties the first maximal index wins.
    _, best = findmax(scores)
    return languages[best]
end

norm_table! (generic function with 1 method)

In [37]:
# Convert every table in G to log relative frequencies, in place.
# NOTE: norm_table! is not idempotent. Re-running this cell without reloading G
# would take logs of values that are already logs.
foreach(norm_table!, G);

# The normalized tables alone, in the same order as G.
LOG_T = map(last, G);

# Score used for n-grams missing from a table: the smallest log value found
# in any of the tables.
DEFAULT_LOGP = minimum(minimum(values(table)) for table in LOG_T)


In [None]:
# Load the benchmarking code. `benchmark`, `showtable` and the dataset
# constructors used below are presumably defined there. TODO confirm.
include("benchmark.jl")
# Evaluation set built from corpus/wikipedia/test; `langs=LANGS` presumably
# limits it to the languages of this notebook. Verify against WikiDataSet.
WV = WikiDataSet("corpus/wikipedia/test", langs=LANGS)
# Evaluation set built from corpus/tatoeba and tatoeba_test.txt, with the same
# `langs` argument.
TV = TatoebaDataset("corpus/tatoeba", "tatoeba_test.txt", langs=LANGS)

In [44]:
# Benchmark the likelihood detector with 4-grams and 3-grams on the WV dataset
# for every language in LANGS. The 5-gram variant is left switched off here.
WB = benchmark(
    # "likelihood-n5" => detector_lh,
    "likelihood-n4" => (text -> detector_lh(text; ngram=4)),
    "likelihood-n3" => (text -> detector_lh(text; ngram=3)),
    dataset=WV,
    languages=LANGS,
)

likelihood-n4

115.172757 seconds (1.63 G allocations: 28.833 GiB, 5.06% gc time, 0.21% compilation time: 8% of which was recompilation)


likelihood-n3 67.683621 seconds (885.58 M allocations: 16.358 GiB, 4.84% gc time, 0.09% compilation time)


2-element Vector{Any}:
 "likelihood-n4" => [1.0, 0.99, 1.0, 0.995, 1.0, 0.98, 0.99, 0.965, 1.0, 1.0  …  1.0, 0.985, 0.985, 0.995, 0.985, 0.985, 1.0, 0.965, 0.985, 0.995]
 "likelihood-n3" => [1.0, 0.99, 1.0, 0.98, 0.995, 0.985, 0.97, 0.96, 1.0, 1.0  …  1.0, 0.985, 0.985, 0.995, 0.985, 0.985, 1.0, 0.965, 0.975, 0.995]

In [45]:
# Print the results in WB as a table with one column per language in LANGS.
# NOTE(review): what `threshold=-0.5` controls is decided by showtable, which
# is not visible in this notebook. Confirm.
showtable(WB, LANGS, threshold=-0.5)

|               | Average | ara     | bel    | ben     | bul    | cat     | ces    | dan    | deu    | ell     | eng     | epo     | fas     | fin    | fra     | hau    | hbs    | heb     | hin    | hun     | ido    | ina    | isl    | ita    | jpn    | kab    | kor     | kur    | lat    | lit     | mar    | mkd    | msa    | nds    | nld    | nor    | pol     | por    | ron    | rus     | slk    | spa     | swa    | swe    | tat    | tgl    | tur    | ukr     | vie    | yid    | zho    |
|---------------|---------|---------|--------|---------|--------|---------|--------|--------|--------|---------|---------|---------|---------|--------|---------|--------|--------|---------|--------|---------|--------|--------|--------|--------|--------|--------|---------|--------|--------|---------|--------|--------|--------|--------|--------|--------|---------|--------|--------|---------|--------|---------|--------|--------|--------|--------|--------|---------|--------|--------|--------|
| likelihood

|               | Average | ara     | bel    | ben     | bul    | cat     | ces    | dan    | deu    | ell     | eng     | epo     | fas     | fin    | fra     | hau    | hbs    | heb     | hin    | hun     | ido    | ina    | isl    | ita    | jpn    | kab    | kor     | kur    | lat    | lit     | mar    | mkd    | msa    | nds    | nld    | nor    | pol     | por    | ron    | rus     | slk    | spa     | swa    | swe    | tat    | tgl    | tur    | ukr     | vie    | yid    | zho    |
|---------------|---------|---------|--------|---------|--------|---------|--------|--------|--------|---------|---------|---------|---------|--------|---------|--------|--------|---------|--------|---------|--------|--------|--------|--------|--------|--------|---------|--------|--------|---------|--------|--------|--------|--------|--------|--------|---------|--------|--------|---------|--------|---------|--------|--------|--------|--------|--------|---------|--------|--------|--------|
| likelihood-n4 |  96.89% | 100.00% | 99.00% | 100.00% | 99.50% | 100.00% | 98.00% | 99.00% | 96.50% | 100.00% | 100.00% | 100.00% | 100.00% | 99.00% | 100.00% | 99.00% | 30.50% | 100.00% | 89.50% |  99.50% | 93.50% | 92.50% | 98.50% | 99.00% | 99.00% | 99.00% | 100.00% | 99.00% | 97.00% | 100.00% | 94.50% | 98.00% | 99.00% | 99.00% | 97.00% | 95.00% |  99.50% | 99.50% | 91.00% | 100.00% | 97.00% | 100.00% | 98.50% | 98.50% | 99.50% | 98.50% | 98.50% | 100.00% | 96.50% | 98.50% | 99.50% |
| likelihood-n3 |  95.95% | 100.00% | 99.00% | 100.00% | 98.00% |  99.50% | 98.50% | 97.00% | 96.00% | 100.00% | 100.00% |  99.50% | 100.00% | 99.00% | 100.00% | 98.50% |  8.50% | 100.00% | 89.50% | 100.00% | 90.00% | 85.00% | 98.50% | 98.50% | 99.00% | 99.00% | 100.00% | 99.00% | 97.00% | 100.00% | 92.50% | 96.50% | 99.00% | 99.00% | 96.50% | 91.50% | 100.00% | 99.50% | 90.50% |  99.50% | 97.00% | 100.00% | 98.50% | 98.50% | 99.50% | 98.50% | 98.50% | 100.00% | 96.50% | 97.50% | 99.50% |


In [46]:
# Run the default (ngram=5) detector on the WV dataset with
# languages=("mkd", "hbs"); presumably only those two languages are evaluated.
# Note that `languages=` is an argument of `benchmark`: `detector_lh` is passed
# unchanged, so it still chooses among all of LANGS / LOG_T.
WB2 = benchmark(
    "likelihood-n5"=>detector_lh, 
    # "likelihood-n4"=>t->detector_lh(t, ngram=4),
    # "likelihood-n3"=>t->detector_lh(t, ngram=3),
dataset=WV, languages=("mkd", "hbs"))

likelihood-n5

  8.466144 seconds (90.29 M allocations: 1.663 GiB, 4.02% gc time, 0.05% compilation time)


1-element Vector{Any}:
 "likelihood-n5" => [0.99, 0.785]

In [47]:
# Same run as the previous cell but on the TV dataset.
# WB2 is reassigned here, so the result of the WV run is no longer bound to
# this name afterwards.
WB2 = benchmark(
    "likelihood-n5"=>detector_lh, 
    # "likelihood-n4"=>t->detector_lh(t, ngram=4),
    # "likelihood-n3"=>t->detector_lh(t, ngram=3),
dataset=TV, languages=("mkd", "hbs"))

likelihood-n5 29.418582 seconds (117.75 M allocations: 2.000 GiB, 1.36% gc time, 0.05% compilation time)


1-element Vector{Any}:
 "likelihood-n5" => [0.9329065300896286, 0.19799498746867167]