In [1]:
# Revise is loaded before the project files are included.
using Revise
# Project helpers. `load_ngrams` (used below) is expected to come from one of
# these files — TODO confirm which.
include("ngrams.jl")
include("datasetloader.jl")
# Language codes handled by the detector. Each code is also the basename of a
# file under ngrams/ (see the comprehension that builds G).
LANGS = ["ara", "bel", "ben", "bul", "cat", "ces", "dan", "deu", "ell", 
"eng", "epo", "fas", "fin", "fra", "hau", "hbs", "heb", "hin", "hun", 
"ido", "ina", "isl", "ita", "jpn", "kab", "kor", "kur", "lat", "lit", 
"mar", "mkd", "msa", "nds", "nld", "nor", "pol", "por", "ron", "rus", 
"slk", "spa", "swa", "swe", "tat", "tgl", "tur", "ukr", "vie", "yid", 
"zho"]
# One `code => table` pair per language, in LANGS order. Per the cell output,
# each table is a Dict mapping a tuple of UInt8 bytes to a Float64.
G = [lang => load_ngrams(joinpath("ngrams", lang * ".txt")) for lang in LANGS]

50-element Vector{Pair{String, Dict{Tuple{UInt8, Vararg{UInt8}}, Float64}}}:
 "ara" => Dict((0xd9, 0x86, 0xd9, 0x8a, 0xd8) => 0.003989445825360524, (0x63, 0x61) => 0.00029467622074500394, (0xd9, 0x8a, 0x20, 0xd9, 0x85) => 0.006367103586697142, (0x20, 0xd8, 0xa7, 0x20) => 0.00540693369022176, (0xa7, 0xd9, 0x86, 0x20, 0xd9) => 0.005268843596716616, (0xd9, 0x8a, 0xd8) => 0.10189825473245379, (0x68, 0x65) => 0.00010555566116238947, (0xb9, 0x20, 0xd9) => 0.0058931359699533415, (0xb3, 0xd8, 0xa8, 0xd8) => 0.0034786157162020937, (0xd9, 0x83, 0xd8, 0xb1) => 0.004341209688314843…)
 "bel" => Dict((0xd0, 0xb0, 0x20, 0xd0) => 0.05356847819684612, (0x80, 0xd0, 0xbd, 0xd1) => 0.0017948414065424912, (0x8c, 0xd0, 0xba) => 0.0028673775998338327, (0x8b, 0xd1, 0x9e) => 0.0036077595335681193, (0xb4, 0xd0, 0xb7, 0xd1, 0x8f) => 0.0030638296815467462, (0xbb, 0xd0, 0xba, 0xd0) => 0.000577329909094111, (0xd0, 0xbd, 0xd1, 0x83) => 0.005204917113475131, (0xb3, 0xd0, 0xb5, 0xd0, 0xbd) => 0.0005244402681460933, (0

In [2]:
# Sum of all values stored in each language's table, largest total first.
sort!(map(((lang, table),) -> lang => sum(values(table)), G); by=last, rev=true)

50-element Vector{Pair{String, Float64}}:
 "rus" => 30.641480653987365
 "mkd" => 28.930560339047275
 "mar" => 28.722910396336864
 "heb" => 28.467488630526557
 "ben" => 28.4607046957114
 "ukr" => 28.269445705417553
 "hin" => 27.64600875909456
 "bul" => 27.505348393951426
 "yid" => 26.9374804951315
 "ell" => 26.23108114090503
       ⋮
 "nor" => 16.584010890463613
 "zho" => 16.56832241716118
 "cat" => 16.50179721362697
 "ron" => 16.44484435225592
 "lit" => 16.337578179520495
 "ces" => 16.249634345355997
 "pol" => 16.205150897175734
 "slk" => 16.133646770570166
 "lat" => 15.535183747317609

In [3]:
# Turn the table held in `last(t)` into log-relative-weights, in place.
#
# `t` is anything whose last element is a mutable dictionary of numeric values
# (e.g. a `language => Dict` pair). Every value `w` is replaced by
# `log(w / total)`, where `total` is the sum of all values before the update.
# Returns `nothing`.
function norm_table!(t)
    table = last(t)
    total = sum(values(table))
    # Rewrite each stored value without touching the keys.
    map!(w -> log(w / total), values(table))
    return nothing
end

norm_table! (generic function with 1 method)

In [4]:
# Normalize every table in G in place. Not safe to re-run: a second pass would
# renormalize and take the log of values that are already logarithms.
norm_table!.(G);

In [5]:
# Just the tables (second element of each pair), in the same order as G.
LOG_T = map(last, G);

In [6]:
# Smallest value found in any language table; `likelihood` uses it as the
# default score for n-grams that a table does not contain.
DEFAULT_LOGP = minimum(minimum(values(table)) for (_, table) in G)

-16.915131660459593

In [7]:
# Weighted score of the profile `t` against the lookup table `logt`.
#
# `t` is iterated as `(code, weight)` pairs. Each weight is multiplied by
# `logt[code]`, or by `default_logp` when `code` is absent from `logt`, and the
# products are added left to right starting from 0.0. An empty `t` yields 0.0.
function likelihood(t, logt, default_logp=DEFAULT_LOGP)
    return mapfoldl(
        ((code, p),) -> p * get(logt, code, default_logp),
        +,
        t;
        init=0.0,
    )
end
    

likelihood (generic function with 2 methods)

In [8]:
# Pick the entry of `languages` whose table in `logtable` gives `text` the
# highest `likelihood` score (the first one wins on ties).
#
# `merged_ngrams` is defined outside this notebook (presumably ngrams.jl); it is
# assumed to return the `(code, weight)` profile that `likelihood` iterates —
# TODO confirm. `languages` and `logtable` must be index-aligned.
function detector_lh(text; ngram=5, languages=LANGS, logtable=LOG_T)
    profile = merged_ngrams(text, ngram)
    scores = map(table -> likelihood(profile, table), logtable)
    _, best = findmax(scores)
    return languages[best]
end


detector_lh (generic function with 1 method)

In [9]:
# Benchmark helpers: `benchmark` and `showtable` (used below) are presumably
# defined here — the include evaluates to `showtable`, per the cell output.
include("benchmark.jl")

showtable (generic function with 2 methods)

In [10]:
# Evaluation sets, both given LANGS through the `langs` keyword.
# WikiDataSet / TatoebaDataset are not defined in this notebook (presumably
# datasetloader.jl — TODO confirm).
WV = WikiDataSet("corpus/wikipedia/test", langs=LANGS)
# Per the cell output, TV holds 521469 `sentence => language code` pairs.
TV = TatoebaDataset("corpus/tatoeba", "tatoeba_test.txt", langs=LANGS)

521469-element TatoebaDataset:
             "Pardonu, permesu al mi indiki tri erarojn en la supra artikolo." => "epo"
                                          "Mi veturigos vin al la flughaveno." => "epo"
                                                                 "Kio mankas?" => "epo"
                                            "Aĥ, kiam ili renkontiĝos denove?" => "epo"
 "Kiam mi estis infano, mi pasigis multajn horojn legante sola en mia ĉambro." => "epo"
                                              "Ŝajnas, ke la trajno malfruas." => "epo"
                                                                "Venu rapide!" => "epo"
                                                   "Li montris al mi albumon." => "epo"
                                                             "Estis du kukoj." => "epo"
                                      "Mia filo scipovas jam nombri ĝis cent." => "epo"
                                                                               ⋮
        

In [11]:
# Run the 4-gram and 3-gram likelihood detectors over the Wikipedia set (WV).
# Each entry is `label => function of the text`; the 5-gram entry is disabled
# in this cell.
WB = benchmark(
    # "likelihood-n5"=>detector_lh, 
    "likelihood-n4"=>t->detector_lh(t, ngram=4),
    "likelihood-n3"=>t->detector_lh(t, ngram=3),
dataset=WV, languages=LANGS)

likelihood-n4111.852930 seconds (1.63 G allocations: 28.844 GiB, 3.88% gc time, 0.71% compilation time)
likelihood-n3

 64.120249 seconds (885.42 M allocations: 16.355 GiB, 3.71% gc time, 0.09% compilation time)


2-element Vector{Any}:
 "likelihood-n4" => [1.0, 0.995, 1.0, 1.0, 1.0, 0.985, 0.99, 0.965, 1.0, 1.0  …  1.0, 0.99, 0.99, 0.99, 0.985, 0.985, 1.0, 0.97, 0.985, 0.995]
 "likelihood-n3" => [1.0, 0.995, 1.0, 0.985, 0.995, 0.99, 0.98, 0.96, 1.0, 0.995  …  1.0, 0.985, 0.99, 0.99, 0.985, 0.985, 1.0, 0.97, 0.975, 0.995]

In [12]:
# Render the scores in WB as a table with one column per entry of LANGS.
# NOTE(review): the meaning of `threshold=-0.5` is not visible here — confirm
# against the definition of showtable.
showtable(WB, LANGS, threshold=-0.5)

|               | Average | ara     | bel    | ben     | bul     | cat     | ces    | dan    | deu    | ell     | eng     | epo     | fas     | fin    | fra     | hau    | hbs    | heb     | hin    | hun     | ido    | ina    | isl    | ita    | jpn    | kab    | kor     | kur    | lat    | lit     | mar    | mkd    | msa    | nds    | nld    | nor    | pol     | por    | ron    | rus     | slk    | spa     | swa    | swe    | tat    | tgl    | tur    | ukr     | vie    | yid    | zho    |
|---------------|---------|---------|--------|---------|---------|---------|--------|--------|--------|---------|---------|---------|---------|--------|---------|--------|--------|---------|--------|---------|--------|--------|--------|--------|--------|--------|---------|--------|--------|---------|--------|--------|--------|--------|--------|--------|---------|--------|--------|---------|--------|---------|--------|--------|--------|--------|--------|---------|--------|--------|--------|
| likeliho

|               | Average | ara     | bel    | ben     | bul     | cat     | ces    | dan    | deu    | ell     | eng     | epo     | fas     | fin    | fra     | hau    | hbs    | heb     | hin    | hun     | ido    | ina    | isl    | ita    | jpn    | kab    | kor     | kur    | lat    | lit     | mar    | mkd    | msa    | nds    | nld    | nor    | pol     | por    | ron    | rus     | slk    | spa     | swa    | swe    | tat    | tgl    | tur    | ukr     | vie    | yid    | zho    |
|---------------|---------|---------|--------|---------|---------|---------|--------|--------|--------|---------|---------|---------|---------|--------|---------|--------|--------|---------|--------|---------|--------|--------|--------|--------|--------|--------|---------|--------|--------|---------|--------|--------|--------|--------|--------|--------|---------|--------|--------|---------|--------|---------|--------|--------|--------|--------|--------|---------|--------|--------|--------|
| likelihood-n4 |  98.05% | 100.00% | 99.50% | 100.00% | 100.00% | 100.00% | 98.50% | 99.00% | 96.50% | 100.00% | 100.00% | 100.00% | 100.00% | 99.00% | 100.00% | 99.50% | 85.00% | 100.00% | 90.00% |  99.50% | 93.50% | 92.50% | 98.50% | 99.00% | 99.00% | 99.00% | 100.00% | 99.00% | 97.00% | 100.00% | 94.50% | 98.50% | 99.00% | 99.00% | 97.00% | 94.00% | 100.00% | 99.50% | 91.00% | 100.00% | 97.00% | 100.00% | 99.00% | 99.00% | 99.00% | 98.50% | 98.50% | 100.00% | 97.00% | 98.50% | 99.50% |
| likelihood-n3 |  96.85% | 100.00% | 99.50% | 100.00% |  98.50% |  99.50% | 99.00% | 98.00% | 96.00% | 100.00% |  99.50% |  99.50% | 100.00% | 99.00% | 100.00% | 99.00% | 50.50% | 100.00% | 89.50% | 100.00% | 88.00% | 85.50% | 98.50% | 98.50% | 99.00% | 99.00% | 100.00% | 99.00% | 97.00% | 100.00% | 91.50% | 97.50% | 99.00% | 99.00% | 96.50% | 93.00% | 100.00% | 99.50% | 90.00% | 100.00% | 97.00% | 100.00% | 98.50% | 99.00% | 99.00% | 98.50% | 98.50% | 100.00% | 97.00% | 97.50% | 99.50% |


In [13]:
# Wikipedia test samples for "hbs" only — the weakest column in the table
# printed above (85.00% with 4-grams, 50.50% with 3-grams).
hbs = WikiDataSet("corpus/wikipedia/test", langs=["hbs"]);

In [14]:
# Spot check: draw one random sample, split it into (x, y), and show the
# 4-gram prediction for x next to y (presumably the sample's label — confirm).
x,y = hbs[rand(1:length(hbs))]
detector_lh(x, ngram=4), y

("hbs", "hbs")

In [15]:
# 5-gram detector on the Wikipedia set, with `languages` limited to "mkd" and
# "hbs".
# NOTE(review): detector_lh is passed with its defaults, so it presumably still
# chooses among all of LANGS rather than just these two — confirm in benchmark.
WB2 = benchmark(
    "likelihood-n5"=>detector_lh, 
    # "likelihood-n4"=>t->detector_lh(t, ngram=4),
    # "likelihood-n3"=>t->detector_lh(t, ngram=3),
dataset=WV, languages=("mkd", "hbs"))

likelihood-n5

  8.150595 seconds (90.29 M allocations: 1.665 GiB, 3.07% gc time, 1.15% compilation time)


1-element Vector{Any}:
 "likelihood-n5" => [0.995, 0.915]

In [17]:
# Same 5-gram run for "mkd" and "hbs", this time on the Tatoeba set (TV).
# Assigning to WB2 again discards whatever WB2 held before this cell ran.
# NOTE(review): detector_lh is passed with its defaults, so it presumably still
# chooses among all of LANGS rather than just these two — confirm in benchmark.
WB2 = benchmark(
    "likelihood-n5"=>detector_lh, 
    # "likelihood-n4"=>t->detector_lh(t, ngram=4),
    # "likelihood-n3"=>t->detector_lh(t, ngram=3),
dataset=TV, languages=("mkd", "hbs"))

likelihood-n5

 30.047556 seconds (117.71 M allocations: 2.000 GiB, 1.43% gc time, 0.05% compilation time)


1-element Vector{Any}:
 "likelihood-n5" => [0.947247119078105, 0.26900584795321636]