In [1]:
using Revise
include("ngrams.jl")
include("datasetloader.jl")
# Three-letter codes of the languages handled in this notebook. The order matters:
# G (and anything derived from it positionally) follows the order of this list.
LANGS = [
    "ara", "bel", "ben", "bul", "cat", "ces", "dan", "deu", "ell", "eng",
    "epo", "fas", "fin", "fra", "hau", "hbs", "heb", "hin", "hun", "ido",
    "ina", "isl", "ita", "jpn", "kab", "kor", "kur", "lat", "lit", "mar",
    "mkd", "msa", "nds", "nld", "nor", "pol", "por", "ron", "rus", "slk",
    "spa", "swa", "swe", "tat", "tgl", "tur", "ukr", "vie", "yid", "zho",
]
# One `code => table` pair per language; each table is read from ngrams/<code>.txt
# by `load_ngrams` (defined in one of the files included above).
G = map(lang -> (lang => load_ngrams(joinpath("ngrams", lang * ".txt"))), LANGS)

50-element Vector{Pair{String, Dict{Tuple{UInt8, Vararg{UInt8}}, Float64}}}:
 "ara" => Dict((0x63, 0x61) => 0.00029467622074500394, (0xd9, 0x8a, 0x20, 0xd9, 0x85) => 0.006367103586697142, (0x20, 0xd8, 0xa7, 0x20) => 0.00540693369022176, (0xa7, 0xd9, 0x86, 0x20, 0xd9) => 0.005268843596716616, (0xd9, 0x8a, 0xd8) => 0.10189825473245379, (0x68, 0x65) => 0.00010555566116238947, (0xb9, 0x20, 0xd9) => 0.0058931359699533415, (0xd9, 0x83, 0xd8, 0xb1) => 0.004341209688314843, (0xd8, 0xb4, 0xd8) => 0.01657449209142566, (0xa7, 0x20, 0xd8, 0xa7, 0xd9) => 0.005374842588265839…)
 "bel" => Dict((0xd0, 0xb0, 0x20, 0xd0) => 0.05356847819684612, (0x8c, 0xd0, 0xba) => 0.0028673775998338327, (0xd0, 0xbd, 0xd1, 0x83) => 0.005204917113475131, (0x63, 0x61) => 9.886008612132829e-5, (0xd1, 0x87) => 0.03788978898882851, (0x8c, 0xd1, 0x81, 0xd1) => 0.00013294853270790118, (0x68, 0x65) => 0.0002951312235264679, (0xd1, 0x82, 0xd0, 0xbb) => 0.0007593643336462096, (0xb2, 0xd1, 0x83) => 0.004182843416893853, (0xd0, 0x

In [2]:
# Rank the languages by the sum of all values in their table, largest first.
# At this point in the notebook the tables have not been normalized yet.
sort(map(entry -> (first(entry) => sum(values(last(entry)))), G); by=last, rev=true)

50-element Vector{Pair{String, Float64}}:
 "rus" => 29.17435384230537
 "mar" => 28.33707905495157
 "ben" => 28.003329201823647
 "mkd" => 27.719607537022505
 "hin" => 27.320424888295342
 "ukr" => 26.870085156461805
 "heb" => 26.841208996192297
 "bul" => 26.162604724275432
 "yid" => 25.905631012734464
 "ell" => 24.848224322135664
       ⋮
 "ina" => 14.437181184262036
 "kab" => 14.379556857621482
 "ces" => 14.36855169578082
 "ron" => 14.28387259582836
 "cat" => 14.208888965785723
 "pol" => 14.171362391192819
 "slk" => 14.115390175992205
 "lit" => 14.00635909185294
 "lat" => 13.268710895614062

In [3]:
"""
    norm_table!(t)

Normalize, in place, the table stored as the last element of `t`.

`t` is expected to be a `label => table` pair (anything for which `last(t)` returns a
dictionary with floating-point values works). Every value `w` of the table is replaced
by `log(w / total)`, where `total` is the sum of all values before the update.

Returns `nothing`.

The function is not idempotent: calling it a second time on the same table would
normalize the logarithms themselves.
"""
function norm_table!(t)
    table = last(t)
    # The total must be taken before any value is overwritten.
    total = sum(values(table))
    # Transform the values in a single pass over the value storage, instead of
    # looking up and re-inserting every key while the dictionary is being iterated.
    map!(w -> log(w / total), values(table))
    return nothing
end

norm_table! (generic function with 1 method)

In [4]:
# Turn every table in G into log relative weights, in place.
# Run this cell exactly once: norm_table! is not idempotent.
foreach(norm_table!, G);

In [5]:
# The tables of G without their language codes, in the same order as LANGS.
LOG_T = map(last, G);

In [6]:
# Smallest value found in any language table. Used by `likelihood` as the score of an
# n-gram that is absent from a table. Must be evaluated after the tables were normalized.
DEFAULT_LOGP = minimum(minimum(values(table)) for (_, table) in G)

-16.86606702281564

In [7]:
"""
    likelihood(t, logt, default_logp=DEFAULT_LOGP)

Score the weighted n-gram collection `t` against the log table `logt`.

`t` must iterate as `(code, p)` entries (for example a `Dict` or a vector of pairs).
Each entry contributes `p * get(logt, code, default_logp)`, so codes missing from
`logt` are scored with `default_logp`. The contributions are added left to right,
starting from `0.0`, and the total is returned (`0.0` for an empty `t`).
"""
function likelihood(t, logt, default_logp=DEFAULT_LOGP)
    return foldl(t; init=0.0) do acc, (code, p)
        acc + p * get(logt, code, default_logp)
    end
end
    

likelihood (generic function with 2 methods)

In [8]:
"""
    detector_lh(text; ngram=5, languages=LANGS, logtable=LOG_T)

Return the entry of `languages` whose table in `logtable` gives `text` the highest
`likelihood` score.

# Keywords
- `ngram`: passed as the second argument to `merged_ngrams` (defined outside this cell).
- `languages`: labels to choose from; indexed with the position of the best score, so
  it must be ordered like `logtable`.
- `logtable`: collection of log tables, one per language.
"""
function detector_lh(text; ngram=5, languages=LANGS, logtable=LOG_T)
    profile = merged_ngrams(text, ngram)
    scores = map(table -> likelihood(profile, table), logtable)
    _, best = findmax(scores)
    return languages[best]
end


detector_lh (generic function with 1 method)

In [9]:
include("benchmark.jl")

showtable (generic function with 2 methods)

In [10]:
# Build the two evaluation sets, both restricted to the languages in LANGS:
# WV from the Wikipedia test directory, TV from the Tatoeba test file.
# The constructors are defined outside this notebook text (presumably in
# datasetloader.jl, included in the first cell — confirm).
WV = WikiDataSet("corpus/wikipedia/test", langs=LANGS)
TV = TatoebaDataset("corpus/tatoeba", "tatoeba_test.txt", langs=LANGS)

521469-element TatoebaDataset:
             "Pardonu, permesu al mi indiki tri erarojn en la supra artikolo." => "epo"
                                          "Mi veturigos vin al la flughaveno." => "epo"
                                                                 "Kio mankas?" => "epo"
                                            "Aĥ, kiam ili renkontiĝos denove?" => "epo"
 "Kiam mi estis infano, mi pasigis multajn horojn legante sola en mia ĉambro." => "epo"
                                              "Ŝajnas, ke la trajno malfruas." => "epo"
                                                                "Venu rapide!" => "epo"
                                                   "Li montris al mi albumon." => "epo"
                                                             "Estis du kukoj." => "epo"
                                      "Mia filo scipovas jam nombri ĝis cent." => "epo"
                                                                               ⋮
        

In [11]:
# Benchmark the 4-gram and 3-gram likelihood detectors on the Wikipedia set WV.
# Each detector is passed as a `name => function` pair; the closures only override
# the `ngram` keyword of detector_lh. The 5-gram entry is currently disabled.
WB = benchmark(
    # "likelihood-n5"=>detector_lh, 
    "likelihood-n4"=>t->detector_lh(t, ngram=4),
    "likelihood-n3"=>t->detector_lh(t, ngram=3),
dataset=WV, languages=LANGS)

likelihood-n4

105.676696 seconds (1.63 G allocations: 28.829 GiB, 3.84% gc time, 0.65% compilation time)
likelihood-n3

 62.418989 seconds (884.97 M allocations: 16.348 GiB, 3.71% gc time, 0.11% compilation time)


2-element Vector{Any}:
 "likelihood-n4" => [1.0, 1.0, 1.0, 1.0, 1.0, 0.99, 0.985, 0.965, 1.0, 1.0  …  1.0, 0.99, 0.985, 0.99, 0.985, 0.985, 1.0, 0.97, 0.985, 0.995]
 "likelihood-n3" => [1.0, 1.0, 1.0, 0.985, 1.0, 0.99, 0.975, 0.96, 1.0, 0.995  …  1.0, 0.99, 0.99, 0.99, 0.985, 0.985, 1.0, 0.97, 0.97, 0.995]

In [12]:
# Print the benchmark results WB as a table with one column per language in LANGS.
# NOTE(review): the meaning of `threshold` is defined in benchmark.jl; a negative
# value presumably disables any cut-off — confirm there.
showtable(WB, LANGS, threshold=-0.5)

|               | Average | ara     | bel     | ben     | bul     | cat     | ces    | dan    | deu    | ell     | eng     | epo     | fas     | fin    | fra     | hau    | hbs    | heb     | hin    | hun     | ido    | ina    | isl    | ita    | jpn    | kab    | kor     | kur    | lat    | lit     | mar    | mkd    | msa    | nds    | nld    | nor    | pol     | por    | ron    | rus     | slk    | spa     | swa    | swe    | tat    | tgl    | tur    | ukr     | vie    | yid    | zho    |
|---------------|---------|---------|---------|---------|---------|---------|--------|--------|--------|---------|---------|---------|---------|--------|---------|--------|--------|---------|--------|---------|--------|--------|--------|--------|--------|--------|---------|--------|--------|---------|--------|--------|--------|--------|--------|--------|---------|--------|--------|---------|--------|---------|--------|--------|--------|--------|--------|---------|--------|--------|--------|
| likeli

|               | Average | ara     | bel     | ben     | bul     | cat     | ces    | dan    | deu    | ell     | eng     | epo     | fas     | fin    | fra     | hau    | hbs    | heb     | hin    | hun     | ido    | ina    | isl    | ita    | jpn    | kab    | kor     | kur    | lat    | lit     | mar    | mkd    | msa    | nds    | nld    | nor    | pol     | por    | ron    | rus     | slk    | spa     | swa    | swe    | tat    | tgl    | tur    | ukr     | vie    | yid    | zho    |
|---------------|---------|---------|---------|---------|---------|---------|--------|--------|--------|---------|---------|---------|---------|--------|---------|--------|--------|---------|--------|---------|--------|--------|--------|--------|--------|--------|---------|--------|--------|---------|--------|--------|--------|--------|--------|--------|---------|--------|--------|---------|--------|---------|--------|--------|--------|--------|--------|---------|--------|--------|--------|
| likelihood-n4 |  98.14% | 100.00% | 100.00% | 100.00% | 100.00% | 100.00% | 99.00% | 98.50% | 96.50% | 100.00% | 100.00% | 100.00% | 100.00% | 99.00% | 100.00% | 99.50% | 87.50% | 100.00% | 90.00% |  99.50% | 93.50% | 92.50% | 98.50% | 99.50% | 99.00% | 99.00% | 100.00% | 99.00% | 97.00% | 100.00% | 95.00% | 98.50% | 99.00% | 99.00% | 97.00% | 95.50% |  99.50% | 99.50% | 91.00% | 100.00% | 97.00% | 100.00% | 99.00% | 98.50% | 99.00% | 98.50% | 98.50% | 100.00% | 97.00% | 98.50% | 99.50% |
| likelihood-n3 |  96.79% | 100.00% | 100.00% | 100.00% |  98.50% | 100.00% | 99.00% | 97.50% | 96.00% | 100.00% |  99.50% |  99.50% | 100.00% | 99.00% | 100.00% | 99.00% | 42.50% | 100.00% | 89.50% | 100.00% | 89.00% | 87.00% | 98.50% | 99.00% | 99.00% | 99.00% | 100.00% | 99.00% | 97.00% | 100.00% | 92.50% | 97.50% | 99.00% | 99.00% | 96.50% | 93.50% | 100.00% | 99.50% | 90.00% | 100.00% | 97.00% | 100.00% | 99.00% | 99.00% | 99.00% | 98.50% | 98.50% | 100.00% | 97.00% | 97.00% | 99.50% |


In [13]:
# Load only the "hbs" samples of the Wikipedia test set, to inspect individual
# predictions for the language with the lowest accuracy in the table above.
hbs = WikiDataSet("corpus/wikipedia/test", langs=["hbs"]);

In [14]:
# Pick one sample at random and show (predicted language, expected language) for the
# 4-gram detector. `rand` is not seeded, so the output differs from run to run.
x,y = hbs[rand(1:length(hbs))]
detector_lh(x, ngram=4), y

("mkd", "hbs")

In [13]:
# Benchmark detector_lh with its default ngram=5 on the Wikipedia set WV, with
# benchmark's `languages` set to "mkd" and "hbs" only.
# NOTE(review): `languages` here is benchmark's keyword, not detector_lh's — the
# detector still chooses among all of LANGS. Presumably benchmark uses it to select
# which samples are evaluated; confirm in benchmark.jl.
WB2 = benchmark(
    "likelihood-n5"=>detector_lh, 
    # "likelihood-n4"=>t->detector_lh(t, ngram=4),
    # "likelihood-n3"=>t->detector_lh(t, ngram=3),
dataset=WV, languages=("mkd", "hbs"))

likelihood-n5

  7.549779 seconds (90.22 M allocations: 1.664 GiB, 3.71% gc time, 1.29% compilation time)


1-element Vector{Any}:
 "likelihood-n5" => [0.995, 0.92]

In [15]:
# Same "mkd"/"hbs" benchmark of the default 5-gram detector, this time on the
# Tatoeba set TV. This rebinds WB2, so the Wikipedia result of the previous
# benchmark cell is no longer reachable through that name.
WB2 = benchmark(
    "likelihood-n5"=>detector_lh, 
    # "likelihood-n4"=>t->detector_lh(t, ngram=4),
    # "likelihood-n3"=>t->detector_lh(t, ngram=3),
dataset=TV, languages=("mkd", "hbs"))

likelihood-n5

 33.378441 seconds (117.60 M allocations: 1.998 GiB, 1.32% gc time, 0.06% compilation time)


1-element Vector{Any}:
 "likelihood-n5" => [0.9282970550576184, 0.2577276524644946]