In [1]:
using Revise
include("ngrams.jl")
include("datasetloader.jl")
# Three-letter codes of the 50 supported languages (presumably ISO 639-3 — confirm).
# The order matters: G, and everything derived from it, is aligned with this list.
LANGS = [
    "ara", "bel", "ben", "bul", "cat", "ces", "dan", "deu", "ell",
    "eng", "epo", "fas", "fin", "fra", "hau", "hbs", "heb", "hin", "hun",
    "ido", "ina", "isl", "ita", "jpn", "kab", "kor", "kur", "lat", "lit",
    "mar", "mkd", "msa", "nds", "nld", "nor", "pol", "por", "ron", "rus",
    "slk", "spa", "swa", "swe", "tat", "tgl", "tur", "ukr", "vie", "yid",
    "zho",
]

# One `language => n-gram table` pair per language, read from ngrams/<lang>.txt.
G = map(LANGS) do lang
    lang => load_ngrams(joinpath("ngrams", lang * ".txt"))
end

50-element Vector{Pair{String, Dict{Tuple{UInt8, Vararg{UInt8}}, Float64}}}:
 "ara" => Dict((0x63, 0x61) => 0.00029467622074500394, (0x68, 0x65) => 0.00010555566116238947, (0x63, 0x20, 0x6d) => 0.00010555566116238947, (0xd8, 0xb4, 0xd8) => 0.01657449209142566, (0x70, 0x75, 0x74) => 0.004037504039461398, (0xa8, 0xd8) => 0.05337699287247487, (0xaf,) => 0.08298881434086272, (0xd9, 0x87, 0xd8) => 0.030677597244273158, (0xb3, 0x20) => 0.010194608449073564, (0x74, 0x20, 0x65, 0x6e) => 0.003931948378299008…)
 "bel" => Dict((0xd0, 0xb0, 0x20, 0xd0) => 0.05356847819684612, (0x80, 0xd0, 0xbd, 0xd1) => 0.0017948414065424912, (0xb0, 0x20, 0xd1, 0x82, 0xd1) => 0.0016050908823657271, (0xb0, 0xd0, 0xbc, 0xd0, 0xbe) => 0.0012221484089284342, (0xbb, 0xd0, 0xba, 0xd0) => 0.000577329909094111, (0xd0, 0xbd, 0xd1, 0x83) => 0.005204917113475131, (0xb3, 0xd0, 0xb5, 0xd0, 0xbd) => 0.0005244402681460933, (0xd1, 0x87) => 0.03788978898882851, (0xd0, 0xb3, 0xd0, 0xbb) => 0.0028589731977762345, (0x63, 0x61) => 9.8

In [2]:
# Total weight stored in each language's table, largest first.
sort([lang => sum(values(table)) for (lang, table) in G]; by=last, rev=true)

50-element Vector{Pair{String, Float64}}:
 "rus" => 30.187791629025785
 "mkd" => 28.896797026857826
 "bul" => 27.42843419163413
 "ukr" => 26.62244988563177
 "hbs" => 25.980957657438083
 "ita" => 25.71077834720614
 "spa" => 25.413867437643315
 "eng" => 24.86775581839151
 "fra" => 24.5766081478693
 "ina" => 24.495512105571148
       ⋮
 "heb" => 13.47283958076938
 "yid" => 12.132529131967047
 "kur" => 11.971806223760582
 "vie" => 11.952140372144667
 "ben" => 9.181655235780324
 "zho" => 8.158283496795718
 "jpn" => 7.267760355040137
 "ell" => 6.931174949876233
 "kor" => 4.717837692404967

In [3]:
"""
    norm_table!(t)

Normalize, in place, the table stored as the last element of `t` (for example a
`language => table` pair): every value `v` is replaced by `log(v / total)`, where
`total` is the sum of all values in the table. Returns `nothing`.

An empty table is left untouched.

# Throws
- `ArgumentError` if the values do not sum to a positive number. This is the case when
  the function is applied a second time to a table that is already log-transformed,
  which would otherwise be corrupted silently.
"""
function norm_table!(t)
    table = last(t)
    isempty(table) && return nothing

    total = sum(values(table))
    # Log-probabilities are never positive, so a non-positive total means the table was
    # already normalized (or holds no usable weights); refuse instead of corrupting it.
    total > 0 || throw(ArgumentError(
        "table values must sum to a positive number, got $total " *
        "(was the table already normalized?)",
    ))

    # Transform the values in one pass, without re-looking-up each key.
    map!(v -> log(v / total), values(table))
    return nothing
end

norm_table! (generic function with 1 method)

In [4]:
# Turn every table in G into log-probabilities, in place. Run this only once per load.
foreach(norm_table!, G)

In [5]:
# The tables alone, in LANGS order. These are the same Dict objects held by G (no copy).
LOG_T = map(last, G)

50-element Vector{Dict{Tuple{UInt8, Vararg{UInt8}}, Float64}}:
 Dict((0x63, 0x61) => -10.829374267484004, (0x68, 0x65) => -11.856013056527026, (0x63, 0x20, 0x6d) => -11.856013056527026, (0xd8, 0xb4, 0xd8) => -6.799631292737374, (0x70, 0x75, 0x74) => -8.211869496254481, (0xa8, 0xd8) => -5.630116377247552, (0xaf,) => -5.188790353551395, (0xd9, 0x87, 0xd8) => -6.183963528535899, (0xb3, 0x20) => -7.285637188171951, (0x74, 0x20, 0x65, 0x6e) => -8.238361111701456…)
 Dict((0xd0, 0xb0, 0x20, 0xd0) => -6.068576192298406, (0x80, 0xd0, 0xbd, 0xd1) => -9.464620328902246, (0xb0, 0x20, 0xd1, 0x82, 0xd1) => -9.576356614532218, (0xb0, 0xd0, 0xbc, 0xd0, 0xbe) => -9.8489266930945, (0xbb, 0xd0, 0xba, 0xd0) => -10.598878403713769, (0xd0, 0xbd, 0xd1, 0x83) => -8.399933216502276, (0xb3, 0xd0, 0xb5, 0xd0, 0xbd) => -10.694960735089238, (0xd1, 0x87) => -6.414855338069508, (0xd0, 0xb3, 0xd0, 0xbb) => -8.99907445542656, (0x63, 0x61) => -12.363586693997396…)
 Dict((0xa3,) => -6.62700088977945, (0xb9, 0x20, 0xe0) 

In [6]:
# Smallest value found in any table of G; `likelihood` uses it as the default score for
# n-grams that a table does not contain.
DEFAULT_LOGP = minimum(minimum(values(table)) for (_, table) in G)

-16.900214584493458

In [7]:
"""
    likelihood(t, logt, default_logp=DEFAULT_LOGP)

Return the weighted sum `Σ weight * get(logt, code, default_logp)` over every
`(code, weight)` entry of `t`.

# Arguments
- `t`: iterable of `(code, weight)` entries.
- `logt`: lookup table queried with `get`; codes it lacks score `default_logp`.
- `default_logp`: fallback score for codes missing from `logt`.

The accumulation starts from `0.0`, so an empty `t` yields `0.0`.
"""
function likelihood(t, logt, default_logp=DEFAULT_LOGP)
    return foldl(t; init=0.0) do acc, (code, weight)
        acc + weight * get(logt, code, default_logp)
    end
end
    

likelihood (generic function with 2 methods)

In [8]:
"""
    detector_lh(text; ngram=5, languages=LANGS, logtable=LOG_T)

Return the entry of `languages` whose table in `logtable` gives `text` the highest
`likelihood` score.

# Keywords
- `ngram`: passed through to `merged_ngrams` (presumably the n-gram order — confirm
  against `merged_ngrams`).
- `languages`: labels, indexed in parallel with `logtable`.
- `logtable`: collection of tables, one per label.

Ties go to the first best-scoring table.
"""
function detector_lh(text; ngram=5, languages=LANGS, logtable=LOG_T)
    profile = merged_ngrams(text, ngram)
    scores = map(table -> likelihood(profile, table), logtable)
    _, best = findmax(scores)
    return languages[best]
end


detector_lh (generic function with 1 method)

In [9]:
include("benchmark.jl")

showtable (generic function with 2 methods)

In [10]:
# Evaluation sets restricted to the languages in LANGS.
# WV reads from the Wikipedia test directory; TV reads the Tatoeba test file and, per
# the recorded output, holds `sentence => language` pairs.
# NOTE(review): both dataset types are presumably defined in datasetloader.jl — confirm.
WV = WikiDataSet("corpus/wikipedia/test", langs=LANGS)
TV = TatoebaDataset("corpus/tatoeba", "tatoeba_test.txt", langs=LANGS)

521469-element TatoebaDataset:
             "Pardonu, permesu al mi indiki tri erarojn en la supra artikolo." => "epo"
                                          "Mi veturigos vin al la flughaveno." => "epo"
                                                                 "Kio mankas?" => "epo"
                                            "Aĥ, kiam ili renkontiĝos denove?" => "epo"
 "Kiam mi estis infano, mi pasigis multajn horojn legante sola en mia ĉambro." => "epo"
                                              "Ŝajnas, ke la trajno malfruas." => "epo"
                                                                "Venu rapide!" => "epo"
                                                   "Li montris al mi albumon." => "epo"
                                                             "Estis du kukoj." => "epo"
                                      "Mia filo scipovas jam nombri ĝis cent." => "epo"
                                                                               ⋮
        

In [11]:
# Benchmark the 4-gram and 3-gram likelihood detectors on the Wikipedia test set.
# Each detector is passed as a `name => function` pair; the 5-gram variant is disabled.
# Per the recorded output, the result is a vector of `name => scores` pairs.
WB = benchmark(
    # "likelihood-n5"=>detector_lh, 
    "likelihood-n4"=>t->detector_lh(t, ngram=4),
    "likelihood-n3"=>t->detector_lh(t, ngram=3),
dataset=WV, languages=LANGS)

likelihood-n4

114.825043 seconds (1.63 G allocations: 28.842 GiB, 4.03% gc time, 0.62% compilation time)
likelihood-n3

 68.638885 seconds (885.20 M allocations: 16.352 GiB, 3.85% gc time, 0.09% compilation time)


2-element Vector{Any}:
 "likelihood-n4" => [1.0, 0.99, 1.0, 0.98, 0.995, 0.98, 0.985, 0.97, 0.995, 1.0  …  1.0, 0.99, 0.98, 0.995, 0.985, 0.985, 1.0, 0.955, 0.98, 0.995]
 "likelihood-n3" => [1.0, 0.99, 1.0, 0.925, 0.995, 0.985, 0.975, 0.965, 1.0, 1.0  …  1.0, 0.99, 0.98, 0.995, 0.985, 0.985, 0.995, 0.965, 0.98, 1.0]

In [12]:
# Render the benchmark results as a table with one column per language plus an average.
# NOTE(review): the meaning of `threshold` is defined in benchmark.jl — confirm there.
showtable(WB, LANGS, threshold=-0.5)

|               | Average | ara     | bel    | ben     | bul    | cat    | ces    | dan    | deu    | ell     | eng     | epo     | fas    | fin    | fra     | hau    | hbs   | heb     | hin    | hun     | ido    | ina    | isl    | ita    | jpn    | kab    | kor    | kur    | lat    | lit     | mar    | mkd    | msa    | nds    | nld    | nor    | pol     | por    | ron    | rus    | slk    | spa     | swa    | swe    | tat    | tgl    | tur    | ukr     | vie    | yid    | zho     |
|---------------|---------|---------|--------|---------|--------|--------|--------|--------|--------|---------|---------|---------|--------|--------|---------|--------|-------|---------|--------|---------|--------|--------|--------|--------|--------|--------|--------|--------|--------|---------|--------|--------|--------|--------|--------|--------|---------|--------|--------|--------|--------|---------|--------|--------|--------|--------|--------|---------|--------|--------|---------|
| likelihood-n4 |  9

|               | Average | ara     | bel    | ben     | bul    | cat    | ces    | dan    | deu    | ell     | eng     | epo     | fas    | fin    | fra     | hau    | hbs   | heb     | hin    | hun     | ido    | ina    | isl    | ita    | jpn    | kab    | kor    | kur    | lat    | lit     | mar    | mkd    | msa    | nds    | nld    | nor    | pol     | por    | ron    | rus    | slk    | spa     | swa    | swe    | tat    | tgl    | tur    | ukr     | vie    | yid    | zho     |
|---------------|---------|---------|--------|---------|--------|--------|--------|--------|--------|---------|---------|---------|--------|--------|---------|--------|-------|---------|--------|---------|--------|--------|--------|--------|--------|--------|--------|--------|--------|---------|--------|--------|--------|--------|--------|--------|---------|--------|--------|--------|--------|---------|--------|--------|--------|--------|--------|---------|--------|--------|---------|
| likelihood-n4 |  95.59% | 100.00% | 99.00% | 100.00% | 98.00% | 99.50% | 98.00% | 98.50% | 97.00% |  99.50% | 100.00% | 100.00% | 99.00% | 99.00% | 100.00% | 98.50% | 5.00% | 100.00% | 87.50% |  99.00% | 87.50% | 92.50% | 98.50% | 99.00% | 94.00% | 98.50% | 93.50% | 99.00% | 97.00% | 100.00% | 92.00% | 98.00% | 99.00% | 97.00% | 97.00% | 91.50% |  99.00% | 99.00% | 91.00% | 99.50% | 93.00% | 100.00% | 99.00% | 98.00% | 99.50% | 98.50% | 98.50% | 100.00% | 95.50% | 98.00% |  99.50% |
| likelihood-n3 |  95.27% | 100.00% | 99.00% | 100.00% | 92.50% | 99.50% | 98.50% | 97.50% | 96.50% | 100.00% | 100.00% |  99.50% | 99.00% | 99.00% | 100.00% | 99.00% | 5.00% | 100.00% | 80.00% | 100.00% | 87.00% | 84.50% | 98.50% | 98.00% | 94.50% | 99.00% | 93.50% | 99.00% | 96.50% | 100.00% | 96.00% | 97.00% | 99.00% | 98.50% | 96.50% | 91.00% | 100.00% | 98.00% | 89.00% | 99.50% | 96.00% | 100.00% | 99.00% | 98.00% | 99.50% | 98.50% | 98.50% |  99.50% | 96.50% | 98.00% | 100.00% |


In [13]:
# Wikipedia test samples for "hbs" only, used below to inspect individual predictions.
hbs = WikiDataSet("corpus/wikipedia/test", langs=["hbs"]);

In [14]:
# Draw one random "hbs" sample (x = text, y = label) and show the
# (predicted, expected) pair from the 4-gram detector.
x,y = hbs[rand(1:length(hbs))]
detector_lh(x, ngram=4), y

("mkd", "hbs")

In [13]:
# Benchmark the default (5-gram) detector with the evaluation restricted to "mkd" and
# "hbs". detector_lh itself still chooses among all of LANGS, since its `languages`
# default is not overridden here.
WB2 = benchmark(
    "likelihood-n5"=>detector_lh, 
    # "likelihood-n4"=>t->detector_lh(t, ngram=4),
    # "likelihood-n3"=>t->detector_lh(t, ngram=3),
dataset=WV, languages=("mkd", "hbs"))

likelihood-n5

  7.841419 seconds (90.28 M allocations: 1.665 GiB, 3.70% gc time, 1.24% compilation time)


1-element Vector{Any}:
 "likelihood-n5" => [0.985, 0.065]