In [1]:
"""
    normalize_text(text; blacklist=["wikipedia", "tatoeba"])

Return a lowercased copy of `text` reduced to letters separated by single spaces.

Processing steps, in order:
1. `http://` / `https://` URLs and e-mail-like tokens are replaced by a space.
2. Every character that is not a Unicode letter (`\\p{L}`) is replaced by a space.
3. The text is padded with one space on each side and lowercased.
4. Every occurrence of each `blacklist` entry is replaced by a space.
5. Runs of two or more whitespace characters are collapsed to one space.

# Keywords
- `blacklist`: patterns to strip from the lowercased text. String and character
  entries are matched case-insensitively; any other pattern accepted by `replace`
  is used unchanged.
"""
function normalize_text(text; blacklist=["wikipedia", "tatoeba"])
    text = replace(text, r"https?://[-_.?&~;+=/#0-9A-Za-z]{1,2076}" => " ")
    text = replace(text, r"[-_.0-9A-Za-z]{1,64}@[-_0-9A-Za-z]{1,255}[-_.0-9A-Za-z]{1,255}" => " ")
    text = replace(text, r"[^\p{L}]" => " ")
    text = lowercase(" $text ")
    for w in blacklist
        # The text is already lowercase at this point, so an entry such as
        # "Wikipedia" could never match unless it is lowercased as well.
        pattern = w isa Union{AbstractString, AbstractChar} ? lowercase(w) : w
        text = replace(text, pattern => " ")
    end
    return replace(text, r"\s\s+" => " ")
end
"""
    ngrams(text::AbstractString, n, counter=Dict{NTuple{n, UInt8}, Float64}())

Count every window of `n` consecutive bytes of `text` (as a `String`) and add the
counts to `counter`, keyed by the window as a tuple of `UInt8`.

`counter` is updated in place and returned, so counts from several calls can be
accumulated by passing the same dictionary again. When `text` has fewer than `n`
bytes nothing is added.
"""
function ngrams(text::AbstractString, n, counter=Dict{NTuple{n, UInt8}, Float64}())
    bytes = codeunits(string(text))
    for start in 1:(length(bytes) - n + 1)
        stop = start + n - 1
        key = Tuple(view(bytes, start:stop))
        counter[key] = 1 + get(counter, key, 0)
    end
    return counter
end

ngrams (generic function with 2 methods)

In [2]:
# Accumulate the byte trigrams of "abcde" and the byte bigrams of "fff" in one
# shared counter; the loosely typed key (`NTuple`) admits tuples of any length.
counter = Dict{NTuple, Float64}()
for (sample, width) in (("abcde", 3), ("fff", 2))
    ngrams(sample, width, counter)
end
counter

Dict{Tuple{Vararg{T, N}} where {N, T}, Float64} with 4 entries:
  (0x62, 0x63, 0x64) => 1.0
  (0x63, 0x64, 0x65) => 1.0
  (0x61, 0x62, 0x63) => 1.0
  (0x66, 0x66)       => 2.0

In [3]:
# Scale every value of `d` by `n`, returning a new dictionary of the same concrete
# type as `d`; `d` itself is left untouched.
# NOTE(review): this extends a Base operator on Base types (type piracy); it is
# kept because the cells below rely on the `number * dict` syntax.
function Base.:*(n::Number, d::Dict)
    scaled = (key => value * n for (key, value) in d)
    return typeof(d)(scaled)
end

# Key-wise sum of two dictionaries: values of keys present in both are added,
# keys present in only one are carried over. Neither argument is modified.
# NOTE(review): this extends a Base operator on Base types (type piracy); it is
# kept because the cells below rely on the `dict + dict` syntax.
Base.:+(d1::Dict, d2::Dict) = mergewith(+, d1, d2)

In [4]:
# Exercise both Dict operators: scale all counts by 3.3, then add the originals.
3.3 * counter + counter

Dict{Tuple{Vararg{T, N}} where {N, T}, Float64} with 4 entries:
  (0x62, 0x63, 0x64) => 4.3
  (0x63, 0x64, 0x65) => 4.3
  (0x61, 0x62, 0x63) => 4.3
  (0x66, 0x66)       => 8.6

In [8]:
# Combined byte bigram and trigram counts of the normalised sample text.
c2 = let normalized = normalize_text("中国")
    ngrams(normalized, 2) + ngrams(normalized, 3)
end

Dict{Tuple{UInt8, UInt8, Vararg{UInt8}}, Float64} with 13 entries:
  (0x20, 0xe4, 0xb8) => 1.0
  (0xad, 0xe5, 0x9b) => 1.0
  (0xe5, 0x9b)       => 1.0
  (0x9b, 0xbd)       => 1.0
  (0xe5, 0x9b, 0xbd) => 1.0
  (0xb8, 0xad, 0xe5) => 1.0
  (0xbd, 0x20)       => 1.0
  (0xb8, 0xad)       => 1.0
  (0xe4, 0xb8)       => 1.0
  (0x20, 0xe4)       => 1.0
  (0xad, 0xe5)       => 1.0
  (0x9b, 0xbd, 0x20) => 1.0
  (0xe4, 0xb8, 0xad) => 1.0

In [71]:
"""
    dump_ngrams(D, filename)

Write the n-gram table `D` to `filename`, one entry per line, as
`<hex key>,<value>`.

Each byte of a key is written as exactly two lowercase hexadecimal digits, which
is the layout `load_ngrams` decodes (it splits the key into two-character
groups). The file is created or truncated. Returns `nothing`.
"""
function dump_ngrams(D::Dict{T, Float64}, filename) where T <: Tuple{Vararg{UInt8}}
    open(filename, "w") do f
        for (k, v) in D
            # pad=2 is essential: without it a byte below 0x10 would be written as
            # a single digit and the key could no longer be split back into bytes.
            hexkey = join(string(b, base=16, pad=2) for b in k)
            print(f, hexkey, ',', v, '\n')
        end
    end
    return nothing
end

"""
    load_ngrams(filename)

Read an n-gram table from `filename` and return it as a `Dict` mapping byte
tuples to `Float64` values.

Each non-blank line must have the form `<hex key>,<value>`, where the key is a
sequence of two-digit hexadecimal bytes. Blank lines are skipped.

# Throws
- `ArgumentError` if a key has an odd number of hex digits or contains a
  character that is not a hex digit, or if a value is not a valid `Float64`.
"""
function load_ngrams(filename)
    open(filename) do f
        # Left untyped on purpose: `Dict(entries)` then derives its key and value
        # types from the entries that were actually read.
        entries = []
        for line in eachline(f)
            isempty(strip(line)) && continue
            k, v = split(line, ",")
            # Explicit check instead of `@assert`, which may be compiled out and
            # would then let a truncated key be decoded into wrong bytes.
            iseven(length(k)) || throw(ArgumentError(
                "n-gram key must have an even number of hex digits, got $(repr(k))"))
            push!(entries, Tuple(hex2bytes(k)) => parse(Float64, v))
        end
        Dict(entries)
    end
end

load_ngrams (generic function with 1 method)

In [72]:
# Persist the combined counts, scaled by 2.1, to a text file.
dump_ngrams(2.1 * c2, "ct.txt")

In [73]:
# Read the table back from the file written by the previous cell.
load_ngrams("ct.txt")

Dict{Tuple{UInt8, UInt8, Vararg{UInt8}}, Float64} with 13 entries:
  (0xe5, 0x9b)       => 2.1
  (0xb8, 0xad, 0xe5) => 2.1
  (0xe4, 0xb8)       => 2.1
  (0x20, 0xe4)       => 2.1
  (0xad, 0xe5, 0x9b) => 2.1
  (0x9b, 0xbd)       => 2.1
  (0xe5, 0x9b, 0xbd) => 2.1
  (0xbd, 0x20)       => 2.1
  (0xb8, 0xad)       => 2.1
  (0xad, 0xe5)       => 2.1
  (0x9b, 0xbd, 0x20) => 2.1
  (0xe4, 0xb8, 0xad) => 2.1
  (0x20, 0xe4, 0xb8) => 2.1