In [1]:
using JLD,JLD2,Knet

In [2]:
include("types.jl")
include("pre_processing.jl")



minibatch (generic function with 1 method)

In [3]:
# Load the pretrained model file into a Dict keyed by name (weights, vocabularies, special tokens).
# NOTE(review): the file has a .jld2 extension but is read with JLD.load, not JLD2 — confirm this is intended.
d = JLD.load("pretrained_model.jld2")

Dict{String,Any} with 13 entries:
  "back"       => Any[Float32[1.70814 0.787587 … -0.636378 -1.76298; 4.32346 5.…
  "word_vocab" => Dict{AbstractString,Int64}("null"=>8668,"Secure"=>7932,"Puppy…
  "char_vocab" => Dict('\x00\x00\x00\x42'=>37,'\x00\x00\x20\x1d'=>116,'\x00\x00…
  "eowchar"    => '\x00\x00\x00\x13'
  "sowchar"    => '\x00\x00\x00\x12'
  "forw"       => Any[Float32[1.47578 0.055072 … -0.2823 -1.58338; -1.95253 -0.…
  "eosword"    => "</s>"
  "sosword"    => "<s>"
  "soft"       => Array{Float32,2}[[-0.303084 -1.18525 … -0.281071 -0.2053; -0.…
  "unkword"    => "<unk>"
  "cembed"     => Float32[0.593419 0.190279 … 0.11003 -0.000173569; -0.236287 0…
  "char"       => Any[Float32[0.0708071 -0.456333 … 0.557484 0.094816; -1.61625…
  "unkchar"    => '\x00\x00\x00\x11'

In [4]:
# Build the Vocab from the loaded dictionary (create_vocab is defined outside this chunk).
v = create_vocab(d)

Vocab(Dict('\x00\x00\x00\x42'=>37,'\x00\x00\x20\x1d'=>116,'\x00\x00\x00\x58'=>87,'\x00\x00\x20\x44'=>110,'\x00\x00\x00\x56'=>59,'\x00\x00\x00\xd7'=>107,'\x00\x00\x00\x48'=>29,'\x00\x00\x00\x75'=>5,'\x00\x00\x00\x50'=>11,'\x00\x00\x00\x7a'=>61…), Dict{String,Int64}(), Dict("null"=>8668,"Secure"=>7932,"Puppy"=>9040,"progression"=>11757,"gathered"=>6954,"Core"=>3600,"lovers"=>8832,"underground"=>9093,"backup"=>5047,"caught"=>2501…), "<s>", "</s>", "<unk>", '\x00\x00\x00\x12', '\x00\x00\x00\x13', '\x00\x00\x00\x11', Dict("NUM"=>0x09,"DET"=>0x06,"X"=>0x11,"VERB"=>0x10,"PROPN"=>0x0c,"ADV"=>0x03,"ADJ"=>0x01,"AUX"=>0x04,"PUNCT"=>0x0d,"PRON"=>0x0b…), Dict("reparandum"=>0x23,"root"=>0x01,"vocative"=>0x24,"csubj"=>0x0f,"advmod"=>0x04,"expl"=>0x14,"iobj"=>0x18,"mark"=>0x1a,"advcl"=>0x03,"nmod"=>0x1b…))

In [5]:
# Read the training treebank into a collection of sentences (load_conllu is defined outside this chunk).
corpus = load_conllu("tr_imst-ud-train.conllu",v)

3685-element Array{Any,1}:
 Sanal parçacıklar sa bunların hiçbirini yapamazlar .                                                                                                                                                                                                                   
 Ona her şeyimi verdim .                                                                                                                                                                                                                                                
 Karşısında , pantolonu dizlerine dek ıslak , önlük torbası ham eriklerle dolu İbrahim dikiliyordu .                                                                                                                                                                    
 Nereden biliyorsunuz .                                                                                                                                                           

In [6]:
# Take the first sentence of the corpus for inspection.
s = corpus[1]

Sanal parçacıklar sa bunların hiçbirini yapamazlar . 

In [7]:
"""
    fillwvecs!(sentences, isents, wembed; GPUFEATS=false)

Refill the `wvec` field of every sentence with one column of `wembed` per word.

`sentences` and `isents` are walked in lockstep; each element of `isents` is the
collection of column indices (word ids) for the matching sentence. The existing
contents of `wvec` are discarded first. When `GPUFEATS` is `false` each column is
converted with `Array` before it is stored; otherwise the slice is stored as-is.
"""
function fillwvecs!(sentences, isents, wembed; GPUFEATS=false)
    for (sent, wordids) in zip(sentences, isents)
        empty!(sent.wvec)
        for wid in wordids
            column = wembed[:, wid]
            push!(sent.wvec, GPUFEATS ? column : Array(column))
        end
    end
end

fillwvecs! (generic function with 1 method)

In [145]:
"""
    fillcvecs!(sentences, forw, back; GPUFEATS=false)

Refill the `fvec` and `bvec` fields of every sentence from per-timestep state matrices.

`forw` and `back` are indexed by timestep; column `i` of each matrix belongs to the
`i`-th sentence. A sentence of length `N` reads the last `N` timesteps, i.e. word `n`
takes timestep `length(forw) - N + n`. When `GPUFEATS` is `false` every column is
converted with `Array` before being stored.
"""
function fillcvecs!(sentences, forw, back; GPUFEATS=false)
    nsteps = length(forw)
    for (col, sent) in enumerate(sentences)
        empty!(sent.fvec)
        empty!(sent.bvec)
        nwords = length(sent)
        offset = nsteps - nwords
        for n in 1:nwords
            t = offset + n
            fcol = forw[t][:, col]
            push!(sent.fvec, GPUFEATS ? fcol : Array(fcol))
            bcol = back[t][:, col]
            push!(sent.bvec, GPUFEATS ? bcol : Array(bcol))
        end
    end
end

fillcvecs! (generic function with 1 method)

In [173]:
"""
    lmloss(model, data, mask, forw, back; result=nothing)

Return the summed log-probability of the target ids in `data` under the softmax layer
of `model`.

For every timestep `t` the scores are `wsoft(model) * vcat(forw[t], back[t]) .+ bsoft(model)`,
normalised along dimension 1 with `logp`. Only batch entries with `mask[t][b] == 1`
contribute; for those the entry in row `data[t][b]`, column `b` is added to the total.

# Keywords
- `result`: when given, an indexable accumulator; `result[1]` is incremented by the
  untracked value of the total and `result[2]` by the number of entries counted.

The returned total is a sum of log-probabilities; it is neither negated nor averaged here.
"""
function lmloss(model, data, mask, forw, back; result=nothing)
    T = length(data)
    B = length(data[1])
    weight, bias = wsoft(model), bsoft(model)

    total = 0
    npred = 0
    for t in 1:T
        ypred = weight * vcat(forw[t], back[t]) .+ bias
        nrows = size(ypred, 1)
        # Linear (column-major) index of row data[t][b] in column b, for unmasked entries only.
        index = Int[data[t][b] + (b - 1) * nrows for b in 1:B if mask[t][b] == 1]
        logprobs = logp(ypred, dims=1)
        picked = logprobs[index]
        total += sum(picked)
        npred += length(picked)
    end

    if result !== nothing
        # `value` replaces the deprecated `getval` (strips AutoGrad tracking, if any).
        result[1] += AutoGrad.value(total)
        result[2] += npred
    end
    return total
end

lmloss (generic function with 1 method)

In [147]:
"""
    wordlstm(model, data, mask, embeddings)

Run the forward and backward word-level LSTMs of `model` over a batch and return
`(fhiddens, bhiddens)`, two vectors of `length(data) - 2` hidden states each.

`data[t]` holds the column indices into `embeddings` for timestep `t`, and `mask[t]`
is passed to `_lstm` as the mask for that step. The forward LSTM consumes timesteps
`1:T-2` and stores its hidden state after step `t` at `fhiddens[t]`. The backward LSTM
consumes timesteps `T:-1:3` and stores its hidden state after step `t` at
`bhiddens[t-2]`. Both start from all-zero hidden and cell states. When the forward
weights are a `KnetArray`, the masks are converted to `KnetArray`s as well.
"""
function wordlstm(model, data, mask, embeddings)
    wf, bf = wforw(model), bforw(model)
    nsteps = length(data)
    batchsize = length(data[1])
    hsize = div(length(bf), 4)

    if wf isa KnetArray
        mask = map(KnetArray, mask)
    end

    # Shared all-zero initial state; it is never mutated, so both directions can reuse it.
    zerostate = fill!(similar(bf, hsize, batchsize), 0)

    # Left-to-right pass.
    fhiddens = Vector{Any}(undef, nsteps - 2)
    h = c = zerostate
    for t in 1:nsteps-2
        h, c = _lstm(wf, bf, h, c, embeddings[:, data[t]]; mask=mask[t])
        fhiddens[t] = h
    end

    # Right-to-left pass.
    wb, bb = wback(model), bback(model)
    bhiddens = Vector{Any}(undef, nsteps - 2)
    h = c = zerostate
    for t in nsteps:-1:3
        h, c = _lstm(wb, bb, h, c, embeddings[:, data[t]]; mask=mask[t])
        bhiddens[t-2] = h
    end
    return fhiddens, bhiddens
end

wordlstm (generic function with 1 method)

In [148]:
"""
    charlstm(model, data, mask)

Run the character-level LSTM of `model` over a batch and return the final hidden state.

`data[t]` holds the column indices into `cembed(model)` for timestep `t`, and `mask[t]`
is passed to `_lstm` as the mask for that step. The hidden and cell states start at
zero with shape `(H, B)`, where `H` is a quarter of the bias length and `B` is
`length(data[1])`. When the weights are a `KnetArray`, the masks are converted to
`KnetArray`s as well.
"""
function charlstm(model, data, mask)
    w, b, chartable = wchar(model), bchar(model), cembed(model)
    nsteps = length(data)
    batchsize = length(data[1])
    hsize = div(length(b), 4)

    if w isa KnetArray
        mask = map(KnetArray, mask)
    end

    h = c = fill!(similar(b, hsize, batchsize), 0)
    for t in 1:nsteps
        h, c = _lstm(w, b, h, c, chartable[:, data[t]]; mask=mask[t])
    end
    return h
end

charlstm (generic function with 1 method)

In [149]:
"""
    _lstm(weight, bias, hidden, cell, input; mask=nothing)

Apply one LSTM step and return the new `(hidden, cell)` pair.

The gate pre-activations are `weight * vcat(input, hidden) .+ bias`. With
`H = size(hidden, 1)`, rows `1:H` are the forget gate, `H+1:2H` the input gate,
`2H+1:3H` the output gate (all through `sigm`) and `3H+1:4H` the candidate
(through `tanh`).

# Keywords
- `mask`: optional collection with one entry per column. It is reshaped to a
  `1 × length(mask)` row and multiplied into both the new hidden and cell state, so
  columns with a zero mask come out as zero.
"""
function _lstm(weight, bias, hidden, cell, input; mask=nothing)
    gates = weight * vcat(input, hidden) .+ bias
    H = size(hidden, 1)
    forget = sigm.(gates[1:H, :])
    ingate = sigm.(gates[1+H:2H, :])
    outgate = sigm.(gates[1+2H:3H, :])
    change = tanh.(gates[1+3H:4H, :])

    cell = cell .* forget + ingate .* change
    hidden = outgate .* tanh.(cell)
    # Identity comparison: `!=` against `nothing` goes through generic `==` dispatch.
    if mask !== nothing
        # Row shape so the mask broadcasts across the H rows of each column.
        maskrow = reshape(mask, 1, length(mask))
        hidden = hidden .* maskrow
        cell = cell .* maskrow
    end
    return (hidden, cell)
end

_lstm (generic function with 1 method)

In [150]:
"""
    goldbatch(sentences, maxlen, wdict, unkwid, pad=unkwid)

Build the gold word-id batch for `sentences` and return `(data, mask)`.

Both results are vectors of `maxlen` per-timestep vectors of length
`length(sentences)`. Sentences are right-aligned: a sentence of length `N` fills the
last `N` timesteps with `get(wdict, word, unkwid)` and mask `1`; the leading
timesteps hold `pad` and mask `0`.
"""
function goldbatch(sentences, maxlen, wdict, unkwid, pad=unkwid)
    nsent = length(sentences)
    data = [Vector{Int}(undef, nsent) for _ in 1:maxlen]
    mask = [zeros(Float32, nsent) for _ in 1:maxlen]
    for b in 1:nsent
        sent = sentences[b]
        # Number of leading padding steps for this sentence.
        offset = maxlen - length(sent)
        for t in 1:maxlen
            n = t - offset
            if n > 0
                mask[t][b] = 1
                data[t][b] = get(wdict, sent.word[n], unkwid)
            else
                data[t][b] = pad
            end
        end
    end
    return data, mask
end

goldbatch (generic function with 2 methods)

In [151]:
"""
    tokenbatch(words, maxlen, sos, eos, pad=eos)

Build a right-aligned id batch with start/end markers and return `(data, mask)`.

Both results are vectors of `maxlen + 2` per-timestep vectors of length
`length(words)`. For a sequence of length `N` the last `N + 2` timesteps hold `sos`,
the `N` items, then `eos`, all with mask `1`; earlier timesteps hold `pad` with mask
`0`. The final timestep is therefore always `eos`. A sequence longer than `maxlen`
keeps only the trailing items that fit.
"""
function tokenbatch(words, maxlen, sos, eos, pad=eos)
    nwords = length(words)
    nsteps = maxlen + 2
    data = [Vector{Int}(undef, nwords) for _ in 1:nsteps]
    mask = [Vector{Float32}(undef, nwords) for _ in 1:nsteps]
    @inbounds for t in 1:nsteps
        for b in 1:nwords
            word = words[b]
            wlen = length(word)
            # Position inside the sequence: 0 is sos, 1:wlen the items, wlen+1 is eos.
            cursor = t - nsteps + wlen + 1
            if cursor >= 0
                mask[t][b] = 1
                data[t][b] = if cursor == 0
                    sos
                elseif cursor <= wlen
                    word[cursor]
                elseif cursor == wlen + 1
                    eos
                else
                    error()
                end
            else
                mask[t][b] = 0
                data[t][b] = pad
            end
        end
    end
    return data, mask
end

tokenbatch (generic function with 2 methods)

In [142]:
# Encode each character of `w` as its id in `cdict`, falling back to `unkcid`.
_charids(w, cdict, unkcid) = Int[get(cdict, c, unkcid) for c in w]

"""
    maptoint(sentences, v::Vocab; maxword=32)

Convert `sentences` to integer form and return `(words, sents, maxwordlen, maxsentlen)`.

- `words[i]` is the character-id sequence of the word whose id is `i`.
- `sents[k]` is the word-id sequence of the `k`-th sentence.
- `maxwordlen` / `maxsentlen` are the longest lengths in `words` / `sents`.

Side effect: `v.idict` is emptied and rebuilt. `v.sosword` and `v.eosword` are inserted
first, then each previously unseen word of `sentences` gets the next free id in order
of first appearance. Characters missing from `v.cdict` map to the id of `v.unkchar`.

# Keywords
- `maxword`: character sequences of sentence words longer than this are truncated to
  their first `maxword` ids. The start/end marker words are not truncated.
"""
function maptoint(sentences, v::Vocab; maxword=32)
    wdict = empty!(v.idict)
    cdict = v.cdict
    unkcid = cdict[v.unkchar]
    words = Vector{Int}[]
    sents = Vector{Int}[]
    maxwordlen = 0
    maxsentlen = 0

    # The sentence-boundary words always come first in the dictionary.
    for w in (v.sosword, v.eosword)
        get!(wdict, w, 1 + length(wdict))
        word = _charids(w, cdict, unkcid)
        maxwordlen = max(maxwordlen, length(word))
        push!(words, word)
    end

    for s in sentences
        sent = Vector{Int}(undef, length(s.word))
        for (i, w) in enumerate(s.word)
            nextid = 1 + length(wdict)
            wid = get!(wdict, w, nextid)
            sent[i] = wid
            # `get!` returns `nextid` only when `w` was just inserted.
            if wid == nextid
                word = _charids(w, cdict, unkcid)
                length(word) > maxword && resize!(word, maxword)
                maxwordlen = max(maxwordlen, length(word))
                push!(words, word)
            end
        end
        maxsentlen = max(maxsentlen, length(sent))
        push!(sents, sent)
    end
    # Internal invariant: exactly one character sequence per dictionary entry.
    @assert(length(wdict) == length(words))
    return words, sents, maxwordlen, maxsentlen
end

maptoint (generic function with 1 method)

In [16]:
"""
    makewmodel1(d)

Flatten the model dictionary `d` into a 9-element vector in the order
`cembed`, `char[1:2]`, `forw[1:2]`, `back[1:2]`, `soft[1:2]`.
"""
function makewmodel1(d)
    cembedding = d["cembed"]
    char, forw = d["char"], d["forw"]
    back, soft = d["back"], d["soft"]
    return [cembedding,
            char[1], char[2],
            forw[1], forw[2],
            back[1], back[2],
            soft[1], soft[2]]
end

makewmodel1 (generic function with 1 method)

In [17]:
"""
    makewmodel(d)

Build the flat parameter vector with `makewmodel1(d)` and convert every entry to a
`KnetArray` when `gpu() >= 0`, or to an `Array` otherwise.
"""
function makewmodel(d)
    params = makewmodel1(d)
    atype = gpu() >= 0 ? KnetArray : Array
    return map(atype, params)
end

makewmodel (generic function with 1 method)

In [18]:
# Positional accessors into the flat model vector (same order as `makewmodel1`).
cembed(m) = m[1]   # character embedding table
wchar(m) = m[2]    # character LSTM weight
bchar(m) = m[3]    # character LSTM bias
wforw(m) = m[4]    # forward word LSTM weight
bforw(m) = m[5]    # forward word LSTM bias
wback(m) = m[6]    # backward word LSTM weight
bback(m) = m[7]    # backward word LSTM bias
wsoft(m) = m[8]    # softmax layer weight
bsoft(m) = m[9]    # softmax layer bias

In [174]:
"""
    fillvecs!(wmodel, sentences, vocab; batchsize=128)

Fill the `wvec`, `fvec` and `bvec` fields of every sentence using the language model
`wmodel`, and return `exp(-total / count)` where `total` and `count` are the values
accumulated by `lmloss` over all batches.

Steps:
1. `maptoint` converts the sentences to word ids and per-word character ids (this
   rebuilds `vocab.idict`).
2. Words are run through `charlstm` in batches of `batchsize`; the resulting columns
   are concatenated into one embedding matrix and stored with `fillwvecs!`.
3. Sentences are run through `wordlstm` in batches of `batchsize`; the hidden states
   are stored with `fillcvecs!` and scored against the gold ids from `goldbatch`.

If no entries are counted the returned value is `NaN` (0/0).
"""
function fillvecs!(wmodel, sentences, vocab; batchsize=128)
    words, sents = maptoint(sentences, vocab)
    sow = vocab.cdict[vocab.sowchar]
    eow = vocab.cdict[vocab.eowchar]

    # Word-embedding calculation: one character-LSTM pass per batch of words.
    wbatches = Any[]
    for i in 1:batchsize:length(words)
        j = min(i + batchsize - 1, length(words))
        wij = view(words, i:j)
        maxij = maximum(length, wij)
        cdata, cmask = tokenbatch(wij, maxij, sow, eow)
        push!(wbatches, charlstm(wmodel, cdata, cmask))
    end
    wembed = hcat(wbatches...)
    fillwvecs!(sentences, sents, wembed)

    sos = vocab.idict[vocab.sosword]
    eos = vocab.idict[vocab.eosword]
    unk = vocab.odict[vocab.unkword]
    result = zeros(2)   # [summed log-probability, number of scored entries]
    for i in 1:batchsize:length(sents)
        j = min(i + batchsize - 1, length(sents))
        isentij = view(sents, i:j)
        maxij = maximum(length, isentij)
        wdata, wmask = tokenbatch(isentij, maxij, sos, eos)
        forw, back = wordlstm(wmodel, wdata, wmask, wembed)
        sentij = view(sentences, i:j)
        fillcvecs!(sentij, forw, back)
        odata, omask = goldbatch(sentij, maxij, vocab.odict, unk)
        lmloss(wmodel, odata, omask, forw, back; result=result)
    end
    return exp(-result[1] / result[2])
end

fillvecs! (generic function with 1 method)

In [153]:
# Build the flat 9-element parameter vector from the loaded dictionary (GPU or CPU arrays, per makewmodel).
wmodel = makewmodel(d)

9-element Array{Array{Float32,2},1}:
 [0.593419 0.190279 … 0.11003 -0.000173569; -0.236287 0.455614 … 0.138825 -0.0971059; … ; -0.341316 0.0835031 … -0.0285478 -0.0718531; -0.205958 -0.168918 … -0.13226 -0.0319474]
 [0.0708071 -0.456333 … 0.557484 0.094816; -1.61625 -1.37924 … 0.342474 0.548784; … ; 1.07664 -0.430524 … -0.360356 0.668587; 0.324491 -0.175173 … -0.790751 1.16417]            
 [1.2185; 1.52402; … ; -1.1745; -1.53404]                                                                                                                                        
 [1.47578 0.055072 … -0.2823 -1.58338; -1.95253 -0.402249 … -0.221281 -0.667834; … ; 1.28302 -0.0459206 … -2.12551 -0.95058; 0.924036 0.269601 … 0.515548 7.38111]               
 [-0.916012; -2.25621; … ; -0.166855; 0.169325]                                                                                                                                  
 [1.70814 0.787587 … -0.636378 -1.76298; 4.32346 5.14362 … 0.705938 -0.04

In [154]:
# Inspect the integer encoding of the corpus. Note that maptoint empties and rebuilds v.idict.
words, sents, maxwordlen, maxsentlen = maptoint(corpus, v)

(Array{Int64,1}[[1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1], [1, 1, 1]  …  [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]], Array{Int64,1}[[3, 4, 5, 6, 7, 8, 9], [10, 11, 12, 13, 9], [14, 15, 16, 17, 18, 19, 15, 20, 21, 22, 23, 24, 25, 26, 9], [27, 28, 9], [29, 30, 31, 9], [32, 33, 34, 9], [35, 36, 37, 9], [38, 39], [40, 41, 9], [42, 43, 44, 9]  …  [13779, 13780, 1124, 73, 13781, 13782, 9633, 9], [13783, 13784, 15, 13785, 13786, 13787, 606, 607, 99, 39], [13788, 9628, 1915, 215, 9], [13789, 15, 854, 658, 4761, 13790, 9], [13791, 13792, 9], [5308, 13793, 6209, 146, 9], [8698, 5901, 160, 13794, 7669, 3910, 13795, 15, 13796, 13797, 1161, 941, 13

In [175]:
# Fill the per-word vectors of every sentence in place; the returned number is exp(-total/count) from lmloss.
fillvecs!(wmodel, corpus, v)

│   caller = #lmloss#43(::Array{Float64,1}, ::Function, ::Array{Array{Float32,2},1}, ::Array{Array{Int64,1},1}, ::Array{Array{Float32,1},1}, ::Array{Any,1}, ::Array{Any,1}) at In[173]:24
└ @ Main ./In[173]:24


9.343284618635387

In [176]:
# Display the corpus again after the fillvecs! call.
corpus

3685-element Array{Any,1}:
 Sanal parçacıklar sa bunların hiçbirini yapamazlar .                                                                                                                                                                                                                   
 Ona her şeyimi verdim .                                                                                                                                                                                                                                                
 Karşısında , pantolonu dizlerine dek ıslak , önlük torbası ham eriklerle dolu İbrahim dikiliyordu .                                                                                                                                                                    
 Nereden biliyorsunuz .                                                                                                                                                           