# RNN language model
Loosely based on [Zaremba et al. 2014](https://arxiv.org/abs/1409.2329), this example trains a word-based RNN language model on Mikolov's PTB data with a 10K-word vocabulary. It uses the `batchSizes` feature of `rnnforw` to process batches containing sentences of different lengths. The `mb` minibatching function sorts the sentences in a corpus by length and tries to group similarly sized sentences together. For an example that uses fixed-length batches and crosses sentence boundaries, see the [charlm](https://github.com/denizyuret/Knet.jl/blob/master/examples/charlm/charlm.ipynb) notebook. **TODO:** convert to the new RNN interface; check for performance regression (40s->90s, 2000ppl?).

In [1]:
# Install Knet only when it cannot already be located on the load path.
# (Pkg.installed() is deprecated in recent Pkg releases, so it is no longer used here.)
using Pkg; Base.find_package("Knet") === nothing && Pkg.add("Knet");

In [2]:
# Training schedule
EPOCHS     = 10       # number of passes over the training minibatches
BATCHSIZE  = 64       # sentences per minibatch (passed to `mb`)

# Model shape
RNNTYPE    = :lstm    # forwarded to `rnninit` as `rnnType`
NUMLAYERS  = 1        # forwarded to `rnninit` as `numLayers`
EMBEDSIZE  = 128      # rows of the embedding matrix / RNN input size
HIDDENSIZE = 256      # RNN hidden size / columns of the output matrix
VOCABSIZE  = 10000    # length(vocab)+1; the last id doubles as the EOS token
DROPOUT    = 0.5      # dropout probability used during training

# Adam optimizer settings
LR         = 0.001
BETA_1     = 0.9
BETA_2     = 0.999
EPS        = 1e-08;

In [3]:
# Load the PTB splits and vocabulary using the loader script shipped with Knet
using Knet
include(Knet.dir("data", "mikolovptb.jl"))
trn, val, tst, vocab = mikolovptb()
# +1 for the EOS token
@assert VOCABSIZE == length(vocab) + 1
# Show a one-line summary of each loaded collection
foreach(d -> println(summary(d)), (trn, val, tst, vocab))

42068-element Array{Array{UInt16,1},1}
3370-element Array{Array{UInt16,1},1}
3761-element Array{Array{UInt16,1},1}
9999-element Array{String,1}


In [4]:
# Print the first test sentence, first as word ids and then as words
let sample = tst[1]
    println(sample)
    println(vocab[sample])
end

UInt16[0x008e, 0x004e, 0x0036, 0x00fb, 0x0938, 0x0195]
["no", "it", "was", "n't", "black", "monday"]


In [5]:
# Display the docstring of the data loader
@doc mikolovptb

```
mikolovptb()
```

Read [PTB](https://catalog.ldc.upenn.edu/ldc99t42) text from Mikolov's [RNNLM](http://www.fit.vutbr.cz/~imikolov/rnnlm) toolkit which has been lowercased and reduced to a 10K vocabulary size.  Return a tuple (trn,dev,tst,vocab) where

```
trn::Vector{Vector{UInt16}}: 42068 sentences, 887521 words
dev::Vector{Vector{UInt16}}: 3370 sentences, 70390 words
tst::Vector{Vector{UInt16}}: 3761 sentences, 78669 words
vocab::Vector{String}: 9999 unique words
```


In [6]:
"""
    mb(sentences, batchsize; eos=VOCABSIZE)

Minibatch `sentences` into a vector of `(x, y, b)` triples, one per batch of at most
`batchsize` sentences. This is the most complicated part of the code.

For language models `x` and `y` contain the same words shifted by one position:
`x` has an EOS at the beginning, `y` has an EOS at the end.

- `x, y = [s11,s21,s31,...,s12,s22,...]`, i.e. all the first words followed by all the
  second words etc.
- `b = [b1,b2,...,bT]`, i.e. how many sentences have first words, how many have second
  words etc.
- `length(x) == length(y) == sum(b)` and `length(b) == length(s1)+1` (+1 because of EOS).

Sentences are sorted from longest to shortest before batching, so `s1` is the longest
sentence of its batch. An empty sentence contributes the single pair `(eos, eos)`.

# Keywords
- `eos`: id used for the end-of-sentence token; defaults to the global `VOCABSIZE`.
"""
function mb(sentences, batchsize; eos=VOCABSIZE)
    sentences = sort(sentences, by=length, rev=true)
    # Concretely typed container instead of an untyped `[]` (Vector{Any})
    data = Tuple{Vector{UInt16},Vector{UInt16},Vector{UInt16}}[]
    for i = 1:batchsize:length(sentences)
        j = min(i+batchsize-1, length(sentences))
        sij = view(sentences, i:j)
        T = 1 + length(sij[1])          # longest sentence in the batch, plus EOS
        x = UInt16[]; y = UInt16[]; b = UInt16[]
        for t = 1:T
            bt = 0
            for s in sij
                n = length(s)
                # sentences are sorted longest-first, so once one is exhausted
                # every following sentence in the batch is exhausted as well
                t > n + 1 && break
                push!(x, t == 1 ? eos : s[t-1])   # input: EOS first, then previous word
                push!(y, t <= n ? s[t] : eos)     # target: current word, EOS at the end
                bt += 1
            end
            push!(b, bt)
        end
        push!(data, (x, y, b))
    end
    return data
end

# Minibatch every split with the same batch size, then report the number of batches in each
mbtrn, mbval, mbtst = map(split -> mb(split, BATCHSIZE), (trn, val, tst))
length.((mbtrn, mbval, mbtst))

(658, 53, 59)

In [7]:
# Define model
"""
    initmodel()

Create the model parameters and return them as the tuple `(r, wr, wx, wy, by)`:
the RNN descriptor and its weights from `rnninit`, an `EMBEDSIZE x VOCABSIZE`
embedding matrix, a `VOCABSIZE x HIDDENSIZE` output matrix and a `VOCABSIZE x 1`
output bias. All sizes and RNN options are read from the global hyperparameters.
"""
function initmodel()
    # Float32 KnetArray constructors: Xavier-initialized weights and zero biases
    xavierarray(dims...) = KnetArray(xavier(Float32, dims...))
    zeroarray(dims...) = KnetArray(zeros(Float32, dims...))
    rnn, rnnweights = rnninit(
        EMBEDSIZE, HIDDENSIZE;
        rnnType=RNNTYPE, numLayers=NUMLAYERS, dropout=DROPOUT,
    )
    embedding = xavierarray(EMBEDSIZE, VOCABSIZE)
    outweights = xavierarray(VOCABSIZE, HIDDENSIZE)
    outbias = zeroarray(VOCABSIZE, 1)
    return rnn, rnnweights, embedding, outweights, outbias
end;

In [8]:
# Define loss and its gradient
"""
    predict(ws, xs, bs; pdrop=0)

Compute output scores for the word ids `xs` using the model tuple `ws`
(as returned by `initmodel`). `bs` is passed to `rnnforw` as `batchSizes`, and
`pdrop` is the dropout probability applied to the embeddings and to the RNN output.
Shape notation in the comments: X=embedding size, H=hidden size, V=vocabulary size.
"""
function predict(ws, xs, bs; pdrop=0)
    rnn, rnnweights, embedding, outweights, outbias = ws
    # xs=(ΣBt) -> embedded=(X,ΣBt)
    embedded = dropout(embedding[:, xs], pdrop)
    # hidden=(H,ΣBt)
    (hidden, _) = rnnforw(rnn, rnnweights, embedded, batchSizes=bs)
    hidden = dropout(hidden, pdrop)
    # return=(V,ΣBt)
    return outweights * hidden .+ outbias
end

# Negative log likelihood (Knet `nll`) of the targets `y` given the scores predicted
# from inputs `x` and batch sizes `b`; keyword arguments are forwarded to `predict`.
function loss(w, x, y, b; o...)
    scores = predict(w, x, b; o...)
    return nll(scores, y)
end

# Companion of `loss` whose result is destructured as (gradients, loss value) in `train`.
lossgradient = gradloss(loss);

In [9]:
# Train and test loops
"""
    train(model, data, optim)

Run one pass over the `(x, y, b)` minibatches in `data`, updating `model` in place
with `optim` after every batch (dropout probability `DROPOUT`). Return the batch
losses averaged with each batch weighted by its number of target words.
"""
function train(model, data, optim)
    weightedloss = 0
    nwords = 0
    for batch in data
        inputs, targets, sizes = batch
        grads, batchloss = lossgradient(model, inputs, targets, sizes; pdrop=DROPOUT)
        update!(model, grads, optim)
        ntargets = length(targets)
        weightedloss = weightedloss + ntargets * batchloss
        nwords = nwords + ntargets
    end
    return weightedloss / nwords
end

"""
    test(model, data)

Evaluate `model` on the `(x, y, b)` minibatches in `data` without updating it and
without dropout. Return the batch losses averaged with each batch weighted by its
number of target words.
"""
function test(model, data)
    weightedloss = 0
    nwords = 0
    for batch in data
        inputs, targets, sizes = batch
        batchloss = loss(model, inputs, targets, sizes)
        ntargets = length(targets)
        weightedloss = weightedloss + ntargets * batchloss
        nwords = nwords + ntargets
    end
    return weightedloss / nwords
end;

In [10]:
# Drop references to any previous model/optimizer, then free gpu memory
model = nothing
optim = nothing
Knet.gc()
if isfile("rnnlm.jld2")
    # A saved model exists: load it and only evaluate on the three splits
    model = Knet.load("rnnlm.jld2","model")
    @time global j1 = test(model,mbtrn)
    @time global j2 = test(model,mbval)
    @time global j3 = test(model,mbtst)
    # prints perplexity = exp(negative_log_likelihood)
    println((EPOCHS,exp(j1),exp(j2),exp(j3))); flush(stdout)
else
    # No saved model: initialize, train for EPOCHS epochs, then save
    model = initmodel()
    optim = optimizers(model,Adam,lr=LR,beta1=BETA_1,beta2=BETA_2,eps=EPS)
    for epoch in 1:EPOCHS
        @time global j1 = train(model,mbtrn,optim)  # ~39 seconds
        @time global j2 = test(model,mbval)         # ~1 second
        @time global j3 = test(model,mbtst)         # ~1 second
        # prints perplexity = exp(negative_log_likelihood)
        println((epoch,exp(j1),exp(j2),exp(j3))); flush(stdout)
    end
    Knet.save("rnnlm.jld2","model",model)
end
summary(model)

 34.825407 seconds (4.88 M allocations: 265.037 MiB, 21.56% gc time)
  2.540304 seconds (37.73 k allocations: 3.262 MiB, 18.31% gc time)
  3.755211 seconds (43.40 k allocations: 3.718 MiB, 16.92% gc time)
(10, 1209.7198f0, 1523.0144f0, 1388.2964f0)


"Tuple{RNN,KnetArray{Float32,3},KnetArray{Float32,2},KnetArray{Float32,2},KnetArray{Float32,2}}"