# Character-based RNN language model trained on 'The Complete Works of William Shakespeare'
Based on http://karpathy.github.io/2015/05/21/rnn-effectiveness

In [21]:
# Model architecture
RNNTYPE    = :lstm   # recurrent cell type handed to rnninit
NUMLAYERS  = 1
HIDDENSIZE = 334
INPUTSIZE  = 168     # width of the character embedding fed to the RNN
VOCABSIZE  = 84      # number of distinct characters
DROPOUT    = 0.0

# Data layout
BATCHSIZE = 256      # parallel sequences per minibatch
SEQLENGTH = 100      # time steps per minibatch

# Adam optimizer settings
LR     = 0.001
BETA_1 = 0.9
BETA_2 = 0.999
EPS    = 1e-8

# Training schedule
EPOCHS = 30;

In [22]:
# Load 'The Complete Works of William Shakespeare' through the Gutenberg
# helper script shipped in Knet's data directory.
using Knet
include(Knet.dir("data", "gutenberg.jl"))
trn, tst, chars = shakespeare()
# One-line description of each returned array (train ids, test ids, character table)
(summary(trn), summary(tst), summary(chars))

("4925284-element Array{UInt8,1}", "525665-element Array{UInt8,1}", "84-element Array{Char,1}")

In [23]:
# Print a sample: map a slice of training ids back to characters and
# concatenate them into one string.
println(join(chars[trn[1020:1210]]))

    Cheated of feature by dissembling nature,
    Deform'd, unfinish'd, sent before my time
    Into this breathing world scarce half made up,
    And that so lamely and unfashionable
   


In [24]:
# Minibatch data
# Lay the token stream `a` out as BATCHSIZE rows and cut it into (input, target)
# blocks of SEQLENGTH columns, where each target is the input shifted by one token.
function mb(a)
    ncols = div(length(a), BATCHSIZE)              # tokens per row; the leftover tail is dropped
    usable = a[1:ncols*BATCHSIZE]
    rows = reshape(usable, ncols, BATCHSIZE)'      # (B,N): every row is a contiguous stretch of `a`
    inputs = rows[:, 1:ncols-1]
    targets = rows[:, 2:ncols]                     # next-token targets
    return minibatch(inputs, targets, SEQLENGTH)   # split into (B,T) blocks
end
dtrn = mb(trn)
dtst = mb(tst)
(length(dtrn), length(dtst))   # number of minibatches in each split

(192, 20)

In [25]:
# Define model
# Returns the RNN descriptor from rnninit together with the weight tuple
# (rnn weights, embedding matrix, output matrix, output bias).
function initmodel()
    knet_xavier(dims...) = KnetArray(xavier(Float32, dims...))
    knet_zeros(dims...) = KnetArray(zeros(Float32, dims...))
    rnn, rnn_w = rnninit(INPUTSIZE, HIDDENSIZE;
                         rnnType=RNNTYPE, numLayers=NUMLAYERS, dropout=DROPOUT)
    embed = knet_xavier(INPUTSIZE, VOCABSIZE)    # one embedding column per character id
    out_w = knet_xavier(VOCABSIZE, HIDDENSIZE)   # hidden state -> character scores
    out_b = knet_zeros(VOCABSIZE, 1)
    return rnn, (rnn_w, embed, out_w, out_b)
end;

In [26]:
# Define loss and its gradient
# Run the network on a batch of character ids `xs`.
# `ws` is the weight tuple (rnn weights, embedding, output matrix, output bias);
# `hx`/`cx` are the incoming hidden and cell states.
# Returns (scores, hy, cy) with one column of scores per (batch, time) position.
function predict(r,ws,xs,hx,cx)
    rnn_w, embed, out_w, out_b = ws
    emb = dropout(embed[:,xs], DROPOUT)                  # xs=(B,T) emb=(X,B,T)
    out, hy, cy = rnnforw(r, rnn_w, emb, hx, cx; hy=true, cy=true) # out=(H,B,T) hy=cy=(H,B,L)
    out = dropout(out, DROPOUT)
    nhidden = size(out,1)
    npositions = size(out,2) * size(out,3)
    flat = reshape(out, nhidden, npositions)             # flat=(H,B*T)
    scores = out_w * flat .+ out_b
    return scores, hy, cy
end

# Loss of the model on inputs `x` with targets `y`, as computed by nll.
# `h` is a two-slot container holding the (hidden, cell) state: it supplies the
# incoming state and is overwritten in place with the outgoing one, so the caller
# can carry state from one minibatch to the next.
function loss(r,w,x,y,h)
    scores, hnext, cnext = predict(r, w, x, h...)
    # getval presumably unwraps the value from gradient tracking -- TODO confirm
    h[1] = getval(hnext)
    h[2] = getval(cnext)
    return nll(scores, y)
end

# lossgradient takes the same arguments as loss and, as used in train, yields a
# (gradient, loss value) pair; the 2 refers to loss's second argument, the weights w.
lossgradient = gradloss(loss,2);

In [27]:
# Train and test loops
# Make one pass over `data`, updating `weights` after every minibatch.
# Returns the mean of the per-minibatch losses.
function train(rnn,weights,data,optim)
    state = Any[nothing,nothing]   # (hidden, cell) slots; loss overwrites them after each batch
    batchlosses = []
    for (inputs, targets) in data
        g, l = lossgradient(rnn, weights, inputs, targets, state)
        update!(weights, g, optim)
        push!(batchlosses, l)
    end
    return mean(batchlosses)
end

# Evaluate the model on `data` without updating the weights.
# Returns the mean of the per-minibatch losses.
function test(rnn,weights,data)
    state = Any[nothing,nothing]   # (hidden, cell) slots; loss overwrites them after each batch
    batchlosses = []
    for (inputs, targets) in data
        l = loss(rnn, weights, inputs, targets, state)
        push!(batchlosses, l)
    end
    return mean(batchlosses)
end;

In [28]:
# Initialize model
# Clear any references left over from an earlier run before calling knetgc
# (presumably so Knet can release the memory they held -- TODO confirm).
rnn = nothing
weights = nothing
optim = nothing
knetgc()
rnn, weights = initmodel()
# Adam optimizer state for the weights
optim = optimizers(weights, Adam; lr=LR, beta1=BETA_1, beta2=BETA_2, eps=EPS);

In [29]:
info("Training...")
@time for epoch = 1:EPOCHS
    @time trnloss = train(rnn, weights, dtrn, optim)   # optimization pass, ~18 seconds
    @time tstloss = test(rnn, weights, dtst)           # held-out evaluation, ~0.5 seconds
    # Report perplexity, i.e. exp of the mean loss
    trnppl = exp(trnloss)
    tstppl = exp(tstloss)
    println((:epoch, epoch, :trnppl, trnppl, :tstppl, tstppl))
end

[1m[36mINFO: [39m[22m[36mTraining...
[39m

 17.084233 seconds (253.34 k allocations: 132.278 MiB, 0.04% gc time)
  0.607702 seconds (38.99 k allocations: 10.744 MiB)
(:epoch, 1, :trnppl, 17.664436f0, :tstppl, 9.183411f0)
 16.843650 seconds (232.61 k allocations: 131.185 MiB, 0.04% gc time)
  0.546336 seconds (8.24 k allocations: 9.039 MiB, 0.18% gc time)
(:epoch, 2, :trnppl, 7.6239805f0, :tstppl, 6.705649f0)
 16.907754 seconds (233.63 k allocations: 131.201 MiB, 0.03% gc time)
  0.552095 seconds (4.40 k allocations: 8.980 MiB)
(:epoch, 3, :trnppl, 6.1014156f0, :tstppl, 5.7781134f0)
 17.008465 seconds (234.47 k allocations: 131.212 MiB, 0.03% gc time)
  0.551228 seconds (7.79 k allocations: 9.032 MiB, 0.16% gc time)
(:epoch, 4, :trnppl, 5.393814f0, :tstppl, 5.245802f0)
 17.086273 seconds (235.00 k allocations: 131.220 MiB, 0.03% gc time)
  0.553913 seconds (4.40 k allocations: 8.980 MiB)
(:epoch, 5, :trnppl, 4.933017f0, :tstppl, 4.8756704f0)
 17.130323 seconds (234.47 k allocations: 131.212 MiB, 0.03% gc time)
  0.555645 seconds

In [30]:
# Sample from trained model
# Print `n` characters generated by the model. Generation starts from the newline
# character; every sampled character is fed back in as the next input, with the
# hidden and cell states carried along between steps.
function generate(rnn,weights,n)
    # Draw an index at random, weighted by the softmax of the scores `y`.
    function sample(y)
        probs = Array(exp.(y-logsumexp(y)))   # normalized probabilities as a plain Array
        u = rand()
        for k = 1:length(probs)
            u -= probs[k]
            u < 0 && return k
        end
        # Rounding can leave sum(probs) marginally at or below the drawn number, so
        # the loop may end without returning. Fall back to the last index instead
        # of returning `nothing`, which would make `chars[x]` below throw.
        return length(probs)
    end
    h,c = nothing,nothing
    x = findfirst(chars,'\n')
    for i=1:n
        y,h,c = predict(rnn,weights,[x],h,c)   # single-character input
        x = sample(y)
        print(chars[x])
    end
    println()
end

# Print 1000 characters sampled from the trained model
generate(rnn,weights,1000)

    Obeath for him e'en take he.
  CASSIO. Glad my long-string necessery, come, do Ebsero.
    And when he hie's battles, his father's queen!
    And though thou not so brufy to hear and
    Locke the castle the mouldy murtherers of your
    To charge thee a child shin, I was all. When draws Tit
    After sheek for merect to good where,
    Strike, try, Christman, of the death of comfort,
    And struttians, commvan-kinver in't.
  LYSANDER. So, my lord.
  TIMON. They could not conduct hawh'b done, by my gusts,
    And shall wear me what common an ene?
    To this, and weak a secuad, which never find
    To bed, indeed, truly asham'd out of gracious hars!
    They pleds her own fault.
  SHYLOCK. Caesar I now hurt, I do; what, right and air stands?
  FIRST BACWBRGMAM. And ask them, away these rush blood; and this I can not
    unfector the noble traitor to command'nor I bear her.
  CORIOLANUS. From your fools of two order of starp, but letter
    With like guards to re
