# Character based RNN language model trained on 'The Complete Works of William Shakespeare'
Based on http://karpathy.github.io/2015/05/21/rnn-effectiveness

In [1]:
# --- Model architecture ---
RNNTYPE    = :lstm   # recurrent cell type handed to rnninit
INPUTSIZE  = 168     # embedding width: rows of wx and input size of the RNN
VOCABSIZE  = 84      # number of distinct characters (columns of wx, rows of wy)
HIDDENSIZE = 334     # hidden state width of the RNN
NUMLAYERS  = 1       # number of stacked RNN layers
DROPOUT    = 0.0     # dropout probability used by rnninit and in predict

# --- Data layout ---
BATCHSIZE = 256      # sequences per minibatch
SEQLENGTH = 100      # time steps per minibatch

# --- Adam optimizer settings ---
LR     = 0.001
BETA_1 = 0.9
BETA_2 = 0.999
EPS    = 1e-8

# --- Training schedule ---
EPOCHS = 30;         # passes over the training minibatches

In [2]:
# Load 'The Complete Works of William Shakespeare'
using Knet
# Pull in the Gutenberg data helpers that ship with Knet, which define shakespeare().
include(Knet.dir("data", "gutenberg.jl"))

# trn / tst hold the training and test text as character indices into `chars`.
trn, tst, chars = shakespeare()

# Show a one-line description of each of the three collections.
(summary(trn), summary(tst), summary(chars))

("4925284-element Array{UInt8,1}", "525665-element Array{UInt8,1}", "84-element Array{Char,1}")

In [3]:
# Print a sample
# Map a slice of training indices back to characters and print them as one string.
println(join(chars[trn[1020:1210]]))

    Cheated of feature by dissembling nature,
    Deform'd, unfinish'd, sent before my time
    Into this breathing world scarce half made up,
    And that so lamely and unfashionable
   


In [4]:
# Minibatch data
# Turn the flat index sequence `a` into minibatches for next-character prediction.
#
# Arguments:
#   a         -- vector of character indices
#   batchsize -- number of parallel sequences per minibatch (defaults to BATCHSIZE)
#   seqlength -- number of time steps per minibatch (defaults to SEQLENGTH)
#
# Returns whatever `minibatch` produces for the (input, target) pair, where the
# target matrix is the input matrix shifted one position to the left.
# Trailing elements of `a` that do not fill a whole row are dropped.
function mb(a; batchsize=BATCHSIZE, seqlength=SEQLENGTH)
    N = div(length(a), batchsize)                 # characters per row
    # reshape full data to (B,N) with contiguous rows
    x = reshape(a[1:N*batchsize], N, batchsize)'
    # inputs are columns 1..N-1, targets are the following characters 2..N;
    # split into (B,T) blocks
    return minibatch(x[:, 1:N-1], x[:, 2:N], seqlength)
end
# Minibatch the training and test sets, then report how many batches each yields.
dtrn = mb(trn)
dtst = mb(tst)
map(length, (dtrn, dtst))

(192, 20)

In [5]:
# Define model
# Create a fresh set of model parameters.
#
# Returns the tuple (r, wr, wx, wy, by):
#   r, wr -- the two values returned by rnninit (passed together to rnnforw in predict)
#   wx    -- (INPUTSIZE, VOCABSIZE) embedding matrix, one column per character
#   wy    -- (VOCABSIZE, HIDDENSIZE) output projection
#   by    -- (VOCABSIZE, 1) output bias, initialised to zero
function initmodel()
    # Float32 KnetArray helpers: xavier-initialised weights and zero biases
    xavierweights(dims...) = KnetArray(xavier(Float32, dims...))
    zerobias(dims...) = KnetArray(zeros(Float32, dims...))

    rnnspec, rnnweights = rnninit(INPUTSIZE, HIDDENSIZE;
                                  rnnType=RNNTYPE, numLayers=NUMLAYERS, dropout=DROPOUT)
    embedding = xavierweights(INPUTSIZE, VOCABSIZE)
    projection = xavierweights(VOCABSIZE, HIDDENSIZE)
    bias = zerobias(VOCABSIZE, 1)
    return rnnspec, rnnweights, embedding, projection, bias
end;

In [6]:
# Define loss and its gradient
# Run the model forward on a block of character indices.
#
# Arguments:
#   ws     -- parameter tuple (r, wr, wx, wy, by) as built by initmodel
#   xs     -- character indices, (B,T) during training
#   hx, cx -- initial hidden and cell state handed to rnnforw (train/test/generate
#             start from `nothing`)
#
# Returns (scores, hy, cy): scores has one column per (batch, time) position and
# one row per vocabulary entry; hy and cy are the final states from rnnforw.
function predict(ws,xs,hx,cx)
    rnn, rnnw, embed, wout, bout = ws
    # look up one embedding column per index: xs=(B,T) -> (X,B,T)
    embedded = dropout(embed[:,xs], DROPOUT)
    # out=(H,B,T) hy=cy=(H,B,L)
    out, hy, cy = rnnforw(rnn, rnnw, embedded, hx, cx; hy=true, cy=true)
    out = dropout(out, DROPOUT)
    # flatten batch and time into a single column axis: (H,B*T)
    nhidden = size(out,1)
    ncolumns = size(out,2)*size(out,3)
    flat = reshape(out, nhidden, ncolumns)
    scores = wout*flat .+ bout
    return scores, hy, cy
end

# Negative log likelihood of targets `y` given inputs `x` under parameters `w`.
#
# `h` is a mutable two-element container holding the hidden and cell state. It is
# read as the initial state and overwritten in place with the final state, so that
# the next call continues from where this one stopped.
function loss(w,x,y,h)
    scores, hnext, cnext = predict(w, x, h...)
    # store plain values (getval) for the next call rather than the raw predict outputs
    newh = getval(hnext)
    newc = getval(cnext)
    h[1] = newh
    h[2] = newc
    return nll(scores, y)
end

# Wrap `loss` with gradloss. As called in `train`, the resulting function takes the
# same arguments as `loss` and returns a (gradients, loss value) pair; the gradients
# are then applied to the model with update!.
lossgradient = gradloss(loss);

In [14]:
# Train and test loops
# Run one pass over `data`, updating `model` in place after every minibatch.
#
# The recurrent state starts as `nothing` and is carried from one minibatch to the
# next through `state`, which `loss` overwrites on each call.
# Returns the mean per-minibatch loss.
function train(model,data,optim)
    state = Any[nothing,nothing]
    total = 0
    nbatches = 0
    for (x,y) in data
        grads, batchloss = lossgradient(model, x, y, state)
        update!(model, grads, optim)
        total += batchloss
        nbatches += 1
    end
    return total/nbatches
end

# Evaluate `model` on `data` without updating it.
#
# As in `train`, the recurrent state starts as `nothing` and is threaded through
# successive minibatches via `state`. Returns the mean per-minibatch loss.
function test(model,data)
    state = Any[nothing,nothing]
    total = 0
    nbatches = 0
    for (x,y) in data
        total += loss(model, x, y, state)
        nbatches += 1
    end
    return total/nbatches
end;

In [15]:
# Initialize model
# Drop references to any previous model/optimizer before calling knetgc()
# (presumably so their memory can be reclaimed when this cell is re-run).
model = nothing
optim = nothing
knetgc()

# Fresh parameters, plus Adam optimizer state built for them.
model = initmodel()
optim = optimizers(model, Adam; lr=LR, beta1=BETA_1, beta2=BETA_2, eps=EPS);

In [16]:
info("Training...")
# Time the whole run as well as each train/test pass, and report perplexity
# (exp of the mean loss) on both sets after every epoch.
@time for epoch = 1:EPOCHS
    trnloss = @time train(model, dtrn, optim)   # ~18 seconds
    tstloss = @time test(model, dtst)           # ~0.5 seconds
    println((:epoch, epoch, :trnppl, exp(trnloss), :tstppl, exp(tstloss)))
end

[1m[36mINFO: [39m[22m[36mTraining...
[39m

 17.228594 seconds (243.32 k allocations: 131.754 MiB, 0.05% gc time)
  0.713869 seconds (208.56 k allocations: 19.673 MiB, 0.50% gc time)
(:epoch, 1, :trnppl, 13.917706f0, :tstppl, 7.7539396f0)
 17.002396 seconds (237.14 k allocations: 131.419 MiB, 0.07% gc time)
  0.540640 seconds (4.39 k allocations: 8.980 MiB)
(:epoch, 2, :trnppl, 6.683613f0, :tstppl, 6.022151f0)
 17.100168 seconds (238.03 k allocations: 131.432 MiB, 0.04% gc time)
  0.548066 seconds (8.03 k allocations: 9.035 MiB, 0.17% gc time)
(:epoch, 3, :trnppl, 5.5206413f0, :tstppl, 5.272025f0)
 17.164160 seconds (239.27 k allocations: 131.452 MiB, 0.04% gc time)
  0.553129 seconds (4.39 k allocations: 8.980 MiB)
(:epoch, 4, :trnppl, 4.927807f0, :tstppl, 4.8367276f0)
 17.201483 seconds (239.04 k allocations: 131.448 MiB, 0.03% gc time)
  0.551327 seconds (7.71 k allocations: 9.030 MiB, 0.15% gc time)
(:epoch, 5, :trnppl, 4.5414467f0, :tstppl, 4.532178f0)
 17.227623 seconds (239.63 k allocations: 131.457 MiB, 0.03% gc time)
  

In [18]:
# Sample from trained model
# Print `n` characters sampled from `model`, one at a time, followed by a newline.
#
# Generation is seeded with the newline character and starts from an empty
# (`nothing`) recurrent state. Each sampled character is fed back in as the next
# input while the hidden and cell state are carried forward.
function generate(model,n)
    # Draw an index from the distribution given by the unnormalised log scores `y`.
    function sample(y)
        p,r = Array(exp.(y-logsumexp(y))),rand()
        for j in eachindex(p)
            (r -= p[j]) < 0 && return j
        end
        # Rounding can leave sum(p) slightly below the drawn r, in which case the
        # loop finishes without returning. Fall back to the last index instead of
        # returning `nothing`, which would make `chars[x]` throw.
        return length(p)
    end
    h,c = nothing,nothing
    x = findfirst(chars,'\n')
    for i=1:n
        y,h,c = predict(model,[x],h,c)
        x = sample(y)
        print(chars[x])
    end
    println()
end

# Print 1000 characters sampled from the trained model.
generate(model,1000)

.         Carch if he came 'To those fixed;
     'Tis his father from the battle craved bestow and o'ertend.
        Not cold hunt, where they wounds:
      [The Pedrailded]

                   Enter Mar of them, and Attendanigona challensaugh

ACT BIOPEDRWAV. Yes. You are not still-to fall aboard
    And, like likes when knock your mind. Fie!
    SERVAND shadardy the gods should she grow hends invocheth.
    There's an emples doubt not anything. Here he,
    Satisfactorous, are here. In the fessal
    That become yet and tarally.
    Peace, sirrah, he would not call me.
    Forbid wife; but know of yourselves,
    Old the shorts for the fed.   I think he in mine
    hundred fastic perfume.
  COMINIUS. Your varinade, perform thieves so living;
    Foretell, thence, and on my I look'n, oath,
    Yet it hath told not sly to your vexs!
    You are no peace fall that, knows; and live and infuring
    May till I speak with a hand, defend the danger:
    Or else the moo
