# Character-based RNN language model trained on 'The Complete Works of William Shakespeare'
Based on Andrej Karpathy's article "The Unreasonable Effectiveness of Recurrent Neural Networks": http://karpathy.github.io/2015/05/21/rnn-effectiveness

In [1]:
# --- Model architecture ---
RNNTYPE    = :lstm   # recurrent cell type handed to rnninit
INPUTSIZE  = 168     # embedding size, i.e. the RNN input dimension
HIDDENSIZE = 334     # RNN hidden state size
NUMLAYERS  = 1       # number of stacked RNN layers
VOCABSIZE  = 84      # number of distinct characters in the corpus
DROPOUT    = 0.0     # dropout probability used by the RNN and in training

# --- Data layout ---
BATCHSIZE = 256      # sequences per minibatch
SEQLENGTH = 100      # characters per sequence in a minibatch

# --- Adam optimizer settings ---
LR     = 0.001
BETA_1 = 0.9
BETA_2 = 0.999
EPS    = 1e-8

# --- Training schedule ---
EPOCHS = 30;

In [2]:
# Load 'The Complete Works of William Shakespeare'
using Knet
# The data helper script ships with Knet; it is expected to provide shakespeare().
include(Knet.dir("data","gutenberg.jl"))
# trn/tst hold character ids, chars is the id -> character lookup table.
trn, tst, chars = shakespeare()
summary.((trn, tst, chars))

("4934845-element Array{UInt8,1}", "526731-element Array{UInt8,1}", "84-element Array{Char,1}")

In [3]:
# Print a sample: decode a slice of the training ids back into text
println(join(chars[trn[1020:1210]]))


    Cheated of feature by dissembling nature,
    Deform'd, unfinish'd, sent before my time
    Into this breathing world scarce half made up,
    And that so lamely and unfashionable
 


In [4]:
# Minibatch data
# Turn the flat id sequence `a` into an iterator of (input, target) blocks, where the
# target of every position is the id that follows it in the text.
function mb(a)
    rowlen = length(a) ÷ BATCHSIZE                # ids per batch row; the remainder is dropped
    # reshape full data to (B,N) with contiguous rows
    rows = reshape(a[1:rowlen*BATCHSIZE], rowlen, BATCHSIZE)'
    inputs  = rows[:, 1:rowlen-1]
    targets = rows[:, 2:rowlen]                   # inputs shifted by one position
    return minibatch(inputs, targets, SEQLENGTH)  # split into (B,T) blocks
end
# Build the training and test minibatch iterators and show how many batches each holds
dtrn = mb(trn)
dtst = mb(tst)
length.((dtrn, dtst))

(192, 20)

In [5]:
# Define model
# Returns the tuple (rnn, rnn weights, embedding matrix, output weights, output bias).
function initmodel()
    # Recurrent layer, configured entirely by the notebook-level hyperparameters.
    rnn, rnnweights = rnninit(INPUTSIZE, HIDDENSIZE;
                              rnnType=RNNTYPE, numLayers=NUMLAYERS, dropout=DROPOUT)
    # Embedding matrix: one INPUTSIZE-long column per vocabulary entry.
    embedding = KnetArray(xavier(Float32, INPUTSIZE, VOCABSIZE))
    # Output layer mapping hidden states to one score per vocabulary entry.
    outweights = KnetArray(xavier(Float32, VOCABSIZE, HIDDENSIZE))
    outbias = KnetArray(zeros(Float32, VOCABSIZE, 1))
    return rnn, rnnweights, embedding, outweights, outbias
end;

In [6]:
# Define loss and its gradient
# Run the model on the id array `xs`, starting from hidden/cell states `hx`/`cx`.
# Returns the output scores together with the final hidden and cell states.
function predict(ws,xs,hx,cx;pdrop=0)
    rnn, rnnweights, embedding, outweights, outbias = ws
    # Embedding lookup: xs=(B,T) -> embedded=(X,B,T)
    embedded = dropout(embedding[:,xs], pdrop)
    # hidden=(H,B,T) hy=cy=(H,B,L)
    hidden, hy, cy = rnnforw(rnn, rnnweights, embedded, hx, cx, hy=true, cy=true)
    hidden = dropout(hidden, pdrop)
    # Merge the batch and time dimensions: flat=(H,B*T)
    nhidden = size(hidden,1)
    ncols = size(hidden,2)*size(hidden,3)
    flat = reshape(hidden, nhidden, ncols)
    scores = outweights*flat .+ outbias
    return scores, hy, cy
end

# Loss of model `w` on inputs `x` with targets `y`, as computed by `nll`.
# `h` is a two-slot container holding the hidden and cell state; it is overwritten in
# place with the final states so the next call continues where this one stopped.
function loss(w,x,y,h;o...)
    scores, hnext, cnext = predict(w, x, h...; o...)
    # Keep only the plain values returned by getval for the carried-over state.
    newhidden = getval(hnext)
    newcell = getval(cnext)
    h[1] = newhidden
    h[2] = newcell
    return nll(scores, y)
end
using AutoGrad: gradloss
# Gradient version of `loss`: called like `loss`, it yields a (gradients, loss value)
# pair, which is how `train` unpacks its result.
lossgradient = gradloss(loss);

In [7]:
# Train and test loops
# Run one pass over `data`, updating `model` after every minibatch.
# Returns the mean of the per-batch losses.
function train(model,data,optim)
    state = Any[nothing,nothing]    # hidden/cell state carried from batch to batch
    total = 0
    nbatches = 0
    for (x,y) in data
        grads, batchloss = lossgradient(model, x, y, state; pdrop=DROPOUT)
        update!(model, grads, optim)
        total += batchloss
        nbatches += 1
    end
    return total/nbatches
end

# Evaluate `model` on `data` without updating it (no dropout keyword is passed).
# Returns the mean of the per-batch losses.
function test(model,data)
    state = Any[nothing,nothing]    # hidden/cell state carried from batch to batch
    total = 0
    nbatches = 0
    for (x,y) in data
        total += loss(model, x, y, state)
        nbatches += 1
    end
    return total/nbatches
end;

In [8]:
# Train model or load from file if exists
#using JLD
# NOTE(review): nothing visible in this notebook defines `load` while `using JLD` stays
# commented out, and the `save` call below is disabled too -- confirm before relying on
# the cached-model branch.
model = nothing
optim = nothing
Knet.gc()
if isfile("shakespeare.jld")
    model = load("shakespeare.jld","model")
else
    model = initmodel()
    optim = optimizers(model, Adam; lr=LR, beta1=BETA_1, beta2=BETA_2, eps=EPS)
    @info("Training...")
    @time for epoch in 1:EPOCHS
        @time trnloss = train(model,dtrn,optim) # ~18 seconds
        @time tstloss = test(model,dtst)        # ~0.5 seconds
        # Report perplexity, i.e. exp of the mean loss.
        println((:epoch, epoch, :trnppl, exp(trnloss), :tstppl, exp(tstloss)))
    end
    #save("shakespeare.jld","model",model)
end
summary(model)

┌ Info: Training...
└ @ Main In[8]:8


 28.468388 seconds (22.34 M allocations: 4.346 GiB, 2.85% gc time)
  0.979191 seconds (1.24 M allocations: 75.520 MiB, 2.02% gc time)
(:epoch, 1, :trnppl, 17.36091f0, :tstppl, 9.151436f0)
 18.358390 seconds (212.83 k allocations: 3.258 GiB, 1.41% gc time)
  0.536230 seconds (4.37 k allocations: 12.884 MiB, 0.58% gc time)
(:epoch, 2, :trnppl, 7.55034f0, :tstppl, 6.611075f0)
 18.436042 seconds (213.84 k allocations: 3.258 GiB, 0.92% gc time)
  0.544317 seconds (4.37 k allocations: 12.884 MiB, 0.58% gc time)
(:epoch, 3, :trnppl, 6.0113096f0, :tstppl, 5.717249f0)
 18.461435 seconds (214.86 k allocations: 3.258 GiB, 0.92% gc time)
  0.543928 seconds (4.37 k allocations: 12.884 MiB, 0.58% gc time)
(:epoch, 4, :trnppl, 5.320017f0, :tstppl, 5.180779f0)
 18.521647 seconds (214.86 k allocations: 3.258 GiB, 0.91% gc time)
  0.546192 seconds (4.37 k allocations: 12.884 MiB, 0.58% gc time)
(:epoch, 5, :trnppl, 4.8687263f0, :tstppl, 4.8216457f0)
 18.543604 seconds (214.86 k allocations: 3.258 GiB, 0

"Tuple{Knet.RNN,KnetArray{Float32,3},KnetArray{Float32,2},KnetArray{Float32,2},KnetArray{Float32,2}}"

In [10]:
# Sample from trained model
# Print `n` characters generated by `model`, followed by a newline. Generation is seeded
# with the newline character and each sampled character is fed back as the next input.
# Throws an ArgumentError when `chars` has no newline to seed from.
function generate(model,n)
    # Draw an index from the softmax distribution over the scores `y`.
    function sample(y)
        p = exp.(Array(y) .- logsumexp(y))
        r = rand()
        for j in eachindex(p)
            (r -= p[j]) < 0 && return j
        end
        # Rounding can leave sum(p) slightly below the random draw; without this
        # fallback the function would return `nothing` and the caller would fail.
        return length(p)
    end
    h,c = nothing,nothing
    x = findfirst(isequal('\n'), chars)
    # 0 is never a valid index in Julia, so fail clearly instead of seeding with it.
    x === nothing && throw(ArgumentError("vocabulary has no newline character to seed generation"))
    for i=1:n
        # A one-element id vector feeds a single character through the model.
        y,h,c = predict(model,[x],h,c)
        x = sample(y)
        print(chars[x])
    end
    println()
end

# Print 1000 characters sampled from the model obtained above.
generate(model,1000)

  RICHMOND]  If your fanter's trock, Sir John, Tubine's is!
    What you love me- I come for-old!
  CONSTANCE. Before thou art]                                           Exeunt




SCENE 2.

Before OUFLOUF
               Ophe and with yours? I do lock upon my
    Of great dull conceit, for I fine either
     to grimf'd you to your tent.
  PETRUCHIO. I thank you, my good Fortung lamb of Vaulnce.
  SUFFOLK. Entreat I am less'd, indoctorous chief
    For some red and vaunt as he doth pay us,
    And sent these Rosalind Dein Pejeona
    Than his poor clostly couple as your splean.
    Fall-yead, I'll call him nothing service; and let them
    To good these father was kill'd the darkness of water
    Softh Paris throught.
  DEMETRIUS. A fool, would made upon you.
  AJAX. My lord, you will!
  MACCOLUS. O, good Pity. Stand you round.
    False, the Badamon, I will done me from Wild,
    hit that greeting into their drinks, being notes- once
    Must bearing abway in 
