# Character based RNN language model trained on 'The Complete Works of William Shakespeare'
Based on http://karpathy.github.io/2015/05/21/rnn-effectiveness

In [1]:
# --- Data layout ---
BATCHSIZE  = 256     # number of parallel character streams built by `mb`
SEQLENGTH  = 100     # block length handed to `minibatch` in `mb`

# --- Model architecture ---
RNNTYPE    = :lstm   # forwarded to `rnninit` as `rnnType`
NUMLAYERS  = 1       # forwarded to `rnninit` as `numLayers`
INPUTSIZE  = 168     # rows of the embedding matrix `wx`; also the RNN input size
HIDDENSIZE = 334     # RNN hidden size; also the column count of the output matrix `wy`
VOCABSIZE  = 84      # number of distinct characters (columns of `wx`, rows of `wy`)
DROPOUT    = 0.0     # used as `dropout` in `rnninit` and as `pdrop` during training

# --- Adam optimizer settings (forwarded to `optimizers` as lr, beta1, beta2, eps) ---
LR, BETA_1, BETA_2, EPS = 0.001, 0.9, 0.999, 1e-08

# --- Training schedule ---
EPOCHS = 30;

In [2]:
# Load 'The Complete Works of William Shakespeare'
using Knet
# gutenberg.jl lives in Knet's own data directory; including it is what makes
# `shakespeare()` available in this session.
include(Knet.dir("data","gutenberg.jl"))
# trn/tst hold the text as UInt8 values; this notebook uses them as indices into
# `chars` (see `chars[trn[...]]` in the next cell).
trn,tst,chars = shakespeare()
# Last expression of the cell: display a one-line summary of each of the three arrays.
map(summary,(trn,tst,chars))

("4934845-element Array{UInt8,1}", "526731-element Array{UInt8,1}", "84-element Array{Char,1}")

In [3]:
# Show a short excerpt of the training text: decode positions 1020 through 1210
# of `trn` via `chars` and print the resulting characters as one string.
println(join(chars[trn[1020:1210]]))


    Cheated of feature by dissembling nature,
    Deform'd, unfinish'd, sent before my time
    Into this breathing world scarce half made up,
    And that so lamely and unfashionable
 


In [4]:
# Minibatch data
"""
    mb(a)

Split the sequence `a` into `BATCHSIZE` equally long contiguous streams and return
the result of `minibatch` over (input, target) pairs, where each target is the
input shifted by one position. Trailing elements of `a` that do not fill a whole
column are dropped.
"""
function mb(a)
    streamlen = div(length(a), BATCHSIZE)           # elements per stream
    usable = a[1:streamlen*BATCHSIZE]               # drop the remainder
    streams = reshape(usable, streamlen, BATCHSIZE)' # (B,N): one contiguous stream per row
    inputs = streams[:, 1:streamlen-1]
    targets = streams[:, 2:streamlen]               # next-character targets
    return minibatch(inputs, targets, SEQLENGTH)    # split into (B,T) blocks
end
# Build the minibatch iterators for the training and test text.
dtrn = mb(trn)
dtst = mb(tst)
# Last expression of the cell: number of minibatches in each split.
map(length, (dtrn, dtst))

(192, 20)

In [5]:
# Define model
"""
    initmodel()

Create a fresh model and return it as the tuple `(r, wr, wx, wy, by)`:
the RNN descriptor and its weights from `rnninit`, the embedding matrix
(INPUTSIZE x VOCABSIZE), the output matrix (VOCABSIZE x HIDDENSIZE) and the
output bias (VOCABSIZE x 1). All matrices are Float32 `KnetArray`s.
"""
function initmodel()
    # Small local constructors: Xavier-initialised weights and zero biases.
    xavier32(dims...) = KnetArray(xavier(Float32, dims...))
    zeros32(dims...) = KnetArray(zeros(Float32, dims...))

    rnn, rnnweights = rnninit(INPUTSIZE, HIDDENSIZE;
                              rnnType=RNNTYPE, numLayers=NUMLAYERS, dropout=DROPOUT)
    embedding = xavier32(INPUTSIZE, VOCABSIZE)
    outweights = xavier32(VOCABSIZE, HIDDENSIZE)
    outbias = zeros32(VOCABSIZE, 1)
    return rnn, rnnweights, embedding, outweights, outbias
end;

In [6]:
# Define loss and its gradient
"""
    predict(ws, xs, hx, cx; pdrop=0)

Run the model `ws = (r, wr, wx, wy, by)` on the character indices `xs`, starting
from hidden state `hx` and cell state `cx`. Dropout with probability `pdrop` is
applied to the embedded input and to the RNN output.

Returns `(scores, hy, cy)`: the output-layer scores with one column per
(batch, time) position, followed by the final hidden and cell states.
"""
function predict(ws,xs,hx,cx;pdrop=0)
    rnn, rnnweights, embedding, outweights, outbias = ws
    # Embedding lookup: xs=(B,T) selects columns, giving (X,B,T).
    embedded = dropout(embedding[:,xs], pdrop)
    # y=(H,B,T), hy=cy=(H,B,L)
    y, hy, cy = rnnforw(rnn, rnnweights, embedded, hx, cx; hy=true, cy=true)
    y = dropout(y, pdrop)
    # Flatten batch and time into one axis so a single matrix product scores every position.
    flat = reshape(y, size(y,1), size(y,2)*size(y,3))   # (H,B*T)
    scores = outweights*flat .+ outbias
    return scores, hy, cy
end

"""
    loss(w, x, y, h; o...)

Return `nll` of the model's predictions for input `x` against targets `y`.

`h` is a two-element container holding the hidden and cell state. It is read as
the initial state and overwritten in place with the final state, so the next
call continues from where this one stopped. Keyword arguments `o` are forwarded
to `predict`.
"""
function loss(w,x,y,h;o...)
    scores, hnext, cnext = predict(w, x, h...; o...)
    # Store plain values via getval so the saved state is not a tracked value.
    newh = getval(hnext)
    newc = getval(cnext)
    h[1] = newh
    h[2] = newc
    return nll(scores, y)
end

# Gradient version of `loss`. As used in `train`, calling it yields a
# (gradients, loss value) pair, and the gradients are passed to `update!`
# together with the model.
# NOTE(review): presumably differentiates w.r.t. the first argument (the model) — confirm.
lossgradient = gradloss(loss);

In [7]:
# Train and test loops
"""
    train(model, data, optim)

Run one pass over `data`, updating `model` in place with `optim` after every
minibatch, and return the mean of the per-minibatch losses. The recurrent state
starts as `nothing` and is carried from one minibatch to the next.
"""
function train(model,data,optim)
    state = Any[nothing,nothing]    # [hidden, cell]; overwritten by `loss` on every call
    total = 0
    count = 0
    for (x,y) in data
        grads, batchloss = lossgradient(model, x, y, state; pdrop=DROPOUT)
        update!(model, grads, optim)
        total += batchloss
        count += 1
    end
    return total/count
end

"""
    test(model, data)

Evaluate `model` on `data` without updating it and return the mean of the
per-minibatch losses. The recurrent state starts as `nothing` and is carried
from one minibatch to the next.
"""
function test(model,data)
    state = Any[nothing,nothing]    # [hidden, cell]; overwritten by `loss` on every call
    total = 0
    count = 0
    for (x,y) in data
        total += loss(model, x, y, state)
        count += 1
    end
    return total/count
end;

In [8]:
# Train model or load from file if exists
using JLD
# Drop any model/optimizer left over from an earlier run of this cell.
# NOTE(review): knetgc() presumably releases the memory they held — confirm.
model = nothing
optim = nothing
knetgc()

if isfile("shakespeare.jld")
    # A saved model exists: reuse it instead of training again.
    model = load("shakespeare.jld","model")
else
    model = initmodel()
    optim = optimizers(model, Adam; lr=LR, beta1=BETA_1, beta2=BETA_2, eps=EPS)
    info("Training...")
    @time for epoch in 1:EPOCHS
        @time trnloss = train(model,dtrn,optim) # ~18 seconds
        @time tstloss = test(model,dtst)        # ~0.5 seconds
        # exp of the mean loss is reported as perplexity
        println((:epoch, epoch, :trnppl, exp(trnloss), :tstppl, exp(tstloss)))
    end
    save("shakespeare.jld","model",model)
end

# Last expression of the cell: display the type summary of the model.
summary(model)

[1m[36mINFO: [39m[22m[36mTraining...
[39m

 23.367439 seconds (2.11 M allocations: 233.286 MiB, 0.26% gc time)
  0.986285 seconds (363.41 k allocations: 27.896 MiB, 0.43% gc time)
(:epoch, 1, :trnppl, 17.207191f0, :tstppl, 8.8799925f0)
 17.668259 seconds (232.09 k allocations: 131.532 MiB, 0.12% gc time)
  0.570718 seconds (3.65 k allocations: 8.966 MiB)
(:epoch, 2, :trnppl, 7.401502f0, :tstppl, 6.533962f0)
 17.757882 seconds (232.25 k allocations: 131.535 MiB, 0.08% gc time)
  0.574279 seconds (3.65 k allocations: 8.966 MiB)
(:epoch, 3, :trnppl, 5.9158125f0, :tstppl, 5.6346264f0)
 17.814497 seconds (233.28 k allocations: 131.550 MiB, 0.09% gc time)
  0.583290 seconds (7.14 k allocations: 9.020 MiB, 0.36% gc time)
(:epoch, 4, :trnppl, 5.2262263f0, :tstppl, 5.1046495f0)
 17.897471 seconds (233.34 k allocations: 131.551 MiB, 0.07% gc time)
  0.578130 seconds (3.65 k allocations: 8.966 MiB)
(:epoch, 5, :trnppl, 4.785228f0, :tstppl, 4.751793f0)
 17.905744 seconds (233.21 k allocations: 131.550 MiB, 0.08% gc time)
  0.579939 seconds

"Tuple{Knet.RNN,Knet.KnetArray{Float32,3},Knet.KnetArray{Float32,2},Knet.KnetArray{Float32,2},Knet.KnetArray{Float32,2}}"

In [9]:
# Sample from trained model
"""
    generate(model, n)

Print `n` characters sampled from `model`, followed by a newline.

Generation starts from the newline character. At each step the previous
character is fed to `predict` with the carried hidden and cell state, the next
character index is drawn from the resulting distribution, and the matching
entry of `chars` is printed.
"""
function generate(model,n)
    # Draw an index from the distribution defined by the unnormalised log-scores `y`.
    function sample(y)
        # exp(y - logsumexp(y)) normalises the scores into probabilities.
        p,r=Array(exp.(y-logsumexp(y))),rand()
        for j=1:length(p); (r -= p[j]) < 0 && return j; end
        # Rounding can leave sum(p) slightly below r, in which case the loop ends
        # without a hit; fall back to the last index instead of returning nothing.
        return length(p)
    end
    h,c = nothing,nothing            # no state before the first step
    x = findfirst(chars,'\n')        # seed character: newline
    for i=1:n
        y,h,c = predict(model,[x],h,c)
        x = sample(y)
        print(chars[x])
    end
    println()
end

# Print 1000 characters sampled from the trained (or loaded) model.
generate(model,1000)

    And, how to had pass it rank'd!  
    Upar it not.
  VIRGILIA. I Winnob sailors. Hardle you, I welcome, my courage is.
  BEDFORD. Mine us gellow grandament my hoarses-tit
    No mock dogst perbertious; are the best lov'd;
    Buffice this your childish effects and doings!
    But if I beseech you, sir, be relents
    I fear me.
  MURTHERENE. Nay, forswore here is a bradger so. If I will not live,
    Why, to take set fore sore of this offite the wife.
  HORTENSIUS. Why, underthrew your husband, thou speaks for comfort
    And seems my nature confederates: 'tis twain to
    First obly he'll not forswore.              Untellias Marcius.
    But what's I with many wishest childre Petrac's need?
                    Our moets that come from Troy.                      
                   Altur's maidy sweet,
      Enfirs, 'tis but thy wounds to in my love.
            Which, line of my good Queen,
           And, truest as like against night;
    The sack find liet rea
