# Character based RNN language model trained on 'The Complete Works of William Shakespeare'
Based on http://karpathy.github.io/2015/05/21/rnn-effectiveness

In [1]:
# Model architecture
RNNTYPE    = :lstm   # recurrent cell type handed to rnninit
INPUTSIZE  = 168     # width of the character embedding fed to the RNN
VOCABSIZE  = 84      # number of distinct characters (size of the output layer)
HIDDENSIZE = 334     # RNN hidden state size
NUMLAYERS  = 1       # number of stacked RNN layers
DROPOUT    = 0.0     # dropout probability used while training

# Minibatching
BATCHSIZE = 256      # number of parallel text streams per minibatch
SEQLENGTH = 100      # number of time steps per minibatch

# Adam optimizer settings
LR     = 0.001
BETA_1 = 0.9
BETA_2 = 0.999
EPS    = 1e-08

# Number of passes over the training data
EPOCHS = 30;

In [2]:
# Load 'The Complete Works of William Shakespeare' through the gutenberg.jl
# helper shipped in Knet's data directory.
using Knet
@show gpu()
include(Knet.dir("data", "gutenberg.jl"))
# trn/tst hold the text as indices into chars (see the sample printed below).
trn, tst, chars = shakespeare()
summary.((trn, tst, chars))

┌ Info: Recompiling stale cache file /kuacc/users/dyuret/.julia/compiled/v1.0/Knet/f4vSz.ji for Knet [1902f260-5fb4-5aff-8c31-6271790ab950]
└ @ Base loading.jl:1184


gpu() = 0


("4934845-element Array{UInt8,1}", "526731-element Array{UInt8,1}", "84-element Array{Char,1}")

In [3]:
# Print a sample: decode a slice of the training indices back into text
println(join(chars[trn[1020:1210]]))


    Cheated of feature by dissembling nature,
    Deform'd, unfinish'd, sent before my time
    Into this breathing world scarce half made up,
    And that so lamely and unfashionable
 


In [4]:
# Minibatch data
# Turn the flat sequence `a` into minibatches of (input, target) blocks.
# The sequence is cut into BATCHSIZE contiguous streams (trailing elements that do
# not fill a whole column are dropped); targets are the inputs shifted by one step.
function mb(a)
    streamlen = div(length(a), BATCHSIZE)
    trimmed = a[1:streamlen*BATCHSIZE]
    # (B,N) layout: each row is one contiguous stretch of the original data
    streams = reshape(trimmed, streamlen, BATCHSIZE)'
    inputs = streams[:, 1:streamlen-1]
    targets = streams[:, 2:streamlen]
    # split into (B,T) blocks
    return minibatch(inputs, targets, SEQLENGTH)
end
# Minibatched training and test data; the cell displays the number of batches in each
dtrn = mb(trn)
dtst = mb(tst)
(length(dtrn), length(dtst))

(192, 20)

In [5]:
# Define model
# Build a freshly initialised model as the tuple
# (rnn, rnn weights, embedding matrix, output weights, output bias).
# All parameter arrays other than the RNN's own are Float32 KnetArrays.
function initmodel()
    rnn, rnnweights = rnninit(
        INPUTSIZE, HIDDENSIZE;
        rnnType=RNNTYPE, numLayers=NUMLAYERS, dropout=DROPOUT,
    )
    embedding = KnetArray(xavier(Float32, INPUTSIZE, VOCABSIZE))   # one column per character
    outweights = KnetArray(xavier(Float32, VOCABSIZE, HIDDENSIZE))
    outbias = KnetArray(zeros(Float32, VOCABSIZE, 1))
    return rnn, rnnweights, embedding, outweights, outbias
end;

In [6]:
# Define loss and its gradient
# Forward pass: embed the character indices `xs`, run them through the RNN starting
# from states `hx`/`cx`, and project the RNN outputs onto the vocabulary.
# Returns (scores, hy, cy) where scores has one column per (batch, time) position
# and hy/cy are the final hidden and cell states reported by rnnforw.
# `pdrop` is the dropout probability applied to the RNN input and output.
function predict(ws,xs,hx,cx;pdrop=0)
    rnn, rnnweights, embedding, outweights, outbias = ws
    # xs=(B,T) picks embedding columns, giving (X,B,T)
    embedded = dropout(embedding[:, xs], pdrop)
    # hidden=(H,B,T) hy=cy=(H,B,L)
    hidden, hy, cy = rnnforw(rnn, rnnweights, embedded, hx, cx; hy=true, cy=true)
    hidden = dropout(hidden, pdrop)
    # flatten batch and time into one axis: (H,B*T)
    ncols = size(hidden, 2) * size(hidden, 3)
    flat = reshape(hidden, size(hidden, 1), ncols)
    scores = outweights * flat .+ outbias
    return scores, hy, cy
end

# Negative log likelihood of targets `y` given inputs `x` under model `w`.
# `h` is a two-element container holding the RNN hidden and cell states; it is
# overwritten in place with the final states so the next call continues from them.
# Keyword arguments are forwarded to predict.
function loss(w,x,y,h;o...)
    scores, hfinal, cfinal = predict(w, x, h...; o...)
    # store plain values (getval) rather than the objects tracked during differentiation
    h[1] = getval(hfinal)
    h[2] = getval(cfinal)
    return nll(scores, y)
end
using AutoGrad: gradloss
# lossgradient takes the same arguments as loss and returns a (gradients, loss value)
# pair (see its use in train); presumably the gradients are taken with respect to
# the first argument, the model — confirm against the AutoGrad docs.
lossgradient = gradloss(loss);

In [7]:
# Train and test loops
# Run one pass over `data`, updating `model` in place with `optim` after every
# minibatch. RNN state starts empty and is carried from one minibatch to the next.
# Returns the mean of the per-minibatch losses.
function train(model,data,optim)
    state = Any[nothing, nothing]
    total = 0
    nbatches = 0
    for (x, y) in data
        grads, batchloss = lossgradient(model, x, y, state; pdrop=DROPOUT)
        update!(model, grads, optim)
        total += batchloss
        nbatches += 1
    end
    return total / nbatches
end

# Evaluate `model` on `data` without updating it (and without dropout).
# RNN state starts empty and is carried from one minibatch to the next.
# Returns the mean of the per-minibatch losses.
function test(model,data)
    state = Any[nothing, nothing]
    total = 0
    nbatches = 0
    for (x, y) in data
        total += loss(model, x, y, state)
        nbatches += 1
    end
    return total / nbatches
end;

In [8]:
# Train model or load from file if exists
# NOTE(review): the JLD import below is commented out, and so is the `save` call at
# the end of the training branch, so this cell never writes shakespeare.jld itself.
#using JLD
# Drop references to any previous model/optimizer before asking Knet to collect
# garbage (presumably this releases memory they held — confirm).
model=optim=nothing; 
Knet.gc()
if !isfile("shakespeare.jld")
    model = initmodel()
    # Adam optimizer state for the model, configured from the hyperparameters above
    optim = optimizers(model, Adam; lr=LR, beta1=BETA_1, beta2=BETA_2, eps=EPS);    
    @info("Training...")
    @time for epoch in 1:EPOCHS
        @time trnloss = train(model,dtrn,optim) # ~18 seconds
        @time tstloss = test(model,dtst)        # ~0.5 seconds
        # Report perplexity, i.e. exp of the mean loss, for both data sets
        println((:epoch, epoch, :trnppl, exp(trnloss), :tstppl, exp(tstloss)))
    end
    #save("shakespeare.jld","model",model)
else
    # NOTE(review): `load` is only in scope if the JLD import above is re-enabled
    # (or another package provides it); as written this branch is expected to fail
    # with an UndefVarError when shakespeare.jld exists — confirm before relying on it.
    model = load("shakespeare.jld","model")
end
summary(model)

┌ Info: Training...
└ @ Main In[8]:8


 23.929362 seconds (15.95 M allocations: 4.041 GiB, 2.10% gc time)
  0.998169 seconds (1.31 M allocations: 79.210 MiB, 2.62% gc time)
(:epoch, 1, :trnppl, 15.987634f0, :tstppl, 8.394304f0)
 17.551927 seconds (210.34 k allocations: 3.258 GiB, 1.23% gc time)
  0.515916 seconds (4.37 k allocations: 12.884 MiB, 0.44% gc time)
(:epoch, 2, :trnppl, 7.1263394f0, :tstppl, 6.3735137f0)
 17.559598 seconds (211.35 k allocations: 3.258 GiB, 0.76% gc time)
  0.519077 seconds (4.37 k allocations: 12.884 MiB, 0.55% gc time)
(:epoch, 3, :trnppl, 5.818079f0, :tstppl, 5.522996f0)
 17.630614 seconds (212.36 k allocations: 3.258 GiB, 0.73% gc time)
  0.522979 seconds (4.37 k allocations: 12.884 MiB, 0.49% gc time)
(:epoch, 4, :trnppl, 5.165233f0, :tstppl, 5.0538926f0)
 17.745022 seconds (212.36 k allocations: 3.258 GiB, 0.73% gc time)
  0.518279 seconds (4.37 k allocations: 12.884 MiB, 0.58% gc time)
(:epoch, 5, :trnppl, 4.744109f0, :tstppl, 4.7103724f0)
 17.775082 seconds (212.36 k allocations: 3.258 GiB

"Tuple{Knet.RNN,KnetArray{Float32,3},KnetArray{Float32,2},KnetArray{Float32,2},KnetArray{Float32,2}}"

In [9]:
# Sample from trained model
# Print `n` characters sampled from `model`, followed by a newline.
# Generation is seeded with the newline character; every sampled character is fed
# back as the next input while the hidden and cell states are carried along.
# Throws an ArgumentError if `chars` does not contain a newline to seed with.
function generate(model,n)
    # Draw an index from the categorical distribution softmax(y).
    function sample(y)
        probs = exp.(Array(y) .- logsumexp(y))
        r = rand()
        for j in eachindex(probs)
            (r -= probs[j]) < 0 && return j
        end
        # Rounding can leave sum(probs) slightly below 1, so `r` may survive the
        # whole loop; fall back to the last index instead of returning `nothing`.
        return length(probs)
    end
    start = findfirst(isequal('\n'), chars)
    # Fail early with a clear message rather than indexing the embedding with 0.
    start === nothing && throw(ArgumentError("chars has no newline character to seed generation"))
    h,c = nothing,nothing
    x = start
    for i in 1:n
        y,h,c = predict(model,[x],h,c)
        x = sample(y)
        print(chars[x])
    end
    println()
end

# Print 1000 characters sampled from the trained model
generate(model,1000)

[SHEENTITHES    ke't

  DEMETRIUS. Mouth weary, and noe requer? I'll need no furth,
    And all I papes no tingers will now in resemble.
  CAIUS. I hopp'd with me as down;
    Mike the deer stalching home to use me,
    Of your goes tooulls. Would I was?
  SECOND MARGA. Alack, adox, falsole.
  BRUTUS. Pact night; for I am not people of us.
  HASTINGS. Vertaice, mine, leave about the passion hold,
    Which shall not good and tend for cat here.
  PLOWN. As nocobbeak with flint fairly frie, passagation, from
    by slave-is all pronon. Lie unto thy brand ere Sala.
  VALENTINE. Here, sir, past him likely with his secrease stoph'd.
  )and. I see under 'Antague, all thing. I hear at the note pleasant
    and his present in time. Yet, go do impure to it?
  Dam. I aike not enter than.
    'About your daughters! I know he is for't;
    But, that loose but as scarted, friends were
    The pillain that will die this knot to please.
  CLEOPATRA. The voice pardon'd with gilt, i
