# Character-based RNN language model
(c) Deniz Yuret, 2018. Based on http://karpathy.github.io/2015/05/21/rnn-effectiveness.

* Objectives: Learn to define and train a character-based language model and generate text from it; split text into minibatch blocks; keep a persistent RNN state between updates; and train both a Shakespeare generator and a Julia programmer using the same type of model.
* Prerequisites: [RNN basics](06.rnn.ipynb), minibatch, param, param0, RNN, dropout, train!, Adam, nll, value

In [None]:
using Pkg
# Install any required package that cannot be found in the current environment.
# `Base.find_package` returns `nothing` for an unknown package; it replaces the
# deprecated `Pkg.installed()` lookup.
for p in ("Knet", "ProgressMeter")
    Base.find_package(p) === nothing && Pkg.add(p)
end

## Define the model

In [None]:
using Knet: param, param0, RNN, dropout, value

In [None]:
"""
    Embed(w)
    Embed(vocab::Int, embed::Int)

Embedding layer holding a weight matrix `w`. The two-integer constructor creates
`w` with `param(embed, vocab)`, i.e. one column per vocabulary entry.

Calling the layer with index (or array of indices) `x` returns the selected
columns `w[:, x]`.
"""
struct Embed
    w
end

function Embed(vocab::Int, embed::Int)
    return Embed(param(embed, vocab))
end

function (layer::Embed)(x)
    return layer.w[:, x]
end

In [None]:
"""
    Linear(w, b)
    Linear(input::Int, output::Int)

Affine layer with weights `w` and bias `b`. The two-integer constructor creates
`w` with `param(output, input)` and `b` with `param0(output)`.

Calling the layer computes `w * x .+ b`.
"""
struct Linear
    w
    b
end

function Linear(input::Int, output::Int)
    return Linear(param(output, input), param0(output))
end

function (layer::Linear)(x)
    return layer.w * x .+ layer.b
end

In [None]:
"""
    CharLM(input, rnn, output)
    CharLM(vocab::Int, input::Int, hidden::Int; o...)

Character language model made of an embedding (`input`), a recurrent layer
(`rnn`) and an output projection (`output`). The integer constructor builds
`Embed(vocab, input)`, `RNN(input, hidden; o...)` and `Linear(hidden, vocab)`,
forwarding the keyword arguments `o` to `RNN`.

Calling the model as `c(x; pdrop=0, hidden=[])` returns the output layer's scores
with all batch/time positions flattened into columns. `hidden` is passed to the
RNN and updated in place so that state can persist between calls.
"""
struct CharLM
    input
    rnn
    output
end

function CharLM(vocab::Int, input::Int, hidden::Int; o...)
    embedding = Embed(vocab, input)
    recurrent = RNN(input, hidden; o...)
    projection = Linear(hidden, vocab)
    return CharLM(embedding, recurrent, projection)
end

function (c::CharLM)(x; pdrop=0, hidden=[])
    embedded = dropout(c.input(x), pdrop)              # (B,T)->(X,B,T)
    states = c.rnn(embedded, hidden=hidden)            # (H,B,T)
    # Replace each stored state by its plain value so the next iteration does
    # not depend on this one.
    hidden .= value.(hidden)
    states = dropout(states, pdrop)
    flat = reshape(states, size(states, 1), :)         # (H,B*T)
    return c.output(flat)                              # (V,B*T)
end

In [None]:
# To generate text from trained models
"""
    generate(model, chars, n; io=stdout)

Sample `n` characters from `model` and print them to `io`, followed by a newline.

Generation starts from character id 1. At each step the model is called as
`model([x], hidden=h)` with a hidden-state container `h` shared across steps, the
returned scores are treated as unnormalized log-probabilities, and the next id is
drawn from the resulting distribution. `chars` maps ids to printable characters.
Returns `nothing`.
"""
function generate(model, chars, n; io::IO=stdout)
    function sample(y)
        logits = vec(Array(y))
        # Subtracting the maximum leaves the distribution unchanged but keeps
        # `exp` from overflowing to Inf for large scores.
        p = exp.(logits .- maximum(logits))
        r = rand() * sum(p)
        for j in eachindex(p)
            (r -= p[j]) < 0 && return j
        end
        # Rounding (or a degenerate distribution) can leave r >= 0 after the loop;
        # fall back to the last index with positive mass instead of returning nothing.
        return something(findlast(q -> q > 0, p), lastindex(p))
    end
    x = 1
    h = []
    for i in 1:n
        y = model([x], hidden=h)
        x = sample(y)
        print(io, chars[x])
    end
    println(io)
    return nothing
end;

In [None]:
# For running experiments
using Knet: AutoGrad, Knet, train!, Adam; import ProgressMeter
"""
    trainresults(file, model, chars)

Ask on the console whether to train from scratch, then return `(model, chars)`.

If the reply starts with `y`, `model` is trained on the global `dtrn` with Adam
(using the globals `LR`, `BETA_1`, `BETA_2`, `EPS`, `DROPOUT` and `EPOCHS`) and then
saved, together with `chars`, to `file`. Any other reply (including an empty one)
loads `model` and `chars` from `file`, downloading the file first if it does not
exist locally.
"""
function trainresults(file,model,chars)
    print("Train from scratch? ")
    # `startswith` treats an empty reply (or EOF) as "no" rather than throwing a
    # BoundsError the way indexing the first character would.
    if startswith(readline(), "y")
        updates = 0; prog = ProgressMeter.Progress(EPOCHS * length(dtrn))
        # Advance the progress bar on every call; the result turns false once more
        # than EPOCHS * length(dtrn) updates have been counted (presumably the
        # signal for train! to stop -- confirm against the Knet version in use).
        callback(J)=(ProgressMeter.update!(prog, updates); (updates += 1) <= prog.n)
        opt = Adam(lr=LR, beta1=BETA_1, beta2=BETA_2, eps=EPS)
        train!(model, dtrn; callback=callback, optimizer=opt, pdrop=DROPOUT, hidden=[])
        Knet.gc(); Knet.save(file,"model",model,"chars",chars)
    else
        isfile(file) || download("http://people.csail.mit.edu/deniz/models/tutorial/$file",file)
        model,chars = Knet.load(file,"model","chars")
    end
    return model,chars
end

## The Complete Works of William Shakespeare

In [None]:
# Hyperparameters for the Shakespeare model (read as globals by the cells below).
RNNTYPE = :lstm      # passed to RNN as `rnnType`
BATCHSIZE = 256      # number of parallel text streams (rows) built by `mb`
SEQLENGTH = 100      # block length handed to `minibatch` in `mb`
INPUTSIZE = 168      # embedding size / RNN input size
VOCABSIZE = 84       # number of character ids the model embeds and predicts
HIDDENSIZE = 334     # RNN hidden size
NUMLAYERS = 1        # passed to RNN as `numLayers`
DROPOUT = 0.0        # passed to RNN as `dropout` and to train! as `pdrop`
LR=0.001             # Adam `lr`
BETA_1=0.9           # Adam `beta1`
BETA_2=0.999         # Adam `beta2`
EPS=1e-08            # Adam `eps`
EPOCHS = 30          # training length: EPOCHS * length(dtrn) updates
ENV["COLUMNS"]=92;   # presumably limits the display width of printed output

In [None]:
# Load 'The Complete Works of William Shakespeare' via the gutenberg.jl data
# script that ships with Knet, then show a one-line summary of each result.
include(Knet.dir("data", "gutenberg.jl"))
trn, tst, chars = shakespeare()
summary.((trn, tst, chars))

In [None]:
# Print a sample: decode a fixed slice of training ids back to characters.
# `join` concatenates the characters without splatting them into varargs.
println(join(chars[trn[1020:1210]]))

In [None]:
# Minibatch data
using Knet: minibatch
"""
    mb(a)

Arrange the id sequence `a` into `BATCHSIZE` contiguous rows (dropping any
leftover tail), pair each position with the one that follows it, and split the
result into `(x, y)` minibatches with `minibatch(..., SEQLENGTH)`.
"""
function mb(a)
    ncols = length(a) ÷ BATCHSIZE
    # reshape + adjoint yields a (B,N) matrix whose rows are contiguous text
    rows = reshape(a[1:ncols*BATCHSIZE], ncols, BATCHSIZE)'
    inputs = rows[:, 1:ncols-1]
    targets = rows[:, 2:ncols]    # same rows shifted one position ahead
    return minibatch(inputs, targets, SEQLENGTH)  # split into (B,T) blocks
end
# Minibatch the training and test sets and report how many batches each has.
dtrn, dtst = mb(trn), mb(tst)
(length(dtrn), length(dtst))

In [None]:
# Each x and y in a minibatch has dimensions (BATCHSIZE,SEQLENGTH).
map(summary, first(dtrn))

In [None]:
# Train a fresh Shakespeare model or load the saved one, depending on the console reply.
shakemodel, shakechars = trainresults(
    "shakespeare.jld2",
    CharLM(
        VOCABSIZE, INPUTSIZE, HIDDENSIZE;
        rnnType=RNNTYPE, numLayers=NUMLAYERS, dropout=DROPOUT,
    ),
    chars,
);

In [None]:
using Knet: nll
# exp(nll(shakemodel,dtst))  # Perplexity = 4.19365

In [None]:
# Sample 1000 characters from the Shakespeare model and print them.
generate(shakemodel,shakechars,1000)

## Julia programmer

In [None]:
# Hyperparameters for the Julia-source model (these overwrite the Shakespeare globals).
RNNTYPE = :lstm      # passed to RNN as `rnnType`
BATCHSIZE = 64       # number of parallel text streams (rows) built by `mb`
SEQLENGTH = 64       # block length handed to `minibatch` in `mb`
INPUTSIZE = 512      # embedding size / RNN input size
VOCABSIZE = 128      # number of character ids kept; rarer characters share the last id
HIDDENSIZE = 512     # RNN hidden size
NUMLAYERS = 2        # passed to RNN as `numLayers`
DROPOUT = 0.0        # passed to RNN as `dropout` and to train! as `pdrop`
LR=0.001             # Adam `lr`
BETA_1=0.9           # Adam `beta1`
BETA_2=0.999         # Adam `beta2`
EPS=1e-08            # Adam `eps`
EPOCHS = 10          # training length: EPOCHS * length(dtrn) updates
ENV["COLUMNS"]=92;   # presumably limits the display width of printed output

In [None]:
# Read julia base library source code
base = joinpath(Sys.BINDIR, Base.DATAROOTDIR, "julia")
# Stream every `.jl` file under `base` into one buffer. This avoids repeated
# string concatenation (quadratic in total size) and does not assign to a global
# from inside a top-level loop. `endswith` is safe for file names shorter than
# three characters or ending in multibyte characters, unlike `f[end-2:end]`.
text = sprint() do io
    for (root, dirs, files) in walkdir(base)
        for f in files
            endswith(f, ".jl") || continue
            write(io, read(joinpath(root, f)))
        end
    end
end
length(text)

In [None]:
# Count each distinct character, rank characters by descending frequency, and
# number them so that id 1 is the most frequent character.
charcnt = Dict{Char,Int}()
for c in text
    charcnt[c] = get(charcnt, c, 0) + 1
end
chars = sort(collect(keys(charcnt)); by=c -> charcnt[c], rev=true)
charid = Dict{Char,Int}()
for (i, c) in enumerate(chars)
    charid[c] = i
end
# Two-column table for display: character, count.
hcat(chars, [charcnt[c] for c in chars])

In [None]:
# Encode the text as character ids, folding every id above VOCABSIZE into id
# VOCABSIZE. The first `ntst` ids become the test set, the remainder the training set.
data = [min(charid[c], VOCABSIZE) for c in text]
ntst = 1 << 19
tst, trn = data[1:ntst], data[(ntst + 1):end]
length.((data, trn, tst))

In [None]:
# Print a sample: decode 1001 consecutive training ids from a random offset.
# `join` concatenates the characters without splatting them into varargs.
r = rand(1:(length(trn)-1000))
println(join(chars[trn[r:r+1000]]))

In [None]:
# Minibatch data
using Knet: minibatch
"""
    mb(a)

Arrange the id sequence `a` into `BATCHSIZE` contiguous rows (dropping any
leftover tail), pair each position with the one that follows it, and split the
result into `(x, y)` minibatches with `minibatch(..., SEQLENGTH)`.
"""
function mb(a)
    ncols = length(a) ÷ BATCHSIZE
    # reshape + adjoint yields a (B,N) matrix whose rows are contiguous text
    rows = reshape(a[1:ncols*BATCHSIZE], ncols, BATCHSIZE)'
    inputs = rows[:, 1:ncols-1]
    targets = rows[:, 2:ncols]    # same rows shifted one position ahead
    return minibatch(inputs, targets, SEQLENGTH)  # split into (B,T) blocks
end
# Minibatch the training and test sets and report how many batches each has.
dtrn, dtst = mb(trn), mb(tst)
(length(dtrn), length(dtst))

In [None]:
# Each x and y in a minibatch has dimensions (BATCHSIZE,SEQLENGTH).
map(summary, first(dtrn))

In [None]:
# Train a fresh Julia-source model or load the saved one, depending on the console reply.
juliamodel, juliachars = trainresults(
    "juliacharlm.jld2",
    CharLM(
        VOCABSIZE, INPUTSIZE, HIDDENSIZE;
        rnnType=RNNTYPE, numLayers=NUMLAYERS, dropout=DROPOUT,
    ),
    chars,
);

In [None]:
using Knet: nll
# exp(nll(juliamodel,dtst))  # Perplexity = 96.20856

In [None]:
# Sample 1000 characters from the Julia-source model and print them.
generate(juliamodel,juliachars,1000)