# Character based RNN language model
(c) Deniz Yuret, 2019. Based on http://karpathy.github.io/2015/05/21/rnn-effectiveness.

* Objectives: Learn to define and train a character-based language model and generate text from it. Minibatch blocks of text and keep a persistent RNN state between updates. Train a Shakespeare generator and a Julia programmer using the same type of model.
* Prerequisites: [RNN basics](60.rnn.ipynb), [Iterators](25.iterators.ipynb)
* New functions:
[converge](http://denizyuret.github.io/Knet.jl/latest/reference/#Knet.converge)

In [1]:
# Set display width, load packages, import symbols
ENV["COLUMNS"]=72
using Pkg; haskey(Pkg.installed(),"Knet") || Pkg.add("Knet")
using Statistics: mean
using Base.Iterators: cycle
using IterTools: takenth
using Knet: Knet, AutoGrad, Data, param, param0, mat, RNN, dropout, value, nll, adam, minibatch, progress!, converge

## Define the model

In [2]:
# Embedding layer: `w` holds one column per vocabulary entry.
struct Embed
    w
end

# Build an embedding with a freshly initialised (embed, vocab) parameter matrix.
function Embed(vocab::Int, embed::Int)
    return Embed(param(embed, vocab))
end

# Column lookup by index: (B,T)->(X,B,T)->rnn->(H,B,T)
function (e::Embed)(x)
    return e.w[:, x]
end

In [3]:
# Affine layer with weight matrix `w` and bias `b`.
struct Linear
    w
    b
end

# Build a layer with an (output, input) weight parameter and an `output`-sized bias.
function Linear(input::Int, output::Int)
    return Linear(param(output, input), param0(output))
end

# Flatten trailing dimensions, then apply the affine map.
function (l::Linear)(x)
    flat = mat(x, dims=1)        # (H,B,T)->(H,B*T)
    return l.w * flat .+ l.b     # (H,B*T)->(V,B*T)
end

In [4]:
# A chain stores its layers as a tuple and applies them one after another
struct Chain
    layers
    Chain(layers...) = new(layers)
end

# Prediction: thread the input through every layer, in order
(c::Chain)(x) = foldl((h, layer) -> layer(h), c.layers; init=x)

# Loss for one batch: negative log likelihood of targets `y` given the predictions for `x`
function (c::Chain)(x, y)
    return nll(c(x), y)
end

# Loss for a dataset: mean of the per-batch losses
function (c::Chain)(d::Data)
    return mean(c(inputs, targets) for (inputs, targets) in d)
end

In [5]:
# Character language model: embedding -> RNN -> linear output layer.
# The h=0,c=0 options to RNN enable a persistent state between iterations;
# any extra keyword arguments are forwarded to RNN.
function CharLM(vocab::Int, embed::Int, hidden::Int; o...)
    embedding = Embed(vocab, embed)
    recurrent = RNN(embed, hidden; h=0, c=0, o...)
    output = Linear(hidden, vocab)
    return Chain(embedding, recurrent, output)
end

CharLM (generic function with 1 method)

## Train and test utilities

In [6]:
# For running experiments.
#
# Asks on stdin whether to train from scratch. A reply starting with "y" builds a model
# with `maker()`, trains it with adam on the global `dtrn` (cycled), evaluates
# exp(model(dtst)) on the global `dtst` once per 100 training steps, runs that sequence
# through `converge`, and saves the model and `chars` to `file`. Any other reply
# (including an empty one) loads the model and chars from `file`, downloading the file
# first if it does not exist locally.
#
# Returns the tuple (model, chars).
function trainresults(file,maker,chars)
    print("Train from scratch? ")
    # startswith is simply false for an empty reply (bare Enter / EOF), whereas
    # indexing the reply with [1] would throw a BoundsError in that case.
    if startswith(readline(), "y")
        model = maker()
        a = adam(model,cycle(dtrn))
        b = (exp(model(dtst)) for _ in takenth(a,100))
        c = converge(b, alpha=0.1)
        progress!(p->p.currval, c)
        Knet.save(file,"model",model,"chars",chars)
    else
        isfile(file) || download("http://people.csail.mit.edu/deniz/models/tutorial/$file",file)
        model,chars = Knet.load(file,"model","chars")
    end
    Knet.gc() # To save gpu memory
    return model,chars
end

trainresults (generic function with 1 method)

In [7]:
# To generate text from trained models.
#
# Prints `n` characters followed by a newline. Generation starts from character id 1
# with the model state reset; at each step the model is fed the previous id and the next
# id is drawn with probability proportional to exp.(y) of the model output.
function generate(model,chars,n)
    # Draw an index with probability proportional to exp.(y)
    function sample(y)
        p = Array(exp.(y)); r = rand()*sum(p)
        for j in eachindex(p); (r -= p[j]) < 0 && return j; end
        # Rounding can leave r >= 0 after the whole pass (sum and the running
        # subtraction accumulate differently); fall back to the last index rather
        # than returning nothing.
        return lastindex(p)
    end
    x = 1
    reset!(model)
    for i=1:n
        y = model([x])
        x = sample(y)
        print(chars[x])
    end
    println()
end

# Zero the hidden and cell state of every RNN layer in the chain
function reset!(m::Chain)
    for layer in m.layers
        if layer isa RNN
            layer.h = 0
            layer.c = 0
        end
    end
end;

## The Complete Works of William Shakespeare

In [8]:
# Hyperparameters for the Shakespeare model, read as globals by the cells below:
# RNNTYPE and NUMLAYERS are forwarded to RNN as rnnType / numLayers;
# VOCABSIZE, INPUTSIZE and HIDDENSIZE are CharLM's vocab, embed and hidden sizes;
# BATCHSIZE and SEQLENGTH set the (B,T) shape of the minibatches built by mb.
RNNTYPE = :lstm
BATCHSIZE = 256
SEQLENGTH = 100
VOCABSIZE = 84
INPUTSIZE = 168
HIDDENSIZE = 334
NUMLAYERS = 1;

In [9]:
# Load 'The Complete Works of William Shakespeare':
# training ids, test ids and the character table
include(Knet.dir("data","gutenberg.jl"))
trn, tst, shakechars = shakespeare()
summary.((trn, tst, shakechars))

("4934845-element Array{UInt8,1}", "526731-element Array{UInt8,1}", "84-element Array{Char,1}")

In [10]:
# Print a sample: map a slice of training ids back to characters and join them
println(join(shakechars[trn[1020:1210]]))


    Cheated of feature by dissembling nature,
    Deform'd, unfinish'd, sent before my time
    Into this breathing world scarce half made up,
    And that so lamely and unfashionable
 


In [11]:
# Minibatch data: lay the sequence out as BATCHSIZE contiguous rows, then cut the rows
# into (B,T) blocks where the targets are the inputs shifted by one position.
function mb(a)
    ncols = length(a) ÷ BATCHSIZE
    # reshape full data to (B,N) with contiguous rows
    rows = adjoint(reshape(a[1:ncols*BATCHSIZE], ncols, BATCHSIZE))
    inputs = rows[:, 1:ncols-1]
    targets = rows[:, 2:ncols]
    # split into (B,T) blocks
    return minibatch(inputs, targets, SEQLENGTH)
end
dtrn, dtst = map(mb, (trn, tst))
map(length, (dtrn, dtst))

(192, 20)

In [12]:
# each x and y have dimensions (BATCHSIZE,SEQLENGTH)
map(summary, first(dtrn))

("256×100 Array{UInt8,2}", "256×100 Array{UInt8,2}")

In [13]:
# [180, 06:58, 2.32s/i] 3.3026385
# Model factory handed to trainresults; reads the hyperparameter globals when called
function shakemaker()
    return CharLM(VOCABSIZE, INPUTSIZE, HIDDENSIZE; rnnType=RNNTYPE, numLayers=NUMLAYERS)
end
shakemodel, shakechars = trainresults("shakespeare132.jld2", shakemaker, shakechars);

Train from scratch? stdin> n


In [14]:
#exp(shakemodel(dtst))  # Perplexity = 3.3150165f0

In [15]:
# Print 1000 characters sampled from the Shakespeare model
generate(shakemodel,shakechars,1000)

PARYNENGET           With her good wife

                Enter KING PHILJES; NECNOLBUS
  PAGE OVERNAS, the a darkey sounds in colours.
    Petruchio's gart, sweet Jamor, alike how sliolat
     Up her shrine, and all these will not be not tall-
    But if thou chost of thoo I will never hang
    To fright a follier wrongs with drum.  
  Ophin. It is worth else.
  Hot. A very sign is a deed-proud. How do't thy little fell!  
  Lear. I am lost ourselves, and thou begst it from Some for the day
     place in last, that diverses thy prince women
     That it is elft his beard; the rue too late
     Not truck at me. It give them for quires.
  IAGO. I yet rough this if his heart of eternal women
    I should accuse your voice.
  Lear. Yet, sir, a tear.
    And stock, my lord.
  Friar. Farewell!
  Beat. What a base devil sparefull some gentry should have will not talk with the
    cap
    unseas'd; that the Hotsoman knew'st it now, come!--Let's buring one
    reason to ho


## Julia programmer

In [16]:
# Hyperparameters for the Julia source model, read as globals by the cells below:
# RNNTYPE and NUMLAYERS are forwarded to RNN as rnnType / numLayers;
# VOCABSIZE, INPUTSIZE and HIDDENSIZE are CharLM's vocab, embed and hidden sizes;
# BATCHSIZE and SEQLENGTH set the (B,T) shape of the minibatches built by mb.
RNNTYPE = :lstm
BATCHSIZE = 64
SEQLENGTH = 64
INPUTSIZE = 512
VOCABSIZE = 128
HIDDENSIZE = 512
NUMLAYERS = 2;

In [17]:
# Read julia base library source code: concatenate every `.jl` file found under the
# base source directory, in walkdir order, into the single string `text`.
base = joinpath(Sys.BINDIR, Base.DATAROOTDIR, "julia", "base")
text = let buf = IOBuffer()
    for (root, _, files) in walkdir(base)
        for f in files
            # endswith is safe for names shorter than three bytes and for non-ASCII
            # names, where slicing the last three byte indices would throw
            endswith(f, ".jl") || continue
            # Append to one buffer instead of rebuilding the whole string per file
            write(buf, read(joinpath(root, f)))
        end
    end
    String(take!(buf))
end
length(text)

2832949

In [18]:
# Find unique chars, sort by frequency, assign integer ids.
charcnt = Dict{Char,Int}()
for c in text
    charcnt[c] = get(charcnt, c, 0) + 1
end
# Most frequent character first
juliachars = sort(collect(keys(charcnt)), by=(ch -> charcnt[ch]), rev=true)
# Character -> rank in the frequency ordering
charid = Dict{Char,Int}()
for (rank, ch) in enumerate(juliachars)
    charid[ch] = rank
end
# Two-column table: character, count
hcat(juliachars, [charcnt[ch] for ch in juliachars])

197×2 Array{Any,2}:
 ' '   593337
 'e'   183265
 't'   146524
 'n'   128856
 'i'   119906
 'r'   109778
 'a'   103761
 's'    99377
 'o'    94416
 '\n'   89082
 'l'    70424
 'd'    61536
 'c'    57355
 ⋮           
 'μ'        1
 '”'        1
 '≶'        1
 'ᶜ'        1
 '¾'        1
 '⁹'        1
 '⛵'        1
 '⁷'        1
 '⊗'        1
 'Λ'        1
 '⋅'        1
 'ϒ'        1

In [19]:
# Keep only VOCABSIZE most frequent chars, split into train and test
# Ids above VOCABSIZE are clamped to VOCABSIZE
data = map(c -> charid[c], collect(text))
data .= min.(data, VOCABSIZE)
ntst = 2^19
tst = data[1:ntst]          # first ntst ids
trn = data[ntst+1:end]      # everything after them
map(length, (data, trn, tst))

(2832949, 2308661, 524288)

In [20]:
# Print a sample starting at a random position in the training data
r = rand(1:(length(trn)-1000))
println(join(juliachars[trn[r:r+1000]]))

For `n <
0`, this is equivalent to `x << -n`.

# Examples
```jldoctest
julia> Int8(13) >> 2
3

julia> bitstring(Int8(13))
"00001101"

julia> bitstring(Int8(3))
"00000011"

julia> Int8(-14) >> 2
-4

julia> bitstring(Int8(-14))
"11110010"

julia> bitstring(Int8(-4))
"11111100"
```
See also [`>>>`](@ref), [`<<`](@ref).
"""
function >>(x::Integer, c::Integer)
    @_inline_meta
    if c isa UInt
        throw(MethodError(>>, (x, c)))
    end
    typemin(Int) <= c <= typemax(Int) && return x >> (c % Int)
    (x >= 0 || c < 0) && return zero(x) >> 0
    oftype(x, -1)
end
>>(x::Integer, c::Int) = c >= 0 ? x >> unsigned(c) : x << unsigned(-c)

"""
    >>>(x, n)

Unsigned right bit shift operator, `x >>> n`. For `n >= 0`, the result is `x`
shifted right by `n` bits, where `n >= 0`, filling with `0`s. For `n < 0`, this
is equivalent to `x << -n`.

For [`Unsigned`](@ref) integer types, this is equivalent to [`>>`](@ref). For
[`Signed`](@ref) integer types, this is equivalent to `signed(unsigned(x)

In [21]:
# Minibatch data: lay the sequence out as BATCHSIZE contiguous rows, then cut the rows
# into (B,T) blocks where the targets are the inputs shifted by one position.
function mb(a)
    ncols = length(a) ÷ BATCHSIZE
    # reshape full data to (B,N) with contiguous rows
    rows = adjoint(reshape(a[1:ncols*BATCHSIZE], ncols, BATCHSIZE))
    inputs = rows[:, 1:ncols-1]
    targets = rows[:, 2:ncols]
    # split into (B,T) blocks
    return minibatch(inputs, targets, SEQLENGTH)
end
dtrn, dtst = map(mb, (trn, tst))
map(length, (dtrn, dtst))

(563, 127)

In [22]:
# each x and y have dimensions (BATCHSIZE,SEQLENGTH)
map(summary, first(dtrn))

("64×64 Array{Int64,2}", "64×64 Array{Int64,2}")

In [23]:
# [150, 05:04, 2.03s/i] 3.2988634
# Model factory handed to trainresults; reads the hyperparameter globals when called
function juliamaker()
    return CharLM(VOCABSIZE, INPUTSIZE, HIDDENSIZE; rnnType=RNNTYPE, numLayers=NUMLAYERS)
end
juliamodel, juliachars = trainresults("juliacharlm132.jld2", juliamaker, juliachars);

Train from scratch? stdin> n


In [24]:
#exp(juliamodel(dtst))  # Perplexity = 3.8615866f0

In [25]:
# Print 1000 characters sampled from the Julia source model
generate(juliamodel,juliachars,1000)

x             t)
        @inline occursin = fillpadd(min(one(Tr,UInt64(1))) + 1*n - 4Valued_byte
        l += 1
        push!(based_end(x,cptr))
        m = read(f, Int32(0))
        if padding == 2
            convert(P, dump(xyp, xmpfilebuf)) + leading_zeros(i2, len)
            push!(blk.args, :(print(out, 'x'), 'x'))
            elseif pointext = get(io,:SignificandSoze(b)->exponent_biass[], pb, 0)
            cad = '5' ? fixupdic(dept)
            exponent_base = 32 && (p == 17) || buf == 1
            hhigh = UInt32(b)
            t = BigFloat(width, p)
            higherror(a)
        end
    elseif arrayref === Float32(heanonizerox(aft))
        return Base.extread_block
    else
        len = 0 # final errors out might
        fmp = dispatchunk(domtree, self)
        x = point
    end
    blk = ccall(:GetStypes, pad, Any, (Csize_t, Base), s, false, 0)
    while !isempty(Woppend[keys(prevind(W, first_byteslice(str))))>0.0, 0,
                blust_idx > block.succs[2]]
        