# Character based RNN language model
(c) Deniz Yuret, 2018. Based on <http://karpathy.github.io/2015/05/21/rnn-effectiveness/>.

* Objectives: Learn to define and train a character based language model and generate text from it. Minibatch blocks of text. Keep a persistent RNN state between updates. Train a Shakespeare generator and a Julia programmer using the same type of model.
* Prerequisites: [RNN basics](06.rnn.ipynb), minibatch, param, param0, RNN, dropout, train!, Adam, nll, value

In [1]:
# Install the packages this notebook needs when they cannot be found in the current
# environment. `Pkg.installed()` is deprecated (Julia >= 1.4), so the lookup uses
# `Base.find_package`, which returns `nothing` for a package that is not available.
using Pkg
for p in ("Knet", "ProgressMeter")
    Base.find_package(p) === nothing && Pkg.add(p)
end

## Define the model

In [2]:
using Knet: param, param0, RNN, dropout, value

┌ Info: Recompiling stale cache file /home/gridsan/dyuret/.julia/compiled/v1.0/Knet/f4vSz.ji for Knet [1902f260-5fb4-5aff-8c31-6271790ab950]
└ @ Base loading.jl:1187


In [3]:
# Embedding layer: `w` stores one column of weights per vocabulary entry.
struct Embed
    w
end

# Create the weight matrix with `param(embed, vocab)` for a vocabulary of `vocab` items.
function Embed(vocab::Int, embed::Int)
    return Embed(param(embed, vocab))
end

# Select the columns of `w` addressed by `x` (a single index or an array of indices).
function (e::Embed)(x)
    return e.w[:, x]
end

In [4]:
# Affine layer holding a weight matrix `w` and a bias `b`.
struct Linear
    w
    b
end

# Create the weights with `param(output, input)` and the bias with `param0(output)`.
function Linear(input::Int, output::Int)
    return Linear(param(output, input), param0(output))
end

# Apply the layer: multiply by `w`, then broadcast-add `b`.
function (l::Linear)(x)
    return l.w * x .+ l.b
end

In [5]:
# Character language model made of three stages: input embedding, RNN, output projection.
struct CharLM
    input
    rnn
    output
end

# Build the three stages; any extra keyword arguments `o...` are forwarded to `RNN`.
function CharLM(vocab::Int, input::Int, hidden::Int; o...)
    embedding = Embed(vocab, input)
    recurrent = RNN(input, hidden; o...)
    projection = Linear(hidden, vocab)
    return CharLM(embedding, recurrent, projection)
end

# Run the model on `x`. `pdrop` is the dropout probability applied before and after the
# RNN; `hidden` is passed to the RNN and overwritten in place with `value.(hidden)`.
function (c::CharLM)(x; pdrop=0, hidden=[])
    embedded = dropout(c.input(x), pdrop)           # (B,T)->(X,B,T)
    states = c.rnn(embedded, hidden=hidden)         # (H,B,T)
    hidden .= value.(hidden)                        # avoid deps in next iter
    states = dropout(states, pdrop)
    flat = reshape(states, size(states, 1), :)      # (H,B*T)
    return c.output(flat)                           # (V,B*T)
end

In [6]:
# To generate text from trained models
# Generate `n` characters from `model` and print them to `io` followed by a newline.
#
# Arguments:
#   model - callable used as `model([x], hidden=h)`; its output `y` is treated as
#           unnormalized log-probabilities (only the ratios of `exp.(y)` matter).
#   chars - indexable collection mapping a sampled index to the character to print.
#   n     - number of characters to generate.
#   io    - output stream (defaults to `stdout`).
#
# Generation starts from index 1 and feeds each sampled index back in as the next input.
# The same `h` array is passed on every call so the model can keep state in it.
function generate(model, chars, n; io::IO=stdout)
    function sample(y)
        p = vec(Array(exp.(y)))
        r = rand() * sum(p)
        for j in eachindex(p)
            (r -= p[j]) < 0 && return j
        end
        # Rounding (e.g. `sum` and the running subtraction accumulating differently) or
        # an all-zero `p` can leave `r >= 0` after the loop. Fall back to the last index
        # with positive weight, or the last index, instead of returning `nothing`.
        return something(findlast(v -> v > 0, p), lastindex(p))
    end
    x = 1
    h = []
    for i in 1:n
        y = model([x], hidden=h)
        x = sample(y)
        print(io, chars[x])
    end
    println(io)
end;

In [7]:
# For running experiments
using Knet: AutoGrad, Knet, train!, Adam; import ProgressMeter
# Either train `model` from scratch and save it to `file`, or load a saved model.
#
# The user is asked on stdin; an answer starting with 'y' trains, anything else
# (including an empty line) loads. Training reads the globals `dtrn`, `EPOCHS`, `LR`,
# `BETA_1`, `BETA_2`, `EPS` and `DROPOUT`. When loading, `file` is downloaded first if
# it does not exist locally. Returns the `(model, chars)` pair that was trained or loaded.
function trainresults(file,model,chars)
    print("Train from scratch? ")
    # startswith avoids the BoundsError that `readline()[1]` raises on an empty answer.
    if startswith(readline(), "y")
        updates = 0
        prog = ProgressMeter.Progress(EPOCHS * length(dtrn))
        # Advance the progress bar; the result turns false after `prog.n` updates
        # (presumably what makes train! stop — confirm against Knet).
        callback(J) = (ProgressMeter.update!(prog, updates); (updates += 1) <= prog.n)
        opt = Adam(lr=LR, beta1=BETA_1, beta2=BETA_2, eps=EPS)
        train!(model, dtrn; callback=callback, optimizer=opt, pdrop=DROPOUT, hidden=[])
        Knet.gc()
        Knet.save(file, "model", model, "chars", chars)
    else
        isfile(file) || download("http://people.csail.mit.edu/deniz/models/tutorial/$file",file)
        model, chars = Knet.load(file, "model", "chars")
    end
    return model, chars
end

trainresults (generic function with 1 method)

## The Complete Works of William Shakespeare

In [8]:
# Settings for the Shakespeare experiment (read as globals by `mb` and `trainresults`).
RNNTYPE    = :lstm     # passed to CharLM/RNN as rnnType
BATCHSIZE  = 256       # number of rows per minibatch
SEQLENGTH  = 100       # number of characters per row in a minibatch
INPUTSIZE  = 168       # embedding size
VOCABSIZE  = 84        # number of distinct character ids
HIDDENSIZE = 334       # RNN hidden size
NUMLAYERS  = 1         # passed to RNN as numLayers
DROPOUT    = 0.0       # dropout probability
LR         = 0.001     # Adam learning rate
BETA_1     = 0.9       # Adam beta1
BETA_2     = 0.999     # Adam beta2
EPS        = 1e-08     # Adam eps
EPOCHS     = 30        # training runs for EPOCHS * length(dtrn) updates
ENV["COLUMNS"] = 92;   # presumably limits the width of displayed output — confirm

In [9]:
# Load 'The Complete Works of William Shakespeare' through Knet's gutenberg.jl helper:
# `trn` and `tst` are the encoded train/test text, `chars` maps ids back to characters.
include(Knet.dir("data", "gutenberg.jl"))
trn, tst, chars = shakespeare()
summary.((trn, tst, chars))

("4934845-element Array{UInt8,1}", "526731-element Array{UInt8,1}", "84-element Array{Char,1}")

In [10]:
# Decode a fixed slice of the training ids back to characters and print it.
println(join(chars[trn[1020:1210]]))


    Cheated of feature by dissembling nature,
    Deform'd, unfinish'd, sent before my time
    Into this breathing world scarce half made up,
    And that so lamely and unfashionable
 


In [11]:
# Minibatch data
using Knet: minibatch
"""
    mb(a; batchsize=BATCHSIZE, seqlength=SEQLENGTH)

Split the id sequence `a` into minibatches for language-model training.

The trailing `length(a) % batchsize` elements are dropped and the rest is arranged as a
`batchsize`×N matrix whose rows are contiguous stretches of `a`. Inputs are columns
`1:N-1`, targets are the same rows shifted by one (columns `2:N`), and `minibatch` cuts
both into blocks of `seqlength` columns. The keywords default to the globals `BATCHSIZE`
and `SEQLENGTH`, so `mb(a)` behaves as before.
"""
function mb(a; batchsize=BATCHSIZE, seqlength=SEQLENGTH)
    N = length(a) ÷ batchsize
    x = reshape(a[1:N*batchsize], N, batchsize)'    # (B,N) with contiguous rows
    return minibatch(x[:, 1:N-1], x[:, 2:N], seqlength)  # split into (B,T) blocks
end
dtrn,dtst = mb.((trn,tst))
length.((dtrn,dtst))

(192, 20)

In [12]:
map(summary, first(dtrn))  # both x and y of a batch are (BATCHSIZE,SEQLENGTH)

("256×100 Array{UInt8,2}", "256×100 Array{UInt8,2}")

In [13]:
# Build a fresh CharLM from the current settings and hand it to trainresults, which
# either trains it or returns the saved model and character table instead.
shakemodel, shakechars = trainresults(
    "shakespeare.jld2",
    CharLM(VOCABSIZE, INPUTSIZE, HIDDENSIZE;
           rnnType=RNNTYPE, numLayers=NUMLAYERS, dropout=DROPOUT),
    chars,
);

Train from scratch? stdin> y


[32mProgress: 100%|█████████████████████████████████████████████████████| Time: 0:02:13[39mm


In [14]:
using Knet: nll
# Perplexity of the trained model on the held-out batches in dtst: exp of the value
# returned by nll (presumably the average per-character negative log likelihood — confirm).
exp(nll(shakemodel,dtst))  # Perplexity

4.2537537f0

In [15]:
# Sample 1000 characters from the Shakespeare model and print them.
generate(shakemodel,shakechars,1000)

"Dav's  
  Est. Yea, you should he 'em. If they shall do.
  Wor, know thee?
ADRIANA. Why, here, Captain Tobolus
  Her truths are those honour'd them: O, thou wast
  I never eardly thing to this-belly.
  Was play the belly, way, marry,
     Nor more than I am a duk'd?

Enter PISTOL and ONTH.
  COUNTESS. Peace is bonds, their faults asham'd
    The radst at the charm.
  MRS. PAGE. Sport the affection?
  BEDFORD. Eoses he is me; sir, 'tis to heaven with
    some watch are not. Where would you out the remembrance is to come and is
    thought by you, more that to say the greatness too;
    I will not live by thine Blutus' liquid to thee.
    What, Macturashel! the King rise here my neces
    Of this heels and insteath'd, and of one thing
    What nothing in my anking?
    He was company that warrants often so,
    And to the King hath assage our uncled coming her.
                                                       [Flourishion at me]
    Such arrounded shiots Hec


## Julia programmer

In [16]:
# Settings for the Julia-source experiment (read as globals by `mb` and `trainresults`).
RNNTYPE    = :lstm     # passed to CharLM/RNN as rnnType
BATCHSIZE  = 64        # number of rows per minibatch
SEQLENGTH  = 64        # number of characters per row in a minibatch
INPUTSIZE  = 512       # embedding size
VOCABSIZE  = 128       # number of distinct character ids kept
HIDDENSIZE = 512       # RNN hidden size
NUMLAYERS  = 2         # passed to RNN as numLayers
DROPOUT    = 0.0       # dropout probability
LR         = 0.001     # Adam learning rate
BETA_1     = 0.9       # Adam beta1
BETA_2     = 0.999     # Adam beta2
EPS        = 1e-08     # Adam eps
EPOCHS     = 10        # training runs for EPOCHS * length(dtrn) updates
ENV["COLUMNS"] = 92;   # presumably limits the width of displayed output — confirm

In [17]:
# Read julia base library source code: concatenate every `.jl` file found under the
# Julia data directory into one string, in the order `walkdir` visits them.
base = joinpath(Sys.BINDIR, Base.DATAROOTDIR, "julia")
buf = IOBuffer()
for (root, dirs, files) in walkdir(base)
    for f in files
        # endswith is safe for short names and names ending in multibyte characters,
        # where byte-slicing with f[end-2:end] would throw.
        endswith(f, ".jl") || continue
        # Appending to a buffer avoids re-copying the accumulated text for every file
        # and does not rebind a global from inside the loop.
        write(buf, read(joinpath(root, f)))
    end
end
text = String(take!(buf))
length(text)

9131265

In [18]:
# Count every character of `text`, order the distinct characters from most to least
# frequent, and give each one its rank as an integer id.
charcnt = Dict{Char,Int}()
for c in text
    charcnt[c] = 1 + get(charcnt, c, 0)
end
chars = sort(collect(keys(charcnt)), by=(x->charcnt[x]), rev=true)
charid = Dict{Char,Int}(c => i for (i, c) in enumerate(chars))
hcat(chars, [charcnt[c] for c in chars])

3642×2 Array{Any,2}:
 ' '   1971836
 'e'    548012
 't'    477724
 'n'    343215
 'r'    338122
 'i'    329419
 's'    325865
 'a'    316561
 'o'    275999
 '\n'   265652
 'l'    203478
 ','    200306
 ')'    194094
 ⋮            
 'ה'         1
 '🍢'         1
 '𝗾'         1
 '𝔔'         1
 'É'         1
 '𝓟'         1
 '𝚿'         1
 '𝕨'         1
 'ɛ'         1
 'Χ'         1
 '🕙'         1
 'ℚ'         1

In [19]:
# Encode the text as character ids, mapping every id above VOCABSIZE to VOCABSIZE so only
# the most frequent characters stay distinct. The first 2^19 ids become the test set and
# the remainder the training set.
data = [min(charid[c], VOCABSIZE) for c in text]
ntst = 2^19
tst = data[1:ntst]
trn = data[ntst+1:end]
map(length, (data, trn, tst))

(9131265, 8606977, 524288)

In [20]:
# Decode 1001 consecutive training ids starting at a random offset and print them.
r = rand(1:(length(trn) - 1000))
println(join(chars[trn[r:r+1000]]))

the result to type `T`, throwing an `InexactError` if the value is
not representable.

`digits`, `sigdigits` and `base` work as for [`round`](@ref).
"""
function floor end

"""
    ceil([T,] x)
    ceil(x; digits::Integer= [, base = 10])
    ceil(x; sigdigits::Integer= [, base = 10])

`ceil(x)` returns the nearest integral value of the same type as `x` that is greater than or
equal to `x`.

`ceil(T, x)` converts the result to type `T`, throwing an `InexactError` if the value is not
representable.

`digits`, `sigdigits` and `base` work as for [`round`](@ref).
"""
function ceil end

round(::Type{T}, x::Integer) where {T<:Integer} = convert(T, x)
trunc(::Type{T}, x::Integer) where {T<:Integer} = convert(T, x)
floor(::Type{T}, x::Integer) where {T<:Integer} = convert(T, x)
 ceil(::Type{T}, x::Integer) where {T<:Integer} = convert(T, x)

## integer construction ##

"""
    @int128_str str
    @int128_str(str)

`@int128_str` parses a string into a Int128
Throws an `ArgumentError` if the stri

In [21]:
# Minibatch data
using Knet: minibatch
"""
    mb(a; batchsize=BATCHSIZE, seqlength=SEQLENGTH)

Split the id sequence `a` into minibatches for language-model training.

The trailing `length(a) % batchsize` elements are dropped and the rest is arranged as a
`batchsize`×N matrix whose rows are contiguous stretches of `a`. Inputs are columns
`1:N-1`, targets are the same rows shifted by one (columns `2:N`), and `minibatch` cuts
both into blocks of `seqlength` columns. The keywords default to the globals `BATCHSIZE`
and `SEQLENGTH`, so `mb(a)` behaves as before.
"""
function mb(a; batchsize=BATCHSIZE, seqlength=SEQLENGTH)
    N = length(a) ÷ batchsize
    x = reshape(a[1:N*batchsize], N, batchsize)'    # (B,N) with contiguous rows
    return minibatch(x[:, 1:N-1], x[:, 2:N], seqlength)  # split into (B,T) blocks
end
dtrn,dtst = mb.((trn,tst))
length.((dtrn,dtst))

(2101, 127)

In [22]:
map(summary, first(dtrn))  # both x and y of a batch are (BATCHSIZE,SEQLENGTH)

("64×64 Array{Int64,2}", "64×64 Array{Int64,2}")

In [23]:
# Build a fresh CharLM from the current settings and hand it to trainresults, which
# either trains it or returns the saved model and character table instead.
juliamodel, juliachars = trainresults(
    "juliacharlm.jld2",
    CharLM(VOCABSIZE, INPUTSIZE, HIDDENSIZE;
           rnnType=RNNTYPE, numLayers=NUMLAYERS, dropout=DROPOUT),
    chars,
);

Train from scratch? stdin> n


In [24]:
using Knet: nll
# Perplexity of the Julia model on the held-out batches in dtst: exp of the value
# returned by nll (presumably the average per-character negative log likelihood — confirm).
exp(nll(juliamodel,dtst))  # Perplexity

5.028319f0

In [25]:
# Sample 1000 characters from the Julia-source model and print them.
generate(juliamodel,juliachars,1000)

   < shape == 3
    ([x], [x], [2, 3], [1,1,b"])]
    SparseVector([1,3,3],[8,1,1], 1:0, 1,8)
end
@testset "BroadcastQ of Pair.Han.return array greatessager arrays" begin
    # The 324-by-Y writable is available on point, a_tests
    [0.000158980715314,2.7206942494801835e-01,
     2.5212241616157455   1.14180073591295e-01,1.0183052285577349e-01,
     1.1856332520854704e-01,2.5843721571217466e-01,1.7779606293984581e-01,
     5.4694543820536906e-01,4.8506347147319964e-01,2.208561352207217e-01
        1.441444248586268e-01,1.9022236482660231e-01,1.4219025461243420e-01)
eltype(rval) =
    throw(ArgumentError("collection to types with $A1ha Diagonal")) => Any
#45039
# This file is parse indicisting
using SubArray

let x = 24
    @test SubStringString(s, y) == [1, 3, 4]
end

@test sprint(show, x) == Expr(:error, "reg count; begin & GenericStrings")
@testset "scale on summinger(@__doubleerror("bp"))


# R output
    @tested(replace("exported = 0x0000") == "0x0000000"
    out = reinterpret(Int