In [1]:
import Pkg
Packages = ["IterTools", "LinearAlgebra", "LinearAlgebra", "StatsBase", "Test"]
for p in Packages; Pkg.add(p); end;
using Knet, Base.Iterators, IterTools, LinearAlgebra, StatsBase, Test

[32m[1m  Updating[22m[39m registry at `~/.julia/registries/General`
[32m[1m  Updating[22m[39m git-repo `https://github.com/JuliaRegistries/General.git`
[32m[1m Installed[22m[39m IterTools ─ v1.2.0
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.2/Project.toml`
 [90m [c8e1da08][39m[92m + IterTools v1.2.0[39m
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.2/Manifest.toml`
 [90m [c8e1da08][39m[92m + IterTools v1.2.0[39m
[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.2/Project.toml`
 [90m [37e2e46d][39m[92m + LinearAlgebra [39m
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.2/Manifest.toml`
[90m [no changes][39m
[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.2/Project.toml`
[90m [no changes][39m
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.2/Manifest.toml`
[90m [no changes][39m
[32m[1m Resolving[22m[39m 

┌ Info: Precompiling IterTools [c8e1da08-722c-5040-9ed9-7db0dc04731e]
└ @ Base loading.jl:1242
┌ Info: Precompiling StatsBase [2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91]
└ @ Base loading.jl:1242


In [2]:
macro size(z, s); esc(:(@assert (size($z) == $s) string(summary($z),!=,$s))); end; # for debugging

In [3]:
const datadir = "nn4nlp-code/data/ptb"

"nn4nlp-code/data/ptb"

In [4]:
isdir(datadir) || run(`git clone https://github.com/neubig/nn4nlp-code.git`)

Cloning into 'nn4nlp-code'...


Process(`[4mgit[24m [4mclone[24m [4mhttps://github.com/neubig/nn4nlp-code.git[24m`, ProcessExited(0))

In [5]:
struct Vocab
    w2i::Dict{String,Int}
    i2w::Vector{String}
    unk::Int
    eos::Int
    tokenizer
end

In [6]:
function Vocab(file::String; tokenizer=split, vocabsize=Inf, mincount=1, unk="<unk>", eos="<s>")
    io = open(file, "r")
    lines = readlines(io)
    close(io)
    
    reviews = []
    
    #tokenize each line with the function:tokenizer, add each tokenized line to reviews
    for line in lines; tokenized = push!(tokenizer(line), eos); push!(reviews, tokenized); end
      
    freq = Dict(); 
    
    #iterate over all words, count how many times they occur by adding them to w2i
    for review in reviews
        for word in review
            if word in keys(freq); freq[word] += 1; else; freq[word] = 1; end
        end
    end
    
    #sort the dictionary based on word frequencies, returns an array
    freq = sort(collect(freq), by=x->x[2])
    
    # Remove the least common word until we reach the specified vocabsize.
    # Keep track of total removed values
    total_removed = 0
    while (length(freq) > vocabsize)
        total_removed += freq[1][2]
        popfirst!(freq) 
    end
    
    # keep only the words that occur >= mincount times
    temp = []
    for (word, count) in freq
        if count >= mincount; push!(temp, (word, count)); end
    end
    freq = temp

    #turn array back into dictionary
    freq = Dict{String, Int}(freq)
    
    #add total removed values to <unk>    
    freq[unk] += total_removed
    
    # Create i2w
    w2i = Dict(); i2w = []
    for (i,elt) in enumerate(keys(freq))
        w2i[elt] = i
        push!(i2w, elt)
    end
    i2w = Vector{String}(i2w)
    
    Vocab(w2i, i2w, w2i[unk], w2i[eos], tokenizer)
end

Vocab

In [7]:
@info "Testing Vocab"
f = "$datadir/train.txt"
v = Vocab(f)
@test all(v.w2i[w] == i for (i,w) in enumerate(v.i2w))
@test length(Vocab(f).i2w) == 10000
@test length(Vocab(f, vocabsize=1234).i2w) == 1234
@test length(Vocab(f, mincount=5).i2w) == 9859

┌ Info: Testing Vocab
└ @ Main In[7]:1


[32m[1mTest Passed[22m[39m

In [8]:
function words_to_ints(vocab::Vocab, sentence)
    [get(vocab.w2i, word, vocab.unk) for word in vocab.tokenizer(sentence)]
end

words_to_ints (generic function with 1 method)

In [9]:
train_vocab = Vocab("$datadir/train.txt");

In [10]:
struct TextReader
    file::String
    vocab::Vocab
end

In [11]:
function Base.iterate(r::TextReader, s=nothing)
    if s==nothing
        state=open(r.file)
    else 
        state=s
    end
    if eof(state)
        close(state)
        return nothing
    else
        line= readline(state)
        return words_to_ints(r.vocab, line), state
    end
end


# These are some optional functions that can be defined for iterators. They are required for
# `collect` to work, which converts an iterator to a regular array.

Base.IteratorSize(::Type{TextReader}) = Base.SizeUnknown()
Base.IteratorEltype(::Type{TextReader}) = Base.HasEltype()
Base.eltype(::Type{TextReader}) = Vector{Int}

In [12]:
@info "Testing TextReader"
train_sentences, valid_sentences, test_sentences =
    (TextReader("$datadir/$file.txt", train_vocab) for file in ("train","valid","test"))
@test length(first(train_sentences)) == 24
@test length(collect(train_sentences)) == 42068
@test length(collect(valid_sentences)) == 3370
@test length(collect(test_sentences)) == 3761

┌ Info: Testing TextReader
└ @ Main In[12]:1


[32m[1mTest Passed[22m[39m

In [13]:
struct Embed; w; end

function Embed(vocabsize::Int, embedsize::Int)
    Embed(param(embedsize,vocabsize))
end

function (l::Embed)(x)
    l.w[:,x]
end

In [14]:
@info "Testing Embed"
Knet.seed!(1)
embed = Embed(100,10)
embed.w
input = rand(1:100, 2, 3)
output = embed(input)

@test size(output) == (10, 2, 3)
@test norm(output) ≈ 0.59804f0

┌ Info: Testing Embed
└ @ Main In[14]:1


[32m[1mTest Passed[22m[39m

In [15]:
struct Linear; w; b; end

function Linear(inputsize::Int, outputsize::Int)
    Linear(param(outputsize, inputsize), param0(outputsize))
end

function (l::Linear)(x)
    l.w * mat(x,dims=1) .+ l.b

end

In [16]:
@info "Testing Linear"
Knet.seed!(1)
linear = Linear(100,10)
input = oftype(linear.w, randn(Float32, 100, 5))
output = linear(input)
@test size(output) == (10, 5)
@test norm(output) ≈ 5.5301356f0

┌ Info: Testing Linear
└ @ Main In[16]:1


[32m[1mTest Passed[22m[39m

In [17]:
struct NNLM; vocab; windowsize; embed; hidden; output; dropout; end

In [18]:
function NNLM(vocab::Vocab, windowsize::Int, embedsize::Int, hiddensize::Int, dropout::Real)
    
    vocabsize = length(vocab.w2i)

    embed = Embed(vocabsize, embedsize)
    hidden = Linear(windowsize * embedsize, hiddensize)
    output = Linear(hiddensize, vocabsize)
    
    NNLM(vocab, windowsize, embed, hidden, output, dropout)
    
end

NNLM

In [19]:
HIST = 3
EMBED = 128
HIDDEN = 128
DROPOUT = 0.5
VOCAB = length(train_vocab.i2w)

10000

In [20]:
model = NNLM(train_vocab, HIST, EMBED, HIDDEN, DROPOUT);

In [21]:
@test model.vocab === train_vocab
@test model.windowsize === HIST
@test size(model.embed.w) == (EMBED,VOCAB)
@test size(model.hidden.w) == (HIDDEN,HIST*EMBED)
@test size(model.hidden.b) == (HIDDEN,)
@test size(model.output.w) == (VOCAB,HIDDEN)
@test size(model.output.b) == (VOCAB,)
@test model.dropout == 0.5

[32m[1mTest Passed[22m[39m

In [22]:
function pred_v1(m::NNLM, hist::AbstractVector{Int})
    @assert length(hist) == m.windowsize 
    embeds = m.embed(hist)
    embeds = reshape(embeds, size(embeds)[1] * size(embeds)[2])
    embeds = dropout(embeds, m.dropout)
    h = m.hidden(embeds)
    h = dropout(h, m.dropout)
    out = m.output(tanh.(h))
    out = reshape(out, size(out)[1])
end

pred_v1 (generic function with 1 method)

In [23]:
h = repeat([model.vocab.eos], model.windowsize)
p = pred_v1(model, h);

@test size(p) == size(train_vocab.i2w)

[32m[1mTest Passed[22m[39m

In [24]:
## This predicts the scores for the whole sentence, will be used for later testing.
function scores_v1(model, sent)
    hist = repeat([ model.vocab.eos ], model.windowsize)
    scores = []
    for word in [ sent; model.vocab.eos ]
        push!(scores, pred_v1(model, hist))
        hist = [ hist[2:end]; word ]
    end
    hcat(scores...)
end

scores_v1 (generic function with 1 method)

In [25]:
function generate(m::NNLM; maxlength=30)
    history = repeat([model.vocab.eos], model.windowsize)
    sentence_indexes = []
    sentence_words = []
    
    for i in 1:maxlength
        scores = softmax(pred_v1(m, history))
        scores[m.vocab.eos] = -10000
        scores[m.vocab.unk] = -10000
        
        best_index = argmax(softmax(scores))
        best_word = m.vocab.i2w[best_index]
        
        #handle history
        popfirst!(history)
        push!(history, i)
        
        #build the sentence
        push!(sentence_indexes, best_index)
        push!(sentence_words, best_word)
    end
    join(sentence_words, " ")
end

generate (generic function with 1 method)

In [26]:
guess = generate(model; maxlength=10)

"creek henry eaton shook toys steelworkers developers wpp inner-city rather"

In [27]:
s = generate(model; maxlength=5)
@test s isa String
@test length(split(s)) <= 5

[32m[1mTest Passed[22m[39m

In [28]:
function loss_v1(m::NNLM, sent::AbstractVector{Int}; average = true)
    new_sent = deepcopy(sent)
    push!(new_sent, m.vocab.eos)

    pred = scores_v1(m, new_sent)[:,1:length(new_sent)]
    if average; nll(pred, new_sent); else; nll(pred, new_sent; average=false); end
end

loss_v1 (generic function with 1 method)

In [29]:
s = first(train_sentences)
avgloss = loss_v1(model,s)

9.2103f0

In [30]:
s = first(train_sentences)
avgloss = loss_v1(model,s)
(tot, cnt) = loss_v1(model, s, average = false)
@test 9 < avgloss < 10
@test cnt == length(s) + 1

[32m[1mTest Passed[22m[39m

In [31]:
@test tot/cnt ≈ avgloss

[32m[1mTest Passed[22m[39m

In [32]:
function maploss(lossfn, model, data; average = true)
    if average
        losses = []
        for sent in data; push!(losses, lossfn(model, sent; average=average)); end
        return sum(losses)/length(losses)
    else
        losses = []
        for sent in data; push!(losses, lossfn(model, sent; average=average)); end 
        total_loss = sum((x->x[1]).(losses))
        (total_loss, length(data) + sum(length.(data)))
    end
end

maploss (generic function with 1 method)

In [33]:
tst100 = collect(take(test_sentences, 100))
avgloss = maploss(loss_v1, model, tst100)

9.2103615f0

In [34]:
tst100 = collect(take(test_sentences, 100))
avgloss = maploss(loss_v1, model, tst100)
@test 9 < avgloss < 10

[32m[1mTest Passed[22m[39m

In [35]:
(tot, cnt) = maploss(loss_v1, model, tst100, average = false)
@test cnt == length(tst100) + sum(length.(tst100))

[32m[1mTest Passed[22m[39m

In [36]:
@test tot/cnt ≈ avgloss

[32m[1mTest Passed[22m[39m

In [37]:
@info "Timing loss_v1 with 1000 sentences"
tst1000 = collect(take(test_sentences, 1000))
@time maploss(loss_v1, model, tst1000)

┌ Info: Timing loss_v1 with 1000 sentences
└ @ Main In[37]:1


 37.973831 seconds (3.48 M allocations: 6.268 GiB, 3.93% gc time)


9.2103615f0

In [38]:
@info "Timing loss_v1 training with 100 sentences"
trn100 = ((model,x) for x in collect(take(train_sentences, 100)))
@time sgd!(loss_v1, trn100)

┌ Info: Timing loss_v1 training with 100 sentences
└ @ Main In[38]:1


 31.980503 seconds (13.60 M allocations: 23.567 GiB, 22.40% gc time)


In [39]:
function pred_v2(m::NNLM, hist::AbstractMatrix{Int})
    embeds = m.embed(hist)
    embeds = reshape(embeds, size(embeds)[1]*size(embeds)[2], size(embeds)[3])
    embeds = dropout(embeds, m.dropout)
    h = m.hidden(embeds)
    h = tanh.(h)
    h = dropout(h, m.dropout)
    out = m.output(h)
end

pred_v2 (generic function with 1 method)

In [40]:
function scores_v2(model, sent)
    hist = [ repeat([ model.vocab.eos ], model.windowsize); sent ]
    hist = vcat((hist[i:end+i-model.windowsize]' for i in 1:model.windowsize)...)
    @assert size(hist) == (model.windowsize, length(sent)+1)
    return pred_v2(model, hist)
end

scores_v2 (generic function with 1 method)

In [56]:
sent = first(test_sentences)
s1, s2 = scores_v1(model, sent), scores_v2(model, sent)
@test size(s1) == size(s2) == (length(train_vocab.i2w), length(sent)+1)
@test s1 ≈ s2

[32m[1mTest Passed[22m[39m

In [57]:
function loss_v2(m::NNLM, sent::AbstractVector{Int}; average = true)
    losses = []
    push!(sent, m.vocab.eos)
    pred = scores_v2(m, sent)
    
    if average
        for i in (1:length(sent))
            predi = pred[:,i]
            lossi = nll(predi, [sent[i]])
            push!(losses, lossi)
        end
        return sum(losses)/length(losses)
        
    else
        for i in (1:length(sent))
            predi = pred[:,i]
            lossi = nll(predi, [sent[i]])
            push!(losses, lossi)
        end
        return (sum(losses), length(losses))
    end
end

loss_v2 (generic function with 1 method)

In [58]:
s = first(test_sentences)
@test loss_v1(model, s) ≈ loss_v2(model, s)

[32m[1mTest Passed[22m[39m

In [59]:
tst100 = collect(take(test_sentences, 100))
@test maploss(loss_v1, model, tst100) ≈ maploss(loss_v2, model, tst100)

[32m[1mTest Passed[22m[39m

## Part 6. Multiple sentences at a time (minibatching)
To get even more performance out of a GPU we will process multiple sentences at a time. This is called minibatching and is unfortunately complicated by the fact that the sentences in a batch may not be of the same length. 

Let's first write the minibatched versions of `pred` and `loss`, and see how to batch sentences together later.

## pred_v3

`pred_v3` takes a model `m`, a N×B×S dimensional history array `hist`, and returns a V×B×S dimensional score array, where N is `m.windowsize`, V is the vocabulary size, B is the batch size, and S is maximum sentence length in the batch + 1 for the final eos token. 

First, the embeddings for all entries in `hist` are looked up, which results in an array of E×N×B×S where E is the embedding size. 

The embedding array is reshaped to (E*N)×(B*S) and dropout is applied. It is then fed to the hidden layer which returns a H×(B*S) hidden output where H is the hidden size. 

Following element-wise tanh and dropout, the output layer turns this into a score array of V×(B*S) which is reshaped and returned as a V×B×S dimensional tensor.

In [60]:
# model, hist -> out
function pred_v3(m::NNLM, hist::Array{Int})
    embeds = m.embed(hist)
    embeds = reshape(embeds, size(embeds)[1]*size(embeds)[2],size(embeds)[3]*size(embeds)[4])
    embeds = dropout(embeds, m.dropout)
    h = m.hidden(embeds)
    h = tanh.(h)
    h = dropout(h, m.dropout)
    out = m.output(h)
    out = reshape(out, size(out)[1], size(hist)[2], size(hist)[3])
end

pred_v3 (generic function with 1 method)

In [61]:
function scores_v3(model, sent)
    hist = [ repeat([ model.vocab.eos ], model.windowsize); sent ]
    hist = vcat((hist[i:end+i-model.windowsize]' for i in 1:model.windowsize)...)
    @assert size(hist) == (model.windowsize, length(sent)+1)
    hist = reshape(hist, size(hist,1), 1, size(hist,2))
    return pred_v3(model, hist)
end

scores_v3 (generic function with 1 method)

In [62]:
sent = first(train_sentences)
scores_v3(model, sent);

In [63]:
@test scores_v2(model, sent) ≈ scores_v3(model, sent)[:,1,:]

[32m[1mTest Passed[22m[39m

In [64]:
function mask!(a,pad)
    for row in 1:size(a)[1]
        count = 0
        for column in size(a)[2]:-1:1
            if a[row,column] == pad; count+=1; else break; end
        end
        for column in size(a)[2]:-1:1
            if count > 1; a[row,column] = 0; count-=1; end
        end
    end
    a
end

mask! (generic function with 1 method)

In [65]:
a = [1 2 1 1 1; 2 2 2 1 1; 1 1 2 2 2; 1 1 2 2 1]
mask!(a,1)
@test mask!(a,1) == [1 2 1 0 0; 2 2 2 1 0; 1 1 2 2 2; 1 1 2 2 1]

[32m[1mTest Passed[22m[39m

In [66]:
function loss_v3(m::NNLM, batch::AbstractMatrix{Int}; average = true)

    num_batch = size(batch)[1] 
    
    losses = []
    
    for b in 1:num_batch
        sent = batch[b,:]
        pred = scores_v3(m, sent)[:,:,1:length(sent)]
        batch_loss = nll(pred, sent)        
        push!(losses, batch_loss)
    end
    
    if average
        return sum(losses) / num_batch
    else
        return (sum(losses), length(losses))
    end
    
end

loss_v3 (generic function with 1 method)

In [67]:
s = first(test_sentences)
b = [ s; model.vocab.eos ]'
loss_v3(model, b)

9.103133f0

In [68]:
@test loss_v2(model, s) ≈ loss_v3(model, b)

[32m[1mTest Passed[22m[39m

In [69]:
# ### Minibatching
#
# Below is a sample implementation of a sequence minibatcher. The `LMData` iterator wraps a
# TextReader and produces batches of sentences with similar length to minimize padding (too
# much padding wastes computation). To be able to scale to very large files, we do not want
# to read the whole file, sort by length etc. Instead `LMData` keeps around a small number
# of buckets and fills them with similar sized sentences from the TextReader. As soon as one
# of the buckets reaches the desired batch size it is turned into a matrix with the
# necessary padding and output. When the TextReader is exhausted the remaining buckets are
# returned (which may have smaller batch sizes). I will let you figure the rest out from the
# following, there is no code to write for this part.

struct LMData
    src::TextReader
    batchsize::Int
    maxlength::Int
    bucketwidth::Int
    buckets
end

function LMData(src::TextReader; batchsize = 64, maxlength = typemax(Int), bucketwidth = 10)
    numbuckets = min(128, maxlength ÷ bucketwidth)
    buckets = [ [] for i in 1:numbuckets ]
    LMData(src, batchsize, maxlength, bucketwidth, buckets)
end

Base.IteratorSize(::Type{LMData}) = Base.SizeUnknown()
Base.IteratorEltype(::Type{LMData}) = Base.HasEltype()
Base.eltype(::Type{LMData}) = Matrix{Int}

function Base.iterate(d::LMData, state=nothing)
    if state == nothing
        for b in d.buckets; empty!(b); end
    end
    bucket,ibucket = nothing,nothing
    while true
        iter = (state === nothing ? iterate(d.src) : iterate(d.src, state))
        if iter === nothing
            ibucket = findfirst(x -> !isempty(x), d.buckets)
            bucket = (ibucket === nothing ? nothing : d.buckets[ibucket])
            break
        else
            sent, state = iter
            if length(sent) > d.maxlength || length(sent) == 0; continue; end
            ibucket = min(1 + (length(sent)-1) ÷ d.bucketwidth, length(d.buckets))
            bucket = d.buckets[ibucket]
            push!(bucket, sent)
            if length(bucket) === d.batchsize; break; end
        end
    end
    if bucket === nothing; return nothing; end
    batchsize = length(bucket)
    maxlen = maximum(length.(bucket))
    batch = fill(d.src.vocab.eos, batchsize, maxlen + 1)
    for i in 1:batchsize
        batch[i, 1:length(bucket[i])] = bucket[i]
    end
    empty!(bucket)
    return batch, state
end

In [70]:
@info "Timing loss_v2 and loss_v3 at various batch sizes"
@info loss_v2; test_collect = collect(test_sentences)
GC.gc(); @time p2 = maploss(loss_v2, model, test_collect)
for B in (1, 8, 16, 32, 64, 128, 256)
    @info loss_v3,B; test_batches = collect(LMData(test_sentences, batchsize = B))
    GC.gc(); @time p3 = maploss(loss_v3, model, test_batches); @test p3 ≈ p2
end

┌ Info: Timing loss_v2 and loss_v3 at various batch sizes
└ @ Main In[70]:1
┌ Info: loss_v2
└ @ Main In[70]:2


 52.176930 seconds (1.95 M allocations: 19.044 GiB, 9.73% gc time)


┌ Info: (loss_v3, 1)
└ @ Main In[70]:5


 45.423749 seconds (746.51 k allocations: 18.992 GiB, 8.34% gc time)


┌ Info: (loss_v3, 8)
└ @ Main In[70]:5


 46.731847 seconds (414.72 k allocations: 22.153 GiB, 8.46% gc time)
[91m[1mTest Failed[22m[39m at [39m[1mIn[70]:6[22m
  Expression: p3 ≈ p2
   Evaluated: 8.996761f0 ≈ 9.0626745f0


Test.FallbackTestSetException: There was an error during testing

In [None]:
# For training, a batchsize around 64 seems best, although things are a bit more complicated
# here: larger batch sizes make fewer updates per epoch which may slow down convergence. We
# will use the smaller test data to get quick results.

@info "Timing SGD for loss_v2 and loss_v3 at various batch sizes"
train(loss, model, data) = sgd!(loss, ((model,sent) for sent in data))
@info loss_v2; test_collect = collect(test_sentences)
GC.gc(); @time train(loss_v2, model, test_collect)
for B in (1, 8, 16, 32, 64, 128, 256)
    @info loss_v3,B; test_batches = collect(LMData(test_sentences, batchsize = B))
    GC.gc(); @time train(loss_v3, model, test_batches)
end

In [101]:
# ## Part 7. Training
#
# You should be able to get the validation loss under 5.1 (perplexity under 165) in 100
# epochs with default parameters.  This takes about 5 minutes on a V100 GPU.
#
# Please review Knet function `progress!` and iterator function `ncycle` used below.

model = NNLM(train_vocab, HIST, EMBED, HIDDEN, DROPOUT)
train_batches = collect(LMData(train_sentences))
valid_batches = collect(LMData(valid_sentences))
test_batches = collect(LMData(test_sentences))
train_batches50 = train_batches[1:50] # Small sample for quick loss calculation

epoch = adam(loss_v3, ((model, batch) for batch in train_batches))
bestmodel, bestloss = deepcopy(model), maploss(loss_v3, model, valid_batches)


progress!(ncycle(epoch, 20), seconds=5) do x
    global bestmodel, bestloss
    ## Report gradient norm for the first batch
    f = @diff loss_v3(model, train_batches[1])
    gnorm = sqrt(sum(norm(grad(f,x))^2 for x in params(model)))
    ## Report training and validation loss
    trnloss = maploss(loss_v3, model, train_batches50)
    devloss = maploss(loss_v3, model, valid_batches)
    ## Save model that does best on validation data
    if devloss < bestloss
        bestmodel, bestloss = deepcopy(model), devloss
    end
    (trn=exp(trnloss), dev=exp(devloss), ∇=gnorm)
end


┣████████████████████┫ [100.00%, 13240/13240, 47:18/47:18, 4.66i/s] (trn = 58.026688f0, dev = 75.65292f0, ∇ = 0.3552466f0))█▏                 ┫ [10.98%, 1454/13240, 05:14/47:36, 4.53i/s] (trn = 119.91741f0, dev = 132.27208f0, ∇ = 0.28435707f0)████                ┫ [20.50%, 2714/13240, 09:44/47:27, 4.65i/s] (trn = 87.47838f0, dev = 102.39043f0, ∇ = 0.30593753f0)[21.90%, 2900/13240, 10:23/47:25, 4.80i/s] (trn = 91.07605f0, dev = 101.68594f0, ∇ = 0.31700698f0)████████████▉       ┫ [64.58%, 8551/13240, 30:32/47:16, 4.65i/s] (trn = 65.62978f0, dev = 79.62601f0, ∇ = 0.3490029f0)██████████████▋     ┫ [73.52%, 9734/13240, 34:47/47:19, 4.55i/s] (trn = 61.659767f0, dev = 78.60619f0, ∇ = 0.34601617f0)


In [240]:
generate(bestmodel, maxlength=10)

"the to the in for peters the to the and"