# LSTM Named Entity Tagger

Named-entity recognition (NER) (also known as entity identification, entity chunking and entity extraction) is a subtask of information extraction that seeks to locate and classify named entities in text into pre-defined categories such as the names of persons, organizations, locations, expressions of times, quantities, monetary values, percentages, etc.

Most research on NER systems has been structured as taking an unannotated block of text, such as the following **example**:

**INPUT:** Jim bought 300 shares of Acme Corp. in 2006.

And producing an annotated block of text that highlights the names of entities:

**OUTPUT:** [Jim]Person bought 300 shares of [Acme Corp.]Organization in [2006]Time.

In this example, a person name consisting of one token, a two-token company name and a temporal expression have been detected and classified.(Wikipedia)

Your task in this lab is to implement named entity LSTM based tagger which uses an LSTM to extract features from the input sentence, which are then passed through a multi-layer perceptron to predict
the tag of the word. Finally, train that model on [WikiNER](https://github.com/neulab/dynet-benchmark/tree/master/data/tags) dataset.

In [1]:
# Set-Up related files and Hyper-parameters
using Pkg; for p in ["Knet","ArgParse"]; haskey(Pkg.installed(),p) || Pkg.add(p); end
using Knet
import Knet: train!
using Printf, Dates, Random
STDOUT = Base.stdout
using ArgParse
import Base: length, iterate
include(Pkg.dir("Knet","data","wikiner.jl"));

└ @ Pkg.API /opt/julia-1.0.3/share/julia/stdlib/v1.0/Pkg/src/API.jl:395


## Problem 1. Prepare samples for the network
Your first task is to prepare instances for the network. We're given with the tokens (words and tags) and we need to make them understandable by our neural network. For this purpose, we build vocabularies (for both words and tags) and construct vocabulary to index dictionaries by using those vocabularies (w2i and t2i, word2index, tag2index). Then, we convert words and tags to indices with the usage of our dictionaries. <br/>
```julia
julia> show_instance() # show instance in not implemented in Knet, it is a hypothetical procedure
Inputs sentence:
Sent-> That inscribed in the genealogical records of his family is Jiang Zhoutai .
NERs-> O    O         O  O   O            O       O  O   O      O  I-PER I-PER   O

Timesteps:
Time step 1 ---> Inputs: That
                 Outputs: O
Time step 2 ---> Inputs: inscribed
                 Outputs:O
Time step 3 ---> Inputs: in
                 Outputs: O
Time step 4 ---> Inputs: the
                 Outputs: O
Time step 5 ---> Inputs: genealogical
                 Outputs: O
Time step 6 ---> Inputs: records  . 
                 Outputs: O
Time step 7 ---> Inputs: of 
                 Outputs: O
Time step 8 ---> Inputs: his 
                 Outputs: O
Time step 9 ---> Inputs: family 
                 Outputs: O
Time step 10 --->Inputs: is 
                 Outputs: O
Time step 11 ---> Inputs: Jiang 
                  Outputs: I-PER
Time step 12 ---> Inputs: Zhoutai 
                  Outputs: I-PER
Time step 13 ---> Inputs: . 
                  Outputs: O
```

Our input and output arrays should be integers instead of texts.

In [2]:
data = WikiNERData();

In [3]:
# data of type WikiNERData has 3 fields:

data.w2i #dictionary with 28484 entries: word -> index
data.t2i #dictionary with 9     entries: tag -> index
data.trn #array with 142153 entries
@show data.trn[1]       #array of 32 tuples (word, tag)
@show data.trn[1][1]    #tuple
@show data.trn[1][1][1] #word
@show data.trn[1][1][2] #tag

data.trn[1] = Array{SubString{String},1}[["The", "I-MISC"], ["Oxford", "I-MISC"], ["Companion", "I-MISC"], ["to", "I-MISC"], ["Philosophy", "I-MISC"], ["says", "O"], [",", "O"], ["\"", "O"], ["there", "O"], ["is", "O"], ["no", "O"], ["single", "O"], ["defining", "O"], ["position", "O"], ["that", "O"], ["all", "O"], ["anarchists", "O"], ["hold", "O"], [",", "O"], ["and", "O"], ["those", "O"], ["considered", "O"], ["anarchists", "O"], ["at", "O"], ["best", "O"], ["share", "O"], ["a", "O"], ["certain", "O"], ["family", "O"], ["resemblance", "O"], [".", "O"], ["\"", "O"]]
(data.trn[1])[1] = SubString{String}["The", "I-MISC"]
((data.trn[1])[1])[1] = "The"
((data.trn[1])[1])[2] = "I-MISC"


"I-MISC"

In [4]:
# make_instances procedure is given to you
function make_instances(data, w2i, t2i)
    words = []; tags = []
    for k = 1:length(data)
        this_words, this_tags = make_instance(data[k], w2i, t2i)
        push!(words, this_words)
        push!(tags, this_tags)
    end
    order = sortperm(words, by=length, rev=true)
    return words, tags
end

#=
You need to implement make_instance function
instance is a list of tuples. Each tuple contains a word and the corresponding tag as string.
You need to convert them into indices using word to index (w2i) and tag to index (t2i)
=#
function make_instance(instance, w2i, t2i)
    input = []
    output = [] 
    
    # START ANSWER
    for i in instance
        key, tag = i
        if !haskey(data.w2i, key)
            push!(input, w2i[UNK])
            push!(output, t2i[tag])
        else
            push!(input, w2i[key])
            push!(output, t2i[tag])
        end
    end
    # END ANSWER
    return input, output
end

make_instance (generic function with 1 method)

In [5]:
#=
This struct contains processed data (e.g words and tags are indices)
and necessary variables to prepare minibatches.
WikiNERProcessed struct works as an iterator.
=#
mutable struct WikiNERProcessed
    words
    tags
    batchsize
    ninstances
    shuffled
end


function WikiNERProcessed(instances, w2i, t2i; batchsize=16, shuffled=false)
    words, tags = make_instances(instances, w2i, t2i)
    ninstances = length(words)
    return WikiNERProcessed(words, tags, batchsize, ninstances, shuffled)
end


function length(d::WikiNERProcessed)
    d, r = divrem(d.ninstances, d.batchsize)
    return r == 0 ? d : d+1
end

length (generic function with 129 methods)

In [6]:
#=
You will use the RNN callable object in your model. It supports variable length instances in its input.
However, you need to prepare your input such as the RNN object can work on it. See the batchSizes option of the RNN object.
=#
function iterate(d::WikiNERProcessed, state=ifelse(d.shuffled, randperm(d.ninstances), 1:d.ninstances))    
    words = []
    tags = []
    batchsizes = zeros(d.batchsize)
    
    if state == "stop"
        return nothing;
    end
        

    if(first(state)+d.batchsize >= d.ninstances)
        return ((words, tags, batchsizes), "stop")
    end

    
    words = []
    tags = []
    for i in 1:50
        push!(words, [])
        push!(tags, [])
    end
    
    batchsizes = zeros(50)
    
    
    sentences = d.words[first(state):first(state)+d.batchsize-1]
    labels = d.tags[first(state):first(state)+d.batchsize-1]
       
    for inarray in 1:50
        for sentence_num in 1:length(sentences)
            if length(sentences[sentence_num]) >= inarray
                theword = sentences[sentence_num][inarray]
                thetag = labels[sentence_num][inarray]
                push!(words[inarray], theword)
                push!(tags[inarray], thetag)
                batchsizes[inarray] += 1  
            end
        end
    end
    
    new_state = (first(state)+d.batchsize):(last(state))
    batchsizes = convert(KnetArray, batchsizes)
    
    return ((words, tags, batchsizes), new_state)
end

iterate (generic function with 251 methods)

# Problem 2. Implement Layers
You need to implement layers that we are going to use in our tagger network. We supplied the layer definitions, but you still need to implement their initialization schemes and forward propagation functionality. Here, you need to implement three different layers,

1. Embedding layer (projection of one hot column vectors, or just array indexing)
2. Linear layer (just projection)
3. Hidden layer (projection and then non-linear activation)

In [7]:
# DO NOT TOUCH CELL, take advantage of _usegpu and _atype in the following parts
_usegpu = gpu()>=0
_atype = ifelse(_usegpu, KnetArray{Float32}, Array{Float64})

mutable struct Embedding
    w # weight
end


mutable struct Linear
    w # weight
    b # bias
end


mutable struct Hidden
    w # weight
    b # bias
    fun # non-linear activation function like relu or tanh
end

In [10]:
# initializations
function Embedding(vocabsize::Int, embedsize::Int, atype=_atype, scale=0.01)
    w = Param(convert(atype, scale*randn(embedsize, vocabsize)));
    #w = Param(scale*randn(embedsize, vocabsize))
    return Embedding(w)
end


function Linear(xsize::Int, ysize::Int, atype=_atype, scale=0.01)
    # start your answer
    o = xsize
    i = ysize
    Linear(scale * param(o,i), param0(o))
    # end your answer
end


function Hidden(xsize::Int, ysize::Int, fun=relu, atype=_atype, scale=0.1)
    # start your answer
    i = xsize
    o = ysize
    Hidden(scale * param(o,i), param0(o), fun)
    # end your answer
end

Hidden

In [11]:
function (l::Embedding)(x)
    embedz = []
    for i in x
        temp = l.w[:, convert(Array{Int32,1}, i)]
        @show size(i)
        @show size(temp)
        push!(embedz, temp)
    end
    embedz
    return KnetArray(embedz)
end

(l::Linear)(x) = l.w * reshape(x, size(x)[1], size(x)[2]*size(x)[3]) .+ l.b

(d::Hidden)(x) = d.f.(d.w * reshape(x, size(x)[1], size(x)[2]*size(x)[3]) .+ d.b);


In [17]:
w1 = WikiNERProcessed(data.trn, data.w2i, data.t2i);
words1, tags1, batchsizes1 = first(w1)
E1 = Embedding(VOCABSIZE, EMBEDSIZE) 
embeddings1 = E1(words1);

ErrorException: KnetPtr: bad device id -1.

In [21]:
VOCABSIZE = length(data.w2i)
EMBEDSIZE = 20

20

# Problem 3. Implement Model
You need to implement initweights function which takes input, hidden and output dimensions and returns the whole model as `Array{Any}` julia data type.

In [13]:
# DO NOT TOUCH THIS CELL
mutable struct NERTagger
    embed_layer::Embedding 
    rnn::RNN
    hidden_layer::Hidden
    softmax_layer::Linear
end

In [18]:
# model initialization
# Check the array type (cpu vs gpu)
# Initialize your modules using given arguments
function NERTagger(rnn_hidden, words, tags, embed, mlp_hidden, usegpu=_usegpu, winit=0.01)
    # start your answer
    embed_layer = Embedding(VOCABSIZE, EMBEDSIZE)
    rnn = RNN(EMBEDSIZE, rnn_hidden)
    hidden_layer = Hidden(rnn_hidden, mlp_hidden)
    softmax_layer = Linear(mlp_hidden, tags)
    return NERTagger(embed_layer, rnn, hidden_layer, softmax_layer)
    # end your answer
end

NERTagger

In [19]:
# model forward propagation
# Call your modules as described in the introduction section
function (m::NERTagger)(x, batchsizes)
    #start your answer
    embed_out = m.embed_layer(x)
    rnn_out = m.rnn(embed_out, batchSizes = batchsizes)
    hidden_layer = m.hidden_layer(rnn_out)
    softmax_out = m.softmax_layer(hidden_layer)
    #end your answer
end

In [28]:
NUMRNN = 50
NUMWORDS = length(data.w2i)
NUMTAGS = length(data.t2i)
NUMMLP = 40
EMBEDSIZE = 20

model = NERTagger(NUMRNN, NUMWORDS, NUMTAGS, EMBEDSIZE, NUMMLP)

NERTagger(Embedding(P(Array{Float64,2}(20,28484))), LSTM(input=20,hidden=50), Hidden([-0.0128861 0.0147508 … -0.00173876 -0.00817138; -0.0136994 0.00480354 … 0.00407628 -0.00563847; … ; 0.000620629 -0.00741351 … -0.0119897 -0.0102568; 0.00640071 0.00320762 … -0.00961443 -0.00129682], P(Array{Float32,1}(40)), Knet.relu), Linear([0.00157205 -0.00123316 … -0.00149301 -0.000248446; 0.00191285 0.000966202 … -0.00079508 0.000473948; … ; 0.000830567 1.64982e-6 … -0.00131028 0.00150225; -0.00146644 0.000483051 … 0.000237693 0.00154957], P(Array{Float32,1}(40))))

## Problem 4. Implement Loss Function
Implement loss functions defined the cell below.

In [30]:
# Get your probabilities from your model 
# Calculate the loss function for average per token.
function (m::NERTagger)(x, batchsizes, ygold)
    # start your answer
    softmax_out = model(x, batchsizes)
    ypredict = predict(m, x, batchsizes)
    Knet.nll(ypredict, ygold)
    # end your answer
end

## Problem 5. Implement Accuracy function
Accuracy function counts the number of words(tokens) and also counts the number of correctly predicted tokens and returns ```number_of_correctly_pred_token / number_of_total_token```

In [32]:
# possible helpful procedures: argmax, vec
function accuracy(m::NERTagger, data, i2t)
    ncorrect = 0
    ntokens = 0
    for (x, ygold, batchsizes) in data
        ypredict = predict(m,x,batchsizes)
        for i in 1:length(ypredict)
            if ygold[i] == ypredict[i]
                ncorrect += 1
            end
            ntokens+=1
        end
    end
    return ncorrect/ntokens
end

accuracy (generic function with 1 method)

In [None]:
# DON'T TOUCH this cell
function main(args)
    o = parse_options(args)
    atype = o[:atype]
    display(o)
    o[:seed] > 0 && Knet.seed!(o[:seed])

    # load WikiNER data
    data = WikiNERData()
    
    # build model
    nwords, ntags = length(data.w2i), data.ntags
    model = NERTagger(o[:hidden], nwords, ntags, o[:embed], o[:mlp])
    initopt!(model)
    # opt = optimizers(w, Adam)

    # make instances
    trn = WikiNERProcessed(data.trn, data.w2i, data.t2i; batchsize=o[:batchsize])
    dev = WikiNERProcessed(data.dev, data.w2i, data.t2i; batchsize=o[:batchsize])

    # train bilstm tagger
    nwords = data.nwords
    ninstances = length(trn)
    println("nwords=$nwords, ntags=$ntags, ninstances=$ninstances"); flush(STDOUT)
    println("startup time: ", Int((now()-t00).value)*0.001); flush(STDOUT)
    t0 = now()
    all_time = dev_time = all_tagged = this_tagged = this_loss = 0
    iter = 0
    while true
        # training
        for (k, (x, ygold, batchsizes)) in enumerate(trn)
            num_tokens = length(x)
            # instance_loss = adam!(model, (x, ygold, batchsizes))
            instance_loss = mytrain!(model, x, batchsizes, ygold)
            this_loss += num_tokens*instance_loss
            this_tagged += num_tokens
            iter += 1
            if iter % o[:report] == 0
                println(this_loss/this_tagged); flush(STDOUT)
            end
            if iter % o[:valid] == 0
                # validation
                dev_start = now()
                tag_acc = accuracy(model, dev, data.i2t)
                dev_time += Int((now()-dev_start).value)*0.001
                train_time = Int((now()-t0).value)*0.001-dev_time

                # report
                @printf("%d iters finished, loss=%f\n", iter, this_loss/this_tagged)
                all_tagged += this_tagged
                this_loss = this_tagged = 0
                all_time = Int((now()-t0).value)*0.001
                @printf("tag_acc=%.4f, time=%.4f, word_per_sec=%.4f\n",
                    tag_acc, train_time, all_tagged/train_time)
                flush(STDOUT)
            end
            iter >= o[:iters] && return
        end

    end
end

function parse_options(args)
    s = ArgParseSettings()
    s.description = "LSTM Tagger in Knet"

    @add_arg_table s begin
        ("--usegpu"; action=:store_true; help="use GPU or not")
        ("--embed"; arg_type=Int; default=128; help="word embedding size")
        ("--hidden"; arg_type=Int; default=50; help="LSTM hidden size")
        ("--mlp"; arg_type=Int; default=32; help="MLP size")
        ("--epochs"; arg_type=Int; default=3; help="number of training epochs")
        ("--iters"; arg_type=Int; default=20000; help="number of training iterations")
        ("--report"; arg_type=Int; default=500; help="report period in iters")
        ("--valid"; arg_type=Int; default=5000; help="valid period in iters")
        ("--seed"; arg_type=Int; default=-1; help="random seed")
        ("--batchsize"; arg_type=Int; default=16; help="batchsize")
    end

    isa(args, AbstractString) && (args=split(args))
    o = parse_args(args, s; as_symbols=true)
    o[:atype] = (gpu() >= 0 && o[:usegpu]) ? KnetArray{Float32} : Array{Float64}
    println(o); flush(STDOUT)
    return o
end


function mytrain!(model::NERTagger, x, batchsizes, ygold)
    values = []
    J = @diff model(x, batchsizes, ygold)
    for par in params(model)
        g = grad(J, par)
        update!(value(par), g, par.opt)
    end
    return value(J)
end

function initopt!(model::NERTagger, optimizer="Adam()")
    for par in params(model)
        par.opt = eval(Meta.parse(optimizer))
    end
end

In [None]:
t00 = now();main("--seed 1 --iters 10000 --usegpu")