### Use Hmm from hmm.jl

In [1]:
include("hmm.jl")

fit! (generic function with 1 method)

In [2]:
function build_sequences_from_data(file_path; min_sequence_length_allowed=5)
    f = open(file_path, "r")
    lines = readlines(f)
    sequences = Array{Sequence}([])
    sequence_counter = 0
    min_seq_length = min_sequence_length_allowed
    max_seq_length = 0
    words = Array{String}([])
    tags = Array{String}([])

    for line in lines
        line_splitted = split(line, "\t")

        if line_splitted[1] == "\n"
            current_lenght = length(words)
            
            if (current_lenght < min_sequence_length_allowed) continue end
            if (current_lenght < min_seq_length) min_seq_length = current_lenght end
            if (current_lenght > max_seq_length) max_seq_length = current_lenght end
            
            push!(sequences, Sequence(words,tags))
            words = Array{String}([])
            tags = Array{String}([])
            sequence_counter +=1
        else
            push!(words, line_splitted[2])
            push!(tags, line_splitted[5])    
        end
    end
    print("\nNumber sequences: ", sequence_counter)
    print("\nMin sequence length: ", min_seq_length)
    print("\nMax sequence length: ", max_seq_length)
    return sequences
end

build_sequences_from_data (generic function with 1 method)

In [3]:
file_path_train = homedir() * "/Documents/Datasets/conll/train-02-21.conll"
file_path_valid = homedir() * "/Documents/Datasets/conll/dev-22.conll"
file_path_test = homedir() * "/Documents/Datasets/conll/test-23.conll"

train_seq = build_sequences_from_data(file_path_train, min_sequence_length_allowed=3);
valid_seq = build_sequences_from_data(file_path_valid, min_sequence_length_allowed=3);
test_seq = build_sequences_from_data(file_path_test, min_sequence_length_allowed=3);


Number sequences: 39642
Min sequence length: 3
Max sequence length: 141
Number sequences: 1684
Min sequence length: 3
Max sequence length: 118
Number sequences: 2408
Min sequence length: 3
Max sequence length: 67

In [4]:
hmm = Hmm()

Hmm(Set{String}(),Set{String}(),Dict{String,Int64}(),Dict{String,Int64}(),Dict{Int64,String}(),Dict{Int64,String}(),Int64[],,,Int64[],Float64[],,,Float64[],false)

In [7]:
@time fit!(hmm, train_seq);

  4.255216 seconds (10.05 M allocations: 362.571 MB, 6.98% gc time)


In [9]:
@time begin
    total_predicted_states = 0
    total_correct = 0
    for seq in train_seq
        total_correct += sum(seq.labels .== posterior_decode(hmm, seq))
        total_predicted_states += length(seq) 
    end
end

117.745981 seconds (838.67 M allocations: 149.904 GB, 9.83% gc time)


In [11]:
print("accuracy: ", total_correct/total_predicted_states)

accuracy: 0.970337716362044

In [32]:
seq_ = train_seq[1]

Sequence(String["In","an","Oct.","19","review","of","``","The","Misanthrope","''"  …  "Kim","Cattrall",",","was","mistakenly","attributed","to","Christina","Haag","."],String["IN","DT","NNP","CD","NN","IN","``","DT","NN","''"  …  "NNP","NNP",",","VBD","RB","VBN","TO","NNP","NNP","."])

In [37]:
size(train_seq)

(39642,)

In [38]:
using BenchmarkTools

In [39]:
@benchmark sum(seq_.labels .== posterior_decode(hmm, seq_))

BenchmarkTools.Trial: 
  memory estimate:  8.05 MiB
  allocs estimate:  44050
  --------------
  minimum time:     3.966 ms (0.00% GC)
  median time:      6.256 ms (0.00% GC)
  mean time:        7.102 ms (9.73% GC)
  maximum time:     30.399 ms (7.00% GC)
  --------------
  samples:          700
  evals/sample:     1

#### Paralelize evaluation

In [8]:
addprocs(4) 

4-element Array{Int64,1}:
 2
 3
 4
 5

In [20]:
@everywhere function evaluate(hmm,sequences)
    total_predicted_states = 0
    total_correct = 0
    for seq in sequences
        total_correct += sum(seq.labels .== posterior_decode(hmm, seq))
        total_predicted_states += length(seq)
        return total_correct, total_predicted_states
    end 
   return total_correct, total_predicted_states
end

In [27]:
#@everywhere include("hmm.jl")

In [26]:
#time begin
#    # Put the end-2 to make num workers divisible by trian size
#    n = length(train_seq[1:end-2])
#    n_processors = length(workers())
#    splits_ind = [Int(x) for x in 1:(n/n_processors):(n+1)]    
#    train_seq_splits = [train_seq[x:y-1] for (x,y) in zip(splits_ind[1:end-1], splits_ind[2:end])]
#    res = pmap(evaluate, (hmm,train_seq_splits))
#    result_paralel = count_reduce(res);
#end

#### Problem with the HMM:

If a word is not observed during training you cannot use it to predict its tag!

- "Countered" is not in tran but can be found in test_seq

In [None]:
begin
    total_predicted_states = 0
    total_correct = 0
    for seq in test_seq
        total_correct += sum(seq.labels .== posterior_decode(hmm, seq))
        total_predicted_states += length(seq) 
    end
end