# Important types for sequences

During the sequence tutorials we will mostly work using sequences of words. 

It might be useful then to define a type that will be used in the different assignemnts that you will have to face.


Nice material:
- http://www.nowozin.net/sebastian/blog/streaming-log-sum-exp-computation.html


In [14]:
using BenchmarkTools

In [19]:
type Sequence
    words::Array{String}
    labels::Array{String}
    
    function Sequence(words)
        states=["*" for x in words]
        return new(words, states)
    end
    
    function Sequence(words, states)
        return new(words, states)
    end
    
end



In [20]:
Sequence([])

Sequence(String[],String[])

In [21]:
function Base.length(sequence::Sequence)
    return length(sequence.words)
end



In [22]:
seq1 = Sequence(["the", "house", "is", "big"])

Sequence(String["the","house","is","big"],String["*","*","*","*"])

In [23]:
length(seq1)

4

In [24]:
seq1.labels

4-element Array{String,1}:
 "*"
 "*"
 "*"
 "*"

In [25]:
length(seq1.words)

4

In [26]:
seq1

Sequence(String["the","house","is","big"],String["*","*","*","*"])

### Define a sequence of words with a sequence of labels

In [27]:
seq2 = Sequence(["the", "house", "is", "big"],
                ["det","noun","verb","adj"])

Sequence(String["the","house","is","big"],String["det","noun","verb","adj"])

In [28]:
seq2.labels

4-element Array{String,1}:
 "det" 
 "noun"
 "verb"
 "adj" 

In [29]:
seq = Sequence(["the", "house", "is", "big"], 
               ["det","noun","verb","adj"])

Sequence(String["the","house","is","big"],String["det","noun","verb","adj"])

In [30]:
seq.labels

4-element Array{String,1}:
 "det" 
 "noun"
 "verb"
 "adj" 

### Be carefull with phrases!

- **The type that we just defined does not accept single strings**

In [31]:
Sequence("the house is big")

LoadError: MethodError: Cannot `convert` an object of type String to an object of type Array{String,N}
This may have arisen from a call to the constructor Array{String,N}(...),
since type constructors fall back to convert methods.

**Nevertheless the ```sequence``` type can accept an Array with a single string containing a sequence. **

**This is a behaviour we might not want it**

In [32]:
seq2 = Sequence(["the house is big"])

Sequence(String["the house is big"],String["*"])

In [33]:
seq2.words

1-element Array{String,1}:
 "the house is big"

In [34]:
split("the house is big")

4-element Array{SubString{String},1}:
 "the"  
 "house"
 "is"   
 "big"  

In [35]:
split("the house is big"," ")

4-element Array{SubString{String},1}:
 "the"  
 "house"
 "is"   
 "big"  

# Our toy data: Rainy/Sunny example

In [36]:
Sigma = ["walk", "shop", "clean", "tennis"]
Lambda = ["rainy", "sunny"]

sequence_list = []

s1 = Sequence(["walk", "walk", "shop", "clean"],
             ["rainy", "sunny", "sunny", "sunny"])

s2 = Sequence(["walk", "walk", "shop", "clean"], 
              ["rainy", "rainy", "rainy", "sunny"])

s3 = Sequence(["walk", "shop", "shop", "clean"], 
              ["sunny", "sunny", "sunny", "sunny"])

train_sequences = [s1, s2, s3]

s1_t = Sequence(["walk", "walk", "shop", "clean"], 
                ["rainy", "sunny", "sunny", "sunny"])

s2_t = Sequence(["clean", "walk", "tennis", "walk"], 
                ["sunny", "sunny", "sunny", "sunny"])

test_sequences = [s1_t, s2_t];

In [37]:
train_sequences[2]

Sequence(String["walk","walk","shop","clean"],String["rainy","rainy","rainy","sunny"])

# Hidden markov model

In [38]:
train_sequences

3-element Array{Sequence,1}:
 Sequence(String["walk","walk","shop","clean"],String["rainy","sunny","sunny","sunny"])
 Sequence(String["walk","walk","shop","clean"],String["rainy","rainy","rainy","sunny"])
 Sequence(String["walk","shop","shop","clean"],String["sunny","sunny","sunny","sunny"])

In [39]:
test_sequences

2-element Array{Sequence,1}:
 Sequence(String["walk","walk","shop","clean"],String["rainy","sunny","sunny","sunny"])  
 Sequence(String["clean","walk","tennis","walk"],String["sunny","sunny","sunny","sunny"])

#### Get all possible states and all possible words

In [40]:
word_to_int = Dict{String,Int64}()
state_to_int = Dict{String,Int64}()

Dict{String,Int64} with 0 entries

In [41]:
function get_possible_words_and_states(sequences)
    state_counter = 1
    word_counter = 1
    
    possible_words = Set{String}()
    possible_states = Set{String}()
    
    for seq in sequences
        for (t,w) in zip(seq.labels, seq.words)
            push!(possible_states, t)
            push!(possible_words, w)
        end
    end
    
    return possible_words, possible_states
end

get_possible_words_and_states (generic function with 1 method)

In [42]:
possible_words, possible_states = get_possible_words_and_states([train_sequences; test_sequences])

(Set(String["tennis","walk","clean","shop"]),Set(String["sunny","rainy"]))

#### map words to positions and states to positions

In [43]:
num_words = length(possible_words)
num_states = length(possible_states)

2

In [44]:
function assign_elements_to_integers(elements)
    element_to_pos = Dict{String, Int64}()
    for (k,e) in enumerate(elements)
        element_to_pos[e] = k
    end
    return element_to_pos
end

function assign_elements_to_integers2(elements)
    element_to_pos = Dict{String, Int64}()
    k = 1
    for e in elements
        element_to_pos[e] = k
        k +=1
    end
    return element_to_pos
end

assign_elements_to_integers2 (generic function with 1 method)

In [45]:
#word_to_pos = assign_elements_to_integers(possible_words);
#state_to_pos = assign_elements_to_integers(possible_states);

# Hardcode order
word_to_pos = Dict("walk"=>1, "clean" =>2, "shop"=>3, "tennis"=>4)
state_to_pos = Dict("rainy"=>1, "sunny"=> 2)

### Computing  sufficient statistics (counts) of the HMM

In [46]:
function update_initial_counts!(initial_counts, seq, state_to_pos)
    initial_counts[state_to_pos[seq.labels[1]]] = initial_counts[state_to_pos[seq.labels[1]]] + 1
end

function update_transition_counts!(transition_counts, seq, state_to_pos)
    for (t1,t2) in zip(seq.labels[1:end-1], seq.labels[2:end])
        transition_counts[state_to_pos[t1], state_to_pos[t2]] += 1 
    end    
end

function update_emission_counts!(emission_counts, seq, state_to_pos, word_to_pos)
    for (t,w) in zip(seq.labels, seq.words)
        emission_counts[state_to_pos[t], word_to_pos[w]] += 1 
    end 
end

function update_final_counts!(final_counts, seq, state_to_pos)
    final_counts[state_to_pos[seq.labels[end]]] +=1
end

update_final_counts! (generic function with 1 method)

In [113]:
function sufficient_statistics_hmm(sequences, state_to_pos, word_to_pos)
    
    n_states = length(state_to_pos)
    n_words = length(word_to_pos)
    
    initial_counts      = zeros(n_states)
    transition_counts   = zeros(n_states, n_states)
    final_counts        = zeros(n_states)
    emission_counts     = zeros(n_states, n_words)
    
    for seq in sequences
        update_initial_counts!(initial_counts, seq, state_to_pos)
        update_transition_counts!(transition_counts, seq,  state_to_pos)
        update_emission_counts!(emission_counts, seq,  state_to_pos, word_to_pos)
        update_final_counts!(final_counts, seq,  state_to_pos)
    end
    
    return initial_counts, transition_counts, final_counts, emission_counts
end



sufficient_statistics_hmm (generic function with 1 method)

In [114]:
counts = sufficient_statistics_hmm(train_sequences, state_to_pos, word_to_pos);

In [115]:
initial_counts, transition_counts, final_counts, emission_counts = counts;

In [116]:
initial_counts

2-element Array{Float64,1}:
 2.0
 1.0

In [117]:
transition_counts

2×2 Array{Float64,2}:
 2.0  2.0
 0.0  5.0

In [118]:
final_counts

2-element Array{Float64,1}:
 0.0
 3.0

In [119]:
emission_counts

2×4 Array{Float64,2}:
 3.0  0.0  1.0  0.0
 2.0  3.0  3.0  0.0

#### Sanity Checks HMM

- Initial counts must sum to the number of sentences  $$ \sum_{k=1}^K C_{\text{init}}(c_k) = M$$

- Transition counts and Final Counts should sum to the number of tokens: $$\sum_{k,l=1}^K C_{\text{trans}}(c_k,c_l)  + \sum_{k=1}^K C_{\text{final}}(c_k) = M \cdot N$$

- Emission counts must sum to the number of tokens
$$
\sum_{j=1}^J \sum_{k=1}^K C_{\text{emiss}}(w_j,c_k) = M \cdot N 
$$

In [120]:
emission_counts

2×4 Array{Float64,2}:
 3.0  0.0  1.0  0.0
 2.0  3.0  3.0  0.0

In [121]:
M = length(train_sequences)
N = length(train_sequences[1].words)
print("M: ", M, "\n","N: ", N,"\n" ,"M*N: ", M*N)

M: 3
N: 4
M*N: 12

In [122]:
print("\ninitial_counts sum: ", sum(initial_counts))
print("\nemission_counts sum: ", sum(emission_counts))
print("\ntransition and final counts sum: ", sum(transition_counts) + sum(final_counts))


initial_counts sum: 3.0
emission_counts sum: 12.0
transition and final counts sum: 12.0

In [123]:
function check_counts(data, 
                      possible_states,
                      initial_counts,
                      transition_counts, 
                      emission_counts, 
                      final_counts)
    """
    This Check is only valid if all instances have the same length!!!!
    """
    n_samples = length(data)
    sequence_length = length(data[1].words)
    problem_checks = []
    
    if sum(initial_counts) != n_samples
        print("\nERROR: initial_counts are not correctly computed")
        push!(problem_checks,"initial_counts")
    end
    
    if sum(transition_counts) + sum(final_counts) != sequence_length*n_samples
        print("\nERROR: transition_counts are not correctly computed")
        push!(problem_checks,"transition_counts")
    end
    
    if sum(emission_counts)  != sequence_length*n_samples
        print("\nERROR: emission_counts are not correctly computed")
        push!(problem_checks,"emission_counts")
    end
    
    if length(problem_checks) == 0
        print("\nAll checks passed")
    end
end



check_counts (generic function with 1 method)

In [124]:
check_counts(train_sequences, 
             possible_states,
             initial_counts,
             transition_counts, 
             emission_counts, 
             final_counts)


All checks passed

## From counts to probabilities

The following formulas specify how to find the parameters of the HMM:

$$
P_{\text{init}}(c_k \,\vert\, \text{start}) = \frac{C_{\text{init}}(c_k)}{ \sum_{k=1}^K
C_{\text{init}} (c_l)}
$$

$$
P_{\text{final}}(\text{stop} \,\vert\, c_l) = \frac{C_{\text{final}}(c_l) }
{\sum_{k=1}^K C_{\text{trans}}(c_k,c_l) + C_{\text{final}}(c_l)}
$$

$$
P_{\text{trans}}( c_k \,\vert\, c_l) = \frac{C_{\text{trans}}(c_k, c_l) }
{\sum_{p=1}^K C_{\text{trans}}(c_p,c_l) + C_{\text{final}}(c_l)}
$$

$$
P_{\text{emiss}} (w_j \,\vert\, c_k) = \frac{C_{\text{emiss}} (w_j, c_k) }{\sum_{q=1}^J C_{\text{emiss}}(w_q,c_k)}
$$



In [125]:
transition_counts

2×2 Array{Float64,2}:
 2.0  2.0
 0.0  5.0

In [126]:
initial_probs = (initial_counts / sum(initial_counts))
transition_probs = transition_counts./(sum(transition_counts, 2) + final_counts)
final_probs =  final_counts ./ (sum(transition_counts, 2) + final_counts )
emission_probs = (emission_counts ./ sum(emission_counts, 2))

2×4 Array{Float64,2}:
 0.75  0.0    0.25   0.0
 0.25  0.375  0.375  0.0

In [127]:
typeof(initial_counts), typeof(transition_counts)

(Array{Float64,1},Array{Float64,2})

In [128]:
initial_probs

2-element Array{Float64,1}:
 0.666667
 0.333333

#### visualize probabilities with the tag associated to the state

In [129]:
state_to_pos

Dict{String,Int64} with 2 entries:
  "sunny" => 2
  "rainy" => 1

In [130]:
word_to_pos

Dict{String,Int64} with 4 entries:
  "tennis" => 4
  "walk"   => 1
  "clean"  => 2
  "shop"   => 3

In [131]:
transition_probs

2×2 Array{Float64,2}:
 0.5  0.5  
 0.0  0.625

In [132]:
emission_probs

2×4 Array{Float64,2}:
 0.75  0.0    0.25   0.0
 0.25  0.375  0.375  0.0

## Defining an HMM


- Make a print function that prints beautifally the probabilities of the HMM, somehting like

   hmm.transition_probs
   
                Sunny  Rainny
       Sunny    0.625  0.0
       Rainny   0.5    0.5

In [133]:
type Hmm
    possible_words::Set{String}
    possible_states::Set{String}
    
    word_to_pos::Dict{String, Int64}
    state_to_pos::Dict{String, Int64}   
    pos_to_word::Dict{Int64, String}
    pos_to_state::Dict{Int64, String}

    initial_counts::Vector{Int64}
    transition_counts::Matrix{Int64} 
    emission_counts::Matrix{Int64}
    final_counts::Vector{Int64}

    initial_probs::Vector{Float64}
    transition_probs::Matrix{Float64}
    emission_probs::Matrix{Float64}
    final_probs::Vector{Float64}
    
    trained::Bool
    
    Hmm() = new(Set{String}(), 
                Set{String}(),
                Dict{String, Int64}(),
                Dict{String, Int64}(),
                Dict{Int64, String}(),
                Dict{Int64, String}(),
                Vector{Int64}(),
                Matrix{Int64}(),
                Matrix{Int64}(),
                Vector{Int64}(),
                Vector{Int64}(),
                Matrix{Int64}(),
                Matrix{Int64}(),
                Vector{Int64}(),
                false)
    
   
end



In [134]:
aux = Hmm()

Hmm(Set{String}(),Set{String}(),Dict{String,Int64}(),Dict{String,Int64}(),Dict{Int64,String}(),Dict{Int64,String}(),Int64[],,,Int64[],Float64[],,,Float64[],false)

In [135]:
hmm = Hmm()

hmm.possible_words      = possible_words
hmm.possible_states     = possible_states
hmm.word_to_pos         = word_to_pos
hmm.state_to_pos        = state_to_pos
hmm.pos_to_word         = map(reverse, word_to_pos)
hmm.pos_to_state        = map(reverse, state_to_pos)
hmm.initial_counts      = initial_counts
hmm.transition_counts   = transition_counts
hmm.final_counts        = final_counts
hmm.initial_probs       = initial_probs
hmm.transition_probs    = transition_probs
hmm.emission_probs      = emission_probs
hmm.final_probs         = final_probs[:]
hmm.trained             = true


true

#### Define a custom method for showing the HMM: TODO

Looking at the past printed info is not very nice

In [136]:
#import Base.show

In [137]:
#Base.show(io::IO, hmm::Hmm) = print(io, "\n possible_tags=$hmm.possible_tags\n possible_words=$(hmm.possible_words)")

In [138]:
hmm.possible_states

Set(String["sunny","rainy"])

In [139]:
println(hmm)

Hmm(Set(String["tennis","walk","clean","shop"]),Set(String["sunny","rainy"]),Dict("tennis"=>4,"walk"=>1,"clean"=>2,"shop"=>3),Dict("sunny"=>2,"rainy"=>1),Dict(4=>"tennis",2=>"clean",3=>"shop",1=>"walk"),Dict(2=>"sunny",1=>"rainy"),[2,1],[2 2; 0 5],,[0,3],[0.666667,0.333333],[0.5 0.5; 0.0 0.625],[0.75 0.0 0.25 0.0; 0.25 0.375 0.375 0.0],[0.0,0.375],true)


## Computations in log domain: why?

We will compute logprobabilities since multiplying several probabilities will lead to numerical underflow but summing logprobabilities will not.

Notice that sometimes computations in log domain can be tricky. Let us consider the following example

In [140]:
#srand(12345)
#a = rand(10)
a = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.10]

print(log(sum(exp.(a))),"\n")
print(log(sum(exp.(10*a))),"\n")
print(log(sum(exp.(100*a))),"\n")
print(log(sum(exp.(1000*a))),"\n")

2.7999098843442183
9.45876378447825
90.00004540096037
Inf


In [141]:
sumexp(a)

LoadError: UndefVarError: sumexp not defined

Obviously ```log(sum(exp(1000*a)))``` should not be ```Inf``` in order to avoid this numerical inestability we will code our oun ```logsum```function.

Nice video explayining the ```logsumexp``` trick

- https://www.youtube.com/watch?v=-RVM21Voo7Q

In [142]:
function logsum_pair(logx, logy):
    """
    Return log(x+y), avoiding arithmetic underflow/overflow.
    logx: log(x)
    logy: log(y)

    Rationale:
        x + y    = e^logx + e^logy = e^logx (1 + e^(logy-logx))
    therefore:
        log(x+y) = logx + log(1 + e^(logy-logx)) (1)

    Likewise,
    log(x+y) = logy + log(1 + e^(logx-logy)) (2)

    The computation of the exponential overflows earlier and is less precise
    for big values than for small values. Due to the presence of logy-logx
    (resp. logx-logy), (1) is preferred when logx > logy and (2) is preferred
    otherwise.
    """
    if logx == -Inf
        return logy
    elseif logx > logy
        return logx + log1p(exp(logy-logx))
    else
        return logy + log1p(exp(logx-logy))
    end
end

function logsum(logv::Array):
    """
    Return log(v[0] + v[1] + ...), avoiding arithmetic underflow/overflow.
    """
    res = -Inf
    for val in logv
        res = logsum_pair(res, val)
    end
    return res
end

#function logsum(logv::Float64):
#    """
#    Return log(v[0] + v[1] + ...), avoiding arithmetic underflow/overflow.
#    """
#    res = -Inf
#    res = logsum_pair(res, logv)
#    return res
#end



logsum (generic function with 1 method)

Using the functions from above we don´t have the ```Inf``` problem anymore

In [143]:
print(logsum(a),"\n")
print(logsum(10*a),"\n")
print(logsum(100*a),"\n")
print(logsum(1000*a),"\n")

2.7999098843442183
9.45876378447825
90.00004540096037
900.0


In [144]:
@time logsum(a)

  0.000003 seconds (5 allocations: 176 bytes)


2.7999098843442183

In [145]:
aux = [x for x in 1:50]/10;
@benchmark logsum(aux)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     3.273 μs (0.00% GC)
  median time:      3.276 μs (0.00% GC)
  mean time:        3.409 μs (0.00% GC)
  maximum time:     16.405 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     8

### Efficient logsum_exp

In [146]:
function logsumexp2(X)
    alpha = -Inf
    r = 0.0
    @inbounds for x in X
        if x <= alpha
            r += exp(x - alpha)
        else
            r *= exp(alpha - x)
            r += 1.0
            alpha = x
        end
    end
    return log(r) + alpha
end



logsumexp2 (generic function with 1 method)

In [147]:
print(logsumexp2(a),"\n")
print(logsumexp2(10*a),"\n")
print(logsumexp2(100*a),"\n")
print(logsumexp2(1000*a),"\n")

2.7999098843442183
9.45876378447825
90.00004540096037
900.0


In [148]:
@time logsumexp2(a)

  0.000003 seconds (5 allocations: 176 bytes)


2.7999098843442183

In [149]:
function logsumexp_batch(X)
    alpha = maximum(X)  # Find maximum value in X
    log(sum(exp(X-alpha))) + alpha
end



logsumexp_batch (generic function with 1 method)

In [150]:
aux = [x for x in 1:50]/10.;
@benchmark logsumexp2(aux)

BenchmarkTools.Trial: 
  memory estimate:  16 bytes
  allocs estimate:  1
  --------------
  minimum time:     525.215 ns (0.00% GC)
  median time:      529.325 ns (0.00% GC)
  mean time:        578.829 ns (0.25% GC)
  maximum time:     15.338 μs (95.39% GC)
  --------------
  samples:          10000
  evals/sample:     191

In [151]:
@benchmark logsumexp_batch(aux)

BenchmarkTools.Trial: 
  memory estimate:  1.16 KiB
  allocs estimate:  8
  --------------
  minimum time:     1.108 μs (0.00% GC)
  median time:      1.275 μs (0.00% GC)
  mean time:        1.494 μs (9.67% GC)
  maximum time:     382.514 μs (95.50% GC)
  --------------
  samples:          10000
  evals/sample:     10

In [152]:
n = 10_000_000
X = 500.0*randn(n)

@time logsumexp_batch(X)
@time logsumexp2(X)

  0.331327 seconds (895 allocations: 152.631 MB, 64.28% gc time)
  0.073903 seconds (5 allocations: 176 bytes)


2613.184781452367

In [87]:
n = 10
X = 500.0*randn(n)

@time logsumexp_batch(X)
@time logsumexp2(X)

  0.000004 seconds (9 allocations: 528 bytes)
  0.000003 seconds (5 allocations: 176 bytes)


1176.1838983575794

## Computing scores for a given sequence


For convenience, we will be working with 

log-probabilities, rather than probabilities. Therefore, if we associate to each circle and arrow in the trellis a score that corresponds
to the log-probabilities above, and if we define the score of a path
connecting the ${\tt start}$ and  ${\tt stop}$ symbols as
the sum of the scores of the circles and arrows it traverses, 
then the goal of **finding the most likely sequence of states (Viterbi decoding) corresponds to finding the path with the highest score**.



The trellis scores are given by the following expressions:

- For each state $c_k$:

\begin{eqnarray}
\mathrm{score}_{\mathrm{init}}(c_k) &=&
\log P_{\mathrm{init}}(Y_{1} = c_k | \text{start}).
\end{eqnarray}


- For each position $i \in {1,\ldots,N-1}$ and each pair of states $c_k$ and $c_l$:

\begin{eqnarray}
\mathrm{score}_{\mathrm{trans}}(i, c_k, c_l) &=&
\log P_{\mathrm{trans}}(Y_{i+1} = c_k | Y_i = c_l).
\end{eqnarray}


- For each state $c_l$:

\begin{eqnarray}
\mathrm{score}_{\mathrm{final}}(c_l) &=&
\log P_{\mathrm{final}}(\text{stop} | Y_N = c_l).
\end{eqnarray}


- For each position $i \in {1,\ldots,N}$ and state $c_k$:

\begin{eqnarray}
\mathrm{score}_{\mathrm{emiss}}(i, c_k) &=&
\log P_{\mathrm{emiss}}(X_i = x_i | Y_i = c_k).
\end{eqnarray}


In [88]:
function compute_scores(hmm, sequence)
    length_sequence = length(sequence.words)
    n_states = length(hmm.possible_states)
    
    initial_scores = log.(hmm.initial_probs)
    transition_scores = log.(hmm.transition_probs)

    sequence_words_integers = [hmm.word_to_pos[x] for x in sequence.words]
    emission_scores = log.(hmm.emission_probs[:, sequence_words_integers])
    final_scores = log.(hmm.final_probs)
    
    return initial_scores, transition_scores, final_scores, emission_scores
end

compute_scores (generic function with 1 method)

In [89]:
sequence = train_sequences[1]
sequence_words_integers = [hmm.word_to_pos[x] for x in sequence.words]

4-element Array{Int64,1}:
 1
 1
 3
 2

In [90]:
hmm.emission_probs

2×4 Array{Float64,2}:
 0.75  0.0    0.25   0.0
 0.25  0.375  0.375  0.0

In [91]:
hmm.emission_probs[:, sequence_words_integers]

2×4 Array{Float64,2}:
 0.75  0.75  0.25   0.0  
 0.25  0.25  0.375  0.375

In [92]:
#log_likelihood, forward = hmm.decoder.run_forward(initial_scores, transition_scores,final_scores, emission_scores)
scores = compute_scores(hmm, train_sequences[1])

([-0.405465,-1.09861],
[-0.693147 -0.693147; -Inf -0.470004],

[-Inf,-0.980829],
[-0.287682 -0.287682 -1.38629 -Inf; -1.38629 -1.38629 -0.980829 -0.980829])

In [93]:
initial_scores, transition_scores, final_scores, emission_scores = scores;

In [94]:
initial_scores[[hmm.state_to_pos["rainy"],hmm.state_to_pos["sunny"]]]

2-element Array{Float64,1}:
 -0.405465
 -1.09861 

In [95]:
transition_scores

2×2 Array{Float64,2}:
   -0.693147  -0.693147
 -Inf         -0.470004

In [96]:
final_scores

2-element Array{Float64,1}:
 -Inf       
   -0.980829

In [97]:
emission_scores

2×4 Array{Float64,2}:
 -0.287682  -0.287682  -1.38629   -Inf       
 -1.38629   -1.38629   -0.980829    -0.980829

In [98]:
n_states = length(initial_scores)
length_sequence = size(emission_scores)[2]
print("n_states: ", n_states, "\n")
print("length sequence: ", length_sequence)

n_states: 2
length sequence: 4

## Posterior decoding


Posterior decoding consists
in picking state with the highest posterior for each position in the sequence independently; for 
each $i = 1,\ldots,N$:

\begin{equation}
y_i^* = \text{argmax}_{y_i \in \Lambda} P(Y_i=y_i | X = x).
\end{equation}

The **sequence posterior distribution** is the probability of a particular
hidden state sequence given that we have observed a particular
sequence. Moreover, we will be interested in two other posteriors distributions:
the **state posterior distribution**, corresponding to the
probability of being in a given state in a certain position given the
observed sequence; and the \textbf{transition posterior distribution},
which is the probability of making a particular transition, from position $i$ to
$i+1$, given the observed sequence. 

They are formally defined as follows:

- Sequence  Posterior
$$P(Y=y|X=x) = \frac{P(X=x,Y=y)}{P(X=x)}
$$

- State Posterior
$$
P(Y_i=y_i | X=x)
$$

- Transition Posterior
$$
P(Y_{i+1}=y_{i+1},Y_i=y_i| X=x)
$$


### Computing posteriors involves beeing able to compute $P(X=x)$
To compute the posteriors, a first step is to be able to compute the 
likelihood of
the sequence $P(X=x)$, which corresponds to summing the probability of all
possible hidden state sequences.

\begin{equation}
\mathbf{Likelihood\!:}\;\;\;\; P(X=x) = \displaystyle \sum_{y \in \Lambda^N} P(X=x,Y=y).
\end{equation}

The number of possible hidden state sequences is exponential in the
length of the sequence ($|\Lambda|^N$),
 which makes the sum over all of them hard. 
 In our simple
 example, there are $2^4 = 16$ paths, which we can actually explicitly enumerate
 and calculate their probability using Equation of the joint probability $P(x,y)$. But this is as far as it goes: for example, for Part-of-Speech
 tagging with a small tagset of 12 tags and a medium size
 sentence of length 10, there are $12^{10} = 61 917 364 224$ such
 paths. 
 
 Yet, we must be able to compute this sum (sum over $y \in \Lambda^N$) to compute the above likelihood
formula; this is called the inference problem. For sequence models, there is a well known dynamic programming algorithm,
the **Forward-Backward** (FB) algorithm, which allows the computation
to be performed in linear time, The runtime is linear with respect
to the sequence length. More precisely, 
the runtime is $O(N|\Lambda|^2)$. 
A naive enumeration would cost $O(|\Lambda|^N)$.

The FB algorithm relies on the independence of previous states
assumption, which  
is illustrated in the trellis view by having arrows only between consecutive states. 
The FB algorithm defines two auxiliary probabilities, the forward probability and the backward probability. 


## Efficient forward probability computation

The forward probability represents the probability that in position
$i$ we are in state $Y_i = c_k$ and that we have observed $x_1,\ldots,x_i$
up to that position. Therefore, its mathematical expression is:
\begin{equation}
\mathbf{Forward \ Probability\!:}\;\;\;\;  \mathrm{forward}(i, c_k) = P(Y_i = c_k, X_1=x_1,\ldots, X_i = x_i)
\end{equation}


Using the independence assumptions of the HMM we can compute $\mathrm{forward}(i, c_k)$ using all the forward computations \{$\mathrm{forward}(i -1, c)$ for $c \in \Lambda$\}. In order to facilitate the notation of the following argument we will denote by $x_{i:j}$  the assignemnt $X_i = x_i, \dots, X_j = x_j$. Therefore we can write   $\mathrm{forward}(i, y_i) $ as $P( y_i, x_{1:i } ) $ and rewrite the forward expression as follows:

\begin{equation}
  P( y_i, x_{1:i } ) =  \sum_{y_{i-1} \in \Lambda} P( y_i ,y_{i-1}, x_{1:i } )  =  \sum_{y_{i-1} \in \Lambda} P( x_i  | y_i,  y_{i-1},  x_{1:i-1 } ) \cdot P(y_i  | y_{i-1},  x_{1:i-1 }) \cdot P(y_{i-1},  x_{1:i-1 })  
\end{equation}


Using the **Observation independence** and the **Independence of previous states** properties of the first order HMM we have $P( x_i  | y_i,  y_{i-1},  x_{1:i-1 } ) = P( x_i  | y_i) $ and $P(y_i  | y_{i-1},  x_{1:i-1 })  = P(y_i  | y_{i-1})  $. Therefore the previous equation can be written, 
for $i \in \{2,\dots,N\}$ (where $N$ is the length of the sequence), as 

\begin{equation}
 \mathrm{forward}(i, y_i)  = \sum_{y_{i-1} \in \Lambda} P( x_i  | y_i, ) \cdot P(y_i  | y_{i-1}) \cdot \mathrm{forward}(i-1, y_{i-1})   
\end{equation}


The previous equation proves that  the forward probability can be defined by the
following recurrence rule: 

\begin{eqnarray}
\mathrm{forward}(1, c_k)&=& P_{\text{init}}(c_k|\text{start}) \times P_{\mathrm{emiss}}(x_1 | c_k)
 \\
 \mathrm{forward}(i, c_k) &=& \left(  \sum_{c_l \in \Lambda} P_{\mathrm{trans}}(c_k | c_l) \times \mathrm{forward}(i-1, c_l) \right) \times P_{\mathrm{emiss}}(x_i | c_k) 
 \\
  \mathrm{forward}(N+1, \text{stop}) &=& \sum_{c_l \in \Lambda} P_{\text{final}}(\text{ stop} | c_l) \times \mathrm{forward}(N, c_l).
\end{eqnarray}


Using the forward trellis one can compute the likelihood simply as:

\begin{equation}
P(X=x) = \mathrm{forward}(N+1, \text{ stop}).
\end{equation}

Although the forward probability is enough to calculate the likelihood of a given sequence, we will also need the backward probability to calculate the state posteriors. 


In [153]:
hmm.state_to_pos

Dict{String,Int64} with 2 entries:
  "sunny" => 2
  "rainy" => 1

In [154]:
hmm.word_to_pos

Dict{String,Int64} with 4 entries:
  "tennis" => 4
  "walk"   => 1
  "clean"  => 2
  "shop"   => 3

### Forward computations

In [155]:
function run_log_forward(initial_scores,
                         transition_scores,
                         final_scores,
                         emission_scores)
    """
    Compute the log_forward computations
    
    Assume there are K possible states and a sequence of length N.
    This method will compute iteritavely the log_forward quantities.
    
    * log_f is a K x N Array.
    * log_f[:,i] will contain the forward quantities at position i.
    * log_f[:,i] is a vector of size K
    
    Returns
    - log_f: Array of size K x N
    """
    length_sequence = size(emission_scores)[2]  
    n_states = length(hmm.state_to_pos)         # number of states
    
    # Forward variables initialized to Infinity because log(0) = Inf
    log_f = zeros(n_states, length_sequence) .+ Inf

    # Initialization
    log_f[:,1] = emission_scores[:,1] + initial_scores
    
    for n in 2:length_sequence
        for s in 1:n_states
            log_f[s,n] = logsum(log_f[:,n-1] + transition_scores[:,s]) + emission_scores[s,n]
        end
    end
    
    log_likelihood = logsum(log_f[:,length_sequence] + final_scores)    
    return log_likelihood, log_f
end



run_log_forward (generic function with 1 method)

In [156]:
log_likelihood, log_forward = run_log_forward(initial_scores,
                                              transition_scores,
                                              final_scores,
                                              emission_scores)

print("log_likelihood: ", log_likelihood)
print("\nlog_forward computations:"); log_forward

log_likelihood: -5.068232326005127
log_forward computations:

2×4 Array{Float64,2}:
 -0.693147  -1.67398  -3.75342  -Inf     
 -2.48491   -2.58335  -2.94018    -4.0874


## Efficient backward probability computation



The backward probability is similar to the forward probability, but operates in the inverse direction.
It represents the probability of observing $x_{i+1},\ldots,x_N$ from position $i+1$ up to $N$, given that at position $i$ we are at state $Y_i = c_l$:

\begin{equation}
\mathbf{Backward \ Probability\!:}\;\;\;\;  \text{backward}(i, c_l) = P(X_{i+1}=x_{i+1},\ldots, X_N=x_N | Y_i = c_l).
\end{equation}



Using the independence assumptions of the HMM we can compute $\text{backward}(i, c_k)$ using all the backward computations $\text{backward}(i +1, c)$ for $c \in \Lambda$.

Therefore we can write   $\text{backward}(i, y_i) $ as $P( x_{i+1:N} | y_i ) $ and rewrite the forward expression as follows:

\begin{equation}
  P( x_{i+1:N} | y_i ) =  \sum_{y_{i+1} \in \Lambda} P( x_{i+1:N}, y_{i+1} | y_i)  =  \sum_{y_{i+1} \in \Lambda} P( x_{i+2:N} | y_i, y_{i+1}, x_{i+1}) 
   P( x_{i+1}, |  y_{i+1},  y_{i}) P( y_{i+1} | y_i)
\end{equation}

Using the previous equation we have proved that the backward probability can be defined by the following recurrence rule:


\begin{eqnarray}
\mathrm{backward}(N, c_l) &=& P_{\text{final}}(\text{stop} | c_l)  \\
\text{backward}(i, c_l) &=&  \displaystyle \sum_{c_k \in \Lambda} P_{\text{trans}}(c_k | c_l) \times 
\text{backward}(i+1, c_k) \times P_{\text{emiss}}(x_{i+1} | c_k) 
 \\
  \mathrm{backward}(0, \text{start}) &=& \sum_{c_k \in \Lambda} P_{\mathrm{init}}(c_k | \text{ start}) \times \mathrm{backward}(1, c_k) \times P_{\mathrm{emiss}}(x_{1} | c_k).
 \end{eqnarray}

Using the backward trellis one can compute the likelihood simply as:

\begin{equation}
P(X=x) = \mathrm{backward}(0, \text{start}).
\end{equation}



In [157]:
function run_log_backward(initial_scores,
                          transition_scores,
                          final_scores,
                          emission_scores)
    """
    Compute the log_backward computations
    
    Assume there are K possible states and a sequence of length N.
    This method will compute iteritavely the log_forward quantities.
    
    * log_b is a K x N Array.
    * log_b[:,i] will contain the forward quantities at position i.
    * log_b[:,i] is a vector of size K
    
    Returns
    - log_b::Array{Float64,2}, size(log_b)=(K,N)
    - log_likelihood::Float64
    """
    length_sequence = size(emission_scores)[2]
    n_states = length(initial_scores)
    log_b = zeros(n_states, length_sequence) - Inf

    # Initialization
    log_b[:,length_sequence] = final_scores

    for n in length_sequence-1:-1:1
        for s in 1:n_states
             log_b[s,n] = logsum(log_b[:,n+1] + transition_scores[s,:] + emission_scores[:,n+1])
        end
    end
    
    log_likelihood = logsum(log_b[:,1] + initial_scores + emission_scores[:,1])
    
    return log_likelihood, log_b
end



run_log_backward (generic function with 1 method)

In [158]:
log_likelihood, log_backward = run_log_backward(initial_scores,
                                                transition_scores,
                                                final_scores,
                                                emission_scores)

print("\nlog_likelihood: ", log_likelihood)
print("\nlog_backward computations:"); log_backward'


log_likelihood: -5.068232326005126
log_backward computations:

4×2 Array{Float64,2}:
   -4.41864  -5.73879 
   -3.67819  -3.8825  
   -2.65481  -2.43166 
 -Inf        -0.980829


# The forward backward algorithm

Now we will see why we migh want to compute the forward and backward quantities.

We have seen how we can compute the probability of a sequence $x$ using the the forward and backward probabilities by computing  $\mathrm{forward}(N+1, \text{ stop})$ and $ \mathrm{backward}(0, \text{ start})$ respectively. Moreover,  the probability of a sequence $x$ can be computed with both forward and backward probabilities at a particular position $i$. 

The probability of a  given sequence $x$ at any position $i$ in the sequence can be computed
as follows:


\begin{eqnarray}
  P(X=x) &=& 
  \sum_{c_k \in \Lambda} P(X_1=x_1,\ldots, X_N=x_N,Y_i=c_k)\nonumber\\
  & =&
  \sum_{c_k \in \Lambda} 
  \underbrace{P(X_1=x_1,\ldots, X_i=x_i, Y_i=c_k)}_{\mathrm{forward}(i,c_k)} \times 
  \underbrace{P(X_{i+1}=x_{i+1},\ldots, X_N=x_N| Y_i=c_k)}_{\mathrm{backward}(i,c_k)}\nonumber\\
  &=& \sum_{c_k \in \Lambda} \mathrm{forward}(i,c_k) \times \mathrm{backward}(i,c_k).
\end{eqnarray}



This equation will work for any choice of $i$. Although redundant, this fact is useful when implementing an
HMM as a sanity check that the computations are being performed
correctly, since one can compute this expression for several $i$; they should all yield the same value. 

The following pseudocode shows the the forward backward algorithm. 

The reader can notice that the $forward$ and $backward$ computations in the algorithm make use of $P_{emiss}$ and $P_{trans}$. There are a couple of details that should be taken into account if the reader wants to understand the algorithm using scores instead of probabilities.


- $forward(i,x,\hat{c})$  is computed using $P_{emiss}(x_i | \hat{c})$ which does not depend on the sum over all possible states $c_k \in  \Lambda $. Therefore when taking the logarithm of the sum over all possible states the recurrence of the forward computations can be split as a sum of two logarithms.


- $backward(i,x,\hat{c})$  is computed using $ P_{\text{trans}}(c_k | \hat{c} )$ and $P_{\text{emiss}}(x_{i+1} | c_k) $ both of  which  depend on $c_k$. Therefore when taking the logarithm of the sum the expression cannot be split as a sum of logarithms.



Given the forward and backward probabilities, one can compute both the state
and transition posteriors as follows:


\begin{align}
 \mathbf{State \ Posterior\!:}\;\;\;\;  & P(Y_i = y_i| X=x) = \frac{\mathrm{forward}(i,x, y_i) \times 
 \mathrm{backward}(i,x, y_i)}{P(X=x)}\\
 \mathbf{Transition \ Posterior\!:}\;\;\;\; &
 P(Y_i = y_i, Y_{i+1} = y_{i+1} | X=x)= \nonumber\\
 &
   \frac{\mathrm{forward}(i, y_i) \times 
   P_{\mathrm{trans}}(y_{i+1}|y_i) \times
   P_{\mathrm{emiss}}(x_{i+1}|y_{i+1}) \times
 \mathrm{backward}(i+1, y_{i+1})}{P(X=x)}
\end{align}

As a practical example, given that the person performs the sequence of actions $\text{ walk} \text{ walk} \text{ shop} \text{ clean}$, we want to know the probability of having been raining in the second day. The state posterior probability for this event can be seen as the probability that the sequence of actions above was generated by a sequence of weathers and where it was raining in the second day. In this case, the possible sequences would be all the sequences which have {\tt rainy} in the second position.


Using the state posteriors, we are ready to perform posterior
decoding. 
The strategy is to compute the state posteriors 
for each position $i \in \{1,\ldots,N\}$
and each state $c_k \in \Lambda$, and 
then pick the arg-max at each position:

$$
{\widehat y_i} := \text{argmax}_{y_i \in \Lambda} P(Y_i=y_i| X=x).
$$





    def compute_posteriors(self, initial_scores, transition_scores,
                           final_scores, emission_scores):
        """
        Compute the state and transition posteriors:
        - The state posteriors are the probability of each state
        occurring at each position given the sequence of observations.
        - The transition posteriors are the joint probability of two states
        in consecutive positions given the sequence of observations.
        Both quantities are computed via the forward-backward algorithm.
        """

        length = np.size(emission_scores, 0)  # Length of the sequence.
        num_states = np.size(emission_scores, 1)  # Number of states.

        log_likelihood, forward = self.decoder.run_forward(initial_scores,
                                                           transition_scores,
                                                           final_scores,
                                                           emission_scores)
        log_likelihood, backward = self.decoder.run_backward(initial_scores,
                                                             transition_scores,
                                                             final_scores,
                                                             emission_scores)

        state_posteriors = np.zeros([length, num_states])  # State posteriors.
        for pos in xrange(length):
            state_posteriors[pos, :] = forward[pos, :] + backward[pos, :]
            state_posteriors[pos, :] -= log_likelihood

        transition_posteriors = np.zeros([length-1, num_states, num_states])
        for pos in xrange(length-1):
            for prev_state in xrange(num_states):
                for state in xrange(num_states):
                    transition_posteriors[pos, state, prev_state] = \
                        forward[pos, prev_state] + \
                        transition_scores[pos, state, prev_state] + \
                        emission_scores[pos+1, state] + \
                        backward[pos+1, state]
                    transition_posteriors[pos, state, prev_state] -= log_likelihood

        state_posteriors = np.exp(state_posteriors)
        transition_posteriors = np.exp(transition_posteriors)

        return state_posteriors, transition_posteriors, log_likelihood

In [159]:
size(emission_scores)[2]

4

In [160]:
log_forward[:,2]

2-element Array{Float64,1}:
 -1.67398
 -2.58335

### compute posterior

In [161]:
function compute_state_posteriors(initial_scores, transition_scores, final_scores, emission_scores)
    num_states = size(emission_scores)[1]  # Number of states.
    length = size(emission_scores)[2]      # Length of the sequence.
    
    log_likelihood, forward =  run_log_forward(initial_scores,
                                          transition_scores,
                                          final_scores,
                                          emission_scores)
    
    log_likelihood, backward = run_log_backward(initial_scores,
                                            transition_scores,
                                            final_scores,
                                            emission_scores)
    
    state_posteriors = zeros(num_states, length)      
    for pos in 1:length
        state_posteriors[:, pos] = forward[:, pos] + backward[:, pos] - log_likelihood
    end
    return state_posteriors
end



compute_state_posteriors (generic function with 1 method)

In [162]:
function posterior_decoding(state_posteriors)
    return mapslices(indmax, state_posteriors, 1)
end



posterior_decoding (generic function with 1 method)

In [163]:
state_posteriors = compute_state_posteriors(initial_scores, transition_scores, final_scores, emission_scores)
posterior_decoding(state_posteriors)

1×4 Array{Int64,2}:
 1  1  2  2

### Loading dataset


TODO: 1 feb: 
- Make a reader for this function. 
- Train HMM with the conll data.
- Do posterior decoding.

First sentence in  "train-02-21.conll"

    1	No	_	RB	RB	_	4	VMOD	_	_
    2	,	_	.	,	_	4	P	_	_
    3	it	_	PR	PRP	_	4	SUB	_	_
    4	was	_	VB	VBD	_	0	ROOT	_	_
    5	n't	_	RB	RB	_	4	VMOD	_	_
    6	Black	_	NN	NNP	_	7	NMOD	_	_
    7	Monday	_	NN	NNP	_	4	PRD	_	_
    8	.	_	.	.	_	4	P	_	_


In [164]:
function build_sequences_from_data(file_path; min_sequence_length_allowed=5)
    f = open(file_path, "r")
    lines = readlines(f)
    sequences = Array{Sequence}([])
    sequence_counter = 0
    min_seq_length = min_sequence_length_allowed
    max_seq_length = 0
    words = Array{String}([])
    tags = Array{String}([])

    for line in lines
        line_splitted = split(line, "\t")

        if line_splitted[1] == "\n"
            current_lenght = length(words)
            
            if (current_lenght < min_sequence_length_allowed) continue end
            if (current_lenght < min_seq_length) min_seq_length = current_lenght end
            if (current_lenght > max_seq_length) max_seq_length = current_lenght end
            
            push!(sequences, Sequence(words,tags))
            words = Array{String}([])
            tags = Array{String}([])
            sequence_counter +=1
        else
            push!(words, line_splitted[2])
            push!(tags, line_splitted[5])    
        end
    end
    print("\nNumber sequences: ", sequence_counter)
    print("\nMin sequence length: ", min_seq_length)
    print("\nMax sequence length: ", max_seq_length)
    return sequences
end



build_sequences_from_data (generic function with 1 method)

In [111]:
file_path_train = homedir() * "/Documents/Datasets/conll/train-02-21.conll"
file_path_valid = homedir() * "/Documents/Datasets/conll/dev-22.conll"
file_path_test = homedir() * "/Documents/Datasets/conll/test-23.conll"

train_seq = build_sequences_from_data(file_path_train, min_sequence_length_allowed=3);
valid_seq = build_sequences_from_data(file_path_valid, min_sequence_length_allowed=3);
test_seq = build_sequences_from_data(file_path_test, min_sequence_length_allowed=3);


Number sequences: 39642
Min sequence length: 3
Max sequence length: 141
Number sequences: 1684
Min sequence length: 3
Max sequence length: 118
Number sequences: 2408
Min sequence length: 3
Max sequence length: 67

### Train HMM and do posterior decoding with the conll data

    hmm = hmmc.HMM(corpus.word_dict, corpus.tag_dict)
    hmm.train_supervised(train_seq)
    hmm.print_transition_matrix()

In [112]:
function assign_elements_to_integers(elements)
    element_to_pos = Dict{String, Int64}()
    for (k,e) in enumerate(elements)
        element_to_pos[e] = k
    end
    return element_to_pos
end



assign_elements_to_integers (generic function with 1 method)

In [84]:
function get_possible_words_tags(sequences::Array{Sequence})
    possible_words = Set{String}([])
    possible_states = Set{String}([])
    
    for sequence in sequences
        for (word,tag) in zip(sequence.words, sequence.labels)
            push!(possible_words, word)
            push!(possible_states, tag)
        end
    end
    return possible_words, possible_states
end

get_possible_words_tags (generic function with 1 method)

In [85]:
possible_words, possible_states =  get_possible_words_tags(train_seq)

(Set(String["rearrangement","photosynthesis","Kensetsu","whiz","cost-benefit","gathered","Core","underground","Shinpan","8.63"  …  "137,200","convincing","NV","non-violent","156.7","2233.9","134,550","money-transfer","shorten","Freedman"]),Set(String["CC",".","PDT","VBP","#","VBD","WRB","VBG","NNS","NN"  …  "MD","RP","JJ","\$","JJR","TO","PRP","NNPS","UH","POS"]))

In [86]:
word_to_pos = assign_elements_to_integers(possible_words);
state_to_pos = assign_elements_to_integers(possible_states);

In [87]:
function update_initial_counts!(initial_counts, seq, state_to_pos)
    initial_counts[state_to_pos[seq.labels[1]]] = initial_counts[state_to_pos[seq.labels[1]]] + 1
end

function update_transition_counts!(transition_counts, seq, state_to_pos)
    for (t1,t2) in zip(seq.labels[1:end-1], seq.labels[2:end])
        transition_counts[state_to_pos[t1], state_to_pos[t2]] += 1 
    end    
end

function update_emission_counts!(emission_counts, seq, state_to_pos, word_to_pos)
    for (t,w) in zip(seq.labels, seq.words)
        emission_counts[state_to_pos[t], word_to_pos[w]] += 1 
    end 
end

function update_final_counts!(final_counts, seq, state_to_pos)
    final_counts[state_to_pos[seq.labels[end]]] +=1
end

update_final_counts! (generic function with 1 method)

In [88]:
function sufficient_statistics_hmm(sequences, state_to_pos, word_to_pos)
    
    n_states = length(state_to_pos)
    n_words = length(word_to_pos)
    
    initial_counts      = zeros(n_states)
    transition_counts   = zeros(n_states, n_states)
    final_counts        = zeros(n_states) 
    emission_counts     = zeros(n_states, n_words)
    for seq in sequences
        update_initial_counts!(initial_counts, seq, state_to_pos)
        update_transition_counts!(transition_counts, seq,  state_to_pos)
        update_emission_counts!(emission_counts, seq,  state_to_pos, word_to_pos) 
        update_final_counts!(final_counts, seq,  state_to_pos) 
    end
    
    return initial_counts, transition_counts, final_counts, emission_counts
end

sufficient_statistics_hmm (generic function with 1 method)

In [89]:
function fit!(hmm::Hmm, sequences::Array{Sequence})
    
    possible_words, possible_states =  get_possible_words_tags(sequences)
    word_to_pos = assign_elements_to_integers(possible_words);
    state_to_pos = assign_elements_to_integers(possible_states);
    
    hmm.word_to_pos = word_to_pos
    hmm.state_to_pos = state_to_pos
    hmm.pos_to_word = map(reverse, hmm.state_to_pos)
    hmm.pos_to_state = map(reverse, hmm.state_to_pos)

    counts = sufficient_statistics_hmm(sequences, state_to_pos, word_to_pos)
    initial_counts, transition_counts, final_counts, emission_counts = counts
    
    hmm.possible_words = possible_words
    hmm.possible_states = possible_states
    
    hmm.initial_counts = initial_counts
    hmm.transition_counts = transition_counts
    hmm.final_counts = final_counts
    hmm.emission_counts = emission_counts
    
    ### This could be rewritten using for loops and it could be much cleaarer
    hmm.initial_probs = (initial_counts / sum(initial_counts))
    hmm.transition_probs = transition_counts./(sum(transition_counts, 2) + final_counts)
    # vec is added here because hmm.final_probs is defined as a Vector and 
    # sum(transition_counts, 2) is a  matrix of size (K,1) instead of a vector of size (K,)
    hmm.final_probs =  final_counts ./ (vec(sum(transition_counts, 2)) + final_counts )
    hmm.emission_probs = (emission_counts ./ sum(emission_counts, 2))
end

fit! (generic function with 1 method)

In [90]:
hmm = Hmm()

Hmm(Set{String}(),Set{String}(),Dict{String,Int64}(),Dict{String,Int64}(),Dict{Int64,String}(),Dict{Int64,String}(),Int64[],Array{Int64}(0,0),Array{Int64}(0,0),Int64[],Float64[],Array{Float64}(0,0),Array{Float64}(0,0),Float64[],false)

In [91]:
fit!(hmm, train_seq);

### Posterior inference with a HMM

In [96]:
function compute_scores(hmm::Hmm, sequence::Sequence)
    
    length_sequence = length(sequence.words)
    n_states = length(hmm.possible_states)
    
    initial_scores = log.(hmm.initial_probs)
    transition_scores = log.(hmm.transition_probs)
    sequence_words_integers = [hmm.word_to_pos[x] for x in sequence.words]
    
    emission_scores = log.(hmm.emission_probs[:, sequence_words_integers])
    final_scores = log.(hmm.final_probs)
    
    return initial_scores, transition_scores, final_scores, emission_scores
end

compute_scores (generic function with 2 methods)

In [97]:
function compute_state_posteriors(initial_scores, transition_scores, final_scores, emission_scores)
    num_states = size(emission_scores)[1]  # Number of states.
    length = size(emission_scores)[2]      # Length of the sequence.
    
    log_likelihood, forward =  run_log_forward(initial_scores,
                                          transition_scores,
                                          final_scores,
                                          emission_scores)
    
    log_likelihood, backward = run_log_backward(initial_scores,
                                            transition_scores,
                                            final_scores,
                                            emission_scores)
    
    state_posteriors = zeros(num_states, length)      
    for pos in 1:length
        state_posteriors[:, pos] = forward[:, pos] + backward[:, pos] - log_likelihood
    end
    return state_posteriors
end

compute_state_posteriors (generic function with 1 method)

In [98]:
function posterior_decode(hmm::Hmm, sequence::Sequence; return_integers=false)  
    initial_scores, transition_scores, final_scores, emission_scores = compute_scores(hmm, sequence)
    state_posteriors = compute_state_posteriors(initial_scores, transition_scores, final_scores, emission_scores)
    predicted_tags = mapslices(indmax, state_posteriors, 1)
    
    if return_integers == false
        return vec([hmm.pos_to_state[tag] for tag in predicted_tags])
    else
        return predicted_tags
    end
end

posterior_decode (generic function with 1 method)

In [100]:
posterior_decode(hmm, train_seq[1])'

1×49 Array{String,2}:
 "IN"  "DT"  "NNP"  "CD"  "NN"  "IN"  "``"  …  "VBN"  "TO"  "NNP"  "NNP"  "."

In [101]:
posterior_decode(hmm, train_seq[1])'

1×49 Array{String,2}:
 "IN"  "DT"  "NNP"  "CD"  "NN"  "IN"  "``"  …  "VBN"  "TO"  "NNP"  "NNP"  "."

In [102]:
sum(train_seq[1].labels .== posterior_decode(hmm, train_seq[1]))

47

In [103]:
length(train_seq[1])

49

In [104]:
begin
    total_predicted_states = 0
    total_correct = 0
    for seq in train_seq
        total_correct += sum(seq.labels .== posterior_decode(hmm, seq))
        total_predicted_states += length(seq) 
    end
end

In [105]:
print("accuracy: ", total_correct/total_predicted_states)

accuracy: 0.970337716362044

### TODO

- Confusion matrix: check which labels (states) are the most incorretly predicted

# Viterbi decoding: TODO