In [1]:
push!(LOAD_PATH, ".")
using Revise
using corpus

┌ Info: Recompiling stale cache file /home/dulatf/.julia/compiled/v1.1/corpus.ji for corpus [top-level]
└ @ Base loading.jl:1184


In [3]:
# Load the tag inventory from "tags-universal.txt". Later cells index `tags` by
# integer tag id and show 14 entries, the last two being "<s>" and "</s>".
tags = corpus.load_tags("tags-universal.txt");

Loading tags from tags-universal.txt


In [4]:
# Load the tagged corpus from "brown-universal.txt". The values of `sentence_dict`
# are the sentences; as displayed further down, a sentence is a vector of
# (word, tag index) tuples.
sentence_dict = corpus.load_corpus("brown-universal.txt", tags);

Loading corpus from brown-universal.txt


In [139]:
# Split the corpus sentences into a training and a held-out test set. The second
# argument is the split fraction; the sizes reported in the next cell correspond to
# 90% of the sentences going to the training set.
train_sentences, test_sentences = train_test_split(collect(values(sentence_dict)), 0.9);

In [140]:
# Sanity check: number of sentences in the training and test splits.
length(train_sentences), length(test_sentences)

(51606, 5734)

In [15]:
# Vocabulary of the training split. Its length is used later as the denominator of
# the uniform fallback term for emission probabilities.
train_words=unique_words(train_sentences);

In [16]:
# Tag frequency statistics gathered from the training split; consumed by
# find_most_common_tag to build the baseline tagger.
freq=make_tag_frequencies(train_sentences, tags);

# Most frequent class baseline

In [21]:
# Lookup that the baseline indexes by word (most_common_tag[word]); presumably it
# maps each word to its most frequent training tag. Confirm in the corpus module.
most_common_tag = find_most_common_tag(freq);

In [128]:
# Most-frequent-tag baseline: tag every word through the `most_common_tag` lookup and
# score each sentence with `evaluate_sentence`. The accuracy is the summed last
# entries divided by the summed first entries of the per-sentence results
# (presumably correct tokens over total tokens; confirm in the corpus module).
evs = map(sen -> evaluate_sentence(sen, w -> most_common_tag[w]), train_sentences);
println("Training accuracy: ", sum(last, evs) / sum(first, evs))
evs = map(sen -> evaluate_sentence(sen, w -> most_common_tag[w]), test_sentences);
println("Test accuracy: ", sum(last, evs) / sum(first, evs))

Training accuracy: 0.9571373060961977
Test accuracy: 0.9320721309220931


# Hidden Markov model

The hidden variables are the tags and the observations are the actual words.
First we need tag unigram and bigram counts; from these we can define the transition probabilities of going from one tag to another.
See [chapter 8 of Jurafsky & Martin, *Speech and Language Processing* (3rd ed. draft)](https://web.stanford.edu/~jurafsky/slp3/8.pdf).

In [141]:
# Unigram tag counts over the training split: one count per entry of `tags`
# (presumably in the same order as `tags`; confirm in the corpus module).
tag_counts = corpus.unigram_counts(train_sentences, tags)

14-element Array{Int64,1}:
 132919
  75432
 130453
  50612
  34343
 123483
 248106
  13390
  44553
  26899
 164619
   1241
      0
      0

In [142]:
# Tag bigram counts over the training split; display is suppressed by the trailing ';'.
tag_bigram_counts = corpus.bigram_counts(train_sentences, tags);

In [143]:
# Estimate the HMM parameters from the training split. Later cells look them up as
# transition_probs[(from_tag, to_tag)] and emission_probs[(word, tag)], where tags
# are integer indices into `tags`; pairs never seen in training have no entry.
transition_probs, emission_probs = corpus.hmm_parameters(train_sentences, tags);

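Since we now have `<s>` and `</s>` tags, we can check that the probability of leaving any given
state is 1, except for the `</s>` state, which has no outgoing transitions. In other words, $\sum_{i} P(s_j, s_i) = 1$ for any $j$ other than `</s>`, where $P(s_j, s_i)$ is the probability of moving from state $s_j$ to state $s_i$.
The converse is of course not true: each probability of entering a given state is generally just some value between 0 and 1, and $\sum_{j} P(s_j, s_i)$ need not equal 1.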

In [144]:
# For every source tag, add up the probabilities of all outgoing transitions;
# (from, to) pairs without an entry in `transition_probs` contribute 0.
[sum([get(transition_probs, (from, to), 0) for to in eachindex(tags)]) for from in eachindex(tags)]

14-element Array{Real,1}:
 1.0               
 1.0               
 1.0               
 1.0               
 0.9999999999999999
 0.9999999999999999
 1.0000000000000002
 1.0000000000000002
 0.9999999999999999
 1.0               
 1.0               
 1.0               
 1.0               
 0                 

In [145]:
# Pair every tag name with the emission probability of the word "the" under that tag
# (0 where the (word, tag) pair has no entry in `emission_probs`).
collect(zip(tags, [get(emission_probs, ("the", tag), 0) for tag in eachindex(tags)]))

14-element Array{Tuple{String,Real},1}:
 (".", 0)                    
 ("ADJ", 0)                  
 ("ADP", 0)                  
 ("ADV", 0)                  
 ("CONJ", 0)                 
 ("DET", 0.45751237012382273)
 ("NOUN", 0)                 
 ("NUM", 0)                  
 ("PRON", 0)                 
 ("PRT", 0)                  
 ("VERB", 0)                 
 ("X", 0.0024174053182917004)
 ("<s>", 0)                  
 ("</s>", 0)                 

In [146]:
# Small hand-built HMM (3 states, symbols "A" to "D") used to try out corpus.viterbi
# independently of the corpus-derived parameters.
# tp is keyed by a pair of state indices; for each first index the entries sum to 1,
# i.e. tp[(from, to)] is the probability of moving from state `from` to state `to`.
tp = Dict{Tuple{Int, Int}, Float64}()
# ep is keyed by (symbol, state); for each state the four symbol entries sum to 1.
ep = Dict{Tuple{String, Int}, Float64}()
states = [1, 2, 3]
# Initial state distribution, one entry per state.
initial = [0.3, 0.4, 0.3]
# Observation sequence to decode.
obs = ["A", "B", "C", "C", "D", "B"]
# Transitions out of state 1.
tp[(1,1)] = 0.2
tp[(1,2)] = 0.8
tp[(1,3)] = 0.0
# Transitions out of state 2.
tp[(2,1)] = 0.4
tp[(2,2)] = 0.2
tp[(2,3)] = 0.4
# Transitions out of state 3.
tp[(3,1)] = 0.4
tp[(3,2)] = 0.2
tp[(3,3)] = 0.4
# Emissions of state 1.
ep[("A",1)] = 0.1
ep[("B",1)] = 0.2
ep[("C",1)] = 0.6
ep[("D",1)] = 0.1
# Emissions of state 2.
ep[("A",2)] = 0.3
ep[("B",2)] = 0.2
ep[("C",2)] = 0.3
ep[("D",2)] = 0.2
# Emissions of state 3.
ep[("A",3)] = 0.0
ep[("B",3)] = 0.5
ep[("C",3)] = 0.25
# The value of this final assignment is what the cell displays.
ep[("D",3)] = 0.25

0.25

In [147]:
# Decode the toy observation sequence; the result holds one state index per observation.
corpus.viterbi(tp,ep,states, initial,obs)

6-element Array{Int64,1}:
 2
 3
 1
 2
 3
 3

In [148]:
# Start distribution for Viterbi: the probability of moving from the "<s>" tag to
# each tag. The "<s>" index is looked up once, and the Float64 default keeps the
# vector's element type concrete even when some transitions out of "<s>" have no
# entry in `transition_probs` (an Int default of 0 would widen it to Vector{Real}).
initial_state = let start = tag_index(tags, "<s>")
    [get(transition_probs, (start, i), 0.0) for i in eachindex(tags)]
end;

In [149]:
# Tag the words of the first training sentence with the corpus-derived HMM.
prediction = corpus.viterbi(transition_probs, emission_probs, tags, initial_state,
    first.(train_sentences[1]));

In [150]:
# Print the gold tag indices of the second training sentence, then the Viterbi
# prediction for its words, and finally display the sentence itself.
let sen = train_sentences[2]
    println(map(last, sen))
    println(corpus.viterbi(transition_probs, emission_probs, tags,
        initial_state, map(first, sen)))
    sen
end

[10, 11, 6, 7, 3, 11, 6, 7, 7, 6, 7, 7, 11, 11, 1]
[10, 11, 6, 7, 3, 11, 6, 7, 7, 6, 7, 7, 11, 11, 1]


15-element Array{Tuple{String,Int64},1}:
 ("To", 10)       
 ("minimize", 11) 
 ("the", 6)       
 ("chances", 7)   
 ("of", 3)        
 ("repeating", 11)
 ("the", 6)       
 ("Balafrej", 7)  
 ("debacle", 7)   
 ("the", 6)       
 ("Ibrahim", 7)   
 ("government", 7)
 ("was", 11)      
 ("formed", 11)   
 (".", 1)         

In [277]:
# Emission probability of `word` under tag index `tag`, linearly interpolated with a
# uniform distribution over the training vocabulary:
#
#     lambda * P(word | tag) + (1 - lambda) / length(train_words)
#
# P(word | tag) is read from the global `emission_probs`; pairs without an entry
# count as 0.0. `lambda` must lie in [0, 1]. With the default `lambda = 1.0` the
# uniform term vanishes and the result is the raw lookup, so a word never seen in
# training gets probability zero under every tag. Pass a smaller `lambda` (for
# example through a closure) to give unseen words some probability mass.
#
# Throws ArgumentError when `lambda` is outside [0, 1].
function emission_probability(word :: AbstractString, tag :: Integer; lambda :: Real = 1.0)
    0 <= lambda <= 1 || throw(ArgumentError("lambda must lie in [0, 1], got $lambda"))
    observed = get(emission_probs, (word, tag), 0.0)
    return lambda * observed + (1.0 - lambda) / length(train_words)
end

emission_probability (generic function with 1 method)

In [287]:
# Number of distinct training sentences with at least one mistagged token. `twrongs`
# is built by the training-set evaluation cell (In [284]), which was executed before
# this one even though it appears later in the notebook.
length(twrongs)

17966

In [284]:
# Evaluate the HMM tagger on the training split: decode every sentence with Viterbi,
# count token-level agreement with the gold tags, and collect each sentence that has
# at least one mistagged token in `twrongs`.
total_count = 0
correct_count = 0
# Typed container: its elements are sentences taken from `train_sentences`.
twrongs = eltype(train_sentences)[]
for sen in train_sentences
    pred = corpus.viterbi(transition_probs, emission_probability, tags,
        initial_state, map(first, sen))
    # A sentence element is a (word, tag index) tuple; entry 2 is the gold tag.
    hits = count(i -> pred[i] == sen[i][2], eachindex(sen))
    total_count += length(sen)
    correct_count += hits
    # Record the sentence once, no matter how many of its tokens were wrong.
    hits == length(sen) || push!(twrongs, sen)
end
println("Training set accuracy: ", 100.0 * correct_count/total_count,"%")
# Distinct sentences with identical content are still collapsed into one entry.
twrongs=unique(twrongs);

Training set accuracy: 97.54237369150614%


In [294]:
# Evaluate the HMM tagger on the held-out test split: decode every sentence with
# Viterbi, count token-level agreement with the gold tags, and collect each sentence
# that has at least one mistagged token in `wrongs`.
total_count = 0
correct_count = 0
# Typed container: its elements are sentences taken from `test_sentences`.
wrongs = eltype(test_sentences)[]
for sen in test_sentences
    pred = corpus.viterbi(transition_probs, emission_probability, tags,
        initial_state, map(first, sen))
    # A sentence element is a (word, tag index) tuple; entry 2 is the gold tag.
    hits = count(i -> pred[i] == sen[i][2], eachindex(sen))
    total_count += length(sen)
    correct_count += hits
    # Record the sentence once, no matter how many of its tokens were wrong.
    hits == length(sen) || push!(wrongs, sen)
end
println("Test set accuracy: ", 100.0 * correct_count/total_count,"%")
# Distinct sentences with identical content are still collapsed into one entry.
wrongs = unique(wrongs);

Test set accuracy: 76.51769119869378%


In [295]:
# Among the test sentences with errors, keep those with more than 10 mistagged
# tokens. Note that decoding here uses a different emission function from the
# accuracy cells: (word, tag) pairs without an entry get 1/length(train_words)
# instead of zero.
cwrongs=[]
for sen in wrongs
    words = map(first, sen)
    truth = [tags[t] for t in map(last, sen)]
    pred = [tags[s] for s in corpus.viterbi(transition_probs,
        (w,s)->get(emission_probs, (w,s), 1/length(train_words)), tags,
        initial_state, words)]
    # Number of positions where the gold and predicted tag names disagree.
    c = count(t != p for (t, p) in zip(truth, pred))
    c > 10 && push!(cwrongs, sen)
end
length(cwrongs)

16

In [298]:
# Inspect one of the badly tagged test sentences: decode it again and print the gold
# and predicted tag for every word, with disagreements in red. The trailing `true`
# presumably makes viterbi return extra diagnostics next to the tag path (four values
# are destructured here); confirm in the corpus module.
sen = cwrongs[12]
words = first.(sen);
truth = [tags[t] for t in last.(sen)];
pts, prob, vmat, bmat = corpus.viterbi(transition_probs,
        (w,s)->get(emission_probs, (w,s), 1/length(train_words)),
        tags, initial_state, words, true)
pred = [tags[s] for s in pts];
printstyled(lpad("Word",20," "),"\tTruth\tPred\n",color=:yellow)
for (w,t,p) in zip(words,truth,pred)
    printstyled(lpad(w,20," "),"\t",t,"\t",p,"\n",color = t == p ? :default : :red)
end

[33m                Word	Truth	Pred[39m
[39m                Mrs.	NOUN	NOUN
[39m              Robert	NOUN	NOUN
[39m                  O.	NOUN	NOUN
[31m             Spurdle	NOUN	VERB[39m
[39m                  is	VERB	VERB
[39m            chairman	NOUN	NOUN
[39m                  of	ADP	ADP
[39m                 the	DET	DET
[39m           committee	NOUN	NOUN
[39m                   ,	.	.
[39m               which	DET	DET
[31m            includes	VERB	ADJ[39m
[39m                Mrs.	NOUN	NOUN
[39m               James	NOUN	NOUN
[39m                  A.	NOUN	NOUN
[31m               Moody	NOUN	.[39m
[39m                   ,	.	.
[39m                Mrs.	NOUN	NOUN
[39m               Frank	NOUN	NOUN
[39m                  C.	NOUN	NOUN
[39m           Wilkinson	NOUN	NOUN
[39m                   ,	.	.
[39m                Mrs.	NOUN	NOUN
[31m               Ethel	NOUN	ADP[39m
[39m               Coles	NOUN	NOUN
[39m                   ,	.	.
[39m                Mrs.	NOUN	NOUN


In [301]:
[get(emission_probs,("tiled",i),0) for i in 1:length(tags)]

14-element Array{Real,1}:
 0                    
 0                    
 0                    
 0                    
 0                    
 0                    
 0                    
 0                    
 0                    
 0                    
 1.8223898820913746e-5
 0                    
 0                    
 0                    