In [1]:
#!/usr/bin/env python3

This file illustrates how you might experiment with the HMM interface.
You can paste these commands in at the Python prompt, or execute `test_ic.py` directly.
A notebook interface is nicer than the plain Python prompt, so we provide
a notebook version of this file as `test_ic.ipynb`, which you can open with
`jupyter` or with Visual Studio `code` (run it with the `nlp-class` kernel).

In [2]:
import logging, math, os
from pathlib import Path

In [3]:
import torch
from torch import tensor

In [4]:
from corpus import TaggedCorpus
from eval import model_cross_entropy, viterbi_error_rate, write_tagging
from hmm import HiddenMarkovModel
from crf import ConditionalRandomField

Set up logging.

In [5]:
# Single place to choose the verbosity: change logging.INFO to logging.DEBUG here.
LOG_LEVEL = logging.INFO

log = logging.getLogger("test_ic")       # For usage, see findsim.py in earlier assignment.
# Set the root logger's level explicitly as well as calling basicConfig():
# basicConfig() does nothing if the root logger already has handlers, so on its
# own it is not guaranteed to change the level.
logging.root.setLevel(LOG_LEVEL)
logging.basicConfig(level=LOG_LEVEL)
# torch.autograd.set_detect_anomaly(True)    # uncomment to improve error messages from .backward(), but slows down

Switch working directory to the directory where the data live.  You may want to edit this line.

In [6]:
# Relative path, resolved against the working directory at the time this cell runs.
# Later cells open the corpus files (e.g. "icsup", "icraw", "icdev") by bare name,
# so they depend on this call having succeeded.
os.chdir("../data")

Get vocabulary and tagset from a supervised corpus.

In [7]:
# Load the supervised corpus.  Its vocab and tagset are passed to every model
# and every other corpus constructed later in this notebook.
# NOTE(review): add_oov=False presumably keeps an out-of-vocabulary type out of
# the vocabulary (none appears in the logged vocabulary) -- confirm in corpus.py.
icsup = TaggedCorpus(Path("icsup"), add_oov=False)
log.info(f"Ice cream vocabulary: {list(icsup.vocab)}")
log.info(f"Ice cream tagset: {list(icsup.tagset)}")

INFO:corpus:Read 40 tokens from icsup
INFO:corpus:Created 4 tag types
INFO:corpus:Created 5 word types
INFO:test_ic:Ice cream vocabulary: ['1', '2', '3', '_EOS_WORD_', '_BOS_WORD_']
INFO:test_ic:Ice cream tagset: ['C', 'H', '_EOS_TAG_', '_BOS_TAG_']


Two ways to look at the corpus ...

In [8]:
# Look at the raw file directly.  Reading it in Python instead of shelling out
# to `cat` also works on systems without a `cat` command, and the cell no
# longer ends with the shell's exit status (0) as its displayed result.
print(Path("icsup").read_text(), end="")

1/C 1/C 1/C 1/C 1/C 1/C 1/C 2/C 2/C 3/H
1/H 2/H 2/H 3/H 3/H 3/H 3/H 3/H 3/H 3/C
1/C 1/C 1/C 1/C 1/C 1/C 1/C 2/C 2/C 3/H
1/H 2/H 2/H 3/H 3/H 3/H 3/H 3/H 3/H 3/C


0

In [9]:
# Passing the corpus object to the logger renders it as text; the recorded
# output shows the sentences in the same word/tag format as the file itself.
log.info(icsup)          # print the TaggedCorpus python object we constructed from it

INFO:test_ic:1/C 1/C 1/C 1/C 1/C 1/C 1/C 2/C 2/C 3/H
1/H 2/H 2/H 3/H 3/H 3/H 3/H 3/H 3/H 3/C
1/C 1/C 1/C 1/C 1/C 1/C 1/C 2/C 2/C 3/H
1/H 2/H 2/H 3/H 3/H 3/H 3/H 3/H 3/H 3/C


Make an HMM.

In [10]:
log.info("*** Hidden Markov Model (HMM) test\n")
hmm = HiddenMarkovModel(icsup.tagset, icsup.vocab)
# Change the transition/emission initial probabilities to match the ice cream spreadsheet,
# and test your implementation of the Viterbi algorithm.  Note that the spreadsheet
# uses transposed versions of these matrices.
#
# Rows of both matrices follow the tagset order shown by printAB():
# C, H, _EOS_TAG_, _BOS_TAG_.
# B has one column per ice-cream count (1, 2, 3); the _EOS_TAG_ and _BOS_TAG_
# rows are all zero.
hmm.B = tensor([[0.7000, 0.2000, 0.1000],    # emission probabilities
                [0.1000, 0.2000, 0.7000],
                [0.0000, 0.0000, 0.0000],
                [0.0000, 0.0000, 0.0000]])
# A's columns use the same tag order as its rows.  Each row holds the
# transitions out of that tag: the _BOS_TAG_ row splits evenly between C and H,
# the _EOS_TAG_ row is all zero, and the _BOS_TAG_ column is all zero.
hmm.A = tensor([[0.8000, 0.1000, 0.1000, 0.0000],   # transition probabilities
                [0.1000, 0.8000, 0.1000, 0.0000],
                [0.0000, 0.0000, 0.0000, 0.0000],
                [0.5000, 0.5000, 0.0000, 0.0000]])
log.info("*** Current A, B matrices (using initalizations from the ice cream spreadsheet)")
hmm.printAB()

INFO:test_ic:*** Hidden Markov Model (HMM) test

INFO:test_ic:*** Current A, B matrices (using initalizations from the ice cream spreadsheet)


Transition matrix A:
	C	H	_EOS_TAG_	_BOS_TAG_
C	0.800	0.100	0.100	0.000
H	0.100	0.800	0.100	0.000
_EOS_TAG_	0.000	0.000	0.000	0.000
_BOS_TAG_	0.500	0.500	0.000	0.000

Emission matrix B:
	1	2	3
C	0.700	0.200	0.100
H	0.100	0.200	0.700
_EOS_TAG_	0.000	0.000	0.000
_BOS_TAG_	0.000	0.000	0.000




Try it out on the raw data from the spreadsheet, available in `icraw`.

In [11]:
log.info("*** Viterbi results on icraw with hard coded parameters")
icraw = TaggedCorpus(Path("icraw"), tagset=icsup.tagset, vocab=icsup.vocab)
write_tagging(hmm, icraw, Path("icraw_hmm.output"))  # calls hmm.viterbi_tagging on each sentence
# Print the file we just created.  The file is left on disk (nothing here
# deletes it).  Reading it in Python avoids depending on a `cat` command.
print(Path("icraw_hmm.output").read_text(), end="")


INFO:test_ic:*** Viterbi results on icraw with hard coded parameters
100%|██████████| 1/1 [00:00<00:00, 121.83it/s]

2/H 3/H 3/H 2/H 3/H 2/H 3/H 2/H 2/H 3/H 1/H 3/H 3/H 1/C 1/C 1/C 2/C 1/C 1/C 1/C 3/C 1/C 2/C 1/C 1/C 1/C 2/H 3/H 3/H 2/H 3/H 2/H 2/H





0

Did the parameters that we guessed above get the "correct" answer, 
as revealed in `icdev`?

In [12]:
# Load the gold-standard tagging of the same sequence and score the hand-set
# HMM against it.  viterbi_error_rate comes from the notebook's import cell.
icdev = TaggedCorpus(Path("icdev"), tagset=icsup.tagset, vocab=icsup.vocab)
log.info(f"*** Compare to icdev corpus:\n{icdev}")
# Last expression of the cell, so the returned error rate is displayed
# (1 - the tagging accuracy that the eval logger reports).
viterbi_error_rate(hmm, icdev, show_cross_entropy=False)

INFO:test_ic:*** Compare to icdev corpus:
2/H 3/H 3/H 2/H 3/H 2/H 3/H 2/H 2/H 3/H 1/C 3/C 3/C 1/C 1/C 1/C 2/C 1/C 1/C 1/C 3/C 1/C 2/C 1/C 1/C 1/C 2/H 3/H 3/H 2/H 3/H 2/H 2/H
100%|██████████| 1/1 [00:00<00:00, 132.40it/s]
INFO:eval:Tagging accuracy: all: 90.909%, seen: 90.909%, novel: nan%


0.09090909090909094

Now let's try your training code, running it on supervised data.
To test this, we'll restart from a random initialization.
(You could also try creating this new model with `unigram=True`,
which will affect the rest of the notebook.)

In [13]:
# Rebinding `hmm` replaces the hand-set model from the earlier cell; every
# later cell that uses `hmm` works with this freshly constructed model.
hmm = HiddenMarkovModel(icsup.tagset, icsup.vocab)
log.info("*** A, B matrices as randomly initialized close to uniform")
hmm.printAB()

INFO:test_ic:*** A, B matrices as randomly initialized close to uniform


Transition matrix A:
	C	H	_EOS_TAG_	_BOS_TAG_
C	0.334	0.334	0.332	0.000
H	0.334	0.332	0.334	0.000
_EOS_TAG_	0.334	0.333	0.333	0.000
_BOS_TAG_	0.333	0.334	0.334	0.000

Emission matrix B:
	1	2	3
C	0.333	0.335	0.332
H	0.333	0.333	0.334
_EOS_TAG_	0.000	0.000	0.000
_BOS_TAG_	0.000	0.000	0.000




In [14]:
log.info("*** Supervised training on icsup")


def cross_entropy_loss(model):
    """Return the cross-entropy that model_cross_entropy computes for `model` on icsup."""
    return model_cross_entropy(model, icsup)


# The same corpus is used for training and for the loss that train() monitors.
hmm.train(corpus=icsup, loss=cross_entropy_loss, tolerance=0.0001)
log.info(
    "*** A, B matrices after training on icsup "
    "(should match initial params on spreadsheet [transposed])"
)
hmm.printAB()

INFO:test_ic:*** Supervised training on icsup
100%|██████████| 4/4 [00:00<00:00, 823.42it/s]
INFO:eval:Cross-entropy: 1.4671 nats (= perplexity 4.337)


log_Z: -16.1357421875
Z (prob): 9.825082258885232e-08
log_Z: -16.141292572021484
Z (prob): 9.770700870603832e-08
log_Z: -16.1357421875
Z (prob): 9.825082258885232e-08
log_Z: -16.141292572021484
Z (prob): 9.770700870603832e-08


100%|██████████| 4/4 [00:00<00:00, 389.87it/s]


log_Z: -16.1357421875
Z (prob): 9.825082258885232e-08
log_Z: -16.141292572021484
Z (prob): 9.770700870603832e-08
log_Z: -16.1357421875
Z (prob): 9.825082258885232e-08
log_Z: -16.141292572021484
Z (prob): 9.770700870603832e-08

Expected counts A:
tensor([[16.,  2.,  2.,  0.],
        [ 2., 16.,  2.,  0.],
        [ 0.,  0.,  0.,  0.],
        [ 2.,  2.,  0.,  0.]])

Expected counts B:
tensor([[14.,  4.,  2.],
        [ 2.,  4., 14.],
        [ 0.,  0.,  0.],
        [ 0.,  0.,  0.]])


100%|██████████| 4/4 [00:00<00:00, 925.08it/s]
INFO:eval:Cross-entropy: 1.0584 nats (= perplexity 2.882)


log_Z: -11.642183303833008
Z (prob): 8.787473234406207e-06
log_Z: -11.642181396484375
Z (prob): 8.78749051480554e-06
log_Z: -11.642183303833008
Z (prob): 8.787473234406207e-06
log_Z: -11.642181396484375
Z (prob): 8.78749051480554e-06


100%|██████████| 4/4 [00:00<00:00, 339.64it/s]


log_Z: -11.642183303833008
Z (prob): 8.787473234406207e-06
log_Z: -11.642181396484375
Z (prob): 8.78749051480554e-06
log_Z: -11.642183303833008
Z (prob): 8.787473234406207e-06
log_Z: -11.642181396484375
Z (prob): 8.78749051480554e-06

Expected counts A:
tensor([[16.,  2.,  2.,  0.],
        [ 2., 16.,  2.,  0.],
        [ 0.,  0.,  0.,  0.],
        [ 2.,  2.,  0.,  0.]])

Expected counts B:
tensor([[14.,  4.,  2.],
        [ 2.,  4., 14.],
        [ 0.,  0.,  0.],
        [ 0.,  0.,  0.]])


100%|██████████| 4/4 [00:00<00:00, 930.41it/s]
INFO:eval:Cross-entropy: 1.0584 nats (= perplexity 2.882)
INFO:hmm:Saving model to my_hmm.pkl
INFO:hmm:Saved model to my_hmm.pkl
INFO:test_ic:*** A, B matrices after training on icsup (should match initial params on spreadsheet [transposed])


log_Z: -11.642183303833008
Z (prob): 8.787473234406207e-06
log_Z: -11.642181396484375
Z (prob): 8.78749051480554e-06
log_Z: -11.642183303833008
Z (prob): 8.787473234406207e-06
log_Z: -11.642181396484375
Z (prob): 8.78749051480554e-06
Transition matrix A:
	C	H	_EOS_TAG_	_BOS_TAG_
C	0.800	0.100	0.100	0.000
H	0.100	0.800	0.100	0.000
_EOS_TAG_	0.000	0.000	0.000	0.000
_BOS_TAG_	0.500	0.500	0.000	0.000

Emission matrix B:
	1	2	3
C	0.700	0.200	0.100
H	0.100	0.200	0.700
_EOS_TAG_	0.000	0.000	0.000
_BOS_TAG_	0.000	0.000	0.000




Now that we've reached the spreadsheet's starting guess, let's again tag
the spreadsheet "sentence" (that is, the sequence of ice creams) using the
Viterbi algorithm.

In [15]:
log.info("*** Viterbi results on icraw")
icraw = TaggedCorpus(Path("icraw"), tagset=icsup.tagset, vocab=icsup.vocab)
write_tagging(hmm, icraw, Path("icraw_hmm.output"))  # calls hmm.viterbi_tagging on each sentence
# Print the file we just created.  The file is left on disk (nothing here
# deletes it).  Reading it in Python avoids depending on a `cat` command.
print(Path("icraw_hmm.output").read_text(), end="")

INFO:test_ic:*** Viterbi results on icraw
100%|██████████| 1/1 [00:00<00:00, 121.55it/s]

2/H 3/H 3/H 2/H 3/H 2/H 3/H 2/H 2/H 3/H 1/H 3/H 3/H 1/C 1/C 1/C 2/C 1/C 1/C 1/C 3/C 1/C 2/C 1/C 1/C 1/C 2/H 3/H 3/H 2/H 3/H 2/H 2/H





0

Next let's use the forward algorithm to see what the model thinks about 
the probability of the spreadsheet "sentence."

In [16]:
log.info(
    "*** Forward algorithm on icraw (should approximately match iteration 0 "
    "on spreadsheet)"
)
for sentence in icraw:
    # The model reports a log value; exponentiate it so the logged number is
    # the p(sentence) that can be compared with the spreadsheet.
    sentence_logprob = hmm.logprob(sentence, icraw)
    prob = math.exp(sentence_logprob)
    log.info(f"{prob} = p({sentence})")

INFO:test_ic:*** Forward algorithm on icraw (should approximately match iteration 0 on spreadsheet)
INFO:test_ic:9.12755498101095e-19 = p(2 3 3 2 3 2 3 2 2 3 1 3 3 1 1 1 2 1 1 1 3 1 2 1 1 1 2 3 3 2 3 2 2)


log_Z: -41.537818908691406
Z (prob): 9.127554454513589e-19


Finally, let's reestimate on the icraw data, as the spreadsheet does.
We'll evaluate as we go along on the *training* perplexity, and stop
when that has more or less converged.

In [17]:
log.info("*** Reestimating on icraw (perplexity should improve on every iteration)")


def negative_log_likelihood(model):
    """Return the cross-entropy of `model` on icraw, i.e. on the training data itself."""
    return model_cross_entropy(model, icraw)


hmm.train(corpus=icraw, loss=negative_log_likelihood, tolerance=0.0001)
for sentence in icraw:
    # Exponentiate the model's log value to report p(sentence) after reestimation.
    sentence_logprob = hmm.logprob(sentence, icraw)
    prob = math.exp(sentence_logprob)
    log.info(f"{prob} = p({sentence})")

INFO:test_ic:*** Reestimating on icraw (perplexity should improve on every iteration)
100%|██████████| 1/1 [00:00<00:00, 264.61it/s]
INFO:eval:Cross-entropy: 1.2217 nats (= perplexity 3.393)


log_Z: -41.537818908691406
Z (prob): 9.127554454513589e-19


100%|██████████| 1/1 [00:00<00:00, 79.15it/s]


log_Z: -41.537818908691406
Z (prob): 9.127554454513589e-19

Expected counts A:
tensor([[1.8163e+18, 2.2598e+17, 3.1730e+16, 0.0000e+00],
        [2.3948e+17, 2.2395e+18, 1.0956e+17, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [1.8234e+16, 1.2305e+17, 0.0000e+00, 0.0000e+00]])

Expected counts B:
tensor([[3.8400e+18, 2.8772e+18, 1.7099e+18],
        [1.2676e+18, 7.2802e+18, 3.1618e+18],
        [0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00]])


100%|██████████| 1/1 [00:00<00:00, 293.62it/s]
INFO:eval:Cross-entropy: 1.1590 nats (= perplexity 3.187)


log_Z: -39.40589141845703
Z (prob): 7.695535035363572e-18


  0%|          | 0/1 [00:00<?, ?it/s]

log_Z: -39.40589141845703
Z (prob): 7.695535035363572e-18


100%|██████████| 1/1 [00:00<00:00, 73.37it/s]



Expected counts A:
tensor([[9.9021e+16, 8.1350e+15, 4.5800e+14, 0.0000e+00],
        [8.3952e+15, 7.5103e+16, 5.4999e+15, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [1.9777e+14, 5.7602e+15, 0.0000e+00, 0.0000e+00]])

Expected counts B:
tensor([[1.8276e+17, 5.9987e+16, 1.1689e+17],
        [1.8828e+16, 8.6304e+16, 1.7975e+17],
        [0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00]])


100%|██████████| 1/1 [00:00<00:00, 298.25it/s]
INFO:eval:Cross-entropy: 1.1202 nats (= perplexity 3.065)


log_Z: -38.086204528808594
Z (prob): 2.879861066338888e-17


100%|██████████| 1/1 [00:00<00:00, 78.17it/s]


log_Z: -38.086204528808594
Z (prob): 2.879861066338888e-17

Expected counts A:
tensor([[3.5568e+16, 2.6680e+15, 3.3981e+13, 0.0000e+00],
        [2.6929e+15, 2.8827e+16, 2.1459e+15, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [9.0099e+12, 2.1709e+15, 0.0000e+00, 0.0000e+00]])

Expected counts B:
tensor([[6.2818e+16, 4.1117e+16, 2.1642e+16],
        [5.2492e+15, 6.7541e+16, 3.4567e+16],
        [0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00]])


100%|██████████| 1/1 [00:00<00:00, 331.88it/s]
INFO:eval:Cross-entropy: 1.1069 nats (= perplexity 3.025)


log_Z: -37.63473892211914
Z (prob): 4.523145350426123e-17


  0%|          | 0/1 [00:00<?, ?it/s]

log_Z: -37.63473892211914
Z (prob): 4.523145350426123e-17


100%|██████████| 1/1 [00:00<00:00, 76.93it/s]



Expected counts A:
tensor([[2.3372e+16, 1.6499e+15, 3.9290e+12, 0.0000e+00],
        [1.6531e+15, 1.8545e+16, 1.4092e+15, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [6.4254e+11, 1.4125e+15, 0.0000e+00, 0.0000e+00]])

Expected counts B:
tensor([[4.3570e+16, 1.3288e+16, 2.7800e+16],
        [3.0565e+15, 2.1327e+16, 4.2337e+16],
        [0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00]])


100%|██████████| 1/1 [00:00<00:00, 298.42it/s]
INFO:eval:Cross-entropy: 1.1065 nats (= perplexity 3.024)


log_Z: -37.620330810546875
Z (prob): 4.588787095115881e-17


  0%|          | 0/1 [00:00<?, ?it/s]

log_Z: -37.620330810546875
Z (prob): 4.588787095115881e-17


100%|██████████| 1/1 [00:00<00:00, 70.37it/s]



Expected counts A:
tensor([[2.3283e+16, 1.6348e+15, 5.7773e+11, 0.0000e+00],
        [1.6353e+15, 1.8945e+16, 1.4212e+15, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [5.9646e+10, 1.4218e+15, 0.0000e+00, 0.0000e+00]])

Expected counts B:
tensor([[4.3928e+16, 2.6956e+16, 1.4142e+16],
        [3.2185e+15, 4.2963e+16, 2.2140e+16],
        [0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00]])


100%|██████████| 1/1 [00:00<00:00, 198.58it/s]

log_Z: -37.468162536621094
Z (prob): 5.34298258437315e-17



INFO:eval:Cross-entropy: 1.1020 nats (= perplexity 3.010)
100%|██████████| 1/1 [00:00<00:00, 82.37it/s]


log_Z: -37.468162536621094
Z (prob): 5.34298258437315e-17

Expected counts A:
tensor([[1.9803e+16, 1.3847e+15, 7.5664e+10, 0.0000e+00],
        [1.3847e+15, 1.6118e+16, 1.2090e+15, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [4.7762e+09, 1.2091e+15, 0.0000e+00, 0.0000e+00]])

Expected counts B:
tensor([[3.7591e+16, 1.1412e+16, 2.3431e+16],
        [2.8796e+15, 1.8425e+16, 3.6923e+16],
        [0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00]])


100%|██████████| 1/1 [00:00<00:00, 281.29it/s]
INFO:eval:Cross-entropy: 1.1070 nats (= perplexity 3.025)
INFO:hmm:Saving model to my_hmm.pkl
INFO:hmm:Saved model to my_hmm.pkl


log_Z: -37.637115478515625
Z (prob): 4.51240887694743e-17


In [18]:
# The two literals are concatenated by Python, so the first must end with a
# space and the second must open the parenthesis that "[transposed])" closes.
log.info("*** A, B matrices after reestimation on icraw "
         "(should match final params on spreadsheet [transposed])")
hmm.printAB()

INFO:test_ic:*** A, B matrices after reestimation on icrawshould match final params on spreadsheet [transposed])


Transition matrix A:
	C	H	_EOS_TAG_	_BOS_TAG_
C	0.935	0.065	0.000	0.000
H	0.074	0.861	0.065	0.000
_EOS_TAG_	0.000	0.000	0.000	0.000
_BOS_TAG_	0.000	1.000	0.000	0.000

Emission matrix B:
	1	2	3
C	0.519	0.158	0.323
H	0.049	0.316	0.634
_EOS_TAG_	0.000	0.000	0.000
_BOS_TAG_	0.000	0.000	0.000




Now let's try out a randomly initialized CRF on the ice cream data. Notice how
the initialized A and B matrices now hold non-negative potentials,
rather than probabilities that sum to 1.

In [19]:
# Build a CRF over the same tagset and vocabulary that the HMM used.
# NOTE(review): the recorded output of this cell stops with NotImplementedError
# right after the first log message, i.e. before the second log.info() ran --
# presumably the CRF code was not yet implemented when the notebook was saved.
log.info("*** Conditional Random Field (CRF) test\n")
crf = ConditionalRandomField(icsup.tagset, icsup.vocab)
log.info("*** Current A, B matrices (potentials from small random parameters)")
crf.printAB()

INFO:test_ic:*** Conditional Random Field (CRF) test



NotImplementedError: 

Now let's try your training code, running it on supervised data. To test this,
we'll restart from a random initialization. 

Note that the logger reports the CRF's *conditional* cross-entropy,
-log p(tags | words) / n.  This is much lower than the HMM's *joint*
cross-entropy -log p(tags, words) / n, but that doesn't mean the CRF
is better at tagging.  The CRF is just predicting less information.

In [None]:
log.info("*** Supervised training on icsup")


def cross_entropy_loss(model):
    """Return the cross-entropy that model_cross_entropy computes for `model` on icsup."""
    return model_cross_entropy(model, icsup)


# Unlike the HMM training calls, this one also passes a learning rate.
crf.train(corpus=icsup, loss=cross_entropy_loss, lr=0.1, tolerance=0.0001)
log.info("*** A, B matrices after training on icsup")
crf.printAB()

Let's again tag the spreadsheet "sentence" (that is, the sequence of ice creams) 
using the Viterbi algorithm (this may not match the HMM).

In [None]:
log.info("*** Viterbi results on icraw with trained parameters")
icraw = TaggedCorpus(Path("icraw"), tagset=icsup.tagset, vocab=icsup.vocab)
# Tag with the CRF trained in the previous cell.  Passing `hmm` here would
# write the HMM's tagging into a file named for the CRF.
write_tagging(crf, icraw, Path("icraw_crf.output"))
# Print the file we just created.  The file is left on disk (nothing here
# deletes it).  Reading it in Python avoids depending on a `cat` command.
print(Path("icraw_crf.output").read_text(), end="")