This notebook shows how to retrain the NLTK backoff tagger.
- You'll see an example in which some recipe text has some errors in tagging, most likely because the training data did not have many examples of the target sentence structure.  
- Next, you'll see the affects of adding a few sentences of training data with the missing sentence structure on the accuracy of the tagger.
- Your assignment is to do something similar on your adopted text.


In [None]:
import nltk, re
from nltk.corpus import brown
from nltk import word_tokenize

Define functions for training and evaluating a backoff tagger.

In [None]:
def create_data_sets(sentences):
    size = int(len(sentences) * 0.9)
    train_sents = sentences[:size]
    test_sents = sentences[size:]
    return train_sents, test_sents

def build_backoff_tagger(train_sents):
    t0 = nltk.DefaultTagger('NN')
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    return t2


def train_tagger(already_tagged_sents):
    train_sents, test_sents = create_data_sets(already_tagged_sents)
    ngram_tagger = build_backoff_tagger(train_sents)
    print ("%0.3f pos accuracy on test set" % ngram_tagger.evaluate(test_sents))
    return ngram_tagger


Make a specialized function for training a tagger on the brown corpus.

In [None]:
def train_tagger_on_brown():
    brown_tagged_sents = brown.tagged_sents(categories=['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies',
    'humor', 'learned', 'lore', 'mystery', 'religion', 'reviews', 'romance','science_fiction'])
    return train_tagger(brown_tagged_sents)

Functions for creating an NLTK corpus object, so we can operate on it using nltk.tokenize_text()

In [None]:
def tokenize_text(corpus):
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sents = sent_tokenizer.tokenize(corpus) # Split text into sentences    
    return [nltk.word_tokenize(word) for word in raw_sents]

def create_corpus(f):
    with open(f, 'r') as text_file:
        new_corpus = text_file.read()
    return new_corpus


Now train and evaluate an ngram backoff tagger, using the brown corpus as the training and testing set.  (This takes a few moments to complete.)

In [48]:
brown_tagger = train_tagger_on_brown()

0.911 pos accuracy on test set


Next, read in a file of recipes and tokenize it.

In [49]:
cookbook_file = './cookbooks.txt'
cookbook_sents = tokenize_text(create_corpus(cookbook_file))

In this collection,  imperative sentences (sentences that being with a verb) are always mistagged.  The POS tagger marks the initial verb as NN instead of VB.  (There may be other kinds of errors too, but we are only looking at imperative sentences here.) In order to see the sentences where the errors are occuring, the code below finds sentences that begin with imperatives, tags them with the tagger, and returns them in a list. 

In [50]:
def get_cookbook_imperatives(sents, tagger):
    cooking_commands = ["Wash", "Stir", "Moisten", "Drain", "Cook", "Pour", "Chop", "Slice", "Season", "Mix", "Fry", "Bake", "Roast", "Wisk"]        
    return [tagger.tag(sent) for sent in sents if sent[0] in cooking_commands]       


Let's look at those sentences.

In [51]:
imperatives = get_cookbook_imperatives(cookbook_sents, brown_tagger)
imperatives[0:5]

[[('Wash', 'NN'),
  ('a', 'AT'),
  ('quarter', 'NN'),
  ('of', 'IN'),
  ('a', 'AT'),
  ('pound', 'NN'),
  ('of', 'IN'),
  ('best', 'JJT'),
  ('pearl', 'NN'),
  ('sago', 'NN'),
  ('thoroughly', 'RB'),
  (',', ','),
  ('then', 'RB'),
  ('stew', 'NN'),
  ('it', 'PPS'),
  ('quite', 'QL'),
  ('tender', 'JJ'),
  ('and', 'CC'),
  ('very', 'QL'),
  ('View', 'NN'),
  ('page', 'NN'),
  ('[', ','),
  ('32', 'CD'),
  (']', ','),
  ('thick', 'JJ'),
  ('in', 'IN'),
  ('water', 'NN'),
  ('or', 'CC'),
  ('thick', 'JJ'),
  ('broth', 'NN'),
  (';', '.'),
  ('(', '('),
  ('it', 'PPS'),
  ('will', 'MD'),
  ('require', 'VB'),
  ('nearly', 'QL'),
  ('or', 'CC'),
  ('quite', 'QL'),
  ('a', 'AT'),
  ('quart', 'NN'),
  ('of', 'IN'),
  ('liquid', 'NN'),
  (',', ','),
  ('which', 'WDT'),
  ('should', 'MD'),
  ('be', 'BE'),
  ('poured', 'VBN'),
  ('to', 'TO'),
  ('it', 'PPO'),
  ('cold', 'JJ'),
  ('and', 'CC'),
  ('heated', 'VBN'),
  ('slowly', 'RB'),
  (';', '.'),
  (')', ')'),
  ('then', 'RB'),
  ('mix', 'VB'),

Notice that most of the initial words are incorrectly tagged as nouns rather than verbs.  How can we fix this?  One way is to label a few rather generic sentences with the structure we are interested in, add them to the start of the training data, and then retrain the tagger.

In [64]:
def train_tagger_on_brown_augmented_with_cooking_sents():

    cooking_action_sents = [[('Strain', 'VB'), ('it', 'PPS'), ('well', 'RB'), ('.', '.')],
                        [('Mix', 'VB'), ('them', 'PPS'), ('well', 'RB'), ('.', '.')],
                        [('Season', 'VB'), ('them', 'PPS'), ('with', 'IN'), ('pepper', 'NN'), ('.', '.')], 
                        [('Wash', 'VB'), ('it', 'PPS'), ('well', 'RB'), ('.', '.')],
                        [('Chop', 'VB'), ('the', 'AT'), ('greens', 'NNS'), ('.', '.')],
                        [('Slice', 'VB'), ('it', 'PPS'), ('well', 'RB'), ('.', '.')],
                        [('Bake', 'VB'), ('the', 'AT'), ('cake', 'NN'), ('.', '.')],
                        [('Pour', 'VB'), ('into', 'IN'), ('a', 'AT'), ('mold', 'NN'), ('.', '.')],
                        [('Stir', 'VB'), ('the', 'AT'), ('mixture', 'NN'), ('.', '.')],
                        [('Moisten', 'VB'), ('the', 'AT'), ('grains', 'NNS'), ('.', '.')],
                        [('Cook', 'VB'), ('the', 'AT'), ('duck', 'NN'), ('.', '.')],
                        [('Drain', 'VB'), ('for', 'IN'), ('one', 'CD'), ('day', 'NN'), ('.', '.')]]

    brown_tagged_sents = brown.tagged_sents(categories=['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies',
    'humor', 'learned', 'lore', 'mystery', 'religion', 'reviews', 'romance', 'science_fiction'])
    
    #append hand-tagged cooking sentences to the front of the training data
    print(brown_tagged_sents)
    all_tagged_sents = cooking_action_sents + brown_tagged_sents
    return train_tagger(all_tagged_sents)


[[('Assembly', 'NN-HL'), ('session', 'NN-HL'), ('brought', 'VBD-HL'), ('much', 'AP-HL'), ('good', 'NN-HL')], [('The', 'AT'), ('General', 'JJ-TL'), ('Assembly', 'NN-TL'), (',', ','), ('which', 'WDT'), ('adjourns', 'VBZ'), ('today', 'NR'), (',', ','), ('has', 'HVZ'), ('performed', 'VBN'), ('in', 'IN'), ('an', 'AT'), ('atmosphere', 'NN'), ('of', 'IN'), ('crisis', 'NN'), ('and', 'CC'), ('struggle', 'NN'), ('from', 'IN'), ('the', 'AT'), ('day', 'NN'), ('it', 'PPS'), ('convened', 'VBD'), ('.', '.')], ...]
0.911 pos accuracy on test set


<BigramTagger: size=21172>

Let's retrain the tagger.

In [54]:
brown_and_cooking_tagger = train_tagger_on_brown_augmented_with_cooking_sents()


0.911 pos accuracy on test set


How well is this working on the cookbook imperatives now? Is more training data needed to change the behavior of the tagger?

In [55]:
better_imperatives = get_cookbook_imperatives(cookbook_sents, brown_and_cooking_tagger)
better_imperatives

[[('Wash', 'VB'),
  ('a', 'AT'),
  ('quarter', 'NN'),
  ('of', 'IN'),
  ('a', 'AT'),
  ('pound', 'NN'),
  ('of', 'IN'),
  ('best', 'JJT'),
  ('pearl', 'NN'),
  ('sago', 'NN'),
  ('thoroughly', 'RB'),
  (',', ','),
  ('then', 'RB'),
  ('stew', 'NN'),
  ('it', 'PPS'),
  ('quite', 'QL'),
  ('tender', 'JJ'),
  ('and', 'CC'),
  ('very', 'QL'),
  ('View', 'NN'),
  ('page', 'NN'),
  ('[', ','),
  ('32', 'CD'),
  (']', ','),
  ('thick', 'JJ'),
  ('in', 'IN'),
  ('water', 'NN'),
  ('or', 'CC'),
  ('thick', 'JJ'),
  ('broth', 'NN'),
  (';', '.'),
  ('(', '('),
  ('it', 'PPS'),
  ('will', 'MD'),
  ('require', 'VB'),
  ('nearly', 'QL'),
  ('or', 'CC'),
  ('quite', 'QL'),
  ('a', 'AT'),
  ('quart', 'NN'),
  ('of', 'IN'),
  ('liquid', 'NN'),
  (',', ','),
  ('which', 'WDT'),
  ('should', 'MD'),
  ('be', 'BE'),
  ('poured', 'VBN'),
  ('to', 'TO'),
  ('it', 'PPO'),
  ('cold', 'JJ'),
  ('and', 'CC'),
  ('heated', 'VBN'),
  ('slowly', 'RB'),
  (';', '.'),
  (')', ')'),
  ('then', 'RB'),
  ('mix', 'VB'),

It worked quite well.  It would be worth experimenting to see if it would still work if I'd supplied fewer of the cooking verbs.

##Assignment:##

Rewrite this notebook to do the following:
- Tag your adopted text with an NLTK backoff tagger
- Identify a common type of error that is amenable to fix by making a pattern of training data, similar to what we see with the recipe examples.  You'll want to focus on a particular pattern so that making a few tweaks will have a impact on the results of training.
- Show the before and after effects on the output of the tagger.  Ideally you'll see the errors get fixed not just on the specific examples you fixed, but on similar examples with different words.  In the case of recipes, imperative verbs beyond those in the hardcoded list would be fixed because the tagger would recognize the pattern that verbs can occur at the start of the sentence.

In [74]:
mybook = "the_history_of_north_america.txt"
mybook_sents = tokenize_text(create_corpus(mybook))
trained_tagger = train_tagger_on_brown()
for sent in mybook_sents[:100]:
    print(trained_tagger.tag(sent))

# Notice that "not" is tagged as "*",
def train_tagger_on_brown_augmented_with_my_sents():

    my_action_sents = [[('Where', 'BED'), ('not', 'ADV'), ('the', 'AT'), ('men', 'NNS')],
                    [('.', '.'),('why', 'WRB'), ('not', 'ADV'), ('to', 'TO'), ('.', '.')],
                    [('.', '.'),('those', 'DTS'), ('not', 'ADV'), ('of', 'IN'), ('.', '.')],
                    [('.', '.'),('every', 'AT'), ('instance', 'NN'), ('not', 'ADV'), ('.', '.')],
                    [('.', '.'),('Though', 'CS'), ('not', 'ADV'), ('so', 'QL'), ('.', '.')],
                    [('.', '.'),('it', 'PPO'), ('not', 'ADV'), ('been', 'BEN'), ('.', '.')],
                    [('.', '.'),('has', 'HVZ'), ('not', 'ADV'), ('.', '.')]]
    brown_tagged_sents = brown.tagged_sents(categories=['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies',
    'humor', 'learned', 'lore', 'mystery', 'religion', 'reviews', 'romance', 'science_fiction'])
    
    #append hand-tagged sentences to the front of the training data
    all_tagged_sents =  my_action_sents + brown_tagged_sents
    return train_tagger(all_tagged_sents)

all_tagged_sents = train_tagger_on_brown_augmented_with_my_sents()

0.911 pos accuracy on test set
[('The', 'AT'), ('History', 'NN-TL'), ('of', 'IN-TL'), ('North', 'JJ-TL'), ('America', 'NP-TL'), (':', ':'), ('The', 'AT'), ('colonization', 'NN'), ('of', 'IN'), ('the', 'AT'), ('Middle', 'JJ-TL'), ('States', 'NNS-TL'), ('...', 'NN')]
[('Guy', 'NP-TL'), ('Carleton', 'NP-TL'), ('Lee', 'NP'), (',', ','), ('F.', 'NP'), ('N.', 'NP'), ('Thorpe', 'NP'), ('THE', 'NN'), ('HISTORY', 'NN'), ('OF', 'NN'), ('NORTH', 'NN'), ('AMERICA', 'NN'), ('VOLUME', 'NN'), ('FOUR', 'NN'), ('THE', 'NN'), ('COLONIZATION', 'NN'), ('OF', 'NN'), ('THE', 'NN'), ('MIDDLE', 'NN'), ('STATES', 'NN'), ('AND', 'NN'), ('MARTLAND', 'NN'), ('FREDERICK', 'NN'), ('ROBERTSON', 'NN'), ('JONES', 'NN'), (',', ','), ('Ph.D.', 'NN'), ('OF', 'NN'), ('THE', 'NN'), ('FACULTY', 'NN'), ('OF', 'NN'), ('RRYN', 'NN'), ('MAWR', 'NN'), ('COLLEGE', 'NN'), (',', ','), ('LATE', 'NN'), ('ASSISTANT', 'NN'), ('PROFESSOR', 'NN'), ('OF', 'NN'), ('HISTORY', 'NN'), ('AND', 'NN'), ('SOCIOLOGY', 'NN'), ('AT', 'NN'), ('UNION'

In [75]:
my_tag_final = [all_tagged_sents.tag(sent) for sent in mybook_sents]
my_tag_final

[[('The', 'AT'),
  ('History', 'NN-TL'),
  ('of', 'IN-TL'),
  ('North', 'JJ-TL'),
  ('America', 'NP-TL'),
  (':', ':'),
  ('The', 'AT'),
  ('colonization', 'NN'),
  ('of', 'IN'),
  ('the', 'AT'),
  ('Middle', 'JJ-TL'),
  ('States', 'NNS-TL'),
  ('...', 'NN')],
 [('Guy', 'NP-TL'),
  ('Carleton', 'NP-TL'),
  ('Lee', 'NP'),
  (',', ','),
  ('F.', 'NP'),
  ('N.', 'NP'),
  ('Thorpe', 'NP'),
  ('THE', 'NN'),
  ('HISTORY', 'NN'),
  ('OF', 'NN'),
  ('NORTH', 'NN'),
  ('AMERICA', 'NN'),
  ('VOLUME', 'NN'),
  ('FOUR', 'NN'),
  ('THE', 'NN'),
  ('COLONIZATION', 'NN'),
  ('OF', 'NN'),
  ('THE', 'NN'),
  ('MIDDLE', 'NN'),
  ('STATES', 'NN'),
  ('AND', 'NN'),
  ('MARTLAND', 'NN'),
  ('FREDERICK', 'NN'),
  ('ROBERTSON', 'NN'),
  ('JONES', 'NN'),
  (',', ','),
  ('Ph.D.', 'NN'),
  ('OF', 'NN'),
  ('THE', 'NN'),
  ('FACULTY', 'NN'),
  ('OF', 'NN'),
  ('RRYN', 'NN'),
  ('MAWR', 'NN'),
  ('COLLEGE', 'NN'),
  (',', ','),
  ('LATE', 'NN'),
  ('ASSISTANT', 'NN'),
  ('PROFESSOR', 'NN'),
  ('OF', 'NN'),
  ('H