# Creating a POS tagger

We can train a classifier to work out which suffixes are most informative for POS tagging. We can begin by finding out what the most common suffixes are:

In [6]:
from nltk.corpus import brown
from nltk import FreqDist

# Tally the final one, two and three characters of every lower-cased Brown
# token. Note: a token shorter than three characters produces the same slice
# more than once, so it is counted once per coinciding slice.
suffix_fdist = FreqDist()
for token in brown.words():
    lowered = token.lower()
    for length in (1, 2, 3):
        suffix_fdist[lowered[-length:]] += 1

suffix_fdist

FreqDist({'e': 202946, ',': 175002, '.': 152999, 's': 128722, 'd': 105687, 't': 94459, 'he': 92084, 'n': 87889, 'a': 74912, 'of': 72978, ...})

In [7]:
# Keep only the suffix strings of the 100 highest-count entries, in rank order.
common_suffixes = [entry[0] for entry in suffix_fdist.most_common(100)]
common_suffixes[:10]

['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of']

Next, we'll define a feature extractor function which checks a given word for these suffixes:

In [10]:
def pos_features(word, suffixes=None):
    """Build a suffix-presence feature dict for a single word.

    Args:
        word: The token to describe; it is lower-cased before matching.
        suffixes: Iterable of suffix strings to test. When omitted, the
            module-level ``common_suffixes`` list is used.

    Returns:
        A dict mapping ``'endswith(<suffix>)'`` to ``True`` or ``False``,
        with one entry per suffix, in the order the suffixes were given.
    """
    if suffixes is None:
        suffixes = common_suffixes
    # Lower-case once up front rather than once per suffix.
    lowered = word.lower()
    return {
        'endswith({})'.format(suffix): lowered.endswith(suffix)
        for suffix in suffixes
    }

# Display the feature dict produced for a sample word.
pos_features('test')

{'endswith(e)': False,
 'endswith(,)': False,
 'endswith(.)': False,
 'endswith(s)': False,
 'endswith(d)': False,
 'endswith(t)': True,
 'endswith(he)': False,
 'endswith(n)': False,
 'endswith(a)': False,
 'endswith(of)': False,
 'endswith(the)': False,
 'endswith(y)': False,
 'endswith(r)': False,
 'endswith(to)': False,
 'endswith(in)': False,
 'endswith(f)': False,
 'endswith(o)': False,
 'endswith(ed)': False,
 'endswith(nd)': False,
 'endswith(is)': False,
 'endswith(on)': False,
 'endswith(l)': False,
 'endswith(g)': False,
 'endswith(and)': False,
 'endswith(ng)': False,
 'endswith(er)': False,
 'endswith(as)': False,
 'endswith(ing)': False,
 'endswith(h)': False,
 'endswith(at)': False,
 'endswith(es)': False,
 'endswith(or)': False,
 'endswith(re)': False,
 'endswith(it)': False,
 'endswith(``)': False,
 'endswith(an)': False,
 "endswith('')": False,
 'endswith(m)': False,
 'endswith(;)': False,
 'endswith(i)': False,
 'endswith(ly)': False,
 'endswith(ion)': False,
 'endsw

Now that we've defined our feature extractor, we can use it to train a new decision tree classifier:

In [11]:
# Pair the suffix features of each tagged token in the 'news' category with
# its tag, producing (features, tag) tuples.
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(token), tag) for token, tag in tagged_words]
featuresets[0]

({'endswith(e)': True,
  'endswith(,)': False,
  'endswith(.)': False,
  'endswith(s)': False,
  'endswith(d)': False,
  'endswith(t)': False,
  'endswith(he)': True,
  'endswith(n)': False,
  'endswith(a)': False,
  'endswith(of)': False,
  'endswith(the)': True,
  'endswith(y)': False,
  'endswith(r)': False,
  'endswith(to)': False,
  'endswith(in)': False,
  'endswith(f)': False,
  'endswith(o)': False,
  'endswith(ed)': False,
  'endswith(nd)': False,
  'endswith(is)': False,
  'endswith(on)': False,
  'endswith(l)': False,
  'endswith(g)': False,
  'endswith(and)': False,
  'endswith(ng)': False,
  'endswith(er)': False,
  'endswith(as)': False,
  'endswith(ing)': False,
  'endswith(h)': False,
  'endswith(at)': False,
  'endswith(es)': False,
  'endswith(or)': False,
  'endswith(re)': False,
  'endswith(it)': False,
  'endswith(``)': False,
  'endswith(an)': False,
  "endswith('')": False,
  'endswith(m)': False,
  'endswith(;)': False,
  'endswith(i)': False,
  'endswith(ly)': 

In [13]:
from nltk import DecisionTreeClassifier
from nltk.classify import accuracy

# The first 10% of the feature sets become the test set; the remainder is
# used for training.
cutoff = int(len(featuresets) * 0.1)
test_set = featuresets[:cutoff]
train_set = featuresets[cutoff:]

In [15]:
# Display the first training example, a (features, tag) tuple.
train_set[0]

({'endswith(e)': False,
  'endswith(,)': False,
  'endswith(.)': False,
  'endswith(s)': False,
  'endswith(d)': False,
  'endswith(t)': False,
  'endswith(he)': False,
  'endswith(n)': False,
  'endswith(a)': False,
  'endswith(of)': False,
  'endswith(the)': False,
  'endswith(y)': False,
  'endswith(r)': True,
  'endswith(to)': False,
  'endswith(in)': False,
  'endswith(f)': False,
  'endswith(o)': False,
  'endswith(ed)': False,
  'endswith(nd)': False,
  'endswith(is)': False,
  'endswith(on)': False,
  'endswith(l)': False,
  'endswith(g)': False,
  'endswith(and)': False,
  'endswith(ng)': False,
  'endswith(er)': False,
  'endswith(as)': False,
  'endswith(ing)': False,
  'endswith(h)': False,
  'endswith(at)': False,
  'endswith(es)': False,
  'endswith(or)': False,
  'endswith(re)': False,
  'endswith(it)': False,
  'endswith(``)': False,
  'endswith(an)': False,
  "endswith('')": False,
  'endswith(m)': False,
  'endswith(;)': False,
  'endswith(i)': False,
  'endswith(ly)'

In [16]:
# classifier = DecisionTreeClassifier.train(train_set) # NLTK is a teaching toolkit which is not really optimized for speed. 
# Therefore, this may take forever. For speed, use scikit-learn for the classifiers.
# accuracy(classifier, test_set)

KeyboardInterrupt: 

In [None]:
# classifier.classify(pos_features('cats'))