# 6. Learning to Classify Text

## 6.1 Supervised Classification

In [1]:
import nltk

In [2]:
def gender_features(word):
    return {'last_letter': word[-1]}

In [3]:
gender_features('Shrek')

{'last_letter': 'k'}

In [4]:
from nltk.corpus import names
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
    [(name, 'female') for name in names.words('female.txt')])

In [5]:
import random
random.shuffle(labeled_names)
print(labeled_names[:20])

[('Alayne', 'female'), ('Winne', 'female'), ('Felix', 'male'), ('Ruella', 'female'), ('Rowena', 'female'), ('Hallie', 'female'), ('Rosario', 'female'), ('Barbee', 'female'), ('Shurlock', 'male'), ('Esma', 'female'), ('Cecil', 'female'), ('Ximenez', 'male'), ('Noble', 'male'), ('Fiona', 'female'), ('Jessee', 'male'), ('Fancie', 'female'), ('Abbott', 'male'), ('Alyce', 'female'), ('Derrin', 'male'), ('Kelley', 'male')]


In [6]:
featuresets = [(gender_features(n), g) for (n,g) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [7]:
classifier.classify(gender_features('Neo'))

'male'

In [8]:
classifier.classify(gender_features('Trinity'))

'female'

In [9]:
print (nltk.classify.accuracy(classifier, test_set))

0.73


In [10]:
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = 'a'            female : male   =     33.2 : 1.0
             last_letter = 'k'              male : female =     29.3 : 1.0
             last_letter = 'f'              male : female =     15.9 : 1.0
             last_letter = 'p'              male : female =     12.5 : 1.0
             last_letter = 'v'              male : female =     11.2 : 1.0


In [11]:
from nltk.classify import apply_features
train_set = apply_features(gender_features, labeled_names[500:])
test_set = apply_features(gender_features, labeled_names[:500])

In [12]:
def gender_features2(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features


In [13]:
gender_features2('John')

{'count(a)': 0,
 'count(b)': 0,
 'count(c)': 0,
 'count(d)': 0,
 'count(e)': 0,
 'count(f)': 0,
 'count(g)': 0,
 'count(h)': 1,
 'count(i)': 0,
 'count(j)': 1,
 'count(k)': 0,
 'count(l)': 0,
 'count(m)': 0,
 'count(n)': 1,
 'count(o)': 1,
 'count(p)': 0,
 'count(q)': 0,
 'count(r)': 0,
 'count(s)': 0,
 'count(t)': 0,
 'count(u)': 0,
 'count(v)': 0,
 'count(w)': 0,
 'count(x)': 0,
 'count(y)': 0,
 'count(z)': 0,
 'firstletter': 'j',
 'has(a)': False,
 'has(b)': False,
 'has(c)': False,
 'has(d)': False,
 'has(e)': False,
 'has(f)': False,
 'has(g)': False,
 'has(h)': True,
 'has(i)': False,
 'has(j)': True,
 'has(k)': False,
 'has(l)': False,
 'has(m)': False,
 'has(n)': True,
 'has(o)': True,
 'has(p)': False,
 'has(q)': False,
 'has(r)': False,
 'has(s)': False,
 'has(t)': False,
 'has(u)': False,
 'has(v)': False,
 'has(w)': False,
 'has(x)': False,
 'has(y)': False,
 'has(z)': False,
 'lastletter': 'n'}

In [14]:
featuresets = [(gender_features2(n), g) for (n,g) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print (nltk.classify.accuracy(classifier, test_set))

0.756


In [15]:
train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names = labeled_names[:500]

In [16]:
train_set = [(gender_features(n), g) for (n,g) in train_names]
devtest_set = [(gender_features(n), g) for (n,g) in devtest_names]
test_set = [(gender_features(n), g) for (n,g) in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print (nltk.classify.accuracy(classifier, devtest_set))

0.754


In [17]:
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append( (tag, guess, name) )
len(errors)

246

In [18]:
for (tag, guess, name) in sorted(errors):
    print ('correct={:<8} guess={:<8s} name={:<30}'.format(tag, guess, name))

correct=female   guess=male     name=Adrian                        
correct=female   guess=male     name=Ag                            
correct=female   guess=male     name=Ailyn                         
correct=female   guess=male     name=Alex                          
correct=female   guess=male     name=Alis                          
correct=female   guess=male     name=Alison                        
correct=female   guess=male     name=Alisun                        
correct=female   guess=male     name=Arlyn                         
correct=female   guess=male     name=Berget                        
correct=female   guess=male     name=Bette-Ann                     
correct=female   guess=male     name=Brier                         
correct=female   guess=male     name=Brigit                        
correct=female   guess=male     name=Carol-Jean                    
correct=female   guess=male     name=Carolann                      
correct=female   guess=male     name=Charis     

In [19]:
def gender_features(word):
    return {'suffix1': word[-1:], 'suffix2': word[-2:]}

In [20]:
train_set = [(gender_features(n), g) for (n,g) in train_names]
devtest_set = [(gender_features(n), g) for (n,g) in devtest_names]
test_set = [(gender_features(n), g) for (n,g) in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print (nltk.classify.accuracy(classifier, devtest_set))

0.777


In [21]:
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

In [22]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words.keys())[:2000]

In [23]:
list(all_words.keys())[:20]

['pollock',
 'whammy',
 'land',
 'spared',
 'overachieving',
 'mesh',
 'hairpiece',
 'soho',
 'focussed',
 'jamahl',
 'singe',
 'unlikable',
 'incredulity',
 'veloz',
 'lope',
 'clear',
 'impulse',
 'demesne',
 'unstable',
 'disengaging']

In [24]:
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [25]:
print (document_features(movie_reviews.words('pos/cv957_8737.txt')))

{'contains(families)': False, 'contains(hotties)': False, 'contains(dialectic)': False, 'contains(lowbrow)': False, 'contains(repessed)': False, 'contains(chimp)': False, 'contains(wwii)': False, 'contains(comprehending)': False, 'contains(treacherous)': False, 'contains(tantrum)': False, 'contains(fever)': False, 'contains(undisputed)': False, 'contains(leit)': False, 'contains(cherbourg)': False, 'contains(chats)': False, 'contains(fairy)': False, 'contains(natasha)': False, 'contains(cooperate)': False, 'contains(antarctic)': False, 'contains(biehn)': False, 'contains(opponent)': False, 'contains(reasons)': False, 'contains(pictorial)': False, 'contains(extravaganza)': False, 'contains(charmless)': False, 'contains(supergenius)': False, 'contains(diddly)': False, 'contains(gleam)': False, 'contains(astronomically)': False, 'contains(videoshelves)': False, 'contains(ratchet)': False, 'contains(mesh)': False, 'contains(manhole)': False, 'contains(serves)': False, 'contains(baffles)': 

In [26]:
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [27]:
print (nltk.classify.accuracy(classifier, test_set))

0.64


In [28]:
classifier.show_most_informative_features(10)

Most Informative Features
          contains(slip) = True              pos : neg    =     11.8 : 1.0
          contains(3000) = True              neg : pos    =     10.9 : 1.0
     contains(uplifting) = True              pos : neg    =      8.7 : 1.0
  contains(refreshingly) = True              pos : neg    =      8.4 : 1.0
      contains(reminder) = True              pos : neg    =      8.4 : 1.0
      contains(republic) = True              pos : neg    =      6.4 : 1.0
     contains(pregnancy) = True              neg : pos    =      6.3 : 1.0
        contains(kombat) = True              neg : pos    =      6.3 : 1.0
         contains(flynt) = True              pos : neg    =      5.7 : 1.0
          contains(noah) = True              pos : neg    =      5.7 : 1.0


In [29]:
from nltk.corpus import brown
suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]]+=1
    suffix_fdist[word[-2:]]+=1
    suffix_fdist[word[-3:]]+=1

In [30]:
common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)]

In [31]:
print (common_suffixes)

['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of', 'the', 'y', 'r', 'to', 'in', 'f', 'o', 'ed', 'nd', 'is', 'on', 'l', 'g', 'and', 'ng', 'er', 'as', 'ing', 'h', 'at', 'es', 'or', 're', 'it', '``', 'an', "''", 'm', ';', 'i', 'ly', 'ion', 'en', 'al', '?', 'nt', 'be', 'hat', 'st', 'his', 'th', 'll', 'le', 'ce', 'by', 'ts', 'me', 've', "'", 'se', 'ut', 'was', 'for', 'ent', 'ch', 'k', 'w', 'ld', '`', 'rs', 'ted', 'ere', 'her', 'ne', 'ns', 'ith', 'ad', 'ry', ')', '(', 'te', '--', 'ay', 'ty', 'ot', 'p', 'nce', "'s", 'ter', 'om', 'ss', ':', 'we', 'are', 'c', 'ers', 'uld', 'had', 'so', 'ey']


In [32]:
def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith({})'.format(suffix)] = word.lower().endswith(suffix)

    return features

In [33]:
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for (n,g) in tagged_words]

In [34]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]

In [35]:
classifier = nltk.DecisionTreeClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.6270512182993535

In [36]:
classifier.classify(pos_features('cats'))

'NNS'

In [37]:
print (classifier.pseudocode(depth=4))

if endswith(the) == False: 
  if endswith(,) == False: 
    if endswith(s) == False: 
      if endswith(.) == False: return '.'
      if endswith(.) == True: return '.'
    if endswith(s) == True: 
      if endswith(is) == False: return 'PP$'
      if endswith(is) == True: return 'BEZ'
  if endswith(,) == True: return ','
if endswith(the) == True: return 'AT'



In [38]:
def pos_features(sentence, i):
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]

    return features

In [39]:
pos_features(brown.sents()[0], 8)

{'prev-word': 'an', 'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion'}

In [40]:
tagged_sents = brown.tagged_sents(categories='news')
featuresets = []

In [41]:
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    for i, (word, tag) in enumerate(tagged_sent):
        featuresets.append((pos_features(untagged_sent, i), tag) )

In [42]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.7891596220785678

In [43]:
def pos_features(sentence, i, history):
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
        features["prev-tag"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
        features["prev-tag"] = history[i-1]
    return features

class ConsecutivePosTagger(nltk.TaggerI):

    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = pos_features(untagged_sent, i, history)
                train_set.append( (featureset, tag) )
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)

In [44]:
tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.1)
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
tagger = ConsecutivePosTagger(train_sents)
print(tagger.evaluate(test_sents))

0.7980528511821975


## 6.2 Further Examples of Supervised Classification

In [45]:
sents = nltk.corpus.treebank_raw.sents()
tokens = []
boundaries = set()
offset = 0
for sent in sents:
    tokens.extend(sent)
    offset += len(sent)
    boundaries.add(offset-1)

In [46]:
def punct_features(tokens, i):
    return {'next-word-capitalized': tokens[i+1][0].isupper(),
            'prev-word': tokens[i-1].lower(),
            'punct': tokens[i],
            'prev-word-is-one-char': len(tokens[i-1]) == 1}

In [47]:
featuresets = [(punct_features(tokens, i), (i in boundaries))
               for i in range(1, len(tokens)-1)
               if tokens[i] in '.?!']

In [48]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.936026936026936

In [49]:
def segment_sentences(words):
    start = 0
    sents = []
    for i, word in enumerate(words):
        if word in '.?!' and classifier.classify(punct_features(words, i)) == True:
            sents.append(words[start:i+1])
            start = i+1
    if start < len(words):
        sents.append(words[start:])
    return sents

In [50]:
posts = nltk.corpus.nps_chat.xml_posts()[:10000]

In [51]:
def dialogue_act_features(post):
    features = {}
    for word in nltk.word_tokenize(post):
        features['contains({})'.format(word.lower())] = True
    return features

In [52]:
featuresets = [(dialogue_act_features(post.text), post.get('class'))
               for post in posts]
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.668


In [53]:
def rte_features(rtepair):
    extractor = nltk.RTEFeatureExtractor(rtepair)
    features = {}
    features['word_overlap'] = len(extractor.overlap('word'))
    features['word_hyp_extra'] = len(extractor.hyp_extra('word'))
    features['ne_overlap'] = len(extractor.overlap('ne'))
    features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))
    return features

In [54]:
rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]
extractor = nltk.RTEFeatureExtractor(rtepair)
print(extractor.text_words)

{'Russia', 'terrorism.', 'binds', 'operation', 'former', 'Soviet', 'fledgling', 'fight', 'Davudi', 'Co', 'four', 'Parviz', 'that', 'meeting', 'Organisation', 'China', 'association', 'together', 'Asia', 'was', 'central', 'Shanghai', 'representing', 'republics', 'SCO', 'at', 'Iran'}


In [55]:
print(extractor.hyp_words)

{'member', 'China', 'SCO.'}


In [56]:
print(extractor.overlap('word'))

set()


In [57]:
print(extractor.overlap('ne'))

{'China'}


In [58]:
print(extractor.hyp_extra('word'))

{'member'}


## 6.3 Evaluation

In [59]:
import random
from nltk.corpus import brown

In [60]:
tagged_sents = list(brown.tagged_sents(categories='news'))
random.shuffle(tagged_sents)
size = int(len(tagged_sents) * 0.1)
train_set, test_set = tagged_sents[size:], tagged_sents[:size]

In [61]:
tagged_sents[0]

[('Meanwhile', 'RB'),
 (',', ','),
 ('there', 'EX'),
 ('appears', 'VBZ'),
 ('to', 'TO'),
 ('be', 'BE'),
 ('enough', 'AP'),
 ('money', 'NN'),
 ('in', 'IN'),
 ('the', 'AT'),
 ("road's", 'NN$'),
 ('reserve', 'NN'),
 ('fund', 'NN'),
 ('to', 'TO'),
 ('cover', 'VB'),
 ('the', 'AT'),
 ('interest', 'NN'),
 ('deficiency', 'NN'),
 ('for', 'IN'),
 ('eight', 'CD'),
 ('more', 'AP'),
 ('years', 'NNS'),
 ('.', '.')]

In [62]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

ValueError: too many values to unpack (expected 2)

In [63]:
print('Accuracy: {:4.2f}'.format(nltk.classify.accuracy(classifier, test_set))) 

ValueError: too many values to unpack (expected 2)

In [64]:
file_ids = brown.fileids(categories='news')
size = int(len(file_ids) * 0.1)
train_set = brown.tagged_sents(file_ids[size:])
test_set = brown.tagged_sents(file_ids[:size])

In [65]:
classifier = nltk.NaiveBayesClassifier.train(train_set) 
print('Accuracy: {:4.2f}'.format(nltk.classify.accuracy(classifier, test_set))) 

ValueError: too many values to unpack (expected 2)

In [66]:
train_set = brown.tagged_sents(categories='news')
test_set = brown.tagged_sents(categories='fiction')

In [67]:
classifier = nltk.NaiveBayesClassifier.train(train_set) 
print('Accuracy: {:4.2f}'.format(nltk.classify.accuracy(classifier, test_set))) 

ValueError: too many values to unpack (expected 2)

In [68]:
train_set[0]

[('The', 'AT'),
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('Grand', 'JJ-TL'),
 ('Jury', 'NN-TL'),
 ('said', 'VBD'),
 ('Friday', 'NR'),
 ('an', 'AT'),
 ('investigation', 'NN'),
 ('of', 'IN'),
 ("Atlanta's", 'NP$'),
 ('recent', 'JJ'),
 ('primary', 'NN'),
 ('election', 'NN'),
 ('produced', 'VBD'),
 ('``', '``'),
 ('no', 'AT'),
 ('evidence', 'NN'),
 ("''", "''"),
 ('that', 'CS'),
 ('any', 'DTI'),
 ('irregularities', 'NNS'),
 ('took', 'VBD'),
 ('place', 'NN'),
 ('.', '.')]

In [70]:
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)

In [71]:
def tag_list(tagged_sents):
    return [tag for sent in tagged_sents for (word, tag) in sent]


In [72]:
def apply_tagger(tagger, corpus):
    return [tagger.tag(nltk.tag.untag(sent)) for sent in corpus]


In [73]:
gold = tag_list(brown.tagged_sents(categories='editorial'))
test = tag_list(apply_tagger(t2, brown.tagged_sents(categories='editorial')))


In [74]:
cm = nltk.ConfusionMatrix(gold, test)


In [75]:
print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9))

    |                                         N                      |
    |      N      I      A      J             N             V      N |
    |      N      N      T      J      .      S      ,      B      P |
----+----------------------------------------------------------------+
 NN | <11.8%>  0.0%      .   0.2%      .   0.0%      .   0.3%   0.0% |
 IN |   0.0%  <9.0%>     .      .      .   0.0%      .      .      . |
 AT |      .      .  <8.6%>     .      .      .      .      .      . |
 JJ |   1.6%      .      .  <4.0%>     .      .      .   0.0%   0.0% |
  . |      .      .      .      .  <4.8%>     .      .      .      . |
NNS |   1.5%      .      .      .      .  <3.3%>     .      .   0.0% |
  , |      .      .      .      .      .      .  <4.4%>     .      . |
 VB |   0.9%      .      .   0.0%      .      .      .  <2.4%>     . |
 NP |   1.0%      .      .   0.0%      .      .      .      .  <1.9%>|
----+----------------------------------------------------------------+
(row =

## 6.4 Decision Trees

In [76]:
import math
def entropy(labels):
    freqdist = nltk.FreqDist(labels)
    probs = [freqdist.freq(l) for l in freqdist]
    return -sum(p * math.log(p,2) for p in probs)

In [77]:
print(entropy(['male', 'male', 'male', 'male']))
print(entropy(['male', 'female', 'male', 'male']))
print(entropy(['female', 'male', 'female', 'male']))
print(entropy(['female', 'female', 'male', 'female']))
print(entropy(['female', 'female', 'female', 'female'])) 

-0.0
0.8112781244591328
1.0
0.8112781244591328
-0.0


## 6.5 Naive Bayes Classifiers

# 6.6 Maximum Entropy Classifiers

## 6.7 Modeling Linguistic Patterns