In [1]:
# Classification and Feature Sets in the NLTK, Part 2

# POS Tagging Classifier

# for each item to be classified build the features of that item as a dictionary that maps each feature name to a value
# A feature set is the feature dictionary together with the label of the item to be classified

In [2]:
# by looking at suffixes of words and building features

import nltk
from nltk.corpus import brown

suffix_fdist = nltk.FreqDist()

for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] +=1 # suffixes of length 1
    suffix_fdist[word[-2:]] +=1 # suffixes of length 2
    suffix_fdist[word[-3:]] +=1 # suffixes of length 3

common_suffixes = []

for suffix in suffix_fdist.most_common(100):
    common_suffixes.append(str(suffix.__getitem__(0)))

print (common_suffixes)

['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of', 'the', 'y', 'r', 'to', 'in', 'f', 'o', 'ed', 'nd', 'is', 'on', 'l', 'g', 'and', 'ng', 'er', 'as', 'ing', 'h', 'at', 'es', 'or', 're', 'it', '``', 'an', "''", 'm', ';', 'i', 'ly', 'ion', 'en', 'al', '?', 'nt', 'be', 'hat', 'st', 'his', 'th', 'll', 'le', 'ce', 'by', 'ts', 'me', 've', "'", 'se', 'ut', 'was', 'for', 'ent', 'ch', 'k', 'w', 'ld', '`', 'rs', 'ted', 'ere', 'her', 'ne', 'ns', 'ith', 'ad', 'ry', ')', '(', 'te', '--', 'ay', 'ty', 'ot', 'p', 'nce', "'s", 'ter', 'om', 'ss', ':', 'we', 'are', 'c', 'ers', 'uld', 'had', 'so', 'ey']


In [3]:
# a function that will take a word and create the features for that word

def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith(%s)' % suffix]=word.lower().endswith(suffix)
    return features

In [4]:
pos_features('lovely')

{'endswith(e)': False,
 'endswith(,)': False,
 'endswith(.)': False,
 'endswith(s)': False,
 'endswith(d)': False,
 'endswith(t)': False,
 'endswith(he)': False,
 'endswith(n)': False,
 'endswith(a)': False,
 'endswith(of)': False,
 'endswith(the)': False,
 'endswith(y)': True,
 'endswith(r)': False,
 'endswith(to)': False,
 'endswith(in)': False,
 'endswith(f)': False,
 'endswith(o)': False,
 'endswith(ed)': False,
 'endswith(nd)': False,
 'endswith(is)': False,
 'endswith(on)': False,
 'endswith(l)': False,
 'endswith(g)': False,
 'endswith(and)': False,
 'endswith(ng)': False,
 'endswith(er)': False,
 'endswith(as)': False,
 'endswith(ing)': False,
 'endswith(h)': False,
 'endswith(at)': False,
 'endswith(es)': False,
 'endswith(or)': False,
 'endswith(re)': False,
 'endswith(it)': False,
 'endswith(``)': False,
 'endswith(an)': False,
 "endswith('')": False,
 'endswith(m)': False,
 'endswith(;)': False,
 'endswith(i)': False,
 'endswith(ly)': True,
 'endswith(ion)': False,
 'endswi

In [5]:
# a new POS feature function that takes an entire sentence 
# use the previous word in the sentence

def pos_features(sentence, i):
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features

In [6]:
brown.sents()[0]

['The',
 'Fulton',
 'County',
 'Grand',
 'Jury',
 'said',
 'Friday',
 'an',
 'investigation',
 'of',
 "Atlanta's",
 'recent',
 'primary',
 'election',
 'produced',
 '``',
 'no',
 'evidence',
 "''",
 'that',
 'any',
 'irregularities',
 'took',
 'place',
 '.']

In [7]:
brown.sents()[0][8]

'investigation'

In [8]:
pos_features(brown.sents()[0], 8)

{'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'an'}

In [9]:
tagged_sents = brown.tagged_sents(categories='news')
featuresets = []

for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent) # get an untagged sentence
    for i, (word, tag) in enumerate(tagged_sent): # get a list that pairs the index number of each word with the word and tag
        featuresets.append( (pos_features(untagged_sent, i), tag))

In [10]:
for f in featuresets[:5]:
    print (f)

({'suffix(1)': 'e', 'suffix(2)': 'he', 'suffix(3)': 'The', 'prev-word': '<START>'}, 'AT')
({'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ton', 'prev-word': 'The'}, 'NP-TL')
({'suffix(1)': 'y', 'suffix(2)': 'ty', 'suffix(3)': 'nty', 'prev-word': 'Fulton'}, 'NN-TL')
({'suffix(1)': 'd', 'suffix(2)': 'nd', 'suffix(3)': 'and', 'prev-word': 'County'}, 'JJ-TL')
({'suffix(1)': 'y', 'suffix(2)': 'ry', 'suffix(3)': 'ury', 'prev-word': 'Grand'}, 'NN-TL')


In [11]:
# separate corpus into training and test sets
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]

print('Size of train_set: ' + str(len(train_set)))
print('Size of test_set: ' + str(len(test_set)))

Size of train_set: 90499
Size of test_set: 10055


In [12]:
# train a Naïve Bayes classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.7891596220785678

In [13]:
# Sentence Segmentation

# a classification task that labels each punctuation symbol that could end a sentence with whether it does end a sentence or not

In [14]:
# sents() function -: return a list where each element is a sentence represented as a list of tokens
sents = nltk.corpus.treebank_raw.sents()
sents[:2]

[['.', 'START'],
 ['Pierre',
  'Vinken',
  ',',
  '61',
  'years',
  'old',
  ',',
  'will',
  'join',
  'the',
  'board',
  'as',
  'a',
  'nonexecutive',
  'director',
  'Nov',
  '.',
  '29',
  '.']]

In [15]:
# to get data for classifiying without sentence boundaries, 
# merge the sentences into one long list of tokens for classifying
# keep track of the index numbers where the sentence boundaries are

tokens = [ ]
boundaries = set()
offset = 0

for sent in nltk.corpus.treebank_raw.sents():
    tokens.extend(sent)
    offset += len(sent)
    boundaries.add(offset - 1)

In [16]:
tokens[:5]

['.', 'START', 'Pierre', 'Vinken', ',']

In [17]:
# a feature extractor function that works on each token
# it returns
# capitalized next word
# previous word
# the actual token (called ‘punct’)
# if the previous word is one character long

def punct_features(tokens, i):
    return {'next-word-capitalized': tokens[i+1][0].isupper(),
            'prevword': tokens[i-1].lower(),
            'punct': tokens[i],
            'prev-word-is-one-char': len(tokens[i-1]) == 1}

In [18]:
# for any token that is potential sentence ender,
# i.e. a “.”, “?”, or “!”
# build a feature set for that occurrence of the token
Sfeaturesets = [(punct_features(tokens, i), (i in boundaries))
for i in range(1, len(tokens) - 1)
                if tokens[i] in '.?!']

In [19]:
# separate the feature sets into training and test sets
size = int(len(Sfeaturesets) * 0.1)
Strain_set, Stest_set = Sfeaturesets[size:], Sfeaturesets[:size]
Sclassifier = nltk.NaiveBayesClassifier.train(Strain_set)
nltk.classify.accuracy(Sclassifier, Stest_set)

0.9461279461279462

In [20]:
# take list of tokens/words and whenever one is a “.”, “?”, or “!”, we apply the classifier to label it. 
# If it is an end of sentence marker, we have a special START symbol into the stream of tokens

def segment_sentences(words):
    start = 0
    sents = []
    for i, word in enumerate(words):
        if word in '.?!' and Sclassifier.classify(punct_features(words, i)) == True:
            sents.append(words[start:i+1])
            start = i+1
    if start < len(words):
        sents.append(words[start:])
    return sents

In [21]:
len(tokens)

101797

In [22]:
smalltokens = tokens[:50]

In [23]:
for s in segment_sentences(smalltokens):
    print (s)

['.']
['START', 'Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov', '.', '29', '.', 'Mr', '.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', 'N', '.', 'V', '.,', 'the', 'Dutch', 'publishing', 'group', '.']
['.', 'START', 'Rudolph', 'Agnew', ',', '55', 'years', 'old', 'and', 'former', 'chairman', 'of', 'Consolidated']


In [24]:
# Text Classification (aka Text Categorization)

# items to be classified are documents (datasets that label each document with a topic category)
# each document is labeled either ‘pos’ for positive or ‘neg’ for negative

In [25]:
from nltk.corpus import movie_reviews
import random
movie_reviews.categories()

['neg', 'pos']

In [26]:
# movie review documents are separated into file directories by category
# create the list of documents where each document is paired with its label

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)
            ]

In [27]:
random.shuffle(documents)

In [28]:
documents[0]

(['having',
  'not',
  'seen',
  ',',
  '"',
  'who',
  'framed',
  'roger',
  'rabbit',
  '"',
  'in',
  'over',
  '10',
  'years',
  ',',
  'and',
  'not',
  'remembering',
  'much',
  'besides',
  'that',
  'i',
  'liked',
  'it',
  'then',
  ',',
  'i',
  'decided',
  'to',
  'rent',
  'it',
  'recently',
  '.',
  'watching',
  'it',
  'i',
  'was',
  'struck',
  'by',
  'just',
  'how',
  'brilliant',
  'a',
  'film',
  'it',
  'is',
  '.',
  'aside',
  'from',
  'the',
  'fact',
  'that',
  'it',
  "'",
  's',
  'a',
  'milestone',
  'in',
  'animation',
  'in',
  'movies',
  '(',
  'it',
  "'",
  's',
  'the',
  'first',
  'film',
  'to',
  'combine',
  'real',
  'actors',
  'and',
  'cartoon',
  'characters',
  ',',
  'have',
  'them',
  'interact',
  ',',
  'and',
  'make',
  'it',
  'convincingly',
  'real',
  ')',
  'and',
  'a',
  'great',
  'entertainment',
  'it',
  "'",
  's',
  'also',
  'quite',
  'an',
  'effective',
  'comedy',
  '/',
  'mystery',
  '.',
  'while',
 

In [29]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words.keys())[:2000]

In [30]:
word_features[:5]

['plot', ':', 'two', 'teen', 'couples']

In [31]:
# define the features for each document
# feature label will be ‘contains(keyword)’ for each keyword in the word_features set
# value of the feature will be Boolean

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [32]:
featuresets = [(document_features(d), c) for (d,c) in documents]
# featuresets[0]

In [33]:
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print (nltk.classify.accuracy(classifier, test_set))

0.86


In [34]:
# top ranked features according to the ratio of one label to the other one
classifier.show_most_informative_features(20)

Most Informative Features
 contains(unimaginative) = True              neg : pos    =      8.2 : 1.0
    contains(schumacher) = True              neg : pos    =      7.3 : 1.0
          contains(mena) = True              neg : pos    =      6.9 : 1.0
        contains(suvari) = True              neg : pos    =      6.9 : 1.0
        contains(turkey) = True              neg : pos    =      6.5 : 1.0
       contains(singers) = True              pos : neg    =      6.4 : 1.0
        contains(shoddy) = True              neg : pos    =      6.3 : 1.0
     contains(atrocious) = True              neg : pos    =      6.1 : 1.0
       contains(miscast) = True              neg : pos    =      5.8 : 1.0
       contains(unravel) = True              pos : neg    =      5.7 : 1.0
        contains(justin) = True              neg : pos    =      5.7 : 1.0
           contains(ugh) = True              neg : pos    =      5.7 : 1.0
       contains(bronson) = True              neg : pos    =      5.6 : 1.0

In [35]:
# EXERCISE

In [36]:
new_documents = [(list(movie_reviews.words(fileid)), category)
for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)]

In [37]:
new_documents[0]

(['plot',
  ':',
  'two',
  'teen',
  'couples',
  'go',
  'to',
  'a',
  'church',
  'party',
  ',',
  'drink',
  'and',
  'then',
  'drive',
  '.',
  'they',
  'get',
  'into',
  'an',
  'accident',
  '.',
  'one',
  'of',
  'the',
  'guys',
  'dies',
  ',',
  'but',
  'his',
  'girlfriend',
  'continues',
  'to',
  'see',
  'him',
  'in',
  'her',
  'life',
  ',',
  'and',
  'has',
  'nightmares',
  '.',
  'what',
  "'",
  's',
  'the',
  'deal',
  '?',
  'watch',
  'the',
  'movie',
  'and',
  '"',
  'sorta',
  '"',
  'find',
  'out',
  '.',
  '.',
  '.',
  'critique',
  ':',
  'a',
  'mind',
  '-',
  'fuck',
  'movie',
  'for',
  'the',
  'teen',
  'generation',
  'that',
  'touches',
  'on',
  'a',
  'very',
  'cool',
  'idea',
  ',',
  'but',
  'presents',
  'it',
  'in',
  'a',
  'very',
  'bad',
  'package',
  '.',
  'which',
  'is',
  'what',
  'makes',
  'this',
  'review',
  'an',
  'even',
  'harder',
  'one',
  'to',
  'write',
  ',',
  'since',
  'i',
  'generally',
  'a

In [38]:
new_word_features = list(all_words.keys())[2000:3000]
# selecting only 1000 words
new_word_features[:5]

['works', 'usually', 'schlock', 'halfway', 'goodnight']

In [39]:
new_featuresets = [(document_features(d), c) for (d,c) in new_documents]

In [40]:
new_train_set, new_test_set = new_featuresets[100:], new_featuresets[:100]
new_classifier = nltk.NaiveBayesClassifier.train(new_train_set)
print (nltk.classify.accuracy(new_classifier, new_test_set))

0.78


In [41]:
# When using 2000 words, the accuracy is 0.82
# But, when using 1000 words the accuracy has dropped to 0.78