In [86]:
import nltk
from nltk.corpus import names
import os
import random
from operator import itemgetter
from __future__ import print_function

**Naive Bayes Classifier**

In [6]:
def gender_features(word):
    return {'last_letter': word[-1]}

In [7]:
gender_features('Shrek')

{'last_letter': 'k'}

In [20]:
names = ([(name, 'male') for name in names.words('male.txt')] +
         [(name, 'female') for name in names.words('female.txt')])
random.shuffle(names)

In [23]:
featuresets = [(gender_features(n), g) for (n, g) in names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [13]:
classifier.classify(gender_features('Neo'))

'male'

In [14]:
classifier.classify(gender_features('Trinity'))

'female'

In [24]:
print(nltk.classify.accuracy(classifier, test_set))
# Likelihood ratios
classifier.show_most_informative_features(5)

0.766
Most Informative Features
             last_letter = u'a'           female : male   =     34.5 : 1.0
             last_letter = u'k'             male : female =     32.6 : 1.0
             last_letter = u'f'             male : female =     17.2 : 1.0
             last_letter = u'v'             male : female =     11.2 : 1.0
             last_letter = u'p'             male : female =     11.2 : 1.0


In [18]:
def more_features(word):
    return {'last_letter': word[-1], 'length': len(word), 'first_letter': word[0]}

In [28]:
featuresets = [(more_features(n), g) for (n, g) in names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(5)

0.778
Most Informative Features
             last_letter = u'a'           female : male   =     34.5 : 1.0
             last_letter = u'k'             male : female =     32.6 : 1.0
             last_letter = u'f'             male : female =     17.2 : 1.0
             last_letter = u'v'             male : female =     11.2 : 1.0
             last_letter = u'p'             male : female =     11.2 : 1.0


In [26]:
def gender_features2(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features

In [29]:
# Example of overfitting especially when dealing with small training sets
featuresets = [(gender_features2(n), g) for (n, g) in names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(5)

0.746
Most Informative Features
              lastletter = u'a'           female : male   =     34.5 : 1.0
              lastletter = u'k'             male : female =     32.6 : 1.0
              lastletter = u'f'             male : female =     17.2 : 1.0
              lastletter = u'v'             male : female =     11.2 : 1.0
              lastletter = u'p'             male : female =     11.2 : 1.0


In [32]:
# Last two characters are also important features
def gender_features(word):
    return {'suffix1': word[-1:], 'suffix2': word[-2:]}

In [31]:
featuresets = [(more_features(n), g) for (n, g) in names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(5)

0.778
Most Informative Features
             last_letter = u'a'           female : male   =     34.5 : 1.0
             last_letter = u'k'             male : female =     32.6 : 1.0
             last_letter = u'f'             male : female =     17.2 : 1.0
             last_letter = u'v'             male : female =     11.2 : 1.0
             last_letter = u'p'             male : female =     11.2 : 1.0


**Document Classification**

In [36]:
from nltk.corpus import movie_reviews

In [40]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

In [50]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = [w for (w, count) in all_words.most_common(2000)]

In [55]:
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [56]:
print(list(document_features(movie_reviews.words('pos/cv957_8737.txt')).items())[:5])

[(u'contains(waste)', False), (u'contains(lot)', False), (u'contains(*)', True), (u'contains(black)', False), (u'contains(rated)', False)]


In [58]:
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

print(nltk.classify.accuracy(classifier, test_set))

0.73


In [59]:
classifier.show_most_informative_features(5)

Most Informative Features
   contains(outstanding) = True              pos : neg    =     11.1 : 1.0
         contains(mulan) = True              pos : neg    =      8.8 : 1.0
        contains(seagal) = True              neg : pos    =      8.0 : 1.0
         contains(damon) = True              pos : neg    =      7.7 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.6 : 1.0


**POS**

In [103]:
from nltk.corpus import brown
suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1

In [109]:
common_suffixes = [s for (s, c) in suffix_fdist.most_common(100)]
common_suffixes[:10]

[u'e', u',', u'.', u's', u'd', u't', u'he', u'n', u'a', u'of']

In [110]:
def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
    return features

In [114]:
tagged_words = brown.tagged_words(categories='news')[:10000]
featuresets = [(pos_features(n), g) for (n,g) in tagged_words]
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.DecisionTreeClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.684

In [115]:
classifier.classify(pos_features('cats'))

u'NNS'

In [116]:
print(classifier.pseudocode(depth=4))

if endswith(he) == False: 
  if endswith(s) == False: 
    if endswith(,) == False: 
      if endswith(.) == False: return u'``'
      if endswith(.) == True: return u'.'
    if endswith(,) == True: return u','
  if endswith(s) == True: 
    if endswith(was) == False: 
      if endswith(is) == False: return u'NN'
      if endswith(is) == True: return u'BEZ'
    if endswith(was) == True: return u'BEDZ'
if endswith(he) == True: 
  if endswith(the) == False: return u'PPS'
  if endswith(the) == True: return u'AT'



In [123]:
# Use context
def pos_features(sentence, i):
    features = {"suffix(1)": sentence[i][-1:],
            "suffix(2)": sentence[i][-2:],
            "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features

In [124]:
pos_features(brown.sents()[0], 8)

{'prev-word': u'an',
 'suffix(1)': u'n',
 'suffix(2)': u'on',
 'suffix(3)': u'ion'}

In [142]:
tagged_sents = brown.tagged_sents(categories='news')
featuresets = []
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    for i, (word, tag) in enumerate(tagged_sent):
        featuresets.append(
            (pos_features(untagged_sent, i), tag))

In [143]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
# classifier = nltk.DecisionTreeClassifier.train(train_set)

In [144]:
nltk.classify.accuracy(classifier, test_set)

0.7891596220785678

**Sequence Classification**

In [150]:
def pos_features(sentence, i, history):
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
        features["prev-tag"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
        features["prev-tag"] = history[i-1]
    return features

In [153]:
class ConsecutivePosTagger(nltk.TaggerI):
    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = pos_features(untagged_sent, i, history)
                train_set.append( (featureset, tag) )
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)
        
    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)

In [154]:
tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.1)
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
tagger = ConsecutivePosTagger(train_sents)
print(tagger.evaluate(test_sents))

0.798052851182


In [159]:
import math
def entropy(labels):
    freqdist = nltk.FreqDist(labels)
    probs = [freqdist.freq(l) for l in nltk.FreqDist(labels)]
    return -sum([p * math.log(p, 2) for p in probs])

In [162]:
print(entropy(['male', 'male', 'male', 'male']))
print(entropy(['male', 'female', 'male', 'male']))
print(entropy(['male', 'female', 'female', 'male']))

-0.0
0.811278124459
1.0
