In [1]:
import base64
import collections
import datetime
import itertools
import json
import operator
import os
import pickle
import random
import re
import sys
import time

import matplotlib as mpl
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
#import PIL
import pylab
import scipy
#import seaborn as sns
import sklearn
from sklearn import *
import statsmodels as sm
import xgboost as xgb

# Seed every RNG the notebook relies on. np.random.seed() only affects NumPy;
# the train/test split below is produced with random.shuffle(), which draws
# from the stdlib `random` module and must be seeded separately for the
# notebook to be reproducible under "Restart & Run All".
SEED = 1337
np.random.seed(SEED)
random.seed(SEED)

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Seaborn styling is currently disabled (the seaborn import is commented out too).
#sns.set(font_scale=1.0)
# Default size for every figure, given as (width, height) in inches.
mpl.rcParams['figure.figsize'] = 10, 6
#sns.set_style('whitegrid')
#sns.set_palette(sns.color_palette('muted'))



# Learning to Classify Text

## Supervised Classification

### Gender Identification

In [9]:
def gender_features(word):
    """Extract simple surface features from a name for gender classification.

    Parameters
    ----------
    word : str
        The name to describe.

    Returns
    -------
    dict
        ``first_letter`` and ``last_letter`` (case preserved; both are the
        empty string when ``word`` is empty) and ``length``.
    """
    return {
        # Slices instead of indexing: an empty string yields '' rather than
        # raising IndexError, so one bad token cannot abort feature extraction.
        'first_letter': word[:1],
        'last_letter': word[-1:],
        'length': len(word)}

gender_features('Shrek')

{'first_letter': 'S', 'last_letter': 'k', 'length': 5}

In [10]:
# Label every name from the NLTK names corpus with the gender of the file it
# came from (male entries first, then female), then shuffle in place so that
# positional slices of `names` are mixed-gender samples.
male_names = nltk.corpus.names.words('male.txt')
female_names = nltk.corpus.names.words('female.txt')
names = [(name, 'male') for name in male_names]
names += [(name, 'female') for name in female_names]
random.shuffle(names)

In [11]:
# Turn every (name, gender) pair into a (feature dict, gender) pair.
featuresets = [(gender_features(n), g) for (n, g) in names]
# The first 500 shuffled examples are held out for testing; the rest train.
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [12]:
# Classify two names that are not in the corpus.
# print(...) with a single argument behaves identically on Python 2 and is
# also valid on Python 3, where the bare print statement is a SyntaxError.
print(classifier.classify(gender_features('Neo')))
print(classifier.classify(gender_features('Trinity')))

male
male


In [13]:
# Accuracy on the 500 held-out names. Parenthesised print works on both
# Python 2 and Python 3.
print(nltk.classify.accuracy(classifier, test_set))

0.796


In [14]:
# Print the five features with the highest likelihood ratios between labels.
classifier.show_most_informative_features(n=5)

Most Informative Features
             last_letter = u'a'           female : male   =     33.1 : 1.0
             last_letter = u'k'             male : female =     30.8 : 1.0
             last_letter = u'f'             male : female =     17.3 : 1.0
             last_letter = u'p'             male : female =     12.6 : 1.0
             last_letter = u'v'             male : female =     10.5 : 1.0


In [15]:
# Materialising one list with the feature dict of every instance can exhaust
# memory on large corpora. nltk.classify.apply_features avoids that: it returns
# a list-like object that computes each feature set on access instead of
# storing them all up front.
train_set, test_set = (
    nltk.classify.apply_features(gender_features, names[500:]),
    nltk.classify.apply_features(gender_features, names[:500]),
)

### Choosing the Right Features

In [17]:
def gender_features2(name):
    """Extract first/last letter plus per-letter count and presence features.

    Parameters
    ----------
    name : str
        The name to describe. Matching is case-insensitive.

    Returns
    -------
    dict
        ``firstletter`` and ``lastletter`` (lower-cased; empty strings when
        ``name`` is empty), and for each letter a-z a ``count(<letter>)``
        integer and a ``has(<letter>)`` boolean.
    """
    # Lower-case once instead of on every loop iteration.
    lowered = name.lower()
    features = {
        # Slices rather than indexing so an empty name does not raise IndexError.
        "firstletter": name[:1].lower(),
        "lastletter": name[-1:].lower(),
    }
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        occurrences = lowered.count(letter)
        features["count(%s)" % letter] = occurrences
        features["has(%s)" % letter] = occurrences > 0
    return features

In [19]:
# Inspect the feature dict for one name. Parenthesised print works on both
# Python 2 and Python 3.
print(gender_features2('John'))

{'count(u)': 0, 'has(d)': False, 'count(b)': 0, 'count(w)': 0, 'has(b)': False, 'count(l)': 0, 'count(q)': 0, 'count(n)': 1, 'has(j)': True, 'count(s)': 0, 'count(h)': 1, 'has(h)': True, 'has(y)': False, 'count(j)': 1, 'has(f)': False, 'has(o)': True, 'count(x)': 0, 'has(m)': False, 'count(z)': 0, 'has(k)': False, 'has(u)': False, 'count(d)': 0, 'has(s)': False, 'count(f)': 0, 'lastletter': 'n', 'has(q)': False, 'has(w)': False, 'has(e)': False, 'has(z)': False, 'count(t)': 0, 'count(c)': 0, 'has(c)': False, 'has(x)': False, 'count(v)': 0, 'count(m)': 0, 'has(a)': False, 'has(v)': False, 'count(p)': 0, 'count(o)': 1, 'has(i)': False, 'count(i)': 0, 'has(r)': False, 'has(g)': False, 'count(k)': 0, 'firstletter': 'j', 'count(y)': 0, 'has(n)': True, 'has(l)': False, 'count(e)': 0, 'has(t)': False, 'count(g)': 0, 'count(r)': 0, 'count(a)': 0, 'has(p)': False}


In [20]:
# Re-train with the richer per-letter features and score on the same
# 500-name hold-out.
featuresets = [(gender_features2(n), g) for (n, g) in names]
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
# print(...) with one argument is valid on both Python 2 and Python 3; the
# bare print statement is a SyntaxError on Python 3.
print(nltk.classify.accuracy(classifier, test_set))

0.782


In [21]:
# Three-way split of the shuffled names: the first 500 are the final test set,
# the next 1000 are a dev-test set for error analysis, the remainder trains.
test_names, devtest_names, train_names = names[:500], names[500:1500], names[1500:]

In [22]:
# Featurise each split with the basic extractor, in train / dev-test / test
# order, then fit the classifier on the training portion only.
train_set, devtest_set, test_set = (
    [(gender_features(n), g) for (n, g) in subset]
    for subset in (train_names, devtest_names, test_names)
)
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [23]:
# Accuracy on the dev-test split. Parenthesised print works on both Python 2
# and Python 3.
print(nltk.classify.accuracy(classifier, devtest_set))

0.776


In [24]:
# Gather every dev-test name the classifier gets wrong, stored as
# (true label, predicted label, name) so that sorting groups by true label.
errors = []
for name, tag in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess == tag:
        continue  # correctly classified; nothing to record
    errors.append((tag, guess, name))

In [30]:
# Show the first ten misclassified dev-test names in (tag, guess, name) order.
# print(...) with a single formatted string is valid on Python 2 and Python 3.
for (tag, guess, name) in sorted(errors)[:10]:
    print('correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name))

correct=female   guess=male     name=Addis                         
correct=female   guess=male     name=Amber                         
correct=female   guess=male     name=Ambur                         
correct=female   guess=male     name=Anet                          
correct=female   guess=male     name=Ardys                         
correct=female   guess=male     name=Blondell                      
correct=female   guess=male     name=Brett                         
correct=female   guess=male     name=Brigid                        
correct=female   guess=male     name=Cal                           
correct=female   guess=male     name=Candis                        


In [27]:
# Looking through this list of errors makes it clear that some suffixes that are more
# than one letter can be indicative of name genders. For example, names ending in yn
# appear to be predominantly female, despite the fact that names ending in n tend to be
# male; and names ending in ch are usually male, even though names that end in h tend
# to be female. We therefore adjust our feature extractor to include features for
# two-letter suffixes.

In [31]:
def gender_features3(name):
    """Extract letter features plus one- and two-letter suffix features.

    Parameters
    ----------
    name : str
        The name to describe. All features are case-insensitive.

    Returns
    -------
    dict
        ``firstletter``, ``lastletter``, ``suffix1`` and ``suffix2``
        (lower-cased; empty strings when ``name`` is empty), and for each
        letter a-z a ``count(<letter>)`` integer and a ``has(<letter>)``
        boolean.
    """
    # Lower-case once instead of on every loop iteration.
    lowered = name.lower()
    features = {
        # Slices rather than indexing so an empty name does not raise IndexError.
        "firstletter": name[:1].lower(),
        "lastletter": name[-1:].lower(),
        # Suffixes are lower-cased like every other feature; otherwise short
        # names such as "Al" would yield a different suffix2 than "Hal".
        "suffix1": name[-1:].lower(),
        "suffix2": name[-2:].lower(),
    }
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        occurrences = lowered.count(letter)
        features["count(%s)" % letter] = occurrences
        features["has(%s)" % letter] = occurrences > 0
    return features

In [32]:
# Re-train with the suffix-aware features and score on the same 500-name
# hold-out.
featuresets = [(gender_features3(n), g) for (n, g) in names]
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
# print(...) with one argument is valid on both Python 2 and Python 3; the
# bare print statement is a SyntaxError on Python 3.
print(nltk.classify.accuracy(classifier, test_set))

0.812


### Document Classification