## Choosing the right features

Overfitting: the case where the built algorithm relies on the idiosyncrasies of the training data that do not generalize well to new data. 

To avoid overfitting: use error analysis, which refines the created feature set by: 
- Selecting development set: divide this set into training and dev-test sets

Training set is used to train the model while the dev test is used to perform error analysis. The test set is the used in the final evaluation of our model 


In [71]:
#importing libraries
import nltk
from nltk.corpus import movie_reviews, names, brown
from string import ascii_lowercase
import random

In [21]:
labeled_names = ([(name, 'male') for name in names.words('male.txt')] + 
[(name, 'female') for name in names.words('female.txt')])

random.shuffle(labeled_names)

In [23]:
def gender_features(name):
    features = {}
    features["first_letter"] = name[0].lower()
    features["last_letter"] = name[-1].lower()
    for letter in ascii_lowercase:
        features[f"count({letter})"] = name.lower().count(letter)
        features[f"has({letter})"] = letter in name.lower()
    return features

In [24]:
print(gender_features("Eugene"))

{'first_letter': 'e', 'last_letter': 'e', 'count(a)': 0, 'has(a)': False, 'count(b)': 0, 'has(b)': False, 'count(c)': 0, 'has(c)': False, 'count(d)': 0, 'has(d)': False, 'count(e)': 3, 'has(e)': True, 'count(f)': 0, 'has(f)': False, 'count(g)': 1, 'has(g)': True, 'count(h)': 0, 'has(h)': False, 'count(i)': 0, 'has(i)': False, 'count(j)': 0, 'has(j)': False, 'count(k)': 0, 'has(k)': False, 'count(l)': 0, 'has(l)': False, 'count(m)': 0, 'has(m)': False, 'count(n)': 1, 'has(n)': True, 'count(o)': 0, 'has(o)': False, 'count(p)': 0, 'has(p)': False, 'count(q)': 0, 'has(q)': False, 'count(r)': 0, 'has(r)': False, 'count(s)': 0, 'has(s)': False, 'count(t)': 0, 'has(t)': False, 'count(u)': 1, 'has(u)': True, 'count(v)': 0, 'has(v)': False, 'count(w)': 0, 'has(w)': False, 'count(x)': 0, 'has(x)': False, 'count(y)': 0, 'has(y)': False, 'count(z)': 0, 'has(z)': False}


In [25]:

feature_sets = [(gender_features(n), gender) for (n, gender) in labeled_names]
train_set , test_set = feature_sets[500:], feature_sets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))


0.746


In [29]:
# Let's improve the accuracy by creating the development test set
train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names = labeled_names[:500]

train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
test_set = [(gender_features(n), gender) for (n, gender) in test_names]

classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.77


In [31]:
# let's use the dev test set to generate a list of
# errors the classifier makes when predicting name genders

errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append((tag, guess, name))

In [37]:
for (tag, guess, name) in sorted(errors):
    print(f"correct = {tag}, guess = {guess}, name = {name}")

correct = female, guess = male, name = Ambur
correct = female, guess = male, name = Aphrodite
correct = female, guess = male, name = Ardys
correct = female, guess = male, name = Berget
correct = female, guess = male, name = Betsey
correct = female, guess = male, name = Betty
correct = female, guess = male, name = Bridget
correct = female, guess = male, name = Brunhilde
correct = female, guess = male, name = Cam
correct = female, guess = male, name = Chad
correct = female, guess = male, name = Christel
correct = female, guess = male, name = Chrystal
correct = female, guess = male, name = Corey
correct = female, guess = male, name = Corliss
correct = female, guess = male, name = Corny
correct = female, guess = male, name = Demeter
correct = female, guess = male, name = Devonne
correct = female, guess = male, name = Doe
correct = female, guess = male, name = Doreen
correct = female, guess = male, name = Dotty
correct = female, guess = male, name = Enid
correct = female, guess = male, name

Looking through this list of errors makes it clear that some suffixes that are more than one letter can be indicative of name genders. For example, names ending in yn appear to be predominantly female, despite the fact that names ending in n tend to be male; and names ending in ch are usually male, even though names that end in h tend to be female. We therefore adjust our feature extractor to include features for two-letter suffixes:

In [47]:
def gender_features(word):
    return {'suffix1': word[-1:],             
            'suffix2': word[-2:]}

train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

## this error analysis process doesn't improve my model, however it can be repeated
## many times by checking patterns in the errors to imrpove the performance

0.766


## Document classification

In [64]:
documents = [(list(movie_reviews.words(fileid)), category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

## most frequent words in the corpora
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features[f" contains({word})"] = (word in document_words)
    return features

In [69]:
feature_sets = [(document_features(d), c) for (d, c) in documents]
train_set, test_set = feature_sets[100:], feature_sets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.85


In [70]:
classifier.show_most_informative_features(5)

Most Informative Features
   contains(outstanding) = True              pos : neg    =     13.8 : 1.0
        contains(seagal) = True              neg : pos    =      8.1 : 1.0
         contains(damon) = True              pos : neg    =      7.9 : 1.0
         contains(mulan) = True              pos : neg    =      7.7 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.4 : 1.0


In these documents, a review that mentions seagal is 8 times more likely to be negative than positive. Interesting.

## Part-of-Speech Tagging

In [74]:
suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1

common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)]
print(common_suffixes)

['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of', 'the', 'y', 'r', 'to', 'in', 'f', 'o', 'ed', 'nd', 'is', 'on', 'l', 'g', 'and', 'ng', 'er', 'as', 'ing', 'h', 'at', 'es', 'or', 're', 'it', '``', 'an', "''", 'm', ';', 'i', 'ly', 'ion', 'en', 'al', '?', 'nt', 'be', 'hat', 'st', 'his', 'th', 'll', 'le', 'ce', 'by', 'ts', 'me', 've', "'", 'se', 'ut', 'was', 'for', 'ent', 'ch', 'k', 'w', 'ld', '`', 'rs', 'ted', 'ere', 'her', 'ne', 'ns', 'ith', 'ad', 'ry', ')', '(', 'te', '--', 'ay', 'ty', 'ot', 'p', 'nce', "'s", 'ter', 'om', 'ss', ':', 'we', 'are', 'c', 'ers', 'uld', 'had', 'so', 'ey']


In [84]:
from unicodedata import category


def pos_features(word):
    feature = {}
    for suffix in common_suffixes:
        feature[f"endswith({suffix})"] = word.lower().endswith(suffix)
    return feature

## training a decision tree classifier
tagged_words = brown.tagged_words(categories= 'news')
feature_sets = [(pos_features(n), g) for (n, g) in tagged_words]
size = int(len(feature_sets) * 0.1) #10% of the data

train_set, test_set = feature_sets[size:], feature_sets[:size]

In [85]:
classifier = nltk.DecisionTreeClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.6270512182993535

## Exploiting context

In [87]:
def pos_features(sentence, i):
    features = {"suffix1": sentence[i][-1:], 
                "suffix2": sentence[i][-2:], 
                "suffix3": sentence[i][-3:]}
    if i == 0:
        features['prev_word'] = "<START>"
    else:
        features['prev_word'] = sentence[i-1]
    return features

pos_features(brown.sents()[0], 8)

{'suffix1': 'n', 'suffix2': 'on', 'suffix3': 'ion', 'prev_word': 'an'}

In [89]:
brown.sents()[0][-:]

['.']