## Supervised Classification

### Gender Identification

In [None]:
def gender_features(word):
    """Extract the first- and last-letter features of a name.

    Args:
        word: The name to extract features from.

    Returns:
        A dict with the keys ``'last_letter'`` and ``'first_letter'``.
        Both values are empty strings when ``word`` is empty.
    """
    # Slice instead of index so an empty name (e.g. a blank line in a names
    # file) yields empty features rather than raising IndexError.
    return {'last_letter': word[-1:], 'first_letter': word[:1]}

# Try the feature extractor on a sample name (bare expression, so the cell echoes the dict).
gender_features('Shrek')

In [None]:
# Load the labelled names, one name per line: all 'male' entries first, then
# all 'female' entries. The `with` block guarantees both file handles are
# closed as soon as the lists have been built.
with open('../datasets/male.txt') as male_file, \
        open('../datasets/female.txt') as female_file:
    labeled_names = ([(name.strip(), 'male') for name in male_file] +
                     [(name.strip(), 'female') for name in female_file])

In [None]:
# Peek at the first ten (name, label) pairs.
labeled_names[:10]

In [None]:
import random
# Shuffle in place: the list is built with every 'male' entry before every
# 'female' entry, and the train/test split further down is a plain slice.
random.shuffle(labeled_names)

In [None]:
import nltk
# Pair each name's feature dict with its gender label.
featuresets = [
    (gender_features(name), label)
    for name, label in labeled_names
]

In [None]:
# Peek at the first ten (feature dict, label) pairs.
featuresets[:10]

In [None]:
# Hold out the first 500 feature sets for testing; train on everything after them.
test_set = featuresets[:500]
train_set = featuresets[500:]

# Fit a Naive Bayes classifier on the training portion only.
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [None]:
# Predict a label for a single name, using the same feature extractor as in training.
classifier.classify(gender_features('Neo'))

In [None]:
# Predict a label for another sample name.
classifier.classify(gender_features('Catherine'))

In [None]:
# Predict a label for a third sample name.
classifier.classify(gender_features('Supreet'))

In [None]:
# Let's evaluate the classifier on the held-out test set (data it did not see
# during training) to gauge how well the model generalizes.
print(nltk.classify.accuracy(classifier, test_set))

In [None]:
# Precision: the number of True Positives divided by the number of True Positives and False Positives
# --> it is the number of correct positive predictions divided by the total number of positive predictions made
# --> effectively a measure of a classifier's exactness (low precision indicates a large number of False Positives)


# Recall: the number of True Positives divided by the number of True Positives and the number of False Negatives
# --> number of correct positive predictions divided by the number of positive class values in the test data.
# --> effectively a measure of a classifier's completeness (low recall indicates many False Negatives)


# F1 Score (also called the F-measure): the harmonic mean of precision and recall,
# F1 = 2 * (precision * recall) / (precision + recall); it conveys the balance between the two


In [None]:
# Finally, we can examine the classifier to determine which features it found most
# effective for distinguishing the names' genders (here, the top 12):

classifier.show_most_informative_features(12)

In [None]:
# List every feature set whose name ends in 'd', together with its label.
for features, label in featuresets:
    if features['last_letter'] == 'd':
        # A print() call on a pre-formatted string is valid on Python 3 and
        # emits the same "features label" text on Python 2 as well.
        print('%s %s' % (features, label))


In [None]:
# let's add another feature: first letter...

### Choosing Features

In [None]:
def gender_features2(name):
    """Build a richer, case-insensitive feature dict for a name.

    Args:
        name: The name to extract features from.

    Returns:
        A dict containing ``"first_letter"`` and ``"last_letter"`` (lower-cased;
        empty strings when ``name`` is empty) plus, for every letter a-z,
        ``"count(<letter>)"`` (number of occurrences) and ``"has(<letter>)"``
        (whether the letter occurs at all).
    """
    # Lower-case once up front instead of on every loop iteration.
    lowered = name.lower()
    features = {}
    # Slice rather than index so an empty name does not raise IndexError.
    features["first_letter"] = name[:1].lower()
    features["last_letter"] = name[-1:].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = lowered.count(letter)
        features["has(%s)" % letter] = (letter in lowered)
    return features

In [None]:
# Try the richer feature extractor on a sample name.
gender_features2('John')

In [None]:
# Reshuffle in place, then rebuild the (feature dict, label) pairs using the
# richer extractor.
random.shuffle(labeled_names)
featuresets2 = [
    (gender_features2(name), label)
    for name, label in labeled_names
]


In [None]:
# Inspect the first (feature dict, label) pair.
featuresets2[0]

In [None]:
# Same split as before: first 500 feature sets for testing, the rest for training.
test_set2 = featuresets2[:500]
train_set2 = featuresets2[500:]
classifier2 = nltk.NaiveBayesClassifier.train(train_set2)

# Accuracy of the second classifier on its own held-out slice.
print(nltk.classify.accuracy(classifier2, test_set2))

In [None]:
# Show the 15 features the second classifier found most informative.
classifier2.show_most_informative_features(15)

### Movie Reviews Corpus

In [None]:
# download movie_reviews from nltk

import nltk
# Name the corpus explicitly: a bare nltk.download() opens the interactive
# downloader instead of fetching the movie_reviews corpus directly.
nltk.download('movie_reviews')

In [None]:
import nltk
from nltk.corpus import movie_reviews

# Collect one (token list, category) pair per review file, category by category.
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        tokens = list(movie_reviews.words(fileid))
        documents.append((tokens, category))

# Shuffle in place so the slice-based split used later mixes the categories.
random.shuffle(documents)

In [None]:
# Peek at the first ten (token list, category) pairs.
documents[:10]

In [None]:
# define feature extractor
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
# Use the 2000 most frequent words as the vocabulary. most_common() returns
# (word, count) pairs in descending count order; slicing .keys() fails on
# Python 3 (key views are not subscriptable) and gives no frequency ordering.
word_features = [word for word, _count in all_words.most_common(2000)]
word_features

In [None]:
# identify words from our word_feature array
def document_features(document):
    """Map every vocabulary word to whether it occurs in ``document``.

    Args:
        document: An iterable of word tokens.

    Returns:
        A dict with one ``'contains(<word>)'`` key per entry of the
        module-level ``word_features``, each mapped to a bool.
    """
    # Build the set once so each membership test below is a hash lookup.
    present = set(document)
    return {'contains(%s)' % word: (word in present) for word in word_features}

In [None]:
# Show the feature dict extracted from a single review file.
print(document_features(movie_reviews.words('pos/cv957_8737.txt'))) 

In [None]:
# train classifier - to label new movie reviews

# Turn each (token list, category) document into a (feature dict, category) pair.
featuresets = [
    (document_features(tokens), label)
    for tokens, label in documents
]
# Hold out the first 100 feature sets for testing; train on the remainder.
test_set = featuresets[:100]
train_set = featuresets[100:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [None]:
# Accuracy of the review classifier on its held-out test set.
print(nltk.classify.accuracy(classifier, test_set))

In [None]:
# Show the 5 features the review classifier found most informative.
classifier.show_most_informative_features(5)