## Supervised Classification

### Gender Identification

In [1]:
def gender_features(word):
    return {'last_letter': word[-1]}
    #return {'last_letter': word[-1], 'first_letter': word[1]}

gender_features('Shrek')

{'last_letter': 'k'}

In [2]:
labeled_names = ([(name.strip(), 'male') for name in open('../datasets/male.txt')] + 
                 [(name.strip(), 'female') for name in open('../datasets/female.txt')])

In [3]:
labeled_names

[('rahul', 'male'),
 ('aman', 'male'),
 ('anmol', 'male'),
 ('abhishek', 'male'),
 ('abhimanyu', 'male'),
 ('shivam', 'male'),
 ('abhinav', 'male'),
 ('shubham', 'male'),
 ('anrab', 'male'),
 ('khitij', 'male'),
 ('arihant', 'male'),
 ('arpit', 'male'),
 ('abhinay', 'male'),
 ('deepak', 'male'),
 ('karan', 'male'),
 ('arjun', 'male'),
 ('siddharth', 'male'),
 ('harsh', 'male'),
 ('vardhan', 'male'),
 ('sachin', 'male'),
 ('gautam', 'male'),
 ('virat', 'male'),
 ('virender', 'male'),
 ('suresh', 'male'),
 ('ramesh', 'male'),
 ('mahendra', 'male'),
 ('pranav', 'male'),
 ('ankit', 'male'),
 ('tanmay', 'male'),
 ('chirag', 'male'),
 ('pradeep', 'male'),
 ('bhaskar', 'male'),
 ('salman', 'male'),
 ('arbas', 'male'),
 ('sohail', 'male'),
 ('saurav', 'male'),
 ('nikhil', 'male'),
 ('naman', 'male'),
 ('aditya', 'male'),
 ('adit', 'male'),
 ('rupesh', 'male'),
 ('shushruth', 'male'),
 ('praveen', 'male'),
 ('dhruv', 'male'),
 ('hardik', 'male'),
 ('anuj', 'male'),
 ('vivek', 'male'),
 ('vishal

In [4]:
import random
random.shuffle(labeled_names)

In [5]:
import nltk
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]

In [6]:
featuresets

[({'last_letter': 't'}, 'female'),
 ({'last_letter': 'i'}, 'male'),
 ({'last_letter': 'a'}, 'male'),
 ({'last_letter': 't'}, 'male'),
 ({'last_letter': 'h'}, 'male'),
 ({'last_letter': 'i'}, 'female'),
 ({'last_letter': 'a'}, 'female'),
 ({'last_letter': 't'}, 'male'),
 ({'last_letter': 't'}, 'male'),
 ({'last_letter': 'i'}, 'female'),
 ({'last_letter': 'u'}, 'male'),
 ({'last_letter': 'l'}, 'male'),
 ({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'r'}, 'male'),
 ({'last_letter': 'm'}, 'female'),
 ({'last_letter': 'i'}, 'female'),
 ({'last_letter': 'p'}, 'male'),
 ({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'l'}, 'male'),
 ({'last_letter': 'n'}, 'male'),
 ({'last_letter': 't'}, 'male'),
 ({'last_letter': 'a'}, 'female'),
 ({'last_letter': 's'}, 'male'),
 ({'last_letter': 'y'}, 'male'),
 ({'last_letter': 'h'}, 'male'),
 ({'last_letter': 'h'}, 'male'),
 ({'last_letter': 'l'}, 'male'),
 ({'last_letter': 'h'}, 'male'),
 ({'last_letter': 'y'}, 'male'),
 ({'last_letter': 'a'}, '

In [7]:
# split feature sets into training and test sets
train_set, test_set = featuresets[500:], featuresets[:500]

# build a classifier based on the training set
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [8]:
classifier.classify(gender_features('Neo'))

'male'

In [9]:
classifier.classify(gender_features('Catherine'))

'female'

In [10]:
classifier.classify(gender_features('Trinity'))

'male'

In [11]:
# Lets evaluate the classifier on a much larger quantity of unseen data - robustness of our model
print(nltk.classify.accuracy(classifier, test_set))

0.86


In [None]:
# Precision: the number of True Positives divided by the number of True Positives and False Positives
# --> it is the number of positive predictions divided by the total number of positive class values predicted
# --> effectively a measure of a classifier's exactness (low precision indicates a large number of False Positives)


# Recall: the number of True Positives divided by the number of True Positives and the number of False Negatives
# --> number of positive predictions divided by the number of positive class values in the test data.
# --> effectively a measure of a classifier's completeness (low recall indicates many False Negatives)


# F1 Score: (also called F measure) conveys the balance between the precision and recall



In [14]:
# Finally, we can examine the classifier to determine which features it found most 
# effective for distinguishing the names' genders:

classifier.show_most_informative_features(12)

Most Informative Features
             last_letter = 'd'              male : female =     59.2 : 1.0
             last_letter = 'h'              male : female =     24.2 : 1.0
             last_letter = 't'              male : female =     19.1 : 1.0
             last_letter = 'j'              male : female =     16.7 : 1.0
             last_letter = 'r'              male : female =     16.6 : 1.0
             last_letter = 'k'              male : female =     13.4 : 1.0
             last_letter = 'g'              male : female =     11.9 : 1.0
             last_letter = 'p'              male : female =     10.7 : 1.0
             last_letter = 'n'              male : female =      8.7 : 1.0
             last_letter = 's'              male : female =      8.5 : 1.0
             last_letter = 'i'            female : male   =      5.8 : 1.0
             last_letter = 'b'              male : female =      4.6 : 1.0


In [None]:
# let's add another feature: first letter...

### Choosing Features

In [15]:
def gender_features2(name):
    features = {}
    features["first_letter"] = name[0].lower()
    features["last_letter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features

In [16]:
gender_features2('John')

{'count(a)': 0,
 'count(b)': 0,
 'count(c)': 0,
 'count(d)': 0,
 'count(e)': 0,
 'count(f)': 0,
 'count(g)': 0,
 'count(h)': 1,
 'count(i)': 0,
 'count(j)': 1,
 'count(k)': 0,
 'count(l)': 0,
 'count(m)': 0,
 'count(n)': 1,
 'count(o)': 1,
 'count(p)': 0,
 'count(q)': 0,
 'count(r)': 0,
 'count(s)': 0,
 'count(t)': 0,
 'count(u)': 0,
 'count(v)': 0,
 'count(w)': 0,
 'count(x)': 0,
 'count(y)': 0,
 'count(z)': 0,
 'first_letter': 'j',
 'has(a)': False,
 'has(b)': False,
 'has(c)': False,
 'has(d)': False,
 'has(e)': False,
 'has(f)': False,
 'has(g)': False,
 'has(h)': True,
 'has(i)': False,
 'has(j)': True,
 'has(k)': False,
 'has(l)': False,
 'has(m)': False,
 'has(n)': True,
 'has(o)': True,
 'has(p)': False,
 'has(q)': False,
 'has(r)': False,
 'has(s)': False,
 'has(t)': False,
 'has(u)': False,
 'has(v)': False,
 'has(w)': False,
 'has(x)': False,
 'has(y)': False,
 'has(z)': False,
 'last_letter': 'n'}

In [17]:
random.shuffle(labeled_names)
featuresets2 = [(gender_features2(n), gender) for (n, gender) in labeled_names]
#featuresets2[:2]

In [18]:
train_set2, test_set2 = featuresets2[500:], featuresets2[:500]
classifier2 = nltk.NaiveBayesClassifier.train(train_set2)

print(nltk.classify.accuracy(classifier2, test_set2))

0.804


In [19]:
classifier2.show_most_informative_features(15)

Most Informative Features
             last_letter = 'd'              male : female =     58.8 : 1.0
             last_letter = 'j'              male : female =     27.7 : 1.0
             last_letter = 'h'              male : female =     27.2 : 1.0
             last_letter = 'r'              male : female =     17.3 : 1.0
             last_letter = 't'              male : female =     16.7 : 1.0
             last_letter = 'k'              male : female =     14.5 : 1.0
             last_letter = 'p'              male : female =     11.5 : 1.0
             last_letter = 'g'              male : female =     11.4 : 1.0
             last_letter = 's'              male : female =     10.1 : 1.0
             last_letter = 'n'              male : female =      8.6 : 1.0
                count(v) = 2                male : female =      6.6 : 1.0
             last_letter = 'i'            female : male   =      5.9 : 1.0
                  has(z) = True           female : male   =      4.8 : 1.0

### Movie Reviews Corpus

In [None]:
# download movie_reviews from nltk

import nltk
nltk.download()

In [20]:
import nltk
from nltk.corpus import movie_reviews

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

In [21]:
documents[:10]

[([u'renee',
   u'zellweger',
   u'stars',
   u'as',
   u'sonia',
   u',',
   u'a',
   u'young',
   u'jewish',
   u'wife',
   u'and',
   u'mother',
   u'frustrated',
   u'by',
   u'the',
   u'constraints',
   u'of',
   u'her',
   u'hasidic',
   u'community',
   u'in',
   u'brooklyn',
   u'.',
   u'her',
   u'husband',
   u'(',
   u'glenn',
   u'fitzgerald',
   u')',
   u'is',
   u'a',
   u'religious',
   u'scholar',
   u'whose',
   u'all',
   u'-',
   u'in',
   u'-',
   u'a',
   u'-',
   u'day',
   u"'",
   u's',
   u'-',
   u'work',
   u'attitude',
   u'on',
   u'sex',
   u'fails',
   u'to',
   u'tame',
   u'the',
   u'"',
   u'fire',
   u'"',
   u'she',
   u'feels',
   u'within',
   u',',
   u'as',
   u'so',
   u'she',
   u'confesses',
   u'to',
   u'the',
   u'rebbe',
   u'(',
   u'after',
   u'hearing',
   u'her',
   u'fiery',
   u'confession',
   u',',
   u'the',
   u'rebbe',
   u'suddenly',
   u'gets',
   u'frisky',
   u'with',
   u'his',
   u'pleasantly',
   u'surprised',
   u'w

In [23]:
# define feature extractor
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = all_words.keys()[:2000]
word_features

[u'sucess',
 u'sonja',
 u'askew',
 u'woods',
 u'spiders',
 u'bazooms',
 u'hanging',
 u'francesca',
 u'comically',
 u'localized',
 u'disobeying',
 u'hennings',
 u'canet',
 u'scold',
 u'originality',
 u'caned',
 u'rickman',
 u'slothful',
 u'wracked',
 u'stipulate',
 u'capoeira',
 u'rawhide',
 u'taj',
 u'bringing',
 u'unsworth',
 u'liaisons',
 u'grueling',
 u'sommerset',
 u'wooden',
 u'wednesday',
 u'broiled',
 u'circuitry',
 u'crotch',
 u'elgar',
 u'stereotypical',
 u'shows',
 u'gavan',
 u'rebuilding',
 u'snuggles',
 u'francesco',
 u'feasibility',
 u'miniatures',
 u'gorman',
 u'woody',
 u'consenting',
 u'scraped',
 u'inanimate',
 u'errors',
 u'reopens',
 u'cooking',
 u'fonzie',
 u'opportunists',
 u'islamic',
 u'joely',
 u'designing',
 u'numeral',
 u'succumb',
 u'shocks',
 u'chins',
 u'crooned',
 u'jubilantly',
 u'rocque',
 u'ching',
 u'china',
 u'shandling',
 u'confronts',
 u'wiseguy',
 u'natured',
 u'existentialist',
 u'kids',
 u'uplifting',
 u'controversy',
 u'crowdpleasing',
 u'neurol

In [24]:
# identify words from our word_feature array
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [25]:
print(document_features(movie_reviews.words('pos/cv957_8737.txt'))) 



In [26]:
# train classifier - to label new movie reviews

featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [27]:
print(nltk.classify.accuracy(classifier, test_set))

0.69


In [28]:
classifier.show_most_informative_features(5)

Most Informative Features
     contains(uplifting) = True              pos : neg    =      8.1 : 1.0
          contains(sans) = True              neg : pos    =      7.8 : 1.0
          contains(hugo) = True              pos : neg    =      7.5 : 1.0
    contains(mediocrity) = True              neg : pos    =      7.1 : 1.0
     contains(dismissed) = True              pos : neg    =      6.9 : 1.0
