## Choosing the right features

Overfitting: the case where the built algorithm relies on the idiosyncrasies of the training data that do not generalize well to new data. 

To avoid overfitting: use error analysis, which refines the created feature set by: 
- Selecting development set: divide this set into training and dev-test sets

Training set is used to train the model while the dev test is used to perform error analysis. The test set is the used in the final evaluation of our model 


In [20]:
#importing libraries
import nltk
from nltk.corpus import movie_reviews, names
from string import ascii_lowercase
import random

In [21]:
labeled_names = ([(name, 'male') for name in names.words('male.txt')] + 
[(name, 'female') for name in names.words('female.txt')])

random.shuffle(labeled_names)

In [23]:
def gender_features(name):
    features = {}
    features["first_letter"] = name[0].lower()
    features["last_letter"] = name[-1].lower()
    for letter in ascii_lowercase:
        features[f"count({letter})"] = name.lower().count(letter)
        features[f"has({letter})"] = letter in name.lower()
    return features

In [24]:
print(gender_features("Eugene"))

{'first_letter': 'e', 'last_letter': 'e', 'count(a)': 0, 'has(a)': False, 'count(b)': 0, 'has(b)': False, 'count(c)': 0, 'has(c)': False, 'count(d)': 0, 'has(d)': False, 'count(e)': 3, 'has(e)': True, 'count(f)': 0, 'has(f)': False, 'count(g)': 1, 'has(g)': True, 'count(h)': 0, 'has(h)': False, 'count(i)': 0, 'has(i)': False, 'count(j)': 0, 'has(j)': False, 'count(k)': 0, 'has(k)': False, 'count(l)': 0, 'has(l)': False, 'count(m)': 0, 'has(m)': False, 'count(n)': 1, 'has(n)': True, 'count(o)': 0, 'has(o)': False, 'count(p)': 0, 'has(p)': False, 'count(q)': 0, 'has(q)': False, 'count(r)': 0, 'has(r)': False, 'count(s)': 0, 'has(s)': False, 'count(t)': 0, 'has(t)': False, 'count(u)': 1, 'has(u)': True, 'count(v)': 0, 'has(v)': False, 'count(w)': 0, 'has(w)': False, 'count(x)': 0, 'has(x)': False, 'count(y)': 0, 'has(y)': False, 'count(z)': 0, 'has(z)': False}


In [25]:

feature_sets = [(gender_features(n), gender) for (n, gender) in labeled_names]
train_set , test_set = feature_sets[500:], feature_sets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))


0.746


In [29]:
# Let's improve the accuracy by creating the development test set
train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names = labeled_names[:500]

train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
test_set = [(gender_features(n), gender) for (n, gender) in test_names]

classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.77


In [31]:
# let's use the dev test set to generate a list of errors the classifier makes when predicting name genders

errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append((tag, guess, name))

In [37]:
for (tag, guess, name) in sorted(errors):
    print(f"correct = {tag}, guess = {guess}, name = {name}")

correct = female, guess = male, name = Ambur
correct = female, guess = male, name = Aphrodite
correct = female, guess = male, name = Ardys
correct = female, guess = male, name = Berget
correct = female, guess = male, name = Betsey
correct = female, guess = male, name = Betty
correct = female, guess = male, name = Bridget
correct = female, guess = male, name = Brunhilde
correct = female, guess = male, name = Cam
correct = female, guess = male, name = Chad
correct = female, guess = male, name = Christel
correct = female, guess = male, name = Chrystal
correct = female, guess = male, name = Corey
correct = female, guess = male, name = Corliss
correct = female, guess = male, name = Corny
correct = female, guess = male, name = Demeter
correct = female, guess = male, name = Devonne
correct = female, guess = male, name = Doe
correct = female, guess = male, name = Doreen
correct = female, guess = male, name = Dotty
correct = female, guess = male, name = Enid
correct = female, guess = male, name

Looking through this list of errors makes it clear that some suffixes that are more than one letter can be indicative of name genders. For example, names ending in yn appear to be predominantly female, despite the fact that names ending in n tend to be male; and names ending in ch are usually male, even though names that end in h tend to be female. We therefore adjust our feature extractor to include features for two-letter suffixes:

In [39]:
def gender_features(word):
    return {'suffix1': word[-1:], 
            'suffix2': word[-2:]}

train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_set]
test_set = [(gender_features(n), gender) for (n, gender) in test_set]



TypeError: unhashable type: 'slice'

In [19]:
documents = [(list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)]

LookupError: 
**********************************************************************
  Resource [93mmovie_reviews[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('movie_reviews')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/movie_reviews[0m

  Searched in:
    - '/Users/eugen/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.10/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.10/share/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.10/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [13]:
nltk.download('movie_reviews')

[nltk_data] Error loading movie_reviews: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>


False

In [15]:
nltk.download('movie_reviews')

[nltk_data] Error loading movie_reviews: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>


False

In [16]:
import nltk
nltk.download('movie_reviews')

[nltk_data] Error loading movie_reviews: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>


False

In [1]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

: 