<a href="https://colab.research.google.com/github/dinesh-umkc/kdm/blob/main/Gender_Prediction_by_Name_ICP2_a.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Name Gender Identifier

In [12]:
# Feature extractor
def gender_features(word):
    """Return the simplest feature set for a name: its final character.

    The character is used exactly as it appears (no case folding).
    ``word`` must be non-empty; an empty string raises ``IndexError``.
    """
    final_char = word[-1]
    return dict(last_letter=final_char)
def gender_features2(name):
    """Build a letter-based feature set for a name.

    Features produced:
      * ``first_letter`` / ``last_letter`` -- the lower-cased first and last
        character of ``name``.
      * ``count(x)`` -- how many times ASCII letter ``x`` occurs in the
        lower-cased name, for every ``x`` in a-z.
      * ``has(x)`` -- whether letter ``x`` occurs at all.

    ``name`` must be non-empty; an empty string raises ``IndexError``.
    """
    # Lower-case the name once instead of twice per alphabet letter.
    lowered = name.lower()
    features = {
        "first_letter": name[0].lower(),
        "last_letter": name[-1].lower(),
    }
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        occurrences = lowered.count(letter)
        features["count({})".format(letter)] = occurrences
        # A single letter is "in" the string exactly when its count is non-zero.
        features["has({})".format(letter)] = occurrences > 0
    return features
def gender_features3(word):
    """Build a suffix- and letter-based feature set for a name.

    Features produced:
      * ``first_letter`` -- the lower-cased first character.
      * ``suffix1`` / ``suffix2`` / ``suffix3`` -- the lower-cased last one,
        two and three characters (shorter names simply yield shorter suffixes).
      * ``count(x)`` -- how many times ASCII letter ``x`` occurs in the
        lower-cased name, for every ``x`` in a-z.
      * ``has(x)`` -- whether letter ``x`` occurs at all.

    ``word`` must be non-empty; an empty string raises ``IndexError``.
    """
    # Lower-case the name once instead of twice per alphabet letter.
    lowered = word.lower()
    features = {
        "first_letter": word[0].lower(),
        "suffix1": word[-1].lower(),
        "suffix2": word[-2:].lower(),
        "suffix3": word[-3:].lower(),
    }
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        occurrences = lowered.count(letter)
        features["count({})".format(letter)] = occurrences
        # A single letter is "in" the string exactly when its count is non-zero.
        features["has({})".format(letter)] = occurrences > 0
    return features

import nltk
import random

# Fetch the NLTK "names" corpus before reading from it.
nltk.download('names')
from nltk.corpus import names

# Pair every name with the gender of the corpus file it came from
# (all female entries first, then all male entries).
labeled_names = [
    (name, label)
    for corpus_file, label in (('female.txt', 'female'), ('male.txt', 'male'))
    for name in names.words(corpus_file)
]

# Shuffle in place so that a plain index split further down yields
# training and test portions that mix both genders.
random.shuffle(labeled_names)
labeled_names[:5]

# Turn each (name, gender) pair into a (feature dict, gender) pair,
# once per feature extractor. All three lists keep the order of labeled_names.
featuresets = [
    (gender_features(person), label)
    for person, label in labeled_names
]
featuresets2 = [
    (gender_features2(person), label)
    for person, label in labeled_names
]
featuresets3 = [
    (gender_features3(person), label)
    for person, label in labeled_names
]
featuresets3[:5]

# 80/20 split by position: the first 80% of each feature-set list is used
# for training and the remainder for testing.
TRAIN_SET_SIZE = round(0.8 * len(featuresets3))

train_set = featuresets[:TRAIN_SET_SIZE]
test_set = featuresets[TRAIN_SET_SIZE:]

train_set2 = featuresets2[:TRAIN_SET_SIZE]
test_set2 = featuresets2[TRAIN_SET_SIZE:]

train_set3 = featuresets3[:TRAIN_SET_SIZE]
test_set3 = featuresets3[TRAIN_SET_SIZE:]

from nltk import NaiveBayesClassifier
from nltk.classify import accuracy

# Train one Naive Bayes model per feature extractor.
classifier = NaiveBayesClassifier.train(train_set)
classifier2 = NaiveBayesClassifier.train(train_set2)
classifier3 = NaiveBayesClassifier.train(train_set3)

# Report held-out accuracy for each model, rounded to two decimals.
nb_accuracy_1 = round(accuracy(classifier, test_set), 2)
print('Classifier 1 Accuraccy: ' + str(nb_accuracy_1))
nb_accuracy_2 = round(accuracy(classifier2, test_set2), 2)
print('Classifier 2 Accuraccy: ' + str(nb_accuracy_2))
nb_accuracy_3 = round(accuracy(classifier3, test_set3), 2)
print('Classifier 3 Accuraccy: ' + str(nb_accuracy_3))

# Inspect what the richest model learned, then try it on a single name.
classifier3.show_most_informative_features(15)
classifier3.classify(gender_features3('Madison'))

import collections


def _index_sets_by_label(model, labeled_featuresets):
    """Group item positions by true label and by the label ``model`` predicts.

    Args:
        model: any object with a ``classify(featureset)`` method.
        labeled_featuresets: iterable of ``(featureset, true_label)`` pairs.

    Returns:
        ``(reference, predicted)`` -- two ``defaultdict(set)`` objects. Each
        maps a label to the set of positions (indices into
        ``labeled_featuresets``) that carry that label: ``reference`` by the
        true label, ``predicted`` by the classifier's output. This is the
        set-based input the precision/recall/F-measure functions consume.
    """
    reference = collections.defaultdict(set)  # y true
    predicted = collections.defaultdict(set)  # y pred
    for index, (feats, true_label) in enumerate(labeled_featuresets):
        reference[true_label].add(index)
        predicted[model.classify(feats)].add(index)
    return reference, predicted


# Classifier 1
refsets, testsets = _index_sets_by_label(classifier, test_set)

# Classifier 2
refsets2, testsets2 = _index_sets_by_label(classifier2, test_set2)

# Classifier 3
refsets3, testsets3 = _index_sets_by_label(classifier3, test_set3)

from nltk.metrics.scores import (precision, recall, f_measure)


def _rounded(score, ndigits=2):
    """Round ``score`` for display, passing ``None`` through unchanged.

    The NLTK scoring functions return ``None`` when a score is undefined
    (for example, precision for a label the classifier never predicted);
    calling ``round(None, 2)`` directly would raise ``TypeError``.
    """
    return None if score is None else round(score, ndigits)


def _gender_scores(reference, predicted):
    """Return per-gender scores in report order.

    The tuple is (female precision, male precision, female recall,
    male recall, female F1, male F1), each rounded to two decimals or
    ``None`` when the score is undefined.
    """
    return tuple(
        _rounded(metric(reference[gender], predicted[gender]))
        for metric in (precision, recall, f_measure)
        for gender in ('female', 'male')
    )


args = _gender_scores(refsets, testsets)
args2 = _gender_scores(refsets2, testsets2)
args3 = _gender_scores(refsets3, testsets3)

# Positional fields 0-5, 6-11 and 12-17 are filled from args, args2 and args3.
print('''
CLASSIFIER 1
------------ 
Female precision: {0}
Male precision: {1}
Female recall: {2}
Male recall: {3}
Female F1 score: {4}
Male F1 score: {5}

CLASSIFIER 2
------------ 
Female precision: {6}
Male precision: {7}
Female recall: {8}
Male recall: {9}
Female F1 score: {10}
Male F1 score: {11}

CLASSIFIER 3
------------ 
Female precision: {12}
Male precision: {13}
Female recall: {14}
Male recall: {15}
Female F1 score: {16}
Male F1 score: {17}
'''.format(*args, *args2, *args3))

# Split the raw (name, gender) pairs at the same index used for the feature
# sets, so that test_names[i] is the name behind test_set3[i]. Reusing
# TRAIN_SET_SIZE keeps the two splits from drifting apart.
train_names, test_names = labeled_names[:TRAIN_SET_SIZE], labeled_names[TRAIN_SET_SIZE:]

# Collect (true gender, predicted gender, name) for every held-out name
# that classifier3 gets wrong.
errors = []
for (name, tag) in test_names:
    guess = classifier3.classify(gender_features3(name))
    if guess != tag:
        errors.append((tag, guess, name))

errors[:5]

[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Package names is already up-to-date!


Classifier 1 Accuraccy: 0.76
Classifier 2 Accuraccy: 0.78
Classifier 3 Accuraccy: 0.82
Most Informative Features
                 suffix2 = 'na'           female : male   =     86.9 : 1.0
                 suffix2 = 'la'           female : male   =     62.1 : 1.0
                 suffix2 = 'ia'           female : male   =     45.2 : 1.0
                 suffix3 = 'ard'            male : female =     42.8 : 1.0
                 suffix1 = 'a'            female : male   =     39.0 : 1.0
                 suffix2 = 'rd'             male : female =     38.2 : 1.0
                 suffix1 = 'k'              male : female =     37.5 : 1.0
                 suffix2 = 'us'             male : female =     32.8 : 1.0
                 suffix2 = 'ra'           female : male   =     31.2 : 1.0
                 suffix2 = 'sa'           female : male   =     31.1 : 1.0
                 suffix3 = 'tta'          female : male   =     23.5 : 1.0
                 suffix2 = 'ta'           female : male   =   

[('female', 'male', 'Rory'),
 ('male', 'female', 'Julie'),
 ('male', 'female', 'Verney'),
 ('female', 'male', 'Marry'),
 ('male', 'female', 'Sebastien')]

## More classifiers

Scikit-learn (sklearn) is a popular library which features various classification, regression and clustering algorithms including support vector machines, random forests, gradient boosting, k-means and DBSCAN.

NLTK provides an API to quickly use sklearn classifiers in `nltk.classify.scikitlearn`. The other option is to import and use sklearn directly.

For an example of integrating sklearn with NLTK, you can check out [this](https://www.kaggle.com/alvations/basic-nlp-with-nltk) notebook on Kaggle. Kaggle is a great website for NLP and machine learning in general; creating an account is highly recommended.


Scikit Learn:
```python
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB() 

# To train the classifier, simply do
clf.fit(train_set, train_tags) 
```

NLTK: 

```
ConditionalExponentialClassifier


MaxentClassifier

NaiveBayesClassifier

WekaClassifier
```

## Maximum entropy classifier

Scikit-learn (sklearn) is a popular library which features various classification, regression and clustering algorithms including support vector machines, random forests, gradient boosting, k-means and DBSCAN.

NLTK provides an API to quickly use sklearn classifiers in `nltk.classify.scikitlearn`. The other option is to import and use sklearn directly.

For an example of integrating sklearn with NLTK, you can check out [this](https://www.kaggle.com/alvations/basic-nlp-with-nltk) notebook on Kaggle. Kaggle is a great website for NLP and machine learning in general; creating an account is highly recommended.

The principle of **maximum entropy** states that the probability distribution which best represents the current state of knowledge is the one with largest entropy.

The principle of maximum entropy is invoked when we have some piece(s) of information about a probability distribution, but not enough to characterize it completely—likely because we do not have the means or resources to do so. As an example, if all we know about a distribution is its average, we can imagine infinitely many shapes that yield that particular average. The principle of maximum entropy says that we should humbly choose the distribution that maximizes the amount of unpredictability contained in the distribution.

Taking the idea to the extreme, it wouldn’t be scientific to choose a distribution that simply yields the average value 100% of the time.

From all the models that fit our training data, the Maximum Entropy classifier selects the one which has the largest entropy. Because the Maximum Entropy classifier makes minimal assumptions, it is usually used when we don’t know anything about the prior distributions and when it is unsafe to make any assumptions. The maximum entropy classifier is also used when we can’t assume the conditional independence of the features.

In [None]:
from nltk import MaxentClassifier

# Train a maximum-entropy model on the same features as classifier3.
# max_iter has default value 100; it is capped at 25 here. Original author's
# note: in this example, test-set accuracy starts improving significantly
# beyond the previous model's at around 25 iterations.
me_classifier = MaxentClassifier.train(
    train_set3,
    max_iter=25,
)

My training improved when I used another classifier
```python
 ==> Training (25 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.370
             2          -0.60155        0.630
             3          -0.57970        0.630
             4          -0.55965        0.637
             5          -0.54133        0.666
             6          -0.52462        0.701
             7          -0.50938        0.730
             8          -0.49549        0.753
             9          -0.48283        0.765
            10          -0.47126        0.778
            11          -0.46068        0.787
            12          -0.45098        0.792
            13          -0.44208        0.798
            14          -0.43389        0.800
            15          -0.42633        0.802
            16          -0.41935        0.805
            17          -0.41289        0.806
            18          -0.40689        0.808
            19          -0.40132        0.809
            20          -0.39612        0.811
            21          -0.39128        0.812
            22          -0.38674        0.812
            23          -0.38250        0.812
            24          -0.37851        0.813
         Final          -0.37477        0.814
        
```

Test Accuracy: 
```python
round(accuracy(me_classifier, test_set3), 2) # Test accuracy
0.8
```

In [None]:
# Print the 10 most informative features of the trained maximum-entropy model.
me_classifier.show_most_informative_features(10)

```
-1.978 suffix2=='ia' and label is 'male'
-1.921 suffix2=='na' and label is 'male'
-1.515 suffix2=='sa' and label is 'male'
-1.463 suffix1=='a' and label is 'male'
-1.290 suffix2=='ra' and label is 'male'
-1.278 suffix1=='k' and label is 'female'
-1.197 suffix2=='rd' and label is 'female'
-1.169 suffix2=='do' and label is 'female'
-1.167 suffix2=='us' and label is 'female'
-1.166 suffix2=='ta' and label is 'male'
```