# I. Male name? Female name?

In [1]:
# Consider the document classification task discussed in Chapter 6 of NLTK (Section 1.3).
# Using the same training and test data, and the same feature extractor,
# build three classifiers for the task: a decision tree, a naive Bayes classifier, and a Maxi- mum Entropy classifier.
# Compare the performance of the three classifiers on your se- lected task.
# How do you think that your results might be different if you used a different feature extractor?

In [2]:
from nltk.corpus import names
from random import shuffle
from sklearn import svm
import pandas as pd
import nltk
from sklearn.naive_bayes import MultinomialNB
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix
from sklearn import tree
from nltk.classify import MaxentClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
from nltk.metrics import ConfusionMatrix

In [3]:
# raw data: male names and female names
male = [(name, 'male') for name in names.words('male.txt')]
female = [(name, 'female') for name in names.words('female.txt') if name not in names.words('male.txt')]
print('number of male name:', len(male))
print('number of female name:', len(female))

all_name = male + female
shuffle(all_name)

number of male name: 2943
number of female name: 4636


In [5]:
# labels: male & female
labels = [t[1] for t in all_name]

In [6]:
# label vectorization
lb = preprocessing.LabelBinarizer()
lb.fit(['male', 'female'])

LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)

In [7]:
y = lb.transform(labels)

In [9]:
# feature: bag-of-characters
text = [t[0].lower() for t in all_name]

In [10]:
# features vectorizaion
cv = CountVectorizer(analyzer='char')
X = csr_matrix(cv.fit_transform(text)).toarray()

In [11]:
# training-testing split
split = int(0.7 * len(all_name))
Xtrain, Xtest = X[:split], X[split:]
ytrain, ytest = y[:split], y[split:]

In [12]:
# classifier: Decision Tree
clf_tree = tree.DecisionTreeClassifier()
clf_tree.fit(Xtrain, ytrain)
pred_tree = clf_tree.predict(Xtest)

In [13]:
print('Decision Tree Result:')
print('Accuracy:', round(accuracy_score(ytest, pred_tree), 3))

Decision Tree Result:
Accuracy: 0.691


In [14]:
print(pd.DataFrame(confusion_matrix(ytest, pred_tree), index=['female', 'male'], 
             columns=['predict female','predict male']))
print('='*20)

        predict female  predict male
female            1044           321
male               381           528


In [15]:
# classifier: Naive Bayes
clf_nb = MultinomialNB()
clf_nb.fit(Xtrain, ytrain)
pred_nb = clf_nb.predict(Xtest)

  y = column_or_1d(y, warn=True)


In [16]:
print('Naive Bayes Result:')
print('Accuracy:', round(accuracy_score(ytest, pred_nb), 3))

Naive Bayes Result:
Accuracy: 0.69


In [17]:
print(pd.DataFrame(confusion_matrix(ytest, pred_nb), index=['female', 'male'], 
             columns=['predict female','predict male']))
print('='*20)

        predict female  predict male
female            1206           159
male               546           363


In [18]:
# classifier: Maximum Entropy
features = [nltk.FreqDist(t) for t in list(text)]
sets = [(features[i], labels[i]) for i in range(len(features))]
sets_train, sets_test = sets[:split], sets[split:]

In [19]:
clf_mx = MaxentClassifier.train(sets_train)

  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.383
             2          -0.60943        0.623
             3          -0.58417        0.654
             4          -0.56480        0.676
             5          -0.54968        0.686
             6          -0.53765        0.694
             7          -0.52791        0.700
             8          -0.51988        0.702
             9          -0.51315        0.704
            10          -0.50744        0.708
            11          -0.50253        0.709
            12          -0.49827        0.710
            13          -0.49453        0.713
            14          -0.49122        0.714
            15          -0.48828        0.714
            16          -0.48563        0.714
            17          -0.48325        0.716
            18          -0.48108        0.717
            19          -0.47911        0.720
 

In [20]:
print('Maximum Entropy Result:')
print('Accuracy:', round(nltk.classify.accuracy(clf_mx, sets_test), 3))

Maximum Entropy Result:
Accuracy: 0.728


In [26]:
pred_mx = [clf_mx.classify(sets_test[i][0]) for i in range(len(sets_test))]
label_test = [sets_test[i][1] for i in range(len(sets_test))]

In [29]:
print(ConfusionMatrix(label_test, pred_mx))

       |    f      |
       |    e      |
       |    m    m |
       |    a    a |
       |    l    l |
       |    e    e |
-------+-----------+
female |<1171> 194 |
  male |  425 <484>|
-------+-----------+
(row = reference; col = test)

