## A quick Gender Recognition model
Grabbed from [nlpforhackers](https://nlpforhackers.io/introduction-machine-learning/) webpage.
1. Firstly speaks about how to convert the dataset into a numpy array

In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier

In [21]:
names = pd.read_csv('names_dataset.csv')
print names.head(10)
 
print "%d names in dataset" % len(names) 

   index       name sex
0      0       Mary   F
1      1       Anna   F
2      2       Emma   F
3      3  Elizabeth   F
4      4     Minnie   F
5      5   Margaret   F
6      6        Ida   F
7      7      Alice   F
8      8     Bertha   F
9      9      Sarah   F
95025 names in dataset


In [3]:
# Get the data out of the dataframe into a numpy matrix and keep only the name and gender columns
names = names.as_matrix()[:, 1:]
print names
 
# We're using 80% of the data for training
TRAIN_SPLIT = 0.8

[['Mary' 'F']
 ['Anna' 'F']
 ['Emma' 'F']
 ...
 ['Ziyu' 'M']
 ['Zykir' 'M']
 ['Zyus' 'M']]


In [6]:
def features(name):
    name = name.lower()
    return {
        'first-letter': name[0], # First letter
        'first2-letters': name[0:2], # First 2 letters
        'first3-letters': name[0:3], # First 3 letters
        'last-letter': name[-1],
        'last2-letters': name[-2:],
        'last3-letters': name[-3:],
    }

# Feature Extraction
print features("Alex")

{'first2-letters': 'al', 'last-letter': 'x', 'first-letter': 'a', 'last2-letters': 'ex', 'last3-letters': 'lex', 'first3-letters': 'ale'}


In [7]:
# Vectorize the features function
features = np.vectorize(features)
print features(["Anna", "Hannah", "Paul"])
# [ array({'first2-letters': 'an', 'last-letter': 'a', 'first-letter': 'a', 'last2-letters': 'na', 'last3-letters': 'nna', 'first3-letters': 'ann'}, dtype=object)
#   array({'first2-letters': 'ha', 'last-letter': 'h', 'first-letter': 'h', 'last2-letters': 'ah', 'last3-letters': 'nah', 'first3-letters': 'han'}, dtype=object)
#   array({'first2-letters': 'pa', 'last-letter': 'l', 'first-letter': 'p', 'last2-letters': 'ul', 'last3-letters': 'aul', 'first3-letters': 'pau'}, dtype=object)]
 
# Extract the features for the whole dataset
X = features(names[:, 0]) # X contains the features
 
# Get the gender column
y = names[:, 1]           # y contains the targets
 
# Test if we built the dataset correctly
print "Name: %s, features=%s, gender=%s" % (names[0][0], X[0], y[0])

[{'first2-letters': 'an', 'last-letter': 'a', 'first-letter': 'a', 'last2-letters': 'na', 'last3-letters': 'nna', 'first3-letters': 'ann'}
 {'first2-letters': 'ha', 'last-letter': 'h', 'first-letter': 'h', 'last2-letters': 'ah', 'last3-letters': 'nah', 'first3-letters': 'han'}
 {'first2-letters': 'pa', 'last-letter': 'l', 'first-letter': 'p', 'last2-letters': 'ul', 'last3-letters': 'aul', 'first3-letters': 'pau'}]
Name: Mary, features={'first2-letters': 'ma', 'last-letter': 'y', 'first-letter': 'm', 'last2-letters': 'ry', 'last3-letters': 'ary', 'first3-letters': 'mar'}, gender=F


In [8]:
X, y = shuffle(X, y)
X_train, X_test = X[:int(TRAIN_SPLIT * len(X))], X[int(TRAIN_SPLIT * len(X)):]
y_train, y_test = y[:int(TRAIN_SPLIT * len(y))], y[int(TRAIN_SPLIT * len(y)):]
# Check to see if the datasets add up
print len(X_train), len(X_test), len(y_train), len(y_test)

76020 19005 76020 19005


In [9]:
print features(["Mary", "John"])
vectorizer = DictVectorizer()
vectorizer.fit(X_train)
transformed = vectorizer.transform(features(["Mary", "John"]))
print transformed

print type(transformed) # <class 'scipy.sparse.csr.csr_matrix'>
print transformed.toarray()[0][12]    # 1.0
print vectorizer.feature_names_[12]   # first-letter=m


[{'first2-letters': 'ma', 'last-letter': 'y', 'first-letter': 'm', 'last2-letters': 'ry', 'last3-letters': 'ary', 'first3-letters': 'mar'}
 {'first2-letters': 'jo', 'last-letter': 'n', 'first-letter': 'j', 'last2-letters': 'hn', 'last3-letters': 'ohn', 'first3-letters': 'joh'}]
  (0, 12)	1.0
  (0, 242)	1.0
  (0, 2709)	1.0
  (0, 4478)	1.0
  (0, 4788)	1.0
  (0, 5115)	1.0
  (1, 9)	1.0
  (1, 197)	1.0
  (1, 2255)	1.0
  (1, 4467)	1.0
  (1, 4604)	1.0
  (1, 7152)	1.0
<class 'scipy.sparse.csr.csr_matrix'>
1.0
first-letter=m


In [10]:
clf = DecisionTreeClassifier()
clf.fit(vectorizer.transform(X_train), y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [11]:
# Accuracy on training set
print clf.score(vectorizer.transform(X_train), y_train) 
 
# Accuracy on test set
print clf.score(vectorizer.transform(X_test), y_test)

0.987687450670876
0.869665877400684


In [20]:
print clf.predict(vectorizer.transform(features(["CHASTITY"])))

['F']
