## A quick Gender Recognition model
Grabbed from [nlpforhackers](https://nlpforhackers.io/introduction-machine-learning/) webpage.
1. First, convert the dataset into a NumPy array, keeping only the name and gender columns
2. Define a feature function that maps a name to its first and last one, two and three letters
3. Vectorize the feature function so it can be applied to a whole array of names at once
4. Shuffle the data, split it into training and test sets, and sanity-check the split by printing the size of each part
5. Transform lists of feature-value mappings to vectors. (When feature values are strings, this transformer will do a binary one-hot (aka one-of-K) coding: one boolean-valued feature is constructed for each of the possible string values that the feature can take on)
6. Train a decision tree classifier on this and save the model as a pickle file

In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the raw table (columns: index, name, sex) and preview the first rows.
names = pd.read_csv('names_dataset.csv')
print(names.head(n=10))

# Report how many rows were loaded.
print("%d names in dataset" % names.shape[0])

   index       name sex
0      0       Mary   F
1      1       Anna   F
2      2       Emma   F
3      3  Elizabeth   F
4      4     Minnie   F
5      5   Margaret   F
6      6        Ida   F
7      7      Alice   F
8      8     Bertha   F
9      9      Sarah   F
95025 names in dataset


In [3]:
# Pull the name and gender columns out of the DataFrame as a 2-D object array.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# `.values` is available in every pandas release. Selecting the columns by
# label (instead of slicing off the first column by position) keeps this
# correct even if the CSV gains or reorders columns.
# `names` is rebound from a DataFrame to an ndarray here, so the isinstance
# guard turns a second execution of this cell into a no-op instead of an error.
if isinstance(names, pd.DataFrame):
    names = names[['name', 'sex']].values
print(names)

# We're using 90% of the data for training
TRAIN_SPLIT = 0.90

[['Mary' 'F']
 ['Anna' 'F']
 ['Emma' 'F']
 ...
 ['Ziyu' 'M']
 ['Zykir' 'M']
 ['Zyus' 'M']]


In [4]:
def features(name):
    """Extract prefix/suffix character features from a single name.

    The name is lower-cased first, so the features are case-insensitive.

    Parameters
    ----------
    name : str
        The name to featurize.

    Returns
    -------
    dict
        Maps each feature label to a string: the first 1, 2 and 3 characters
        and the last 1, 2 and 3 characters of the lower-cased name. A name
        shorter than a slice yields the shorter string; an empty name yields
        empty strings for every feature.
    """
    name = name.lower()
    return {
        # Slices (not name[0] / name[-1]) so that an empty string produces ''
        # instead of raising IndexError.
        'first-letter': name[:1], # First letter
        'first2-letters': name[:2], # First 2 letters
        'first3-letters': name[:3], # First 3 letters
        'last-letter': name[-1:], # Last letter
        'last2-letters': name[-2:], # Last 2 letters
        'last3-letters': name[-3:], # Last 3 letters
    }

# Feature extraction sanity check: print the feature dict for one example name
print(features("Alex"))

{'first2-letters': 'al', 'last-letter': 'x', 'first-letter': 'a', 'last2-letters': 'ex', 'last3-letters': 'lex', 'first3-letters': 'ale'}


In [5]:
# Wrap `features` so it maps over a whole array of names and returns an object
# array holding one feature dict per name.
#  * otypes=[object] tells NumPy the output dtype up front, so it does not
#    have to probe the function with the first element and the wrapper also
#    works on an empty input.
#  * The isinstance guard keeps this cell idempotent: the name `features` is
#    rebound to the wrapper, so without the guard a re-run would wrap the
#    already-vectorized function a second time.
if not isinstance(features, np.vectorize):
    features = np.vectorize(features, otypes=[object])
print(features(["Anna", "Hannah", "Paul"]))
# -> 1-D object array with one dict per name, e.g. for "Anna":
#    {'first-letter': 'a', 'first2-letters': 'an', 'first3-letters': 'ann',
#     'last-letter': 'a', 'last2-letters': 'na', 'last3-letters': 'nna'}

# Extract the features for the whole dataset
X = features(names[:, 0]) # X contains the features

# Get the gender column
y = names[:, 1]           # y contains the targets

# Test if we built the dataset correctly
print("\n\nName: %s, features=%s, gender=%s" % (names[0][0], X[0], y[0]))

[{'first2-letters': 'an', 'last-letter': 'a', 'first-letter': 'a', 'last2-letters': 'na', 'last3-letters': 'nna', 'first3-letters': 'ann'}
 {'first2-letters': 'ha', 'last-letter': 'h', 'first-letter': 'h', 'last2-letters': 'ah', 'last3-letters': 'nah', 'first3-letters': 'han'}
 {'first2-letters': 'pa', 'last-letter': 'l', 'first-letter': 'p', 'last2-letters': 'ul', 'last3-letters': 'aul', 'first3-letters': 'pau'}]


Name: Mary, features={'first2-letters': 'ma', 'last-letter': 'y', 'first-letter': 'm', 'last2-letters': 'ry', 'last3-letters': 'ary', 'first3-letters': 'mar'}, gender=F


In [6]:
# Shuffle features and labels with the same permutation before splitting.
# A fixed seed makes the split, and every result derived from it,
# reproducible after a kernel restart.
RANDOM_SEED = 42
X, y = shuffle(X, y, random_state=RANDOM_SEED)

# Compute the boundary once so X and y are always cut at the same row.
split_at = int(TRAIN_SPLIT * len(X))
X_train, X_test = X[:split_at], X[split_at:]
y_train, y_test = y[:split_at], y[split_at:]

# Check to see if the datasets add up
assert len(X_train) + len(X_test) == len(X)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("%d %d %d %d" % (len(X_train), len(X_test), len(y_train), len(y_test)))

85522 9503 85522 9503


In [7]:
# Transforms lists of feature-value mappings to vectors.
# The vocabulary (one column per distinct "feature=value" pair) is learned
# from the training split only.
vectorizer = DictVectorizer()
vectorizer.fit(X_train)
transformed = vectorizer.transform(features(["Mary", "John"]))
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(transformed)

print(type(transformed)) # <class 'scipy.sparse.csr.csr_matrix'>

# Look the column up by name rather than hard-coding its position: the index
# depends on which feature values occur in the training split.
first_letter_m = vectorizer.vocabulary_['first-letter=m']
print(transformed.toarray()[0][first_letter_m])    # 1.0 ("Mary" starts with m)
print(vectorizer.feature_names_[first_letter_m])   # first-letter=m

  (0, 12)	1.0
  (0, 244)	1.0
  (0, 2766)	1.0
  (0, 4636)	1.0
  (0, 4955)	1.0
  (0, 5290)	1.0
  (1, 9)	1.0
  (1, 198)	1.0
  (1, 2300)	1.0
  (1, 4625)	1.0
  (1, 4762)	1.0
  (1, 7416)	1.0
<class 'scipy.sparse.csr.csr_matrix'>
1.0
first-letter=m


In [8]:
# Vectorize each split once and reuse the result for both fitting and scoring
# instead of re-transforming the training set.
X_train_vec = vectorizer.transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# random_state pins the tree's internal feature shuffling so that repeated
# runs on the same data build the same tree.
clf = DecisionTreeClassifier(criterion='gini', random_state=42)
clf.fit(X_train_vec, y_train)

# Accuracy on training set
# (single-argument print(...) behaves the same on Python 2 and Python 3)
print(clf.score(X_train_vec, y_train))

# Accuracy on test set
print(clf.score(X_test_vec, y_test))

0.9865180889127944
0.8706724192360308


In [11]:
# Therefore, we are getting a decent result from the names.
# Spot-check the classifier on a few hand-picked strings; their casing does
# not matter because the feature function lower-cases its input.
sample_names = ["SMYSLOV", "CHASTITY", "MISS PERKY", "SHARON", "ALONSO", "SECONDARY OFFICER"]
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(clf.predict(vectorizer.transform(features(sample_names))))

['M' 'F' 'F' 'F' 'M' 'M']


In [12]:
# Save the model using pickle
import pickle

In [13]:
# Persist the trained classifier. The `with` block closes the file even if
# pickle.dump raises, so no handle is leaked and no half-open file lingers.
# NOTE(review): only `clf` is saved. Predictions in this notebook go through
# the fitted `vectorizer` first, so whoever loads this pickle will also need
# that vectorizer (or an identically fitted one) to build model input.
# Only unpickle this file from a trusted location: pickle.load can execute
# arbitrary code.
with open("gender_recog.pickle", "wb") as pickle_out:
    pickle.dump(clf, pickle_out)