In [136]:
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline   # pipeline of transforms with a final estimator
from sklearn.model_selection import GridSearchCV  # search over parameter values for an estimator
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.base import BaseEstimator, TransformerMixin

def timer(func):
    """Decorator that prints how long each call to ``func`` took.

    After a successful call, one line is printed in the form
    ``FUNC_NAME # elapsed time: <minutes> m <seconds>s`` where ``FUNC_NAME``
    is the upper-cased ``func.__name__``.  Nothing is printed when ``func``
    raises; the exception propagates unchanged.

    Parameters
    ----------
    func : callable
        The function (or method) to time.

    Returns
    -------
    callable
        A wrapper with the same call signature and return value as ``func``.
        ``functools.wraps`` keeps ``func``'s name and docstring on the wrapper.
    """
    # Local import keeps this block self-contained without touching the
    # notebook's import cell.
    import functools

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the interval cannot be skewed by
        # system clock adjustments the way time.time() can.
        t_start = time.perf_counter()
        res = func(*args, **kwargs)
        # Round to whole seconds *before* splitting into minutes/seconds;
        # rounding the two parts independently could print "0 m 60s".
        minutes, seconds = divmod(int(round(time.perf_counter() - t_start)), 60)
        print("{} # elapsed time: {:.0f} m {:.0f}s".format(
            func.__name__.upper(), minutes, seconds))
        return res
    return wrapper

class WordLengths(BaseEstimator, TransformerMixin):
    """Turn full names into four numeric length features.

    For every name the transformer emits, in this order:

    1. number of characters in the first whitespace-separated token,
    2. number of characters in the last token,
    3. number of characters in the whole name (whitespace included),
    4. number of tokens.

    The transformer keeps no state; ``fit`` learns nothing.
    """

    def fit(self, x, y=None):
        """Do nothing and return ``self`` (required by the transformer API)."""
        return self

    def transform(self, d):
        """Compute the length features for a collection of names.

        Parameters
        ----------
        d : pandas.Series or other 1-D sequence accepted by ``pd.Series``
            Full names, one per element.  Every value is passed through
            ``str`` first, so all four features are measured on the same
            text even when an entry is not a string.

        Returns
        -------
        pandas.DataFrame
            One row per input name, indexed like the input, with the four
            columns in the order listed in the class docstring.
        """
        # dtype=object keeps the .str accessor usable even for empty input.
        names = pd.Series(d, dtype=object).map(str)
        # Split once and reuse the token lists for three of the features.
        tokens = names.str.split()
        # Empty or whitespace-only names produce no tokens; report length 0
        # for them instead of raising IndexError on tokens[0] / tokens[-1].
        first_len = tokens.map(lambda parts: len(parts[0]) if parts else 0)
        last_len = tokens.map(lambda parts: len(parts[-1]) if parts else 0)
        return pd.concat([first_len,
                          last_len,
                          names.str.len(),
                          tokens.map(len)], axis=1)

In [137]:
class ChineseNameDetector(object):
    """Load a labelled name dataset and grid-search a classifier on it.

    After ``train_model`` has run, the fitted search object is available as
    ``self.grid_search`` and can be used for prediction on a Series of names.
    """

    def __init__(self, data_path='~/Data/names/chinesenames-data.csv'):
        """
        Read the labelled names into ``self.data``, a data frame that looks
        like below

                                 full_name  is_chinese
        0      dianne  van eck           0
        1         chen zaichun           1

        Parameters
        ----------
        data_path : str, optional
            Location of the CSV file; defaults to the path this notebook was
            originally written against.

        Raises
        ------
        AssertionError
            If the columns are not exactly ``full_name`` and ``is_chinese``,
            or if any row has a missing ``is_chinese`` label.
        """
        self.data = pd.read_csv(data_path)

        # Validate before touching the columns so a malformed file produces
        # the explanatory message rather than an AttributeError.
        assert set(self.data.columns) == {"full_name", "is_chinese"}, \
            "wrong column names in data csv..."
        # Counting label values cannot detect gaps (the counts always add up
        # to the number of rows), so look for missing labels directly.
        assert not self.data['is_chinese'].isnull().any(), \
            "seems like not all names in data are labelled..."

        print("name dataset contains {} names ({} chinese)".format(
            len(self.data), int((self.data['is_chinese'] == 1).sum())))

    @timer
    def train_model(self, random_state=31):
        """Grid-search the pipeline on a 70 % split and report on the other 30 %.

        The pipeline concatenates the ``WordLengths`` features with character
        n-gram counts and feeds them to an ``SGDClassifier``.  The search
        result is stored in ``self.grid_search``; the best cross-validation
        score, the best parameters, a classification report and a confusion
        matrix for the held-out split are printed.

        Parameters
        ----------
        random_state : int, optional
            Seed used for both the train/test split and the SGD classifier,
            so that repeated runs are comparable.
        """
        pipeline = Pipeline([
            ('features', FeatureUnion([
                ('lengths', WordLengths()),
                ('vect', CountVectorizer(analyzer='char', ngram_range=(1, 3)))])),
            #  -- regularized linear model with stochastic gradient descent (SGD) learning
            #  max_iter = number of epochs
            ('clf', SGDClassifier(max_iter=100, learning_rate='optimal',
                                  random_state=random_state))])

        parameters = {
            # character n-grams from length 1 up to 3, 4 or 5
            'features__vect__ngram_range': ((1, 3), (1, 4), (1, 5)),
            'clf__alpha': (0.00001, 0.000001),
            'clf__penalty': ('l2', 'elasticnet')}

        self.grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

        # Stratify so both splits keep the dataset's class balance.
        X_train, X_test, y_train, y_test = train_test_split(
            self.data['full_name'],
            self.data['is_chinese'],
            stratify=self.data['is_chinese'],
            test_size=0.3, random_state=random_state)

        self.grid_search.fit(X_train, y_train)

        print("best score: {:.2f}".format(self.grid_search.best_score_))
        print("best parameters: {}".format(self.grid_search.best_params_))
        # Evaluate on names the search never saw.
        y_true, y_pred = y_test, self.grid_search.predict(X_test)
        print(classification_report(y_true, y_pred))
        print(confusion_matrix(y_true, y_pred))

In [138]:
if __name__ == "__main__":
    # Build the detector (reads the names CSV), then run the grid search.
    # `cnd` stays a notebook-level name so later cells can query the
    # fitted model through `cnd.grid_search`.
    cnd = ChineseNameDetector()
    cnd.train_model()

name dataset contains 681592 names (240396 chinese)
Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed: 11.4min finished


best score: 1.00
best parameters: {'clf__alpha': 1e-05, 'clf__penalty': 'l2', 'features__vect__ngram_range': (1, 4)}
             precision    recall  f1-score   support

          0       1.00      1.00      1.00    132359
          1       1.00      1.00      1.00     72119

avg / total       1.00      1.00      1.00    204478

[[132215    144]
 [   245  71874]]
TRAIN_MODEL # elapsed time: 12 m 9s


In [149]:
# Spot-check the fitted search object on one hand-written name; the input is
# wrapped in a Series, the same container type the model was trained on.
# Requires `cnd` from the training cell to exist in the kernel.
# NOTE(review): the recorded output for this cell is array([1]), i.e. the name
# 'carlos de' is labelled as Chinese -- worth investigating despite the
# near-perfect held-out scores printed during training.
cnd.grid_search.predict(pd.Series(['carlos de']))

array([1])