In [1]:
import json
import pandas as pd
from sklearn.externals import joblib
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.naive_bayes import GaussianNB
from string import punctuation
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from unidecode import unidecode
from sklearn.cluster import KMeans, MeanShift
from ethnicity_features import (Selector, WordCount, NameLength, 
                                FirstLast, DictFirstNameFeatures, 
                                VowelsShare, ModelTransformer, MakeDense)
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

In [2]:
class EP:
    
    def __init__(self, ethnicity):  
        
        assert ethnicity in ('iranian polish italian german japanese vietnamese'
                             ' portuguese fijian indian croatian greek'
                             ' serbian spanish korean thai turkish' 
                             ' english cambodian russian').split(), f'incorrect ethnicity: {ethnicity}!'
        
        datafile = f'/Users/ik/Data/names/training-{ethnicity}.csv'
        self.target_col = f'is_{ethnicity}'
        
        self.data = pd.read_csv(datafile).sample(frac=1.)
        self.data['full_name'] = self.data['full_name'].apply(lambda _: unidecode(_))
        
        assert self.target_col in self.data.columns, f'there is no {self.target_col} in data file!'  
        
        try:
            self.NODOUBT_FIRST_NAMES = list({line.strip() for line in open(f'/Users/ik/Data/names/real_{ethnicity}.txt','r').readlines() if line.strip()})
        except:
            self.NODOUBT_FIRST_NAMES = []
        print(self.NODOUBT_FIRST_NAMES)
        
    def add_nodoubt_names(self):
        
        print(f'adding {len(self.NODOUBT_FIRST_NAMES)} first names..')
        
        self.data = pd.concat([self.data, pd.DataFrame({'full_name': self.NODOUBT_FIRST_NAMES, 
                                                        self.target_col: [1]*len(self.NODOUBT_FIRST_NAMES)})], 
                              ignore_index=True).sample(frac=1.)
        return self

In [9]:
if __name__ == '__main__':
    
    ETHN = 'fijian'
    
    ic = EP(ETHN).add_nodoubt_names()
    
    # split into the training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(ic.data.drop([ic.target_col], axis=1), 
                                                        ic.data[ic.target_col],  test_size=0.25, 
                                                        random_state=391, stratify=ic.data[ic.target_col])
    

    pipe = Pipeline([('select_fullname', Selector('full_name')),  # df with 1 column
                     ('features', FeatureUnion(
                         [('char_level', CountVectorizer(strip_accents='ascii', analyzer='char', 
                                                          ngram_range=(2, 4))),
                          ('word_level', CountVectorizer(strip_accents='ascii', analyzer='word',
                                                       ngram_range=(1,3))),
                          ('word_count', WordCount()),
                          ('dict_fname', DictFirstNameFeatures(ic.NODOUBT_FIRST_NAMES)),
                          ('vow_share', VowelsShare()),
                          ('full_name_length', NameLength()),  
                          ('firstlast', FirstLast())])),
#                               transformer_weights={
#                                             'char_level': 0.8,
#                                             'word_level': 0.5,
#                                             'word_lengths': 1.0,})),
#                          ])),
                     ('normalise', Normalizer()),
#                      ('pca', TruncatedSVD(n_components=120)),
#                      ('clf', VotingClassifier(estimators=[('sgd', SGDClassifier(max_iter=1000)),
#                                                            ('rf', RandomForestClassifier(class_weight='balanced')),
#                                                             ('svc', SVC(C=0.0001)),
#                                                              ('abst', AdaBoostClassifier(n_estimators=200)),
#                                                              ('gb', GradientBoostingClassifier(n_estimators=200)),
#                                                          ('kmeans', KMeans(n_clusters=2))], 
#                                                                     voting='hard'))])
                     ('clf', FeatureUnion([('sgd', ModelTransformer(SGDClassifier(max_iter=1000))),
                                             ('rf', ModelTransformer(RandomForestClassifier(class_weight='balanced'))),
                                             ('kmeans', ModelTransformer(KMeans(n_clusters=2))),
                                             ('gb', ModelTransformer(GradientBoostingClassifier(n_estimators=200))),
                                             ('abst', ModelTransformer(AdaBoostClassifier(n_estimators=200))),
                                             ('svc', ModelTransformer(SVC(C=0.0001)))
                                            ])),
                     ('final_cls', GaussianNB())])
    
    param_grid = {
#         'features__transformer_weights': [{'char_level': 0.2,
#                                                      'word_level': 0.1,
#                                                      'word_count':0.4,
#                                                     'full_name_length': 0.5}, 
#                                                    {'char_level': 0.9,
#                                                      'word_level': 0.2,
#                                                      'word_lengths':0.9,
#                                                    'full_name_length': 0.3},
#                                                    {'char_level': 0.9,
#                                                      'word_level': 0.3,
#                                                      'word_count':0.2,
#                                                    'full_name_length': 0.7}],
                    'clf__sgd__loss': ['modified_huber'], # also 'hinge', 'perceptron', 'log'
                    'clf__sgd__penalty': ['l2'],  # 'elasticnet', 'l1'
                    'clf__rf__n_estimators': [100, 200, 300],
                    'clf__rf__max_depth': [None, 2, 3],
                    'clf__svc__C': [1e-6, 1e-5, 1e-4, 1e-2, 1.]}  # default is 1.
#                  'pca__n_components': [30, 120, 400]}
    
    grid_search = GridSearchCV(pipe, param_grid=param_grid, verbose=10)
# pipe.fit(X_train, y_train)
    
    grid_search.fit(X_train, y_train)
    
#     joblib.dump(grid_search.best_estimator_, f'{ETHN}_model.pkl', compress = 1)

['siteri', 'alexade', 'elisapeci', 'iloilo', 'ilisapeci', 'lelea', 'ilikimi', 'miliana', 'josivini', 'asenaca', 'joni', 'nete', 'makereta', 'sovaia', 'tarlae', 'yasi', 'apenisa', 'semesa', 'nimlesh', 'kalara', 'kona', 'selai', 'esiteri', 'karalaini', 'alitia', 'elenoa', 'walota', 'iloilovatu', 'keresi', 'ifereimi', 'maciu']
adding 31 first names..
Fitting 3 folds for each of 45 candidates, totalling 135 fits
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-06 


ValueError: Invalid parameter max_depth for estimator ModelTransformer(model=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)). Check the list of available parameters with `estimator.get_params().keys()`.

In [7]:
print(classification_report(y_test, pipe.predict(X_test)))

             precision    recall  f1-score   support

          0       0.92      0.97      0.94       253
          1       0.94      0.83      0.88       130

avg / total       0.93      0.92      0.92       383



In [None]:
test_data = pd.DataFrame({'full_name': ['han walota', 'xi','babak', 'igor souza', 'yash patel', 'david perron',
                                        'andrzej woods','otto floms', 'suzuki', 'iloilovatu', 
                                        'edilson', 'kawasaki','silva', 'james tran', 
                                        'bo nguyen', 'zlatan ibrahimovic', 'ronaldo barbosa', 'natalia klanchinkova', 
                                        'bob', 'keresi right','kalara',  'kona reed', 'andreas vlachos', 'con', 'nemanja vidic']})

test_data['prediction'] = grid_search.predict(test_data)
test_data['prediction'] = test_data['prediction'].apply(lambda _: 'yes' if _ else 'no')

In [None]:
grid_search.best_params_

In [None]:
for t in zip(X_test.values, grid_search.predict(X_test), y_test.values):
    if t[1] != t[2]:
        print(f'{t[0][0].upper()} predicted {t[1]} but it is {t[2]}')

In [None]:
grid_search.best_estimator_