In [1]:
import json
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report, accuracy_score
from sklearn.naive_bayes import GaussianNB
from pprint import pprint
from string import punctuation
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from unidecode import unidecode
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

In [2]:
class EP:
    """
    Labelled name data for a single ethnicity.

    Reads `training-<ethnicity>.csv` from `data_dir`, shuffles the rows and
    transliterates the `full_name` column with `unidecode`.  An optional
    `real_<ethnicity>.txt` file (one name per line) in the same directory
    supplies extra names that add_nodoubt_names() can append as positives.

    Parameters
    ----------
    ethnicity : str
        One of EP.ETHNICITIES.
    data_dir : str, optional
        Directory holding the data files (defaults to the original location).
    random_state : int or None, optional
        Passed to every DataFrame.sample call; None keeps the shuffles unseeded.

    Attributes
    ----------
    target_col : str
        Name of the label column, `is_<ethnicity>`.
    data : pandas.DataFrame
        Shuffled training rows.
    NODOUBT_FIRST_NAMES : list of str
        De-duplicated, sorted names from the optional file ([] if unreadable).

    Raises
    ------
    AssertionError
        If the ethnicity is not supported or the label column is missing.
    """

    ETHNICITIES = ('iranian polish italian german japanese vietnamese'
                   ' portuguese fijian indian croatian greek'
                   ' serbian spanish korean thai turkish english cambodian').split()

    def __init__(self, ethnicity, data_dir='/Users/ik/Data/names', random_state=None):

        assert ethnicity in self.ETHNICITIES, f'incorrect ethnicity: {ethnicity}!'

        datafile = f'{data_dir}/training-{ethnicity}.csv'
        self.target_col = f'is_{ethnicity}'
        self.random_state = random_state

        # frac=1. returns every row, i.e. a shuffle of the whole file
        self.data = pd.read_csv(datafile).sample(frac=1., random_state=random_state)
        self.data['full_name'] = self.data['full_name'].apply(unidecode)

        assert self.target_col in self.data.columns, f'there is no {self.target_col} in data file!'

        self.NODOUBT_FIRST_NAMES = self._read_nodoubt_names(f'{data_dir}/real_{ethnicity}.txt')

    @staticmethod
    def _read_nodoubt_names(path):
        """Return the sorted, de-duplicated non-blank lines of `path`, or [] if it cannot be opened."""
        try:
            # the context manager closes the handle even if reading fails
            with open(path, 'r') as handle:
                names = {line.strip() for line in handle if line.strip()}
        except OSError:
            # the file is optional: a missing/unreadable file simply means no extra names
            return []
        # sorted so that the list (and any seeded shuffle built on it) is reproducible
        return sorted(names)

    def add_nodoubt_names(self):
        """
        Append NODOUBT_FIRST_NAMES to the data as positive examples and reshuffle.

        Does nothing to the data when there are no extra names, so the label
        column is never concatenated with an empty frame.  Returns self so the
        call can be chained onto the constructor.
        """
        print(f'adding {len(self.NODOUBT_FIRST_NAMES)} first names..')

        if not self.NODOUBT_FIRST_NAMES:
            return self

        extra_rows = pd.DataFrame({'full_name': self.NODOUBT_FIRST_NAMES,
                                   self.target_col: [1]*len(self.NODOUBT_FIRST_NAMES)})

        self.data = pd.concat([self.data, extra_rows],
                              ignore_index=True).sample(frac=1., random_state=self.random_state)
        return self

In [3]:
class Selector(BaseEstimator, TransformerMixin):
    """
    Pick one column out of the input and wrap each value in boundary markers.

    transform returns x[col_name] with '_start_' prepended and '_end_'
    appended to every value (no separator is inserted), so the result has
    the same container type as x[col_name] rather than being a plain list.
    """
    def __init__(self, col_name):
        # name of the column to extract in transform
        self.col_name = col_name

    def fit(self, x, y=None):
        # stateless: nothing is learned from the data
        return self

    def transform(self, x):
        column = x[self.col_name]
        with_prefix = '_start_' + column
        return with_prefix + '_end_'

class WordCount(BaseEstimator, TransformerMixin):
    """
    Number of whitespace-separated words in each name.

    transform expects an object exposing .apply/.values/.shape (e.g. a
    pandas Series of strings) and returns an array with one row per name
    and a single column holding the word count.
    """
    def __init__(self):
        """Nothing to configure."""

    def fit(self, x, y=None):
        # stateless: nothing is learned from the data
        return self

    def transform(self, x):
        def count_words(name):
            return len(name.split())

        counts = x.apply(count_words)
        n_rows = x.shape[0]
        # column vector so the result can be stacked next to other features
        return counts.values.reshape(n_rows, 1)

class NameLength(BaseEstimator, TransformerMixin):
    """
    Character length of each full name.

    transform uses the .str accessor of its input and returns an array with
    one row per name and a single column holding the string length.  The
    length is measured on the string exactly as received, including any
    markers added by an earlier pipeline step.
    """
    def __init__(self):
        """Nothing to configure."""

    def fit(self, x, y=None):
        # stateless: nothing is learned from the data
        return self

    def transform(self, x):
        lengths = x.str.len()
        n_rows = x.shape[0]
        # column vector so the result can be stacked next to other features
        return lengths.values.reshape(n_rows, 1)

class FirstLast(BaseEstimator, TransformerMixin):
    """
    Flag names whose last word is strictly longer than their first word.

    transform returns an array with one row per name and a single column:
    1 when the last whitespace-separated word is strictly longer than the
    first one, 0 otherwise (equal lengths, single-word names and empty or
    whitespace-only names all give 0).

    Lengths are measured on the strings exactly as received; when the input
    comes from Selector the first word carries the '_start_' marker and the
    last word the '_end_' marker.
    """
    def __init__(self):
        pass

    def fit(self, x, y=None):
        # stateless: nothing is learned from the data
        return self

    @staticmethod
    def _last_word_longer(name):
        """Return 1 if the last word of `name` is strictly longer than the first, else 0."""
        words = name.split()  # split once instead of once per word
        # an empty word list short-circuits to 0 instead of failing on an empty argmax
        return int(bool(words) and len(words[-1]) > len(words[0]))

    def transform(self, x):
        res = x.apply(self._last_word_longer).values.reshape(x.shape[0],1)
        return res

In [4]:
if __name__ == '__main__':

    # One seed for the split and for every stochastic estimator, so that a
    # fresh "Restart & Run All" reproduces the same scores.
    SEED = 391

    ic = EP('cambodian')
    #.add_nodoubt_names()

    # split into the training and testing sets (stratified on the label)
    X_train, X_test, y_train, y_test = train_test_split(ic.data.drop([ic.target_col], axis=1),
                                                        ic.data[ic.target_col],  test_size=0.25,
                                                        random_state=SEED, stratify=ic.data[ic.target_col])

    # Feature extraction: character and word n-gram counts plus three
    # hand-made numeric features, all computed on the marked-up full name.
    features = FeatureUnion(
        [('char_level', CountVectorizer(strip_accents='ascii', analyzer='char',
                                        ngram_range=(1, 4))),
         ('word_level', CountVectorizer(strip_accents='ascii', analyzer='word',
                                        ngram_range=(1, 2))),
         ('word_count', WordCount()),
         ('full_name_length', NameLength()),
         ('firstlast', FirstLast())],
        # Every key must name a transformer above.  The previous key
        # 'word_lengths' matched none of them: it was ignored by the
        # scikit-learn version that produced the recorded output and is
        # rejected by newer releases.  A weight of 1.0 equals the default,
        # so naming 'word_count' keeps the features unchanged.
        transformer_weights={
            'char_level': 0.8,
            'word_level': 0.5,
            'word_count': 1.0,
        })

    # Hard-voting ensemble; each member can be tuned via 'clf__<name>__<param>'.
    voting_clf = VotingClassifier(
        estimators=[('sgd', SGDClassifier(max_iter=1000, random_state=SEED)),
                    ('rf', RandomForestClassifier(random_state=SEED)),
                    ('svc', SVC(C=0.0001)),
                    ('abst', AdaBoostClassifier(n_estimators=100, random_state=SEED)),
                    ('gb', GradientBoostingClassifier(random_state=SEED))],
        voting='hard')

    pipe = Pipeline([('select_fullname', Selector('full_name')),  # df with 1 column
                     ('features', features),
                     ('normalise', Normalizer()),
#                      ('pca', TruncatedSVD(n_components=120)),
                     ('clf', voting_clf)
#                     ('clf', SGDClassifier(max_iter=1000))
                    ])

    param_grid = {
        # Alternative feature weightings that were tried (keys must match the
        # transformer names in the FeatureUnion):
#         'features__transformer_weights': [{'char_level': 0.2,
#                                            'word_level': 0.1,
#                                            'word_count': 0.4,
#                                            'full_name_length': 0.5},
#                                           {'char_level': 0.9,
#                                            'word_level': 0.2,
#                                            'word_count': 0.9,
#                                            'full_name_length': 0.3},
#                                           {'char_level': 0.9,
#                                            'word_level': 0.3,
#                                            'word_count': 0.2,
#                                            'full_name_length': 0.7}],
        'clf__sgd__loss': ['modified_huber'],  # also 'hinge', 'perceptron', 'log'
        'clf__sgd__penalty': ['l2'],  # 'elasticnet', 'l1'
        'clf__rf__n_estimators': [100, 200],
        'clf__rf__max_depth': [None, 3],
        'clf__svc__C': [1e-5, 1e-4, 1e-2, 1]}  # default is 1.
#         'pca__n_components': [30, 60, 120, 180]}

    # cv is spelled out because the library default differs between
    # scikit-learn versions; 3 folds matches the recorded run below.
    grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=3, verbose=10)

    grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05 


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   42.6s remaining:    0.0s


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05, score=0.9196393571148569, total=  32.5s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05 


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.4min remaining:    0.0s


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05, score=0.9188553508428068, total=  31.7s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05 


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.1min remaining:    0.0s


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05, score=0.9192156862745098, total=  32.0s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001 


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.9min remaining:    0.0s


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001, score=0.9200313602508821, total=  35.0s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001 


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.6min remaining:    0.0s


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001, score=0.9184633477067816, total=  36.0s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001 


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  4.4min remaining:    0.0s


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001, score=0.9215686274509803, total=  37.4s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01 


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  5.2min remaining:    0.0s


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01, score=0.9204233633869071, total=  34.1s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01 


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  6.0min remaining:    0.0s


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01, score=0.9188553508428068, total=  37.2s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01 


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  6.7min remaining:    0.0s


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01, score=0.9219607843137255, total=  33.4s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1, score=0.9200313602508821, total=  33.2s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1, score=0.9188553508428068, total=  32.3s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1, score=0.9243137254901961, total=  33.3s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05, score=0.9196393571148569, total=  35.6s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05, score=0.9196393571148569, total=  35.1s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05, score=0.9219607843137255, total=  35.1s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001, score=0.9184633477067816, total=  35.1s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001, score=0.9188553508428068, total=  35.8s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001, score=0.9203921568627451, total=  38.3s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01, score=0.9176793414347315, total=  37.3s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01, score=0.9200313602508821, total=  36.9s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01, score=0.9211764705882353, total=  35.9s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1, score=0.9192473539788318, total=  35.3s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1, score=0.9172873382987063, total=  35.0s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1, score=0.9231372549019607, total=  35.2s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05, score=0.9098392787142298, total=  29.6s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05, score=0.9035672285378283, total=  29.9s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05, score=0.9074509803921569, total=  32.1s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001, score=0.9094472755782046, total=  32.0s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001, score=0.9035672285378283, total=  31.0s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001, score=0.9133333333333333, total=  31.5s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01, score=0.9082712661701293, total=  30.9s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01, score=0.9027832222657781, total=  31.7s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01, score=0.9109803921568628, total=  31.7s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1, score=0.9094472755782046, total=  31.6s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1, score=0.9027832222657781, total=  31.5s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1, score=0.9168627450980392, total=  31.2s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05, score=0.9086632693061545, total=  32.4s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05, score=0.9031752254018032, total=  31.8s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05, score=0.912156862745098, total=  32.9s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001, score=0.9082712661701293, total=  31.5s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001, score=0.9027832222657781, total=  31.0s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001, score=0.9094117647058824, total=  32.6s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01, score=0.9039592316738534, total=  31.5s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01, score=0.9027832222657781, total=  29.9s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01, score=0.9094117647058824, total=  30.6s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1, score=0.9051352410819287, total=  30.0s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1, score=0.9039592316738534, total=  30.9s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1 


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed: 34.8min finished


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1, score=0.9090196078431373, total=  32.0s


In [5]:
# Precision / recall / F1 of the refitted best pipeline on the held-out split.
test_predictions = grid_search.predict(X_test)
print(classification_report(y_test, test_predictions))

             precision    recall  f1-score   support

          0       0.96      0.90      0.93      1275
          1       0.90      0.97      0.93      1276

avg / total       0.93      0.93      0.93      2551



  if diff:


In [6]:
# A handful of hand-picked names (not from the data set) for a quick sanity check.
sanity_names = [
    'han solo', 'xi', 'babak', 'igor souza', 'yash patel', 'david perron',
    'andrzej woods', 'otto floms', 'suzuki', 'toyota',
    'edilson', 'kawasaki', 'silva', 'james tran',
    'bo nguyen', 'zlatan ibrahimovic', 'ronaldo barbosa', 'ahmad lopes zeto',
    'bob', 'mario razzi', 'john reed', 'andreas vlachos', 'con', 'nemanja vidic',
]
test_data = pd.DataFrame({'full_name': sanity_names})

# Predict on the single-column frame, then store the labels as yes/no text.
test_data['prediction'] = ['yes' if label else 'no'
                           for label in grid_search.predict(test_data)]

  if diff:


In [7]:
# Show the hand-picked names next to their yes/no predictions.
test_data

Unnamed: 0,full_name,prediction
0,han solo,no
1,xi,no
2,babak,no
3,igor souza,no
4,yash patel,no
5,david perron,yes
6,andrzej woods,no
7,otto floms,yes
8,suzuki,no
9,toyota,no


In [8]:
# Hyper-parameter combination with the best mean cross-validation score.
grid_search.best_params_

{'clf__rf__max_depth': None,
 'clf__rf__n_estimators': 100,
 'clf__sgd__loss': 'modified_huber',
 'clf__sgd__penalty': 'l2',
 'clf__svc__C': 1}

In [9]:
# List every held-out name whose predicted label differs from the true one.
# row[0] is the first column of X_test, which is expected to be the full name.
for row, predicted, actual in zip(X_test.values, grid_search.predict(X_test), y_test.values):
    if predicted == actual:
        continue
    print(f'{row[0].upper()} predicted {predicted} but it is {actual}')

IREM EREN predicted 1 but it is 0
MARGARET BACKHOUSE predicted 0 but it is 1
MARCOS LLORENTE predicted 1 but it is 0
KENNY PRINCE REDONDO predicted 1 but it is 0
SPENCER PERCEVAL predicted 0 but it is 1
NICK SKORICH predicted 1 but it is 0
SERDAR AYDIN predicted 1 but it is 0
REN XIONG predicted 1 but it is 0
TAYE ASHBY-HAMMOND predicted 0 but it is 1
NIKKI WATERMAN predicted 0 but it is 1
HAN MAHLSOOK predicted 1 but it is 0
MATTHEW PAVLICH predicted 1 but it is 0
VIJAY KUMAR predicted 1 but it is 0
BENJAMIN DISRAELI predicted 0 but it is 1
ALEX CARBONELL predicted 1 but it is 0
ALFRED LEVY predicted 1 but it is 0
SID VICIOUS predicted 0 but it is 1
AMIR KHAN predicted 0 but it is 1
IDRIS ELBA predicted 0 but it is 1
GERMAN DATIDIS predicted 1 but it is 0
CLAUDIA ROTH predicted 1 but it is 0
BENEDIKT SALLER predicted 1 but it is 0
JOHN LIMBERT predicted 1 but it is 0
ADALBERT STIFTER predicted 1 but it is 0
CORNELIA FUNKE predicted 1 but it is 0
ROBIN HACK predicted 1 but it is 0
ANGU

  if diff:


In [10]:
# The pipeline refitted on the whole training set with the best parameters.
grid_search.best_estimator_

Pipeline(memory=None,
     steps=[('select_fullname', Selector(col_name='full_name')), ('features', FeatureUnion(n_jobs=1,
       transformer_list=[('char_level', CountVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True...      warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None))])

In [11]:
# Peek at the held-out names; only the first rows are shown instead of the whole frame.
X_test.head(10)

Unnamed: 0,full_name
5550,lee min-a
7799,peter jones
4328,paul huddy
9923,jack rodwell
7410,kim chang-hun
9597,percy smith
8258,barbara honigmann
1033,gerry gow
3971,kevin drinkell
1241,rupeni rabici
