In [61]:
import json
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report, accuracy_score
from pprint import pprint
from string import punctuation
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from unidecode import unidecode
from sklearn.ensemble import VotingClassifier, RandomForestClassifier

In [62]:
class EP:
    """
    Labelled name data for a single-ethnicity classifier.

    Reads `training-<ethnicity>.csv` from `data_dir`, shuffles the rows, transliterates the
    `full_name` column to ASCII with `unidecode`, and loads an optional list of curated
    first names from `real_<ethnicity>.txt` in the same directory.

    Attributes set by the constructor:
        target_col           -- name of the label column, `is_<ethnicity>`
        data                 -- the shuffled training data frame
        NODOUBT_FIRST_NAMES  -- list of curated first names (empty if the file is unavailable)
        random_state         -- seed handed to every shuffle (None means unseeded)
    """

    SUPPORTED_ETHNICITIES = 'iranian polish italian german japanese vietnamese portuguese fijian'.split()

    def __init__(self, ethnicity, data_dir='/Users/ik/Data/names', random_state=None):
        """
        ethnicity    -- one of SUPPORTED_ETHNICITIES; anything else fails the assertion below
        data_dir     -- directory holding the training csv and the optional names file;
                        defaults to the location the notebook was originally written against
        random_state -- optional seed for the row shuffles; the default None keeps them unseeded

        Raises AssertionError for an unsupported ethnicity or when the csv lacks the label column.
        """
        assert ethnicity in self.SUPPORTED_ETHNICITIES, f'incorrect ethnicity: {ethnicity}!'

        self.random_state = random_state
        self.target_col = f'is_{ethnicity}'

        datafile = f'{data_dir}/training-{ethnicity}.csv'
        self.data = pd.read_csv(datafile).sample(frac=1., random_state=random_state)
        self.data['full_name'] = self.data['full_name'].apply(unidecode)

        assert self.target_col in self.data.columns, f'there is no {self.target_col} in data file!'

        self.NODOUBT_FIRST_NAMES = self._load_nodoubt_names(f'{data_dir}/real_{ethnicity}.txt')

    @staticmethod
    def _load_nodoubt_names(path):
        """
        Return the unique non-blank, whitespace-stripped lines of `path` in sorted order.

        The names file is optional: if it is missing or unreadable (OSError) an empty list
        is returned. Any other error propagates instead of being silently swallowed.
        """
        try:
            # the context manager closes the handle even if reading fails part-way
            with open(path, 'r') as names_file:
                return sorted({line.strip() for line in names_file if line.strip()})
        except OSError:
            return []

    def add_nodoubt_names(self):
        """
        Append every curated first name to `data` as a positive example (label 1),
        reshuffle the rows and return `self` so the call can be chained.
        """
        print(f'adding {len(self.NODOUBT_FIRST_NAMES)} first names..')

        # NOTE(review): these names are not passed through unidecode the way full_name is
        extra = pd.DataFrame({'full_name': self.NODOUBT_FIRST_NAMES,
                              self.target_col: [1]*len(self.NODOUBT_FIRST_NAMES)})

        self.data = pd.concat([self.data, extra],
                              ignore_index=True).sample(frac=1., random_state=self.random_state)
        return self

In [63]:
class Selector(BaseEstimator, TransformerMixin):
    """
    Pull one column out of the input and wrap every value in boundary markers.

    transform() returns whatever `x[col_name]` yields (a Series for a data frame),
    with '_start_' prepended and '_end_' appended to each string.
    """
    def __init__(self, col_name):
        # name of the column that transform() extracts
        self.col_name = col_name

    def fit(self, x, y=None):
        # stateless: there is nothing to learn from the data
        return self

    def transform(self, x):
        column = x[self.col_name]
        with_prefix = '_start_' + column
        return with_prefix + '_end_'

class WordCount(BaseEstimator, TransformerMixin):
    """
    Count the whitespace-separated words in each string.

    transform() takes a Series of strings and returns an array with one row per
    string and a single column holding the word count.
    """
    def __init__(self):
        # no hyper-parameters
        pass

    def fit(self, x, y=None):
        # stateless: there is nothing to learn from the data
        return self

    def transform(self, x):
        def count_words(text):
            return len(text.split())

        counts = x.apply(count_words)
        n_rows = x.shape[0]
        return counts.values.reshape(n_rows, 1)

class NameLength(BaseEstimator, TransformerMixin):
    """
    Character length of each string.

    transform() takes a Series of strings and returns an array with one row per
    string and a single column holding its length.
    """
    def __init__(self):
        # no hyper-parameters
        pass

    def fit(self, x, y=None):
        # stateless: there is nothing to learn from the data
        return self

    def transform(self, x):
        lengths = x.str.len()
        n_rows = x.shape[0]
        return lengths.values.reshape(n_rows, 1)

class FirstLast(BaseEstimator, TransformerMixin):
    """
    Flag names whose last word is strictly longer than the first one.

    transform() takes a Series of strings and returns an array with one row per string
    and a single column: 1 when the last whitespace-separated word is strictly longer
    than the first, otherwise 0 (ties, single-word strings and strings without any
    word all give 0).

    NOTE(review): when fed by Selector the first word carries the 7-character '_start_'
    prefix and the last word the 5-character '_end_' suffix, so the comparison is
    biased by two characters in favour of the first word.
    """
    def __init__(self):
        # no hyper-parameters
        pass

    def fit(self, x, y=None):
        # stateless: there is nothing to learn from the data
        return self

    def transform(self, x):
        def last_is_longer(name):
            # split once instead of once per token
            words = name.split()
            if not words:
                # empty / whitespace-only input used to crash in np.argmax([])
                return 0
            # same result as argmax over [len(first), len(last)]: ties resolve to 0
            return int(len(words[-1]) > len(words[0]))

        res = x.apply(last_is_longer).values.reshape(x.shape[0], 1)
        return res

In [64]:
if __name__ == '__main__':

    SEED = 391  # one seed for the split and for both classifiers

    ic = EP('fijian')
    # append .add_nodoubt_names() to the call above to also train on the curated first names

    # EP shuffles its rows without a seed by default; restoring index order first makes the
    # seeded split below pick the same rows on every run
    labelled = ic.data.sort_index()

    # split into the training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(labelled.drop([ic.target_col], axis=1),
                                                        labelled[ic.target_col], test_size=0.2,
                                                        random_state=SEED, stratify=labelled[ic.target_col])

    pipe = Pipeline([('select_fullname', Selector('full_name')),  # marker-wrapped full_name column
                     ('features', FeatureUnion(
                         [('char_level', CountVectorizer(strip_accents='ascii', analyzer='char',
                                                         ngram_range=(1, 4))),
                          ('word_level', CountVectorizer(strip_accents='ascii', analyzer='word',
                                                         ngram_range=(1, 2))),
                          ('word_count', WordCount()),
                          ('full_name_length', NameLength()),
                          ('firstlast', FirstLast())],
                         transformer_weights={
                             'char_level': 0.8,
                             'word_level': 0.5,
                             # this key used to be 'word_lengths', which names no transformer
                             # in the union; a weight of 1.0 leaves the feature unscaled
                             'word_count': 1.0,
                         })),
                     ('normalise', Normalizer()),
                     # soft voting averages predict_proba, which SGDClassifier only offers for
                     # probabilistic losses, hence the explicit default loss
                     ('clf', VotingClassifier(
                         estimators=[('sgd', SGDClassifier(loss='modified_huber', max_iter=1000,
                                                           random_state=SEED)),
                                     ('rf', RandomForestClassifier(random_state=SEED))],
                         voting='soft'))
                     ])
    # alternatives explored earlier and not active here: a TruncatedSVD ('pca') step between
    # 'normalise' and 'clf', a plain SGDClassifier as 'clf', and searching over
    # 'features__transformer_weights'

    param_grid = {
        # only losses that provide predict_proba are usable with soft voting
        # NOTE(review): newer scikit-learn releases spell 'log' as 'log_loss' - confirm against
        # the installed version
        "clf__sgd__loss": ['modified_huber', 'log'],
        'clf__sgd__penalty': ['l1', 'l2', 'elasticnet'],
        'clf__rf__n_estimators': [100, 200],
        'clf__rf__max_depth': [None, 3]}

    grid_search = GridSearchCV(pipe, param_grid=param_grid, verbose=10)

    grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l1 


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s remaining:    0.0s


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l1, score=0.9029850746268657, total=   1.0s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l1 


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.1s remaining:    0.0s


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l1, score=0.8678304239401496, total=   0.9s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l1 


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    3.1s remaining:    0.0s


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l1, score=0.8827930174563591, total=   0.9s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2 


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    3.8s remaining:    0.0s


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, score=0.9104477611940298, total=   0.6s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2 


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    4.4s remaining:    0.0s


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, score=0.8628428927680798, total=   0.6s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2 


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    5.2s remaining:    0.0s


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, score=0.885286783042394, total=   0.6s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=elasticnet 


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    6.3s remaining:    0.0s


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=elasticnet, score=0.9129353233830846, total=   1.0s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=elasticnet 


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    7.4s remaining:    0.0s


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=elasticnet, score=0.8678304239401496, total=   1.0s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=elasticnet 


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    8.5s remaining:    0.0s


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=elasticnet, score=0.8827930174563591, total=   1.0s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=l1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=l1, score=0.8905472636815921, total=   1.0s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=l1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=l1, score=0.8453865336658354, total=   1.0s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=l1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=l1, score=0.8603491271820449, total=   0.9s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=l2 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=l2, score=0.8880597014925373, total=   0.6s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=l2 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=l2, score=0.8329177057356608, total=   0.7s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=l2 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=l2, score=0.85785536159601, total=   0.7s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=elasticnet 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=elasticnet, score=0.8855721393034826, total=   1.0s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=elasticnet 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=elasticnet, score=0.8379052369077307, total=   1.0s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=elasticnet 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=elasticnet, score=0.8553615960099751, total=   1.0s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l1, score=0.8980099502487562, total=   1.2s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l1, score=0.8628428927680798, total=   1.3s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l1, score=0.8827930174563591, total=   1.3s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, score=0.9129353233830846, total=   1.0s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, score=0.8703241895261845, total=   0.9s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, score=0.8927680798004988, total=   0.9s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=elasticnet 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=elasticnet, score=0.9154228855721394, total=   1.3s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=elasticnet 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=elasticnet, score=0.8728179551122195, total=   1.3s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=elasticnet 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=elasticnet, score=0.885286783042394, total=   1.3s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=l1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=l1, score=0.900497512437811, total=   1.3s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=l1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=l1, score=0.8379052369077307, total=   1.2s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=l1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=l1, score=0.8553615960099751, total=   1.2s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=l2 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=l2, score=0.900497512437811, total=   1.0s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=l2 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=l2, score=0.8553615960099751, total=   1.0s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=l2 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=l2, score=0.85785536159601, total=   1.0s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=elasticnet 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=elasticnet, score=0.8830845771144279, total=   1.4s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=elasticnet 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=elasticnet, score=0.8329177057356608, total=   1.3s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=elasticnet 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=elasticnet, score=0.85785536159601, total=   1.3s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l1, score=0.8905472636815921, total=   0.8s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l1, score=0.8678304239401496, total=   0.8s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l1, score=0.8678304239401496, total=   0.7s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, score=0.9104477611940298, total=   0.4s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, score=0.8728179551122195, total=   0.4s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, score=0.8778054862842892, total=   0.4s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=elasticnet 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=elasticnet, score=0.900497512437811, total=   0.8s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=elasticnet 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=elasticnet, score=0.8678304239401496, total=   0.8s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=elasticnet 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=elasticnet, score=0.8877805486284289, total=   0.8s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=l1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=l1, score=0.8805970149253731, total=   0.7s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=l1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=l1, score=0.8329177057356608, total=   0.7s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=l1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=l1, score=0.8503740648379052, total=   0.7s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=l2 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=l2, score=0.845771144278607, total=   0.5s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=l2 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=l2, score=0.8179551122194514, total=   0.5s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=l2 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=l2, score=0.8154613466334164, total=   0.5s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=elasticnet 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=elasticnet, score=0.845771144278607, total=   0.8s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=elasticnet 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=elasticnet, score=0.8229426433915212, total=   0.8s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=elasticnet 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=100, clf__sgd__loss=log, clf__sgd__penalty=elasticnet, score=0.8104738154613467, total=   0.8s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l1, score=0.8905472636815921, total=   0.9s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l1, score=0.8603491271820449, total=   0.9s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l1, score=0.8778054862842892, total=   0.8s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, score=0.900497512437811, total=   0.5s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, score=0.8603491271820449, total=   0.5s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, score=0.8802992518703242, total=   0.5s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=elasticnet 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=elasticnet, score=0.9054726368159204, total=   0.9s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=elasticnet 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=elasticnet, score=0.8628428927680798, total=   0.9s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=elasticnet 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=modified_huber, clf__sgd__penalty=elasticnet, score=0.8753117206982544, total=   0.9s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=l1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=l1, score=0.8756218905472637, total=   0.9s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=l1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=l1, score=0.8403990024937655, total=   0.8s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=l1 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=l1, score=0.8453865336658354, total=   0.8s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=l2 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=l2, score=0.8432835820895522, total=   0.6s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=l2 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=l2, score=0.8254364089775561, total=   0.6s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=l2 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=l2, score=0.8154613466334164, total=   0.6s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=elasticnet 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=elasticnet, score=0.8432835820895522, total=   0.9s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=elasticnet 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=elasticnet, score=0.8179551122194514, total=   0.9s
[CV] clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=elasticnet 


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed:  1.2min finished


[CV]  clf__rf__max_depth=3, clf__rf__n_estimators=200, clf__sgd__loss=log, clf__sgd__penalty=elasticnet, score=0.8104738154613467, total=   0.9s


In [65]:
# per-class precision / recall / F1 of the grid search's predictions on the held-out test split
print(classification_report(y_test, grid_search.predict(X_test)))

             precision    recall  f1-score   support

          0       0.87      0.95      0.91       188
          1       0.90      0.77      0.83       113

avg / total       0.88      0.88      0.88       301



  if diff:


In [66]:
# hand-picked names for a quick sanity check of the fitted model
sample_names = ['han solo', 'xi','babak', 'igor souza', 'ai tran', 'david perron',
                'andrzej woods','otto floms', 'suzuki', 'toyota',
                'edilson', 'kawasaki','silva', 'james tran',
                'bo nguyen', 'subaru', 'ronaldo', 'ahmad lopes zeto',
                'bob', 'mario razzi', 'john reed']
test_data = pd.DataFrame({'full_name': sample_names})

# predict before adding the column, then store a readable yes/no label per name
predicted_flags = grid_search.predict(test_data)
test_data['prediction'] = ['yes' if flag else 'no' for flag in predicted_flags]

  if diff:


In [67]:
# show the hand-picked names next to their yes/no predictions
test_data

Unnamed: 0,full_name,prediction
0,han solo,no
1,xi,no
2,babak,no
3,igor souza,no
4,ai tran,no
5,david perron,yes
6,andrzej woods,no
7,otto floms,no
8,suzuki,no
9,toyota,no


In [68]:
# hyper-parameter combination selected by the grid search
grid_search.best_params_

{'clf__rf__max_depth': None,
 'clf__rf__n_estimators': 200,
 'clf__sgd__loss': 'modified_huber',
 'clf__sgd__penalty': 'l2'}

In [69]:
# list every test name whose predicted label differs from the true one
test_predictions = grid_search.predict(X_test)
for row, predicted, actual in zip(X_test.values, test_predictions, y_test.values):
    if predicted == actual:
        continue
    print(f'{row} predicted {predicted} is {actual}')

['richard broadbridge'] predicted 0 is 1
['waqa blake'] predicted 0 is 1
['manabu saito'] predicted 1 is 0
['vishwa chandra'] predicted 1 is 0
['rosario damiano maddaloni'] predicted 1 is 0
['daisuke watabe'] predicted 1 is 0
['lawrence little'] predicted 0 is 1
['emasi qovu'] predicted 0 is 1
['cheyenne rova'] predicted 0 is 1
['charlie moore'] predicted 0 is 1
['jagannath sami'] predicted 0 is 1
['sidiq koya'] predicted 0 is 1
['dan costello snr'] predicted 0 is 1
['aseri laing'] predicted 0 is 1
['noor dean'] predicted 0 is 1
['muthu swamy'] predicted 0 is 1
['ilimotama jese'] predicted 0 is 1
['sachin raikar'] predicted 1 is 0
['edwin sahayan'] predicted 0 is 1
['voula liakakos'] predicted 1 is 0
['jioji konrote'] predicted 0 is 1
['neumi leweni'] predicted 0 is 1
['brayden wiliame'] predicted 0 is 1
['faresa tautalafua'] predicted 1 is 0
['ilaria mauro'] predicted 1 is 0
['david ariu christopher'] predicted 0 is 1
['pio seci'] predicted 0 is 1
['mahendra sukhdeo'] predicted 0 is 1

  if diff:


In [70]:
# the names held out for testing (index values are the row labels carried over from ic.data)
X_test

Unnamed: 0,full_name
703,ryuji bando
136,alessandro gori
116,nguyen thi kim anh
838,dominiko waqaniburotu
91,sairusi niulevu
1368,misato ishihara
337,priscilla matthews
341,makoto mimura
190,mizuki arai
1223,melanie behringer
