In [1]:
import json
import pandas as pd

from collections import defaultdict, Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.naive_bayes import GaussianNB
from pprint import pprint
from string import punctuation
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from unidecode import unidecode
from ethnicity_features import Selector, WordCount, NameLength, FirstLast, DictFirstNameFeatures
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

In [2]:
class EP:
    """
    Training data for a binary "does this name belong to <ethnicity>" classifier.

    Reads `<data_dir>/training-<ethnicity>.csv` into `self.data` (shuffled, with
    `full_name` transliterated to ASCII via unidecode) and, when available,
    the list of unambiguous first names from `<data_dir>/real_<ethnicity>.txt`
    into `self.NODOUBT_FIRST_NAMES`.

    Parameters
    ----------
    ethnicity : str
        one of `EP.ETHNICITIES`
    data_dir : str, optional
        directory holding the training csv and the optional first-name file
    random_state : int or None, optional
        seed passed to the row shuffles; None (default) keeps them random

    Raises
    ------
    AssertionError
        if the ethnicity is unknown or the csv lacks the `is_<ethnicity>` column
    """

    # ethnicities accepted by the constructor
    ETHNICITIES = ('iranian polish italian german japanese vietnamese'
                   ' portuguese fijian indian croatian greek'
                   ' serbian spanish korean thai turkish'
                   ' english cambodian russian').split()

    def __init__(self, ethnicity, data_dir='/Users/ik/Data/names', random_state=None):

        assert ethnicity in self.ETHNICITIES, f'incorrect ethnicity: {ethnicity}!'

        self.data_dir = data_dir
        self.random_state = random_state
        self.target_col = f'is_{ethnicity}'

        datafile = f'{data_dir}/training-{ethnicity}.csv'

        # shuffle the rows once on load
        self.data = pd.read_csv(datafile).sample(frac=1., random_state=random_state)
        self.data['full_name'] = self.data['full_name'].apply(unidecode)

        assert self.target_col in self.data.columns, f'there is no {self.target_col} in data file!'

        self.NODOUBT_FIRST_NAMES = self._load_first_names(f'{data_dir}/real_{ethnicity}.txt')
        print(self.NODOUBT_FIRST_NAMES)

    @staticmethod
    def _load_first_names(path):
        """
        Return the distinct non-blank lines of `path` (stripped, in file order).

        The file is optional: if it is missing or cannot be opened an empty list
        is returned. Any other error (e.g. an interrupt) is no longer swallowed.
        """
        try:
            with open(path, 'r') as names_file:
                stripped = (line.strip() for line in names_file)
                # dict.fromkeys de-duplicates while keeping a deterministic order
                return list(dict.fromkeys(name for name in stripped if name))
        except OSError:
            return []

    def add_nodoubt_names(self):
        """
        Append every name in `NODOUBT_FIRST_NAMES` to `self.data` as a positive
        example (target column set to 1), reset the index and reshuffle the rows.

        Returns
        -------
        EP
            self, so the call can be chained onto the constructor
        """
        print(f'adding {len(self.NODOUBT_FIRST_NAMES)} first names..')

        if self.NODOUBT_FIRST_NAMES:
            positives = pd.DataFrame({'full_name': self.NODOUBT_FIRST_NAMES,
                                      self.target_col: [1]*len(self.NODOUBT_FIRST_NAMES)})
            combined = pd.concat([self.data, positives], ignore_index=True)
        else:
            # nothing to add: skip the concat so that an all-empty frame cannot
            # change the dtype of the target column
            combined = self.data.reset_index(drop=True)

        self.data = combined.sample(frac=1., random_state=self.random_state)
        return self

In [3]:
# class Selector(BaseEstimator, TransformerMixin):
#     """
#     select a column from a data frame and return it as a list
#     """
#     def __init__(self, col_name):
#         self.col_name = col_name
    
#     def fit(self, x, y=None):
#         return self

#     def transform(self, x):
#         return '_start_' + x[self.col_name] + '_end_'

# class WordCount(BaseEstimator, TransformerMixin):
#     """
#     count the whitespace-separated words in each value
#     """
#     def __init__(self):
#         pass
    
#     def fit(self, x, y=None):
#         return self

#     def transform(self, x):
#         res = x.apply(lambda _: len(_.split())).values.reshape(x.shape[0],1)
#         return res

# class NameLength(BaseEstimator, TransformerMixin):
#     """
#     return the length of the full name
#     """
#     def __init__(self):
#         pass
    
#     def fit(self, x, y=None):
#         return self

#     def transform(self, x):
#         res = x.str.len().values.reshape(x.shape[0],1)
#         return res

# class FirstLast(BaseEstimator, TransformerMixin):
#     """
#     is the first word longer than the last one
#     """
#     def __init__(self):
#         pass
    
#     def fit(self, x, y=None):
#         return self

#     def transform(self, x):
#         res = x.apply(lambda _: np.argmax([len(p) for i, p in enumerate(_.split()) if i in [0,len(_.split())-1]])).values.reshape(x.shape[0],1)
#         return res

In [None]:
if __name__ == '__main__':

    # Train and tune the name classifier for one ethnicity:
    #   full_name -> char/word n-gram counts + hand-made features -> L2 normalisation
    #   -> truncated SVD -> hard-voting ensemble, tuned with a grid search.

    # seed for the split and for every stochastic estimator, so that score
    # differences between grid candidates are not just random-number noise
    SEED = 391

    ic = EP('fijian').add_nodoubt_names()

    # split into the training and testing sets (30% held out, stratified on the target)
    X_train, X_test, y_train, y_test = train_test_split(ic.data.drop([ic.target_col], axis=1),
                                                        ic.data[ic.target_col], test_size=0.3,
                                                        random_state=SEED, stratify=ic.data[ic.target_col])

    pipe = Pipeline([('select_fullname', Selector('full_name')),  # df with 1 column
                     ('features', FeatureUnion(
                         [('char_level', CountVectorizer(strip_accents='ascii', analyzer='char',
                                                         ngram_range=(2, 4))),
                          ('word_level', CountVectorizer(strip_accents='ascii', analyzer='word',
                                                         ngram_range=(1, 3))),
                          ('word_count', WordCount()),
                          ('dict_fname', DictFirstNameFeatures(ic.NODOUBT_FIRST_NAMES)),
                          ('full_name_length', NameLength()),
                          ('firstlast', FirstLast())],
                         # every key must be the name of a transformer listed above
                         # (recent scikit-learn releases reject unknown names);
                         # transformers without an entry are left unweighted
                         transformer_weights={
                             'char_level': 0.8,
                             'word_level': 0.5,
                             'word_count': 1.0})),
                     ('normalise', Normalizer()),
                     ('pca', TruncatedSVD(n_components=120, random_state=SEED)),
                     ('clf', VotingClassifier(
                         estimators=[('sgd', SGDClassifier(max_iter=1000, random_state=SEED)),
                                     ('rf', RandomForestClassifier(class_weight='balanced', random_state=SEED)),
                                     ('svc', SVC(C=0.0001)),
                                     ('abst', AdaBoostClassifier(n_estimators=200, random_state=SEED)),
                                     ('gb', GradientBoostingClassifier(n_estimators=200, random_state=SEED))],
                         voting='hard'))])

    # 1 x 1 x 2 x 2 x 5 x 3 = 60 candidates
    param_grid = {
        # the feature weights can be searched as well, e.g.
        # 'features__transformer_weights': [{'char_level': 0.2, 'word_level': 0.1,
        #                                    'word_count': 0.4, 'full_name_length': 0.5},
        #                                   {'char_level': 0.9, 'word_level': 0.2,
        #                                    'word_count': 0.9, 'full_name_length': 0.3},
        #                                   {'char_level': 0.9, 'word_level': 0.3,
        #                                    'word_count': 0.2, 'full_name_length': 0.7}],
        'clf__sgd__loss': ['modified_huber'],  # also 'hinge', 'perceptron', 'log'
        'clf__sgd__penalty': ['l2'],  # 'elasticnet', 'l1'
        'clf__rf__n_estimators': [100, 200],
        'clf__rf__max_depth': [None, 3],
        'clf__svc__C': [1e-6, 1e-5, 1e-4, 1e-2, 1.],  # default is 1.
        'pca__n_components': [30, 120, 400]}

    grid_search = GridSearchCV(pipe, param_grid=param_grid, verbose=10)

    grid_search.fit(X_train, y_train)

['tarlae', 'asenaca', 'makereta', 'keresi', 'siteri', 'josivini', 'joni', 'alexade', 'alitia', 'karalaini', 'iloilo', 'esiteri', 'elisapeci', 'sovaia', 'miliana', 'iloilovatu', 'semesa', 'apenisa', 'kalara', 'selai', 'ifereimi', 'walota', 'kona', 'maciu', 'elenoa', 'nete', 'lelea', 'ilikimi', 'ilisapeci', 'yasi', 'nimlesh']
adding 31 first names..
Fitting 3 folds for each of 60 candidates, totalling 180 fits
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-06, pca__n_components=30 


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.5s remaining:    0.0s


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-06, pca__n_components=30, score=0.8379888268156425, total=   1.4s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-06, pca__n_components=30 


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.9s remaining:    0.0s


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-06, pca__n_components=30, score=0.8687150837988827, total=   1.3s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-06, pca__n_components=30 


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.4s remaining:    0.0s


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-06, pca__n_components=30, score=0.851123595505618, total=   1.4s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-06, pca__n_components=120 


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.4s remaining:    0.0s


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-06, pca__n_components=120, score=0.8519553072625698, total=   3.9s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-06, pca__n_components=120 


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   12.3s remaining:    0.0s


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-06, pca__n_components=120, score=0.8687150837988827, total=   3.8s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-06, pca__n_components=120 


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   16.1s remaining:    0.0s


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-06, pca__n_components=120, score=0.8623595505617978, total=   3.7s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-06, pca__n_components=400 


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   27.3s remaining:    0.0s


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-06, pca__n_components=400, score=0.8854748603351955, total=  10.9s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-06, pca__n_components=400 


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   38.1s remaining:    0.0s


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-06, pca__n_components=400, score=0.8966480446927374, total=  10.6s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-06, pca__n_components=400 


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   49.3s remaining:    0.0s


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-06, pca__n_components=400, score=0.8735955056179775, total=  10.9s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05, pca__n_components=30 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05, pca__n_components=30, score=0.8296089385474861, total=   1.4s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05, pca__n_components=30 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05, pca__n_components=30, score=0.8547486033519553, total=   1.4s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05, pca__n_components=30 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05, pca__n_components=30, score=0.8539325842696629, total=   1.3s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05, pca__n_components=120 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05, pca__n_components=120, score=0.8519553072625698, total=   3.6s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05, pca__n_components=120 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05, pca__n_components=120, score=0.88268156424581, total=   3.6s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05, pca__n_components=120 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05, pca__n_components=120, score=0.8679775280898876, total=   3.6s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05, pca__n_components=400 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05, pca__n_components=400, score=0.8743016759776536, total=  11.0s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05, pca__n_components=400 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05, pca__n_components=400, score=0.888268156424581, total=  11.4s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05, pca__n_components=400 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1e-05, pca__n_components=400, score=0.8792134831460674, total=  11.5s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001, pca__n_components=30 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001, pca__n_components=30, score=0.8379888268156425, total=   2.0s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001, pca__n_components=30 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001, pca__n_components=30, score=0.8659217877094972, total=   1.5s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001, pca__n_components=30 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001, pca__n_components=30, score=0.8651685393258427, total=   1.5s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001, pca__n_components=120 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001, pca__n_components=120, score=0.8659217877094972, total=   3.7s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001, pca__n_components=120 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001, pca__n_components=120, score=0.8798882681564246, total=   3.8s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001, pca__n_components=120 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001, pca__n_components=120, score=0.8595505617977528, total=   3.7s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001, pca__n_components=400 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001, pca__n_components=400, score=0.8687150837988827, total=  11.8s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001, pca__n_components=400 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001, pca__n_components=400, score=0.8743016759776536, total=  11.8s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001, pca__n_components=400 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.0001, pca__n_components=400, score=0.8792134831460674, total=  11.3s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01, pca__n_components=30 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01, pca__n_components=30, score=0.8435754189944135, total=   1.3s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01, pca__n_components=30 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01, pca__n_components=30, score=0.840782122905028, total=   1.3s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01, pca__n_components=30 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01, pca__n_components=30, score=0.848314606741573, total=   1.3s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01, pca__n_components=120 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01, pca__n_components=120, score=0.8575418994413407, total=   3.6s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01, pca__n_components=120 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01, pca__n_components=120, score=0.8743016759776536, total=   3.8s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01, pca__n_components=120 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01, pca__n_components=120, score=0.8595505617977528, total=   3.8s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01, pca__n_components=400 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01, pca__n_components=400, score=0.8659217877094972, total=  11.2s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01, pca__n_components=400 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01, pca__n_components=400, score=0.888268156424581, total=  11.5s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01, pca__n_components=400 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=0.01, pca__n_components=400, score=0.8848314606741573, total=  12.8s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1.0, pca__n_components=30 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1.0, pca__n_components=30, score=0.8491620111731844, total=   1.5s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1.0, pca__n_components=30 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1.0, pca__n_components=30, score=0.8798882681564246, total=   1.4s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1.0, pca__n_components=30 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1.0, pca__n_components=30, score=0.8455056179775281, total=   1.4s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1.0, pca__n_components=120 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1.0, pca__n_components=120, score=0.8715083798882681, total=   3.9s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1.0, pca__n_components=120 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1.0, pca__n_components=120, score=0.8659217877094972, total=   3.6s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1.0, pca__n_components=120 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1.0, pca__n_components=120, score=0.848314606741573, total=   3.6s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1.0, pca__n_components=400 


  if diff:
  if diff:


[CV]  clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1.0, pca__n_components=400, score=0.88268156424581, total=  11.0s
[CV] clf__rf__max_depth=None, clf__rf__n_estimators=100, clf__sgd__loss=modified_huber, clf__sgd__penalty=l2, clf__svc__C=1.0, pca__n_components=400 


  if diff:


In [None]:
# precision / recall / f1 of the tuned pipeline on the held-out test split
print(classification_report(y_true=y_test,
                            y_pred=grid_search.predict(X_test)))

In [None]:
# hand-picked names for a quick sanity check of the tuned model
test_data = pd.DataFrame({'full_name': [
    'han walota', 'xi', 'babak', 'igor souza', 'yash patel', 'david perron',
    'andrzej woods', 'otto floms', 'suzuki', 'iloilovatu',
    'edilson', 'kawasaki', 'silva', 'james tran',
    'bo nguyen', 'zlatan ibrahimovic', 'ronaldo barbosa', 'natalia klanchinkova',
    'bob', 'keresi right', 'kalara', 'kona reed', 'andreas vlachos', 'con', 'nemanja vidic']})

# turn the truthy / falsy predicted labels into a readable 'yes' / 'no' column
test_data['prediction'] = ['yes' if label else 'no'
                           for label in grid_search.predict(test_data)]

In [None]:
# show the hand-picked names next to their 'yes' / 'no' predictions
test_data

In [None]:
# parameter combination with the best cross-validated score in the grid search
grid_search.best_params_

In [None]:
# list every test row the tuned model got wrong
for row, predicted, actual in zip(X_test.values, grid_search.predict(X_test), y_test.values):
    if predicted == actual:
        continue
    # row[0] is the first column of X_test
    print(f'{row[0].upper()} predicted {predicted} but it is {actual}')

In [None]:
# the pipeline refit with the best parameters found by the grid search
grid_search.best_estimator_

In [None]:
# sklearn.externals.joblib only exists in old scikit-learn releases; use the
# standalone joblib package and fall back to the vendored copy for old installs
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [None]:
# persist the tuned pipeline (lightly compressed) for later reuse
joblib.dump(
    grid_search.best_estimator_,
    'fijian_model.pkl',
    compress=1,
)

In [None]:
# NOTE(review): this loads 'russian_model.pkl', not the 'fijian_model.pkl' written
# by the dump cell - presumably a model saved by an earlier run; confirm it is intended.
# joblib.load unpickles the file, so only load model files from a trusted source.
m = joblib.load('russian_model.pkl')