In [25]:
import json
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report, accuracy_score
from pprint import pprint
from string import punctuation
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import Normalizer

In [26]:
class IranianNames:
    """
    load the labelled names csv into a data frame and print the class balance

    path:       csv file to read; defaults to the location this notebook was originally run against,
                pass another path to run it anywhere else
    label_col:  name of the target column; rows whose label equals 1 are counted as positives

    after construction the loaded frame is available as self.data
    """
    # NOTE(review): the class name and the printed message say "iranian" while the default file and
    # label column say "polish" - confirm which one is intended before renaming anything.

    def __init__(self, path='/Users/ik/Data/names/training-polish.csv', label_col='is_polish'):
        self.data = pd.read_csv(path)
        # Counter over the label column; key 1 is the positive class
        positives = Counter(self.data[label_col])[1]
        print(f'iranian names: {positives}/{len(self.data)}')

In [27]:
class Selector(BaseEstimator, TransformerMixin):
    """
    pick one column out of a data frame and wrap each value in boundary markers

    transform returns '_start_' + column + '_end_'; the markers are glued straight onto the
    value with no separating space
    """
    def __init__(self, col_name):
        # name of the column that transform pulls out of its input
        self.col_name = col_name

    def fit(self, x, y=None):
        # stateless: nothing is learned from the data
        return self

    def transform(self, x):
        column = x[self.col_name]
        with_prefix = '_start_' + column
        return with_prefix + '_end_'

class WordLengths(BaseEstimator, TransformerMixin):
    """
    number of whitespace-separated words in each string, returned as a single-column array
    """
    def __init__(self):
        # no hyper-parameters
        pass

    def fit(self, x, y=None):
        # stateless: nothing is learned from the data
        return self

    def transform(self, x):
        def count_words(text):
            return len(text.split())

        counts = x.apply(count_words).values
        # one row per input string, one column holding the word count
        return counts.reshape(x.shape[0], 1)

class FirstLast(BaseEstimator, TransformerMixin):
    """
    is the last word strictly longer than the first one

    for every input string the feature is
        1  when the last whitespace-separated word has more characters than the first,
        0  otherwise (first word longer, equal lengths, a single word, or no words at all)

    the result is a single-column array with one row per input string

    NOTE(review): when this transformer is fed the output of Selector, '_start_' (7 characters) is
    attached to the first word and '_end_' (5 characters) to the last one, so the comparison is
    shifted by two characters in favour of the first word - confirm that this is intended.
    """
    def __init__(self):
        # no hyper-parameters
        pass

    def fit(self, x, y=None):
        # stateless: nothing is learned from the data
        return self

    def transform(self, x):
        def last_is_longer(text):
            # split once instead of once per token
            words = text.split()
            if not words:
                # empty / whitespace-only input: there is nothing to compare
                return 0
            # strict comparison: a tie counts as "first", same as argmax picking the first maximum
            return int(len(words[-1]) > len(words[0]))

        return x.apply(last_is_longer).values.reshape(x.shape[0], 1)

In [35]:
if __name__ == '__main__':

    # one seed shared by the split and the classifier, so that re-running the cell gives the same scores
    SEED = 391

    ic = IranianNames()

    # split into the training and testing sets (80/20, stratified on the label)
    X_train, X_test, y_train, y_test = train_test_split(ic.data.drop(['is_polish'], axis=1),
                                                        ic.data['is_polish'], test_size=0.2,
                                                        random_state=SEED, stratify=ic.data['is_polish'])

    # NOTE(review): 'firstlast' has no entry in any transformer_weights dict below - presumably it is
    # passed through unweighted; confirm that this is intended.
    pipe = Pipeline([('select_fullname', Selector('full_name')),  # df with 1 column
                     ('features', FeatureUnion(
                         [('char_level', CountVectorizer(strip_accents='ascii', analyzer='char',
                                                         ngram_range=(1, 4))),
                          ('word_level', CountVectorizer(strip_accents='ascii', analyzer='word',
                                                         ngram_range=(1, 2))),
                          ('word_lengths', WordLengths()),
                          ('firstlast', FirstLast())],
                         transformer_weights={
                             'char_level': 0.8,
                             'word_level': 0.5,
                             'word_lengths': 1.0,
                         })
                      ),
                     ('normalise', Normalizer()),
                     # SGD shuffles the training data; without a seed every run scores differently
                     ('clf', SGDClassifier(max_iter=1000, random_state=SEED))])

    # NOTE(review): newer scikit-learn releases call the 'log' loss 'log_loss'; it is left as is here
    # because the captured output comes from an older release - update when upgrading.
    param_grid = {'features__transformer_weights': [{'char_level': 0.2,
                                                     'word_level': 0.1,
                                                     'word_lengths': 0.4},
                                                    {'char_level': 0.4,
                                                     'word_level': 0.6,
                                                     'word_lengths': 0.8},
                                                    {'char_level': 0.9,
                                                     'word_level': 0.3,
                                                     'word_lengths': 0.5}],
                  'clf__loss': ['hinge', 'modified_huber', 'log', 'perceptron'],
                  'clf__penalty': ['l1', 'l2', 'elasticnet']}

    # cv is pinned to the 3 folds shown in the captured log instead of relying on the library default
    grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=3, verbose=10)

    grid_search.fit(X_train, y_train)

iranian names: 2484/9736
Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV] clf__loss=hinge, clf__penalty=l1, features__transformer_weights={'char_level': 0.2, 'word_level': 0.1, 'word_lengths': 0.4} 
[CV]  clf__loss=hinge, clf__penalty=l1, features__transformer_weights={'char_level': 0.2, 'word_level': 0.1, 'word_lengths': 0.4}, score=0.973815941470928, total=   3.8s
[CV] clf__loss=hinge, clf__penalty=l1, features__transformer_weights={'char_level': 0.2, 'word_level': 0.1, 'word_lengths': 0.4} 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.3s remaining:    0.0s


[CV]  clf__loss=hinge, clf__penalty=l1, features__transformer_weights={'char_level': 0.2, 'word_level': 0.1, 'word_lengths': 0.4}, score=0.9714946070878274, total=   4.0s
[CV] clf__loss=hinge, clf__penalty=l1, features__transformer_weights={'char_level': 0.2, 'word_level': 0.1, 'word_lengths': 0.4} 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    8.8s remaining:    0.0s


[CV]  clf__loss=hinge, clf__penalty=l1, features__transformer_weights={'char_level': 0.2, 'word_level': 0.1, 'word_lengths': 0.4}, score=0.9703275529865125, total=   3.7s
[CV] clf__loss=hinge, clf__penalty=l1, features__transformer_weights={'char_level': 0.4, 'word_level': 0.6, 'word_lengths': 0.8} 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   13.1s remaining:    0.0s


[CV]  clf__loss=hinge, clf__penalty=l1, features__transformer_weights={'char_level': 0.4, 'word_level': 0.6, 'word_lengths': 0.8}, score=0.9745860608394301, total=   3.8s
[CV] clf__loss=hinge, clf__penalty=l1, features__transformer_weights={'char_level': 0.4, 'word_level': 0.6, 'word_lengths': 0.8} 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   17.4s remaining:    0.0s


[CV]  clf__loss=hinge, clf__penalty=l1, features__transformer_weights={'char_level': 0.4, 'word_level': 0.6, 'word_lengths': 0.8}, score=0.9734206471494607, total=   3.9s
[CV] clf__loss=hinge, clf__penalty=l1, features__transformer_weights={'char_level': 0.4, 'word_level': 0.6, 'word_lengths': 0.8} 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   21.9s remaining:    0.0s


[CV]  clf__loss=hinge, clf__penalty=l1, features__transformer_weights={'char_level': 0.4, 'word_level': 0.6, 'word_lengths': 0.8}, score=0.9722543352601156, total=   4.0s
[CV] clf__loss=hinge, clf__penalty=l1, features__transformer_weights={'char_level': 0.9, 'word_level': 0.3, 'word_lengths': 0.5} 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   26.2s remaining:    0.0s


[CV]  clf__loss=hinge, clf__penalty=l1, features__transformer_weights={'char_level': 0.9, 'word_level': 0.3, 'word_lengths': 0.5}, score=0.9726607624181748, total=   3.5s
[CV] clf__loss=hinge, clf__penalty=l1, features__transformer_weights={'char_level': 0.9, 'word_level': 0.3, 'word_lengths': 0.5} 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   30.2s remaining:    0.0s


[CV]  clf__loss=hinge, clf__penalty=l1, features__transformer_weights={'char_level': 0.9, 'word_level': 0.3, 'word_lengths': 0.5}, score=0.9718798151001541, total=   3.8s
[CV] clf__loss=hinge, clf__penalty=l1, features__transformer_weights={'char_level': 0.9, 'word_level': 0.3, 'word_lengths': 0.5} 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   34.5s remaining:    0.0s


[CV]  clf__loss=hinge, clf__penalty=l1, features__transformer_weights={'char_level': 0.9, 'word_level': 0.3, 'word_lengths': 0.5}, score=0.9714836223506744, total=   3.9s
[CV] clf__loss=hinge, clf__penalty=l2, features__transformer_weights={'char_level': 0.2, 'word_level': 0.1, 'word_lengths': 0.4} 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   38.9s remaining:    0.0s


[CV]  clf__loss=hinge, clf__penalty=l2, features__transformer_weights={'char_level': 0.2, 'word_level': 0.1, 'word_lengths': 0.4}, score=0.9765113592606854, total=   1.9s
[CV] clf__loss=hinge, clf__penalty=l2, features__transformer_weights={'char_level': 0.2, 'word_level': 0.1, 'word_lengths': 0.4} 
[CV]  clf__loss=hinge, clf__penalty=l2, features__transformer_weights={'char_level': 0.2, 'word_level': 0.1, 'word_lengths': 0.4}, score=0.9757318952234206, total=   1.9s
[CV] clf__loss=hinge, clf__penalty=l2, features__transformer_weights={'char_level': 0.2, 'word_level': 0.1, 'word_lengths': 0.4} 
[CV]  clf__loss=hinge, clf__penalty=l2, features__transformer_weights={'char_level': 0.2, 'word_level': 0.1, 'word_lengths': 0.4}, score=0.9749518304431599, total=   2.3s
[CV] clf__loss=hinge, clf__penalty=l2, features__transformer_weights={'char_level': 0.4, 'word_level': 0.6, 'word_lengths': 0.8} 
[CV]  clf__loss=hinge, clf__penalty=l2, features__transformer_weights={'char_level': 0.4, 'word_l

[CV]  clf__loss=modified_huber, clf__penalty=l2, features__transformer_weights={'char_level': 0.2, 'word_level': 0.1, 'word_lengths': 0.4}, score=0.9776665383134386, total=   1.8s
[CV] clf__loss=modified_huber, clf__penalty=l2, features__transformer_weights={'char_level': 0.2, 'word_level': 0.1, 'word_lengths': 0.4} 
[CV]  clf__loss=modified_huber, clf__penalty=l2, features__transformer_weights={'char_level': 0.2, 'word_level': 0.1, 'word_lengths': 0.4}, score=0.9768875192604006, total=   2.0s
[CV] clf__loss=modified_huber, clf__penalty=l2, features__transformer_weights={'char_level': 0.2, 'word_level': 0.1, 'word_lengths': 0.4} 
[CV]  clf__loss=modified_huber, clf__penalty=l2, features__transformer_weights={'char_level': 0.2, 'word_level': 0.1, 'word_lengths': 0.4}, score=0.9764932562620424, total=   2.2s
[CV] clf__loss=modified_huber, clf__penalty=l2, features__transformer_weights={'char_level': 0.4, 'word_level': 0.6, 'word_lengths': 0.8} 
[CV]  clf__loss=modified_huber, clf__penalt

[CV]  clf__loss=log, clf__penalty=l1, features__transformer_weights={'char_level': 0.9, 'word_level': 0.3, 'word_lengths': 0.5}, score=0.9622350674373795, total=   4.0s
[CV] clf__loss=log, clf__penalty=l2, features__transformer_weights={'char_level': 0.2, 'word_level': 0.1, 'word_lengths': 0.4} 
[CV]  clf__loss=log, clf__penalty=l2, features__transformer_weights={'char_level': 0.2, 'word_level': 0.1, 'word_lengths': 0.4}, score=0.9688101655756642, total=   2.5s
[CV] clf__loss=log, clf__penalty=l2, features__transformer_weights={'char_level': 0.2, 'word_level': 0.1, 'word_lengths': 0.4} 
[CV]  clf__loss=log, clf__penalty=l2, features__transformer_weights={'char_level': 0.2, 'word_level': 0.1, 'word_lengths': 0.4}, score=0.9622496147919877, total=   2.4s
[CV] clf__loss=log, clf__penalty=l2, features__transformer_weights={'char_level': 0.2, 'word_level': 0.1, 'word_lengths': 0.4} 
[CV]  clf__loss=log, clf__penalty=l2, features__transformer_weights={'char_level': 0.2, 'word_level': 0.1, 'w

[CV]  clf__loss=perceptron, clf__penalty=l1, features__transformer_weights={'char_level': 0.9, 'word_level': 0.3, 'word_lengths': 0.5}, score=0.9529865125240847, total=   3.1s
[CV] clf__loss=perceptron, clf__penalty=l2, features__transformer_weights={'char_level': 0.2, 'word_level': 0.1, 'word_lengths': 0.4} 
[CV]  clf__loss=perceptron, clf__penalty=l2, features__transformer_weights={'char_level': 0.2, 'word_level': 0.1, 'word_lengths': 0.4}, score=0.9772814786291876, total=   1.8s
[CV] clf__loss=perceptron, clf__penalty=l2, features__transformer_weights={'char_level': 0.2, 'word_level': 0.1, 'word_lengths': 0.4} 
[CV]  clf__loss=perceptron, clf__penalty=l2, features__transformer_weights={'char_level': 0.2, 'word_level': 0.1, 'word_lengths': 0.4}, score=0.9711093990755008, total=   1.7s
[CV] clf__loss=perceptron, clf__penalty=l2, features__transformer_weights={'char_level': 0.2, 'word_level': 0.1, 'word_lengths': 0.4} 
[CV]  clf__loss=perceptron, clf__penalty=l2, features__transformer_

[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:  6.9min finished


In [36]:
# score the fitted grid search on the held-out test split
test_predictions = grid_search.predict(X_test)
print(classification_report(y_test, test_predictions))

             precision    recall  f1-score   support

          0       0.98      0.99      0.98      1451
          1       0.97      0.94      0.95       497

avg / total       0.98      0.98      0.98      1948



In [37]:
# spot-check a few hand-picked names; the frame needs the 'full_name' column the pipeline selects
sample_names = ['mohammad kumar', 'mehri', 'nejad myriam', 'hosein', 'QUI CHAN', 'babak']
grid_search.predict(pd.DataFrame({'full_name': sample_names}))

array([0, 0, 0, 0, 0, 1])

In [38]:
# show the parameter combination that the grid search selected
grid_search.best_params_

{'clf__loss': 'modified_huber',
 'clf__penalty': 'elasticnet',
 'features__transformer_weights': {'char_level': 0.2,
  'word_lengths': 0.4,
  'word_level': 0.1}}