In [34]:
import json
import pandas as pd
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report, accuracy_score
from pprint import pprint
from string import punctuation
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

In [35]:
class IranianNames:
    """
    Load the labelled names data set into ``self.data`` and print the share
    of positive examples.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read. Defaults to the location the notebook was
        originally run against, so ``IranianNames()`` keeps working unchanged;
        pass another path to run the notebook on a different machine.

    Attributes
    ----------
    data : pandas.DataFrame
        The CSV contents. It must contain an ``is_polish`` column; rows whose
        value equals 1 are counted as positives in the printed summary.

    Raises
    ------
    KeyError
        If the file has no ``is_polish`` column.
    """
    # NOTE(review): the class name and the printed label say "iranian" while
    # the file name and the label column say "polish" -- confirm which one is
    # intended before renaming anything.
    DEFAULT_PATH = '/Users/ik/Data/names/training-polish.csv'

    def __init__(self, path=DEFAULT_PATH):
        self.data = pd.read_csv(path)
        # number of rows labelled 1 (the positive class)
        n_positive = int(self.data['is_polish'].eq(1).sum())
        print(f'iranian names: {n_positive}/{len(self.data)}')

In [36]:
class Selector(BaseEstimator, TransformerMixin):
    """
    Pipeline step that pulls a single column out of its input and wraps every
    value in explicit ``_start_`` / ``_end_`` boundary markers.

    The result is whatever ``'_start_' + x[col_name] + '_end_'`` evaluates to
    for the given input, not a plain list.
    """
    def __init__(self, col_name):
        # key of the column that transform() extracts
        self.col_name = col_name

    def fit(self, x, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, x):
        """Return the selected column with the boundary markers attached."""
        column = x[self.col_name]
        with_prefix = '_start_' + column
        return with_prefix + '_end_'

class WordLengths(BaseEstimator, TransformerMixin):
    """
    Pipeline step that replaces each entry of its input with the number of
    whitespace-separated words it contains, returned as a single-column
    2-D array (one row per entry).
    """
    def __init__(self):
        # no hyper-parameters to configure
        pass

    def fit(self, x, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, x):
        """Count the words of every entry of ``x`` and shape them as (n, 1)."""
        # assumes x offers .apply and .shape and holds strings -- TODO confirm
        def count_words(text):
            return len(text.split())

        n_rows = x.shape[0]
        word_counts = x.apply(count_words)
        return word_counts.values.reshape(n_rows, 1)

In [37]:
if __name__ == '__main__':

    # Single seed shared by every stochastic step (the split AND the SGD
    # classifier) so that a fresh run reproduces the same scores and model.
    SEED = 391

    ic = IranianNames()

    # split into the training and testing sets (80/20, stratified on the label)
    X_train, X_test, y_train, y_test = train_test_split(ic.data.drop(['is_polish'], axis=1),
                                                        ic.data['is_polish'],  test_size=0.2,
                                                        random_state=SEED, stratify=ic.data['is_polish'])

    # Feature extraction: the marker-wrapped 'full_name' column is fed to three
    # extractors whose outputs are concatenated by FeatureUnion:
    #   char_level   - character 1..4-gram counts
    #   word_level   - word uni-/bi-gram counts
    #   word_lengths - number of words in the name
    pipe = Pipeline([('select_fullname', Selector('full_name')),  # 'full_name' column only
                     ('features', FeatureUnion(
                         [('char_level', CountVectorizer(strip_accents='ascii', analyzer='char',
                                                          ngram_range=(1, 4))),
                         ('word_level', CountVectorizer(strip_accents='ascii', analyzer='word',
                                                       ngram_range=(1,2))),
                         ('word_lengths', WordLengths())])
                     ),
                    # random_state added: SGD shuffles the training data, so without
                    # a seed the fitted weights differ on every run
                    ('clf', SGDClassifier(max_iter=1000, random_state=SEED))])

    # NOTE(review): newer scikit-learn releases renamed the 'log' loss to
    # 'log_loss'; 'log' is kept here to match the version that produced the
    # recorded output -- update it when upgrading.
    param_grid = dict(clf__loss=['hinge', 'modified_huber', 'log', 'perceptron'],
                     clf__penalty=['l1','l2','elasticnet'])

    # cv=3 pins the fold count shown in the recorded output ("Fitting 3 folds")
    # instead of relying on the library default.
    grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=3, verbose=10)

    grid_search.fit(X_train, y_train)

iranian names: 2484/9736
Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] clf__loss=hinge, clf__penalty=l1 ................................
[CV]  clf__loss=hinge, clf__penalty=l1, score=0.9715055833654216, total=   4.5s
[CV] clf__loss=hinge, clf__penalty=l1 ................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.8s remaining:    0.0s


[CV]  clf__loss=hinge, clf__penalty=l1, score=0.9687981510015409, total=   4.7s
[CV] clf__loss=hinge, clf__penalty=l1 ................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    9.8s remaining:    0.0s


[CV]  clf__loss=hinge, clf__penalty=l1, score=0.9695568400770713, total=   4.7s
[CV] clf__loss=hinge, clf__penalty=l2 ................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   14.8s remaining:    0.0s


[CV]  clf__loss=hinge, clf__penalty=l2, score=0.9776665383134386, total=   1.6s
[CV] clf__loss=hinge, clf__penalty=l2 ................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   16.8s remaining:    0.0s


[CV]  clf__loss=hinge, clf__penalty=l2, score=0.975346687211094, total=   1.7s
[CV] clf__loss=hinge, clf__penalty=l2 ................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   18.8s remaining:    0.0s


[CV]  clf__loss=hinge, clf__penalty=l2, score=0.9761078998073218, total=   1.6s
[CV] clf__loss=hinge, clf__penalty=elasticnet ........................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   20.7s remaining:    0.0s


[CV]  clf__loss=hinge, clf__penalty=elasticnet, score=0.9788217173661917, total=   4.5s
[CV] clf__loss=hinge, clf__penalty=elasticnet ........................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   25.5s remaining:    0.0s


[CV]  clf__loss=hinge, clf__penalty=elasticnet, score=0.9722650231124808, total=   4.4s
[CV] clf__loss=hinge, clf__penalty=elasticnet ........................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   30.2s remaining:    0.0s


[CV]  clf__loss=hinge, clf__penalty=elasticnet, score=0.9772639691714836, total=   4.8s
[CV] clf__loss=modified_huber, clf__penalty=l1 .......................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   35.3s remaining:    0.0s


[CV]  clf__loss=modified_huber, clf__penalty=l1, score=0.9726607624181748, total=   4.6s
[CV] clf__loss=modified_huber, clf__penalty=l1 .......................
[CV]  clf__loss=modified_huber, clf__penalty=l1, score=0.9699537750385208, total=   4.5s
[CV] clf__loss=modified_huber, clf__penalty=l1 .......................
[CV]  clf__loss=modified_huber, clf__penalty=l1, score=0.9714836223506744, total=   4.7s
[CV] clf__loss=modified_huber, clf__penalty=l2 .......................
[CV]  clf__loss=modified_huber, clf__penalty=l2, score=0.9776665383134386, total=   1.7s
[CV] clf__loss=modified_huber, clf__penalty=l2 .......................
[CV]  clf__loss=modified_huber, clf__penalty=l2, score=0.9699537750385208, total=   1.5s
[CV] clf__loss=modified_huber, clf__penalty=l2 .......................
[CV]  clf__loss=modified_huber, clf__penalty=l2, score=0.976878612716763, total=   1.5s
[CV] clf__loss=modified_huber, clf__penalty=elasticnet ...............
[CV]  clf__loss=modified_huber, clf__pena

[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:  2.3min finished


In [38]:
# Score the fitted grid search on the held-out test split.
test_predictions = grid_search.predict(X_test)
print(classification_report(y_test, test_predictions))

             precision    recall  f1-score   support

          0       0.98      0.99      0.98      1451
          1       0.96      0.95      0.96       497

avg / total       0.98      0.98      0.98      1948



In [39]:
# Spot-check the fitted model on a few hand-picked names.
sample_names = pd.DataFrame({
    'full_name': ['mohammad kumar', 'mehri', 'maryam ziaei nejad', 'hosein', 'QUI CHAN', 'babak'],
})
grid_search.predict(sample_names)

array([0, 0, 1, 0, 0, 1])