In [44]:
import json
import pandas as pd
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report, accuracy_score
from pprint import pprint
from string import punctuation
from sklearn.svm import SVC

In [45]:
class IranianClassifier:
    
    def __init__(self):
        
        self.data = pd.read_csv('/Users/ik/Data/names/training-iranian.csv')
        
    def _into_words(self):
        """
        split full names into words and add these to the dataset with the corresponding labels
        """
        _ir = set()
        _nonir = set()
        
        for row in self.data.iterrows():
            _ir.add(row[1]['full_name'].split()[0]) if row[1].is_iranian == 1 else _nonir.update(set(row[1]['full_name'].split()))
        
        # iranian name words not found in the non-iranian name words
        self.data = pd.concat([self.data,
                               pd.DataFrame({'full_name': list(_ir - _nonir), 'is_iranian': 1}),
                                  pd.DataFrame({'full_name': list(_nonir - _ir), 'is_iranian': 0})]).sample(frac=1.)
        
        return self

In [63]:
class Selector(BaseEstimator, TransformerMixin):
    """
    select a columns from a data frame
    """
    
    def __init__(self, col_name):
        self.col_name = col_name
    
    def fit(self, x, y=None):
        return self

    def transform(self, x):
        return x[self.col_name]
    
class LastName(BaseEstimator, TransformerMixin):
    """
    Extract features from a Series of first names
    """
    def _ngram(self, s, n):
        """
        extract n-gram counts from string s
        """
        if len(s) < n:
            return {}
        
        f = defaultdict(int)
        
        for i, c in enumerate(s, 1):
            if i + n <= len(s):
                ngram = s[i-1: i+n]            
                if ngram in f:
                    f[ngram] += 1 
                else:
                    f[ngram] = 1
        return f
    
    def _last_n(self, s, n):
        """
        extract ending (last n letters) from s
        """
        if len(s[-n:]) == n:
            return s[-n:]
                    
    def _lastname_features(self, s):
        """
        take a string presumably first name and extract features as a dictionary
        """
        
        # keep features here
        fn_feats = defaultdict()
        
        # if s not even a string, no features
        if not isinstance(s, str):
            return fn_feats
        
        if len(s) < 2:
            return fn_feats
        
        for p in (set(punctuation) - {"'","-"}):
            s = s.lower().replace(p,'')
        
        word1 = s.split()[0]
        
        # if s is one letter (maybe an initial), no features
        if len(s) < 2:
            return fn_feats
        
        # prefix to name features
        pref = 'lname_'
        # how many words in first name?
        fn_feats['num_' + pref + 'nwords'] = len(s.split())
        # first letter of the first word
        fn_feats['nom_' + pref + 'first_letter'] = s[0]
        # length of first word
        fn_feats['num_' + pref + 'len'] = len(s.split()[0])
        # last letter of the first word
        fn_feats['nom_' + pref + 'last_letter'] = s.split()[0][-1]
        # words themselves
        for j, w in enumerate(s.split(), 1):
            fn_feats['nom_' + pref + 'word_' + str(j)] = w
        # any hyphens?
        if '-' in s:
            fn_feats['bin_' + pref + 'hyphen'] = 1
        # any apostrophs?
        if "'" in s:
            fn_feats['bin_' + pref + 'apostr'] = 1
        # letter counts (first word)
        for c in Counter(word1).items():
            fn_feats['num_' + pref + 'letter_' + c[0]] = c[1]
        
        # ending - last n letters
        for n in range(1,5):
            ending = self._last_n(word1, n)
            if ending:
                fn_feats['bin_' + pref + str(n) + '_last_lettes_' + ending] = 1 
            
        for n in range(1,5):
            for gr, v in self._ngram(word1, n).items():
                fn_feats['num_' + pref + str(n)+ '_gram_' + gr] = v
        
        return fn_feats
        

    def fit(self, x, y=None):
        return self

    def transform(self, x):
        
        res =  pd.DataFrame.from_dict({t[0]: t[1] for t in 
                                       zip(x.index.values, x.apply(lambda x: self._lastname_features(x)))}, 
                                          orient='index').fillna(0)
        # create dummy variables for nominal features
        return pd.get_dummies(res, columns=[c for c in res.columns if c.startswith('nom_')])

In [64]:
if __name__ == '__main__':
    
    ic = IranianClassifier()._into_words()
    
    ln = LastName()
    

In [65]:
    # split into the training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(ic.data.drop(['is_iranian'], axis=1), 
                                                        ic.data['is_iranian'],  test_size=0.2, 
                                                        random_state=391, stratify=ic.data['is_iranian'])
    

In [66]:
    pipe = Pipeline([('fe_union', Pipeline([('select_last_name', Selector('full_name')),
                                            ('last_name_features', LastName())])),
                    ('classifier', SVC(gamma=2, C=1))])
    
    pipe.fit(X_train, y_train)

ValueError: Found input variables with inconsistent numbers of samples: [2728, 4620]

In [60]:
ln = LastName()

In [67]:
ln.fit_transform(X_train['full_name'])

Unnamed: 0,num_lname_nwords,num_lname_len,bin_lname_hyphen,num_lname_letter_m,num_lname_letter_i,num_lname_letter_l,num_lname_letter_t,num_lname_letter_o,num_lname_letter_n,bin_lname_1_last_lettes_n,...,nom_lname_word_4_velden,nom_lname_word_4_wong,nom_lname_word_4_wyk,nom_lname_word_4_yeung,nom_lname_word_4_zade,nom_lname_word_4_zadegan,nom_lname_word_4_zadeh,nom_lname_word_5_0,nom_lname_word_5_dyckhoff,nom_lname_word_5_segrais
0,1,6,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0,0,0,0,0,0,0,1,0,0
1,5,7,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
2,2,10,0.0,0.0,1.0,2.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
3,2,5,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,1,0,0
4,1,6,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
5,2,5,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
6,2,5,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
7,1,10,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,...,0,0,0,0,0,0,0,1,0,0
8,1,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
9,2,10,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0


In [53]:
ic.data.head()

Unnamed: 0,full_name,is_iranian
342,marija,0
1033,jurcevic,0
1115,yayi,0
1595,bochang,0
1373,wright,0
