In [27]:
import json
import pandas as pd
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report, accuracy_score
from pprint import pprint
from string import punctuation
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

In [22]:
class IranianNames:
    
    def __init__(self):
        
        self.data = pd.read_csv('/Users/ik/Data/names/training-iranian.csv')
        
    def _into_words(self):
        """
        split full names into words and add these to the dataset with the corresponding labels
        """
        _ir = set()
        _nonir = set()
        
        for row in self.data.iterrows():
            _ir.add(row[1]['full_name'].split()[0]) if row[1].is_iranian == 1 else _nonir.update(set(row[1]['full_name'].split()))
        
        # iranian name words not found in the non-iranian name words
        self.data = pd.concat([self.data,
                               pd.DataFrame({'full_name': list(_ir - _nonir), 'is_iranian': 1}),
                                  pd.DataFrame({'full_name': list(_nonir - _ir), 'is_iranian': 0})]).reset_index(drop=True).sample(frac=1.)
        
        return self

In [23]:
class Selector(BaseEstimator, TransformerMixin):
    """
    select a columns from a data frame
    """
    
    def __init__(self, col_name):
        self.col_name = col_name
    
    def fit(self, x, y=None):
        return self

    def transform(self, x):
        return x.loc[:, self.col_name]

class Endings(BaseEstimator, TransformerMixin):
    
    def fit(self, x, y=None):
        
        return self

    def transform(self, x):
        res = x.apply(lambda x: x[-2:])
        res.name = 'ending-2'
=
        return pd.get_dummies(res, prefix='end')
        
class LastName(BaseEstimator, TransformerMixin):
    """
    Extract features from a Series of first names
    """
    def _ngram(self, s, n):
        """
        extract n-gram counts from string s
        """
        
        f = defaultdict(int)
        
        if len(s) < n:
            return f   
        
        for i, c in enumerate(s, 1):
            if i + n <= len(s):
                ngram = s[i-1: i+n]            
                if ngram in f:
                    f[ngram] += 1 
                else:
                    f[ngram] = 1
        return f
    
    def _last_n(self, s, n):
        """
        extract ending (last n letters) from s
        """
        if len(s[-n:]) == n:
            return s[-n:]
                    
    def _lastname_features(self, s):
        """
        take a string presumably first name and extract features as a dictionary
        """
        
        # keep features here
        fn_feats = defaultdict()
        
        # if s not even a string, no features
        if not isinstance(s, str):
            print(f'{s} not a string')
            return fn_feats
        
        for p in (set(punctuation) - {"'","-"}):
            s = s.lower().replace(p,'')
        
        word1 = s.split()[0]
        
        # prefix to name features
        pref = 'lname_'
        # how many words in first name?
        fn_feats['num_' + pref + 'nwords'] = len(s.split()) if len(s) > 1 else 0
        # first letter of the first word
        fn_feats['nom_' + pref + 'first_letter'] = s[0]
        # length of first word
        fn_feats['num_' + pref + 'len'] = len(s.split()[0]) if len(s) > 1 else 0
        # last letter of the first word
        fn_feats['nom_' + pref + 'last_letter'] = s.split()[0][-1] if len(s) > 1 else 0
        # words themselves
        if len(s) > 1:
            for j, w in enumerate(s.split(), 1):
                fn_feats['nom_' + pref + 'word_' + str(j)] = w 
        # any hyphens?
        if len(s) > 1:
            if '-' in s:
                fn_feats['bin_' + pref + 'hyphen'] = 1
            # any apostrophs?
            if "'" in s:
                fn_feats['bin_' + pref + 'apostr'] = 1
            # letter counts (first word)
            for c in Counter(word1).items():
                fn_feats['num_' + pref + 'letter_' + c[0]] = c[1]
        
        # ending - last n letters
        if len(s) > 1:
            for n in range(1,5):
                ending = self._last_n(word1, n)
                if ending:
                    fn_feats['bin_' + pref + str(n) + '_last_lettes_' + ending] = 1 
            
            for n in range(1,5):
                for gr, v in self._ngram(word1, n).items():
                    fn_feats['num_' + pref + str(n)+ '_gram_' + gr] = v
        
        return fn_feats
        

    def fit(self, x, y=None):
        
        self.feature_names = sorted(self.transform(x).columns)
        
        return self

    def transform(self, x):
        
        x = x.reset_index(drop=True)
        
        res =  pd.DataFrame.from_dict({t[0]: t[1] for t in 
                                       zip(x.index.values, x.apply(self._lastname_features))}, 
                                          orient='index').fillna(0)
        print(res.shape)
        # create dummy variables for nominal features
        return pd.get_dummies(res, columns=[c for c in res.columns if c.startswith('nom_')])[[self.feature_names]]

In [24]:
if __name__ == '__main__':
    
    ic = IranianNames()._into_words()
    

In [25]:
endn = Endings()

In [30]:
pd.get_dummies(endn.fit_transform(X_train['full_name']), prefix='end')

Unnamed: 0,end_'e,end_'i,end_a,end_aa,end_ab,end_ac,end_ad,end_ae,end_af,end_ah,...,end_yr,end_yu,end_z,end_za,end_ze,end_zh,end_zi,end_zl,end_zu,end_áh
5032,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3849,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5596,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4726,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
896,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4372,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
824,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3508,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
716,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
    # split into the training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(ic.data.drop(['is_iranian'], axis=1), 
                                                        ic.data['is_iranian'],  test_size=0.2, 
                                                        random_state=391, stratify=ic.data['is_iranian'])
    

    pipe = Pipeline([('select_fullname', Selector()),  # df with 1 column
                     ('get_endings', Endings()),
                    ('clf', SGDClassifier())])
    print('X_train shape=', X_train.shape, ' Y_train shape=',y_train.shape)
    pipe.fit(X_train, y_train)

X_train shape= (4620, 1)  Y_train shape= (4620,)
(4620, 16514)


AttributeError: 'LastName' object has no attribute 'feature_names'

In [None]:
pipe.predict(X_test)