In [82]:
import json
import pandas as pd
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [78]:
class IranianClassifier:
    """
    Holds a labelled name dataset and augments it with single-word examples.

    The dataset is read from a CSV file; the code below relies on it having
    a `full_name` column and an `is_iranian` column.
    """

    # original hard-coded location, kept as the default for backward compatibility
    DEFAULT_DATA_PATH = '/Users/ik/Data/names/training-iranian.csv'

    def __init__(self, data_path=DEFAULT_DATA_PATH):
        """
        load the labelled names into `self.data`

        data_path: CSV file to read (anything `pd.read_csv` accepts);
                   defaults to the original hard-coded location
        """
        self.data = pd.read_csv(data_path)

    def _into_words(self, random_state=None):
        """
        split full names into words and add these to the dataset with the corresponding labels

        For rows where `is_iranian == 1` only the FIRST word of the name is collected;
        for every other row all words are collected. Words present in both collections
        are ambiguous and are not added under either label. The new single-word rows
        are appended to `self.data` and the whole frame is shuffled.

        random_state: forwarded to `DataFrame.sample`; pass an int to get a reproducible
                      row order. The default (None) keeps the original unseeded shuffle.

        Returns `self` so the call can be chained.
        """
        iranian_words = set()
        other_words = set()

        # NOTE(review): only the first word of an Iranian name is used while every word
        # of the other names is used -- presumably deliberate; confirm.
        for full_name, label in zip(self.data['full_name'], self.data['is_iranian']):
            words = full_name.split()
            if not words:
                # blank / whitespace-only name: nothing to collect
                # (previously this raised IndexError for Iranian rows)
                continue
            if label == 1:
                iranian_words.add(words[0])
            else:
                other_words.update(words)

        # sorted so the pre-shuffle order does not depend on set iteration order,
        # which makes `random_state` sufficient for a reproducible result
        only_iranian = sorted(iranian_words - other_words)
        only_other = sorted(other_words - iranian_words)

        # iranian name words not found in the non-iranian name words (and vice versa)
        self.data = pd.concat([
            self.data,
            pd.DataFrame({'full_name': only_iranian, 'is_iranian': 1}),
            pd.DataFrame({'full_name': only_other, 'is_iranian': 0}),
        ]).sample(frac=1., random_state=random_state)

        return self

In [87]:
if __name__ == '__main__':

    # load the labelled names and append the single-word examples
    ic = IranianClassifier()
    ic._into_words()

    # word n-gram counter (unigrams to trigrams); it is only constructed here, not fitted
    cv = CountVectorizer(
        decode_error='replace',
        strip_accents='ascii',
        ngram_range=(1, 3),
    )

    # split into the training and testing sets, keeping the label balance in both
    labels = ic.data['is_iranian']
    features = ic.data.drop(['is_iranian'], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=0.2,
        random_state=391,
        stratify=labels,
    )

In [86]:
# preview a few rows of the dataset held by the classifier after `_into_words()` has run
ic.data.head()

Unnamed: 0,full_name,is_iranian
736,fred,0
633,dean wheldon,0
1591,hoh,0
1981,paton,0
362,shahin abbasian,1


In [76]:
# number of n-gram features in the vectorizer's vocabulary
# `get_feature_names` was removed in scikit-learn 1.2; use `get_feature_names_out` when it exists
# NOTE(review): `cv` has to be fitted before this cell runs -- no fit call is visible in this part of the notebook; confirm
len(cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out') else cv.get_feature_names())

7562

In [88]:
# show the training-split features (the frame with `is_iranian` dropped before splitting)
X_train

Unnamed: 0,full_name
2624,zhang zhenqing
30,rumpf
1795,duong
1496,shermine shahrivar
2764,penina uili
1382,jialu
28,lenore
1431,hamed bahiraei
2168,qian zhuo
1531,narayan
