In [5]:
import pandas as pd
import json
from collections import defaultdict
from unidecode import unidecode
from string import ascii_lowercase

In [51]:
class EthnicityDetector(object):
    """Tag customer names with the ethnicities whose name lists contain them.

    The detector loads a given-name database and a surname database from JSON
    files, indexes the entries of the requested ethnicities by first letter,
    and annotates ``input_df`` with the ethnicities matched by each name.

    Typical use (every step returns ``self``)::

        ed = EthnicityDetector(df, ["indian", "filipino"])
        ed._create_ethnic_dicts()._clean_input()._find_in_name()

    Parameters
    ----------
    df : pandas.DataFrame
        Customer data; must contain a ``full_name`` column.
    ethnicity_list : list of str
        Ethnicities to look for (keys of the name/surname databases).
    data_dir : str, optional
        Root data directory; the databases are read from its ``names/``
        sub-directory. Defaults to the original hard-coded location.
    """

    # file names of the name and surname databases inside NAME_DATA_DIR
    NAMES_FILE = "names_26092017.json"
    SURNAMES_FILE = "surnames_26092017.json"

    def __init__(self, df, ethnicity_list, data_dir="/Users/ik/Data/"):

        # paths are built by concatenation, so guarantee the trailing separator
        if not data_dir.endswith("/"):
            data_dir += "/"

        self.DATA_DIR = data_dir
        self.NAME_DATA_DIR = self.DATA_DIR + "names/"
        self.ethnicity_list = ethnicity_list
        self.input_df = df
        print("new customer ids: {}".format(len(self.input_df)))

        # load name and surname databases
        self.name_dict = self._load_json(self.NAME_DATA_DIR + self.NAMES_FILE)
        self.surname_dict = self._load_json(self.NAME_DATA_DIR + self.SURNAMES_FILE)
        # name and surname dictionaries by letter for required ethnicities;
        # populated by _create_ethnic_dicts()
        self.names = defaultdict(lambda: defaultdict(set))
        self.surnames = defaultdict(lambda: defaultdict(set))

    @staticmethod
    def _load_json(path):
        """Read one JSON file, closing the file handle afterwards."""
        with open(path, "r") as fh:
            return json.load(fh)

    @staticmethod
    def _bucket_by_first_letter(words):
        """Group ``words`` into ``{letter: set_of_words}`` for every a-z letter.

        Done in a single pass. Empty strings and words whose first character
        is not a lowercase ASCII letter are left out.
        """
        buckets = {letter: set() for letter in ascii_lowercase}
        for word in words:
            if word and word[0] in buckets:
                buckets[word[0]].add(word)
        return buckets

    def _create_ethnic_dicts(self):
        """Build the per-letter name and surname indexes.

        For each requested ethnicity ``self.names[ethnicity]`` and
        ``self.surnames[ethnicity]`` become ``{letter: set}`` dictionaries, or
        an empty dictionary when the ethnicity is absent from the database.
        Name entries are dicts with a ``"name"`` key; surname entries are
        plain strings.

        Returns
        -------
        EthnicityDetector
            ``self``, to allow chaining.
        """
        for ethnicity in self.ethnicity_list:

            if ethnicity in self.name_dict:
                self.names[ethnicity] = self._bucket_by_first_letter(
                    entry["name"] for entry in self.name_dict[ethnicity])
            else:
                self.names[ethnicity] = {}

            if ethnicity in self.surname_dict:
                self.surnames[ethnicity] = self._bucket_by_first_letter(
                    self.surname_dict[ethnicity])
            else:
                self.surnames[ethnicity] = {}

        return self

    def _clean_input(self):
        """Normalise ``full_name`` and drop rows that are not purely alphabetic.

        Hyphens, apostrophes and underscores become spaces and runs of
        whitespace collapse to one space (leading/trailing whitespace is
        removed by the split/join). Rows whose name is missing, empty, or
        contains anything other than letters and single spaces are removed.
        ``self.input_df`` is rebound to a new frame that keeps the original
        index labels; the frame passed to the constructor is not modified.

        Returns
        -------
        EthnicityDetector
            ``self``, to allow chaining.
        """
        # regex=True is required: since pandas 2.0 the pattern would otherwise
        # be treated as a literal string and nothing would be replaced
        cleaned = (self.input_df["full_name"]
                   .str.replace(r"[-'_]", " ", regex=True)
                   .str.split()
                   .str.join(" "))

        # spaces separate name parts, so ignore them when testing for letters;
        # astype(bool) keeps the mask boolean even for an empty frame
        letters_only = cleaned.map(
            lambda name: isinstance(name, str) and name.replace(" ", "").isalpha()
        ).astype(bool)

        self.input_df = self.input_df.assign(full_name=cleaned)[letters_only].copy()

        return self

    def _find_in_name(self):
        """Add a ``name_ethn`` column listing the ethnicities matched by a name.

        A name matches an ethnicity when any of its whitespace-separated parts
        appears in that ethnicity's given-name index. Matches are joined with
        ``"|"`` in ``ethnicity_list`` order; no match, or a non-string name,
        gives an empty string.

        Returns
        -------
        EthnicityDetector
            ``self``, to allow chaining.
        """
        def matching_ethnicities(full_name):
            if not isinstance(full_name, str):
                return ""
            parts = full_name.split()
            # .get() tolerates ethnicities without an index and first
            # characters outside a-z instead of raising KeyError
            return "|".join(
                ethnicity for ethnicity in self.ethnicity_list
                if any(part in self.names[ethnicity].get(part[0], ())
                       for part in parts))

        self.input_df["name_ethn"] = self.input_df["full_name"].apply(matching_ethnicities)

        return self
        
        
        

In [52]:
# Load the sample of new customer names and wrap it in a detector that is
# restricted to the two ethnicities of interest.
SAMPLE_NAMES_CSV = "/Users/ik/Data/temp/sample_new_customer_names.csv"
TARGET_ETHNICITIES = ["indian", "filipino"]

df = pd.read_csv(SAMPLE_NAMES_CSV)
ed = EthnicityDetector(df, TARGET_ETHNICITIES)

new customer ids: 200997


In [53]:
# Build the per-letter name/surname indexes, then preview the input frame.
# _create_ethnic_dicts() returns the detector itself, so the preview can be
# chained onto the call.
ed._create_ethnic_dicts().input_df.head()

Unnamed: 0,cust_id,full_name
0,12374,steve balzary
1,13763,annette matheson
2,52404,andrew g wiseman
3,53306,bruce fenwick
4,54140,lionel werbeloff


In [54]:
# Rows whose full_name begins with a space. The vectorised .str accessor with
# na=False treats missing names as "no match" instead of raising
# AttributeError the way a per-row lambda does on NaN values.
ed.input_df[ed.input_df["full_name"].str.startswith(" ", na=False)]

Unnamed: 0,cust_id,full_name
1330,1210421,meliisa anderson
9196,3925488,test
9197,3925653,test
13322,3922423,hyatt hotel canberra
23022,7025640,artist comps
24054,7464408,mystry shopper
24133,7671178,southern cross austereo
27921,8795557,player services
28792,9034572,st phillips christian college
30006,9445021,heath


In [49]:
# Show which first-letter buckets exist in the "filipino" given-name index.
filipino_name_buckets = ed.names['filipino']
filipino_name_buckets.keys()

dict_keys(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'])