In [33]:
import pandas as pd
import re
import enchant
import json
import statistics
import os
from unidecode import unidecode
from collections import Counter, defaultdict
from itertools import chain, permutations

In [34]:
class ChineseFinder(object):
    """
    Name-based ethnicity flagging for Ticketek customers.

    On construction the object loads
      * per-ethnicity name / surname sets from text files in ``name_dic_dir``
        (``names_<abbr>.txt`` and ``surnames_<abbr>.txt``),
      * the customer table (``middle_name`` dropped, ``full_name`` added),
      * a set of English hypocorisms (nicknames) from a JSON file.

    All constructor arguments are optional; their defaults are the paths and
    chunk size the notebook originally hard-coded.
    """

    def __init__(self, name_dic_dir="name_dicts",
                 customers_path="../data/ticketek_customers.csv.gz",
                 hypocorisms_path="../data/hypocorisms.json",
                 chunk_size=100000):
        """
        Parameters
        ----------
        name_dic_dir : str
            Directory holding the ``names_*.txt`` / ``surnames_*.txt`` files.
        customers_path : str
            CSV (optionally gzipped) with the customer table.
        hypocorisms_path : str
            JSON file mapping a full name to an iterable of its hypocorisms.
        chunk_size : int
            Number of customer rows processed per chunk.
        """
        self.ench_dic = enchant.Dict("en_US")  # english spellcheck
        self.name_dic_dir = name_dic_dir

        # name/surname dictionary by ethnicity; it should look like this:
        #   {"chinese": {"names": {n1,n2,..}, "surnames": {s1,s2,..}}, "vietnamese": {..},..}
        self.name_dic = defaultdict(lambda: defaultdict(set))

        # ethnicity abbreviations used to name name files
        self.ethn_abbr = {"chinese": "cn", "vietnamese": "vn", "korean": "ko", "thai": "th"}

        # names and surnames for every ethnicity; a missing or unreadable file
        # only produces a warning and leaves the corresponding set empty
        for ethnicity in self.ethn_abbr:
            try:
                # transliterate to ASCII and break hyphenated names into parts
                self.name_dic[ethnicity]["names"] = set(chain.from_iterable(
                    unidecode(part).replace("-", " ").split()
                    for full_name in self._read_name_column("names", ethnicity)
                    for part in full_name.split()))
            except Exception as err:
                print("WARNING: can\'t find {} names... ({})".format(ethnicity, err))
            try:
                self.name_dic[ethnicity]["surnames"] = {
                    surname.strip()
                    for surname in self._read_name_column("surnames", ethnicity)}
            except Exception as err:
                print("WARNING: can\'t find {} surnames... ({})".format(ethnicity, err))

        self.ticketek_customers = pd.read_csv(customers_path, dtype=str).drop("middle_name", axis=1)
        # NaN in either part makes full_name NaN as well
        self.ticketek_customers["full_name"] = self.ticketek_customers["name"] + " " + self.ticketek_customers["last_name"]

        # hypocorisms; we just make a set of these and don't care what names they relate to
        with open(hypocorisms_path, "r") as hypoc_file:
            self.hypoc_dict = json.load(hypoc_file)
        self.hypocs = {hyp for variants in self.hypoc_dict.values() for hyp in variants}

        # a mask to exclude de longhi, te vroomer, etc that have parts like in chinese names
        self.del_mask = re.compile(r'\s+[ltodaelsn]{2,3}\s+\w{5,}(\s|$)', re.ASCII)
        # ethnicity indicator dictionary
        self.ethn_dic = defaultdict(lambda: defaultdict(int))
        # customer ids collected by find_chinese_in_dataframe
        self.chinese_cust_ids = set()
        # customer data frame will be processed by chunks specified below
        self.CHUNK_SIZE = chunk_size    # in rows
        self.FULL_CHUNKS, self.ROWS_LEFT = divmod(len(self.ticketek_customers), self.CHUNK_SIZE)

    def _read_name_column(self, kind, ethnicity):
        """
        Return the first column of ``<name_dic_dir>/<kind>_<abbr>.txt`` as a list of str.

        ``na_filter=False`` and ``dtype=str`` keep entries such as "nan", "NA"
        or "null" as text instead of letting pandas turn them into float NaN.
        """
        file_name = "{}_{}.txt".format(kind, self.ethn_abbr[ethnicity])
        path = os.path.join(self.name_dic_dir, file_name)
        return pd.read_csv(path, header=None, dtype=str, na_filter=False)[0].tolist()

    def show_customer_stats(self):
        """Print the number of customer rows and of distinct email values; return self."""
        print("total ticketek customer ids: {}".format(len(self.ticketek_customers)))
        print("total unique emails: {}".format(len(self.ticketek_customers.email.unique())))

        return self

    def _match_asis(self, full_name, set_of_names):
        """Return 1 if any whitespace-separated token of full_name is in set_of_names, else 0."""
        if set(full_name.split()) & set_of_names:
            return 1
        return 0

    def _match_permutations(self, full_name, set_of_names):
        """
        Return 1 if two distinct tokens of full_name, concatenated in either
        order, form an entry of set_of_names (e.g. "xiao ming" -> "xiaoming"); else 0.
        """
        for pair in permutations(full_name.split(), 2):
            if "".join(pair) in set_of_names:
                return 1
        return 0

    def _get_cust_features(self, full_name):
        """
        Evaluate ethnicity features for a customer.

        Parameters
        ----------
        full_name : str
            Customer's full name. Any non-string value (e.g. NaN from the
            customer table) is treated as an empty name.

        Returns
        -------
        defaultdict(int)
            0/1 indicators keyed "name_<abbr>" and "surname_<abbr>" (no
            surname key for thai), plus "hypoc" and "eng_dic".
        """
        cust_features = defaultdict(int)

        if not isinstance(full_name, str):
            full_name = ""
        tokens = full_name.split()

        for ethnicity, abbr in self.ethn_abbr.items():
            names = self.name_dic[ethnicity]["names"]
            name_key = "name_" + abbr
            # name: match as is unless it's chinese (then do additional permutation match)
            cust_features[name_key] = self._match_asis(full_name, names)
            if (ethnicity == "chinese") and (cust_features[name_key] == 0):
                cust_features[name_key] = self._match_permutations(full_name, names)
            # match surnames same way for all ethnicities (skip thai as we have no thai surnames)
            if ethnicity != "thai":
                cust_features["surname_" + abbr] = self._match_asis(
                    full_name, self.name_dic[ethnicity]["surnames"])

        # check for hypocorisms (eng)
        cust_features["hypoc"] = 1 if (set(tokens) & self.hypocs) else 0
        # look for any eng dictionary words
        cust_features["eng_dic"] = 1 if any(self.ench_dic.check(tok) for tok in tokens) else 0

        # if customer name matches the del mask, nullify all features except "hypoc" as we
        # then know that customer is probably italian or spanish or something and not of
        # one of the ethnicities we are after
        if self.del_mask.search(full_name):
            for key in cust_features:
                if key != "hypoc":
                    cust_features[key] = 0

        return cust_features

    def find_chinese_in_dataframe(self):
        """
        Scan the customer table chunk by chunk, collect the ids of likely
        Chinese customers in self.chinese_cust_ids and store the matching rows
        in self.chinese_df. Returns self.

        NOTE(review): relies on self.is_likely_chinese, self.chinese_letter_stats,
        self.non_chinese_mask and self._is_vietnamese, none of which is defined
        in this class as shown — confirm where they are supposed to come from.
        """
        print("searching for chinese among all Ticketek customers...")

        # surname endings to look for; str.endswith accepts a tuple of suffixes
        surname_endings = tuple(self.chinese_letter_stats["last2"])

        def surname_looks_chinese(last_name):
            last_name = str(last_name)
            return (last_name.endswith(surname_endings)
                    and (not self.non_chinese_mask.search(last_name))
                    and (not self._is_vietnamese(last_name)))

        chunk_starts = range(0, len(self.ticketek_customers), self.CHUNK_SIZE)
        last_chunk = len(chunk_starts) - 1

        for i, start in enumerate(chunk_starts):
            # iloc clips a slice that runs past the end, so the short tail chunk needs no special case
            sub_df = self.ticketek_customers.iloc[start:start + self.CHUNK_SIZE, :]

            selected = (sub_df["full_name"].apply(self.is_likely_chinese)
                        & sub_df["last_name"].apply(surname_looks_chinese))
            self.chinese_cust_ids.update(sub_df.loc[selected, "cust_id"].tolist())

            # report every 20th chunk and always after the final one
            if (i % 20 == 0) or (i == last_chunk):
                print("ids found so far: {}".format(len(self.chinese_cust_ids)))

        self.chinese_df = self.ticketek_customers.loc[self.ticketek_customers.cust_id.isin(self.chinese_cust_ids), :]

        return self
        

In [35]:
# Build the finder (this loads the name dictionaries, the customer table and the
# hypocorism list) and print the customer / unique-email counts.
cf = ChineseFinder().show_customer_stats()

total ticketek customer ids: 20215503
total unique emails: 12562443


In [36]:
# Inspect the name / surname sets loaded per ethnicity.
# NOTE(review): this displays every set in full; consider showing only sizes or a small sample.
cf.name_dic

defaultdict(<function __main__.ChineseFinder.__init__.<locals>.<lambda>>,
            {'chinese': defaultdict(set,
                         {'names': {'ah',
                           'ai',
                           'an',
                           'anguo',
                           'bai',
                           'bao',
                           'baozhai',
                           'bingwen',
                           'biyu',
                           'bo',
                           'bohai',
                           'bojing',
                           'bolin',
                           'boqin',
                           'chang',
                           'changchang',
                           'changpu',
                           'changying',
                           'chanming',
                           'chao',
                           'chaoxiang',
                           'chen',
                           'cheng',
                           'chenglei',
     

In [29]:
# NOTE(review): this cell's execution count (In [29]) is lower than that of the class
# definition (In [34]), and the class as shown only initialises ethn_dic and never
# writes to it. The output below presumably comes from an earlier version of the
# code; re-check after Restart Kernel -> Run All.
cf.ethn_dic

defaultdict(<function __main__.ChineseFinder.__init__.<locals>.<lambda>>,
            {21: defaultdict(int,
                         {'name_cn': 1, 'name_ko': 1, 'surname_cn': 1})})

In [9]:
# NOTE(review): get_name_stats() is not defined on the ChineseFinder class shown in this
# notebook, and this cell's execution count (In [9]) suggests it was run against an
# earlier version of the class. On a fresh kernel this call would presumably raise
# AttributeError; confirm which method is meant here.
if __name__ == "__main__":
    
    cf = ChineseFinder().get_name_stats()

In [10]:
# Scan the whole customer table chunk by chunk; progress is printed as
# "ids found so far: N". The call returns cf itself, hence the object repr in the output.
cf.find_chinese_in_dataframe()

searching for chinese among all Ticketek customers...
ids found so far: 267
ids found so far: 6741
ids found so far: 10973
ids found so far: 14805
ids found so far: 20740
ids found so far: 27410
ids found so far: 35892
ids found so far: 46693
ids found so far: 57408
ids found so far: 74342
ids found so far: 91088
ids found so far: 91842


<__main__.ChineseFinder at 0x1186e61d0>

In [11]:
# Rows of the customer table whose cust_id was selected by find_chinese_in_dataframe().
# NOTE(review): the rendered output contains real-looking names and email addresses (PII);
# clear or redact it before sharing or committing the notebook.
cf.chinese_df

Unnamed: 0,cust_id,title,name,last_name,email,full_name
39,4596995,mr,kevin,feng,downloadpass999@hotmail.com,kevin feng
123,4597079,ms,katherine,chen,kathchen@optusnet.com.au,katherine chen
365,4597321,ms,menghua,liu,quedy2002@yahoo.com,menghua liu
577,4597533,,narin,na ranong,narin@optushome.com.au,narin na ranong
859,4597815,mrs,linda,chen,lindachen@swiftdsl.com.au,linda chen
2173,4599129,ms,jennifer,lin,l_maddie68@yahoo.com.au,jennifer lin
2593,4599549,ms,katherine,liu,kliu1008@bigpond.net.au,katherine liu
2639,4599595,ms,emily,lim,anggrani@hotmail.com,emily lim
2670,4599626,mr,michael,yi,changjaeyi@hotmail.com,michael yi
2876,4599832,,li,shing,klshing@yahoo.com,li shing
