In [9]:
import pandas as pd
import re
import enchant
import json
import statistics
import os
from unidecode import unidecode
from collections import Counter, defaultdict
from itertools import chain, permutations

In [10]:
class ChineseFinder(object):
    """
    Tag Ticketek customers with name-based ethnicity indicators.

    For every customer the full name ("name" + " " + "last_name") is matched against
    per-ethnicity name and surname dictionaries (chinese, vietnamese, korean, thai),
    a set of English hypocorisms and an English spell-check dictionary. The resulting
    0/1 indicators are collected in `ethn_df` (indexed by customer id) and can be saved
    next to the customer columns with `features_to_csv`.
    """

    def __init__(self, data_dir="/Users/ik/Data", name_dic_dir="name_dicts"):
        """
        Load the dictionaries and the customer table.

        Args:
            data_dir: directory holding "ticketek-customers/ticketek_customers.csv.gz"
                and "names/hypocorisms.json".
            name_dic_dir: directory holding the "names_<abbr>.txt" / "surnames_<abbr>.txt" files.
        """
        self.DATA_DIR = data_dir

        self.ench_dic = enchant.Dict("en_US")  # english spellcheck
        self.name_dic_dir = name_dic_dir

        # name/surname dictionary by ethnicity; it should look like this:
        #   {"chinese": {"names": {n1,n2,..}, "surnames": {s1,s2,..}}, "vietnamese": {..},..}
        self.name_dic = defaultdict(lambda: defaultdict(set))

        # ethnicity abbreviations used to name name files
        self.ethn_abbr = {"chinese": "cn", "vietnamese": "vn", "korean": "ko", "thai": "th"}

        # names and surnames for every ethnicity; loading is best effort: a missing or
        # unreadable file only produces a warning and leaves the corresponding set empty
        for ethnicity, abbr in self.ethn_abbr.items():
            try:
                self.name_dic[ethnicity]["names"] = self._load_names(abbr)
            except Exception as err:
                print("WARNING: can't find {} names... ({})".format(ethnicity, err))
            try:
                self.name_dic[ethnicity]["surnames"] = self._load_surnames(abbr)
            except Exception as err:
                print("WARNING: can't find {} surnames... ({})".format(ethnicity, err))

        # all customer columns are read as strings; missing values become empty strings
        customers_path = os.path.join(self.DATA_DIR, "ticketek-customers", "ticketek_customers.csv.gz")
        self.ticketek_customers = pd.read_csv(customers_path, dtype=str).drop("middle_name", axis=1).fillna("")
        self.ticketek_customers["full_name"] = self.ticketek_customers["name"] + " " + self.ticketek_customers["last_name"]

        # hypocorisms; we just make a set of these and don't care what names they relate to
        with open(os.path.join(self.DATA_DIR, "names", "hypocorisms.json"), "r") as hypoc_file:
            self.hypoc_dict = json.load(hypoc_file)
        self.hypocs = {hyp for hyps in self.hypoc_dict.values() for hyp in hyps}

        # a mask to exclude de longhi, te vroomer, etc that have parts like in chinese names
        # (raw string so that \s and \w reach the regex engine without escape warnings)
        self.del_mask = re.compile(r'\s+[ltodaelsn]{2,3}\s+\w{5,}(\s|$)', re.ASCII)

        # ethnicity indicator data frame (filled by get_features) and dictionary
        self.ethn_df = pd.DataFrame()
        self.ethn_dic = defaultdict(lambda: defaultdict(int))

        # customer data frame will be processed by chunks specified below
        self.CHUNK_SIZE = 100000    # in rows
        self.FULL_CHUNKS, self.ROWS_LEFT = divmod(len(self.ticketek_customers), self.CHUNK_SIZE)

        # file name where to save as csv
        self.ETHN_CSV_FILE = "tkt_customers_ethn.csv.gz"

    def _read_first_column(self, file_name):
        """Return the first column of a header-less file in `name_dic_dir` as a list."""
        return pd.read_csv(os.path.join(self.name_dic_dir, file_name), header=None)[0].tolist()

    def _load_names(self, abbr):
        """
        Read "names_<abbr>.txt" and return the set of name parts: every line is split on
        whitespace, each part is transliterated with unidecode and split again on hyphens.
        """
        return set(chain.from_iterable(unidecode(part).replace("-", " ").split()
                                       for full_name in self._read_first_column("names_" + abbr + ".txt")
                                       for part in full_name.split()))

    def _load_surnames(self, abbr):
        """Read "surnames_<abbr>.txt" and return the set of stripped surnames."""
        return {surname.strip() for surname in self._read_first_column("surnames_" + abbr + ".txt")}

    def show_customer_stats(self):
        """Print the number of customer rows and of unique emails; returns self for chaining."""
        print("total ticketek customer ids: {}".format(len(self.ticketek_customers)))
        print("total unique emails: {}".format(len(self.ticketek_customers.email.unique())))

        return self

    def _match_asis(self, full_name, set_of_names):
        """Return 1 if any whitespace-separated part of full_name is in set_of_names, else 0."""
        if isinstance(full_name, str) and full_name:
            if not set_of_names.isdisjoint(full_name.split()):
                return 1
        return 0

    def _match_permutations(self, full_name, set_of_names):
        """
        Return 1 if any ordered pair of name parts, glued together without a space,
        is in set_of_names (e.g. "meng hua" matches "menghua"), else 0.
        """
        if isinstance(full_name, str) and full_name:
            for pair in permutations(full_name.split(), 2):
                if "".join(pair) in set_of_names:
                    return 1
        return 0

    def _get_cust_features(self, full_name):
        """
        evaluate ethnicity features for a customer

        Returns a defaultdict(int) with 0/1 values under the keys "name_<abbr>" (all
        ethnicities), "surname_<abbr>" (all but thai), "hypoc" and "eng_dic".
        """
        cust_features = defaultdict(int)

        for ethnicity, abbr in self.ethn_abbr.items():
            known_names = self.name_dic[ethnicity]["names"]
            name_key = "name_" + abbr
            # name: match as is unless it's chinese (then do additional permutation match)
            cust_features[name_key] = self._match_asis(full_name, known_names)
            if (ethnicity == "chinese") and (cust_features[name_key] == 0):
                cust_features[name_key] = self._match_permutations(full_name, known_names)
            # match surnames same way for all ethnicities (skip thai as we have no thai surnames)
            if ethnicity != "thai":
                cust_features["surname_" + abbr] = self._match_asis(full_name, self.name_dic[ethnicity]["surnames"])

        name_parts = str(full_name).split()
        # check for hypocorisms (eng)
        cust_features["hypoc"] = 1 if (set(name_parts) & self.hypocs) else 0
        # look for any eng dictionary words
        cust_features["eng_dic"] = 1 if any(self.ench_dic.check(part) for part in name_parts) else 0
        # if customer name matches the del mask, nullify all features (except hypoc) as we then
        # know that customer is probably italian or spanish or something and not of one of the
        # ethnicities we are after
        if self.del_mask.search(str(full_name)):
            for key in list(cust_features):
                if key != "hypoc":
                    cust_features[key] = 0

        return cust_features

    def get_features(self):
        """
        Compute the features of every customer and store them in `ethn_df`, indexed by cust_id.

        Customers are processed in slices of CHUNK_SIZE rows; the per-chunk frames are
        concatenated once at the end. `ethn_df` is rebuilt from scratch on every call, so
        calling the method again does not duplicate rows. Returns self for chaining.
        """
        print("matching ethnicities for all Ticketek customers...")

        total_rows = len(self.ticketek_customers)
        chunk_frames = []

        for chunk_no, start in enumerate(range(0, total_rows, self.CHUNK_SIZE)):

            stop = min(start + self.CHUNK_SIZE, total_rows)
            chunk = self.ticketek_customers.iloc[start:stop, :]
            # keyed by cust_id: a customer id repeated within a chunk keeps its last row only
            chunk_feats = {cust_id: self._get_cust_features(full_name)
                           for cust_id, full_name in zip(chunk["cust_id"], chunk["full_name"])}
            chunk_frames.append(pd.DataFrame.from_dict(chunk_feats, orient="index"))
            # report every 10th chunk and always the final one, so the last figure is the total
            if (chunk_no % 10 == 0) or (stop == total_rows):
                print("ids processed so far: {}".format(stop))

        self.ethn_df = pd.concat(chunk_frames) if chunk_frames else pd.DataFrame()

        return self

    def features_to_csv(self):
        """
        Save the customers having at least one non-zero feature, together with their
        features, to ETHN_CSV_FILE (gzip-compressed csv without the index).
        """
        # remove rows with all zeroes
        self.ethn_df = self.ethn_df[(self.ethn_df != 0).any(axis=1)]
        # resulting data frame
        res_df = self.ticketek_customers.join(self.ethn_df, on="cust_id", how="inner")
        print("saving inferred ethnicities ({} rows) to {}...".format(len(res_df), self.ETHN_CSV_FILE), end="")
        res_df.to_csv(self.ETHN_CSV_FILE, index=False, compression="gzip")
        print("ok")

In [11]:
if __name__ == "__main__":
    # load the dictionaries and the customer table
    cf = ChineseFinder()
    # both calls below return the finder itself, so running them one by one
    # is the same as chaining them
    cf.show_customer_stats()
    cf.get_features()
    # write the customers that have at least one non-zero indicator
    cf.features_to_csv()

total ticketek customer ids: 20215503
total unique emails: 12562443
matching ethnicities for all Ticketek customers...
ids processed so far: 100000
ids processed so far: 1100000
ids processed so far: 2100000
ids processed so far: 3100000
ids processed so far: 4100000
ids processed so far: 5100000
ids processed so far: 6100000
ids processed so far: 7100000
ids processed so far: 8100000
ids processed so far: 9100000
ids processed so far: 10100000
ids processed so far: 11100000
ids processed so far: 12100000
ids processed so far: 13100000
ids processed so far: 14100000
ids processed so far: 15100000
ids processed so far: 16100000
ids processed so far: 17100000
ids processed so far: 18100000
ids processed so far: 19100000
ids processed so far: 20100000
ids processed so far: 20200000
saving inferred ethnicities (8257872 rows) to tkt_customers_ethn.csv.gz...ok


In [12]:
# reload the feature table from the gzip-compressed csv (same file name as
# ChineseFinder.ETHN_CSV_FILE); dtypes are inferred by pandas here
d = pd.read_csv("tkt_customers_ethn.csv.gz")

In [13]:
# preview the first rows of the reloaded table
d.head()

Unnamed: 0,cust_id,title,name,last_name,email,full_name,name_cn,surname_cn,name_vn,surname_vn,name_ko,surname_ko,name_th,hypoc,eng_dic
0,4596957,ms,dawn,grant,dawngrant@aapt.net.au,dawn grant,0,0,0,0,0,0,0,0,1
1,4596959,,chris,bonnici,chrisnsara1@dodo.com.au,chris bonnici,0,0,0,0,0,0,0,1,0
2,4596965,ms,anna,mckee,oldmickey@hotmail.com,anna mckee,0,0,0,0,0,0,1,0,0
3,4596971,mr,peter,dowd,petejayne@yahoo.com,peter dowd,0,0,0,0,0,0,0,0,1
4,4596972,mr,aman,wang,everjustaman@hotmail.com,aman wang,0,1,0,0,0,1,0,0,0


In [30]:
# indicators that must all be zero for a customer to stay in the chinese segment
other_flags = ["name_vn", "surname_vn", "name_ko", "surname_ko", "name_th", "hypoc", "eng_dic"]
# a chinese name or a chinese surname was matched ...
has_cn_match = d.name_cn.eq(1) | d.surname_cn.eq(1)
# ... and nothing points to another ethnicity, a hypocorism or an english dictionary word
no_other_match = d[other_flags].eq(0).all(axis=1)
chinese_db = d.loc[has_cn_match & no_other_match, :]

In [31]:
# preview the first rows of the chinese segment
chinese_db.head()

Unnamed: 0,cust_id,title,name,last_name,email,full_name,name_cn,surname_cn,name_vn,surname_vn,name_ko,surname_ko,name_th,hypoc,eng_dic
15,4596995,mr,kevin,feng,downloadpass999@hotmail.com,kevin feng,1,1,0,0,0,0,0,0,0
56,4597079,ms,katherine,chen,kathchen@optusnet.com.au,katherine chen,1,1,0,0,0,0,0,0,0
180,4597321,ms,menghua,liu,quedy2002@yahoo.com,menghua liu,1,1,0,0,0,0,0,0,0
265,4597500,,james & karen,liang,yaya2002@hotmail.com,james & karen liang,0,1,0,0,0,0,0,0,0
279,4597532,ms,julie,wong,wochip@bigpond.com,julie wong,0,1,0,0,0,0,0,0,0


In [32]:
# number of distinct values in the email column of the chinese segment
len(chinese_db.email.unique())

112020

In [33]:
# export the segment without the index; the file is tab-separated despite its .csv extension
# NOTE(review): latin-1 cannot encode characters outside that code page, so the export would
# raise UnicodeEncodeError on such names or emails - confirm the consumer really needs latin-1
chinese_db.to_csv("chinese_segment.csv", index=False, sep="\t", encoding="latin-1")