In [2]:
import pandas as pd
from nltk.corpus import words # list of english words
import enchant  # english dictionary
import re
import json
import statistics
import os
from unidecode import unidecode
from collections import Counter, defaultdict
from itertools import chain, permutations
import jellyfish

In [3]:
class ChineseFinder(object):
    
    def __init__(self):
        
        self.DATA_DIR = "/Users/ik/Data/"
        self.NAME_DATA_DIR = self.DATA_DIR + "names-surnames/"
        
        self.ench_dic = enchant.Dict("en_US")  # english spellcheck
        self.wordnet_dic = words
        
        self.name_dic_dir = "name_dicts"
        
        """ create a name/surname dictionary by ethnicity; it should look like this:
             {"chinese": {"names": {n1,n2,..}, "surnames": {s1,s2,..}}, "vietnamese": {..},..}
        """
        
        self.name_dic = defaultdict(lambda: defaultdict(set))  
        self.incl_ethnicities = "arabic chinese dutch hawaiian indian italian japanese khmer korean samoan serbian thai".split()
        
        # chinese names and surnames
        for ethnicity in self.incl_ethnicities:
            try:
                self.name_dic[ethnicity]["names"] = set(chain.from_iterable([unidecode(part).replace("-"," ").split() 
                                    for full_name in pd.read_csv(self.NAME_DATA_DIR + "names_" + ethnicity + ".txt", header=None)[0].tolist() 
                                                      for part in full_name.split()]))
            except:
                print("WARNING: can\'t find {} names...".format(ethnicity))
            try:
                self.name_dic[ethnicity]["surnames"] = set([surname.strip() for surname in pd.read_csv(self.NAME_DATA_DIR + "surnames_" + ethnicity + ".txt", header=None)[0].tolist()])
            except:
                print("WARNING: can\'t find {} surnames...".format(ethnicity))   
                
       
        self.ticketek_customers = pd.read_csv(self.DATA_DIR + "/ticketek-customers/ticketek_customers.csv.gz", dtype=str).drop("middle_name", axis=1).fillna("")
        self.ticketek_customers["full_name"] = self.ticketek_customers["name"] + " " + self.ticketek_customers["last_name"]
        # hypocorisms; we just make a set of these and don't care what names they relate to
        self.hypoc_dict = json.load(open(self.NAME_DATA_DIR + "hypocorisms.json", "r")) 
        self.hypocs = {hyp for full_name in self.hypoc_dict
                               for hyp in self.hypoc_dict[full_name]}
        # the US Census 2010 surnames
        self.c2010surns = (pd.read_csv(self.NAME_DATA_DIR + "us_census_2010_surnames.csv")[["name", "pctwhite", "pctblack", "pcthispanic", "pctapi"]]
                                .set_index("name")
                                   .rename(columns={"pctwhite": "white", "pctblack": "black", "pcthispanic": "hisp", "pctapi": "asian"})
                                      .to_dict(orient="index"))

        # a mask to exclude de longhi, te vroomer, etc that have parts like in chinese names
        self.del_mask = re.compile('\s+[ltodauelsn]{2,3}\s+\w{5,}(\s|$)', re.ASCII)
        # ethnicity indicator dictionary
        self.ethn_df = pd.DataFrame()
        self.ethn_dic = defaultdict(lambda: defaultdict(int))
        # customer data frame will be processed by chunks specified below
        self.CHUNK_SIZE = 100000    # in rows
        self.FULL_CHUNKS, self.ROWS_LEFT = divmod(len(self.ticketek_customers), self.CHUNK_SIZE)
        
        # file name where to save as csv
        self.ETHN_CSV_FILE = "tkt_customers_ethn.csv.gz"
    
    def show_customer_stats(self):
        
        print("total ticketek customer ids: {}".format(len(self.ticketek_customers)))
        print("total unique emails: {}".format(len(self.ticketek_customers.email.unique())))
        
        return self
        
    def _match_asis(self, full_name, set_of_names):
        
        if isinstance(full_name, str) and (len(full_name)):
            if set(full_name.split()) & set_of_names:
                return 1
        return 0
    
    def _match_permutations(self, full_name, set_of_names):
        
        if isinstance(full_name, str) and (len(full_name)):
            for p in permutations(full_name.split(),2):
                if "".join(p) in set_of_names:
                    return 1
        return 0
    
    def _min_dist(self, full_name, ethnicity):
        
        distances_all_name_parts = set()
        # any surnames that start from the same letter as the parts of the name?
        for w in full_name.split():
            candidate_surs = {s for s in self.name_dic[ethnicity]["surnames"] if s[0] == w[0]}
            if candidate_surs:
                distances = {jellyfish.damerau_levenshtein_distance(w, sur) for sur in candidate_surs}
                distances_all_name_parts.update(distances)
            else:
                continue
        
        if distances_all_name_parts:
            return min(distances_all_name_parts)
        else:
            return None

    
    def _get_cust_features(self, name, surname):
        """
        evaluate ethnicity features for a customer
        """
        cust_features = defaultdict(int)
        
        for nm, st in zip(["n", "s", "f"], [name, surname, " ".join(name, surname)]):
            if nm.strip() and len(nm.split()):    
                for ethnicity in self.incl_ethnicities:
                    # names: match as is unless it's chinese (then do additional permutation match)
                    feat_name = "_".join([nm, "name", ethnicity])
                    if len(nm.split()) == 1:
                        cust_features[feat_name] = self._match_asis(nm, self.name_dic[ethnicity]["names"])
                    else:
                        cust_features[feat_name] = self._match_permutations(nm, self.name_dic[ethnicity]["names"])
                    # surnames: same way for all ethnicities we have surnames for
                    if self.name_dic[ethnicity]["surnames"]:
                        feat_name = "_".join([nm, "surname", ethnicity])
                        cust_features[feat_name] = max([self._match_asis(w, self.name_dic[ethnicity]["surnames"]) for w in nm.split()])
                        feat_name = "_".join([nm, "dist", "surname", ethnicity])  
                        cust_features[feat_name] = self._min_dist(nm, ethnicity)
        # check for hypocorisms (eng)
        cust_features["hypoc"] = 1 if (set(str(full_name).split()) & self.hypocs) else 0
        # look for any eng dictionary words
        cust_features["eng_dic"] = 1 if sum([int(self.ench_dic.check(st)) for st in str(full_name).split()]) > 0 else 0
        cust_features["wordnet_dic"] = 1 if sum([int(st in self.wordnet_dic.words()) for st in str(full_name).split()]) > 0 else 0
        # if customer name matches the del mask, nullify all features as we then know that customer is
        # probably italian or spanish or someting and not of one of the ethnicities we are after
        if self.del_mask.search(str(full_name)):
            cust_features.update({k: 0 for k,v in cust_features.items() if k != "hypoc"})
        # find surnames
        for w in full_name.split():
            if w.upper() in self.c2010surns:
                cust_features.update(self.c2010surns[w.upper()])      
            
        return cust_features

        
    def get_features(self):
        
        print("matching ethnicities for all Ticketek customers...")

        for i in range(self.FULL_CHUNKS + 1):
            
            LAST_ONE = i*self.CHUNK_SIZE + self.CHUNK_SIZE if i < self.FULL_CHUNKS else i*self.CHUNK_SIZE + self.ROWS_LEFT
            sub_df = self.ticketek_customers.iloc[i*self.CHUNK_SIZE:LAST_ONE,:]
            sub_feats = defaultdict(lambda: defaultdict(int))
            for tp in zip(sub_df["cust_id"], sub_df["full_name"].apply(self._get_cust_features)):
                sub_feats[tp[0]] = tp[1]
            self.ethn_df = pd.concat([self.ethn_df,
                                      pd.DataFrame.from_dict(sub_feats, orient="index")])
            if (i%10 == 0) or (i == self.FULL_CHUNKS-1):
                print("ids processed so far: {}".format(LAST_ONE))
                
        return self
    
    def features_to_csv(self):
        
        # remove rows with all zeroes
        self.ethn_df = self.ethn_df[(self.ethn_df.T !=0).any()]
        # resulting data frame
        res_df = self.ticketek_customers.join(self.ethn_df, on="cust_id", how="inner")    
        print("saving inferred ethnicities ({} rows) to {}...".format(len(res_df), self.ETHN_CSV_FILE), end="")
        res_df.to_csv(self.ETHN_CSV_FILE, index=False, compression="gzip")
        print("ok")

In [4]:
if __name__ == "__main__":
    
    cf = ChineseFinder().show_customer_stats().get_features()
    cf.features_to_csv()

total ticketek customer ids: 20215503
total unique emails: 12562443
matching ethnicities for all Ticketek customers...


TypeError: _get_cust_features() missing 1 required positional argument: 'surname'

In [13]:
d = pd.read_csv("tkt_customers_ethn.csv.gz", dtype=str)

In [14]:
d.head(20)

Unnamed: 0,cust_id,title,name,last_name,email,full_name,name_cn,surname_cn,damlev_cn,name_vn,...,name_th,name_kh,surname_kh,damlev_kh,hypoc,eng_dic,white,black,hisp,asian
0,4596956,ms,carla,seychell,carlaseychell@yahoo.com,carla seychell,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,3.0,0,0,46.55,6.03,35.34,9.48
1,4596957,ms,dawn,grant,dawngrant@aapt.net.au,dawn grant,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,2.0,0,1,55.36,37.91,2.79,0.52
2,4596958,ms,carmella,rowsthorne,crowsthorne@farmersinfo.com.au,carmella rowsthorne,0.0,0.0,6.0,0.0,...,0.0,0.0,0.0,5.0,0,0,91.43,(S),5.71,(S)
3,4596959,,chris,bonnici,chrisnsara1@dodo.com.au,chris bonnici,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,2.0,1,0,94.48,0,4.55,(S)
4,4596960,,kristen,doyle,kristen_is@hotmail.com,kristen doyle,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,3.0,0,0,89.12,5.66,2.39,0.66
5,4596961,ms,victoria,salefao,dareeldeel@hotmail.com,victoria salefao,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,4.0,0,0,17.86,7.15,63.37,10.06
6,4596962,ms,riana,murray,peaceoutwabbit@hotmail.com,riana murray,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,3.0,0,0,74.92,19.29,2.6,0.57
7,4596963,mr,shawn,rajanayagam,lankanz4lyf_69@hotmail.com,shawn rajanayagam,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,2.0,0,0,75.34,11.65,5.72,2.08
8,4596964,mr,daniel,harvey,leisa@au00.com,daniel harvey,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,4.0,0,0,68.0,25,2.52,0.52
9,4596965,ms,anna,mckee,oldmickey@hotmail.com,anna mckee,0.0,0.0,3.0,0.0,...,1.0,0.0,0.0,2.0,0,0,87.42,6.81,2.35,0.63


In [35]:
d.to_csv("ticketek_ethnicities.csv.gz", encoding="latin-1", index=False, compression="gzip")

In [None]:
korean_db = d.loc[((d.name_ko == 1) | (d.surname_ko == 1)) &
                   (d.name_cn == 0) & (d.surname_cn == 0) & 
                   (d.name_vn == 0) & (d.surname_vn == 0) & 
                   (d.name_th == 0) & (d.hypoc == 0) &
                   (d.eng_dic == 0),:]

In [None]:
korean_db.head()

In [None]:
len(korean_db.email.unique())

In [None]:
vietnamese_db.to_csv("vietnamese_segment.csv", index=False, sep="\t", encoding="latin-1")

In [None]:
cnames = pd.read_csv("~/Data/names/us_census_2010_surnames.csv")

In [None]:
float(cnames.loc[cnames.name == "NICKOLSON",:].iloc[0]["pctwhite"])

In [24]:
c2010surns = 

In [25]:
c2010surns

{'SMITH': {'asian': '0.5', 'black': '23.11', 'hisp': '2.4', 'white': '70.9'},
 'JOHNSON': {'asian': '0.54',
  'black': '34.63',
  'hisp': '2.36',
  'white': '58.97'},
 'WILLIAMS': {'asian': '0.46',
  'black': '47.68',
  'hisp': '2.49',
  'white': '45.75'},
 'BROWN': {'asian': '0.51', 'black': '35.6', 'hisp': '2.52', 'white': '57.95'},
 'JONES': {'asian': '0.44',
  'black': '38.48',
  'hisp': '2.29',
  'white': '55.19'},
 'GARCIA': {'asian': '1.41',
  'black': '0.45',
  'hisp': '92.03',
  'white': '5.38'},
 'MILLER': {'asian': '0.54',
  'black': '10.76',
  'hisp': '2.17',
  'white': '84.11'},
 'DAVIS': {'asian': '0.49', 'black': '31.6', 'hisp': '2.44', 'white': '62.2'},
 'RODRIGUEZ': {'asian': '0.57',
  'black': '0.54',
  'hisp': '93.77',
  'white': '4.75'},
 'MARTINEZ': {'asian': '0.6',
  'black': '0.49',
  'hisp': '92.91',
  'white': '5.28'},
 'HERNANDEZ': {'asian': '0.6',
  'black': '0.36',
  'hisp': '94.89',
  'white': '3.79'},
 'LOPEZ': {'asian': '1.02', 'black': '0.57', 'hisp': '9