In [96]:
import pandas as pd
from nltk.corpus import words # list of english words
import enchant  # english dictionary
import re
import json
import os
from unidecode import unidecode
from collections import Counter, defaultdict
from itertools import chain, permutations
import jellyfish

In [132]:
class ChineseFinder(object):
    
    def __init__(self):
        
        self.DATA_DIR = "/Users/ik/Data/"
        self.NAME_DATA_DIR = self.DATA_DIR + "names-surnames/"
        
        self.ench_dic = enchant.Dict("en_US")  # english spellcheck
        self.wordnet_dic = words.words()
        
        """ create a name/surname dictionary by ethnicity; it should look like this:
             {"chinese": {"names": {n1,n2,..}, "surnames": {s1,s2,..}}, "vietnamese": {..},..}
        """
        
        self.name_dic = defaultdict(lambda: defaultdict(set))  
        self.incl_ethnicities = """arabic chinese dutch hawaiian indian 
                                    italian japanese khmer korean samoan 
                                        serbian thai turkish greek""".split()
        
        print("total ethnicities available: {}".format(len(self.incl_ethnicities)))
    
        # chinese names and surnames
        for ethnicity in self.incl_ethnicities:
            try:
                # we collect names or more spacifically, name parts if there are more than one
                self.name_dic[ethnicity]["names"] = set(chain.from_iterable([unidecode(part).replace("-"," ").split() 
                                    for name in pd.read_csv(self.NAME_DATA_DIR + "names_" + ethnicity + ".txt")["name"].tolist() 
                                                      for part in name.split()]))
            except:
                print("WARNING: can\'t find {} names...".format(ethnicity))
            try:
                self.name_dic[ethnicity]["surnames"] = set([surname.strip() for surname in pd.read_csv(self.NAME_DATA_DIR + "surnames_" + ethnicity + ".txt", header=None)[0].tolist()])
            except:
                print("WARNING: can\'t find {} surnames...".format(ethnicity))   
                
       
        self.ticketek_customers = pd.read_csv(self.DATA_DIR + "/ticketek-customers/ticketek_customers.csv.gz", dtype=str).drop("middle_name", axis=1).fillna("").sample(n=3600)
        self.ticketek_customers[["name", "last_name"]] = self.ticketek_customers[["name", "last_name"]].applymap(unidecode).applymap(lambda x: x.replace("-", " "))
        # hypocorisms; we just make a set of these and don't care what names they relate to
        # self.hypoc is simply a set like {'abbie', 'abby', 'abe',...}
        self.hypocs = {s for s in chain.from_iterable(json.load(open(self.NAME_DATA_DIR + "hypocorisms.json", "r")).values())}
        # the US Census 2010 surnames
        # note that created dictionary has entries like 
        # 'SMITH': {'asian': '0.5', 'black': '23.11', 'hisp': '2.4', 'white': '70.9'},
        self.c2010surns = {surname.lower(): {race: (float(pct_str) if (("." in pct_str) and pct_str.replace(".","").isdigit()) else 0) for race, pct_str in race_dict.items()} 
                           for surname, race_dict in 
                           pd.read_csv(self.NAME_DATA_DIR + "us_census_2010_surnames.csv", dtype=str)[["name", "pctwhite", "pctblack", "pcthispanic", "pctapi"]]
                           .dropna(how='any')     
                           .set_index("name")
                           .rename(columns={"pctwhite": "white", "pctblack": "black", "pcthispanic": "hisp", "pctapi": "asian"})
                           .to_dict(orient="index").items()
                          }
        print("surnames in the US census data: {}".format(len(self.c2010surns)))
        # a mask to exclude de longhi, te vroomer, etc that have parts like in chinese names
        self.del_mask = re.compile('\s+[ltodauelsn]{2,3}\s+\w{5,}(\s|$)', re.ASCII)
        # ethnicity indicator dictionary
        self.ethn_df = pd.DataFrame()
        self.ethn_dic = defaultdict(lambda: defaultdict(int))
        # customer data frame will be processed by chunks specified below
        self.CHUNK_SIZE = 1000    # in rows
        self.FULL_CHUNKS, self.ROWS_LEFT = divmod(len(self.ticketek_customers), self.CHUNK_SIZE)
        
        # file name where to save as csv
        self.ETHN_CSV_FILE = "tkt_customers_ethn.csv.gz"
    
    def show_customer_stats(self):
        
        print("total ticketek customer ids: {}".format(len(self.ticketek_customers)))
        print("total unique emails: {}".format(len(self.ticketek_customers.email.unique())))
        
        return self
        
    def _match_asis(self, name, set_of_names):
        
        if isinstance(name, str) and (len(name) > 1):
            if set(name.split()) & set_of_names:
                return 1
            else:
                return 0
        return None
    
    def _min_dist(self, name, ethnicity):
        
        distances_all_name_parts = set()
        
        if isinstance(name, str) and (len(name) > 1):
            
            # any surnames that start from the same letter as the parts of the name?
            for w in name.split():
                if isinstance(w, str):
                    candidate_surs = {unidecode(s) for s in self.name_dic[ethnicity]["surnames"] if s[0] == w[0]}
                    if candidate_surs:
                        distances = {jellyfish.damerau_levenshtein_distance(w, sur) for sur in candidate_surs}
                        distances_all_name_parts.update(distances)
                    else:
                        continue
                else:
                    continue
                
        if distances_all_name_parts:
            return min(distances_all_name_parts)
        else:
            return None
    
    
    def _get_cust_features(self, name_plus_surname):
        """
        evaluate ethnicity features for a customer
        """
        cust_features = defaultdict(int)
        
        name, surname = name_plus_surname

        print(name, surname)
        
        for nm in zip(["n", "s", "f"], [name, surname, " ".join([name, surname])]):
            print(nm[1])
            if nm[1].strip():    
                for ethnicity in self.incl_ethnicities:
                    # names: match as is unless it's chinese (then do additional permutation match)
                    feat_name = "_".join([nm[0], "name", ethnicity])
                    print(feat_name)
                    cust_features[feat_name] = self._match_asis(nm[1], self.name_dic[ethnicity]["names"])
                    if self.name_dic[ethnicity]["surnames"]:
                        feat_name = "_".join([nm[0], "surname", ethnicity])
                        cust_features[feat_name] = max([self._match_asis(w, self.name_dic[ethnicity]["surnames"]) for w in nm[1].split() if w.strip()])
                        feat_name = "_".join([nm[0], "dist", "surname", ethnicity])  
                        cust_features[feat_name] = self._min_dist(nm[1], ethnicity)
        # check for hypocorisms (eng)
        cust_features["hypoc"] = 1 if (set([w for v in [name, surname] for w in v.split()]) & self.hypocs) else 0
        # look for any eng dictionary words
        cust_features["eng_dic"] = 1 if sum([int(self.ench_dic.check(st)) for st in set([w for v in [name, surname] for w in v.split()])]) > 0 else 0
        cust_features["wordnet_dic"] = 1 if sum([int(st in self.wordnet_dic) for st in set([w for v in [name, surname] for w in v.split()])]) > 0 else 0
        # if customer name matches the del mask, nullify all features as we then know that customer is
        # probably italian or spanish or someting and not of one of the ethnicities we are after
        if self.del_mask.search(name + ' ' + surname):
            cust_features.update({k: 0 for k,v in cust_features.items() if k != "hypoc"})
        # find surnames
        for w in set([w for v in [name, surname] for w in v.split()]):
            if w in self.c2010surns:
                cust_features.update(self.c2010surns[w])      
            
        return cust_features

        
    def get_features(self):
        
        print("matching ethnicities for all Ticketek customers...")

        for i in range(self.FULL_CHUNKS + 1):
            
            LAST_ONE = i*self.CHUNK_SIZE + self.CHUNK_SIZE if i < self.FULL_CHUNKS else i*self.CHUNK_SIZE + self.ROWS_LEFT
            sub_df = self.ticketek_customers.iloc[i*self.CHUNK_SIZE:LAST_ONE,:]
            sub_feats = defaultdict(lambda: defaultdict(int))
            for tp in zip(sub_df["cust_id"], self._get_cust_features([[a, b] for a in sub_df["name"].tolist() 
                                                                              for b in sub_df["last_name"].tolist()])):
                sub_feats[tp[0]] = tp[1]
                print(tp)
            self.ethn_df = pd.concat([self.ethn_df,
                                      pd.DataFrame.from_dict(sub_feats, orient="index")])
            if (i%10 == 0) or (i == self.FULL_CHUNKS-1):
                print("ids processed so far: {}".format(LAST_ONE))
                
        return self
    
    def features_to_csv(self):
        
        # remove rows with all zeroes
        self.ethn_df = self.ethn_df[(self.ethn_df.T !=0).any()]
        # resulting data frame
        res_df = self.ticketek_customers.join(self.ethn_df, on="cust_id", how="inner")    
        print("saving inferred ethnicities ({} rows) to {}...".format(len(res_df), self.ETHN_CSV_FILE), end="")
        res_df.to_csv(self.ETHN_CSV_FILE, index=False, compression="gzip")
        print("ok")

In [133]:
if __name__ == "__main__":
    
    cf = ChineseFinder().show_customer_stats().get_features()
    cf.features_to_csv()

total ethnicities available: 14
surnames in the US census data: 162253
total ticketek customer ids: 3600
total unique emails: 2530
matching ethnicities for all Ticketek customers...


ValueError: too many values to unpack (expected 2)

In [118]:
d = pd.read_csv("tkt_customers_ethn.csv.gz", dtype=str)

In [119]:
d

Unnamed: 0,cust_id,title,name,last_name,email,n_name_arabic,n_surname_arabic,n_dist_surname_arabic,n_name_chinese,n_surname_chinese,...,f_name_thai,f_name_turkish,f_name_greek,hypoc,eng_dic,wordnet_dic,white,black,hisp,asian
0,18697142,,luke,cahill,shamalu3@bigpond.com,,,,,,...,,,,0,0,0,94.54,0.0,2.15,0.64
1,21406048,mrs,kerryn,costello,kerryncostello@me.com,,,,,,...,,,,0,0,0,90.85,1.34,5.46,0.69
2,2043997,mr,r,kennedy,,,,,,,...,,,,0,0,0,80.82,13.63,2.45,0.62
3,5323969,mr,brian,murray,,,,,,,...,,,,0,0,0,74.92,19.29,2.6,0.57
4,1364476,ms,tara,pollard,,,,,,,...,,,,0,0,0,62.77,31.39,2.33,0.55
5,18281767,,leslie,rosen,lrosen@rosens.com.au,,,,,,...,,,,0,0,0,94.48,0.63,2.7,0.83
6,1978626,ms,lesley,versay,,,,,,,...,,,,0,0,0,78.53,14.8,2.56,0.62
7,21225381,ms,amy,melville,amy_lee82@live.com.au,,,,,,...,,,,0,0,0,82.39,11.32,2.55,1.26
8,8276735,ms,elaine,charker,burrabel@bigpond.com,,,,,,...,,,,0,0,0,,,,
9,16220675,miss,kristy,marr,kristy.marr@hotmail.com,,,,,,...,,,,0,0,0,88.34,4.26,2.82,1.91


In [90]:
'rowe' in cf.c2010surns

True

In [11]:
d.head(20)

Unnamed: 0,cust_id,title,name,last_name,email,n_name_arabic,n_surname_arabic,n_name_chinese,n_surname_chinese,n_name_dutch,...,f_name_serbian,f_name_thai,f_name_turkish,hypoc,eng_dic,wordnet_dic,white,black,hisp,asian
0,16321912,miss,liza,noble,lizenobes@gmail.com,0,0,0,0,0,...,0,0,0,0,0,0,,,,
1,13297074,mr,glenn,forster,,0,0,0,0,0,...,0,0,0,0,0,0,,,,
2,4752231,dr,christopher,green,,0,0,0,0,0,...,0,0,0,0,0,0,,,,
3,21386855,,david,viljoen,foo@baz.com,0,0,0,0,0,...,0,0,0,0,0,0,,,,
4,18532128,,john,morruzi,jpmoruzzi04@aol.com,0,0,0,0,0,...,0,0,0,0,0,0,,,,
5,11881120,mr,ben,noble,ben.noble84@btinternet.com,0,0,0,0,0,...,0,0,0,0,0,0,,,,
6,23766648,mr,casey,walsh,casey.f.walsh@gmail.com,0,0,0,0,0,...,0,0,0,0,0,0,,,,
7,23610336,mrs,rene,pilcher,wooramel3@gmail.com,0,0,0,0,0,...,0,0,0,0,0,0,,,,
8,11645312,,alan,parkin,,0,0,0,0,0,...,0,0,0,0,0,0,,,,
9,22413489,mr,january,meauta,gtofa1@gmail.com,0,0,0,0,0,...,0,0,0,0,0,0,,,,


In [None]:
d.to_csv("ticketek_ethnicities.csv.gz", encoding="latin-1", index=False, compression="gzip")

In [None]:
korean_db = d.loc[((d.name_ko == 1) | (d.surname_ko == 1)) &
                   (d.name_cn == 0) & (d.surname_cn == 0) & 
                   (d.name_vn == 0) & (d.surname_vn == 0) & 
                   (d.name_th == 0) & (d.hypoc == 0) &
                   (d.eng_dic == 0),:]

In [None]:
korean_db.head()

In [None]:
len(korean_db.email.unique())

In [None]:
vietnamese_db.to_csv("vietnamese_segment.csv", index=False, sep="\t", encoding="latin-1")

In [None]:
cnames = pd.read_csv("~/Data/names/us_census_2010_surnames.csv")

In [None]:
float(cnames.loc[cnames.name == "NICKOLSON",:].iloc[0]["pctwhite"])

In [None]:
c2010surns = 

In [None]:
c2010surns

In [111]:
{s for s in chain.from_iterable(json.load(open(cf.NAME_DATA_DIR + "hypocorisms.json", "r")).values())}

{'abbie',
 'abby',
 'abe',
 'abi',
 'aggie',
 'al',
 'alec',
 'alex',
 'alexa',
 'alexis',
 'alf',
 'alfie',
 'ali',
 'allie',
 'ally',
 'aly',
 'amy',
 'andie',
 'andra',
 'andy',
 'annie',
 'ant',
 'anto',
 'archie',
 'archy',
 'art',
 'artie',
 'aud',
 'babs',
 'barb',
 'barney',
 'bart',
 'bastian',
 'becca',
 'beccy',
 'becky',
 'bee',
 'bell',
 'bella',
 'ben',
 'benji',
 'bennie',
 'benny',
 'bernd',
 'bernie',
 'bert',
 'berta',
 'bertie',
 'bess',
 'bessie',
 'betsy',
 'betty',
 'bid',
 'billy',
 'bob',
 'bobbie',
 'brad',
 'carol',
 'carrie',
 'cary',
 'chaz',
 'chic',
 'chris',
 'chuck',
 'cilla',
 'cilly',
 'cindy',
 'cliff',
 'clint',
 'costa',
 'criffer',
 'daisy',
 'dalt',
 'dan',
 'danny',
 'dave',
 'davy',
 'deb',
 'debbie',
 'denny',
 'des',
 'dick',
 'dobbin',
 'dom',
 'dommy',
 'don',
 'donnie',
 'dot',
 'dottie',
 'doug',
 'drea',
 'drew',
 'dud',
 'eck',
 'ed',
 'edd',
 'eddie',
 'eddy',
 'ellie',
 'em',
 'emmy',
 'eug',
 'fi',
 'flo',
 'florrie',
 'fran',
 'frank