In [1]:
import pandas as pd
import re
import enchant
import json
import statistics
from unidecode import unidecode
from collections import Counter, defaultdict
from itertools import chain

In [8]:
class ChineseFinder(object):
    
    def __init__(self):
        
        self.ench_dic = enchant.Dict("en_US")  # english spellcheck
        self.name_dic_dir = "name_dicts"
        # chinese names and surnames
        self.chinese_names = set(chain.from_iterable([unidecode(part).replace("-"," ").split() 
                                    for full_name in pd.read_csv(self.name_dic_dir + "/" + "names_cn.txt", header=None)[0].tolist() 
                                                      for part in full_name.split()]))
        self.chinese_surnames = pd.read_csv(self.name_dic_dir + "/" + "surnames_cn.txt", header=None)[0].tolist() 
        self.korean_names = set(chain.from_iterable([unidecode(part).replace("-"," ").split() 
                                    for full_name in pd.read_csv(self.name_dic_dir + "/" + "names_ko.txt", header=None)[0].tolist() 
                                                     for part in full_name.split()]))
        self.koresn_surnames = pd.read_csv(self.name_dic_dir + "/" + "surnames_ko.txt", header=None)[0].tolist()
        # vietnamese last names, cover 90% of population (wikipedia)
        self.vietnamese_names = set(pd.read_csv(self.name_dic_dir + "/" + "names_vn.txt", header=None)[0].tolist())
        self.vietnamese_surnames = pd.read_csv(self.name_dic_dir + "/" + "surnames_vn.txt", header=None)[0].tolist()
        self.ticketek_customers = pd.read_csv("../data/ticketek_customers.csv.gz", dtype=str).drop("middle_name", axis=1)
        self.ticketek_customers["full_name"] = self.ticketek_customers["name"] + " " + self.ticketek_customers["last_name"]
        # hypocorisms; we just make a set of these and don't care what names they relate to
        self.hypoc_dict = json.load(open("../data/hypocorisms.json", "r")) 
        self.hypocs = {hyp for full_name in self.hypoc_dict
                               for hyp in self.hypoc_dict[full_name]}
        self.chinese_letter_stats = defaultdict()
        self.chinese_cust_ids = set()
        # a mask to exclude de longhi, te vroomer, etc that have parts like in chinese names
        self.non_chinese_mask = re.compile('\s+[ltodaelsn]{2,3}\s+\w{5,}(\s|$)', re.ASCII)
        # customer data frame will be processed by chunks specified below
        self.CHUNK_SIZE = 100000    # in rows
        self.FULL_CHUNKS, self.ROWS_LEFT = divmod(len(self.ticketek_customers), self.CHUNK_SIZE)
    
    def _last_n_letters(self, names, n):  
        
        last_letters = [w[-n:] for w in names if len(w[-n:]) == n]
        most_frequent = statistics.mode(last_letters)
              
        return (set(last_letters), most_frequent)
        
    def get_name_stats(self):
        
        for nletters in range(1,4):
            lst, top = self._last_n_letters(self.chinese_names, nletters)
            self.chinese_letter_stats["last" + str(nletters)] = lst
            self.chinese_letter_stats["toplast" + str(nletters)] = top
        
        self.chinese_letter_stats["max_len"] = len(max(self.chinese_names, key=len))
        self.chinese_letter_stats["avg_len"] = round(statistics.mean([len(name) for name in self.chinese_names]),0)
        
        return self
    
    def _is_hypoc(self, st):
        return 1 if (set(str(st).split()) & self.hypocs) else 0
    
    def _is_chinese(self, st):
        return 1 if (set(str(st).split()) & self.chinese_names) else 0
    
    def _is_korean(self, st):
        return 1 if (set(str(st).split()) & self.korean_names) else 0
    
    def _is_vietnamese(self, st):
        return 1 if (set(str(st).split()) & self.vietnamese_names) else 0
    
    def _is_english_word(self, st):
        return int(self.ench_dic.check(st))
    
    def is_likely_chinese(self, st):
        if (self._is_chinese(st) and 
            (not self._is_korean(st)) and 
                (not self._is_hypoc(st)) and 
                    (all([not self._is_english_word(part) for part in str(st).split()]))):
            return 1
        else:
            return 0
    
    def get_top_popular_last_names(self, n):
        
        self.most_popular_last_names = sorted([(k,v) for k, v in Counter(self.chinese_df["last_name"].tolist()).items()], 
                                              key=lambda x: x[1], reverse=True)
        return self
        
    def find_chinese_in_dataframe(self):
        
        print("searching for chinese among all Ticketek customers...")

        for i in range(self.FULL_CHUNKS + 1):
            
            LAST_ONE = i*self.CHUNK_SIZE + self.CHUNK_SIZE if i < self.FULL_CHUNKS else i*self.CHUNK_SIZE + self.ROWS_LEFT
            sub_df = self.ticketek_customers.iloc[i*self.CHUNK_SIZE:LAST_ONE,:]
            
            self.chinese_cust_ids.update(set(sub_df.loc[sub_df["full_name"].apply(self.is_likely_chinese) & 
                       sub_df["last_name"].apply(lambda _: (sum([str(_).endswith(w) for w in self.chinese_letter_stats["last2"]]) > 0)
                                                and (not self.non_chinese_mask.search(str(_))) and
                                                (not self._is_vietnamese(str(_)))),
                       "cust_id"].tolist()))

            if (i%20 == 0) or (i == self.FULL_CHUNKS-1):
                print("ids found so far: {}".format(len(self.chinese_cust_ids)))
                
        self.chinese_df = self.ticketek_customers.loc[self.ticketek_customers.cust_id.isin(self.chinese_cust_ids),:]
        
        return self
        

In [9]:
if __name__ == "__main__":
    
    cf = ChineseFinder().get_name_stats()

In [10]:
cf.find_chinese_in_dataframe()

searching for chinese among all Ticketek customers...
ids found so far: 267
ids found so far: 6741
ids found so far: 10973
ids found so far: 14805
ids found so far: 20740
ids found so far: 27410
ids found so far: 35892
ids found so far: 46693
ids found so far: 57408
ids found so far: 74342
ids found so far: 91088
ids found so far: 91842


<__main__.ChineseFinder at 0x1186e61d0>

In [11]:
cf.chinese_df

Unnamed: 0,cust_id,title,name,last_name,email,full_name
39,4596995,mr,kevin,feng,downloadpass999@hotmail.com,kevin feng
123,4597079,ms,katherine,chen,kathchen@optusnet.com.au,katherine chen
365,4597321,ms,menghua,liu,quedy2002@yahoo.com,menghua liu
577,4597533,,narin,na ranong,narin@optushome.com.au,narin na ranong
859,4597815,mrs,linda,chen,lindachen@swiftdsl.com.au,linda chen
2173,4599129,ms,jennifer,lin,l_maddie68@yahoo.com.au,jennifer lin
2593,4599549,ms,katherine,liu,kliu1008@bigpond.net.au,katherine liu
2639,4599595,ms,emily,lim,anggrani@hotmail.com,emily lim
2670,4599626,mr,michael,yi,changjaeyi@hotmail.com,michael yi
2876,4599832,,li,shing,klshing@yahoo.com,li shing
