In [8]:
import pandas as pd
from nltk.corpus import words # list of english words
import enchant  # english dictionary
import re
import json
import os
from unidecode import unidecode
from collections import Counter, defaultdict
from itertools import chain, permutations
import jellyfish

In [11]:
class EthnicityDetector(object):
    
    def __init__(self):
        
        self.DATA_DIR = "/Users/ik/Data/"
        self.NAME_DATA_DIR = self.DATA_DIR + "names/"
        self.INDIAN_EVENTS_CIDS = {i for i in set(pd.read_csv("customer_ids_indian_events.txt", dtype=str)["LotusCustomerID"]) if i}
        
        self.ench_dic = enchant.Dict("en_US")  # english spellcheck
        self.wordnet_dic = words.words()
        
        """ create a name/surname dictionary by ethnicity; it should look like this:
             {"chinese": {"names": {n1,n2,..}, "surnames": {s1,s2,..}}, "vietnamese": {..},..}
        """
        
        self.name_dict = json.load(open(self.NAME_DATA_DIR + "names_26092017.json", "r"))
        self.surname_dict = json.load(open(self.NAME_DATA_DIR + "surnames_26092017.json", "r"))
        
        self.indian_names = {w["name"] for w in self.name_dict['indian']}
        self.indian_surnames = {w for w in self.surname_dict['indian']}
        self.filipino_names = {w["name"] for w in self.name_dict['filipino']}
        self.filipino_surnames = {w for w in self.surname_dict['filipino']}
        print("indian names: {} surnames: {}".format(len(self.indian_names), len(self.indian_surnames)))
        print("filipino names: {} surnames: {}".format(len(self.filipino_names), len(self.filipino_surnames)))
               
        self.ticketek_customers = pd.read_csv(self.DATA_DIR + "/customers/all-customers-18092017.csv.gz", dtype=str)
        self.ticketek_customers = self.ticketek_customers[self.ticketek_customers.cust_lst == '2'].drop("title cust_lst email ph_home ph_work ph_mob".split(), axis=1)
        print("ticketek customer ids: {}".format(len(self.ticketek_customers)))
        
        # customer data frame will be processed by chunks specified below
        self.CHUNK_SIZE = 10000    # in rows
        self.FULL_CHUNKS, self.ROWS_LEFT = divmod(len(self.ticketek_customers), self.CHUNK_SIZE)
        print("full chunks to process: {}".format(self.FULL_CHUNKS))
    
    def _find_names(self, s):
        
        if not isinstance(s, str):
            return None
        
        if not s.strip():
            return None
        
        if s in self.indian_names:
            return "indian"
        
        if s in self.filipino_names:
            return "filipino"
        
        return None
    
    def _attended_indian(self, s):
        
        if not isinstance(s, str):
            return None
        
        if not s.strip():
            return None
        
        if s in self.INDIAN_EVENTS_CIDS:
            return "yes"
        
        return None
    
    def _find_surnames(self, s):
        
        if not isinstance(s, str):
            return None
        
        if not s.strip():
            return None
        
        if (len(s) > 2) and (s in self.indian_surnames):
            return "indian"
        
        if (len(s) > 2) and (s in self.filipino_surnames):
            return "filipino"
        
        return None
    
    def find_ethnicities(self):
        
        print("matching ethnicities for all ticketek customers...")
        
        self.finds = pd.DataFrame()
        
        for i in range(self.FULL_CHUNKS + 1):
            
            LAST_ONE = i*self.CHUNK_SIZE + self.CHUNK_SIZE if i < self.FULL_CHUNKS else i*self.CHUNK_SIZE + self.ROWS_LEFT
            sub_df = self.ticketek_customers.iloc[i*self.CHUNK_SIZE:LAST_ONE,:]
            
            sub_df["first_name_eth"] = sub_df.first_name.apply(self._find_names)
            sub_df["last_name_eth"] = sub_df.last_name.apply(self._find_surnames)
            sub_df["attended_indian"] = sub_df.cust_id.apply(self._attended_indian)
            
            self.finds = pd.concat([self.finds, sub_df[sub_df.first_name_eth.notnull() | 
                                                       sub_df.last_name_eth.notnull() |
                                                      sub_df.attended_indian.notnull()]])
                   
            print("processed chunks: {}".format(i + 1))
            print("ethnicities found: {}".format(len(self.finds)))
            #sub_df.to_csv("part-" + str(i) + ".csv", index=False)
            
            # print("first names found: filipino {} / indian {}".format())
            
#             if (i%10 == 0) or (i == self.FULL_CHUNKS-1):
#                 print("ids processed so far: {}".format(LAST_ONE))
                
                
        return self

ed = EthnicityDetector()

indian names: 760 surnames: 385
filipino names: 550 surnames: 2531
ticketek customer ids: 15904144
full chunks to process: 1590


In [12]:
ed.find_ethnicities()

matching ethnicities for all ticketek customers...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


processed chunks: 1
ethnicities found: 198
processed chunks: 2
ethnicities found: 518
processed chunks: 3
ethnicities found: 763
processed chunks: 4
ethnicities found: 942
processed chunks: 5
ethnicities found: 1122
processed chunks: 6
ethnicities found: 1291
processed chunks: 7
ethnicities found: 1516
processed chunks: 8
ethnicities found: 1719
processed chunks: 9
ethnicities found: 1937
processed chunks: 10
ethnicities found: 2195
processed chunks: 11
ethnicities found: 2409
processed chunks: 12
ethnicities found: 2562
processed chunks: 13
ethnicities found: 2715
processed chunks: 14
ethnicities found: 2872
processed chunks: 15
ethnicities found: 2988
processed chunks: 16
ethnicities found: 3155
processed chunks: 17
ethnicities found: 3328
processed chunks: 18
ethnicities found: 3523
processed chunks: 19
ethnicities found: 3726
processed chunks: 20
ethnicities found: 3972
processed chunks: 21
ethnicities found: 4175
processed chunks: 22
ethnicities found: 4403
processed chunks: 23
et

processed chunks: 180
ethnicities found: 44238
processed chunks: 181
ethnicities found: 44552
processed chunks: 182
ethnicities found: 44975
processed chunks: 183
ethnicities found: 45234
processed chunks: 184
ethnicities found: 45533
processed chunks: 185
ethnicities found: 45782
processed chunks: 186
ethnicities found: 46094
processed chunks: 187
ethnicities found: 46410
processed chunks: 188
ethnicities found: 46788
processed chunks: 189
ethnicities found: 47184
processed chunks: 190
ethnicities found: 47451
processed chunks: 191
ethnicities found: 47804
processed chunks: 192
ethnicities found: 48129
processed chunks: 193
ethnicities found: 48516
processed chunks: 194
ethnicities found: 48911
processed chunks: 195
ethnicities found: 49322
processed chunks: 196
ethnicities found: 50674
processed chunks: 197
ethnicities found: 50929
processed chunks: 198
ethnicities found: 51154
processed chunks: 199
ethnicities found: 51401
processed chunks: 200
ethnicities found: 51582
processed chu

processed chunks: 355
ethnicities found: 79040
processed chunks: 356
ethnicities found: 79250
processed chunks: 357
ethnicities found: 79433
processed chunks: 358
ethnicities found: 79628
processed chunks: 359
ethnicities found: 79822
processed chunks: 360
ethnicities found: 80019
processed chunks: 361
ethnicities found: 80230
processed chunks: 362
ethnicities found: 80384
processed chunks: 363
ethnicities found: 80571
processed chunks: 364
ethnicities found: 80740
processed chunks: 365
ethnicities found: 80963
processed chunks: 366
ethnicities found: 81180
processed chunks: 367
ethnicities found: 81390
processed chunks: 368
ethnicities found: 81645
processed chunks: 369
ethnicities found: 81874
processed chunks: 370
ethnicities found: 82085
processed chunks: 371
ethnicities found: 82316
processed chunks: 372
ethnicities found: 82483
processed chunks: 373
ethnicities found: 82712
processed chunks: 374
ethnicities found: 82940
processed chunks: 375
ethnicities found: 83214
processed chu

processed chunks: 529
ethnicities found: 113098
processed chunks: 530
ethnicities found: 113225
processed chunks: 531
ethnicities found: 113384
processed chunks: 532
ethnicities found: 113548
processed chunks: 533
ethnicities found: 113683
processed chunks: 534
ethnicities found: 113861
processed chunks: 535
ethnicities found: 114034
processed chunks: 536
ethnicities found: 114213
processed chunks: 537
ethnicities found: 114368
processed chunks: 538
ethnicities found: 114552
processed chunks: 539
ethnicities found: 114721
processed chunks: 540
ethnicities found: 114883
processed chunks: 541
ethnicities found: 115018
processed chunks: 542
ethnicities found: 115170
processed chunks: 543
ethnicities found: 115332
processed chunks: 544
ethnicities found: 115492
processed chunks: 545
ethnicities found: 115639
processed chunks: 546
ethnicities found: 115814
processed chunks: 547
ethnicities found: 115981
processed chunks: 548
ethnicities found: 116172
processed chunks: 549
ethnicities found:

processed chunks: 700
ethnicities found: 146178
processed chunks: 701
ethnicities found: 146350
processed chunks: 702
ethnicities found: 146483
processed chunks: 703
ethnicities found: 146663
processed chunks: 704
ethnicities found: 146825
processed chunks: 705
ethnicities found: 146986
processed chunks: 706
ethnicities found: 147145
processed chunks: 707
ethnicities found: 147321
processed chunks: 708
ethnicities found: 147499
processed chunks: 709
ethnicities found: 147648
processed chunks: 710
ethnicities found: 147842
processed chunks: 711
ethnicities found: 148032
processed chunks: 712
ethnicities found: 148205
processed chunks: 713
ethnicities found: 148391
processed chunks: 714
ethnicities found: 148562
processed chunks: 715
ethnicities found: 148725
processed chunks: 716
ethnicities found: 148902
processed chunks: 717
ethnicities found: 149056
processed chunks: 718
ethnicities found: 149187
processed chunks: 719
ethnicities found: 149367
processed chunks: 720
ethnicities found:

processed chunks: 872
ethnicities found: 187384
processed chunks: 873
ethnicities found: 187514
processed chunks: 874
ethnicities found: 187696
processed chunks: 875
ethnicities found: 187841
processed chunks: 876
ethnicities found: 188035
processed chunks: 877
ethnicities found: 188206
processed chunks: 878
ethnicities found: 188387
processed chunks: 879
ethnicities found: 188566
processed chunks: 880
ethnicities found: 188788
processed chunks: 881
ethnicities found: 188974
processed chunks: 882
ethnicities found: 189196
processed chunks: 883
ethnicities found: 189475
processed chunks: 884
ethnicities found: 189732
processed chunks: 885
ethnicities found: 189901
processed chunks: 886
ethnicities found: 190119
processed chunks: 887
ethnicities found: 190353
processed chunks: 888
ethnicities found: 190628
processed chunks: 889
ethnicities found: 190894
processed chunks: 890
ethnicities found: 191204
processed chunks: 891
ethnicities found: 191481
processed chunks: 892
ethnicities found:

processed chunks: 1043
ethnicities found: 220489
processed chunks: 1044
ethnicities found: 220604
processed chunks: 1045
ethnicities found: 220780
processed chunks: 1046
ethnicities found: 221004
processed chunks: 1047
ethnicities found: 221298
processed chunks: 1048
ethnicities found: 221549
processed chunks: 1049
ethnicities found: 221848
processed chunks: 1050
ethnicities found: 222047
processed chunks: 1051
ethnicities found: 222273
processed chunks: 1052
ethnicities found: 222408
processed chunks: 1053
ethnicities found: 222523
processed chunks: 1054
ethnicities found: 222653
processed chunks: 1055
ethnicities found: 222845
processed chunks: 1056
ethnicities found: 223058
processed chunks: 1057
ethnicities found: 223261
processed chunks: 1058
ethnicities found: 223479
processed chunks: 1059
ethnicities found: 223700
processed chunks: 1060
ethnicities found: 223926
processed chunks: 1061
ethnicities found: 224183
processed chunks: 1062
ethnicities found: 224552
processed chunks: 10

ethnicities found: 258248
processed chunks: 1212
ethnicities found: 258418
processed chunks: 1213
ethnicities found: 258569
processed chunks: 1214
ethnicities found: 258734
processed chunks: 1215
ethnicities found: 258876
processed chunks: 1216
ethnicities found: 259060
processed chunks: 1217
ethnicities found: 259185
processed chunks: 1218
ethnicities found: 259329
processed chunks: 1219
ethnicities found: 259523
processed chunks: 1220
ethnicities found: 259681
processed chunks: 1221
ethnicities found: 259858
processed chunks: 1222
ethnicities found: 260027
processed chunks: 1223
ethnicities found: 260187
processed chunks: 1224
ethnicities found: 260330
processed chunks: 1225
ethnicities found: 260482
processed chunks: 1226
ethnicities found: 260643
processed chunks: 1227
ethnicities found: 260814
processed chunks: 1228
ethnicities found: 261054
processed chunks: 1229
ethnicities found: 261242
processed chunks: 1230
ethnicities found: 261509
processed chunks: 1231
ethnicities found: 2

processed chunks: 1379
ethnicities found: 298982
processed chunks: 1380
ethnicities found: 299160
processed chunks: 1381
ethnicities found: 299342
processed chunks: 1382
ethnicities found: 299483
processed chunks: 1383
ethnicities found: 299658
processed chunks: 1384
ethnicities found: 299790
processed chunks: 1385
ethnicities found: 299950
processed chunks: 1386
ethnicities found: 300112
processed chunks: 1387
ethnicities found: 300286
processed chunks: 1388
ethnicities found: 300513
processed chunks: 1389
ethnicities found: 300670
processed chunks: 1390
ethnicities found: 300856
processed chunks: 1391
ethnicities found: 301011
processed chunks: 1392
ethnicities found: 301216
processed chunks: 1393
ethnicities found: 301403
processed chunks: 1394
ethnicities found: 301634
processed chunks: 1395
ethnicities found: 301843
processed chunks: 1396
ethnicities found: 302069
processed chunks: 1397
ethnicities found: 302259
processed chunks: 1398
ethnicities found: 302500
processed chunks: 13

processed chunks: 1548
ethnicities found: 335417
processed chunks: 1549
ethnicities found: 335606
processed chunks: 1550
ethnicities found: 335863
processed chunks: 1551
ethnicities found: 336105
processed chunks: 1552
ethnicities found: 336250
processed chunks: 1553
ethnicities found: 336422
processed chunks: 1554
ethnicities found: 336591
processed chunks: 1555
ethnicities found: 336802
processed chunks: 1556
ethnicities found: 336995
processed chunks: 1557
ethnicities found: 337243
processed chunks: 1558
ethnicities found: 337566
processed chunks: 1559
ethnicities found: 337835
processed chunks: 1560
ethnicities found: 338015
processed chunks: 1561
ethnicities found: 338189
processed chunks: 1562
ethnicities found: 338381
processed chunks: 1563
ethnicities found: 338550
processed chunks: 1564
ethnicities found: 338744
processed chunks: 1565
ethnicities found: 338953
processed chunks: 1566
ethnicities found: 339153
processed chunks: 1567
ethnicities found: 339344
processed chunks: 15

<__main__.EthnicityDetector at 0x2525040b8>

In [71]:
ed.finds.head()

Unnamed: 0,cust_id,first_name,middle_name,last_name,first_name_eth,last_name_eth,attended_indian
0,3161590,anita,,conte,indian,,
137,3162293,mary,,mendoza,,filipino,
210,3162603,jason,,jensen,,,yes
270,3162900,praveen,,chand,indian,,
308,3163038,stuart,,rogers,,,yes


In [64]:
filipinos = ed.finds[(ed.finds.first_name_eth == 'filipino') & (ed.finds.last_name_eth == 'filipino')]
print(filipinos.head())

       cust_id first_name middle_name  last_name first_name_eth last_name_eth  \
29350  3223409     ronnel         NaN  magdaluyo       filipino      filipino   
34858  3228921      chona           e   castillo       filipino      filipino   
38212  3232277       ding                poblete       filipino      filipino   
78305  3272403     rommel                   cruz       filipino      filipino   
91881  3285989    corazon                  reyes       filipino      filipino   

      attended_indian  
29350            None  
34858            None  
38212            None  
78305            None  
91881            None  


In [80]:
excl_indian_first = set("anita tara hasan samir jay rita lina ahmad ahmed muhammad mahmud sharif om farrukh sultan mehmud syed".split())

indians = ed.finds[(ed.finds.last_name_eth == 'indian') | 
                  ((ed.finds.first_name_eth == 'indian') & ~(ed.finds.first_name.isin(excl_indian_first)))]

In [85]:
len(indians)

165191

In [86]:
len(filipinos)

433

In [87]:
filipinos.cust_id.to_csv('filipino_cust_ids.txt', header=False, index=False)

In [88]:
indians.cust_id.to_csv('indian_cust_ids.txt', header=False, index=False)