In [63]:
import pandas as pd
import numpy as np
import logging
import time

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

np.random.seed(0)

import ray
ray.init(num_cpus = 8, ignore_reinit_error=True) # Specify this system has 4 CPUs.

2021-02-24 16:42:17,397	INFO worker.py:665 -- Calling ray.init() again after it has already been called.


#### Load 1000 randomly generated names and surnames

In [29]:
csv1 = pd.read_csv(r"./data/MOCK_DATA.csv")

In [30]:
csv1.shape

(28, 3)

In [31]:
csv1.head(5)

Unnamed: 0,id,full name,email
0,1,Mirelle Spire Kirtley,mkirtley0@geocities.jp
1,2,Detelina R. Labed,dlenz1@mapquest.com
2,3,Bryanty Wolford,bwolford2@ucoz.ru
3,4,Elijah von Hagt,evon3@who.int
4,5,Mathe Sivier,msivier4@google.ru


#### Load first names

In [32]:
csv2_first_names = pd.read_csv(r"./data/first_names.txt", header=None, names=['first_name'])

In [33]:
csv2_first_names.shape

(19948, 1)

In [34]:
csv2_first_names.head(5)

Unnamed: 0,first_name
0,Añaterve
1,Añes
2,Aadil
3,Aali
4,Aaliyah


#### Load surnames

In [35]:
csv2_surnames = pd.read_csv(r"./data/surnames.txt", error_bad_lines=False, header=None, names=['surname'])

In [36]:
csv2_surnames.shape

(88025, 1)

In [37]:
csv2_surnames.head(5)

Unnamed: 0,surname
0,Ñeco
1,Ñiguez
2,Açaola
3,Añaños
4,Añale


#### Let's mix first names and surnames randomly

In [38]:
sample = csv2_first_names['first_name'].iloc[np.random.choice(csv2_first_names.shape[0])]
sample

'Billi'

In [39]:
%%time
csv2 = pd.DataFrame()
csv2["full_name"] = csv2_surnames['surname'].apply(lambda x : '{} {}'.format(csv2_first_names['first_name'].iloc[np.random.choice(csv2_first_names.shape[0])], x))

Wall time: 6.58 s


In [40]:
csv2.tail(5)

Unnamed: 0,full_name
88020,Ludivina Zwolenksy
88021,Sherley Zydz
88022,Chad Zylinsk
88023,Gian Zylstra
88024,Rashad Zywiyask


Shortcut to load and save the full name, instead of randomly generating them on each run

In [41]:
csv2 = pd.read_csv(r"./data/full_names.txt", header=None, names=['full_name'])

In [42]:
# csv2.to_csv(r'./data/full_names.txt', index=False)

#### Start comparing

In [43]:
compare = pd.MultiIndex.from_product([csv1["full name"], csv2["full_name"]]).to_series()

In [62]:
def distances(vals):
    return pd.Series([fuzz.ratio(*vals), fuzz.token_sort_ratio(*vals)], ['ratio', 'token'])

In [63]:
%%time
results = compare.apply(distances)

Wall time: 3min 52s


In [64]:
results.sort_values('ratio', ascending=False).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,ratio,token
full_name,full name,Unnamed: 2_level_1,Unnamed: 3_level_1
Detelina Labed,Detelina R. Labed,90,93
Matheus Ozier,Mathe Sivier,72,72
Detelina Kaufer,Detelina R. Labed,69,77
Murielle Spire,Mirelle Spire Kirtley,69,69
Marquerite Siverio,Mathe Sivier,67,67
Margarethe Sineiro,Mathe Sivier,67,67
Matene Cliville,Mathe Sivier,67,37
Detelina Harfert,Detelina R. Labed,67,75
Detelin Pagadi,Detelina R. Labed,65,67
Mbaye Siwe,Mathe Sivier,64,64


In [65]:
results.sort_values('token', ascending=False).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,ratio,token
full_name,full name,Unnamed: 2_level_1,Unnamed: 3_level_1
Detelina Labed,Detelina R. Labed,90,93
Silveri Matei,Mathe Sivier,40,80
Detelina Kaufer,Detelina R. Labed,69,77
Detelina Harfert,Detelina R. Labed,67,75
Silvie Marek,Mathe Sivier,50,75
Matheus Ozier,Mathe Sivier,72,72
Silvie Batels,Mathe Sivier,40,72
Cvetelina Leidiger,Detelina R. Labed,63,71
Silvestra Matthewes,Mathe Sivier,39,71
Ivie Ama,Mathe Sivier,30,70


#### Show final results

In [66]:
%%time
output = results.unstack().idxmax(0).unstack(0)
output.head(5)

Wall time: 181 ms


Unnamed: 0_level_0,ratio,token
full name,Unnamed: 1_level_1,Unnamed: 2_level_1
Bryanty Wolford,Goran Woodward,Wilfred Blanquet
Detelina R. Labed,Detelina Labed,Detelina Labed
Elijah von Hagt,Elijah Raimond,Elijah Raimond
Mathe Sivier,Matheus Ozier,Silveri Matei
Mirelle Spire Kirtley,Murielle Spire,Murielle Spire


#### Distribute it!

In [9]:
csv1_split = np.array_split(csv1["full name"], 8)
compare_split = [pd.MultiIndex.from_product([csv1_split[i], csv2["full_name"]]).to_series() for i in range(8)]

In [58]:
@ray.remote
class Worker(object):
    def __init__(self):
        self.logger = logging.getLogger(__name__)
    def calculate(self, compare):
        self.logger.info("Calculating distances.")
        return compare.apply(self.distances)
    def distances(self, vals):
        return pd.Series([fuzz.ratio(*vals), fuzz.token_sort_ratio(*vals)], ['ratio', 'token'])

worker_actor = Worker.remote()
compare_refs = ray.get([ray.put(compare_split[i]) for i in range(8)])
workers_actors = [worker_actor.calculate.remote(compare_refs[i]) for i in range(8)]

In [61]:
ready, not_ready = ray.wait(workers_actors)
ready

[ObjectRef(d3ab1aaf09c00ee8931690d20f999d2f304778680100000001000000)]

In [62]:
not_ready

[ObjectRef(b9679322487eb9c3931690d20f999d2f304778680100000001000000),
 ObjectRef(fc8204747604c8cc931690d20f999d2f304778680100000001000000),
 ObjectRef(9d78bd90898368ca931690d20f999d2f304778680100000001000000),
 ObjectRef(36e14f5d7a18682e931690d20f999d2f304778680100000001000000),
 ObjectRef(48b7ee15cf1ae472931690d20f999d2f304778680100000001000000),
 ObjectRef(9bb55bc9a880f4ce931690d20f999d2f304778680100000001000000),
 ObjectRef(aae99729138a74d2931690d20f999d2f304778680100000001000000)]

Get details on what is ready and what is not, wait for 60 iterations of 1 minute each

In [None]:
for i in range(1,120): 
    ready, not_ready = ray.wait(workers_actors)
    print('iteration:', i) 
    print('Ready length, values: ', len(ready), ray.get(ready))
    print('Not Ready length:', len(not_ready))
    ids = not_ready
    time.sleep(60) # Sleep for 1 minute
    if not ids:
        break

iteration: 1
Ready length, values:  1 [                                 ratio  token
full name     full_name                      
Gale Chalfont full_name             27     36
              Karttikeya Ñeco       29     37
              Zakari Ñiguez         15     16
              Nagore Açaola         38     40
              Jianmei Añaños        37     40
...                                ...    ...
Niles Nezey   Sherley Zwolenksy     36     50
              Chad Zydz             20     20
              Gian Zylinsk          26     43
              Rashad Zylstra        24     32
              Miaomiao Zywiyask     21     29

[352104 rows x 2 columns]]
Not Ready length: 7
iteration: 2
Ready length, values:  1 [                                      ratio  token
full name          full_name                      
Maureen Simoneschi full_name             30     30
                   Karttikeya Ñeco       36     25
                   Zakari Ñiguez         32     27
                   Na

[2m[36m(pid=30900)[0m Windows fatal exception: access violation
[2m[36m(pid=30900)[0m 
[2m[36m(pid=30900)[0m Windows fatal exception: access violation
[2m[36m(pid=30900)[0m 


iteration: 11
Ready length, values:  1 [                                    ratio  token
full name        full_name                      
Deteln R Lerz    full_name             27     27
                 Karttikeya Ñeco       29     22
                 Zakari Ñiguez         23     32
                 Nagore Açaola         15     24
                 Jianmei Añaños        15     16
...                                   ...    ...
Helen Maria Tiez Sherley Zwolenksy     30     36
                 Chad Zydz             24     32
                 Gian Zylinsk          29     29
                 Rashad Zylstra        20     27
                 Miaomiao Zywiyask     36     36

[264078 rows x 2 columns]]
Not Ready length: 7
iteration: 12
Ready length, values:  1 [                                      ratio  token
full name          full_name                      
Maureen Simoneschi full_name             30     30
                   Karttikeya Ñeco       36     25
                   Zakari Ñigue

iteration: 23
Ready length, values:  1 [                                    ratio  token
full name        full_name                      
Deteln R Lerz    full_name             27     27
                 Karttikeya Ñeco       29     22
                 Zakari Ñiguez         23     32
                 Nagore Açaola         15     24
                 Jianmei Añaños        15     16
...                                   ...    ...
Helen Maria Tiez Sherley Zwolenksy     30     36
                 Chad Zydz             24     32
                 Gian Zylinsk          29     29
                 Rashad Zylstra        20     27
                 Miaomiao Zywiyask     36     36

[264078 rows x 2 columns]]
Not Ready length: 7
iteration: 24
Ready length, values:  1 [                                 ratio  token
full name     full_name                      
Gale Chalfont full_name             27     36
              Karttikeya Ñeco       29     37
              Zakari Ñiguez         15     16
     

iteration: 35
Ready length, values:  1 [                                ratio  token
full name    full_name                      
Mathe Sivier full_name             19     19
             Karttikeya Ñeco       37     31
             Zakari Ñiguez         32     25
             Nagore Açaola         24     25
             Jianmei Añaños        23     33
...                               ...    ...
Art Solleme  Sherley Zwolenksy     36     36
             Chad Zydz             10     20
             Gian Zylinsk          17     26
             Rashad Zylstra        16     24
             Miaomiao Zywiyask     14     21

[352104 rows x 2 columns]]
Not Ready length: 7
iteration: 36
Ready length, values:  1 [                                 ratio  token
full name     full_name                      
Gale Chalfont full_name             27     36
              Karttikeya Ñeco       29     37
              Zakari Ñiguez         15     16
              Nagore Açaola         38     40
           

iteration: 47
Ready length, values:  1 [                                    ratio  token
full name        full_name                      
Deteln R Lerz    full_name             27     27
                 Karttikeya Ñeco       29     22
                 Zakari Ñiguez         23     32
                 Nagore Açaola         15     24
                 Jianmei Añaños        15     16
...                                   ...    ...
Helen Maria Tiez Sherley Zwolenksy     30     36
                 Chad Zydz             24     32
                 Gian Zylinsk          29     29
                 Rashad Zylstra        20     27
                 Miaomiao Zywiyask     36     36

[264078 rows x 2 columns]]
Not Ready length: 7
iteration: 48
Ready length, values:  1 [                                 ratio  token
full name     full_name                      
Gale Chalfont full_name             27     36
              Karttikeya Ñeco       29     37
              Zakari Ñiguez         15     16
     

iteration: 59
Ready length, values:  1 [                                ratio  token
full name    full_name                      
Mathe Sivier full_name             19     19
             Karttikeya Ñeco       37     31
             Zakari Ñiguez         32     25
             Nagore Açaola         24     25
             Jianmei Añaños        23     33
...                               ...    ...
Art Solleme  Sherley Zwolenksy     36     36
             Chad Zydz             10     20
             Gian Zylinsk          17     26
             Rashad Zylstra        16     24
             Miaomiao Zywiyask     14     21

[352104 rows x 2 columns]]
Not Ready length: 7
iteration: 60
Ready length, values:  1 [                                    ratio  token
full name        full_name                      
Eryn Tirkin      full_name             10     10
                 Karttikeya Ñeco       23     40
                 Zakari Ñiguez         25     35
                 Nagore Açaola         17 

iteration: 71
Ready length, values:  1 [                                ratio  token
full name    full_name                      
Mathe Sivier full_name             19     19
             Karttikeya Ñeco       37     31
             Zakari Ñiguez         32     25
             Nagore Açaola         24     25
             Jianmei Añaños        23     33
...                               ...    ...
Art Solleme  Sherley Zwolenksy     36     36
             Chad Zydz             10     20
             Gian Zylinsk          17     26
             Rashad Zylstra        16     24
             Miaomiao Zywiyask     14     21

[352104 rows x 2 columns]]
Not Ready length: 7
iteration: 72
Ready length, values:  1 [                                    ratio  token
full name        full_name                      
Deteln R Lerz    full_name             27     27
                 Karttikeya Ñeco       29     22
                 Zakari Ñiguez         23     32
                 Nagore Açaola         15 

iteration: 83
Ready length, values:  1 [                                    ratio  token
full name        full_name                      
Deteln R Lerz    full_name             27     27
                 Karttikeya Ñeco       29     22
                 Zakari Ñiguez         23     32
                 Nagore Açaola         15     24
                 Jianmei Añaños        15     16
...                                   ...    ...
Helen Maria Tiez Sherley Zwolenksy     30     36
                 Chad Zydz             24     32
                 Gian Zylinsk          29     29
                 Rashad Zylstra        20     27
                 Miaomiao Zywiyask     36     36

[264078 rows x 2 columns]]
Not Ready length: 7
iteration: 84
Ready length, values:  1 [                                ratio  token
full name    full_name                      
Mathe Sivier full_name             19     19
             Karttikeya Ñeco       37     31
             Zakari Ñiguez         32     25
          

In [None]:
with pd.ExcelWriter('./data/distance_output.xlsx', mode='a') as writer:  
    output.to_excel(writer, sheet_name='Distances')

In [27]:
ray.shutdown()