In [2]:
import pandas as pd
import numpy as np
import logging

import textdistance
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

np.random.seed(0)

import ray
ray.init(num_cpus = 8, ignore_reinit_error=True) # Specify this system has 4 CPUs.

2021-02-23 18:13:01,141	INFO services.py:1174 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'node_ip_address': '192.168.1.102',
 'raylet_ip_address': '192.168.1.102',
 'redis_address': '192.168.1.102:6379',
 'object_store_address': 'tcp://127.0.0.1:54947',
 'raylet_socket_name': 'tcp://127.0.0.1:55118',
 'webui_url': '127.0.0.1:8265',
 'session_dir': 'C:\\tmp\\ray\\session_2021-02-23_18-12-59_587536_17020',
 'metrics_export_port': 60489,
 'node_id': 'd7b115505b77688c586dd9db40e6cc1ee58eb06e6ec93700c9238360'}

#### Load 1000 randomly generated names and surnames

In [3]:
csv1 = pd.read_csv(r"./data/MOCK_DATA.csv")

In [4]:
csv1.shape

(28, 3)

In [5]:
csv1.head(5)

Unnamed: 0,id,full name,email
0,1,Mirelle Spire Kirtley,mkirtley0@geocities.jp
1,2,Detelina R. Labed,dlenz1@mapquest.com
2,3,Bryanty Wolford,bwolford2@ucoz.ru
3,4,Elijah von Hagt,evon3@who.int
4,5,Mathe Sivier,msivier4@google.ru


#### Load first names

In [58]:
csv2_first_names = pd.read_csv(r"./data/first_names.txt", header=None, names=['first_name'])

In [59]:
csv2_first_names.shape

(19948, 1)

In [26]:
csv2_first_names.head(5)

Unnamed: 0,first_name
0,Añaterve
1,Añes
2,Aadil
3,Aali
4,Aaliyah


#### Load surnames

In [27]:
csv2_surnames = pd.read_csv(r"./data/surnames.txt", error_bad_lines=False, header=None, names=['surname'])

In [28]:
csv2_surnames.shape

(88025, 1)

In [29]:
csv2_surnames.head(5)

Unnamed: 0,surname
0,Ñeco
1,Ñiguez
2,Açaola
3,Añaños
4,Añale


#### Let's mix first names and surnames randomly

In [31]:
sample = csv2_first_names['first_name'].iloc[np.random.choice(csv2_first_names.shape[0])]
sample

'Leyi'

In [32]:
%%time
csv2 = pd.DataFrame()
csv2["full_name"] = csv2_surnames['surname'].apply(lambda x : '{} {}'.format(csv2_first_names['first_name'].iloc[np.random.choice(csv2_first_names.shape[0])], x))

Wall time: 5.53 s


In [33]:
csv2.tail(5)

Unnamed: 0,full_name
88020,Sherley Zwolenksy
88021,Chad Zydz
88022,Gian Zylinsk
88023,Rashad Zylstra
88024,Miaomiao Zywiyask


Shortcut to load and save the full name, instead of randomly generating them on each run

In [7]:
csv2 = pd.read_csv(r"./data/full_names.txt", header=None, names=['full_name'])

In [40]:
# csv2.to_csv(r'./data/full_names.txt', index=False)

#### Start comparing

In [8]:
compare = pd.MultiIndex.from_product([csv1["full name"], csv2["full_name"]]).to_series()

In [62]:
def distances(vals):
    return pd.Series([fuzz.ratio(*vals), fuzz.token_sort_ratio(*vals)], ['ratio', 'token'])

In [63]:
%%time
results = compare.apply(distances)

Wall time: 3min 52s


In [64]:
results.sort_values('ratio', ascending=False).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,ratio,token
full_name,full name,Unnamed: 2_level_1,Unnamed: 3_level_1
Detelina Labed,Detelina R. Labed,90,93
Matheus Ozier,Mathe Sivier,72,72
Detelina Kaufer,Detelina R. Labed,69,77
Murielle Spire,Mirelle Spire Kirtley,69,69
Marquerite Siverio,Mathe Sivier,67,67
Margarethe Sineiro,Mathe Sivier,67,67
Matene Cliville,Mathe Sivier,67,37
Detelina Harfert,Detelina R. Labed,67,75
Detelin Pagadi,Detelina R. Labed,65,67
Mbaye Siwe,Mathe Sivier,64,64


In [65]:
results.sort_values('token', ascending=False).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,ratio,token
full_name,full name,Unnamed: 2_level_1,Unnamed: 3_level_1
Detelina Labed,Detelina R. Labed,90,93
Silveri Matei,Mathe Sivier,40,80
Detelina Kaufer,Detelina R. Labed,69,77
Detelina Harfert,Detelina R. Labed,67,75
Silvie Marek,Mathe Sivier,50,75
Matheus Ozier,Mathe Sivier,72,72
Silvie Batels,Mathe Sivier,40,72
Cvetelina Leidiger,Detelina R. Labed,63,71
Silvestra Matthewes,Mathe Sivier,39,71
Ivie Ama,Mathe Sivier,30,70


#### Show final results

In [66]:
%%time
output = results.unstack().idxmax(0).unstack(0)
output.head(5)

Wall time: 181 ms


Unnamed: 0_level_0,ratio,token
full name,Unnamed: 1_level_1,Unnamed: 2_level_1
Bryanty Wolford,Goran Woodward,Wilfred Blanquet
Detelina R. Labed,Detelina Labed,Detelina Labed
Elijah von Hagt,Elijah Raimond,Elijah Raimond
Mathe Sivier,Matheus Ozier,Silveri Matei
Mirelle Spire Kirtley,Murielle Spire,Murielle Spire


#### Distribute it!

In [9]:
csv1_split = np.array_split(csv1["full name"], 8)
compare_split = [pd.MultiIndex.from_product([csv1_split[i], csv2["full_name"]]).to_series() for i in range(8)]

In [13]:
@ray.remote
class Worker(object):
    def __init__(self):
        self.logger = logging.getLogger(__name__)
    def calculate(self):
        self.logger.warning("print from inside worker")
    def distances(self, vals):
        return pd.Series([fuzz.ratio(*vals), fuzz.token_sort_ratio(*vals)], ['ratio', 'token'])

worker = Worker.remote()

futures = [worker.remote(compare_split[i]) for i in range(8)]
print(ray.get(futures))

AttributeError: 'ActorHandle' object has no attribute 'remote'

In [None]:
with pd.ExcelWriter('./data/distance_output.xlsx', mode='a') as writer:  
    output.to_excel(writer, sheet_name='Distances')

In [69]:
ray.shutdown()