In [54]:
import pandas as pd
from faker import Faker
import datetime
from joblib import Parallel, delayed
# from tqdm import tqdm
from tqdm.notebook import tqdm_notebook
from thefuzz import fuzz
from thefuzz import process

fake = Faker()
df1 = pd.read_csv('CN.csv')
df2 = pd.read_csv('WACR.csv')

In [55]:
def fuzzy_match_row(row1, df2, threshold=90):
    matches = []
    for _, row2 in df2.iterrows():
        first_name_score = fuzz.token_sort_ratio(row1['Given Name'], row2['Given Name'])
        if first_name_score >= threshold:
            row2['first_name_score'] = first_name_score
            last_name_score = fuzz.token_sort_ratio(row1['Surname'], row2['Surname'])
            if last_name_score >= threshold:
                row2['surname_score'] = last_name_score
                birthday_score = fuzz.token_sort_ratio(row1['Date of Birth'], row2['Date of Birth'])
                if birthday_score >= threshold:
                    row2['birthday_score'] = birthday_score
                    matches.append(row2)
        
    return matches
tqdm_notebook.pandas()

df1 = df1#.head(n=10000)
df2 = df2#.head(n=4000)
# Divide df2 into batches for parallel processing
batch_size = 1000
batch_indices = range(0, len(df2), batch_size)
batches = [df2.iloc[i:i+batch_size] for i in batch_indices]

matched_rows = []  # Initialize a list to store matched rows

# Use tqdm to display the progress bar
# for _, row1 in tqdm(df1.iterrows(), total=len(df1)):
for _, row1 in tqdm_notebook(df1.iterrows(), total=len(df1), desc="Processing"):    
    # Perform fuzzy matching in parallel using joblib
    batch_matches = Parallel(n_jobs=-1)(
        delayed(fuzzy_match_row)(row1, batch_df) for batch_df in batches
    )

    # Flatten the list of lists
    matched_rows.extend([row for sublist in batch_matches for row in sublist])

# Create a new dataframe containing the matched rows
columns = df1.columns.tolist() + ['first_name_score', 'surname_score', 'birthday_score']

matched_df = pd.DataFrame(matched_rows, columns=columns)


Processing:   0%|          | 0/4000 [00:00<?, ?it/s]

In [57]:
matched_df.drop_duplicates()

Unnamed: 0,Surname,Given Name,Date of Birth,Gender,Postal Address,Postal Code,Date of Test1,first_name_score,surname_score,birthday_score
50267,Copeland,Tara,10-05-1960,Male,"5508 Hunter Forks\nWest Rhonda, VT 30333",91516,17-01-2018,100,100,100
12118,Owens,Michael,29-12-1956,Other,"269 Kevin Rest Apt. 422\nMelissahaven, VT 40668",76233,09-07-2019,100,100,100
635,Clarke,Ruth,18-03-1954,Female,837 Amanda Islands Suite 787\nNorth Thomasberg...,78933,03-07-2018,100,100,100
48754,Elliott,Lisa,26-04-1965,Female,"08825 Thomas Forest\nButlerborough, LA 68703",8236,11-06-2019,100,100,100
54269,Zamora,Manuel,11-03-1958,Male,USNV Edwards\nFPO AP 32117,98798,01-04-2018,100,100,100
...,...,...,...,...,...,...,...,...,...,...
47036,Blackburn,Tammy,20-07-1952,Female,"12325 Miller Lakes\nEast Alicialand, FM 34217",60898,12-06-2018,100,100,100
13647,Ayers,Penny,04-06-1955,Other,"8716 Allen Points\nWilliamsfort, WA 32282",91141,01-02-2018,100,100,100
37290,Anderson,Michael,24-08-1969,Female,"229 Copeland Plains Suite 531\nEast Catherine,...",40050,11-02-2019,100,100,100
52896,Holmes,Aaron,16-07-1954,Female,"51711 Anderson Plain Apt. 537\nPort Matthew, K...",33315,19-06-2018,100,100,100


In [59]:
from concurrent.futures import ProcessPoolExecutor

matched_rows = []  # Initialize a list to store matched rows

# Use tqdm to display the progress bar
with ProcessPoolExecutor() as executor:
    for _, row1 in tqdm_notebook(df1.iterrows(), total=len(df1), desc="Processing"):
        batch_matches = list(executor.map(fuzzy_match_row, [row1] * len(batches), batches))

        # Flatten the list of lists
        matched_rows.extend([row for sublist in batch_matches for row in sublist])

# Create a new dataframe containing the matched rows
columns = df1.columns.tolist() + ['first_name_score', 'surname_score', 'birthday_score']
matched_df = pd.DataFrame(matched_rows, columns=columns)

Processing:   0%|          | 0/4000 [00:00<?, ?it/s]

Process ForkProcess-136:
Traceback (most recent call last):
  File "/home/dagmawi/miniconda3/envs/fuzz/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/dagmawi/miniconda3/envs/fuzz/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/dagmawi/miniconda3/envs/fuzz/lib/python3.10/concurrent/futures/process.py", line 240, in _process_worker
    call_item = call_queue.get(block=True)
  File "/home/dagmawi/miniconda3/envs/fuzz/lib/python3.10/multiprocessing/queues.py", line 103, in get
    res = self._recv_bytes()
  File "/home/dagmawi/miniconda3/envs/fuzz/lib/python3.10/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File "/home/dagmawi/miniconda3/envs/fuzz/lib/python3.10/multiprocessing/connection.py", line 421, in _recv_bytes
    return self._recv(size)
  File "/home/dagmawi/miniconda3/envs/fuzz/lib/python3.10/multiprocessing/c

KeyboardInterrupt: 

In [60]:
import dask
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

def fuzzy_match_row(row1, df2, threshold=90):
    matches = []
    for _, row2 in df2.iterrows():
        first_name_score = fuzz.token_sort_ratio(row1['Given Name'], row2['Given Name'])
        if first_name_score >= threshold:
            row2['first_name_score'] = first_name_score
            last_name_score = fuzz.token_sort_ratio(row1['Surname'], row2['Surname'])
            if last_name_score >= threshold:
                row2['surname_score'] = last_name_score
                birthday_score = fuzz.token_sort_ratio(row1['Date of Birth'], row2['Date of Birth'])
                if birthday_score >= threshold:
                    row2['birthday_score'] = birthday_score
                    matches.append(row2)
        
    return matches

def fuzzy_match_batch(row1, df2, threshold=90):
    return df2.apply(fuzzy_match_row, args=(row1, threshold), axis=1)

batch_size = 1000
batch_indices = range(0, len(df2), batch_size)
batches = [df2.iloc[i:i+batch_size] for i in batch_indices]

results = []
with ProgressBar():
    for _, row1 in df1.iterrows():
        batch_matches = [dask.delayed(fuzzy_match_batch)(row1, batch_df) for batch_df in batches]
        results.extend(dask.compute(*batch_matches, scheduler='processes'))

matched_rows = [row for sublist in results for row in sublist]

columns = df1.columns.tolist() + ['first_name_score', 'surname_score', 'birthday_score']
matched_df = pd.DataFrame(matched_rows, columns=columns)


ModuleNotFoundError: No module named 'dask'

In [64]:
import numpy as np
from tqdm import tqdm

def fuzzy_match_matrix(df1, df2):
    first_name_scores = np.zeros((len(df1), len(df2)))
    last_name_scores = np.zeros((len(df1), len(df2)))
    birthday_scores = np.zeros((len(df1), len(df2)))

    for i, row1 in enumerate(tqdm_notebook(df1.itertuples(), total=len(df1), desc="Processing")):
        for j, row2 in enumerate(df2.itertuples()):
            first_name_scores[i, j] = fuzz.token_sort_ratio(row1[2], row2[2])  # Adjust column index
            last_name_scores[i, j] = fuzz.token_sort_ratio(row1[3], row2[3])    # Adjust column index
            birthday_scores[i, j] = fuzz.token_sort_ratio(row1[4], row2[4])     # Adjust column index

    return first_name_scores, last_name_scores, birthday_scores

first_name_scores, last_name_scores, birthday_scores = fuzzy_match_matrix(df1, df2)

threshold = 90
matching_indices = np.where(
    (first_name_scores >= threshold) &
    (last_name_scores >= threshold) &
    (birthday_scores >= threshold)
)

matched_rows = []
for i, j in zip(*matching_indices):
    matched_rows.append(df2.iloc[j])

columns = df1.columns.tolist() + ['first_name_score', 'surname_score', 'birthday_score']
matched_df = pd.DataFrame(matched_rows, columns=columns)


Processing:   0%|          | 0/4000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [66]:
import numpy as np
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm

def fuzzy_match_row(row1, df2, j, threshold=90):
    first_name_score = fuzz.token_sort_ratio(row1[2], df2.iloc[j, 2])  # Adjust column index
    last_name_score = fuzz.token_sort_ratio(row1[3], df2.iloc[j, 3])    # Adjust column index
    birthday_score = fuzz.token_sort_ratio(row1[4], df2.iloc[j, 4])     # Adjust column index
    
    if (first_name_score >= threshold) and (last_name_score >= threshold) and (birthday_score >= threshold):
        return df2.iloc[j]
    else:
        return None

def fuzzy_match_matrix(df1, df2):
    matching_indices = []

    with ProcessPoolExecutor() as executor:
        futures = []
        for i, row1 in enumerate(tqdm_notebook(df1.itertuples(), total=len(df1), desc="Processing")):
            for j in range(len(df2)):
                futures.append(executor.submit(fuzzy_match_row, row1, df2, j))

        for future in tqdm_notebook(futures, total=len(futures), desc="Processing"):
            matched_row = future.result()
            if matched_row is not None:
                matching_indices.append(matched_row)

    return matching_indices

matched_rows = fuzzy_match_matrix(df1, df2)

columns = df1.columns.tolist() + ['first_name_score', 'surname_score', 'birthday_score']
matched_df = pd.DataFrame(matched_rows, columns=columns)


Processing:   0%|          | 0/4000 [00:00<?, ?it/s]

OSError: [Errno 12] Cannot allocate memory