In [1]:
import pandas as pd
from faker import Faker
import datetime
import numpy as np

from joblib import Parallel, delayed
from alive_progress import alive_bar
from thefuzz import fuzz, process
from tqdm.notebook import tqdm_notebook

In [2]:
# Notebook-wide Faker instance. Note that generate_synthetic_data() does not use
# this object; it constructs its own separate Faker() on every call.
fake = Faker()

In [3]:
def generate_synthetic_data(n, column_types):
    """
    Build a DataFrame of fake records, one column per entry of ``column_types``.

    Parameters:
    n: integer number of rows to generate.
    column_types (dict): maps each output column name to the name of a Faker
        provider method; the method is looked up with ``getattr`` on a fresh
        ``Faker()`` instance and called with no arguments for every value.

    Returns:
    pd.DataFrame: ``n`` rows, with columns in the iteration order of
        ``column_types``.
    """
    faker = Faker()

    generated = {}
    for column_name, provider_name in column_types.items():
        # Columns are filled one after another (all n values of the first
        # column, then all n of the second, ...), so values within a row are
        # independent draws.
        values = []
        for _ in range(n):
            values.append(getattr(faker, provider_name)())
        generated[column_name] = values

    return pd.DataFrame(generated)

def add_variance(df, num_records_to_vary, column_types):
    """
    Build a second data set that partially overlaps ``df``.

    ``num_records_to_vary`` rows of ``df`` are sampled (with replacement),
    copied, and each copy has one randomly chosen column perturbed. The result
    is ``len(df) - num_records_to_vary`` freshly generated rows followed by the
    perturbed copies. ``df`` itself is never modified.

    Parameters:
    df (pd.DataFrame): source records; any index is accepted because rows are
        sampled by position.
    num_records_to_vary (int): how many perturbed copies to produce.
    column_types (dict): column name -> type tag. ``'date_of_birth'`` columns
        are shifted by a whole number of days, ``'float64'`` columns get
        additive Gaussian noise, and every other column has a random number
        appended to its string form. Also forwarded to
        ``generate_synthetic_data`` for the fresh rows.

    Returns:
    pd.DataFrame: fresh rows then perturbed rows, with a new 0..N-1 index.
    """
    # Positions (not labels) of the rows to copy. Drawn with replacement, so
    # the same source row may be selected more than once.
    rows_to_vary = np.random.randint(0, len(df), num_records_to_vary)

    # iloc: select by position so a non-default index on df cannot cause a
    #       KeyError or pick the wrong rows.
    # copy: the edits below must not leak into (or warn about) df.
    # reset_index: give every sampled copy a unique label, so a source row that
    #       was drawn twice yields two independently perturbed copies instead
    #       of two rows sharing one label.
    modified_df = df.iloc[rows_to_vary].copy().reset_index(drop=True)
    cols_to_vary = np.random.choice(df.columns, num_records_to_vary, replace=True)

    # Perturb exactly one cell in each sampled copy.
    for i in range(num_records_to_vary):
        col = cols_to_vary[i]
        current = modified_df.at[i, col]
        if column_types[col] == 'date_of_birth':
            # Shift the date by roughly +/-10 days (may be 0 after truncation).
            modified_df.at[i, col] = current + pd.Timedelta(days=int(np.random.normal(0, 10)))
        elif column_types[col] == 'float64':
            # NOTE(review): 'float64' is not a Faker provider name, so this
            # branch only applies to callers that tag numeric columns this way.
            modified_df.at[i, col] = current + np.random.normal(0, 0.1)
        else:
            # Any other column: append a small random number to the text.
            modified_df.at[i, col] = str(current) + str(np.random.normal(0, 0.1))

    # Pad with unrelated fresh records so the result has as many rows as df
    # (a negative count simply produces no fresh rows).
    new_df = generate_synthetic_data(len(df) - num_records_to_vary, column_types)
    new_df = pd.concat([new_df, modified_df], axis=0)

    return new_df.reset_index(drop=True)

def match_rows(row1, df2, threshold=70):
    """
    Return every row of ``df2`` that fuzzily matches ``row1``.

    A candidate matches when the ``fuzz.token_sort_ratio`` score of each of
    'First Name', 'Last Name' and 'date_of_birth' is at least ``threshold``.
    A notebook progress bar is advanced once per candidate row.

    Parameters:
    row1: record to look up (indexable by the three field names).
    df2 (pd.DataFrame): candidate records.
    threshold: minimum score required on every field (default 70).

    Returns:
    list: the matching rows of ``df2`` as Series, in ``df2`` order.
    """
    compared_fields = ('First Name', 'Last Name', 'date_of_birth')
    matches = []

    with tqdm_notebook(total=len(df2)) as pbar:
        for _, candidate in df2.iterrows():
            # All three fields are always scored, first name first.
            scores = [
                fuzz.token_sort_ratio(row1[field], candidate[field])
                for field in compared_fields
            ]

            # The weakest field decides: every score must reach the threshold.
            if min(scores) >= threshold:
                matches.append(candidate)

            pbar.update(1)

    return matches

def deidentify_data(df):
    """
    Return a copy of ``df`` with direct identifiers removed and a random ID added.

    The input frame is left untouched. The returned frame:
      * gains ``personal_id`` - a fresh random UUID4 string per row,
      * gains ``Date of Birth`` - ``date_of_birth`` parsed with ``pd.to_datetime``,
      * gains ``Month/Year of Birth`` - that date formatted as ``MM-YYYY``,
      * loses ``Last Name``, ``First Name`` and ``Address``.

    NOTE(review): the full ``date_of_birth`` / ``Date of Birth`` values and any
    other columns present (e.g. email, phone number) are passed through
    unchanged - confirm that this is acceptable for the intended use.

    Parameters:
    df (pd.DataFrame): must contain 'date_of_birth', 'Last Name', 'First Name'
        and 'Address'.

    Returns:
    pd.DataFrame: the de-identified copy.

    Raises:
    KeyError: if one of the required columns is missing.
    """
    # Local import keeps this function independent of the notebook-level
    # `fake` object, which lives in a different cell.
    import uuid

    # Parse once and reuse: formatting from the parsed values also works when
    # date_of_birth holds strings rather than date objects.
    birth_dates = pd.to_datetime(df['date_of_birth'])

    # Work on a copy so the caller's frame does not silently gain columns.
    deidentified = df.copy()
    deidentified['personal_id'] = [str(uuid.uuid4()) for _ in range(len(df))]
    deidentified['Date of Birth'] = birth_dates
    deidentified['Month/Year of Birth'] = birth_dates.dt.strftime('%m-%Y')

    return deidentified.drop(columns=['Last Name', 'First Name', 'Address'])


In [4]:
# Output column name -> name of the Faker provider method that fills it.
# (The 'email' entry used to be listed twice; a repeated key in a dict literal
# is silently collapsed into one, so only a single entry is kept.)
column_types = {
    'First Name': 'first_name',
    'Last Name': 'last_name',
    'date_of_birth': 'date_of_birth',
    'job': 'job',
    'Address': 'address',
    'email': 'email',
    'phone_number': 'phone_number',
}

# Baseline data set: 100 fake records.
synthetic_df = generate_synthetic_data(100, column_types)

# Second data set of the same size: 90 unrelated fresh records plus 10
# perturbed copies of rows sampled from synthetic_df.
synthetic_df_with_variance = add_variance(synthetic_df, 10, column_types)

synthetic_df_with_variance.sort_values(by='First Name')

Unnamed: 0,First Name,Last Name,date_of_birth,job,Address,email,phone_number
26,Aaron,Patrick,1969-10-08,"Scientist, clinical (histocompatibility and im...","100 Robinson Springs\nDavidside, UT 88080",normanthomas@example.net,269-353-0358
66,Aaron,Blake,1941-11-10,Learning disability nurse,"0239 Lowe Greens\nMarthamouth, WV 62826",nancychen@example.org,(432)728-2738x07653
30,Angela,Hansen,1958-01-09,Energy engineer,"85098 Lisa Mews\nWest Jasmineview, UT 59510",amanda85@example.org,(674)156-0396x7920
47,Angela,Arnold,1946-08-30,"Nurse, adult","22485 Jesse Field\nBillyport, ND 25738",bonnie45@example.com,(511)072-2213x283
6,Angela,Long,2002-06-04,Exhibition designer,"348 Veronica Fields Suite 325\nNorth Jonathan,...",sherri09@example.org,(126)610-4645x720
...,...,...,...,...,...,...,...
91,Tony,Rice,2017-12-01,Advertising account executive,"3295 Sarah Lake Suite 254\nRyanview, PW 02101",umyers@example.com,413.857.6656x6583
76,Tonya,Williams,2004-08-04,Animal technologist,"264 Erica Rest Apt. 043\nWest Scottshire, MD 9...",lawrenceashley@example.com,738.508.3216x19592
84,Veronica,Baker,1916-02-18,IT technical support officer,"61078 Brandon Rapid Suite 729\nRobertsview, LA...",victoria46@example.com,740.189.3521
52,Wanda,Hodge,2010-09-17,Speech and language therapist,"538 Kevin Fort Apt. 328\nPort Cherylfurt, TN 8...",rosschristopher@example.org,722-990-3718


In [5]:
# Show the baseline records ordered by first name, for comparison with the
# perturbed data set displayed by the previous cell.
synthetic_df.sort_values(by='First Name')

Unnamed: 0,First Name,Last Name,date_of_birth,job,Address,email,phone_number
27,Adam,Rodriguez,1939-01-05,Immunologist,"083 Rubio Center Suite 199\nFrostside, GA 89269",kyledouglas@example.org,(397)376-9133x36617
70,Alexandria,Waller,1969-12-18,Travel agency manager,"648 Hamilton Turnpike\nNew William, FM 07120",kristenhoward@example.com,082.663.2203x0419
56,Amanda,Rangel,1993-05-25,Architect,82566 Richmond Harbor Apt. 981\nLake Harryland...,bscott@example.net,6336448012
1,Amanda,Ellison,1967-07-08,Accommodation manager,"1773 Rodriguez Throughway\nMauriceview, SC 40829",jessehoward@example.org,001-462-2624x266
78,Amanda,Fuller,1952-09-20,Trading standards officer,USNS Farmer\nFPO AE 35042,john66@example.net,+1-388-690-7691x303
...,...,...,...,...,...,...,...
97,Tony,Rice,2017-11-20,Advertising account executive,"3295 Sarah Lake Suite 254\nRyanview, PW 02101",umyers@example.com,413.857.6656x6583
63,Tracy,Lynch,1956-08-24,"Scientist, physiological",384 Hughes Stravenue Apt. 124\nPort Theresabur...,tammy66@example.com,9995423788
45,Tyler,Martinez,1943-11-24,"Engineer, control and instrumentation","26486 Jason Mountain Suite 509\nMasonhaven, MP...",robin65@example.org,502.658.2681x09810
66,Zachary,Gutierrez,1939-12-23,Geophysicist/field seismologist,"800 Eaton Groves\nStewartbury, MA 70642",jonathan01@example.org,+1-640-947-1419x316


In [6]:
df1 = synthetic_df_with_variance
df2 = synthetic_df

# One joblib task per df1 row; each task scans all of df2 and returns the
# list of df2 rows that matched.
per_row_matches = Parallel(n_jobs=-1)(
    delayed(match_rows)(row1, df2) for _, row1 in df1.iterrows()
)

# Collapse the per-row result lists into one flat list of matched rows.
matched_rows = []
for row_matches in per_row_matches:
    matched_rows.extend(row_matches)

# Assemble the matched rows into a dataframe with df1's column layout.
matched_df = pd.DataFrame(matched_rows, columns=list(df1.columns))


In [7]:
# Rows of df2 (the baseline data) that matched at least one row of df1.
matched_df

Unnamed: 0,First Name,Last Name,date_of_birth,job,Address,email,phone_number
44,Michelle,Dunn,1912-11-13,Biomedical scientist,"81221 Scott Light\nRiggsmouth, FL 13912",markperez@example.net,(915)532-6309x02691
97,Tony,Rice,2017-11-20,Advertising account executive,"3295 Sarah Lake Suite 254\nRyanview, PW 02101",umyers@example.com,413.857.6656x6583
8,Holly,Oliver,1981-01-07,Company secretary,"216 Salinas Creek\nNew Jasonview, OR 67352",thomas21@example.org,988-560-1528
7,Stephanie,Mitchell,2023-06-02,Film/video editor,67862 Courtney Stream Apt. 795\nEast Taylorbur...,gilbertjaime@example.com,592.750.2735x478
37,Richard,Conner,1928-12-11,Computer games developer,"4514 Tracey Spur\nPort Shelleyland, AZ 64315",chopkins@example.com,134-828-6746
25,Benjamin,Martin,1956-04-01,"Development worker, international aid",Unit 5282 Box 4473\nDPO AE 64073,jessica72@example.com,516.733.2511
74,Erin,Martinez,1996-07-13,Equality and diversity officer,Unit 3229 Box 1841\nDPO AA 59980,zacharyblackwell@example.net,(640)711-7158x226
75,Eduardo,Robles,2021-05-30,Investment analyst,"344 Mcgee Center\nHopkinsberg, VI 85733",nathaniel43@example.org,5598349757


In [8]:
def fuzzy_match_row(row1, df2, threshold=70):
    """
    Return the rows of ``df2`` that fuzzily match ``row1``, annotated with scores.

    Fields are compared in order - first name, last name, date of birth - with
    ``fuzz.token_sort_ratio``; a candidate is dropped as soon as one score
    falls below ``threshold``, so later fields are not scored for it.

    Parameters:
    row1: record to look up (indexable by 'First Name', 'Last Name',
        'date_of_birth').
    df2 (pd.DataFrame): candidate records.
    threshold: minimum score required on every field (default 70).

    Returns:
    list: matching rows as Series, each extended with 'first_name_score',
        'surname_score' and 'birthday_score' (appended in that order).
    """
    matches = []

    for _, candidate in df2.iterrows():
        first_name_score = fuzz.token_sort_ratio(row1['First Name'], candidate['First Name'])
        if first_name_score < threshold:
            continue

        last_name_score = fuzz.token_sort_ratio(row1['Last Name'], candidate['Last Name'])
        if last_name_score < threshold:
            continue

        birthday_score = fuzz.token_sort_ratio(row1['date_of_birth'], candidate['date_of_birth'])
        if birthday_score < threshold:
            continue

        # All three checks passed: record the scores on the row being returned.
        candidate['first_name_score'] = first_name_score
        candidate['surname_score'] = last_name_score
        candidate['birthday_score'] = birthday_score
        matches.append(candidate)

    return matches

In [9]:
# Split df2 into consecutive slices of at most batch_size rows.
batch_size = 10
batch_indices = range(0, len(df2), batch_size)
batches = [df2.iloc[i:i+batch_size] for i in batch_indices]

matched_rows = []

for _, row1 in tqdm_notebook(df1.iterrows(), total=len(df1), desc="Processing"):
    # Each df1 row is compared against all df2 batches, one joblib task per batch.
    batch_matches = Parallel(n_jobs=-1)(
        delayed(fuzzy_match_row)(row1, batch_df) for batch_df in batches
    )

    # Append every batch's hits to the running flat list, batch by batch.
    for batch_hits in batch_matches:
        matched_rows.extend(batch_hits)

# Matched rows carry df1's columns plus the three score columns.
score_columns = ['first_name_score', 'surname_score', 'birthday_score']
columns = df1.columns.tolist() + score_columns

matched_df = pd.DataFrame(matched_rows, columns=columns)
matched_df

Processing:   0%|          | 0/100 [00:00<?, ?it/s]

Unnamed: 0,First Name,Last Name,date_of_birth,job,Address,email,phone_number,first_name_score,surname_score,birthday_score
44,Michelle,Dunn,1912-11-13,Biomedical scientist,"81221 Scott Light\nRiggsmouth, FL 13912",markperez@example.net,(915)532-6309x02691,100,100,100
97,Tony,Rice,2017-11-20,Advertising account executive,"3295 Sarah Lake Suite 254\nRyanview, PW 02101",umyers@example.com,413.857.6656x6583,100,100,80
8,Holly,Oliver,1981-01-07,Company secretary,"216 Salinas Creek\nNew Jasonview, OR 67352",thomas21@example.org,988-560-1528,100,100,100
7,Stephanie,Mitchell,2023-06-02,Film/video editor,67862 Courtney Stream Apt. 795\nEast Taylorbur...,gilbertjaime@example.com,592.750.2735x478,100,100,100
37,Richard,Conner,1928-12-11,Computer games developer,"4514 Tracey Spur\nPort Shelleyland, AZ 64315",chopkins@example.com,134-828-6746,100,100,100
25,Benjamin,Martin,1956-04-01,"Development worker, international aid",Unit 5282 Box 4473\nDPO AE 64073,jessica72@example.com,516.733.2511,100,100,100
74,Erin,Martinez,1996-07-13,Equality and diversity officer,Unit 3229 Box 1841\nDPO AA 59980,zacharyblackwell@example.net,(640)711-7158x226,100,100,100
75,Eduardo,Robles,2021-05-30,Investment analyst,"344 Mcgee Center\nHopkinsberg, VI 85733",nathaniel43@example.org,5598349757,100,100,100


In [10]:
# Re-display of matched_df; nothing has changed it since the previous cell,
# so the output is identical.
matched_df

Unnamed: 0,First Name,Last Name,date_of_birth,job,Address,email,phone_number,first_name_score,surname_score,birthday_score
44,Michelle,Dunn,1912-11-13,Biomedical scientist,"81221 Scott Light\nRiggsmouth, FL 13912",markperez@example.net,(915)532-6309x02691,100,100,100
97,Tony,Rice,2017-11-20,Advertising account executive,"3295 Sarah Lake Suite 254\nRyanview, PW 02101",umyers@example.com,413.857.6656x6583,100,100,80
8,Holly,Oliver,1981-01-07,Company secretary,"216 Salinas Creek\nNew Jasonview, OR 67352",thomas21@example.org,988-560-1528,100,100,100
7,Stephanie,Mitchell,2023-06-02,Film/video editor,67862 Courtney Stream Apt. 795\nEast Taylorbur...,gilbertjaime@example.com,592.750.2735x478,100,100,100
37,Richard,Conner,1928-12-11,Computer games developer,"4514 Tracey Spur\nPort Shelleyland, AZ 64315",chopkins@example.com,134-828-6746,100,100,100
25,Benjamin,Martin,1956-04-01,"Development worker, international aid",Unit 5282 Box 4473\nDPO AE 64073,jessica72@example.com,516.733.2511,100,100,100
74,Erin,Martinez,1996-07-13,Equality and diversity officer,Unit 3229 Box 1841\nDPO AA 59980,zacharyblackwell@example.net,(640)711-7158x226,100,100,100
75,Eduardo,Robles,2021-05-30,Investment analyst,"344 Mcgee Center\nHopkinsberg, VI 85733",nathaniel43@example.org,5598349757,100,100,100


In [11]:
# Drop the name and address columns from the matched rows and attach a random
# personal_id plus month/year-of-birth columns.
deIdentified_df = deidentify_data(matched_df)
deIdentified_df

Unnamed: 0,date_of_birth,job,email,phone_number,first_name_score,surname_score,birthday_score,personal_id,Date of Birth,Month/Year of Birth
44,1912-11-13,Biomedical scientist,markperez@example.net,(915)532-6309x02691,100,100,100,32998cef-8c41-4bce-9b3b-f6e14dd89871,1912-11-13,11-1912
97,2017-11-20,Advertising account executive,umyers@example.com,413.857.6656x6583,100,100,80,f4faadb6-87a5-4221-9283-486d1c83b442,2017-11-20,11-2017
8,1981-01-07,Company secretary,thomas21@example.org,988-560-1528,100,100,100,47be9c9b-89e5-4e3b-8b0e-8541e00019c0,1981-01-07,01-1981
7,2023-06-02,Film/video editor,gilbertjaime@example.com,592.750.2735x478,100,100,100,0bc81ce0-2598-400c-8d82-51c0e7eede00,2023-06-02,06-2023
37,1928-12-11,Computer games developer,chopkins@example.com,134-828-6746,100,100,100,666d4387-458a-4a73-a6bb-e65ae6b7e109,1928-12-11,12-1928
25,1956-04-01,"Development worker, international aid",jessica72@example.com,516.733.2511,100,100,100,1a764c29-c1e6-4fc8-9fbb-461484f3e39a,1956-04-01,04-1956
74,1996-07-13,Equality and diversity officer,zacharyblackwell@example.net,(640)711-7158x226,100,100,100,a4db950a-803f-4057-9b91-cae6c8097fad,1996-07-13,07-1996
75,2021-05-30,Investment analyst,nathaniel43@example.org,5598349757,100,100,100,b3032ec8-e80c-4ab8-b6e3-afe5d3f36e48,2021-05-30,05-2021
