# Hierarchical Naive Bayes Model

In [2]:
import pandas as pd

Importing the dataset from the data folder.

In [3]:
df = pd.read_csv('data/synthetic_dataset.csv')
df.head()

Unnamed: 0,ID,Name,Address,Date of Birth
0,787879,Karen Hughes,"668 Thompson Square, East Monicaburgh, OR 01361",07-18-1995
1,442995,Tracy George,"013 Watson Prairie, North Mistyton, ME 96068",11-04-1977
2,865957,Jennifer Patel,"5787 Kim Summit Apt. 750, Janeport, MI 55669",04-24-2004
3,427670,Ashley Williams,"32599 Tracy Flat, North Lisamouth, MA 38573",05-26-1951
4,294077,Christopher Lee,"PSC 4163, Box 9402, APO AP 88964",06-18-1941


## Preprocessing

### Cleanup

In [4]:
"""
Function to split name into first, middle, and last name
"""
def split_name(name):
    name = name.split()
    first_name = name[0]
    last_name = name[-1]
    middle_name = " ".join(name[1:-1])
    return first_name, middle_name, last_name

The split name function may not be necessary depending on the structure of the data:

In [5]:
# If name column exists, split name into first, middle, and last name
# If name column does not exist, assume the existence of other columns and do nothing
if 'Name' in df.columns:
    # Split name into first, middle, and last name
    df['First Name'], df['Middle Name'], df['Last Name'] = zip(*df['Name'].map(split_name))

    df.drop('Name', axis=1, inplace=True)

if 'Middle Name' not in df.columns:
    df['Middle Name'] = ""

Now, we need to reorder the columns and sample them to ensure consistency before building our similarity measures.

In [6]:
# Reorder cols to put name cols in front
df = df[['First Name', 'Middle Name', 'Last Name', 'Date of Birth', 'Address', 'ID']]

In [7]:
df.head()

Unnamed: 0,First Name,Middle Name,Last Name,Date of Birth,Address,ID
0,Karen,,Hughes,07-18-1995,"668 Thompson Square, East Monicaburgh, OR 01361",787879
1,Tracy,,George,11-04-1977,"013 Watson Prairie, North Mistyton, ME 96068",442995
2,Jennifer,,Patel,04-24-2004,"5787 Kim Summit Apt. 750, Janeport, MI 55669",865957
3,Ashley,,Williams,05-26-1951,"32599 Tracy Flat, North Lisamouth, MA 38573",427670
4,Christopher,,Lee,06-18-1941,"PSC 4163, Box 9402, APO AP 88964",294077


Sample 200 records from dataframe.

In [8]:
# Sample 200 records from df into new df
df_sample = df.sample(200)

In [9]:
df_sample.head()

Unnamed: 0,First Name,Middle Name,Last Name,Date of Birth,Address,ID
246,Mario,,Carroll,12-17-1962,"967 Davis Avenue Suite 177, South Taylorboroug...",466635
660,Amy,,Boyd,01-17-1965,"99499 Herrera Spur Suite 970, West Zacharyshir...",599222
526,Harold,,Cline,09-10-1996,"519 Salinas Fields, Bairdberg, NJ 00837",574582
56,Ashley,,Martinez,1938-09-01,"24755 Judith Meadow, North Summermouth, WY 97962",251902
393,Mr.,Robert,Jones,09-13-1960,"PSC 3748, Box 6236, APO AP 30488",345121


In [10]:
# Remove sampled records from df
df = df.drop(df_sample.index)

### Similarity Measures

With a sample df, we can now build a similarity measure df by comparing each record in the sample df with the other records.

In [11]:
import Levenshtein as lev

In [12]:
"""
Returns normalized levenshtein distance between two strings
"""
def levenshtein_distance(
        s1: str,
        s2: str
) -> int:
    distance = lev.distance(s1, s2)
    try:
        return 1 - distance / float(max(len(s1), len(s2)))
    except ZeroDivisionError:
        return 0

In [13]:
"""
Returns Levenshtein distance between each field of two rows
Assumes following format for rows:
    row = pd.Series([First Name, Middle Name, Last Name, DOB, Address, ID])
"""
def row_similarity(
        row_1: pd.Series, 
        df_2: pd.DataFrame
    ) -> pd.DataFrame:

    # Compute similarity measures for each column using Levenshtein distance
    first_name_similarity = df_2['First Name'].apply(lambda x : levenshtein_distance(row_1['First Name'], x))
    middle_name_similarity = df_2['Middle Name'].apply(lambda x : levenshtein_distance(row_1['Middle Name'], x))
    last_name_similarity = df_2['Last Name'].apply(lambda x : levenshtein_distance(row_1['Last Name'], x))
    dob_similarity = df_2['Date of Birth'].apply(lambda x : levenshtein_distance(row_1['Date of Birth'], x))
    address_similarity = df_2['Address'].apply(lambda x : levenshtein_distance(row_1['Address'], x))
    id_similarity = df_2['ID'].apply(lambda x : levenshtein_distance(row_1['ID'], x))

    # Return DataFrame with similarity measures
    return pd.DataFrame({
        'ID1': row_1['ID'],
        'ID2': df_2['ID'],
        'First Name Similarity': first_name_similarity,
        'Middle Name Similarity': middle_name_similarity,
        'Last Name Similarity': last_name_similarity,
        'Date of Birth Similarity': dob_similarity,
        'Address Similarity': address_similarity,
        'ID Similarity': id_similarity
    })


In [14]:
"""
Builds similarity measure between records in two df
Creates a new df from the two df with the following columns:
    - ID1: ID of record in df1
    - ID2: ID of record in df2
    - First Name Similarity: Normalized levenshtein distance between first names
    - Middle Name Similarity: Normalized levenshtein distance between middle names
    - Last Name Similarity: Normalized levenshtein distance between last names
    - Date of Birth Similarity: Normalized levenshtein distance between dates of birth
    - Address Similarity: Normalized levenshtein distance between addresses
    - ID Similarity: Normalized levenshtein distance between IDs
"""
def build_similarity_df (
        df_1: pd.DataFrame,
        df_2: pd.DataFrame
) -> pd.DataFrame:
    new_df = pd.DataFrame(columns=['ID1', 
                                   'ID2', 
                                   'First Name Similarity', 
                                   'Middle Name Similarity', 
                                   'Last Name Similarity', 
                                   'Date of Birth Similarity', 
                                   'Address Similarity', 
                                   'ID Similarity'])

    # Convert ID columns to string
    df_1["ID"] = df_1["ID"].astype(str)
    df_2["ID"] = df_2["ID"].astype(str)

    def apply_row_similarity(row, new_df):
        sim = row_similarity(row, df_2)
        new_df = pd.concat([new_df, sim], ignore_index=True)
        return new_df

    # Use vectorization to compute similarity between each row in df_1 and df_2
    new_df = df_1.apply(apply_row_similarity, args=(new_df,), axis=1).reset_index(drop=True)

    # New df is a series of dfs, so we need to concatenate them
    new_df = pd.concat(new_df.to_list(), ignore_index=True)

    return new_df

In [15]:
similarity_df = build_similarity_df(df_sample,  df)
similarity_df.head()

Unnamed: 0,ID1,ID2,First Name Similarity,Middle Name Similarity,Last Name Similarity,Date of Birth Similarity,Address Similarity,ID Similarity
0,466635,787879,0.4,0.0,0.0,0.5,0.333333,0.0
1,466635,442995,0.0,0.0,0.142857,0.5,0.280702,0.333333
2,466635,865957,0.125,0.0,0.285714,0.2,0.263158,0.166667
3,466635,294077,0.272727,0.0,0.0,0.5,0.157895,0.0
4,466635,492930,0.4,0.0,0.222222,0.5,0.192982,0.333333


## Discretization

In [16]:
# #for solving systems of equations
# import sympy as sp 
# import math

In [17]:
# """
#     t = # of intervals
#     s = interval size
#     n = # of instances in training dataset

#     s * t = n
#     s - 30 = t
# """
# total_instances = len(similarity_df) # rows in training dataset
# print(total_instances)

In [18]:
# # WPKID function to solve for s and t given n, returns pair {s, t}
# def interval_size_and_num(n) -> tuple:
#     s, t = sp.symbols('s t')
#     equation1 = sp.Eq(s * t, n)
#     equation2 = sp.Eq(s - 30, t)  
#     solution = sp.solve((equation1, equation2), (s, t))
#     s_val = solution[0]
#     t_val = solution[1] 
#     return s_val, t_val

In [19]:
# Calculate s and t
# s = interval_size_and_num(total_instances)[0][1] * -1
# t = interval_size_and_num(total_instances)[1][1]

# num_instances = math.floor(s)
# num_intervals = math.floor(t)
# print(num_instances)
# print(num_intervals)

## Equal Frequency Discretization(EFD)
#### Using k = 7 intervals

### 1. Standardize the data

In [135]:
def std_dev_divide(column):
    #find std dev of nonzeros
    std_dev = column.std()
    # divide all values by std dev
    column = column/std_dev

In [136]:
discrete_df = similarity_df
bins = {'First Name Similarity' : [0],
        'Middle Name Similarity' : [0],
        'Last Name Similarity' : [0],
        'Date of Birth Similarity' : [0],
        'Address Similarity' : [0],
        'ID Similarity' : [0]}

In [137]:
for column in discrete_df.iloc[:, 2:]:
    #split into zero and nonzeros
    zeros_df = discrete_df[discrete_df[column] == 0].copy()
    nonzeros_df = discrete_df[discrete_df[column] != 0].copy()
    #divide by standard dev in nonzeros
    std_dev_divide(nonzeros_df[column])
    #discretize column
    nonzeros_df[column], bin = pd.qcut(nonzeros_df[column], 
                                        q=6, 
                                        labels=False,
                                        retbins=True,
                                        duplicates='drop') 
    #add 1 to every value to switch to index 1
    nonzeros_df[column] = nonzeros_df[column] + 1
    #save bin ranges
    bins[column].extend(bin.tolist())
    #recombine zero and nonzero dataframes
    discrete_df = pd.concat([zeros_df, nonzeros_df], axis=0)   

In [138]:
discrete_df.tail()

Unnamed: 0,ID1,ID2,First Name Similarity,Middle Name Similarity,Last Name Similarity,Date of Birth Similarity,Address Similarity,ID Similarity
117025,938960,429090,6.0,3.0,6.0,3.0,3.0,1.0
117261,938960,559069,4.0,5.0,5.0,1.0,4.0,1.0
117271,938960,332934,3.0,5.0,5.0,3.0,6.0,1.0
117734,938960,42730,5.0,3.0,5.0,5.0,5.0,1.0
117839,938960,983803,4.0,2.0,6.0,3.0,4.0,1.0


In [139]:
for key, value in bins.items():
    print(key, value)

First Name Similarity [0, 0.09090909090909094, 0.125, 0.1428571428571429, 0.16666666666666663, 0.19999999999999996, 0.2857142857142857, 1.0]
Middle Name Similarity [0, 0.06666666666666665, 0.09999999999999998, 0.1428571428571429, 0.16666666666666663, 0.25, 1.0]
Last Name Similarity [0, 0.09090909090909094, 0.125, 0.1428571428571429, 0.16666666666666663, 0.2222222222222222, 0.33333333333333337, 1.0]
Date of Birth Similarity [0, 0.09999999999999998, 0.30000000000000004, 0.4, 0.5, 0.6, 1.0]
Address Similarity [0, 0.08333333333333337, 0.1707317073170732, 0.20408163265306123, 0.23076923076923073, 0.26086956521739135, 0.30188679245283023, 1.0]
ID Similarity [0, 0.16666666666666663, 0.33333333333333337, 1.0]


## Hierarchical Naive Bayes

In [28]:
from pgmpy.models import BayesianModel
from pgmpy.estimators import MaximumLikelihoodEstimator, BayesianEstimator

  from .autonotebook import tqdm as notebook_tqdm


In [29]:
# Create the Hierarchical Naive Bayes model
model = BayesianModel()



In [31]:
level1 = ['Identity Match']
level2 = ['Name Match', 'DOB Match', 'Address Match', 'ID Match']
level3 = ['First Name Match', 'Middle Name Match', 'Last Name Match', 'DOB Similarity', 'Address Similarity', 'ID Similarity']
level4 = ['First Name Similarity', 'Middle Name Similarity', 'Last Name Similarity']

# Add Nodes
model.add_nodes_from(level1)
model.add_nodes_from(level2)
model.add_nodes_from(level3)
model.add_nodes_from(level4)

# Add edges in most confusing way possible

# Connect level1 with level2
pairs = []
for val in level2:
    pairs.append(level1[0], val)
model.add_edges_from(pairs)

# Connect Name Match
pairs.clear()
for val in level3[0:3]:
    pairs.append(level2[0], val)
model.add_edges_from(pairs)

# Connect DOB, Address, and ID Match with Similarity
pairs.clear()
for i in range(1, 4):
    pairs.append(level2[i], level3[i + 2])
model.add_edges_from(pairs)

# Connect Name Matches with Similarity
pairs.clear()
for i in range(3):
    pairs.append(level3[i], level4[i])
model.add_edges_from(pairs)

TypeError: list.append() takes exactly one argument (2 given)

In [None]:
# Estimate CPDs using Maximum Likelihood Estimation (MLE)
estimator = MaximumLikelihoodEstimator(model, data)

In [None]:
# Fit the model to the data
model.fit(data, estimator=estimator)

In [None]:
# Predict using the trained model
# Assuming you have a test DataFrame 'test_data' with columns 'A', 'B', 'C', 'D', 'E'
predictions = model.predict(test_data)