# Hierarchical Naive Bayes Model

In [2]:
import pandas as pd

Importing the dataset from the data folder.

In [3]:
df = pd.read_csv('data/synthetic_dataset.csv')
df.head()

Unnamed: 0,ID,Name,Address,Date of Birth
0,787879,Karen Hughes,"668 Thompson Square, East Monicaburgh, OR 01361",07-18-1995
1,442995,Tracy George,"013 Watson Prairie, North Mistyton, ME 96068",11-04-1977
2,865957,Jennifer Patel,"5787 Kim Summit Apt. 750, Janeport, MI 55669",04-24-2004
3,427670,Ashley Williams,"32599 Tracy Flat, North Lisamouth, MA 38573",05-26-1951
4,294077,Christopher Lee,"PSC 4163, Box 9402, APO AP 88964",06-18-1941


## Preprocessing

### Cleanup

In [4]:
"""
Function to split name into first, middle, and last name
"""
def split_name(name):
    name = name.split()
    first_name = name[0]
    last_name = name[-1]
    middle_name = " ".join(name[1:-1])
    return first_name, middle_name, last_name

The split name function may not be necessary depending on the structure of the data:

In [5]:
# If name column exists, split name into first, middle, and last name
# If name column does not exist, assume the existence of other columns and do nothing
if 'Name' in df.columns:
    # Split name into first, middle, and last name
    df['First Name'], df['Middle Name'], df['Last Name'] = zip(*df['Name'].map(split_name))

    df.drop('Name', axis=1, inplace=True)

if 'Middle Name' not in df.columns:
    df['Middle Name'] = ""

Now, we need to reorder the columns and sample them to ensure consistency before building our similarity measures.

In [6]:
# Reorder cols to put name cols in front
df = df[['First Name', 'Middle Name', 'Last Name', 'Date of Birth', 'Address', 'ID']]

In [7]:
df.head()

Unnamed: 0,First Name,Middle Name,Last Name,Date of Birth,Address,ID
0,Karen,,Hughes,07-18-1995,"668 Thompson Square, East Monicaburgh, OR 01361",787879
1,Tracy,,George,11-04-1977,"013 Watson Prairie, North Mistyton, ME 96068",442995
2,Jennifer,,Patel,04-24-2004,"5787 Kim Summit Apt. 750, Janeport, MI 55669",865957
3,Ashley,,Williams,05-26-1951,"32599 Tracy Flat, North Lisamouth, MA 38573",427670
4,Christopher,,Lee,06-18-1941,"PSC 4163, Box 9402, APO AP 88964",294077


Sample 200 records from dataframe.

In [8]:
# Sample 200 records from df into new df
df_sample = df.sample(200)

In [9]:
df_sample.head()

Unnamed: 0,First Name,Middle Name,Last Name,Date of Birth,Address,ID
536,Lauren,,Carter,05-06-1949,"4724 Deanna Via Suite 040, North Matthew, LA 1...",835583
810,Emily,,Hopkins,07-19-1947,"1508 Klein Station Suite 178, West Maryville, ...",875713
900,Kimberly,,Miller,10-04-1990,"3745 Phillips Common Apt. 170, North Jamieport...",280943
383,Danielle,,Hayes,11-21-1952,"724 Sarah Inlet, Joshualand, MN 31128",24446
606,Stephen,,Diaz,07-15-1956,"975 Teresa Shores, Riveraberg, NH 81331",136265


In [10]:
# Remove sampled records from df
df = df.drop(df_sample.index)

### Similarity Measures

With a sample df, we can now build a similarity measure df by comparing each record in the sample df with the other records.

In [11]:
import Levenshtein as lev

In [12]:
"""
Returns normalized levenshtein distance between two strings
"""
def levenshtein_distance(
        s1: str,
        s2: str
) -> int:
    distance = lev.distance(s1, s2)
    try:
        return 1 - distance / float(max(len(s1), len(s2)))
    except ZeroDivisionError:
        return 0

In [14]:
"""
Returns Levenshtein distance between each field of two rows
Assumes following format for rows:
    row = pd.Series([First Name, Middle Name, Last Name, DOB, Address, ID])
"""
def row_similarity(
        row_1: pd.Series, 
        df_2: pd.DataFrame
    ) -> pd.DataFrame:

    # Compute similarity measures for each column using Levenshtein distance
    first_name_similarity = df_2['First Name'].apply(lambda x : levenshtein_distance(row_1['First Name'], x))
    middle_name_similarity = df_2['Middle Name'].apply(lambda x : levenshtein_distance(row_1['Middle Name'], x))
    last_name_similarity = df_2['Last Name'].apply(lambda x : levenshtein_distance(row_1['Last Name'], x))
    dob_similarity = df_2['Date of Birth'].apply(lambda x : levenshtein_distance(row_1['Date of Birth'], x))
    address_similarity = df_2['Address'].apply(lambda x : levenshtein_distance(row_1['Address'], x))
    id_similarity = df_2['ID'].apply(lambda x : levenshtein_distance(row_1['ID'], x))

    # Return DataFrame with similarity measures
    return pd.DataFrame({
        'ID1': row_1['ID'],
        'ID2': df_2['ID'],
        'First Name Similarity': first_name_similarity,
        'Middle Name Similarity': middle_name_similarity,
        'Last Name Similarity': last_name_similarity,
        'Date of Birth Similarity': dob_similarity,
        'Address Similarity': address_similarity,
        'ID Similarity': id_similarity
    })


In [15]:
"""
Builds similarity measure between records in two df
Creates a new df from the two df with the following columns:
    - ID1: ID of record in df1
    - ID2: ID of record in df2
    - First Name Similarity: Normalized levenshtein distance between first names
    - Middle Name Similarity: Normalized levenshtein distance between middle names
    - Last Name Similarity: Normalized levenshtein distance between last names
    - Date of Birth Similarity: Normalized levenshtein distance between dates of birth
    - Address Similarity: Normalized levenshtein distance between addresses
    - ID Similarity: Normalized levenshtein distance between IDs
"""
def build_similarity_df (
        df_1: pd.DataFrame,
        df_2: pd.DataFrame
) -> pd.DataFrame:
    new_df = pd.DataFrame(columns=['ID1', 
                                   'ID2', 
                                   'First Name Similarity', 
                                   'Middle Name Similarity', 
                                   'Last Name Similarity', 
                                   'Date of Birth Similarity', 
                                   'Address Similarity', 
                                   'ID Similarity'])

    # Convert ID columns to string
    df_1["ID"] = df_1["ID"].astype(str)
    df_2["ID"] = df_2["ID"].astype(str)

    def apply_row_similarity(row, new_df):
        sim = row_similarity(row, df_2)
        new_df = pd.concat([new_df, sim], ignore_index=True)
        return new_df

    # Use vectorization to compute similarity between each row in df_1 and df_2
    new_df = df_1.apply(apply_row_similarity, args=(new_df,), axis=1).reset_index(drop=True)

    # New df is a series of dfs, so we need to concatenate them
    new_df = pd.concat(new_df.to_list(), ignore_index=True)

    return new_df

In [16]:
similarity_df = build_similarity_df(df_sample,  df)
similarity_df.head()

Unnamed: 0,ID1,ID2,First Name Similarity,Middle Name Similarity,Last Name Similarity,Date of Birth Similarity,Address Similarity,ID Similarity
0,835583,787879,0.666667,0.0,0.166667,0.5,0.2,0.0
1,835583,442995,0.0,0.0,0.166667,0.5,0.34,0.0
2,835583,865957,0.125,0.0,0.5,0.3,0.26,0.333333
3,835583,427670,0.166667,0.0,0.0,0.7,0.3,0.0
4,835583,294077,0.090909,0.0,0.166667,0.6,0.18,0.0


### Discretization

In [17]:
#for solving systems of equations
import sympy as sp 
import math

In [19]:
"""
    t = # of intervals
    s = interval size
    n = # of instances in training dataset

    s * t = n
    s - 30 = t
"""
n = len(similarity_df) # rows in training dataset
print(n)

180000


In [21]:
#function to solve for s and t given n, returns pair {s, t}
def interval_size_and_num(n) -> tuple:
    s, t = sp.symbols('s t')
    equation1 = sp.Eq(s * t, n)
    equation2 = sp.Eq(s - 30, t)  
    solution = sp.solve((equation1, equation2), (s, t))
    s_val = solution[0]
    t_val = solution[1] 
    return s_val, t_val

In [22]:
# Calculate s and t
s = interval_size_and_num(n)[0][1] * -1
t = interval_size_and_num(n)[1][1]

num_instances = math.floor(s)
num_intervals = math.floor(t)
print(num_instances)
print(num_intervals)

439
409


In [24]:
discrete_df = similarity_df

remainder = len(similarity_df) % num_intervals

# function that replaces similarity measures with discrete values and records bin's range [min, max] for one column
def discretize_column(col_name, bin_ranges, discrete_df, remainder, num_instances):
    count = 0
    bin = 0
    discrete_df = discrete_df.sort_values(col_name)
    range = [-1, -1]
    for index, row in discrete_df.iterrows():
        similarity = row[col_name]
        row[col_name] = bin
        count += 1
        if(count == 1):
            range[0] = similarity
        if(bin >= remainder):
            if(count == num_instances):
                bin += 1
                count = 0
                range[1] = similarity
                bin_ranges.append(range)
        if(bin < remainder):
            if(count == num_instances + 1):
                bin += 1
                count = 0
                range[1] = similarity
                bin_ranges.append(range)

In [25]:
# bins for each attribute (list of pairs)

bin_intervals = {'First Name Similarity' : [],
                'Middle Name Similarity' : [],
                'Last Name Similarity' : [],
                'Date of Birth Similarity' : [],
                'Address Similarity' : [],
                'ID Similarity' : []
                }

In [26]:
for column in discrete_df.columns[2:]:
    discretize_column(column, bin_intervals[column], discrete_df, remainder, num_instances)

discrete_df.head()

Unnamed: 0,ID1,ID2,First Name Similarity,Middle Name Similarity,Last Name Similarity,Date of Birth Similarity,Address Similarity,ID Similarity
0,835583,787879,0.666667,0.0,0.166667,0.5,0.2,0.0
1,835583,442995,0.0,0.0,0.166667,0.5,0.34,0.0
2,835583,865957,0.125,0.0,0.5,0.3,0.26,0.333333
3,835583,427670,0.166667,0.0,0.0,0.7,0.3,0.0
4,835583,294077,0.090909,0.0,0.166667,0.6,0.18,0.0


In [27]:
for bin in bin_intervals.values():
    print(bin)

[[1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.

## Hierarchical Naive Bayes

In [28]:
from pgmpy.models import BayesianModel
from pgmpy.estimators import MaximumLikelihoodEstimator, BayesianEstimator

  from .autonotebook import tqdm as notebook_tqdm


In [29]:
# Create the Hierarchical Naive Bayes model
model = BayesianModel()



In [31]:
level1 = ['Identity Match']
level2 = ['Name Match', 'DOB Match', 'Address Match', 'ID Match']
level3 = ['First Name Match', 'Middle Name Match', 'Last Name Match', 'DOB Similarity', 'Address Similarity', 'ID Similarity']
level4 = ['First Name Similarity', 'Middle Name Similarity', 'Last Name Similarity']

# Add Nodes
model.add_nodes_from(level1)
model.add_nodes_from(level2)
model.add_nodes_from(level3)
model.add_nodes_from(level4)

# Add edges in most confusing way possible

# Connect level1 with level2
pairs = []
for val in level2:
    pairs.append(level1[0], val)
model.add_edges_from(pairs)

# Connect Name Match
pairs.clear()
for val in level3[0:3]:
    pairs.append(level2[0], val)
model.add_edges_from(pairs)

# Connect DOB, Address, and ID Match with Similarity
pairs.clear()
for i in range(1, 4):
    pairs.append(level2[i], level3[i + 2])
model.add_edges_from(pairs)

# Connect Name Matches with Similarity
pairs.clear()
for i in range(3):
    pairs.append(level3[i], level4[i])
model.add_edges_from(pairs)

TypeError: list.append() takes exactly one argument (2 given)

In [None]:
# Estimate CPDs using Maximum Likelihood Estimation (MLE)
estimator = MaximumLikelihoodEstimator(model, data)

In [None]:
# Fit the model to the data
model.fit(data, estimator=estimator)

In [None]:
# Predict using the trained model
# Assuming you have a test DataFrame 'test_data' with columns 'A', 'B', 'C', 'D', 'E'
predictions = model.predict(test_data)