# Hierarchical Naive Bayes Model

In [1]:
import pandas as pd

In [7]:
# Load the synthetic dataset CSV into a DataFrame (path is relative to the working directory).
df = pd.read_csv('data/synthetic_dataset.csv')

In [8]:
# Preview the first rows to check that the file loaded as expected.
df.head()

Unnamed: 0,ID,Name,Address,Date of Birth
0,787879,Karen Hughes,"668 Thompson Square, East Monicaburgh, OR 01361",07-18-1995
1,442995,Tracy George,"013 Watson Prairie, North Mistyton, ME 96068",11-04-1977
2,865957,Jennifer Patel,"5787 Kim Summit Apt. 750, Janeport, MI 55669",04-24-2004
3,427670,Ashley Williams,"32599 Tracy Flat, North Lisamouth, MA 38573",05-26-1951
4,294077,Christopher Lee,"PSC 4163, Box 9402, APO AP 88964",06-18-1941


In [21]:
def split_name(name):
    """
    Split a full name into first, middle, and last name.

    The name is split on whitespace. The first token is the first name, the
    last token is the last name, and everything in between is joined with
    single spaces to form the middle name.

    Parameters:
        name: full name as a single string.

    Returns:
        A (first_name, middle_name, last_name) tuple of strings. Parts that
        are not present are returned as empty strings:
          - an empty or whitespace-only name gives ("", "", "")
          - a single-token name gives (token, "", "")
    """
    parts = name.split()

    # Nothing to split: avoid indexing into an empty list.
    if not parts:
        return "", "", ""

    # A lone token is treated as a first name only, rather than being
    # duplicated into the last-name slot as well.
    if len(parts) == 1:
        return parts[0], "", ""

    first_name = parts[0]
    last_name = parts[-1]
    middle_name = " ".join(parts[1:-1])
    return first_name, middle_name, last_name

In [22]:
# Derive the three name-part columns from the full name, one split per record.
df['First Name'], df['Middle Name'], df['Last Name'] = zip(
    *(split_name(full_name) for full_name in df['Name'])
)
# The combined column is redundant once its parts exist, so drop it.
df.drop(columns=['Name'], inplace=True)

In [29]:
# Bring the name parts to the front, followed by date of birth, address and ID.
df = df.loc[:, ['First Name', 'Middle Name', 'Last Name', 'Date of Birth', 'Address', 'ID']]

In [30]:
# Check that the name parts were split out and the columns reordered.
df.head()

Unnamed: 0,First Name,Middle Name,Last Name,Date of Birth,Address,ID
0,Karen,,Hughes,07-18-1995,"668 Thompson Square, East Monicaburgh, OR 01361",787879
1,Tracy,,George,11-04-1977,"013 Watson Prairie, North Mistyton, ME 96068",442995
2,Jennifer,,Patel,04-24-2004,"5787 Kim Summit Apt. 750, Janeport, MI 55669",865957
3,Ashley,,Williams,05-26-1951,"32599 Tracy Flat, North Lisamouth, MA 38573",427670
4,Christopher,,Lee,06-18-1941,"PSC 4163, Box 9402, APO AP 88964",294077


Sample 200 records from dataframe.

In [31]:
# Sample 200 records from df into a new df. The sampled rows are NOT removed
# from df, so df still contains them afterwards.
# A fixed random_state makes the sample (and everything derived from it)
# reproducible across kernel restarts.
df_sample = df.sample(200, random_state=42)


In [32]:
# Preview the sampled records; the index shows their row labels in df.
df_sample.head()

Unnamed: 0,First Name,Middle Name,Last Name,Date of Birth,Address,ID
408,Mitchell,,Fuller,04-25-1933,"Unit 9627 Box 7329, DPO AE 51570",139323
669,Glenn,,Smith,03-28-1986,"28706 Lindsey Light, Lake Lindseyview, MI 14764",357124
646,Teresa,,Prince,01-21-1948,"05216 Antonio Green Apt. 839, South Rebeccasid...",836589
639,Dennis,,Allen,01-06-1987,"481 Scott Locks, Lake Andrew, LA 24626",400425
491,David,,Miller,07-13-1984,"12445 Tonya Key, South Denisechester, MA 95179",942665


### Similarity Measures

With the sample DataFrame in hand, we can now build a similarity DataFrame by comparing each record in the sample, field by field, against every record in the full DataFrame.

In [12]:
import Levenshtein as lev

In [40]:
def levenshtein_distance(
        s1: str,
        s2: str
) -> float:
    """
    Return the normalized Levenshtein similarity between two strings.

    Despite the function name, the value is a similarity, not a distance:
    the edit distance is divided by the length of the longer string and the
    result is subtracted from 1.

    Parameters:
        s1: first string.
        s2: second string.

    Returns:
        A float equal to 1 - distance / max(len(s1), len(s2)).
        Two empty strings return 1.0.
    """
    longest = max(len(s1), len(s2))

    # Both strings empty: there is nothing to normalize by, and two empty
    # fields are treated as identical.
    if longest == 0:
        return 1.0

    return 1.0 - lev.distance(s1, s2) / longest

In [48]:
def row_similarity (
        row1: pd.Series,
        row2: pd.Series
) -> list:
    """
    Compare two rows field by field.

    For every label in row1's index, both rows' values are converted to
    strings and passed to levenshtein_distance.

    Assumes following format for rows:
        row = pd.Series([First Name, Middle Name, Last Name, DOB, Address, ID])

    Returns:
        A list with one levenshtein_distance result per field, in the order
        of row1's index.
    """
    return [
        levenshtein_distance(str(row1[field]), str(row2[field]))
        for field in row1.index
    ]

In [49]:
def build_similarity_df (
        df_1: pd.DataFrame,
        df_2: pd.DataFrame
) -> pd.DataFrame:
    """
    Build similarity measures between every record of df_1 and every record of df_2.

    Each (df_1 row, df_2 row) pair produces one output row with the columns:
        - ID1: ID of record in df1
        - ID2: ID of record in df2
        - First Name Similarity
        - Middle Name Similarity
        - Last Name Similarity
        - Date of Birth Similarity
        - Address Similarity
        - ID Similarity
    The six similarity values are whatever row_similarity returns for the
    pair, so both frames are expected to have the six columns listed in
    row_similarity's docstring, in that order.

    Parameters:
        df_1: records to compare from; must contain an 'ID' column.
        df_2: records to compare against; must contain an 'ID' column.

    Returns:
        A DataFrame with len(df_1) * len(df_2) rows, ordered by df_1 row and
        then by df_2 row, with a default 0..n-1 index. Empty input gives an
        empty DataFrame with the columns above.
    """
    columns = ['ID1',
               'ID2',
               'First Name Similarity',
               'Middle Name Similarity',
               'Last Name Similarity',
               'Date of Birth Similarity',
               'Address Similarity',
               'ID Similarity']

    # iterrows() builds a new Series per row, so materialize df_2's rows once
    # instead of once per df_1 row.
    rows_2 = [row_2 for _, row_2 in df_2.iterrows()]

    # Collect plain lists and build the frame in one go. Appending with
    # new_df.loc[len(new_df)] = row re-allocates the frame on every insert,
    # which makes the whole build quadratic in the number of output rows.
    records = [
        [row_1['ID'], row_2['ID'], *row_similarity(row_1, row_2)]
        for _, row_1 in df_1.iterrows()
        for row_2 in rows_2
    ]
    return pd.DataFrame(records, columns=columns)

In [50]:
# Score every sampled record against every record in the full frame.
similarity_df = build_similarity_df(df_1=df_sample, df_2=df)

KeyboardInterrupt: 

### Discretization

### Build Model

In [None]:
from pgmpy.models import BayesianModel
from pgmpy.estimators import MaximumLikelihoodEstimator, BayesianEstimator

In [None]:
# Create the Hierarchical Naive Bayes model (empty here; nodes and edges are added in the next cell).
# NOTE(review): newer pgmpy releases rename BayesianModel to BayesianNetwork — confirm against the installed version.
model = BayesianModel()

In [None]:
# Node names for each layer of the hierarchy, from the root (level1) down to
# the leaf similarity nodes.
# NOTE(review): build_similarity_df names its column 'Date of Birth Similarity',
# while the node here is 'DOB Similarity' — confirm the names are reconciled
# before fitting.
level1 = ['Identity Match']
level2 = ['Name Match', 'DOB Match', 'Address Match', 'ID Match']
level3 = ['First Name Match', 'Middle Name Match', 'Last Name Match', 'DOB Similarity', 'Address Similarity', 'ID Similarity']
level4 = ['First Name Similarity', 'Middle Name Similarity', 'Last Name Similarity']

# Add Nodes
model.add_nodes_from(level1)
model.add_nodes_from(level2)
model.add_nodes_from(level3)
model.add_nodes_from(level4)

# Add edges. Each edge is a (parent, child) tuple; list.append takes a single
# argument, so the pair must be passed as one tuple.

# Connect level1 with level2:
#   Identity Match -> Name Match, DOB Match, Address Match, ID Match
pairs = [(level1[0], child) for child in level2]
model.add_edges_from(pairs)

# Connect Name Match:
#   Name Match -> First Name Match, Middle Name Match, Last Name Match
pairs = [(level2[0], child) for child in level3[:3]]
model.add_edges_from(pairs)

# Connect DOB, Address, and ID Match with Similarity:
#   DOB Match -> DOB Similarity, Address Match -> Address Similarity,
#   ID Match -> ID Similarity
pairs = list(zip(level2[1:], level3[3:]))
model.add_edges_from(pairs)

# Connect Name Matches with Similarity:
#   First/Middle/Last Name Match -> First/Middle/Last Name Similarity
pairs = list(zip(level3[:3], level4))
model.add_edges_from(pairs)

In [None]:
# Estimate CPDs using Maximum Likelihood Estimation (MLE)
# NOTE(review): `data` is not defined in any cell shown above — presumably the
# discretized similarity DataFrame; confirm it exists before running this cell.
estimator = MaximumLikelihoodEstimator(model, data)

In [None]:
# Fit the model to the data.
# pgmpy's fit() expects the estimator *class* and instantiates it itself with
# (model, data); passing an already-built estimator instance raises TypeError.
model.fit(data, estimator=MaximumLikelihoodEstimator)

In [None]:
# Predict using the trained model
# NOTE(review): `test_data` is not defined in the visible cells. Presumably it
# should hold columns named after the model's observed nodes, with the
# variables to be predicted left out — confirm.
predictions = model.predict(test_data)