# Hierarchical Naive Bayes Model

In [108]:
import pandas as pd

In [109]:
df = pd.read_csv('data/synthetic_dataset.csv')

In [110]:
df.head()

Unnamed: 0,ID,Name,Address,Date of Birth
0,787879,Karen Hughes,"668 Thompson Square, East Monicaburgh, OR 01361",07-18-1995
1,442995,Tracy George,"013 Watson Prairie, North Mistyton, ME 96068",11-04-1977
2,865957,Jennifer Patel,"5787 Kim Summit Apt. 750, Janeport, MI 55669",04-24-2004
3,427670,Ashley Williams,"32599 Tracy Flat, North Lisamouth, MA 38573",05-26-1951
4,294077,Christopher Lee,"PSC 4163, Box 9402, APO AP 88964",06-18-1941


In [111]:
"""
Function to split name into first, middle, and last name
"""
def split_name(name):
    name = name.split()
    first_name = name[0]
    last_name = name[-1]
    middle_name = " ".join(name[1:-1])
    return first_name, middle_name, last_name

In [112]:
# Split name into first, middle, and last name
df['First Name'], df['Middle Name'], df['Last Name'] = zip(*df['Name'].map(split_name))
# Remove Name column
df.drop('Name', axis=1, inplace=True)

In [113]:
# Reorder cols to put name cols in front
df = df[['First Name', 'Middle Name', 'Last Name', 'Date of Birth', 'Address', 'ID']]

In [114]:
df.head()

Unnamed: 0,First Name,Middle Name,Last Name,Date of Birth,Address,ID
0,Karen,,Hughes,07-18-1995,"668 Thompson Square, East Monicaburgh, OR 01361",787879
1,Tracy,,George,11-04-1977,"013 Watson Prairie, North Mistyton, ME 96068",442995
2,Jennifer,,Patel,04-24-2004,"5787 Kim Summit Apt. 750, Janeport, MI 55669",865957
3,Ashley,,Williams,05-26-1951,"32599 Tracy Flat, North Lisamouth, MA 38573",427670
4,Christopher,,Lee,06-18-1941,"PSC 4163, Box 9402, APO AP 88964",294077


Sample 200 records from dataframe.

In [115]:
# Sample 200 records from df into new df and remove them from the old df
df_sample = df.sample(200)


In [116]:
df_sample.head()

Unnamed: 0,First Name,Middle Name,Last Name,Date of Birth,Address,ID
995,Donald,,Williams,01-22-1992,"82831 Kimberly Fords, Hermanland, ND 32513",622760
717,Gary,,Robles,11-24-1978,"55931 Jones Mountain, East Katherine, IL 88478",595514
200,Daniel,,King,10-02-1959,"316 Goodwin Haven, Bryceville, SC 83514",225589
183,Chelsea,,Hobbs,01-25-1945,"7192 Barron Ports, Cameronmouth, GU 88392",881913
754,Krista,Scott,MD,06-21-1942,"7896 Griffith Circles, Lake Cynthiastad, ME 85773",497058


In [123]:
# Remove sampled records from df
df = df.drop(df_sample.index)

### Similarity Measures

With a sample df, we can now build a similarity measure df by comparing each record in the sample df with the other records.

In [124]:
import Levenshtein as lev

In [125]:
"""
Returns normalized levenshtein distance between two strings
"""
def levenshtein_distance(
        s1: str,
        s2: str
) -> int:
    distance = lev.distance(s1, s2)
    try:
        return 1 - distance / float(max(len(s1), len(s2)))
    except ZeroDivisionError:
        return 0

In [126]:
"""
Returns Levenshtein distance between each field of two rows
Assumes following format for rows:
    row = pd.Series([First Name, Middle Name, Last Name, DOB, Address, ID])
"""
def row_similarity(
        row_1: pd.Series, 
        df_2: pd.DataFrame
    ) -> pd.DataFrame:

    # Compute similarity measures for each column using Levenshtein distance
    first_name_similarity = df_2['First Name'].apply(lambda x : levenshtein_distance(row_1['First Name'], x))
    middle_name_similarity = df_2['Middle Name'].apply(lambda x : levenshtein_distance(row_1['Middle Name'], x))
    last_name_similarity = df_2['Last Name'].apply(lambda x : levenshtein_distance(row_1['Last Name'], x))
    dob_similarity = df_2['Date of Birth'].apply(lambda x : levenshtein_distance(row_1['Date of Birth'], x))
    address_similarity = df_2['Address'].apply(lambda x : levenshtein_distance(row_1['Address'], x))
    id_similarity = df_2['ID'].apply(lambda x : levenshtein_distance(row_1['ID'], x))

    # print(pd.DataFrame({
    #     'ID1': row_1['ID'],
    #     'ID2': df_2['ID'],
    #     'First Name Similarity': first_name_similarity,
    #     'Middle Name Similarity': middle_name_similarity,
    #     'Last Name Similarity': last_name_similarity,
    #     'Date of Birth Similarity': dob_similarity,
    #     'Address Similarity': address_similarity,
    #     'ID Similarity': id_similarity
    # }))

    # Return DataFrame with similarity measures
    return pd.DataFrame({
        'ID1': row_1['ID'],
        'ID2': df_2['ID'],
        'First Name Similarity': first_name_similarity,
        'Middle Name Similarity': middle_name_similarity,
        'Last Name Similarity': last_name_similarity,
        'Date of Birth Similarity': dob_similarity,
        'Address Similarity': address_similarity,
        'ID Similarity': id_similarity
    })


In [127]:
"""
Builds similarity measure between records in two df
Creates a new df from the two df with the following columns:
    - ID1: ID of record in df1
    - ID2: ID of record in df2
    - First Name Similarity: Normalized levenshtein distance between first names
    - Middle Name Similarity: Normalized levenshtein distance between middle names
    - Last Name Similarity: Normalized levenshtein distance between last names
    - Date of Birth Similarity: Normalized levenshtein distance between dates of birth
    - Address Similarity: Normalized levenshtein distance between addresses
    - ID Similarity: Normalized levenshtein distance between IDs
"""
def build_similarity_df (
        df_1: pd.DataFrame,
        df_2: pd.DataFrame
) -> pd.DataFrame:
    new_df = pd.DataFrame(columns=['ID1', 
                                   'ID2', 
                                   'First Name Similarity', 
                                   'Middle Name Similarity', 
                                   'Last Name Similarity', 
                                   'Date of Birth Similarity', 
                                   'Address Similarity', 
                                   'ID Similarity'])

    # Convert ID columns to string
    df_1["ID"] = df_1["ID"].astype(str)
    df_2["ID"] = df_2["ID"].astype(str)

    def apply_row_similarity(row, new_df):
        sim = row_similarity(row, df_2)
        new_df = pd.concat([new_df, sim], ignore_index=True)
        print(new_df)
        return new_df

    # Use vectorization to compute similarity between each row in df_1 and df_2
    new_df = df_1.apply(apply_row_similarity, args=(new_df,), axis=1).reset_index(drop=True)

    # New df is a series of dfs, so we need to concatenate them
    new_df = pd.concat(new_df.to_list(), ignore_index=True)

    return new_df

In [128]:
similarity_df = build_similarity_df(df_sample,  df)

        ID1     ID2  First Name Similarity  Middle Name Similarity  \
0    622760  787879               0.000000                     0.0   
1    622760  865957               0.125000                     0.0   
2    622760  427670               0.000000                     0.0   
3    622760  294077               0.090909                     0.0   
4    622760  492930               0.333333                     0.0   
..      ...     ...                    ...                     ...   
895  622760  502867               0.000000                     0.0   
896  622760  272530               0.000000                     0.0   
897  622760  822683               0.166667                     0.0   
898  622760  385393               0.166667                     0.0   
899  622760   69340               0.166667                     0.0   

     Last Name Similarity  Date of Birth Similarity  Address Similarity  \
0                   0.125                       0.6            0.170213   
1        

In [129]:
similarity_df

Unnamed: 0,ID1,ID2,First Name Similarity,Middle Name Similarity,Last Name Similarity,Date of Birth Similarity,Address Similarity,ID Similarity
0,622760,787879,0.000000,0.0,0.125,0.6,0.170213,0.000000
1,622760,865957,0.125000,0.0,0.000,0.4,0.227273,0.000000
2,622760,427670,0.000000,0.0,1.000,0.6,0.279070,0.500000
3,622760,294077,0.090909,0.0,0.000,0.5,0.142857,0.000000
4,622760,492930,0.333333,0.0,0.000,0.6,0.188679,0.333333
...,...,...,...,...,...,...,...,...
179995,54047,502867,0.000000,0.0,0.125,0.6,0.122807,0.333333
179996,54047,272530,0.125000,0.0,0.000,0.5,0.195122,0.000000
179997,54047,822683,0.250000,0.0,0.125,0.2,0.179487,0.000000
179998,54047,385393,0.250000,0.0,0.125,0.5,0.625000,0.000000


### Discretization

### Build Model

In [None]:
from pgmpy.models import BayesianModel
from pgmpy.estimators import MaximumLikelihoodEstimator, BayesianEstimator

In [None]:
# Create the Hierarchical Naive Bayes model
model = BayesianModel()

In [None]:
level1 = ['Identity Match']
level2 = ['Name Match', 'DOB Match', 'Address Match', 'ID Match']
level3 = ['First Name Match', 'Middle Name Match', 'Last Name Match', 'DOB Similarity', 'Address Similarity', 'ID Similarity']
level4 = ['First Name Similarity', 'Middle Name Similarity', 'Last Name Similarity']

# Add Nodes
model.add_nodes_from(level1)
model.add_nodes_from(level2)
model.add_nodes_from(level3)
model.add_nodes_from(level4)

# Add edges in most confusing way possible

# Connect level1 with level2
pairs = []
for val in level2:
    pairs.append(level1[0], val)
model.add_edges_from(pairs)

# Connect Name Match
pairs.clear()
for val in level3[0:3]:
    pairs.append(level2[0], val)
model.add_edges_from(pairs)

# Connect DOB, Address, and ID Match with Similarity
pairs.clear()
for i in range(1, 4):
    pairs.append(level2[i], level3[i + 2])
model.add_edges_from(pairs)

# Connect Name Matches with Similarity
pairs.clear()
for i in range(3):
    pairs.append(level3[i], level4[i])
model.add_edges_from(pairs)

In [None]:
# Estimate CPDs using Maximum Likelihood Estimation (MLE)
estimator = MaximumLikelihoodEstimator(model, data)

In [None]:
# Fit the model to the data
model.fit(data, estimator=estimator)

In [None]:
# Predict using the trained model
# Assuming you have a test DataFrame 'test_data' with columns 'A', 'B', 'C', 'D', 'E'
predictions = model.predict(test_data)