# Hierarchical Naive Bayes Model

In [142]:
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import ExpectationMaximization
from pgmpy.inference import BeliefPropagation
import pandas as pd
import random

In [143]:
# Create the Hierarchical Naive Bayes model with latent structure.
# Each tuple is a directed edge (parent, child). As written, the observed
# "... Similarity" scores are the roots, they feed the latent "... Match"
# nodes, and everything converges on 'Identity Match' (the only leaf).
# NOTE(review): in a conventional naive Bayes layout the class node is the
# parent of its features; here the arrows run the other way, so 'Identity Match'
# gets a CPD conditioned on four latent parents. Confirm this direction is
# intended before relying on the query results.
model = BayesianNetwork(
    ebunch=[
        # Latent per-attribute match nodes -> overall identity decision
        ('Name Match', 'Identity Match'),
        ('DOB Match', 'Identity Match'),
        ('Address Match', 'Identity Match'),
        ('ID Match', 'Identity Match'),
        # Latent name-part match nodes -> latent overall name match
        ('First Name Match', 'Name Match'),
        ('Middle Name Match', 'Name Match'),
        ('Last Name Match', 'Name Match'),
        # Observed similarity scores -> their latent match node
        ('Date of Birth Similarity', 'DOB Match'),
        ('Address Similarity', 'Address Match'),
        ('ID Similarity', 'ID Match'),
        ('First Name Similarity', 'First Name Match'),
        ('Middle Name Similarity', 'Middle Name Match'),
        ('Last Name Similarity', 'Last Name Match')
    ],
    # Nodes with no column in the training data; their CPDs must be inferred.
    latents=['Name Match', 'DOB Match', 'Address Match', 'ID Match', 'First Name Match', 'Middle Name Match', 'Last Name Match']
)

In [144]:
# Load the unlabeled training data from csv (all columns).
# NOTE(review): the following cell reads the same file again and overwrites
# `sampled_data`, so this cell is redundant as the notebook stands.
sampled_data = pd.read_csv('data/sampleunlabeledident.csv')

In [145]:
# Read the unlabeled training data and keep only the six observed similarity
# scores, in the order listed here.
sampled_data = pd.read_csv('data/sampleunlabeledident.csv')[[
    'Date of Birth Similarity',
    'Address Similarity',
    'ID Similarity',
    'First Name Similarity',
    'Middle Name Similarity',
    'Last Name Similarity',
]]

In [153]:
# Heuristic decision threshold: half of the best achievable total score,
# i.e. half of the sum of every similarity column's maximum.
#
# The derived 'Identity Match' label column is dropped first (if it exists yet)
# so that re-running this cell after the labelling cell yields the same value.
# Using max().sum() also avoids rebinding the name `sum`, which would shadow
# the builtin for the rest of the kernel session.
threshold = (
    sampled_data.drop(columns='Identity Match', errors='ignore').max().sum() / 2
)

In [154]:
def assign_match(row, threshold):
    """Return 1 if the summed values of ``row`` reach ``threshold``, else 0.

    Args:
        row: An object exposing ``.sum()`` (e.g. one DataFrame row as a Series).
        threshold: Minimum total score that counts as a match.

    Returns:
        int: 1 when ``row.sum() >= threshold``, otherwise 0.
    """
    total_score = row.sum()
    return 1 if total_score >= threshold else 0

In [155]:
# Heuristic initial labelling (this is a fixed rule, not an EM E-step): a row
# is labelled 1 when the sum of its similarity scores reaches `threshold`,
# otherwise 0 -- the same rule as assign_match, applied to all rows at once.
#
# The score is computed from the feature columns only: any existing
# 'Identity Match' column is dropped before summing, so re-running this cell
# does not add the previous label into the score.
sampled_data['Identity Match'] = (
    sampled_data.drop(columns='Identity Match', errors='ignore').sum(axis=1) >= threshold
).astype('int64')
sampled_data.head()

Unnamed: 0,Date of Birth Similarity,Address Similarity,ID Similarity,First Name Similarity,Middle Name Similarity,Last Name Similarity,Identity Match
0,2.0,44.0,0.0,15.0,0.0,8.0,0
1,1.0,41.0,0.0,4.0,14.0,7.0,0
2,3.0,57.0,0.0,8.0,0.0,0.0,0
3,3.0,62.0,0.0,5.0,0.0,0.0,0
4,1.0,81.0,3.0,13.0,14.0,23.0,1


In [156]:
# Count how many rows received each heuristic label (0 vs 1).
sampled_data['Identity Match'].value_counts()

Identity Match
0    14065
1     8435
Name: count, dtype: int64

In [157]:
# Sanity check: the frame should hold the six similarity features plus the 'Identity Match' label.
sampled_data.columns

Index(['Date of Birth Similarity', 'Address Similarity', 'ID Similarity',
       'First Name Similarity', 'Middle Name Similarity',
       'Last Name Similarity', 'Identity Match'],
      dtype='object')

In [158]:
# List the nodes that have no children in the graph.
model.get_leaves()

['Identity Match']

In [161]:
# Fit the model with Expectation-Maximization.
# `sampled_data` supplies the similarity columns and 'Identity Match'; the nodes
# declared as latents on the model have no column here and are left to EM.
# NOTE(review): no seed or iteration limit is passed, so this relies on the
# estimator's defaults and may not be reproducible run-to-run -- confirm which
# keyword arguments the installed pgmpy version accepts.
model.fit(sampled_data, estimator=ExpectationMaximization)



[A[A78157.78s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
78157.79s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
78157.79s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
78157.81s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
78157.81s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
78157.82s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
78157.84s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
78157.84s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.

### Make Predictions

In [164]:
# Print the maximum of each column; these maxima are the "best case" evidence
# values used in the queries further down.
print(sampled_data.max())

Date of Birth Similarity      6.0
Address Similarity          129.0
ID Similarity                 8.0
First Name Similarity        19.0
Middle Name Similarity       14.0
Last Name Similarity         23.0
Identity Match                1.0
dtype: float64


In [163]:
# Print the minimum of each column; these minima are the "worst case" evidence
# values used in the queries further down.
print(sampled_data.min())

Date of Birth Similarity    0.0
Address Similarity          1.0
ID Similarity               0.0
First Name Similarity       0.0
Middle Name Similarity      0.0
Last Name Similarity        0.0
Identity Match              0.0
dtype: float64


In [165]:
# Belief Propagation: wrap the fitted model in an inference object and run its
# calibration step before issuing any queries.
belief_propagation = BeliefPropagation(model)
belief_propagation.calibrate()

In [166]:
# Print every CPD attached to the model to inspect the fitted parameters.
# NOTE(review): in the recorded output the visible 'Name Match' entries are
# ~0.5 / ~0.5, which suggests the latent node carries little information from
# its parents -- worth checking the remaining columns and the other latents.
for cpd in model.get_cpds():
    print(cpd)

+-------------------+-----+----------------------+
| First Name Match  | ... | First Name Match(1)  |
+-------------------+-----+----------------------+
| Last Name Match   | ... | Last Name Match(1)   |
+-------------------+-----+----------------------+
| Middle Name Match | ... | Middle Name Match(1) |
+-------------------+-----+----------------------+
| Name Match(0)     | ... | 0.5000000035487977   |
+-------------------+-----+----------------------+
| Name Match(1)     | ... | 0.4999999964512022   |
+-------------------+-----+----------------------+
+-------------------+-----+---------------------+
| Address Match     | ... | Address Match(1)    |
+-------------------+-----+---------------------+
| DOB Match         | ... | DOB Match(1)        |
+-------------------+-----+---------------------+
| ID Match          | ... | ID Match(1)         |
+-------------------+-----+---------------------+
| Name Match        | ... | Name Match(1)       |
+-------------------+-----+------------

In [167]:
# Evidence = the maximum observed value of every similarity column.
# A match (1) is the expected answer here, but the recorded output is
# {'Identity Match': 0}, i.e. the model predicts NO match.
belief_propagation.map_query(variables=['Identity Match'], evidence={'Date of Birth Similarity': 6, 'Address Similarity': 129, 'ID Similarity': 8, 'First Name Similarity': 19, 'Middle Name Similarity': 14, 'Last Name Similarity': 23})

{'Identity Match': 0}

In [168]:
# Evidence = the minimum observed value of every similarity column.
# No match (0) is the expected answer, and the recorded output is 0 -- but that
# is the same answer the maximum-score query produced, so the prediction does
# not appear to respond to the evidence.
belief_propagation.map_query(variables=['Identity Match'], evidence={'Date of Birth Similarity': 0, 'Address Similarity': 1, 'ID Similarity': 0, 'First Name Similarity': 0, 'Middle Name Similarity': 0, 'Last Name Similarity': 0})

{'Identity Match': 0}

In [170]:
# Mixed evidence: name and ID similarities at their observed maxima, while
# address and DOB similarities are at their observed minima.
# Recorded output is again {'Identity Match': 0}.
belief_propagation.map_query(variables=['Identity Match'], evidence={'Date of Birth Similarity': 0, 'Address Similarity': 1, 'ID Similarity': 8, 'First Name Similarity': 19, 'Middle Name Similarity': 14, 'Last Name Similarity': 23})

{'Identity Match': 0}