# Hierarchical Naive Bayes Model

In [1]:
import random

import pandas as pd
from pgmpy.estimators import ExpectationMaximization
from pgmpy.inference import BeliefPropagation
from pgmpy.models import BayesianNetwork
from pgmpy.readwrite import XMLBIFWriter

In [2]:
# Hierarchical Naive Bayes structure with a latent layer.
# Every edge points towards 'Identity Match': observed similarity scores feed
# latent per-field match nodes, which in turn feed the identity decision.
# NOTE(review): in a conventional naive Bayes layout the class node is the
# parent of its features; here the arrows run the other way -- confirm this is
# intended.
model = BayesianNetwork(
    ebunch=[
        # latent field-level matches -> identity decision
        ('Name Match', 'Identity Match'),
        ('DOB Match', 'Identity Match'),
        ('Address Match', 'Identity Match'),
        ('ID Match', 'Identity Match'),
        # latent name-part matches -> latent overall name match
        ('First Name Match', 'Name Match'),
        ('Middle Name Match', 'Name Match'),
        ('Last Name Match', 'Name Match'),
        # observed similarity scores -> their latent match node
        ('Date of Birth Similarity', 'DOB Match'),
        ('Address Similarity', 'Address Match'),
        ('ID Similarity', 'ID Match'),
        ('First Name Similarity', 'First Name Match'),
        ('Middle Name Similarity', 'Middle Name Match'),
        ('Last Name Similarity', 'Last Name Match'),
    ],
    # Nodes that have no column in the training data.
    latents=[
        'Name Match',
        'DOB Match',
        'Address Match',
        'ID Match',
        'First Name Match',
        'Middle Name Match',
        'Last Name Match',
    ],
)

In [3]:
# Read the unlabeled training CSV and keep only the six similarity-score
# columns, in the order listed here.
sampled_data = pd.read_csv('data/sampleunlabeledident.csv')[[
    'Date of Birth Similarity',
    'Address Similarity',
    'ID Similarity',
    'First Name Similarity',
    'Middle Name Similarity',
    'Last Name Similarity',
]]

In [4]:
# Down-sample to 1000 rows. The fixed random_state makes the draw (and every
# output derived from it) reproducible under Restart Kernel -> Run All.
sampled_data = sampled_data.sample(n=1000, random_state=42)

In [5]:
# Threshold = half of the summed per-column maxima of the similarity scores.
# The total is named `max_score_total` rather than `sum` so the builtin sum()
# is not shadowed for the rest of the notebook.
# An already-present 'Identity Match' column is left out so that re-running
# this cell after the labelling cell yields the same threshold.
# skipna=False keeps the loop's behaviour of propagating a NaN column maximum.
max_score_total = (
    sampled_data
    .drop(columns=['Identity Match'], errors='ignore')
    .max()
    .sum(skipna=False)
)
threshold = max_score_total / 2

In [6]:
def assign_match(row, threshold):
    """Label one row as a match (1) or non-match (0).

    Args:
        row: Object exposing ``.sum()`` (e.g. a pandas Series of scores).
        threshold: Minimum total score that counts as a match (inclusive).

    Returns:
        1 when the row's summed score is at least ``threshold``, else 0.
    """
    total_score = row.sum()
    return 1 if total_score >= threshold else 0

In [7]:
# Seed the 'Identity Match' label heuristically: a row gets 1 when the sum of
# its similarity scores reaches `threshold`, otherwise 0 (see assign_match).
# Any existing 'Identity Match' column is dropped from the scored frame first,
# so re-running this cell does not add the previous labels into the row sums.
similarity_scores = sampled_data.drop(columns=['Identity Match'], errors='ignore')
sampled_data['Identity Match'] = similarity_scores.apply(assign_match, axis=1, args=(threshold,))
sampled_data.head()

Unnamed: 0,Date of Birth Similarity,Address Similarity,ID Similarity,First Name Similarity,Middle Name Similarity,Last Name Similarity,Identity Match
427144,1.0,102.0,0.0,0.0,0.0,8.0,0
263597,3.0,351.0,1.0,0.0,6.0,10.0,1
457228,4.0,342.0,0.0,6.0,0.0,0.0,1
235514,2.0,227.0,1.0,11.0,0.0,14.0,1
208520,3.0,59.0,0.0,0.0,0.0,0.0,0


In [8]:
# Class balance of the heuristic labels (count of rows per label value).
sampled_data.loc[:, 'Identity Match'].value_counts()

1    563
0    437
Name: Identity Match, dtype: int64

In [9]:
# Show the frame's columns: the six similarity features plus the 'Identity Match' label.
sampled_data.columns

Index(['Date of Birth Similarity', 'Address Similarity', 'ID Similarity',
       'First Name Similarity', 'Middle Name Similarity',
       'Last Name Similarity', 'Identity Match'],
      dtype='object')

In [10]:
# List the network's leaf nodes as a sanity check on the edge directions defined above.
model.get_leaves()

['Identity Match']

In [11]:
# Fit the model: learn the CPDs from the sampled data with pgmpy's
# ExpectationMaximization estimator (the structure declares latent nodes that
# have no column in the data).
# NOTE(review): the similarity columns are passed as raw scores (e.g. 'Address
# Similarity' ranges up to 401 in this sample) -- presumably each distinct
# value becomes its own discrete state; confirm, and consider binning the
# scores before fitting.
model.fit(sampled_data, estimator=ExpectationMaximization)

  0%|          | 0/100 [00:00<?, ?it/s]

### Make Predictions

In [12]:
# Print the per-column maximum, i.e. the largest value observed for each
# similarity feature (and the label) in the sampled training data.
print(sampled_data.max())

Date of Birth Similarity      5.0
Address Similarity          401.0
ID Similarity                10.0
First Name Similarity        29.0
Middle Name Similarity       28.0
Last Name Similarity         32.0
Identity Match                1.0
dtype: float64


In [13]:
# Print the per-column minimum, i.e. the smallest value observed for each
# similarity feature (and the label) in the sampled training data.
print(sampled_data.min())

Date of Birth Similarity    0.0
Address Similarity          1.0
ID Similarity               0.0
First Name Similarity       0.0
Middle Name Similarity      0.0
Last Name Similarity        0.0
Identity Match              0.0
dtype: float64


In [14]:
# Belief Propagation: build the inference engine over the fitted model and
# calibrate it before running the queries below.
belief_propagation = BeliefPropagation(model)
belief_propagation.calibrate()

In [15]:
# Print every CPD attached to the model to inspect the learned parameters.
for cpd in model.get_cpds():
    print(cpd)

+-------------------+-----+----------------------+
| First Name Match  | ... | First Name Match(1)  |
+-------------------+-----+----------------------+
| Last Name Match   | ... | Last Name Match(1)   |
+-------------------+-----+----------------------+
| Middle Name Match | ... | Middle Name Match(1) |
+-------------------+-----+----------------------+
| Name Match(0)     | ... | 0.49999999750476104  |
+-------------------+-----+----------------------+
| Name Match(1)     | ... | 0.5000000024952389   |
+-------------------+-----+----------------------+
+-------------------+-----+--------------------+
| Address Match     | ... | Address Match(1)   |
+-------------------+-----+--------------------+
| DOB Match         | ... | DOB Match(1)       |
+-------------------+-----+--------------------+
| ID Match          | ... | ID Match(1)        |
+-------------------+-----+--------------------+
| Name Match        | ... | Name Match(1)      |
+-------------------+-----+--------------------

In [16]:
# High similarity on every feature (DOB is at its observed maximum of 5; the
# other scores are below the column maxima printed earlier).
# NOTE(review): the recorded run raised KeyError: 19 -- presumably 19 is not
# among the states learned for one of the name-similarity columns from the
# sampled rows; confirm, and consider binning the scores before fitting.
belief_propagation.map_query(
    variables=['Identity Match'],
    evidence={
        'Date of Birth Similarity': 5,
        'Address Similarity': 129,
        'ID Similarity': 8,
        'First Name Similarity': 19,
        'Middle Name Similarity': 14,
        'Last Name Similarity': 23,
    },
)

KeyError: 19

In [17]:
# Lowest observed value for every similarity feature (the per-column minima
# printed earlier). The recorded run still returned {'Identity Match': 1},
# which is unexpected for an all-minimum input and needs investigating.
belief_propagation.map_query(
    variables=['Identity Match'],
    evidence={
        'Date of Birth Similarity': 0,
        'Address Similarity': 1,
        'ID Similarity': 0,
        'First Name Similarity': 0,
        'Middle Name Similarity': 0,
        'Last Name Similarity': 0,
    },
)

{'Identity Match': 1}

In [18]:
# Mixed case: name and ID scores are high, while address and DOB scores sit at
# their observed minima.
# NOTE(review): the recorded run raised KeyError: 19 here as well -- same
# evidence values for the name columns as in the high-similarity query.
belief_propagation.map_query(
    variables=['Identity Match'],
    evidence={
        'Date of Birth Similarity': 0,
        'Address Similarity': 1,
        'ID Similarity': 8,
        'First Name Similarity': 19,
        'Middle Name Similarity': 14,
        'Last Name Similarity': 23,
    },
)

KeyError: 19

### Export Model

In [19]:
# Export the fitted network to 'model.xml' in XMLBIF format.
# XMLBIFWriter is imported in the notebook's top import cell so that all
# dependencies are declared in one place.
writer = XMLBIFWriter(model)
writer.write_xmlbif('model.xml')

In [None]:
# Also save the model to 'model.bif' in BIF format via the model's own save method.
# NOTE(review): this cell has no execution count, so it was apparently never
# run -- confirm it works with the installed pgmpy version.
model.save('model.bif', filetype='bif')