# Hierarchical Naive Bayes Model

In [1]:
import random

import pandas as pd
from pgmpy.estimators import ExpectationMaximization
from pgmpy.inference import BeliefPropagation
from pgmpy.models import BayesianNetwork
from pgmpy.readwrite import XMLBIFWriter

In [2]:
# Build the network. Each observed "... Similarity" score feeds a "... Match"
# node, the three name-part matches feed 'Name Match', and the four top-level
# match nodes feed 'Identity Match'. Every "... Match" node except
# 'Identity Match' is declared latent (it has no column in the training data).
# NOTE(review): every edge points toward 'Identity Match', which makes it the
# only leaf of the graph -- confirm this direction is the intended one.
model = BayesianNetwork(
    ebunch=[
        # top-level match nodes -> identity decision
        ('Name Match', 'Identity Match'),
        ('DOB Match', 'Identity Match'),
        ('Address Match', 'Identity Match'),
        ('ID Match', 'Identity Match'),
        # name-part matches -> overall name match
        ('First Name Match', 'Name Match'),
        ('Middle Name Match', 'Name Match'),
        ('Last Name Match', 'Name Match'),
        # observed similarity scores -> their match node
        ('Date of Birth Similarity', 'DOB Match'),
        ('Address Similarity', 'Address Match'),
        ('ID Similarity', 'ID Match'),
        ('First Name Similarity', 'First Name Match'),
        ('Middle Name Similarity', 'Middle Name Match'),
        ('Last Name Similarity', 'Last Name Match'),
    ],
    latents=[
        'Name Match',
        'DOB Match',
        'Address Match',
        'ID Match',
        'First Name Match',
        'Middle Name Match',
        'Last Name Match',
    ],
)

In [3]:
# Read the unlabeled sample and keep only the six observed similarity features
similarity_columns = [
    'Date of Birth Similarity',
    'Address Similarity',
    'ID Similarity',
    'First Name Similarity',
    'Middle Name Similarity',
    'Last Name Similarity',
]
sampled_data = pd.read_csv('data/sampleunlabeledident.csv').loc[:, similarity_columns]

In [7]:
# Make sure a label column exists; pd.NA marks every row as "not labeled yet"
if 'Identity Match' not in sampled_data:
    sampled_data['Identity Match'] = pd.NA

In [9]:
def assign_match(row):
    """Return the row's 'Identity Match' label, filling a missing one at random.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must expose an 'Identity Match' entry.

    Returns
    -------
    The existing label when it is not null; otherwise a random integer,
    either 0 or 1.
    """
    current_label = row['Identity Match']
    # Guard clause: keep any label that is already present
    if not pd.isnull(current_label):
        return current_label
    return random.randint(0, 1)

In [10]:
# Random initialization of the class label before fitting (this is a starting
# point for training, not an EM E-step): every row whose 'Identity Match' is
# still missing receives a random 0 or 1; rows that already have a label keep it.
# Seed the generator so a fresh Restart & Run All draws the same starting labels.
random.seed(0)
sampled_data['Identity Match'] = sampled_data.apply(assign_match, axis=1)
sampled_data.head()

Unnamed: 0,Date of Birth Similarity,Address Similarity,ID Similarity,First Name Similarity,Middle Name Similarity,Last Name Similarity,Identity Match
0,3.0,96.0,0.0,6.0,16.0,0.0,0
1,1.0,88.0,1.0,6.0,0.0,0.0,1
2,2.0,10.0,0.0,0.0,16.0,8.0,0
3,4.0,113.0,1.0,8.0,0.0,19.0,0
4,2.0,110.0,0.0,0.0,0.0,5.0,1


In [11]:
# Count how many rows carry each label (0 vs 1) after the random assignment
sampled_data['Identity Match'].value_counts()

1    11311
0    11189
Name: Identity Match, dtype: int64

In [12]:
# List the columns that will be handed to model.fit
sampled_data.columns

Index(['Date of Birth Similarity', 'Address Similarity', 'ID Similarity',
       'First Name Similarity', 'Middle Name Similarity',
       'Last Name Similarity', 'Identity Match'],
      dtype='object')

In [14]:
# List the graph's leaf nodes (the saved output shows only 'Identity Match')
model.get_leaves()

['Identity Match']

In [11]:
# Fit the model with the Expectation-Maximization estimator. The nodes declared
# in `latents` have no column in sampled_data; only the six similarity scores
# and 'Identity Match' are supplied.
model.fit(sampled_data, estimator=ExpectationMaximization)

  0%|          | 0/100 [00:00<?, ?it/s]

### Make Predictions

In [12]:
# Print each column's maximum; these maxima are the evidence values used in the
# "highest possible similarity scores" query further down.
print(sampled_data.max())

Date of Birth Similarity      6.0
Address Similarity          122.0
ID Similarity                 9.0
First Name Similarity        20.0
Middle Name Similarity       16.0
Last Name Similarity         24.0
Identity Match                1.0
dtype: float64


In [13]:
# Print each column's minimum; these minima are the evidence values used in the
# "lowest possible similarity scores" query further down.
print(sampled_data.min())

Date of Birth Similarity    0.0
Address Similarity          1.0
ID Similarity               0.0
First Name Similarity       0.0
Middle Name Similarity      0.0
Last Name Similarity        0.0
Identity Match              0.0
dtype: float64


We can evaluate how the model is doing with some simple tests using belief propagation.

In [14]:
# Belief Propagation: build the inference engine over the fitted model and
# calibrate it before running any queries.
belief_propagation = BeliefPropagation(model)
belief_propagation.calibrate()

In [None]:
# Print every conditional probability distribution attached to the model
for cpd in model.get_cpds():
    print(cpd)

Some of the cells below may not work, because the discretized values they use as evidence may differ each time the model is run. To see the current discretized values, look at the BIF file exported in the second-to-last cell.

In [21]:
# Every similarity score at its observed maximum --> expected to be classified as a match
max_score_evidence = {
    'Date of Birth Similarity': 6,
    'Address Similarity': 122,
    'ID Similarity': 9,
    'First Name Similarity': 20,
    'Middle Name Similarity': 16,
    'Last Name Similarity': 24,
}
belief_propagation.map_query(variables=['Identity Match'], evidence=max_score_evidence)

{'Identity Match': 0}

In [18]:
# Every similarity score at its observed minimum. In the saved output the model
# returns the same label here as for the all-maximum query, which is suspicious.
min_score_evidence = {
    'Date of Birth Similarity': 0,
    'Address Similarity': 1,
    'ID Similarity': 0,
    'First Name Similarity': 0,
    'Middle Name Similarity': 0,
    'Last Name Similarity': 0,
}
belief_propagation.map_query(variables=['Identity Match'], evidence=min_score_evidence)

{'Identity Match': 0}

In [19]:
# Mixed case: name and ID scores are high, while address and DOB sit at their minima
mixed_score_evidence = {
    'Date of Birth Similarity': 0,
    'Address Similarity': 1,
    'ID Similarity': 8,
    'First Name Similarity': 19,
    'Middle Name Similarity': 14,
    'Last Name Similarity': 23,
}
belief_propagation.map_query(variables=['Identity Match'], evidence=mixed_score_evidence)

{'Identity Match': 0}

### Export Model

Exporting as a BIF file is a convenient way to inspect the model and evaluate how it is doing by looking at the conditional probabilities for each latent variable. If the probabilities do not make sense — e.g. the probability of an identity match is the same in two completely different scenarios — then the model will not perform well.

Exporting as an XML file is how we load the model in the prototype, and how it would be deployed elsewhere. In our experience, loading the model from a BIF file instead of XML led to strange errors.

In [20]:
# Write the model to a BIF file so its conditional probabilities can be inspected by hand
model.save('model.bif', filetype='bif')

In [22]:
# Serialize the model to XMLBIF -- the format the prototype loads
writer = XMLBIFWriter(model)
writer.write_xmlbif('model.xml')