# Hierarchical Naive Bayes Model

In [30]:
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import ExpectationMaximization
import pandas as pd
import random

In [31]:
# Hierarchical Naive Bayes structure, written as parent -> children.
# Keep the entries in this order: the edges are generated from it in
# sequence, so the network is built exactly as listed here.
hnb_children = {
    'Identity Match': ['Name Match', 'DOB Match', 'Address Match', 'ID Match'],
    'Name Match': ['First Name Match', 'Middle Name Match', 'Last Name Match'],
    'DOB Match': ['Date of Birth Similarity'],
    'Address Match': ['Address Similarity'],
    'ID Match': ['ID Similarity'],
    'First Name Match': ['First Name Similarity'],
    'Middle Name Match': ['Middle Name Similarity'],
    'Last Name Match': ['Last Name Similarity'],
}

# Flatten the mapping into (parent, child) edge tuples.
hnb_edges = [
    (parent, child)
    for parent, children in hnb_children.items()
    for child in children
]

# Every intermediate "... Match" node is declared latent; only the root
# ('Identity Match') and the "... Similarity" leaves are left as regular nodes.
hnb_latents = [node for node in hnb_children if node != 'Identity Match']

model = BayesianNetwork(ebunch=hnb_edges, latents=hnb_latents)

In [32]:
# Load the unlabeled training data from CSV.
raw_data = pd.read_csv('data/sampleunlabeledident.csv')

# Columns handed to the model: the root label followed by the six
# similarity scores.
MODEL_COLUMNS = [
    'Identity Match',
    'Date of Birth Similarity',
    'Address Similarity',
    'ID Similarity',
    'First Name Similarity',
    'Middle Name Similarity',
    'Last Name Similarity',
]

# Random initialisation of the root label (this is NOT an EM E-step): the data
# carries no ground-truth 'Identity Match', so every row gets a 0/1 label drawn
# uniformly at random. A dedicated, seeded generator keeps the labels identical
# across "Restart & Run All" and leaves the global `random` state untouched.
# NOTE(review): because 'Identity Match' is not listed in the model's latents,
# these random labels are treated as observed data during fitting — confirm
# this is intended rather than declaring 'Identity Match' latent as well.
IDENTITY_SEED = 42
label_rng = random.Random(IDENTITY_SEED)
initial_labels = [label_rng.randint(0, 1) for _ in range(len(raw_data))]

# Build a new frame instead of mutating the loaded one, so `raw_data` stays
# available for inspection and re-running this cell is idempotent.
sampled_data = raw_data.assign(**{'Identity Match': initial_labels}).loc[:, MODEL_COLUMNS]
sampled_data.head()

Unnamed: 0,Identity Match,Date of Birth Similarity,Address Similarity,ID Similarity,First Name Similarity,Middle Name Similarity,Last Name Similarity
0,0,3.0,101.0,0.0,0.0,0.0,0.0
1,1,4.0,113.0,4.0,17.0,0.0,7.0
2,1,1.0,131.0,0.0,8.0,0.0,0.0
3,1,3.0,112.0,7.0,18.0,0.0,21.0
4,0,2.0,148.0,1.0,0.0,0.0,0.0


In [33]:
# Show the columns that will be passed to the estimator: the 'Identity Match'
# label plus the similarity scores (no columns exist for the latent nodes).
sampled_data.columns

Index(['Identity Match', 'Date of Birth Similarity', 'Address Similarity',
       'ID Similarity', 'First Name Similarity', 'Middle Name Similarity',
       'Last Name Similarity'],
      dtype='object')

In [34]:
# List the network's leaf nodes so they can be compared by eye with the
# similarity columns present in sampled_data.
model.get_leaves()

['Date of Birth Similarity',
 'Address Similarity',
 'ID Similarity',
 'First Name Similarity',
 'Middle Name Similarity',
 'Last Name Similarity']

In [35]:
# Fit the model's parameters with Expectation-Maximization; the nodes declared
# as latent on the model have no columns in sampled_data.
# The recorded run reached only 5/100 iterations after 06:24 (~77 s per
# iteration, roughly 2 h remaining), so a full re-run of this cell is expensive.
# NOTE(review): presumably slow because the similarity columns hold raw scores
# with many distinct values (e.g. 'Address Similarity' above 100), each of which
# would become its own discrete state — confirm, and consider binning the scores
# or fitting on a sample first.
# NOTE(review): no seed is passed to the estimator here; check whether the
# installed pgmpy version supports one so the fit is reproducible.
model.fit(sampled_data, estimator=ExpectationMaximization)

  5%|▌         | 5/100 [06:24<2:01:36, 76.81s/it]
