# Hierarchical Naive Bayes Model

In [104]:
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import ExpectationMaximization
from pgmpy.inference import BeliefPropagation
import pandas as pd
import random

In [105]:
# Create the Hierarchical Naive Bayes model with latent structure
model = BayesianNetwork(
    ebunch=[
        ('Name Match', 'Identity Match'),
        ('DOB Match', 'Identity Match'),
        ('Address Match', 'Identity Match'),
        ('ID Match', 'Identity Match'),
        ('First Name Match', 'Name Match'),
        ('Middle Name Match', 'Name Match'),
        ('Last Name Match', 'Name Match'),
        ('Date of Birth Similarity', 'DOB Match'),
        ('Address Similarity', 'Address Match'),
        ('ID Similarity', 'ID Match'),
        ('First Name Similarity', 'First Name Match'),
        ('Middle Name Similarity', 'Middle Name Match'),
        ('Last Name Similarity', 'Last Name Match')
    ],
    latents=['Name Match', 'DOB Match', 'Address Match', 'ID Match', 'First Name Match', 'Middle Name Match', 'Last Name Match']
)

In [106]:
# Load training data from csv
sampled_data = pd.read_csv('data/sampleunlabeledident.csv')

# E Step for the Unsupervised Learning: Adds an Identity Match column and assigns either 0 or 1 to each row
sampled_data['Identity Match'] = sampled_data.apply(lambda row: random.randint(0, 1), axis=1)

sampled_data = sampled_data.loc[:,['Identity Match','Date of Birth Similarity','Address Similarity','ID Similarity','First Name Similarity','Middle Name Similarity','Last Name Similarity',]]
sampled_data.head()

Unnamed: 0,Identity Match,Date of Birth Similarity,Address Similarity,ID Similarity,First Name Similarity,Middle Name Similarity,Last Name Similarity
0,1,3.0,101.0,0.0,0.0,0.0,0.0
1,1,4.0,113.0,4.0,17.0,0.0,7.0
2,1,1.0,131.0,0.0,8.0,0.0,0.0
3,1,3.0,112.0,7.0,18.0,0.0,21.0
4,1,2.0,148.0,1.0,0.0,0.0,0.0


In [107]:
sampled_data.columns

Index(['Identity Match', 'Date of Birth Similarity', 'Address Similarity',
       'ID Similarity', 'First Name Similarity', 'Middle Name Similarity',
       'Last Name Similarity'],
      dtype='object')

In [108]:
model.get_leaves()

['Identity Match']

In [109]:
# Fit the model
model.fit(sampled_data, estimator=ExpectationMaximization)

  5%|▌         | 5/100 [05:58<1:53:35, 71.75s/it]


### Make Predictions

In [110]:
# take in csv of processed data to be predicted
predict_df = pd.read_csv('data/predict_similarity_df.csv')
predict_df.head()

Unnamed: 0,UNIQ_ID1,UNIQ_ID2,ID1,ID2,ID Similarity,First Name 1,First Name 2,First Name Similarity,Middle Name 1,Middle Name 2,Middle Name Similarity,Last Name 1,Last Name 2,Last Name Similarity,Date of Birth 1,Date of Birth 2,Date of Birth Similarity,Address 1,Address 2,Address Similarity
0,459741,301286,sashaikh,chipman,,Sasha,Stuart,,,Arlin,,Shaikh,Chipman-Bergsma,,19760616,19881113,,2325 Ridgeway Rd San Marino CA 91108-2116 UNIT...,197 Country Club Blvd Plainwell MI 49080-9120 ...,
1,459741,466757,sashaikh,kestell,,Sasha,Katherine,,,E,5.0,Shaikh,Meyers,,19760616,19760827,,2325 Ridgeway Rd San Marino CA 91108-2116 UNIT...,14916 16th St SE Snohomish WA 98290-4717 UNITE...,
2,459741,388774,sashaikh,cmatsos,0.0,Sasha,Christopher,9.0,,Jameson,,Shaikh,Matsos,0.0,19760616,19830801,,2325 Ridgeway Rd San Marino CA 91108-2116 UNIT...,26735 Trowbridge Sq New Boston MI 48164-8960 U...,
3,459741,293648,sashaikh,gslayton,,Sasha,Geraldine,,,,,Shaikh,Slayton,,19760616,19311112,,2325 Ridgeway Rd San Marino CA 91108-2116 UNIT...,7543 N. Adrian Hwy Tecumseh MI 49286 USA},
4,459741,486117,sashaikh,cwaldorf,,Sasha,Clayton,,,MacKenzie,,Shaikh,Waldorf,,19760616,19740810,,2325 Ridgeway Rd San Marino CA 91108-2116 UNIT...,9734 W Gull Lake Dr Richland MI 49083-9541 UNI...,


In [111]:
# print the max of each column
print(sampled_data.max())

Identity Match                1.0
Date of Birth Similarity      6.0
Address Similarity          157.0
ID Similarity                 9.0
First Name Similarity        22.0
Middle Name Similarity       17.0
Last Name Similarity         22.0
dtype: float64


In [112]:
# print the min of each column
print(sampled_data.min())

Identity Match              0.0
Date of Birth Similarity    0.0
Address Similarity          1.0
ID Similarity               0.0
First Name Similarity       0.0
Middle Name Similarity      0.0
Last Name Similarity        0.0
dtype: float64


In [113]:
# Belief Propagation
belief_propagation = BeliefPropagation(model)
belief_propagation.calibrate()

In [118]:
for cpd in model.get_cpds():
    print(cpd)

+-------------------+-----+----------------------+
| First Name Match  | ... | First Name Match(1)  |
+-------------------+-----+----------------------+
| Last Name Match   | ... | Last Name Match(1)   |
+-------------------+-----+----------------------+
| Middle Name Match | ... | Middle Name Match(1) |
+-------------------+-----+----------------------+
| Name Match(0)     | ... | 0.49999999528701855  |
+-------------------+-----+----------------------+
| Name Match(1)     | ... | 0.5000000047129816   |
+-------------------+-----+----------------------+
+-------------------+-----+---------------------+
| Address Match     | ... | Address Match(1)    |
+-------------------+-----+---------------------+
| DOB Match         | ... | DOB Match(1)        |
+-------------------+-----+---------------------+
| ID Match          | ... | ID Match(1)         |
+-------------------+-----+---------------------+
| Name Match        | ... | Name Match(1)       |
+-------------------+-----+------------

In [119]:
# Using highest possible similarity scores --> model returns match
belief_propagation.map_query(variables=['Identity Match'], evidence={'Date of Birth Similarity': 6, 'Address Similarity': 157, 'ID Similarity': 9, 'First Name Similarity': 22, 'Middle Name Similarity': 17, 'Last Name Similarity': 22})

{'Identity Match': 1}

In [120]:
# Using lowest possible similarity scores --> model also returns match?
belief_propagation.map_query(variables=['Identity Match'], evidence={'Date of Birth Similarity': 0, 'Address Similarity': 1, 'ID Similarity': 0, 'First Name Similarity': 0, 'Middle Name Similarity': 0, 'Last Name Similarity': 0})

{'Identity Match': 1}

In [121]:
# name and ID matches, but address and DOB do not match
belief_propagation.map_query(variables=['Identity Match'], evidence={'Date of Birth Similarity': 0, 'Address Similarity': 1, 'ID Similarity': 9, 'First Name Similarity': 22, 'Middle Name Similarity': 17, 'Last Name Similarity': 22})

{'Identity Match': 1}