In [1]:
import py_entitymatching as em #Import megallan entity matching library

In [28]:
# Import the data set after blocking
yelp = em.read_csv_metadata("yelp.csv",key="id")
zomato = em.read_csv_metadata("zomato.csv",key="id")
S = em.read_csv_metadata("tagged_dataset.csv", key='_id', fk_ltable='ltable_id', fk_rtable='rtable_id',ltable=yelp, rtable=zomato)

In [29]:
S.shape

(1100, 23)

In [30]:
# Split G into development (I) and evaluation (J)
IJ = em.split_train_test(S, train_proportion=0.7)
I = IJ['train'] # Training Set
J = IJ['test'] # Test Set

In [31]:
print "Number of tuples in Development Set =", len(I)
print "Number of tuples in Evaluation Set =", len(J)

Number of tuples in Development Set = 770
Number of tuples in Evaluation Set = 330


In [32]:
# Commenting this code section, since not required at this point of time

#Store Development Set
#I.to_csv('DevelopmentSet.csv')
#Store Evaluation Set
#J.to_csv('EvaluationSet.csv')

In [33]:
# Create a set of ML-matchers
dt = em.DTMatcher(max_depth=5)
svm = em.SVMMatcher()
rf = em.RFMatcher(name='RF', random_state=0)
nb = em.NBMatcher(name='NB')
lg = em.LogRegMatcher(name='LogReg')
ln = em.LinRegMatcher(name='LinReg')

In [34]:
# Generate features automatically
feature_set = em.get_features_for_matching(yelp, zomato)
feature_set.head(1)

Unnamed: 0,feature_name,left_attribute,right_attribute,left_attr_tokenizer,right_attr_tokenizer,simfunction,function,function_source,is_auto_generated
0,id_id_exm,id,id,,,exact_match,<function id_id_exm at 0x000000000BC02BA8>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True


In [35]:
# Get feature vector table for Development set
I_feature_vectors = em.extract_feature_vecs(I, feature_table=feature_set, attrs_after='Label', show_progress=False)

# Get feature vector table for Evaluation set
J_feature_vectors = em.extract_feature_vecs(J, feature_table=feature_set, attrs_after='Label', show_progress=False)

I_feature_vectors.head(1)

Unnamed: 0,_id,ltable_id,rtable_id,id_id_exm,id_id_anm,id_id_lev_dist,id_id_lev_sim,Name_Name_jac_qgm_3_qgm_3,Name_Name_cos_dlm_dc0_dlm_dc0,Name_Name_jac_dlm_dc0_dlm_dc0,...,Delivery_Delivery_lev_sim,Takeout_Takeout_exm,Takeout_Takeout_anm,Takeout_Takeout_lev_dist,Takeout_Takeout_lev_sim,Outdoor_seating_Outdoor_seating_exm,Outdoor_seating_Outdoor_seating_anm,Outdoor_seating_Outdoor_seating_lev_dist,Outdoor_seating_Outdoor_seating_lev_sim,Label
958,97885,733,419,0,0.571623,3.0,0.0,0.111111,0.408248,0.25,...,1.0,1,1.0,0.0,1.0,0,0.0,1.0,0.0,0


In [36]:
# Fill the missing values with 0
I_feature_vectors.fillna(value=0, inplace=True)
S.columns

Index([u'key_id', u'_id', u'ltable_id', u'rtable_id', u'ltable_Name',
       u'ltable_Phone', u'ltable_Zipcode', u'ltable_State', u'ltable_City',
       u'ltable_Address', u'ltable_Delivery', u'ltable_Takeout',
       u'ltable_Outdoor_seating', u'rtable_Name', u'rtable_Phone',
       u'rtable_Zipcode', u'rtable_State', u'rtable_City', u'rtable_Address',
       u'rtable_Delivery', u'rtable_Takeout', u'rtable_Outdoor_seating',
       u'Label'],
      dtype='object')

In [37]:
# select the best ML matcher using CV using precision
result_precision = em.select_matcher([dt, rf, svm, nb, lg, ln], table=I_feature_vectors, 
        exclude_attrs=['_id','ltable_id', 'rtable_id', 'Label'], 
        target_attr='Label', metric='precision', random_state=0)

In [38]:
result_precision['cv_stats']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree_65200008118911021941,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x000000000BDC9860>,5,1.0,0.972222,1.0,1.0,0.94,0.982444
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x000000000BDC9940>,5,1.0,0.972222,1.0,1.0,1.0,0.994444
2,SVM_65200008118911021941,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x000000000BDC9898>,5,1.0,1.0,1.0,0.973684,1.0,0.994737
3,NB,<py_entitymatching.matcher.nbmatcher.NBMatcher object at 0x000000000BEC6240>,5,1.0,0.972222,1.0,1.0,1.0,0.994444
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x000000000BDC99E8>,5,1.0,1.0,1.0,0.952381,0.979167,0.98631
5,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x000000000BDC9A90>,5,1.0,1.0,1.0,1.0,1.0,1.0


In [39]:
# select the best ML matcher using CV using recall
result_recall = em.select_matcher([dt, rf, svm, nb, lg, ln], table=I_feature_vectors, 
        exclude_attrs=['_id','ltable_id', 'rtable_id', 'Label'], 
        target_attr='Label', metric='recall', random_state=0)

In [40]:
result_recall['cv_stats']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree_65200008118911021941,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x000000000BDC9860>,5,1.0,0.972222,1.0,1.0,1.0,0.994444
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x000000000BDC9940>,5,1.0,0.972222,1.0,1.0,1.0,0.994444
2,SVM_65200008118911021941,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x000000000BDC9898>,5,0.975,0.888889,0.939394,0.925,0.87234,0.920125
3,NB,<py_entitymatching.matcher.nbmatcher.NBMatcher object at 0x000000000BEC6240>,5,0.975,0.972222,1.0,1.0,0.978723,0.985189
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x000000000BDC99E8>,5,0.975,1.0,1.0,1.0,1.0,0.995
5,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x000000000BDC9A90>,5,0.975,0.972222,0.939394,0.95,0.978723,0.963068


In [41]:
# select the best ML matcher using CV using F1
result_f1 = em.select_matcher([dt, rf, svm, nb, lg, ln], table=I_feature_vectors, 
        exclude_attrs=['_id','ltable_id', 'rtable_id', 'Label'], 
        target_attr='Label', metric='f1', random_state=0)

In [42]:
result_f1['cv_stats']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree_65200008118911021941,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x000000000BDC9860>,5,1.0,0.972222,1.0,1.0,0.969072,0.988259
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x000000000BDC9940>,5,1.0,0.972222,1.0,1.0,1.0,0.994444
2,SVM_65200008118911021941,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x000000000BDC9898>,5,0.987342,0.941176,0.96875,0.948718,0.931818,0.955561
3,NB,<py_entitymatching.matcher.nbmatcher.NBMatcher object at 0x000000000BEC6240>,5,0.987342,0.972222,1.0,1.0,0.989247,0.989762
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x000000000BDC99E8>,5,0.987342,1.0,1.0,0.97561,0.989474,0.990485
5,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x000000000BDC9A90>,5,0.987342,0.985915,0.96875,0.974359,0.989247,0.981123


In [43]:
result_f1['selected_matcher']

<py_entitymatching.matcher.rfmatcher.RFMatcher at 0xbdc9940>

In [44]:
# Further split the Development set into Training and set test for debugging
devSetSplit = em.split_train_test(I_feature_vectors, train_proportion=0.6)
A = devSetSplit['train']
B = devSetSplit['test']

Based on the evaluation results, RF is the best matcher. 

In [45]:
#The following command was used for debugging
#em.vis_debug_rf(rf, A, B, exclude_attrs=['_id', 'ltable_id', 'rtable_id','Label'], target_attr='Label')

In [46]:
# Training rf on split development set(A) and testing it on split test set(B)

rf.fit(table=A, exclude_attrs=['_id','ltable_id', 'rtable_id', 'Label'], target_attr='Label')

predicted_values = rf.predict(table=B, exclude_attrs=['_id','ltable_id', 'rtable_id', 'Label'],
              append=True, target_attr='predicted', inplace=False)

eval_result = em.eval_matches(predicted_values, 'Label', 'predicted')
em.print_eval_summary(eval_result)

Precision : 100.0% (79/79)
Recall : 98.75% (79/80)
F1 : 99.37%
False positives : 0 (out of 79 positive predictions)
False negatives : 1 (out of 229 negative predictions)


RF gives the best result, hence choosing RF as the best matcher.

## Training Data on I and Testing on J

1. Train the Machine Learning Algorithm(DT, RF, SVM, NB, LogRegression, Linear Regression) using I (Development Set)
2. Test DT using J(Test Set)
3. Evaluate Precision, Recall and F1 score 

I. Decision Tree Learning

In [47]:
# 1. Train Decision Tree using I (Development Set)
dt.fit(table=I_feature_vectors, exclude_attrs=['_id','ltable_id', 'rtable_id', 'Label'], target_attr='Label')

#2. Test DT using J(Test Set)
predicted_values = dt.predict(table=J_feature_vectors, exclude_attrs=['_id','ltable_id', 'rtable_id', 'Label'],
              append=True, target_attr='predicted', inplace=False)

#3. Evaluate Precision, Recall and F1 score 
eval_result_dt = em.eval_matches(predicted_values, 'Label', 'predicted')

#4. Display the results
em.print_eval_summary(eval_result_dt)

Precision : 100.0% (77/77)
Recall : 98.72% (77/78)
F1 : 99.35%
False positives : 0 (out of 77 positive predictions)
False negatives : 1 (out of 253 negative predictions)


II. Random Forest Learning

In [48]:
# 1. Train Random Forest using I (Development Set)
rf.fit(table=I_feature_vectors, exclude_attrs=['_id','ltable_id', 'rtable_id', 'Label'], target_attr='Label')

#2. Test DT using J(Test Set)
predicted_values = rf.predict(table=J_feature_vectors, exclude_attrs=['_id','ltable_id', 'rtable_id', 'Label'],
              append=True, target_attr='predicted', inplace=False)

#3. Evaluate Precision, Recall and F1 score 
eval_result_rf = em.eval_matches(predicted_values, 'Label', 'predicted')

#4. Display the results
em.print_eval_summary(eval_result_rf)

Precision : 100.0% (77/77)
Recall : 98.72% (77/78)
F1 : 99.35%
False positives : 0 (out of 77 positive predictions)
False negatives : 1 (out of 253 negative predictions)


III. SVM Learning

In [49]:
# 1. Train SVM using I (Development Set)
svm.fit(table=I_feature_vectors, exclude_attrs=['_id','ltable_id', 'rtable_id', 'Label'], target_attr='Label')

#2. Test DT using J(Test Set)
predicted_values = svm.predict(table=J_feature_vectors, exclude_attrs=['_id','ltable_id', 'rtable_id', 'Label'],
              append=True, target_attr='predicted', inplace=False)

#3. Evaluate Precision, Recall and F1 score 
eval_result_svm = em.eval_matches(predicted_values, 'Label', 'predicted')

#4. Display the results
em.print_eval_summary(eval_result_svm)

Precision : 100.0% (74/74)
Recall : 94.87% (74/78)
F1 : 97.37%
False positives : 0 (out of 74 positive predictions)
False negatives : 4 (out of 256 negative predictions)


IV. Naive Bayes Learning

In [50]:
# 1. Train Naive Bayes using I (Development Set)
nb.fit(table=I_feature_vectors, exclude_attrs=['_id','ltable_id', 'rtable_id', 'Label'], target_attr='Label')

#2. Test Naive Bayes using J(Test Set)
predicted_values = nb.predict(table=J_feature_vectors, exclude_attrs=['_id','ltable_id', 'rtable_id', 'Label'],
              append=True, target_attr='predicted', inplace=False)

#3. Evaluate Precision, Recall and F1 score 
eval_result_nb = em.eval_matches(predicted_values, 'Label', 'predicted')

#4. Display the results
em.print_eval_summary(eval_result_nb)

Precision : 100.0% (77/77)
Recall : 98.72% (77/78)
F1 : 99.35%
False positives : 0 (out of 77 positive predictions)
False negatives : 1 (out of 253 negative predictions)


V. Logistic Regression Learning

In [51]:
# 1. Train Logistic Regression using I (Development Set)
lg.fit(table=I_feature_vectors, exclude_attrs=['_id','ltable_id', 'rtable_id', 'Label'], target_attr='Label')

#2. Test Logistic Regression using J(Test Set)
predicted_values = lg.predict(table=J_feature_vectors, exclude_attrs=['_id','ltable_id', 'rtable_id', 'Label'],
              append=True, target_attr='predicted', inplace=False)

#3. Evaluate Precision, Recall and F1 score 
eval_result_lg = em.eval_matches(predicted_values, 'Label', 'predicted')

#4. Display the results
em.print_eval_summary(eval_result_lg)

Precision : 100.0% (77/77)
Recall : 98.72% (77/78)
F1 : 99.35%
False positives : 0 (out of 77 positive predictions)
False negatives : 1 (out of 253 negative predictions)


VI. Linear Regression Learning

In [52]:
# 1. Train Linear Regression using I (Development Set)
ln.fit(table=I_feature_vectors, exclude_attrs=['_id','ltable_id', 'rtable_id', 'Label'], target_attr='Label')

#2. Test Linear Regression using J(Test Set)
predicted_values = ln.predict(table=J_feature_vectors, exclude_attrs=['_id','ltable_id', 'rtable_id', 'Label'],
              append=True, target_attr='predicted', inplace=False)

#3. Evaluate Precision, Recall and F1 score 
eval_result_ln = em.eval_matches(predicted_values, 'Label', 'predicted')

#4. Display the results
em.print_eval_summary(eval_result_ln)

Precision : 100.0% (73/73)
Recall : 93.59% (73/78)
F1 : 96.69%
False positives : 0 (out of 73 positive predictions)
False negatives : 5 (out of 257 negative predictions)


# # Comparing the precision, recall and F1 score of all learning methods - 

DT, RF and NB have same results in this case.
However, we ran this test a number of time and found RF to be slightly better in most of the cases.
Hence, we choose RF to be the best matcher

In [54]:
# Printing the evaluation result for "Random Forest" matcher
em.print_eval_summary(eval_result_rf)

Precision : 100.0% (77/77)
Recall : 98.72% (77/78)
F1 : 99.35%
False positives : 0 (out of 77 positive predictions)
False negatives : 1 (out of 253 negative predictions)
