In [1]:
# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

In [2]:
# Read the two source tables that are going to be matched against each other.
metacriticData = em.read_csv_metadata("data/metacritic.csv")
wikiData = em.read_csv_metadata("data/wikiData.csv")

# Build a synthetic key for every row: "a1", "a2", ... for the Metacritic table
# and "b1", "b2", ... for the Wikipedia table.
metacriticID = ["a%d" % row_num for row_num in range(1, len(metacriticData) + 1)]
wikiID = ["b%d" % row_num for row_num in range(1, len(wikiData) + 1)]

# Put the new 'ID' column first in each table, then register it as the table key.
col_idx = 0
wikiData.insert(loc=col_idx, column='ID', value=wikiID)
metacriticData.insert(loc=col_idx, column='ID', value=metacriticID)
em.set_key(wikiData, 'ID')
em.set_key(metacriticData, 'ID')

Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.


True

In [3]:
# Load the labeled candidate pairs and supply the metadata (key and foreign keys)
# that ties each pair back to the Metacritic (left) and Wikipedia (right) tables.
S = em.read_csv_metadata(
    "candidates_sample.csv",
    key='_id',
    ltable=metacriticData,
    rtable=wikiData,
    fk_ltable='ltable_ID',
    fk_rtable='rtable_ID',
)

Metadata file is not present in the given path; proceeding to read the csv file.


In [4]:
# Split S into I and J.
# The split is written to disk the first time this cell runs; later runs reload
# the saved files instead of re-splitting, so existing files are never overwritten.
i_file = "I.csv"
j_file = "J.csv"
have_i = os.path.isfile(i_file)
have_j = os.path.isfile(j_file)

if have_i and have_j:
    # Both halves are already on disk: reload them with the same metadata as S.
    I = em.read_csv_metadata(i_file, key="_id",
                             ltable=metacriticData, rtable=wikiData,
                             fk_ltable="ltable_ID", fk_rtable="rtable_ID")
    J = em.read_csv_metadata(j_file, key="_id",
                             ltable=metacriticData, rtable=wikiData,
                             fk_ltable="ltable_ID", fk_rtable="rtable_ID")
    print("Reading I and J from files")
elif have_i or have_j:
    # Only one half exists. Re-splitting would overwrite it and reading would fail
    # on the missing one, so stop and let the user decide what to do.
    raise FileNotFoundError(
        "Found only one of '%s' and '%s'; restore the missing file or delete "
        "the existing one to regenerate the split." % (i_file, j_file)
    )
else:
    # Neither file exists yet: create a 50/50 split with a fixed seed and save it.
    IJ = em.split_train_test(S, train_proportion=0.5, random_state=0)
    I = IJ['train']
    J = IJ['test']
    I.to_csv(i_file, sep=",")
    J.to_csv(j_file, sep=",")
    print("Split samples into I and J")
print(len(I))
print(len(J))

Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.


Reading I and J from files
250
250


In [5]:
# Auto-generate the feature table used for matching the two source tables
# (validation of the inferred attribute types is switched off).
F = em.get_features_for_matching(
    metacriticData,
    wikiData,
    validate_inferred_attr_types=False,
)

Column Producer does not seem to qualify as any atomic type. It may contain all NaNs. Please update the values of column Producer
Column Meta Score does not seem to qualify as any atomic type. It may contain all NaNs. Please update the values of column Meta Score


In [6]:
# Turn the labeled pairs in I into feature vectors using the features in F,
# carrying the 'label' column along after the feature columns.
H = em.extract_feature_vecs(
    I, feature_table=F, attrs_after='label', show_progress=False
)

In [7]:
# Instantiate one matcher per learning algorithm to be compared.
import random

# Single seed shared by every matcher that is given one.
random_state = 0

# Matchers constructed with the shared seed.
dt = em.DTMatcher(random_state=random_state, name='DecisionTree')
rf = em.RFMatcher(random_state=random_state, name='RF')
svm = em.SVMMatcher(random_state=random_state, name='SVM')
lg = em.LogRegMatcher(random_state=random_state, name='LogReg')

# Matchers constructed without a seed.
ln = em.LinRegMatcher(name='LinReg')
nb = em.NBMatcher(name='NaiveBayes')

In [8]:
# Replace missing feature values in H with the mean of their column.
# The id, foreign-key and label columns are excluded from imputation.
H = em.impute_table(
    H,
    exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'label'],
    strategy='mean',
)

In [9]:
# Initial results: 5-fold cross-validation of every matcher on H, ranked by F1.
result = em.select_matcher(
    [dt, rf, svm, ln, lg, nb],
    table=H,
    exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'label'],
    k=5,
    target_attr='label',
    metric_to_select_matcher='f1',
    random_state=0,
)
# Show the average precision / recall / F1 per matcher.
result['cv_stats']

Unnamed: 0,Matcher,Average precision,Average recall,Average f1
0,DecisionTree,0.934566,0.929703,0.929673
1,RF,0.985714,0.958421,0.970322
2,SVM,1.0,0.224399,0.364633
3,LinReg,0.972115,0.953036,0.961021
4,LogReg,0.973214,0.968421,0.968998
5,NaiveBayes,0.956667,0.912267,0.932857


- From the above, RF gave the best cross-validation results in terms of average F1 score (0.970), narrowly ahead of LogReg (0.969).

In [10]:
# Per-fold precision of each matcher, taken from the cross-validation stored in `result`.
result['drill_down_cv_stats']['precision']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x11518ca20>,5,1.0,0.933333,1.0,0.857143,0.882353,0.934566
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x11518c9e8>,5,1.0,1.0,1.0,0.928571,1.0,0.985714
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x11518ca58>,5,1.0,1.0,1.0,1.0,1.0,1.0
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x11518c1d0>,5,1.0,0.9375,1.0,0.923077,1.0,0.972115
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x11518c588>,5,1.0,0.9375,1.0,0.928571,1.0,0.973214
5,NaiveBayes,<py_entitymatching.matcher.nbmatcher.NBMatcher object at 0x11518c390>,5,1.0,1.0,0.95,0.833333,1.0,0.956667


In [11]:
# Per-fold recall of each matcher, taken from the cross-validation stored in `result`.
result['drill_down_cv_stats']['recall']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x11518ca20>,5,0.842105,0.933333,0.95,0.923077,1.0,0.929703
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x11518c9e8>,5,0.842105,1.0,0.95,1.0,1.0,0.958421
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x11518ca58>,5,0.157895,0.266667,0.2,0.230769,0.266667,0.224399
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x11518c1d0>,5,0.842105,1.0,1.0,0.923077,1.0,0.953036
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x11518c588>,5,0.842105,1.0,1.0,1.0,1.0,0.968421
5,NaiveBayes,<py_entitymatching.matcher.nbmatcher.NBMatcher object at 0x11518c390>,5,0.842105,1.0,0.95,0.769231,1.0,0.912267


In [12]:
# Per-fold F1 of each matcher, taken from the cross-validation stored in `result`.
result['drill_down_cv_stats']['f1']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x11518ca20>,5,0.914286,0.933333,0.974359,0.888889,0.9375,0.929673
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x11518c9e8>,5,0.914286,1.0,0.974359,0.962963,1.0,0.970322
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x11518ca58>,5,0.272727,0.421053,0.333333,0.375,0.421053,0.364633
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x11518c1d0>,5,0.914286,0.967742,1.0,0.923077,1.0,0.961021
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x11518c588>,5,0.914286,0.967742,1.0,0.962963,1.0,0.968998
5,NaiveBayes,<py_entitymatching.matcher.nbmatcher.NBMatcher object at 0x11518c390>,5,0.914286,1.0,0.95,0.8,1.0,0.932857


## Train classifiers on I and test on J

In [13]:
# Collect every matcher so they can be trained and evaluated in one loop.
# dtype=object is stated explicitly: the elements are arbitrary Python objects,
# so numpy should store references rather than try to infer an element type.
classifiers = np.array([dt, rf, svm, ln, lg, nb], dtype=object)

In [14]:
# Columns that identify a pair or hold its gold label; they are never used as features.
non_feature_attrs = ['_id', 'ltable_ID', 'rtable_ID', 'label']

# Convert J into a set of feature vectors using F
L = em.extract_feature_vecs(J, feature_table=F,
                            attrs_after='label', show_progress=False)

# Impute feature vectors with the mean of the column values
# NOTE(review): the means come from L itself, not from the training table H —
# confirm this is the intended treatment of the evaluation set.
L = em.impute_table(L,
                exclude_attrs=non_feature_attrs,
                strategy='mean')

for c in classifiers:
    # Train using feature vectors from I
    c.fit(table=H, exclude_attrs=non_feature_attrs, target_attr='label')

    # Predict on L; inplace=False so L itself is not modified between matchers
    predictions = c.predict(table=L, exclude_attrs=non_feature_attrs,
                            append=True, target_attr='predicted', inplace=False)

    # Evaluate the predictions against the gold labels
    print(c.name)
    eval_result = em.eval_matches(predictions, 'label', 'predicted')
    em.print_eval_summary(eval_result)
    print()

DecisionTree
Precision : 86.05% (74/86)
Recall : 90.24% (74/82)
F1 : 88.1%
False positives : 12 (out of 86 positive predictions)
False negatives : 8 (out of 164 negative predictions)

RF
Precision : 95.89% (70/73)
Recall : 85.37% (70/82)
F1 : 90.32%
False positives : 3 (out of 73 positive predictions)
False negatives : 12 (out of 177 negative predictions)

SVM
Precision : 94.74% (18/19)
Recall : 21.95% (18/82)
F1 : 35.64%
False positives : 1 (out of 19 positive predictions)
False negatives : 64 (out of 231 negative predictions)

LinReg
Precision : 96.3% (78/81)
Recall : 95.12% (78/82)
F1 : 95.71%
False positives : 3 (out of 81 positive predictions)
False negatives : 4 (out of 169 negative predictions)

LogReg
Precision : 96.25% (77/80)
Recall : 93.9% (77/82)
F1 : 95.06%
False positives : 3 (out of 80 positive predictions)
False negatives : 5 (out of 170 negative predictions)

NaiveBayes
Precision : 96.1% (74/77)
Recall : 90.24% (74/82)
F1 : 93.08%
False positives : 3 (out of 77 positiv