In [1]:
# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd
import numpy as np

In [2]:
# Read the two source tables (metacritic and wiki) from CSV.
metacriticData = em.read_csv_metadata("data/metacritic.csv")
wikiData = em.read_csv_metadata("data/wikiData.csv")

# Build 1-based string IDs for every row: "a1", "a2", ... for metacritic
# and "b1", "b2", ... for wiki.
metacriticID = ["a%d" % n for n in range(1, len(metacriticData) + 1)]
wikiID = ["b%d" % n for n in range(1, len(wikiData) + 1)]

# Insert the new 'ID' column as the first column of each table, then
# register it as that table's key.
col_idx = 0
metacriticData.insert(col_idx, 'ID', metacriticID)
wikiData.insert(col_idx, 'ID', wikiID)
em.set_key(wikiData, 'ID')
em.set_key(metacriticData, 'ID')

No handlers could be found for logger "py_entitymatching.io.parsers"


True

In [3]:
# Load the labeled candidate pairs and attach their metadata: '_id' is the
# key, and ltable_ID / rtable_ID point back into the two source tables.
S = em.read_csv_metadata(
    "candidates_sample.csv",
    key='_id',
    ltable=metacriticData,
    rtable=wikiData,
    fk_ltable='ltable_ID',
    fk_rtable='rtable_ID',
)
print(S.shape[0])
S.head()

500


Unnamed: 0.1,Unnamed: 0,_id,ltable_ID,rtable_ID,ltable_Album,ltable_Artist,ltable_Release Date,rtable_Album,rtable_Artist,rtable_Release Date,label
0,0,0,a1024,b2294,world eater,blanck mass,Mar 3 2017,world eater,blanck mass,Mar 3 2017,1
1,1,1,a1074,b2263,youngish american,dams of the west,Feb 24 2017,still // alone,the golden filter,Feb 24 2017,0
2,2,2,a1094,b2236,terrible human beings,the orwells,Feb 17 2017,terrible human beings,the orwells,Feb 17 2017,1
3,3,3,a1116,b320,new order presents be music,various artists,Feb 17 2017,i'll be home for christmas,various artists,Nov 24 2014,0
4,4,4,a1143,b2171,process,sampha,Feb 3 2017,process,sampha,Feb 3 2017,1


In [4]:
# Split S into I and J (the 'train' and 'test' halves of a 50/50 split) and
# cache both on disk so later runs reuse the same files instead of re-splitting.
i_file = "I.csv"
j_file = "J.csv"
if not os.path.isfile(i_file): #so you don't delete your labels on accident
    IJ = em.split_train_test(S, train_proportion=0.5, random_state=0)
    I = IJ['train']
    J = IJ['test']
    # index=False: the row index is not data. Writing it out makes every
    # reload pick up an extra "Unnamed: 0" column; '_id' is the key column
    # and is still written.
    I.to_csv(i_file, sep=",", index=False)
    J.to_csv(j_file, sep=",", index=False)
    print("Split samples into I and J")
else:
    I = em.read_csv_metadata(i_file,key="_id",ltable=metacriticData,rtable=wikiData,fk_ltable="ltable_ID",fk_rtable="rtable_ID")
    J = em.read_csv_metadata(j_file,key="_id",ltable=metacriticData,rtable=wikiData,fk_ltable="ltable_ID",fk_rtable="rtable_ID")
    print("Reading I and J from files")
print(len(I))
print(len(J))

Reading I and J from files
250
250


In [5]:
# Preview the first five rows of I.
I.head(n=5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,_id,ltable_ID,rtable_ID,ltable_Album,ltable_Artist,ltable_Release Date,rtable_Album,rtable_Artist,rtable_Release Date,label
0,476,476,476,a897,b2501,more scared of you than you are of me,the smith street band,Apr 7 2017,"lovely, little, lonely",the maine,Apr 7 2017,0
1,162,162,162,a1432,b2205,lazarus [original cast recording],various artists,Oct 21 2016,fifty shades darker: original motion picture soundtrack,various artists,Feb 10 2017,0
2,34,34,34,a2119,b1418,solid states,the posies,May 6 2016,chrome,the screaming jets,May 6 2016,0
3,44,44,44,a2412,b1228,god don't never change: the songs of blind willie johnson,various artists,Feb 26 2016,nemesis: the best of & reworked,blutengel,Feb 26 2016,0
4,97,97,97,a910,b2461,close ties,rodney crowell,Mar 31 2017,close ties,rodney crowell,Mar 31 2017,1


In [6]:
# Auto-generate the feature table for matching the two source tables and
# list the generated feature names.
# NOTE(review): the generated list includes ID_ID_* features built from the
# synthetic 'ID' key columns -- confirm these are meant to be used for matching.
F = em.get_features_for_matching(
    metacriticData,
    wikiData,
    validate_inferred_attr_types=False,
)
F['feature_name']

0                                    ID_ID_lev_dist
1                                     ID_ID_lev_sim
2                                         ID_ID_jar
3                                         ID_ID_jwn
4                                         ID_ID_exm
5                             ID_ID_jac_qgm_3_qgm_3
6                       Album_Album_jac_qgm_3_qgm_3
7                   Album_Album_cos_dlm_dc0_dlm_dc0
8                   Album_Album_jac_dlm_dc0_dlm_dc0
9                                   Album_Album_mel
10                             Album_Album_lev_dist
11                              Album_Album_lev_sim
12                                  Album_Album_nmw
13                                   Album_Album_sw
14                    Artist_Artist_jac_qgm_3_qgm_3
15                Artist_Artist_cos_dlm_dc0_dlm_dc0
16                Artist_Artist_jac_dlm_dc0_dlm_dc0
17                                Artist_Artist_mel
18                           Artist_Artist_lev_dist
19          

In [11]:
# Turn every labeled pair in I into a feature vector using the features in F;
# the 'label' column is carried along after the feature columns.
H = em.extract_feature_vecs(
    I,
    feature_table=F,
    attrs_after='label',
    show_progress=False,
)
print(H.shape[0])
H.head()

250


Unnamed: 0,_id,ltable_ID,rtable_ID,ID_ID_lev_dist,ID_ID_lev_sim,ID_ID_jar,ID_ID_jwn,ID_ID_exm,ID_ID_jac_qgm_3_qgm_3,Album_Album_jac_qgm_3_qgm_3,...,Label_Label_sw,Release_Date_Release_Date_jac_qgm_3_qgm_3,Release_Date_Release_Date_cos_dlm_dc0_dlm_dc0,Release_Date_Release_Date_jac_dlm_dc0_dlm_dc0,Release_Date_Release_Date_mel,Release_Date_Release_Date_lev_dist,Release_Date_Release_Date_lev_sim,Release_Date_Release_Date_nmw,Release_Date_Release_Date_sw,label
0,476,a897,b2501,5,0.0,0.0,0.0,0,0.0,0.0,...,,1.0,1.0,1.0,1.0,0.0,1.0,10.0,10.0,0
1,162,a1432,b2205,5,0.0,0.0,0.0,0,0.0,0.082353,...,4.0,0.083333,0.0,0.0,0.641414,6.0,0.454545,5.0,5.0,0
2,34,a2119,b1418,4,0.2,0.6,0.6,0,0.0,0.0,...,3.0,1.0,1.0,1.0,1.0,0.0,1.0,10.0,10.0,0
3,44,a2412,b1228,5,0.0,0.6,0.6,0,0.0,0.069767,...,3.0,1.0,1.0,1.0,1.0,0.0,1.0,11.0,11.0,0
4,97,a910,b2461,5,0.0,0.0,0.0,0,0.0,1.0,...,,1.0,1.0,1.0,1.0,0.0,1.0,11.0,11.0,1


In [8]:
# Instantiate the candidate matchers to compare.
import random

# Fixed seed for repeatable results; use random.randint(0,1000) instead to
# draw a fresh seed on each run.
random_state = 0

dt = em.DTMatcher(random_state=random_state, name='DecisionTree')
svm = em.SVMMatcher(random_state=random_state, name='SVM')
rf = em.RFMatcher(random_state=random_state, name='RF')
lg = em.LogRegMatcher(random_state=random_state, name='LogReg')
# LinRegMatcher is created without a seed.
ln = em.LinRegMatcher(name='LinReg')

In [9]:
# For each column of H that contains at least one missing value, show how
# many of its entries are missing.
null_columns = H.columns[H.isnull().any(axis=0)]
H.loc[:, null_columns].isnull().sum(axis=0)

Genre_Genre_jac_qgm_3_qgm_3        24
Genre_Genre_cos_dlm_dc0_dlm_dc0    24
Genre_Genre_jac_dlm_dc0_dlm_dc0    24
Genre_Genre_mel                    24
Genre_Genre_lev_dist               24
Genre_Genre_lev_sim                24
Genre_Genre_nmw                    24
Genre_Genre_sw                     24
Label_Label_jac_qgm_3_qgm_3         5
Label_Label_cos_dlm_dc0_dlm_dc0     5
Label_Label_jac_dlm_dc0_dlm_dc0     5
Label_Label_mel                     5
Label_Label_lev_dist                5
Label_Label_lev_sim                 5
Label_Label_nmw                     5
Label_Label_sw                      5
dtype: int64

In [10]:
# Fill the missing feature values in H with the mean of their column.
# The id columns and the label are excluded from imputation.
H = em.impute_table(
    H,
    strategy='mean',
    exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'label'],
)

In [16]:
# Initial comparison: evaluate every matcher on H with k=5, selecting by f1,
# and show the per-matcher statistics. The id columns and the target are
# excluded from the features.
result = em.select_matcher(
    [dt, rf, svm, ln, lg],
    table=H,
    exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'label'],
    target_attr='label',
    metric_to_select_matcher='f1',
    k=5,
    random_state=0,
)
result['cv_stats']

Unnamed: 0,Matcher,Average precision,Average recall,Average f1
0,DecisionTree,0.934566,0.929703,0.929673
1,RF,0.985714,0.958421,0.970322
2,SVM,1.0,0.224399,0.364633
3,LinReg,0.972115,0.953036,0.961021
4,LogReg,0.973214,0.968421,0.968998
