In [1]:
# !pip install PyQt5
# ! git clone --recursive https://github.com/dmlc/xgboost
# !pip install pandastable
# !pip install -U numpy scipy py_entitymatching
# !pip install py_stringmatching
# ! pip install  -U "numpy<1.24.0"
# !pip install javaobj-py3

In [2]:
# Report the installed numpy version.
# NOTE(review): the commented-out setup cell mentions "numpy<1.24.0"; this looks like a check of that pin — confirm.
import importlib.metadata
print(importlib.metadata.version('numpy'))

1.23.5


In [3]:
# ------------- PARAMETER -------------
# Every folder below is relative to the working directory and ends with '/',
# so file names can be appended directly.

FOLDER_DATA = 'data/'
FOLDER_DATA_cleaner = f'{FOLDER_DATA}cleanCleanErDatasets/'
FOLDER_DATA_dirty = f'{FOLDER_DATA}dirtyErDatasets/'
FOLDER_DATA_csv = f'{FOLDER_DATA}csv/'

In [154]:
import javaobj
import csv
import pandas as pd
class JSOFileReader:
    """Load a Java-serialized object (JSO) file and flatten it into a DataFrame.

    Each object yielded by the deserialized payload becomes one row. For every
    field listed in the object's ``classdesc.fields``:

    * a ``set`` value is expanded into one column per element, using the
      element's ``name`` attribute as the column and its ``value`` attribute
      as the cell;
    * an ``int`` or ``javaobj.v2.beans.JavaString`` value becomes a column
      named after the field;
    * any other value is ignored.

    Attributes:
        filename: Path of the JSO file to read.
        df: Result of the last ``to_pandas_df()`` call (empty until then).
    """

    def __init__(self, filename):
        self.filename = filename
        self.df = pd.DataFrame()

    @staticmethod
    def _is_scalar(value):
        """Return True for values stored directly in a column (int or JavaString)."""
        return isinstance(value, int) or isinstance(value, javaobj.v2.beans.JavaString)

    @staticmethod
    def _attributes_to_dict(attributes):
        """Map each element of ``attributes`` to a ``{name: value}`` entry.

        Both fields are read before the entry is stored, so the result does not
        depend on whether ``name`` or ``value`` is listed first in
        ``classdesc.fields``. Elements without a ``value`` field are skipped;
        elements without a ``name`` field are stored under the key ``''``.
        """
        row = {}
        for attribute in attributes:
            key = ''
            value = None
            has_value = False
            for field_desc in attribute.classdesc.fields:
                if field_desc.name == 'name':
                    key = getattr(attribute, 'name')
                elif field_desc.name == 'value':
                    value = getattr(attribute, 'value')
                    has_value = True
            if has_value:
                row[key] = value
        return row

    def __read_data_set__(self, attr):
        """Turn a set of name/value objects into a one-row DataFrame.

        Returns:
            A DataFrame with index ``[0]``, or ``None`` (after printing the
            reason) when ``attr`` is not a set or a ``TypeError`` occurs.
        """
        try:
            if not isinstance(attr, set):
                raise TypeError("Not a dictionary data")
            return pd.DataFrame(self._attributes_to_dict(attr), index=[0])
        except TypeError as msg:
            print(msg)

    def __read_data_scala__(self, attr_value, attr_name):
        """Turn a single int/JavaString value into a one-row, one-column DataFrame.

        Returns:
            A DataFrame with column ``attr_name`` and index ``[0]``, or ``None``
            (after printing the reason) when the value has another type.
        """
        try:
            if not self._is_scalar(attr_value):
                raise TypeError("Not a scala data")
            return pd.DataFrame({attr_name: attr_value}, index=[0])
        except TypeError as msg:
            print(msg)

    def read_file(self):
        """Deserialize ``self.filename`` with ``javaobj.v2.load`` and return the result."""
        with open(self.filename, 'rb') as f:
            data = javaobj.v2.load(f)
        return data

    def to_pandas_df(self):
        """Build the DataFrame, store it in ``self.df`` and return it.

        Rows are collected as plain dicts and converted once at the end;
        growing a DataFrame with ``pd.concat`` inside the loop is quadratic in
        the number of rows.
        """
        records = []
        for obj in self.read_file():
            class_desc = getattr(obj, 'classdesc', None)
            if class_desc is None:
                # Nothing describes this object's fields, so there is nothing to read.
                continue

            record = {}
            for field in class_desc.fields:
                value = getattr(obj, field.name)
                if isinstance(value, set):
                    try:
                        record.update(self._attributes_to_dict(value))
                    except TypeError as msg:
                        # Best effort: report the problem and keep the other fields.
                        print(msg)
                elif self._is_scalar(value):
                    record[field.name] = value

            # Objects that contributed no column do not produce a row.
            if record:
                records.append(record)

        self.df = pd.DataFrame(records)
        return self.df

    def to_csv(self):
        """Write ``self.df`` to ``FOLDER_DATA_csv/<file name>.csv``.

        Prints an error and returns without writing anything when
        ``to_pandas_df()`` has not produced any data yet.
        """
        import os

        if self.df.empty:
            error = ValueError('Please call to_pandas_df() before to_csv function!')
            print('Error with file ' + self.filename + ' ' + str(error.args))
            return

        # Create the configured output folder itself (the variable, not a
        # directory literally named 'FOLDER_DATA_csv').
        os.makedirs(FOLDER_DATA_csv, exist_ok=True)

        base_name = os.path.basename(self.filename)
        self.df.to_csv(os.path.join(FOLDER_DATA_csv, base_name + '.csv'), index=False)

In [162]:
class ProcessAllJSO:
    """Convert every JSO file found directly inside a folder to CSV.

    The conversion runs as a side effect of construction: ``__init__`` calls
    ``__process__``, which reads each file with ``JSOFileReader`` and writes it
    out with ``JSOFileReader.to_csv``.

    Attributes:
        folder_path: Folder whose files are converted.
    """

    def __init__(self, folder_path):
        self.folder_path = folder_path
        self.__process__()

    def __process__(self):
        """Run the read -> DataFrame -> CSV pipeline on each regular file in the folder."""
        import os

        # Accept folder paths with or without a trailing separator. '/' is used
        # because JSOFileReader derives the output name from the last '/' segment.
        folder = self.folder_path
        if not folder.endswith(('/', os.sep)):
            folder += '/'

        print('Process folder ' + self.folder_path)
        for file_name in os.listdir(self.folder_path):
            file_path = folder + file_name
            if not os.path.isfile(file_path):
                # Subdirectories cannot be opened as JSO files; skip them.
                continue
            print('Processing: ' + file_name)
            reader = JSOFileReader(file_path)
            reader.to_pandas_df()
            reader.to_csv()

# Convert the JSO files in FOLDER_DATA_cleaner to CSV

In [161]:
# Constructing ProcessAllJSO runs the conversion right away (its __init__ calls
# __process__), so no further method call is needed.
ProcessAllJSO(FOLDER_DATA_cleaner)

Process folderdata/cleanCleanErDatasets/
Processing: abtBuyIdDuplicates
Processing: abtProfiles
Processing: acmProfiles
Processing: amazonGpIdDuplicates
Processing: amazonProfiles
Processing: amazonProfiles2
Processing: amazonWalmartIdDuplicates
Processing: buyProfiles
Processing: dblpAcmIdDuplicates
Processing: dblpProfiles
Processing: dblpProfiles2
Processing: dblpScholarIdDuplicates
Processing: gpProfiles
Processing: imdbProfiles
Processing: imdbProfilesNEW
Processing: imdbTmdbIdDuplicates
Processing: imdbTvdbIdDuplicates
Processing: moviesIdDuplicates
Processing: restaurant1Profiles
Processing: restaurant2Profiles
Processing: restaurantsIdDuplicates
Processing: scholarProfiles
Processing: tmdbProfiles
Processing: tmdbTvdbIdDuplicates
Processing: tvdbProfiles
Processing: walmartProfiles


<__main__.ProcessAllJSO at 0x2430f8e19a0>

## Example of using JSOFileReader

In [157]:
# Read a single JSO file and display the resulting DataFrame.
reader = JSOFileReader(FOLDER_DATA_cleaner + 'abtProfiles')
df = reader.to_pandas_df()
df

Unnamed: 0,name,description,entityUrl,price
0,Sony Turntable - PSLX350H,Sony Turntable - PSLX350H/ Belt Drive System/ ...,552,
1,Bose Acoustimass 5 Series III Speaker System -...,Bose Acoustimass 5 Series III Speaker System -...,580,399
2,Sony Switcher - SBV40S,Sony Switcher - SBV40S/ Eliminates Disconnecti...,4696,49
3,Sony 5 Disc CD Player - CDPCE375,Sony 5 Disc CD Player- CDPCE375/ 5 Disc Change...,5644,
4,Bose 27028 161 Bookshelf Pair Speakers In Whit...,Bose 161 Bookshelf Speakers In White - 161WH/ ...,6284,158
...,...,...,...,...
1071,Logitech Cordless Desktop Wave Keyboard And Mo...,Logitech Cordless Desktop Wave Keyboard And Mo...,39088,79
1072,Mitsubishi DLP Black TV Stand - MBS73V,Mitsubishi DLP Black TV Stand - MBS73V/ Matchi...,39090,549
1073,Logitech Digital Precision PC Gaming Headset -...,Logitech Digital Precision PC Gaming Headset -...,39175,49
1074,Logitech 2.1 Multimedia Silver Speaker System ...,Logitech 2.1 Multimedia Silver Speaker System ...,39176,


# Magellan Pipeline

In [44]:
# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd

# Set the seed value
# NOTE(review): `seed` is not referenced by the cells shown below; they pass
# random_state=0 literally — confirm whether it should be wired in.
seed = 0

In [46]:
# The demo datasets are read from the 'datasets' folder under the
# py_entitymatching install path.
datasets_dir = em.get_install_path() + os.sep + 'datasets'

# Paths of the two demo tables and of the pre-labeled data
path_A = datasets_dir + os.sep + 'dblp_demo.csv'
path_B = datasets_dir + os.sep + 'acm_demo.csv'
path_labeled_data = datasets_dir + os.sep + 'labeled_data_demo.csv'

In [47]:
# Load the two input tables, declaring 'id' as the key of each
A = em.read_csv_metadata(path_A, key='id')
B = em.read_csv_metadata(path_B, key='id')
# Load the pre-labeled data
# ('_id' is its key; 'ltable_id' / 'rtable_id' are declared as the foreign keys into A and B)
S = em.read_csv_metadata(path_labeled_data, 
                         key='_id',
                         ltable=A, rtable=B, 
                         fk_ltable='ltable_id', fk_rtable='rtable_id')

Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.


## Then, split the labeled data into a development set and an evaluation set. Use the development set to select the best learning-based matcher.


In [48]:
# Split S into I and J (half of the rows each, with a fixed random_state)
IJ = em.split_train_test(S, train_proportion=0.5, random_state=0)
I = IJ['train']
J = IJ['test']

# Selecting the Best learning-based matcher 

This typically involves the following steps:
1. Creating a set of learning-based matchers
2. Creating features
3. Extracting feature vectors
4. Selecting the best learning-based matcher using k-fold cross validation
5. Debugging the matcher (and possibly repeating the above steps)

# Creating a set of learning-based matchers

First, we need to create a set of learning-based matchers. The following matchers are supported in Magellan: (1) decision tree, (2) random forest, (3) naive bayes, (4) svm, (5) logistic regression, and (6) linear regression.


In [49]:
# One instance of each matcher type to compare.
# random_state is fixed to 0 wherever it is passed; LinRegMatcher is created without one.
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')

## Generate a set of features


In [50]:
# Generate the feature table used to compare tuples of A and B.
# NOTE(review): validate_inferred_attr_types=False presumably skips the interactive
# confirmation of the inferred attribute types — confirm against the py_entitymatching docs.
F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)

## We observe that 20 features were generated. As a first step, let's say that we decide to use only 'year'-related features.

In [51]:
# List the names of the generated features
F.feature_name

0                          id_id_lev_dist
1                           id_id_lev_sim
2                               id_id_jar
3                               id_id_jwn
4                               id_id_exm
5                   id_id_jac_qgm_3_qgm_3
6             title_title_jac_qgm_3_qgm_3
7         title_title_cos_dlm_dc0_dlm_dc0
8                         title_title_mel
9                    title_title_lev_dist
10                    title_title_lev_sim
11        authors_authors_jac_qgm_3_qgm_3
12    authors_authors_cos_dlm_dc0_dlm_dc0
13                    authors_authors_mel
14               authors_authors_lev_dist
15                authors_authors_lev_sim
16                          year_year_exm
17                          year_year_anm
18                     year_year_lev_dist
19                      year_year_lev_sim
Name: feature_name, dtype: object

## Extracting feature vectors

In [52]:
# Convert I into feature vectors using F, keeping the 'label' column after the features
H = em.extract_feature_vecs(I, 
                            feature_table=F, 
                            attrs_after='label',
                            show_progress=False)

In [53]:
# Check if the feature vectors contain missing values
# A return value of True means that there are missing values
# (The previous `any(pd.notnull(H))` iterated over the column labels and tested
# the opposite predicate, so it did not reflect whether any cell is missing.)
pd.isnull(H).values.any()

True

In [54]:
# Impute feature vectors with the mean of the column values.
# The '_id', 'ltable_id', 'rtable_id' and 'label' columns are excluded from imputation.
# Note: this rebinds H, so re-running the cell operates on the already-imputed table.
H = em.impute_table(H, 
                exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
                strategy='mean')

  imp.statistics_[pd.np.isnan(imp.statistics_)] = val_all_nans


In [55]:
# Select the best ML matcher using CV
# (k=5 folds over H; 'label' is the target and the matcher is selected on F1)
result = em.select_matcher([dt, rf, svm, ln, lg], table=H, 
        exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
        k=5,
        target_attr='label', metric_to_select_matcher='f1', random_state=0)
# Summary table of the cross-validation statistics per matcher
result['cv_stats']

Unnamed: 0,Matcher,Average precision,Average recall,Average f1
0,DecisionTree,0.915322,0.950714,0.93098
1,RF,1.0,0.950714,0.974131
2,SVM,0.969048,0.905659,0.934746
3,LinReg,1.0,0.93533,0.966131
4,LogReg,0.985714,0.93533,0.958724


In [56]:
# Precision drill-down of the cross-validation run above
result['drill_down_cv_stats']['precision']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x000002880A1C2820>,5,0.95,1.0,0.764706,0.933333,0.928571,0.915322
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x000002880A1C2400>,5,1.0,1.0,1.0,1.0,1.0,1.0
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x000002880A1C2940>,5,1.0,0.928571,1.0,1.0,0.916667,0.969048
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x000002880A1C2640>,5,1.0,1.0,1.0,1.0,1.0,1.0
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x000002880A1C2070>,5,1.0,0.928571,1.0,1.0,1.0,0.985714


In [57]:
# Recall drill-down of the cross-validation run above
result['drill_down_cv_stats']['recall']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x000002880A1C2820>,5,0.95,1.0,0.928571,0.875,1.0,0.950714
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x000002880A1C2400>,5,0.95,1.0,0.928571,0.875,1.0,0.950714
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x000002880A1C2940>,5,0.95,1.0,0.857143,0.875,0.846154,0.905659
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x000002880A1C2640>,5,0.95,1.0,0.928571,0.875,0.923077,0.93533
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x000002880A1C2070>,5,0.95,1.0,0.928571,0.875,0.923077,0.93533


In [58]:
# F1 drill-down of the cross-validation run above
result['drill_down_cv_stats']['f1']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x000002880A1C2820>,5,0.95,1.0,0.83871,0.903226,0.962963,0.93098
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x000002880A1C2400>,5,0.974359,1.0,0.962963,0.933333,1.0,0.974131
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x000002880A1C2940>,5,0.974359,0.962963,0.923077,0.933333,0.88,0.934746
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x000002880A1C2640>,5,0.974359,1.0,0.962963,0.933333,0.96,0.966131
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x000002880A1C2070>,5,0.974359,0.962963,0.962963,0.933333,0.96,0.958724


In [59]:
# Split H into P and Q (half of the rows each, with a fixed random_state)
PQ = em.split_train_test(H, train_proportion=0.5, random_state=0)
P = PQ['train']
Q = PQ['test']

In [60]:
# Debug the random forest matcher, passing P and Q as the two feature-vector tables.
# NOTE(review): presumably the matcher is trained on P and its predictions on Q are
# inspected in a GUI window — confirm against the py_entitymatching docs.
em.vis_debug_rf(rf, P, Q, 
        exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
        target_attr='label')

In [61]:
# Create a feature declaratively
sim = em.get_sim_funs_for_matching()
tok = em.get_tokenizers_for_matching()
# The expression applies `jaccard` to the `wspace` tokens of the lower-cased
# "title + ' ' + authors" string of the left and right tuples.
feature_string = """jaccard(wspace((ltuple['title'] + ' ' + ltuple['authors']).lower()), 
                            wspace((rtuple['title'] + ' ' + rtuple['authors']).lower()))"""
feature = em.get_feature_fn(feature_string, sim, tok)


In [62]:
# Add feature to F under the name 'jac_ws_title_authors'
em.add_feature(F, 'jac_ws_title_authors', feature)

True

In [63]:
# Convert I into feature vectors using updated F
# Note: this rebinds H; no imputation step is applied to the new H in the cells shown here.
H = em.extract_feature_vecs(I, 
                            feature_table=F, 
                            attrs_after='label',
                            show_progress=False)

In [64]:
# Check whether the updated F improves the Random Forest matcher
# (same 5-fold cross-validation settings as before, restricted to rf)
result = em.select_matcher([rf], table=H, 
        exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
        k=5,
        target_attr='label', metric_to_select_matcher='f1', random_state=0)
result['drill_down_cv_stats']['f1']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x000002880A1C2400>,5,0.974359,1.0,0.962963,0.933333,1.0,0.974131


In [65]:
# Re-run the selection over all five matchers on the feature vectors built from the updated F
result = em.select_matcher([dt, rf, svm, ln, lg], table=H, 
        exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
        k=5,
        target_attr='label', metric_to_select_matcher='f1', random_state=0)
result['cv_stats']

Unnamed: 0,Matcher,Average precision,Average recall,Average f1
0,DecisionTree,1.0,1.0,1.0
1,RF,1.0,0.950714,0.974131
2,SVM,0.969048,0.905659,0.934746
3,LinReg,1.0,0.97033,0.984593
4,LogReg,0.985714,0.93533,0.958724


In [66]:
# F1 drill-down of the all-matcher run on the updated features
result['drill_down_cv_stats']['f1']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x000002880A1C2820>,5,1.0,1.0,1.0,1.0,1.0,1.0
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x000002880A1C2400>,5,0.974359,1.0,0.962963,0.933333,1.0,0.974131
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x000002880A1C2940>,5,0.974359,0.962963,0.923077,0.933333,0.88,0.934746
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x000002880A1C2640>,5,1.0,1.0,0.962963,1.0,0.96,0.984593
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x000002880A1C2070>,5,0.974359,0.962963,0.962963,0.933333,0.96,0.958724
