# Entity matching (EM) for Laptops

This jupyter notebook contains the commands used for each step in the entity matching process for laptop products from Amazon and Walmart. 
We used the Basic EM workflow 3 as our guide for this process.

# Step 1: Reading in the input tables A, B.

In [1]:
import sys
import py_entitymatching as em
import pandas as pd
import os

path_A = 'data/amazon_products.csv'
path_B = 'data/walmart_products.csv'

# Load the csv files as dataframes and set the key attribute in the dataframe
A = em.read_csv_metadata(path_A, key='id')
B = em.read_csv_metadata(path_B, key='id')
print('len(A):' + str(len(A)))
print('len(B):' + str(len(B)))
print('len (A X B):' + str(len(A)*len(B)))

len(A):3000
len(B):4847
len (A X B):14541000


# Step 2: Block tables to get candidate set

In this step, we apply a blocking sequence on the input tables to get the candidate set C.

In [2]:
output_attrs = ['id','product_title','price','brand','model','operating_system','extended_title']

In [3]:
# Create an initial blocker
ob = em.OverlapBlocker()

# Use block_tables to apply blocking over two input tables.
C1 = ob.block_tables(A,B, 'model', 'model', 
                     l_output_attrs=output_attrs, 
                     r_output_attrs=output_attrs,
                     overlap_size=8,
                     q_val=5,
                     word_level=False,
                     show_progress=False,
                     n_jobs=-1
                     )
len(C1)

2574

## Debugging blocker output

In [4]:
# # Debug blocker output
# dbg = em.debug_blocker(C1, A, B, 
#                         attr_corres=[
#                             ('product_title','product_title'),
#                             ('brand', 'brand'), 
#                             ('model','model'),
#                             ('extended_title','extended_title'),
#                             ('price', 'price')
#                         ],
#                         verbose=True)
# ### Display first few tuple pairs from the debug_blocker's output
# dbg

Looking at the debug blocker's output, we observe that the initial blocker is dropping a lot of potential matches.
Blocking on the column brand alone seems incorrect.

In [5]:
# # Create overlap blocker
ob = em.OverlapBlocker()

# # Block tables using 'extended_title' attribute 
C2 = ob.block_tables(A, B, 'extended_title', 'extended_title', 
                     l_output_attrs=output_attrs, 
                     r_output_attrs=output_attrs,
                     overlap_size=25,
                     q_val=5,
                     word_level=False,
                     show_progress=False,
                     n_jobs=-1
                    )
# Updated blocking sequence
# A, B ------ overlap blocker [extended_title] ---------> C1--|
#                                                             |----> C
# A, B ------ overlap blocker [model] ------------------> C2--|

C = em.combine_blocker_outputs_via_union([C1,C2])
print ('Reduction on AxB by applying the blocker:' + str(int((len(A)*len(B))/len(C))) + 'x')

Reduction on AxB by applying the blocker:10x


In [6]:
# # Debug blocker output
# dbg = em.debug_blocker(C, A, B, 
#                         attr_corres=[
#                             ('product_title','product_title'),
#                             ('brand', 'brand'), 
#                             ('model','model'),
#                             ('extended_title','extended_title'),
#                             ('price', 'price')
#                         ],
#                         verbose=True)
# ### Display first few tuple pairs from the debug_blocker's output
# dbg

The blocker seem to have reduced the input size by 45x. However, on debugging the blocker we observe that it is still dropping potential matches. So, we are going to debug the blocker further.

In [7]:
def match_extended_title(attribute='extended_title',q_val=3, threshold=.5,debug=False):
    def jaccard_matcher(ltuple,rtuple, attribute=attribute, q_val=q_val, threshold=threshold,debug=debug):
        buffer = '#' * (q_val-1)
        l_attribute = buffer + ltuple[attribute] + buffer
        r_attribute = buffer + rtuple[attribute] + buffer
        l_grams = set()
        r_grams = set()
        # create sets of grams
        for attribute, grams in [(l_attribute,l_grams), (r_attribute,r_grams)]:
            for i in range(0,len(attribute)-(q_val-1)):
                grams.add(attribute[i:i+q_val])
               
        # compute jaccard
        intersection = list(set(l_grams) & set(r_grams))
        union = list(set(l_grams) | set(r_grams))
        if debug:
            print(union)
            print(intersection)
            print(len(intersection) / len(union))
        return len(intersection) / len(union) < threshold
    
    return jaccard_matcher

bb = em.BlackBoxBlocker()
bb.set_black_box_function(match_extended_title())
C3 = bb.block_candset(C2,  
                     show_progress=False,
                     n_jobs=-1
                     )
# Updated blocking sequence
# A, B --- overlap blocker [model] ---------------------> C1--------------------------------|
#                                                                                     union |---> C
# A, B --- overlap blocker [extended_title] ---> C2---> jaccard blocker [extended_title] ---|
C = em.combine_blocker_outputs_via_union([C1,C3])

In [8]:
# Debug again
# dbg = em.debug_blocker(C, A, B)
# dbg

Looking at the dedug output, we observe that the current blocking sequence does not seem to drop a lot of potential matches and has also reduced the candidate set to a good extent. Thus, we stop the blocking step and proceed with the matching step.

# Step 3: Reading in the labeled sample

We randomly sample 450 tuple pairs for labeling purposes and write the sample data to a csv. We then manually labeled the csv and use the labeled csv from then on.

In [9]:
# # Sample  candidate set
# S = em.sample_table(C, 450)
## em.to_csv_metadata(S, 'data/labeled.csv')

In [10]:
# # Loading the saved labeled data
path_S = 'data/labeled.csv'
S = em.read_csv_metadata(path_S, 
                         key='_id',
                         ltable=A, rtable=B, 
                         fk_ltable='ltable_id', fk_rtable='rtable_id')

#showing some examples of labeled sample
S.head(5)

Unnamed: 0,_id,ltable_id,rtable_id,ltable_model,ltable_extended_title,rtable_model,rtable_extended_title,match
0,5,a0,w2300,nh.q28aa.001,acer predator helios 300 15.6 full hd intel core i7 7700hq 16gb ddr4 256gb ssd geforce gtx 1060 ...,nh.q1caa.001,acer 15.6 intel core i7 2.6ghz 16 gb 1 tb hdd 256 gb ssd windows 10 g9 593 72vt nh.q1caa.001,1
1,26,a1011,w685,t8tjg,2018 lenovo thinkpad 11e 4h 11.6 intel i3 7100u 128gb m.2 ssd 4gb ddr4 802.11ac win10 t8tjg windows,t8tjg,dell chromebook 11 3189 11.6 celeron n3060 4 gb 64 gb ssd t8tjg os,0
2,30,a102,w1746,c300sa dh02,asus chromebook c300sa compact 13.3 intel celeron 4gb 16gb emmc asus c300sa dh02,c300sa dh02,c300sa dh02 cel n3060 4gb 16gb 13.3in chrome asus chrome,1
3,44,a1031,w4581,a515 51 596k,acer 15.6 intel core i5 3.40ghz 8gb 256gb ssd windows 10 a515 51 596k home,nx.gnpaa.016,refurbished acer aspire 3 intel core i5 2.5 ghz 8gb 256 gb ssd windows 10 nx.gnpaa.016,1
4,65,a1047,w1400,n850hp6,prostar clevo n850hp6 15.6” fhd ips 1920x1080 intel core i7 7700hq 16gb ddr4 gtx 1060 120gb ssd ...,n855hj,prostar clevo n855hj 15.6” full hd 1920x1080 intel core i7 7700hq 8gb ddr4 gtx 1050 1tb hdd wind...,1


# Step 4: Splitting labeled data into development and evaluation set

In this step, we split the labeled data set S into a development set I and an evaluation set J.

In [11]:
IJ = em.split_train_test(S, train_proportion=0.7, random_state=0)
I = IJ['train']
J = IJ['test']

# Step 5: Creating a set of ML-matchers.

In [12]:
# Create a set of ML-matchers
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')
nb = em.NBMatcher(name='NaiveBayes')

# Step 6: Selecting the best matcher using I.

This step includes:
    1. Creating a feature table F.
    2. Converting I into a set H of feature vectors (using the features in F).
    3. Selecting the best matcher in the first iteration using cross-validation.
    4. Debugging the matcher
    5. Selecting the best matcher again using cross-validation.    
Repeating the steps 4 and 5 for all the debug iterations.

## Initial iteration:
### a. Creating a feature table F.

In [13]:
match_t = em.get_tokenizers_for_matching([2,3,5])
match_s = em.get_sim_funs_for_matching()
atypes1 = em.get_attr_types(A)
atypes2 = em.get_attr_types(B)
match_c = em.get_attr_corres(A, B)
feature_table = em.get_features(A, B, atypes1, atypes2, match_c, match_t, match_s)

### b. Converting I into a set H of feature vectors (using the features in F).

In [14]:
# Convert the I into a set of feature vectors using F
H = em.extract_feature_vecs(I, 
                            feature_table=feature_table, 
                            attrs_after='match',
                            show_progress=False)
#Filling in the missing values if any in H.
H.fillna(0, inplace=True)

### c. Selecting the best matcher in the first iteration using cross-validation

In [15]:
# Select the best ML matcher using CV
result = em.select_matcher(
        matchers=[dt, rf, svm, ln, lg, nb], 
        table=H, 
        exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'match'],
        k=5,
        target_attr='match', 
        metric_to_select_matcher='precision', 
        random_state=0)
# result
result['cv_stats']

Unnamed: 0,Matcher,Average precision,Average recall,Average f1
0,DecisionTree,0.911848,0.904373,0.907634
1,RF,0.925321,0.953063,0.937948
2,SVM,0.869084,0.991349,0.925726
3,LinReg,0.898176,0.939738,0.918227
4,LogReg,0.900212,0.951679,0.924574
5,NaiveBayes,0.932588,0.643338,0.760778


### d. Debugging matcher

In [16]:
#  Split feature vectors into train and test
UV = em.split_train_test(H, train_proportion=0.5)
U = UV['train']
V = UV['test']

In [17]:
# Debug rf using GUI
#em.vis_debug_dt(result['selected_matcher'], U, V, 
#                exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'match'],
#                target_attr='match')

Making features from non-attributes like the IDs of each row resulted in false positive matches due to high similarity between the ids. There are also some dirty attributes which had erroneus values resulting in false negatives.

## Iteration 2:
### a. Creating a feature table F.

In [18]:
# Remove bad features from auto-feature-generation and trying different tokenizers for matching
AA = A.drop(['id','product_title','model','operating_system','price'],axis=1)
BB = B.drop(['id','product_title','model','operating_system','price'],axis=1)

match_t = em.get_tokenizers_for_matching([3,5,10])
match_s = em.get_sim_funs_for_matching()
atypes1 = em.get_attr_types(AA)
atypes2 = em.get_attr_types(BB)
match_c = em.get_attr_corres(AA, BB)
feature_table = em.get_features(AA, BB, atypes1, atypes2, match_c, match_t, match_s)

### b. Converting I into a set H of feature vectors (using the features in F).

In [19]:
# Convert the I into a set of feature vectors using F
H = em.extract_feature_vecs(I, 
                            feature_table=feature_table, 
                            attrs_after='match',
                            show_progress=False)
#Filling in the missing values if any in H.
H.fillna(0, inplace=True)

### c. Selecting the best matcher using cross-validation

In [20]:
# Select the best ML matcher using CV
result = em.select_matcher(
        matchers=[dt, rf, svm, ln, lg, nb], 
        table=H, 
        exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'match'],
        k=5,
        target_attr='match', 
        metric_to_select_matcher='precision', 
        random_state=0)
# result
result['cv_stats']

Unnamed: 0,Matcher,Average precision,Average recall,Average f1
0,DecisionTree,0.920775,0.921306,0.918756
1,RF,0.911768,0.920965,0.914984
2,SVM,0.824195,0.988857,0.898029
3,LinReg,0.88149,0.954061,0.91412
4,LogReg,0.857673,0.988857,0.916457
5,NaiveBayes,0.879022,0.931885,0.902879


### d. Debugging matcher

In [21]:
#  Split feature vectors into train and test
UV = em.split_train_test(H, train_proportion=0.5)
U = UV['train']
V = UV['test']
result['drill_down_cv_stats']['precision']['Matcher'][0]

<py_entitymatching.matcher.dtmatcher.DTMatcher at 0x7f30a1e97588>

In [22]:
# Debug rf using GUI
    #em.vis_debug_dt(result['drill_down_cv_stats']['precision']['Matcher'][0], U, V, 
#                exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'match'],
#                target_attr='match')

Previous iteration seems to have improved F score and recall. Debugging further. 
A lot of useful information seems to be lost because of data being dirty.
So trying to add more features using custom functions.

### Iteration 3:
### a. Creating a feature table F.

In [23]:
# Remove bad features from auto-feature-generation and trying different tokenizers for matching
AA = A.drop(['id','product_title','model','operating_system'],axis=1)
BB = B.drop(['id','product_title','model','operating_system'],axis=1)
match_t = em.get_tokenizers_for_matching([3,5,10])
match_s = em.get_sim_funs_for_matching()
atypes1 = em.get_attr_types(AA)
atypes2 = em.get_attr_types(BB)
match_c = em.get_attr_corres(AA, BB)
feature_table = em.get_features(AA, BB, atypes1, atypes2, match_c, match_t, match_s)

This function generates a function that returns 1 if both or neither tuples' attribute contain any of the passed in values and 0 otherwise.

In [24]:
def generateContainsValueFeature(values, name=None, attribute='extended_title'):
    if type(values) is str:
        values = [values]
    def containsValueFeature(a,b):
        return int(any([value.lower() in a[attribute].lower() for value in values]) 
                   == any([value.lower() in b[attribute].lower() for value in values]))
    return containsValueFeature, name if name else values[0]

Use this to generate many new features.

In [25]:
brands = ['lg','toshiba','hp','dell','lenovo','prostar','acer','samsung','apple','asus','panasonic','msi']
models = ['zephyrus','zenbook','flex','omen','xps','x1','carbon','yoga',
          'latitude','inspiron','elitebook','clevo','spectre',
          'macbook','pavilion','ideapad','legion']
thinkpads = [['p40','p50','p51','p71'],['430','460','470','560','570']]
asus = ['swift','aspire','spin']
models = models + thinkpads + asus
sizes = [' 13',' 14',' 15',' 17']
operating_systems = ['chrome','windows','mac']
cpus = [['i3','i5','i7'],['celeron','pentium'],['m3','m5'],'amd']
miscellaneous = ['2-in-1','gtx','touch']
keywords = models \
    + sizes  \
    + operating_systems \
    + cpus \
    + miscellaneous
new_features = [generateContainsValueFeature(value) for value in keywords]

for feature in new_features:
    em.add_blackbox_feature(feature_table, feature[1], feature[0])

This function generates a function that returns 1 if both tuples contain the same value for any of the values passed in and 0 otherwise.

In [26]:
def generateContainsValueFromValuesetFeature(valueset, name=None, attribute='extended_title'):
    def sharesValue(a,b,values):
        if type(values) is str:
            values = [values]
        return int(any([value.lower() in a[attribute].lower() and value.lower() in b[attribute].lower() for value in values]))
    def containsValueFromValueset(a,b):
        return any([sharesValue(a,b,values) for values in valueset[1]])
    return containsValueFromValueset, valueset[0]

In [27]:
valuesets = [('brands',brands), 
             ('models', models), 
             ('sizes', sizes), 
             ('cpus', cpus), 
             ('operating_systems',operating_systems)]

new_features = [generateContainsValueFromValuesetFeature(valueset) for valueset in valuesets]

for feature in new_features:
    em.add_blackbox_feature(feature_table, feature[1], feature[0])

### b. Converting I into a set H of feature vectors (using the features in F).

In [28]:
# Convert the I into a set of feature vectors using F
H = em.extract_feature_vecs(I, 
                            feature_table=feature_table, 
                            attrs_after='match',
                            show_progress=False)
#Filling in the missing values if any in H.
H.fillna(0, inplace=True)

### c. Selecting the best matcher using cross-validation

In [29]:
# Select the best ML matcher using CV
result = em.select_matcher(
        matchers=[dt, rf, svm, ln, lg, nb], 
        table=H, 
        exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'match'],
        k=5,
        target_attr='match', 
        metric_to_select_matcher='precision', 
        random_state=0)
# result
result['cv_stats']

Unnamed: 0,Matcher,Average precision,Average recall,Average f1
0,DecisionTree,0.957928,0.944694,0.950609
1,RF,0.948328,0.968857,0.958287
2,SVM,0.852276,0.972744,0.908068
3,LinReg,0.943323,0.955533,0.948794
4,LogReg,0.921289,0.964,0.941185
5,NaiveBayes,0.953247,0.924624,0.938391


Previous iteration seems to have improved F score, precision, and recall. Stopping further debugging.

# Step 7: Evaluating the best matcher Y using J. 

This step includes:
1. Converting J into a set L of feature vectors.
2. Filling in the missing values if any in L.
3. Training the best matcher Y using I.
4. Computing the accuracy of Y


## a. Converting J into a set L of feature vectors.

As before, we convert to the feature vectors (using the feature table and the evaluation set)

In [30]:
# Convert J into a set of feature vectors using feature table
L = em.extract_feature_vecs(
        J, 
        feature_table=feature_table,
        attrs_after='match', 
        show_progress=False)

## b. Filling in the missing values if any in L.

In [31]:
#Filling in the missing values if any in L.
L.fillna(0, inplace=True)
L.head(100)

Unnamed: 0,_id,ltable_id,rtable_id,price_price_jac_qgm_3_qgm_3,price_price_cos_dlm_dc0_dlm_dc0,price_price_jac_dlm_dc0_dlm_dc0,price_price_mel,price_price_lev_dist,price_price_lev_sim,price_price_nmw,...,amd,2-in-1,gtx,touch,brands,models,sizes,cpus,operating_systems,match
124,1764,a1561,w15,0.000000,0.0,0.0,0.000000,7.0,0.000000,-6.0,...,1,0,1,1,True,True,False,True,True,0
54,705,a1269,w3092,0.157895,0.0,0.0,0.792593,4.0,0.555556,5.0,...,1,1,1,0,True,True,True,True,True,1
268,4200,a2294,w1544,0.571429,0.0,0.0,0.955556,1.0,0.888889,8.0,...,1,1,1,1,True,True,True,True,True,1
293,4614,a2439,w3188,0.157895,0.0,0.0,0.844444,4.0,0.555556,5.0,...,1,1,1,1,True,True,True,True,True,1
230,3469,a1997,w3210,0.157895,0.0,0.0,0.844444,4.0,0.555556,5.0,...,1,1,1,1,True,True,True,True,True,1
134,1896,a164,w1994,0.058824,0.0,0.0,0.571429,5.0,0.285714,2.0,...,1,1,1,0,True,False,True,True,True,1
12,143,a1059,w1492,0.294118,0.0,0.0,0.800000,3.0,0.666667,6.0,...,1,1,0,1,True,False,False,True,True,1
423,6475,a752,w2533,0.125000,0.0,0.0,0.695238,4.0,0.428571,3.0,...,1,1,1,1,True,True,True,True,True,1
272,4274,a2361,w1419,0.000000,0.0,0.0,0.000000,9.0,0.000000,-8.0,...,1,1,1,1,True,True,True,True,True,1
76,985,a1329,w1543,0.571429,0.0,0.0,0.948148,1.0,0.888889,8.0,...,1,1,1,1,True,True,False,True,True,0


## c. Predicting the matches

### For each of the six learning methods, we train the matcher based on that method on I, and then report its precision/recall/F-1 on J. 

In [32]:
for i in range (0,6):
    learning_method = result['drill_down_cv_stats']['precision']['Matcher'][i]
    
    # we train the matcher based on that method on I
    learning_method.fit(table=H, 
           exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'match'], 
           target_attr='match')

    # we then report its precision/recall/F-1 on J.
    predictions = learning_method.predict(
            table=L, 
            exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'match'], 
            append=True, 
            target_attr='predicted', 
            inplace=False)

    # Evaluate the predictions
    eval_result = em.eval_matches(predictions, 'match', 'predicted')
    print('Learning method:' + result['drill_down_cv_stats']['precision']['Name'][i])
    em.print_eval_summary(eval_result)
    print('')

Learning method:DecisionTree
Precision : 88.66% (86/97)
Recall : 92.47% (86/93)
F1 : 90.53%
False positives : 11 (out of 97 positive predictions)
False negatives : 7 (out of 38 negative predictions)

Learning method:RF
Precision : 87.88% (87/99)
Recall : 93.55% (87/93)
F1 : 90.62%
False positives : 12 (out of 99 positive predictions)
False negatives : 6 (out of 36 negative predictions)

Learning method:SVM
Precision : 77.88% (88/113)
Recall : 94.62% (88/93)
F1 : 85.44%
False positives : 25 (out of 113 positive predictions)
False negatives : 5 (out of 22 negative predictions)

Learning method:LinReg
Precision : 93.55% (87/93)
Recall : 93.55% (87/93)
F1 : 93.55%
False positives : 6 (out of 93 positive predictions)
False negatives : 6 (out of 42 negative predictions)

Learning method:LogReg
Precision : 87.88% (87/99)
Recall : 93.55% (87/93)
F1 : 90.62%
False positives : 12 (out of 99 positive predictions)
False negatives : 6 (out of 36 negative predictions)

Learning method:NaiveBayes
Pre

### For the final best matcher Y selected, train it on I, then report its precision/recall/F-1 on J.

In [33]:
# Train best matcher on I 
print ('Final best matcher Y:')
result['selected_matcher'].fit(table=H, 
       exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'match'], 
       target_attr='match')

# Predict on L 
predictions = result['selected_matcher'].predict(
        table=L, 
        exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'match'], 
        append=True, 
        target_attr='predicted', 
        inplace=False)

# Evaluate the predictions
eval_result = em.eval_matches(predictions, 'match', 'predicted')
em.print_eval_summary(eval_result)

Final best matcher Y:
Precision : 88.66% (86/97)
Recall : 92.47% (86/93)
F1 : 90.53%
False positives : 11 (out of 97 positive predictions)
False negatives : 7 (out of 38 negative predictions)


In [34]:
# Convert J into a set of feature vectors using feature table
C_vecs = em.extract_feature_vecs(
        C, 
        feature_table=feature_table, 
        show_progress=False)
C_vecs.fillna(0, inplace=True)
C_vecs.head()

Unnamed: 0,_id,ltable_id,rtable_id,price_price_jac_qgm_3_qgm_3,price_price_cos_dlm_dc0_dlm_dc0,price_price_jac_dlm_dc0_dlm_dc0,price_price_mel,price_price_lev_dist,price_price_lev_sim,price_price_nmw,...,m3,amd,2-in-1,gtx,touch,brands,models,sizes,cpus,operating_systems
0,0,a0,w1389,0.0,0.0,0.0,0.0,9.0,0.0,-8.0,...,1,1,1,0,1,True,False,False,True,True
1,1,a0,w1396,0.0,0.0,0.0,0.0,9.0,0.0,-8.0,...,1,1,1,0,1,True,False,False,True,True
2,2,a0,w1504,0.047619,0.0,0.0,0.75,5.0,0.444444,4.0,...,1,1,1,0,1,True,False,False,True,True
3,3,a0,w1516,0.047619,0.0,0.0,0.666667,5.0,0.444444,4.0,...,1,1,1,0,1,True,False,False,True,True
4,4,a0,w1661,0.047619,0.0,0.0,0.7,6.0,0.333333,3.0,...,1,1,1,0,1,True,False,False,True,True


In [35]:
# Predict on C 
predictions = result['selected_matcher'].predict(
        table=C_vecs, 
        exclude_attrs=['_id', 'ltable_id', 'rtable_id'], 
        append=True, 
        target_attr='predicted', 
        inplace=False)

In [36]:
C['predicted_match'] = predictions['predicted']
predicted_matches = C.loc[C['predicted_match'] != 0]
em.to_csv_metadata(predicted_matches, 'data/predicted_matches.csv')

File already exists at data/predicted_matches.csv; Overwriting it
Metadata file already exists at data/predicted_matches.metadata. Overwriting it


True