# Experiments Using Synthetic Dataset

In [11]:
import sys
from os import path
sys.path.insert(0, '../src')
import one_to_n
sys.path.insert(0, '../tests')
import create_synthetic_dataset
import datetime
import textdistance
import editdistance
import pandas as pd
import networkx as nx
import re

### 1. Bipartite Matching

In [12]:
# Build the first synthetic table; 10 is passed to the generator and the
# printed output of this cell shows 10 rows.
table_a = create_synthetic_dataset.create_first_df(10)

# Second table before typos are injected (same size argument).
table_b_non_typo = create_synthetic_dataset.create_second_df(10)

# Final second table. In the printed output its names are table_a names with
# short random-looking suffixes appended.
# NOTE(review): no random seed is set anywhere in the visible cells, so the
# tables (and every result below) presumably change on each run -- confirm.
table_b = create_synthetic_dataset.add_typo(table_a, table_b_non_typo)

print(table_a)
print(table_b)

# Time only the graph construction call (wall-clock, via datetime).
# NOTE(review): "name" is presumably the join-key column and 0.5 the
# similarity threshold -- confirm against one_to_n.
now = datetime.datetime.now()
bipartite_graph_result = one_to_n.keycomp_treshold_updated_maximal_construct_graph(table_a, table_b, "name", 0.5)
timing_tresh = (datetime.datetime.now()-now).total_seconds()
print("---- Timing for Graph Construction with Treshold Constraint ----")
print(timing_tresh,"seconds")

                 name age
0    Lorraine Herrera   0
1         Mike Henson   0
2         Jeremy Shaw   8
3     Amy Christensen   1
4       Jeremy Morris   7
5           Mark Moon   8
6  Elizabeth Valencia   1
7   Kimberly Marshall   1
8       Patricia Haas   1
9       Luis Robinson   7
                    name  age
0      Lorraine HerreraY    9
1     Lorraine HerreraXO    5
2         Mike HensonfjO    1
3         Jeremy ShawHYN    7
4     Amy ChristensenGYi    1
5       Jeremy MorrisVTD    4
6           Mark MoonVYN    7
7  Elizabeth ValenciabAh    0
8   Kimberly MarshallFnX    2
9       Patricia HaasFyv    5
---- Timing for Graph Construction with Treshold Constraint ----
0.00945 seconds


In [14]:
def _node_value(node_label):
    """Return the numeric value encoded in a node label as a float.

    Labels seen in this notebook's output look like
    ``'Mark Moon_2_(8,)_1'`` (name, index, value, table id) or
    ``'Mark MoonVYN_(7,)_2'`` (name, value, table id). In both layouts the
    value is the second field counted from the right, so it is read from the
    right; that keeps working when the name itself contains an underscore.

    Raises:
        IndexError: if the label contains no underscore at all.
        ValueError: if the selected field contains no digits.
    """
    parts = node_label.split("_")
    # Fewer than three fields: fall back to the second field (same field the
    # fixed-index lookup would pick for such a label).
    field = parts[-2] if len(parts) >= 3 else parts[1]
    # Keep digits only, e.g. "(8,)" -> "8". Signs and decimal points are
    # dropped as well.
    digits = re.sub("[^0-9]", "", field)
    if not digits:
        raise ValueError(
            "no numeric value found in node label {!r}".format(node_label)
        )
    return float(digits)


def SUM_edit_edge_weight(bip_graph):
    """Set every edge's ``'weight'`` to the sum of its two endpoint values.

    Args:
        bip_graph: graph-like object whose ``edges(data=True)`` yields
            ``(u, v, attr_dict)`` triples with string node labels.

    Returns:
        The same ``bip_graph`` object; the attribute dicts yielded by
        ``edges(data=True)`` are updated in place, no copy is made.

    Raises:
        IndexError, ValueError: if a node label cannot be parsed
            (see ``_node_value``).
    """
    for u, v, d in bip_graph.edges(data=True):
        d['weight'] = _node_value(u) + _node_value(v)

    return bip_graph

# SUM_edit_edge_weight returns the graph object it was given, so
# sum_weighted_graph and bipartite_graph_result are the same object from here on.
sum_weighted_graph = SUM_edit_edge_weight(bipartite_graph_result)

## Maximal (Maximum-Weight) Matching

In [15]:
# Compute a maximum-weight matching on the SUM-weighted graph and time the call.
print("\n\n 'SUM' MAXIMAL MATCHING:")
now = datetime.datetime.now()
# NOTE(review): called with default arguments, so maximum cardinality is
# presumably not enforced -- confirm that is intended.
matching_set_maximal = nx.algorithms.matching.max_weight_matching(sum_weighted_graph)
# Only the matching call sits between the two timestamps.
timing_match = (datetime.datetime.now()-now).total_seconds()
print("The Maximal Matching Set is:", matching_set_maximal, "\n")
print("---- Timing for Matching (Done on the graph constructed with the treshold constraint) ----")
print(timing_match,"seconds")
print("The number of edges in the graph is:", sum_weighted_graph.number_of_edges(), "\n")


# print("The Maximal Matching Set is:", matching_set_maximal, "\n")



 'SUM' MAXIMAL MATCHING:
The Maximal Matching Set is: {('Mark MoonVYN_(7,)_2', 'Jeremy Shaw_2_(8,)_1'), ('Mark Moon_2_(8,)_1', 'Lorraine HerreraY_(9,)_2'), ('Mark Moon_0_(8,)_1', 'Lorraine HerreraXO_(5,)_2'), ('Patricia HaasFyv_(5,)_2', 'Jeremy Shaw_1_(8,)_1'), ('Jeremy Morris_2_(7,)_1', 'Kimberly MarshallFnX_(2,)_2'), ('Jeremy Shaw_0_(8,)_1', 'Elizabeth ValenciabAh_(0,)_2'), ('Luis Robinson_1_(7,)_1', 'Amy ChristensenGYi_(1,)_2'), ('Jeremy Morris_1_(7,)_1', 'Mike HensonfjO_(1,)_2'), ('Luis Robinson_2_(7,)_1', 'Jeremy MorrisVTD_(4,)_2'), ('Jeremy ShawHYN_(7,)_2', 'Mark Moon_1_(8,)_1')} 

---- Timing for Matching (Done on the graph constructed with the treshold constraint) ----
0.013824 seconds
The number of edges in the graph is: 300 



## Minimal (Minimum-Weight) Matching

In [16]:
def minimal_matching(sum_weighted_graph, maxcardinality=False):
    """Find a low-total-weight matching by inverting weights and maximising.

    Every edge weight ``w`` is replaced by ``max_weight - w`` on a copy of the
    graph, and a maximum-weight matching is computed on that copy.

    Args:
        sum_weighted_graph: graph whose edges all carry a ``'weight'``
            attribute. It is copied first; the weights are rewritten on the
            copy only.
        maxcardinality: forwarded to ``max_weight_matching``. The default
            ``False`` is what a bare ``max_weight_matching(graph)`` call uses.
            After inversion the heaviest original edge has weight 0, so with
            ``False`` such edges may be left out of the result; pass ``True``
            to require a maximum-cardinality matching.

    Returns:
        The set of matched node pairs; an empty set if the graph has no edges.

    Raises:
        KeyError: if an edge has no ``'weight'`` attribute.
    """
    new_graph = sum_weighted_graph.copy()

    weights = [d['weight'] for _, _, d in new_graph.edges(data=True)]
    if not weights:
        # No edges: nothing can be matched, and max() would raise on [].
        return set()

    max_weight = max(weights)
    for _, _, d in new_graph.edges(data=True):
        # Invert so that the lightest original edge becomes the heaviest.
        d['weight'] = max_weight - d['weight']

    return nx.algorithms.matching.max_weight_matching(
        new_graph, maxcardinality=maxcardinality
    )

In [17]:
# Compute the weight-inverted ("minimal") matching and time the call.
print("\n\n 'SUM' MINIMAL MATCHING RESULTS:")
# Print whether networkx considers the graph bipartite.
print(nx.bipartite.is_bipartite(sum_weighted_graph))
now = datetime.datetime.now()
matching_set_minimal = minimal_matching(sum_weighted_graph)
# Overwrites the timing_match value recorded for the maximal matching.
timing_match = (datetime.datetime.now()-now).total_seconds()
print("The Minimal Matching Set is:", matching_set_minimal, "\n")
print("---- Timing for Matching (Done on the graph constructed with the treshold constraint) ----")
print(timing_match,"seconds")
print("The number of edges in the graph is:", sum_weighted_graph.number_of_edges(), "\n")



 'SUM' MINIMAL MATCHING RESULTS:
True
The Minimal Matching Set is: {('Jeremy MorrisVTD_(4,)_2', 'Mike Henson_0_(0,)_1'), ('Jeremy ShawHYN_(7,)_2', 'Elizabeth Valencia_2_(1,)_1'), ('Mike Henson_2_(0,)_1', 'Elizabeth ValenciabAh_(0,)_2'), ('Amy Christensen_0_(1,)_1', 'Lorraine HerreraXO_(5,)_2'), ('Lorraine Herrera_1_(0,)_1', 'Kimberly MarshallFnX_(2,)_2'), ('Mark MoonVYN_(7,)_2', 'Kimberly Marshall_2_(1,)_1'), ('Patricia HaasFyv_(5,)_2', 'Patricia Haas_2_(1,)_1'), ('Amy ChristensenGYi_(1,)_2', 'Lorraine Herrera_2_(0,)_1'), ('Lorraine Herrera_0_(0,)_1', 'Lorraine HerreraY_(9,)_2'), ('Mike Henson_1_(0,)_1', 'Mike HensonfjO_(1,)_2')} 

---- Timing for Matching (Done on the graph constructed with the treshold constraint) ----
0.015914 seconds
The number of edges in the graph is: 300 



In [27]:
def fetch_sum(bip_graph, matching):
    """Collect the weight of every graph edge that appears in a matching.

    Args:
        bip_graph: graph-like object whose ``edges(data=True)`` yields
            ``(u, v, attr_dict)`` triples carrying a ``'weight'`` entry.
        matching: container of node pairs, tested with ``in``.

    Returns:
        A list of ``[first, second, weight]`` rows in edge-iteration order.
        Each row keeps the pair's orientation as stored in ``matching``; an
        edge whose pair is present in both orientations yields two rows.
    """
    matched_rows = []
    for src, dst, attrs in bip_graph.edges(data=True):
        # Check the edge as stored first, then its reversed orientation.
        for pair in ((src, dst), (dst, src)):
            if pair in matching:
                matched_rows.append([pair[0], pair[1], attrs['weight']])
    return matched_rows

# Look up the SUM weight of every matched pair, for both matchings.
out_max = fetch_sum(sum_weighted_graph, matching_set_maximal)
out_min = fetch_sum(sum_weighted_graph, matching_set_minimal)

# One [node, node, weight] row per line.
print("\n MAX MATCHES WITH SUM WEIGHTS")
for val in out_max:
    print(val)
    
print("\n MIN MATCHES WITH SUM WEIGHTS")
for val2 in out_min:
    print(val2)

def formatted_output(out_max, out_min):
    """Group matched pairs by the name of their four-field node.

    Each row is ``(node, node, weight)`` with node labels split on ``"_"``.
    For every node in a row whose label splits into exactly four fields, an
    entry ``(partner_name, tag, weight)`` is appended under that node's first
    field, where ``partner_name`` is the first field of the other node.

    Args:
        out_max: rows from the maximal matching, tagged ``"max"``.
        out_min: rows from the minimal matching, tagged ``"min"``.

    Returns:
        A dict mapping name -> list of ``(partner_name, tag, weight)`` tuples.
        All ``"min"`` rows are processed before any ``"max"`` row.
    """
    grouped = {}
    for tag, rows in (("min", out_min), ("max", out_max)):
        for left, right, weight in rows:
            left_fields = left.split("_")
            right_fields = right.split("_")
            left_name, right_name = left_fields[0], right_fields[0]

            if len(left_fields) == 4:
                grouped.setdefault(left_name, []).append((right_name, tag, weight))

            if len(right_fields) == 4:
                grouped.setdefault(right_name, []).append((left_name, tag, weight))
    return grouped
            
    
# Group both matchings by name; each entry is tagged "min" or "max".
form_output = formatted_output(out_max,out_min)

# One line per name followed by its list of (partner, tag, weight) tuples.
print("\n FORMAL OUTPUT")
for i,val in form_output.items():
    print(i, val)


 MAX MATCHES WITH SUM WEIGHTS
['Mark Moon_2_(8,)_1', 'Lorraine HerreraY_(9,)_2', 17.0]
['Mark Moon_0_(8,)_1', 'Lorraine HerreraXO_(5,)_2', 13.0]
['Jeremy Morris_1_(7,)_1', 'Mike HensonfjO_(1,)_2', 8.0]
['Jeremy ShawHYN_(7,)_2', 'Mark Moon_1_(8,)_1', 15.0]
['Luis Robinson_1_(7,)_1', 'Amy ChristensenGYi_(1,)_2', 8.0]
['Luis Robinson_2_(7,)_1', 'Jeremy MorrisVTD_(4,)_2', 11.0]
['Mark MoonVYN_(7,)_2', 'Jeremy Shaw_2_(8,)_1', 15.0]
['Jeremy Shaw_0_(8,)_1', 'Elizabeth ValenciabAh_(0,)_2', 8.0]
['Jeremy Morris_2_(7,)_1', 'Kimberly MarshallFnX_(2,)_2', 9.0]
['Patricia HaasFyv_(5,)_2', 'Jeremy Shaw_1_(8,)_1', 13.0]

 MIN MATCHES WITH SUM WEIGHTS
['Lorraine Herrera_0_(0,)_1', 'Lorraine HerreraY_(9,)_2', 9.0]
['Amy Christensen_0_(1,)_1', 'Lorraine HerreraXO_(5,)_2', 6.0]
['Mike Henson_1_(0,)_1', 'Mike HensonfjO_(1,)_2', 1.0]
['Jeremy ShawHYN_(7,)_2', 'Elizabeth Valencia_2_(1,)_1', 8.0]
['Amy ChristensenGYi_(1,)_2', 'Lorraine Herrera_2_(0,)_1', 1.0]
['Jeremy MorrisVTD_(4,)_2', 'Mike Henson_0_(0,)

### 2. Naive Matching Performance Evaluation

In [None]:
# Not read in this cell; the random-sampling cell below passes it to matcher.matcher.
sample_size = 1000

# NOTE(review): `matcher`, `amzn`, `goog`, `core` and `analyze` are not
# imported or defined in any visible cell, and no catalog-loading code is
# visible despite the message below -- this cell presumably fails on a fresh
# kernel; confirm where these names come from.
print('Loaded catalogs.')
print('Performing compare all match (edit distance)...')
# Time the compare-all matcher with edit distance as the similarity function.
now = datetime.datetime.now()
compare_all_edit_match = matcher.matcher(amzn,goog,editdistance.eval, matcher.all)
naive_time_edit = (datetime.datetime.now()-now).total_seconds()
print("Naive Edit Distance Matching computation time taken: ", naive_time_edit, " seconds")
print('Compare All Matcher (Edit Distance) Performance: ' + str(core.eval_matching(compare_all_edit_match)))



# Same measurement with analyze.jaccard_calc as the similarity function.
print('Performing compare all match (jaccard distance)...')
now = datetime.datetime.now()
compare_all_jaccard_match = matcher.matcher(amzn,goog,analyze.jaccard_calc, matcher.all)
naive_time_jaccard = (datetime.datetime.now()-now).total_seconds()
print("Naive Jaccard Matching computation time taken: ", naive_time_jaccard, " seconds")
print('Compare All Matcher (Jaccard Distance) Performance: ' + str(core.eval_matching(compare_all_jaccard_match)))

### 3. Random Sampling Matching Performance Evaluation

In [None]:
# Random-sample matcher with edit distance; sample_size comes from the previous cell.
# NOTE(review): the result is stored in compare_all_edit_match, which
# overwrites the compare-all result computed in the previous cell -- confirm
# that the earlier result is no longer needed.
print('Performing random sample match (edit distance)...')
now = datetime.datetime.now()
compare_all_edit_match = matcher.matcher(amzn,goog,editdistance.eval, matcher.random_sample, sample_size)
sim_time_edit = (datetime.datetime.now()-now).total_seconds()
print("Simulation-Based Edit Distance Matching computation time taken: ", sim_time_edit, " seconds")
print('Random Sample Matcher (Edit Distance) Performance: ' + str(core.eval_matching(compare_all_edit_match)))

# Same measurement with analyze.jaccard_calc; compare_all_jaccard_match is
# likewise overwritten here.
print('Performing random sample match (jaccard distance)...')
now = datetime.datetime.now()
compare_all_jaccard_match = matcher.matcher(amzn,goog,analyze.jaccard_calc, matcher.random_sample, sample_size)
sim_time_jaccard = (datetime.datetime.now()-now).total_seconds()
print("Simulation-Based Jaccard Matching computation time taken: ", sim_time_jaccard, " seconds")
print('Random Sample Matcher (Jaccard Distance) Performance: ' + str(core.eval_matching(compare_all_jaccard_match)))