# All Experiment Scripts

In [1]:
import sys
from os import path
sys.path.insert(0, '../src')
import one_to_n
sys.path.insert(0, '../tests')
import create_synthetic_dataset
import datetime
import textdistance
import editdistance
import pandas as pd
import networkx as nx
import re

import editdistance
from Matching import core2
from Matching import analyze
from Matching import matcher
import sys
import os
import datetime

In [2]:
"""
Converts dictionary output to dataframe. Use for experiment results
"""
def dict_to_df(output_dict):
    df = pd.DataFrame.from_dict(output_dict, orient='index',columns=['Matched Item', 'Max/Min', 'Value'])
    return df
"""
Converts experiment results to CSV format. Use for experiment results
"""
def exp_result_to_csv(filename, experiment_output_df):
    #create the CSV with custom filename describing the experiment
    df.to_csv(filename, index = False)

def create_synth_data(table1_rowcount, table2_rowcount, datafilename1, datafilename2, filename1_dup):
    table_a_non_duplicated = create_synthetic_dataset.create_first_df(table1_rowcount)

    table_b_non_typo = create_synthetic_dataset.create_second_df(table2_rowcount)

    table_b = create_synthetic_dataset.add_typo(table_a_non_duplicated, table_b_non_typo)
    
    table_a_non_duplicated.to_csv(datafilename1, index = False, header=True)
    
    table_b.to_csv(datafilename2, index = False, header=True)
    
    table_a_dup = one_to_n.create_duplicates(table_a_non_duplicated, "name", 3)
    
    table_a_dup.to_csv(filename1_dup, index = False, header=True)
    
    return table_a_non_duplicated, table_b, table_a_dup

def SUM_edit_edge_weight(bip_graph):
    for u,v,d in bip_graph.edges(data=True):
        val_tuple_1 = u.split("_")
        val_tuple_2 = v.split("_")
        
        if len(val_tuple_1) == 4:
            val1 = re.sub("[^0-9]", "", val_tuple_1[2])
        else: 
            val1 = re.sub("[^0-9]", "", val_tuple_1[1])
            
        if len(val_tuple_2) == 4:
            val2 =re.sub("[^0-9]", "", val_tuple_2[2])
        else:
            val2 =re.sub("[^0-9]", "", val_tuple_2[1])

        d['weight'] = float(val1) + float(val2)

    return bip_graph

def minimal_matching(sum_weighted_graph):
    new_graph = sum_weighted_graph.copy()
    max_weight = max([d['weight'] for u,v,d in new_graph.edges(data=True)])
    for u,v,d in new_graph.edges(data=True):
        d['weight'] = max_weight - d['weight']

    matching_set_minimal = nx.algorithms.matching.max_weight_matching(new_graph)
    return matching_set_minimal

def fetch_sum(bip_graph, matching):
    output = []
    for u,v,d in bip_graph.edges(data=True):
        l = (u, v)
        k = (v, u)
        if l in matching:
            output.append([u,v, d['weight']])
        if k in matching:
            output.append([v,u, d['weight']])
    return output

def formatted_output(out_max, out_min):
    out_dict = {}
    for (val1,val2, weight) in out_min:
        splitted1 = val1.split("_")
        splitted2 = val2.split("_")
        if len(splitted1) == 4:
            if splitted1[0] in out_dict:
                out_dict[splitted1[0]].append((splitted2[0], "min", weight))
            else:
                out_dict[splitted1[0]] = [(splitted2[0], "min", weight)]

        if len(splitted2) == 4:
            if splitted2[0] in out_dict:
                out_dict[splitted2[0]].append((splitted1[0], "min", weight))
            else:
                out_dict[splitted2[0]] = [(splitted1[0], "min", weight)]
            
    for (val1,val2, weight) in out_max:
        splitted1 = val1.split("_")
        splitted2 = val2.split("_")
        if len(splitted1) == 4:
            if splitted1[0] in out_dict:
                out_dict[splitted1[0]].append((splitted2[0], "max", weight))
            else:
                out_dict[splitted1[0]] = [(splitted2[0], "max", weight)]

        if len(splitted2) == 4:
            if splitted2[0] in out_dict:
                out_dict[splitted2[0]].append((splitted1[0], "max", weight))
            else:
                out_dict[splitted2[0]] = [(splitted1[0], "max", weight)]
    return out_dict

def sum_total_weights(max_min_list):
    if max_min_list == [] or max_min_list == None:
        print("ERROR: NO SIMILARITY FOUND IN NAIVE OR RANDOM SAMPLING APPROACH. Suggestion: Decrease Similarity Matching Threshold.")
        return None
    total = 0
    for i in max_min_list:
        total += i[-1]
    return total

def sum_bip_script(table_a_non_duplicated, table_b, column_name, similarity_threshold, n_matches):
    now = datetime.datetime.now()
    bipartite_graph_result = one_to_n.keycomp_treshold_updated_maximal_construct_graph(table_a_non_duplicated, table_b, column_name, similarity_threshold, n_matches)
    timing_tresh = (datetime.datetime.now()-now).total_seconds()
    print("---- Timing for Graph Construction with Treshold Constraint ----")
    print(timing_tresh,"seconds")
    
    sum_weighted_graph = SUM_edit_edge_weight(bipartite_graph_result)
    
    print("\n\n 'SUM' MAXIMAL MATCHING:")
    now = datetime.datetime.now()
    matching_set_maximal = nx.algorithms.matching.max_weight_matching(sum_weighted_graph)
    timing_match = (datetime.datetime.now()-now).total_seconds()
#    print("The Maximal Matching Set is:", matching_set_maximal, "\n")
    print("---- Timing for Matching (Done on the graph constructed with the treshold constraint) ----")
    print(timing_match,"seconds")
    print("The number of edges in the graph is:", sum_weighted_graph.number_of_edges(), "\n")
    
    
    print("\n\n 'SUM' MINIMAL MATCHING RESULTS:")
    print(nx.bipartite.is_bipartite(sum_weighted_graph))
    now = datetime.datetime.now()
    matching_set_minimal = minimal_matching(sum_weighted_graph)
    timing_match = (datetime.datetime.now()-now).total_seconds()
#    print("The Minimal Matching Set is:", matching_set_minimal, "\n")
    print("---- Timing for Matching (Done on the graph constructed with the treshold constraint) ----")
    print(timing_match,"seconds")
    print("The number of edges in the graph is:", sum_weighted_graph.number_of_edges(), "\n")
    
    
    
    out_max = fetch_sum(sum_weighted_graph, matching_set_maximal)
    out_min = fetch_sum(sum_weighted_graph, matching_set_minimal)
    
    form_output = formatted_output(out_max,out_min)
    
#     max_df_output = dict_to_df(form_output)
    
#     min_df_output = dict_to_df(form_output)
    
#     exp_result_to_csv(filename_max_bip_out, max_df_output)
    
#     exp_result_to_csv(filename_min_bip_out, min_df_output)
    
    total_max = sum_total_weights(out_max)
    print("BP Matching: Highest bound for maximum:", total_max)

    total_min = sum_total_weights(out_min)
    print("BP Matching: Lowest bound for minimum:", total_min)
    
    return total_max, total_min

In [3]:
def sum_naive_script(n_matches, sim_threshold, filename1_dup, filename2):
    # print(os.getcwd())
#    cat_table1_dup = core2.data_catalog('s_data1_dup.csv')
    cat_table1_dup = core2.data_catalog(filename1_dup)
    cat_table2 = core2.data_catalog(filename2)
    print('Loaded catalogs.')
    
    
    # NAIVE MAX MATCHING
    print("NAIVE MAX MATCHING")
    print('Performing compare all match (edit distance)...')
    now = datetime.datetime.now()
    max_compare_all_edit_match = matcher.matcher_dup_updated(n_matches, cat_table1_dup,cat_table2,editdistance.eval, matcher.all, sim_threshold)
    naive_time_edit = (datetime.datetime.now()-now).total_seconds()
    print("Naive Edit Distance Matching computation time taken: ", naive_time_edit, " seconds")
    #print('Compare All Matcher (Edit Distance) Performance: ' + str(core.eval_matching(compare_all_edit_match)))


#     print('Performing compare all match (jaccard distance)...')
#     now = datetime.datetime.now()
#     max_compare_all_jaccard_match = matcher.matcher_dup_updated(n_matches, cat_table1_dup,cat_table2,analyze.jaccard_calc, matcher.all, sim_threshold)
#     naive_time_jaccard = (datetime.datetime.now()-now).total_seconds()
#     print("Naive Jaccard Matching computation time taken: ", naive_time_jaccard, " seconds", "\n")
    #print('Compare All Matcher (Jaccard Distance) Performance: ' + str(core2.eval_matching(compare_all_jaccard_match)))

    # NAIVE MIN MATCHING
    print("NAIVE MIN MATCHING")
    print('Performing compare all match (edit distance)...')
    now = datetime.datetime.now()
    min_compare_all_edit_match = matcher.matcher_updated(n_matches, cat_table1_dup,cat_table2,editdistance.eval, matcher.all, sim_threshold)
    naive_time_edit = (datetime.datetime.now()-now).total_seconds()
    print("Naive Edit Distance Matching computation time taken: ", naive_time_edit, " seconds")
    #print('Compare All Matcher (Edit Distance) Performance: ' + str(core.eval_matching(compare_all_edit_match)))


#     print('Performing compare all match (jaccard distance)...')
#     now = datetime.datetime.now()
#     min_compare_all_jaccard_match = matcher.matcher_updated(n_matches, cat_table1,cat_table2,analyze.jaccard_calc, matcher.all, sim_threshold)
#     naive_time_jaccard = (datetime.datetime.now()-now).total_seconds()
#     print("Naive Jaccard Matching computation time taken: ", naive_time_jaccard, " seconds")
    #print('Compare All Matcher (Jaccard Distance) Performance: ' + str(core2.eval_matching(compare_all_jaccard_match)))

    naive_total_max = sum_total_weights(max_compare_all_edit_match)
    naive_total_min = sum_total_weights(min_compare_all_edit_match)
    print("MAX Matching Bound:")
    print("NAIVE Matching: ", naive_total_max)
    print("MIN Matching Bound:")
    print("NAIVE Matching: ", naive_total_min)
    return naive_total_max, naive_total_min

In [4]:
def sum_random_sample_script(n_matches, sim_threshold, sample_size, filename1_dup, filename2):
    
    cat_table1_dup = core2.data_catalog(filename1_dup)
    cat_table2 = core2.data_catalog(filename2)
    print('Loaded catalogs.')
    
    # RANDOM SAMPLING MAX MATCHING
    print("RANDOM SAMPLE MAX MATCHING")
    print('Performing random sample match (edit distance)...')
    now = datetime.datetime.now()
    max_compare_sampled_edit_match = matcher.matcher_dup_updated(n_matches, cat_table1_dup,cat_table2,editdistance.eval, matcher.random_sample, sim_threshold, sample_size)
    sim_time_edit = (datetime.datetime.now()-now).total_seconds()
    print("Simulation-Based Edit Distance Matching computation time taken: ", sim_time_edit, " seconds")
    #print('Random Sample Matcher (Edit Distance) Performance: ' + str(core.eval_matching(compare_all_edit_match)))

    # print('Performing random sample match (jaccard distance)...')
    # now = datetime.datetime.now()
    # max_compare_sampled_jaccard_match = matcher.matcher_dup_updated(cat_table1_dup,cat_table2,analyze.jaccard_calc, matcher.random_sample, sim_threshold, sample_size)
    # sim_time_jaccard = (datetime.datetime.now()-now).total_seconds()
    # print("Simulation-Based Jaccard Matching computation time taken: ", sim_time_jaccard, " seconds", "\n")
    #print('Random Sample Matcher (Jaccard Distance) Performance: ' + str(core.eval_matching(compare_all_jaccard_match)))


    # RANDOM SAMPLING MIN MATCHING
    print("RANDOM SAMPLE MIN MATCHING")
    print('Performing random sample match (edit distance)...')
    now = datetime.datetime.now()
    min_compare_sampled_edit_match = matcher.matcher_updated(n_matches, cat_table1_dup,cat_table2,editdistance.eval, matcher.random_sample, sim_threshold, sample_size)
    sim_time_edit = (datetime.datetime.now()-now).total_seconds()
    print("Simulation-Based Edit Distance Matching computation time taken: ", sim_time_edit, " seconds")
    #print('Random Sample Matcher (Edit Distance) Performance: ' + str(core.eval_matching(compare_all_edit_match)))

    # print('Performing random sample match (jaccard distance)...')
    # now = datetime.datetime.now()
    # min_compare_sampled_jaccard_match = matcher.matcher_updated(cat_table1,cat_table2,analyze.jaccard_calc, matcher.random_sample, sim_threshold, sample_size)
    # sim_time_jaccard = (datetime.datetime.now()-now).total_seconds()
    # print("Simulation-Based Jaccard Matching computation time taken: ", sim_time_jaccard, " seconds")
    #print('Random Sample Matcher (Jaccard Distance) Performance: ' + str(core.eval_matching(compare_all_jaccard_match)))

    sampled_total_max = sum_total_weights(max_compare_sampled_edit_match)
    sampled_total_min = sum_total_weights(min_compare_sampled_edit_match)
    print("MAX Matching Bound:")
    print("SAMPLED Matching: ", sampled_total_max, "\n")
    print("MIN Matching Bound:")
    print("NAIVE Matching: ", naive_total_min)
    return sampled_total_max, sampled_total_min


In [5]:
def sum_results_summaries(bip_max, bip_min, naive_max, naive_min, random_max, random_min):
    print("MAX Matching Bound:")
    print("BP Matching: ", bip_sum_max)
    print("NAIVE Matching: ", naive_total_max)
    print("SAMPLED Matching: ", sampled_total_max, "\n")

    naive_total_min = sum_total_weights(min_compare_all_edit_match)
    sampled_total_min = sum_total_weights(min_compare_sampled_edit_match)
    bp_total_min = sum_total_weights(out_min)

    print("MIN Matching Bound:")
    print("BP Matching: ", bip_sum_min)
    print("NAIVE Matching: ", naive_total_min)
    print("SAMPLED Matching: ", sampled_total_min, "\n")

# SUM:

## Case 1: Balanced 2 tables matching, n=3
#### Matching [Total Max, Total Min] Outcome

### Data Configuration Here

In [6]:
table_a_non_duplicated, table_b, table_a_dup = create_synth_data(100, 100, "table1", "table2", "table1_dup")
# print(table_a_non_duplicated, table_b, table_a_dup)

#### Bipartite Matching Experiment Scripts

In [7]:
total_max, total_min = sum_bip_script(table_a_non_duplicated, table_b, "name", 0.09, 3)

---- Timing for Graph Construction with Treshold Constraint ----
0.227045 seconds


 'SUM' MAXIMAL MATCHING:
---- Timing for Matching (Done on the graph constructed with the treshold constraint) ----
0.239413 seconds
The number of edges in the graph is: 1545 



 'SUM' MINIMAL MATCHING RESULTS:
True
---- Timing for Matching (Done on the graph constructed with the treshold constraint) ----
0.249257 seconds
The number of edges in the graph is: 1545 

BP Matching: Highest bound for maximum: 1086.0
BP Matching: Lowest bound for minimum: 631.0


#### Naive Matching Experiment Scripts

In [8]:
naive_total_max, naive_total_min = sum_naive_script(3, 0.09, "table1_dup", "table2")

Loaded catalogs.
NAIVE MAX MATCHING
Performing compare all match (edit distance)...
Naive Edit Distance Matching computation time taken:  0.210387  seconds
NAIVE MIN MATCHING
Performing compare all match (edit distance)...
Naive Edit Distance Matching computation time taken:  0.190092  seconds
MAX Matching Bound:
NAIVE Matching:  10176
MIN Matching Bound:
NAIVE Matching:  6669


#### Random Sampling Matching Experiment Scripts

In [9]:
sampled_total_max, sampled_total_min = sum_random_sample_script(3, 0.09, 50, "table1_dup", "table2")

Loaded catalogs.
RANDOM SAMPLE MAX MATCHING
Performing random sample match (edit distance)...
Simulation-Based Edit Distance Matching computation time taken:  0.135151  seconds
RANDOM SAMPLE MIN MATCHING
Performing random sample match (edit distance)...
Simulation-Based Edit Distance Matching computation time taken:  0.13062  seconds
MAX Matching Bound:
SAMPLED Matching:  5134 

MIN Matching Bound:
NAIVE Matching:  6669


#### Matching [Total Max, Total Min] Outcome

In [10]:
print("Bipartite Matching: ", "[", total_min, ", ", total_max, "]")
print("Naive Matching: ", "[", naive_total_min, ", ", naive_total_max, "]")
print("Random Sample Matching: ", "[", sampled_total_min, ", ", sampled_total_max, "]")

Bipartite Matching:  [ 631.0 ,  1086.0 ]
Naive Matching:  [ 6669 ,  10176 ]
Random Sample Matching:  [ 3887 ,  5134 ]


# COUNT:

### CASE 1: Write Case Here

#### Bipartite matching Experiment Scripts

In [None]:
sum_bip_script(table_a_non_duplicated, table_b, column_name, similarity_threshold)

#### Naive Matching Experiment Scripts

In [None]:
sum_naive_script(filename1_dup, filename2):

#### Random Sampling Matching Experiment Scripts

In [None]:
sum_random_sample_script(sample_size, filename1_dup, filename2):

## Case 2: 1's table disproportionately bigger than n's table. 
i.e. table_a has 10,000 rows, table_b has 100 rows, n=3, sim_threshold = 0.09

In [11]:
### Data Configuration Here
table_a_non_duplicated, table_b, table_a_dup = create_synth_data(10000, 100, "table1", "table2", "table1_dup")
# print(table_a_non_duplicated, table_b, table_a_dup)

In [12]:
#### Bipartite Matching Experiment Scripts
total_max, total_min = sum_bip_script(table_a_non_duplicated, table_b, "name", 0.09, 3)
#### Naive Matching Experiment Scripts
naive_total_max, naive_total_min = sum_naive_script(3, 0.09, "table1_dup", "table2")
#### Random Sampling Matching Experiment Scripts
sampled_total_max, sampled_total_min = sum_random_sample_script(3, 0.09, 50, "table1_dup", "table2")

10.0% complete
20.0% complete
30.0% complete
40.0% complete
50.0% complete
60.0% complete
70.0% complete
80.0% complete
90.0% complete
100.0% complete
110.0% complete
120.0% complete
130.0% complete
140.0% complete
150.0% complete
160.0% complete
170.0% complete
180.0% complete
190.0% complete
200.0% complete
210.0% complete
220.0% complete
230.0% complete
240.0% complete
250.0% complete
260.0% complete
270.0% complete
280.0% complete
---- Timing for Graph Construction with Treshold Constraint ----
21.457723 seconds


 'SUM' MAXIMAL MATCHING:
---- Timing for Matching (Done on the graph constructed with the treshold constraint) ----
12.069526 seconds
The number of edges in the graph is: 156996 



 'SUM' MINIMAL MATCHING RESULTS:
True
---- Timing for Matching (Done on the graph constructed with the treshold constraint) ----
11.577488 seconds
The number of edges in the graph is: 156996 

BP Matching: Highest bound for maximum: 1304.0
BP Matching: Lowest bound for minimum: 419.0
Loaded ca

In [13]:
"""
Observation: Bipartite Matching gave the lowest outcome. Why?


Matching [Total Max, Total Min] Outcome
"""
print("Bipartite Matching: ", "[", total_min, ", ", total_max, "]")
print("Naive Matching: ", "[", naive_total_min, ", ", naive_total_max, "]")
print("Random Sample Matching: ", "[", sampled_total_min, ", ", sampled_total_max, "]")

Bipartite Matching:  [ 419.0 ,  1304.0 ]
Naive Matching:  [ 444210 ,  1014006 ]
Random Sample Matching:  [ 274261 ,  505805 ]
