# All Experiment Scripts

In [1]:
import sys
from os import path
sys.path.insert(0, '../src')
import one_to_n
sys.path.insert(0, '../tests')
import create_synthetic_dataset
import datetime
import textdistance
import editdistance
import pandas as pd
import networkx as nx
import re

import editdistance
from Matching import core2
from Matching import analyze
from Matching import matcher
import sys
import os
import datetime

In [2]:
"""
Converts dictionary output to dataframe. Use for experiment results
"""
def dict_to_df(output_dict):
    df = pd.DataFrame.from_dict(output_dict, orient='index',columns=['Matched Item', 'Max/Min', 'Value'])
    return df
"""
Converts experiment results to CSV format. Use for experiment results
"""
def exp_result_to_csv(filename, experiment_output_df):
    #create the CSV with custom filename describing the experiment
    df.to_csv(filename, index = False)

def create_synth_data(table1_rowcount, table2_rowcount, datafilename1, datafilename2, filename1_dup):
    table_a_non_duplicated = create_synthetic_dataset.create_first_df(table1_rowcount)

    table_b_non_typo = create_synthetic_dataset.create_second_df(table2_rowcount)

    table_b = create_synthetic_dataset.add_typo(table_a_non_duplicated, table_b_non_typo)
    
    table_a_non_duplicated.to_csv(datafilename1, index = False, header=True)
    
    table_b.to_csv(datafilename2, index = False, header=True)
    
    table_a_dup = one_to_n.create_duplicates(table_a_non_duplicated, "name", 3)
    
    table_a_dup.to_csv(filename1_dup, index = False, header=True)
    
    return table_a_non_duplicated, table_b, table_a_dup

def SUM_edit_edge_weight(bip_graph):
    for u,v,d in bip_graph.edges(data=True):
        val_tuple_1 = u.split("_")
        val_tuple_2 = v.split("_")
        
        if len(val_tuple_1) == 4:
            val1 = re.sub("[^0-9]", "", val_tuple_1[2])
        else: 
            val1 = re.sub("[^0-9]", "", val_tuple_1[1])
            
        if len(val_tuple_2) == 4:
            val2 =re.sub("[^0-9]", "", val_tuple_2[2])
        else:
            val2 =re.sub("[^0-9]", "", val_tuple_2[1])

        d['weight'] = float(val1) + float(val2)

    return bip_graph

def minimal_matching(sum_weighted_graph):
    new_graph = sum_weighted_graph.copy()
    max_weight = max([d['weight'] for u,v,d in new_graph.edges(data=True)])
    for u,v,d in new_graph.edges(data=True):
        d['weight'] = max_weight - d['weight']

    matching_set_minimal = nx.algorithms.matching.max_weight_matching(new_graph)
    return matching_set_minimal

def fetch_sum(bip_graph, matching):
    output = []
    for u,v,d in bip_graph.edges(data=True):
        l = (u, v)
        k = (v, u)
        if l in matching:
            output.append([u,v, d['weight']])
        if k in matching:
            output.append([v,u, d['weight']])
    return output

def formatted_output(out_max, out_min):
    out_dict = {}
    for (val1,val2, weight) in out_min:
        splitted1 = val1.split("_")
        splitted2 = val2.split("_")
        if len(splitted1) == 4:
            if splitted1[0] in out_dict:
                out_dict[splitted1[0]].append((splitted2[0], "min", weight))
            else:
                out_dict[splitted1[0]] = [(splitted2[0], "min", weight)]

        if len(splitted2) == 4:
            if splitted2[0] in out_dict:
                out_dict[splitted2[0]].append((splitted1[0], "min", weight))
            else:
                out_dict[splitted2[0]] = [(splitted1[0], "min", weight)]
            
    for (val1,val2, weight) in out_max:
        splitted1 = val1.split("_")
        splitted2 = val2.split("_")
        if len(splitted1) == 4:
            if splitted1[0] in out_dict:
                out_dict[splitted1[0]].append((splitted2[0], "max", weight))
            else:
                out_dict[splitted1[0]] = [(splitted2[0], "max", weight)]

        if len(splitted2) == 4:
            if splitted2[0] in out_dict:
                out_dict[splitted2[0]].append((splitted1[0], "max", weight))
            else:
                out_dict[splitted2[0]] = [(splitted1[0], "max", weight)]
    return out_dict

def sum_total_weights(max_min_list):
    if max_min_list == [] or max_min_list == None:
        print("ERROR: NO SIMILARITY FOUND IN NAIVE OR RANDOM SAMPLING APPROACH. Suggestion: Decrease Similarity Matching Threshold.")
        return None
    total = 0
    for i in max_min_list:
        total += i[-1]
    return total

def sum_bip_script(table_a_non_duplicated, table_b, column_name, similarity_threshold):
    now = datetime.datetime.now()
    bipartite_graph_result = one_to_n.keycomp_treshold_updated_maximal_construct_graph(table_a_non_duplicated, table_b, column_name, similarity_threshold)
    timing_tresh = (datetime.datetime.now()-now).total_seconds()
    print("---- Timing for Graph Construction with Treshold Constraint ----")
    print(timing_tresh,"seconds")
    
    sum_weighted_graph = SUM_edit_edge_weight(bipartite_graph_result)
    
    print("\n\n 'SUM' MAXIMAL MATCHING:")
    now = datetime.datetime.now()
    matching_set_maximal = nx.algorithms.matching.max_weight_matching(sum_weighted_graph)
    timing_match = (datetime.datetime.now()-now).total_seconds()
    print("The Maximal Matching Set is:", matching_set_maximal, "\n")
    print("---- Timing for Matching (Done on the graph constructed with the treshold constraint) ----")
    print(timing_match,"seconds")
    print("The number of edges in the graph is:", sum_weighted_graph.number_of_edges(), "\n")
    
    
    print("\n\n 'SUM' MINIMAL MATCHING RESULTS:")
    print(nx.bipartite.is_bipartite(sum_weighted_graph))
    now = datetime.datetime.now()
    matching_set_minimal = minimal_matching(sum_weighted_graph)
    timing_match = (datetime.datetime.now()-now).total_seconds()
    print("The Minimal Matching Set is:", matching_set_minimal, "\n")
    print("---- Timing for Matching (Done on the graph constructed with the treshold constraint) ----")
    print(timing_match,"seconds")
    print("The number of edges in the graph is:", sum_weighted_graph.number_of_edges(), "\n")
    
    
    
    out_max = fetch_sum(sum_weighted_graph, matching_set_maximal)
    out_min = fetch_sum(sum_weighted_graph, matching_set_minimal)
    
    form_output = formatted_output(out_max,out_min)
    
#     max_df_output = dict_to_df(form_output)
    
#     min_df_output = dict_to_df(form_output)
    
#     exp_result_to_csv(filename_max_bip_out, max_df_output)
    
#     exp_result_to_csv(filename_min_bip_out, min_df_output)
    
    total_max = sum_total_weights(out_max)
    print("BP Matching: Highest bound for maximum:", total_max)

    total_min = sum_total_weights(out_min)
    print("BP Matching: Lowest bound for minimum:", total_min)
    
    return total_max, total_min

In [3]:
def sum_naive_script(filename1_dup, filename2):
    # print(os.getcwd())
#    cat_table1_dup = core2.data_catalog('s_data1_dup.csv')
    cat_table1_dup = core2.data_catalog(filename1_dup)
    cat_table2 = core2.data_catalog(filename2)
    print('Loaded catalogs.')
    
    
    # NAIVE MAX MATCHING
    print("NAIVE MAX MATCHING")
    print('Performing compare all match (edit distance)...')
    now = datetime.datetime.now()
    max_compare_all_edit_match = matcher.matcher_dup_updated(cat_table1_dup,cat_table2,editdistance.eval, matcher.all, 0.09)
    naive_time_edit = (datetime.datetime.now()-now).total_seconds()
    print("Naive Edit Distance Matching computation time taken: ", naive_time_edit, " seconds")
    #print('Compare All Matcher (Edit Distance) Performance: ' + str(core.eval_matching(compare_all_edit_match)))


#     print('Performing compare all match (jaccard distance)...')
#     now = datetime.datetime.now()
#     max_compare_all_jaccard_match = matcher.matcher_dup_updated(cat_table1_dup,cat_table2,analyze.jaccard_calc, matcher.all, 0.09)
#     naive_time_jaccard = (datetime.datetime.now()-now).total_seconds()
#     print("Naive Jaccard Matching computation time taken: ", naive_time_jaccard, " seconds", "\n")
    #print('Compare All Matcher (Jaccard Distance) Performance: ' + str(core2.eval_matching(compare_all_jaccard_match)))

    # NAIVE MIN MATCHING
    print("NAIVE MIN MATCHING")
    print('Performing compare all match (edit distance)...')
    now = datetime.datetime.now()
    min_compare_all_edit_match = matcher.matcher_updated(cat_table1_dup,cat_table2,editdistance.eval, matcher.all, 0.09)
    naive_time_edit = (datetime.datetime.now()-now).total_seconds()
    print("Naive Edit Distance Matching computation time taken: ", naive_time_edit, " seconds")
    #print('Compare All Matcher (Edit Distance) Performance: ' + str(core.eval_matching(compare_all_edit_match)))


#     print('Performing compare all match (jaccard distance)...')
#     now = datetime.datetime.now()
#     min_compare_all_jaccard_match = matcher.matcher_updated(cat_table1,cat_table2,analyze.jaccard_calc, matcher.all, 0.09)
#     naive_time_jaccard = (datetime.datetime.now()-now).total_seconds()
#     print("Naive Jaccard Matching computation time taken: ", naive_time_jaccard, " seconds")
    #print('Compare All Matcher (Jaccard Distance) Performance: ' + str(core2.eval_matching(compare_all_jaccard_match)))

    naive_total_max = sum_total_weights(max_compare_all_edit_match)
    naive_total_min = sum_total_weights(min_compare_all_edit_match)
    print("MAX Matching Bound:")
    print("NAIVE Matching: ", naive_total_max)
    print("MIN Matching Bound:")
    print("NAIVE Matching: ", naive_total_min)
    return naive_total_max, naive_total_min

In [11]:
def sum_random_sample_script(sample_size, filename1_dup, filename2):
    
    cat_table1_dup = core2.data_catalog(filename1_dup)
    cat_table2 = core2.data_catalog(filename2)
    print('Loaded catalogs.')
    
    # RANDOM SAMPLING MAX MATCHING
    print("RANDOM SAMPLE MAX MATCHING")
    print('Performing random sample match (edit distance)...')
    now = datetime.datetime.now()
    max_compare_sampled_edit_match = matcher.matcher_dup_updated(cat_table1_dup,cat_table2,editdistance.eval, matcher.random_sample, 0.09, sample_size)
    sim_time_edit = (datetime.datetime.now()-now).total_seconds()
    print("Simulation-Based Edit Distance Matching computation time taken: ", sim_time_edit, " seconds")
    #print('Random Sample Matcher (Edit Distance) Performance: ' + str(core.eval_matching(compare_all_edit_match)))

    # print('Performing random sample match (jaccard distance)...')
    # now = datetime.datetime.now()
    # max_compare_sampled_jaccard_match = matcher.matcher_dup_updated(cat_table1_dup,cat_table2,analyze.jaccard_calc, matcher.random_sample, 0.05, sample_size)
    # sim_time_jaccard = (datetime.datetime.now()-now).total_seconds()
    # print("Simulation-Based Jaccard Matching computation time taken: ", sim_time_jaccard, " seconds", "\n")
    #print('Random Sample Matcher (Jaccard Distance) Performance: ' + str(core.eval_matching(compare_all_jaccard_match)))


    # RANDOM SAMPLING MIN MATCHING
    print("RANDOM SAMPLE MIN MATCHING")
    print('Performing random sample match (edit distance)...')
    now = datetime.datetime.now()
    min_compare_sampled_edit_match = matcher.matcher_updated(cat_table1_dup,cat_table2,editdistance.eval, matcher.random_sample, 0.09, sample_size)
    sim_time_edit = (datetime.datetime.now()-now).total_seconds()
    print("Simulation-Based Edit Distance Matching computation time taken: ", sim_time_edit, " seconds")
    #print('Random Sample Matcher (Edit Distance) Performance: ' + str(core.eval_matching(compare_all_edit_match)))

    # print('Performing random sample match (jaccard distance)...')
    # now = datetime.datetime.now()
    # min_compare_sampled_jaccard_match = matcher.matcher_updated(cat_table1,cat_table2,analyze.jaccard_calc, matcher.random_sample, 0.5, sample_size)
    # sim_time_jaccard = (datetime.datetime.now()-now).total_seconds()
    # print("Simulation-Based Jaccard Matching computation time taken: ", sim_time_jaccard, " seconds")
    #print('Random Sample Matcher (Jaccard Distance) Performance: ' + str(core.eval_matching(compare_all_jaccard_match)))

    sampled_total_max = sum_total_weights(max_compare_sampled_edit_match)
    sampled_total_min = sum_total_weights(min_compare_sampled_edit_match)
    print("MAX Matching Bound:")
    print("SAMPLED Matching: ", sampled_total_max, "\n")
    print("MIN Matching Bound:")
    print("NAIVE Matching: ", naive_total_min)
    return sampled_total_max, sampled_total_min


In [12]:
def sum_results_summaries(bip_max, bip_min, naive_max, naive_min, random_max, random_min):
    print("MAX Matching Bound:")
    print("BP Matching: ", bip_sum_max)
    print("NAIVE Matching: ", naive_total_max)
    print("SAMPLED Matching: ", sampled_total_max, "\n")

    naive_total_min = sum_total_weights(min_compare_all_edit_match)
    sampled_total_min = sum_total_weights(min_compare_sampled_edit_match)
    bp_total_min = sum_total_weights(out_min)

    print("MIN Matching Bound:")
    print("BP Matching: ", bip_sum_min)
    print("NAIVE Matching: ", naive_total_min)
    print("SAMPLED Matching: ", sampled_total_min, "\n")

# SUM:

### CASE 1: Write Case Here

### Data Configuration Here

In [13]:
table_a_non_duplicated, table_b, table_a_dup = create_synth_data(100, 100, "table1", "table2", "table1_dup")
# print(table_a_non_duplicated, table_b, table_a_dup)

#### Bipartite Matching Experiment Scripts

In [18]:
total_max, total_min = sum_bip_script(table_a_non_duplicated, table_b, "name", 0.09)

---- Timing for Graph Construction with Treshold Constraint ----
0.234638 seconds


 'SUM' MAXIMAL MATCHING:
The Maximal Matching Set is: {('Mark HaynesHom_(8,)_2', 'Travis Bishop_1_(9,)_1'), ('Carl Riley_2_(9,)_1', 'Cheryl JonesWDa_(6,)_2'), ('Thomas Valdez_0_(8,)_1', 'Robert Rayhuz_(8,)_2'), ('Kaitlyn Davis_2_(7,)_1', 'Kathryn MartinNTQ_(0,)_2'), ('Carl Riley_1_(9,)_1', 'Laura FieldsQgz_(5,)_2'), ('Joshua SmithyGk_(2,)_2', 'Joshua Smith_2_(4,)_1'), ('James ThomasLPf_(8,)_2', 'Amy Jones_2_(7,)_1'), ('Roberto AllisonvMU_(9,)_2', 'Robert Avery_2_(8,)_1'), ('Todd Bowers_2_(9,)_1', 'David BurnswEy_(9,)_2'), ('Michael GarciaUIU_(6,)_2', 'Michael Garcia_2_(8,)_1'), ('Peter TerrywHX_(8,)_2', 'Paul Meadows_0_(8,)_1'), ('Adam GonzalezleK_(1,)_2', 'Adam Gonzalez_0_(5,)_1'), ('Christopher Walsh_2_(5,)_1', 'Christopher WalshRRC_(1,)_2'), ('Tony Harriswgv_(7,)_2', 'Kaitlyn Harris_1_(8,)_1'), ('Lori Lee_2_(8,)_1', 'Todd BowersAeq_(5,)_2'), ('Todd Bowers_0_(9,)_1', 'Paul BrownrNF_(4,)_2'), ('Brandi 

#### Naive Matching Experiment Scripts

In [19]:
naive_total_max, naive_total_min = sum_naive_script("table1_dup", "table2")

Loaded catalogs.
NAIVE MAX MATCHING
Performing compare all match (edit distance)...
Naive Edit Distance Matching computation time taken:  0.177017  seconds
NAIVE MIN MATCHING
Performing compare all match (edit distance)...
Naive Edit Distance Matching computation time taken:  0.181481  seconds
MAX Matching Bound:
NAIVE Matching:  15225
MIN Matching Bound:
NAIVE Matching:  10290


#### Random Sampling Matching Experiment Scripts

In [25]:
sampled_total_max, sampled_total_min = sum_random_sample_script(50, "table1_dup", "table2")

Loaded catalogs.
RANDOM SAMPLE MAX MATCHING
Performing random sample match (edit distance)...
Simulation-Based Edit Distance Matching computation time taken:  0.135096  seconds
RANDOM SAMPLE MIN MATCHING
Performing random sample match (edit distance)...
Simulation-Based Edit Distance Matching computation time taken:  0.129564  seconds
MAX Matching Bound:
SAMPLED Matching:  7577 

MIN Matching Bound:
NAIVE Matching:  10290


## Case: Balanced 2 tables matching
#### Matching [Total Max, Total Min] Outcome

In [26]:
print("Bipartite Matching: ", "[", total_max, ", ", total_min, "]")
print("Naive Matching: ", "[", naive_total_max, ", ", naive_total_min, "]")
print("Random Sample Matching: ", "[", sampled_total_max, ", ", sampled_total_min, "]")

Bipartite Matching:  [ 1139.0 ,  672.0 ]
Naive Matching:  [ 15225 ,  10290 ]
Random Sample Matching:  [ 7577 ,  5160 ]


# COUNT:

### CASE 1: Write Case Here

#### Bipartite matching Experiment Scripts

In [None]:
sum_bip_script(table_a_non_duplicated, table_b, column_name, similarity_threshold)

#### Naive Matching Experiment Scripts

In [None]:
sum_naive_script(filename1_dup, filename2):

#### Random Sampling Matching Experiment Scripts

In [None]:
sum_random_sample_script(sample_size, filename1_dup, filename2):

## Case 2: 1's table disproportionately bigger than n's table. 
i.e. table_a has 10,000 rows, table_b has 100 rows

In [34]:
### Data Configuration Here

In [35]:
table_a_non_duplicated, table_b, table_a_dup = create_synth_data(10000, 100, "table1", "table2", "table1_dup")
# print(table_a_non_duplicated, table_b, table_a_dup)

In [36]:
#### Bipartite Matching Experiment Scripts
total_max, total_min = sum_bip_script(table_a_non_duplicated, table_b, "name", 0.09)
#### Naive Matching Experiment Scripts
naive_total_max, naive_total_min = sum_naive_script("table1_dup", "table2")
#### Random Sampling Matching Experiment Scripts
sampled_total_max, sampled_total_min = sum_random_sample_script(50, "table1_dup", "table2")

10.0% complete
20.0% complete
30.0% complete
40.0% complete
50.0% complete
60.0% complete
70.0% complete
80.0% complete
90.0% complete
100.0% complete
110.0% complete
120.0% complete
130.0% complete
140.0% complete
150.0% complete
160.0% complete
170.0% complete
180.0% complete
190.0% complete
200.0% complete
210.0% complete
220.0% complete
230.0% complete
240.0% complete
250.0% complete
260.0% complete
270.0% complete
280.0% complete
---- Timing for Graph Construction with Treshold Constraint ----
20.326757 seconds


 'SUM' MAXIMAL MATCHING:
The Maximal Matching Set is: {('Rodney Pitts_2_(9,)_1', 'Dawn BellP_(2,)_2'), ('Michael Mclean_2_(9,)_1', 'Margaret MillerABF_(4,)_2'), ('Michael HarperysZ_(9,)_2', 'Nicholas Perez_2_(9,)_1'), ('Cindy Tate MD_2_(9,)_1', 'Wendy SmithEiD_(2,)_2'), ('Keith AndersonAoY_(3,)_2', 'Sarah Sanders_2_(9,)_1'), ('Amy Bowen_2_(9,)_1', 'Bradley ColeQbg_(1,)_2'), ('Mark Ayers PhD_2_(9,)_1', 'Mark ArmstrongFmh_(7,)_2'), ('Jonathan Gilbert_2_(9,)_1', 'Jonathan Jo

BP Matching: Highest bound for maximum: 1332.0
BP Matching: Lowest bound for minimum: 432.0
Loaded catalogs.
NAIVE MAX MATCHING
Performing compare all match (edit distance)...
Naive Edit Distance Matching computation time taken:  16.959016  seconds
NAIVE MIN MATCHING
Performing compare all match (edit distance)...
Naive Edit Distance Matching computation time taken:  17.552474  seconds
MAX Matching Bound:
NAIVE Matching:  1189422
MIN Matching Bound:
NAIVE Matching:  700593
Loaded catalogs.
RANDOM SAMPLE MAX MATCHING
Performing random sample match (edit distance)...
Simulation-Based Edit Distance Matching computation time taken:  11.088123  seconds
RANDOM SAMPLE MIN MATCHING
Performing random sample match (edit distance)...
Simulation-Based Edit Distance Matching computation time taken:  11.53961  seconds
MAX Matching Bound:
SAMPLED Matching:  599511 

MIN Matching Bound:
NAIVE Matching:  700593


In [None]:
"""
Observation: Graph Construction is the most time-consuming computation.

"""

