# All Experiment Scripts

In [None]:
import experiment_funcs

In [None]:
import editdistance
# Quick sanity check: print what matcher.calc_max_weight_edit returns for a
# name and the same name with two extra trailing characters ("JB"), with
# editdistance.eval passed as the third argument.
# NOTE(review): presumably the third argument is the distance function used
# to weight the pair - confirm against experiment_funcs.matcher.
print(experiment_funcs.matcher.calc_max_weight_edit('Kimberly Fuentes', 'Kimberly FuentesJB', editdistance.eval))

# SUM:

## Case 1: Balanced 2 tables matching, n=3
#### Matching [Total Max, Total Min] Outcome

In [None]:
# SUM / Case 1 driver: ten trials, each appending one row of results to the
# "trial1sample" CSV table created below.
experiment_funcs.create_csv_table("trial1sample")
for i in range(10):
    # Fresh synthetic data for this trial.
    # NOTE(review): perfect_mapping is only used by full_evaluation below,
    # presumably as the ground truth - confirm in experiment_funcs.
    (
        table_a_non_duplicated,
        table_b,
        table_a_dup,
        tables_map,
        perfect_mapping,
    ) = experiment_funcs.create_synth_data(
        100, 100, "table1", "table2", "table1_dup", 3
    )

    # Bipartite matching on the "name" column.
    (
        total_max,
        total_min,
        bip_min_matching_time,
        bip_max_matching_time,
        out_max,
        out_min,
    ) = experiment_funcs.sum_bip_script(
        table_a_non_duplicated, table_b, "name", 0.2, 3, tables_map
    )

    # Naive matching; the tables are referenced by name here.
    (
        naive_total_max,
        naive_total_min,
        naive_min_matching_time,
        naive_max_matching_time,
        naive_max,
        naive_min,
    ) = experiment_funcs.sum_naive_script(0.2, "table1_dup", "table2", "table1")

    # Random-sample matching.
    (
        sampled_total_max,
        sampled_total_min,
        sampled_min_matching_time,
        sampled_max_matching_time,
        sampled_max,
        sampled_min,
    ) = experiment_funcs.sum_random_sample_script(
        0.2, 50, "table1_dup", "table2", "table1"
    )

    # Evaluate all six matchings against perfect_mapping.
    eval_records = experiment_funcs.full_evaluation(
        out_min,
        out_max,
        naive_min,
        naive_max,
        sampled_min,
        sampled_max,
        perfect_mapping,
    )

    # Write this trial's totals, timings and evaluation records to the CSV.
    experiment_funcs.table_csv_output(
        total_min,
        total_max,
        naive_total_min,
        naive_total_max,
        sampled_total_min,
        sampled_total_max,
        "trial1sample",
        bip_min_matching_time,
        bip_max_matching_time,
        naive_min_matching_time,
        naive_max_matching_time,
        sampled_min_matching_time,
        sampled_max_matching_time,
        eval_records,
    )
    

In [None]:
# Dump three values left over from the most recent trial, one per line.
print(perfect_mapping, total_max, naive_min, sep="\n")

In [None]:
def fix_form_bp(bp_match):
    """Reduce bipartite match triples to the leading "_"-delimited token of each node.

    Each item of ``bp_match`` is ``(node_a, node_b, weight)`` where both node
    labels are strings. Both labels are split on ``"_"``. If ``node_a`` has
    exactly four pieces the pair keeps its order; otherwise the two sides are
    swapped. Only the first piece of each label is kept, and the weight is
    passed through unchanged.

    NOTE(review): presumably the four-piece label identifies one specific
    table so that the output is always ordered the same way - confirm
    against the node naming used by the bipartite script.

    Returns a new list of ``(token, token, weight)`` tuples in input order.
    """
    normalized = []
    for node_a, node_b, weight in bp_match:
        first, second = node_a.split("_"), node_b.split("_")
        if len(first) != 4:
            # The four-piece label is not on the left: flip the sides.
            first, second = second, first
        normalized.append((first[0], second[0], weight))
    return normalized

def fix_form_naive(naive_match):
    """Trim the first element of each naive match triple to its leading token.

    Each item of ``naive_match`` is ``(val1, val2, weight)``. ``val1`` is cut
    at its first ``"_"`` (everything from the underscore on is dropped);
    ``val2`` and ``weight`` are kept exactly as given.

    Returns a new list of triples in input order.
    """
    return [
        (left.partition("_")[0], right, weight)
        for left, right, weight in naive_match
    ]
            
# Normalise the bipartite matchings left over from the last trial so their
# tuples can be compared with the naive ones.
formatted_max_bp = fix_form_bp(out_max)
formatted_min_bp = fix_form_bp(out_min)

# Same for the naive matchings (only the first element of each tuple is trimmed).
formatted_max_naive = fix_form_naive(naive_max)
formatted_min_naive = fix_form_naive(naive_min)

def find_dif_matches(bp_match, naive_match):
    """Return the matches that appear in exactly one of the two matchings.

    Both arguments are iterables of hashable items; the result is a set.
    """
    return set(naive_match) ^ set(bp_match)

def find_naive_dif(bp_match, naive_match):
    """Return the set of matches present in ``naive_match`` but absent from ``bp_match``."""
    return set(naive_match) - set(bp_match)

def find_bp_dif(bp_match, naive_match):
    """Return the set of matches present in ``bp_match`` but absent from ``naive_match``."""
    naive_only = set(naive_match)
    return set(bp_match) - naive_only

# MAX matchings: where do the bipartite and naive results disagree?
max_pair = (formatted_max_bp, formatted_max_naive)
print("Symmetric Differences: ", find_dif_matches(*max_pair))
naive_dif, bp_dif = find_naive_dif(*max_pair), find_bp_dif(*max_pair)
print(len(formatted_max_bp))
print("Naive Outcome Differences: ", len(naive_dif))
print("BP Outcome Differences: ", len(bp_dif))

# MIN matchings: same comparison (naive_dif / bp_dif are overwritten).
min_pair = (formatted_min_bp, formatted_min_naive)
print("Symmetric Differences: ", find_dif_matches(*min_pair))
naive_dif, bp_dif = find_naive_dif(*min_pair), find_bp_dif(*min_pair)
print("Naive Outcome Differences: ", len(naive_dif))
print("BP Outcome Differences: ", len(bp_dif))

# Sizes of the raw matchings: bipartite min, naive min, bipartite max, naive max.
for matching in (out_min, naive_min, out_max, naive_max):
    print(len(matching))


# Order both MIN matchings by the last element of each tuple and show the
# five smallest entries of each.
sorted_bp_min = sorted(out_min, key=lambda match: match[-1])
sorted_naive_min = sorted(naive_min, key=lambda match: match[-1])

print("Naive MIN", sorted_naive_min[:5])
print("\n sorted bp min", sorted_bp_min[:5])

# COUNT:

### CASE 1: Balanced 2 tables, matching n=3, filter condition age>=5

In [None]:
# COUNT / Case 1 driver: ten trials, each appending one row of results to the
# "count_trialsample" CSV table created below.
experiment_funcs.create_csv_table("count_trialsample")
for i in range(10):
    # Create Randomized Data
    # The SUM Case 1 cell unpacks five values from this same call (the extra
    # two being tables_map and perfect_mapping). Only the first three are
    # used here, so any trailing values are absorbed instead of failing with
    # "too many values to unpack".
    table_a_non_duplicated, table_b, table_a_dup, *_unused_synth_outputs = experiment_funcs.create_synth_data(100, 100, "table1", "table2", "table1_dup", 3)

    # Bipartite Matching Script
    # NOTE(review): per the section heading, 3 is presumably n and 5 the
    # age >= 5 filter value - confirm against count_bip_script.
    total_max, total_min, bip_min_matching_time, bip_max_matching_time = experiment_funcs.count_bip_script(table_a_non_duplicated, table_b, "name", 0.09, 3, 5)

    # Run Naive Matching Script
    naive_total_max, naive_total_min, naive_min_matching_time, naive_max_matching_time = experiment_funcs.count_naive_script(0.09, "table1_dup", "table2", "table1", 5)

    # Run Random Matching Script
    sampled_total_max, sampled_total_min, sampled_min_matching_time, sampled_max_matching_time = experiment_funcs.count_random_sample_script(0.09, 50, "table1_dup", "table2", "table1", 5)

    # Record Experiment Results
    # NOTE(review): the SUM Case 1 cell passes an extra trailing eval_records
    # argument to table_csv_output; the COUNT scripts return no matchings to
    # evaluate, so confirm that argument is optional.
    experiment_funcs.table_csv_output(total_min, total_max, naive_total_min, naive_total_max, sampled_total_min, sampled_total_max, "count_trialsample", bip_min_matching_time, bip_max_matching_time, naive_min_matching_time, naive_max_matching_time, sampled_min_matching_time, sampled_max_matching_time)

## SUM Case 2: The table on the "1" side of the 1-to-n matching is disproportionately larger than the table on the "n" side.
i.e., table_a has 10,000 rows, table_b has 100 rows, n=3, sim_threshold = 0.09

In [None]:
# SUM / Case 2 driver (10,000-row table vs 100-row table): five trials, each
# appending one row of results to the "unbalancedtable_sum" CSV table.
#
# The calls below follow the same signatures as the SUM Case 1 cell:
# create_synth_data yields five values, each sum_*_script yields six
# (totals, timings and the two matchings), sum_bip_script receives
# tables_map, sum_naive_script receives "table1", and the evaluation records
# are written to the CSV. The older 3-/4-value unpacking raised
# "too many values to unpack" against those return shapes.
# NOTE(review): this assumes the Case 1 cell reflects the current
# experiment_funcs API - confirm before running the long 10,000-row trials.
experiment_funcs.create_csv_table("unbalancedtable_sum")
for i in range(5):
    # Create Randomized Data
    table_a_non_duplicated, table_b, table_a_dup, tables_map, perfect_mapping = experiment_funcs.create_synth_data(10000, 100, "table1", "table2", "table1_dup", 3)

    # Bipartite Matching Script
    total_max, total_min, bip_min_matching_time, bip_max_matching_time, out_max, out_min = experiment_funcs.sum_bip_script(table_a_non_duplicated, table_b, "name", 0.09, 3, tables_map)

    # Run Naive Matching Script
    naive_total_max, naive_total_min, naive_min_matching_time, naive_max_matching_time, naive_max, naive_min = experiment_funcs.sum_naive_script(0.09, "table1_dup", "table2", "table1")

    # Run Random Matching Script
    sampled_total_max, sampled_total_min, sampled_min_matching_time, sampled_max_matching_time, sampled_max, sampled_min = experiment_funcs.sum_random_sample_script(0.09, 50, "table1_dup", "table2", "table1")

    # Run Accuracy Evaluation
    eval_records = experiment_funcs.full_evaluation(out_min, out_max, naive_min, naive_max, sampled_min, sampled_max, perfect_mapping)

    # Record Experiment Results
    experiment_funcs.table_csv_output(total_min, total_max, naive_total_min, naive_total_max, sampled_total_min, sampled_total_max, "unbalancedtable_sum", bip_min_matching_time, bip_max_matching_time, naive_min_matching_time, naive_max_matching_time, sampled_min_matching_time, sampled_max_matching_time, eval_records)

In [None]:
"""
Observation: Bipartite Matching gave the lowest outcome. Why?


Matching [Total Min, Total Max] Outcome
"""
# Each line prints the label followed by "[ min ,  max ]" for one strategy,
# using the totals left over from the most recent trial. The header above
# lists Min before Max to match the order actually printed.
for label, low, high in (
    ("Bipartite Matching: ", total_min, total_max),
    ("Naive Matching: ", naive_total_min, naive_total_max),
    ("Random Sample Matching: ", sampled_total_min, sampled_total_max),
):
    print(label, "[", low, ", ", high, "]")