# Experiments Using a Synthetic Dataset

In [1]:
import sys
from os import path
sys.path.insert(0, '../src')
import one_to_n
sys.path.insert(0, '../tests')
import create_synthetic_dataset
import datetime
import textdistance
import editdistance
import pandas as pd
import networkx as nx
import re

In [2]:
# Build the two synthetic tables; table_b is the second table after
# add_typo has been applied to it together with table_a.
table_a = create_synthetic_dataset.create_first_df(10)
table_b_non_typo = create_synthetic_dataset.create_second_df(10)
table_b = create_synthetic_dataset.add_typo(table_a, table_b_non_typo)

print(table_a, table_b, sep="\n")

# Time only the graph construction (matched on the "name" column, threshold 0.5).
now = datetime.datetime.now()
bipartite_graph_result = one_to_n.keycomp_treshold_updated_maximal_construct_graph(
    table_a, table_b, "name", 0.5
)
elapsed = datetime.datetime.now() - now
timing_tresh = elapsed.total_seconds()

print("---- Timing for Graph Construction with Treshold Constraint ----")
print(timing_tresh, "seconds")

---- Timing for Graph Construction with Treshold Constraint ----
0.009145 seconds


In [3]:
def SUM_edit_edge_weight(bip_graph):
    """Set every edge's ``weight`` to the sum of its two endpoints' values.

    Node ids are strings that embed the node's value as a parenthesised
    token, e.g. ``'Mario Guzman_0_(8,)_1'`` or ``'Tina MooneyM_(4,)_2_'``.
    The token sits at a different underscore-separated position in those
    two layouts (and names may themselves contain ``_``), so it is located
    by its parentheses instead of by a fixed field index.

    Args:
        bip_graph: Graph-like object whose ``edges(data=True)`` yields
            ``(u, v, attr_dict)`` triples with string node ids.

    Returns:
        The same ``bip_graph`` object. Edge attribute dicts are updated in
        place; no copy is made.

    Raises:
        ValueError: If a node's value token contains no digits.
        IndexError: If a node id has no parenthesised token and fewer than
            three underscore-separated fields.
    """
    def _node_value(node_id):
        # Use the last "(...)" group so parentheses inside a name cannot
        # shadow the value; fall back to the historical third "_" field
        # when the id carries no parenthesised token at all.
        groups = re.findall(r"\(([^()]*)\)", node_id)
        token = groups[-1] if groups else node_id.split("_")[2]
        # Keep digits only, e.g. "8," -> "8".
        # NOTE(review): a multi-element token such as "(1, 2)" collapses to
        # 12, and signs / decimal points are dropped -- confirm values are
        # always single non-negative integers.
        return float(re.sub("[^0-9]", "", token))

    for u, v, d in bip_graph.edges(data=True):
        d['weight'] = _node_value(u) + _node_value(v)

    return bip_graph

# SUM_edit_edge_weight writes the weights onto the graph it is given and
# returns that same object, so sum_weighted_graph and bipartite_graph_result
# refer to one graph after this line.
sum_weighted_graph = SUM_edit_edge_weight(bipartite_graph_result)

## Maximal Matching (maximum-weight matching on the SUM-weighted graph)

In [7]:
# Time a maximum-weight matching on the SUM-weighted graph and report it.
print("\n\n 'SUM' MAXIMAL MATCHING:")

now = datetime.datetime.now()
matching_set_maximal = nx.algorithms.matching.max_weight_matching(
    sum_weighted_graph
)
elapsed = datetime.datetime.now() - now
timing_match = elapsed.total_seconds()

print("The Maximal Matching Set is:", matching_set_maximal, "\n")
print("---- Timing for Matching (Done on the graph constructed with the treshold constraint) ----")
print(timing_match, "seconds")
print("The number of edges in the graph is:", sum_weighted_graph.number_of_edges(), "\n")



 'SUM' MAXIMAL MATCHING:
The Maximal Matching Set is: {('Kenneth ThompsonxtX_(2,)_2_', 'Mario Guzman_0_(8,)_1'), ('Amber Vasquez_1_(8,)_1', 'Colleen WaltersqLf_(8,)_2_'), ('Colleen Walters_0_(6,)_1', 'Tina MooneyM_(4,)_2_'), ('Mario GuzmanAJu_(2,)_2_', 'Nicholas Haynes_1_(7,)_1'), ('Mario Guzman_2_(8,)_1', 'Tina MooneyCj_(4,)_2_'), ('Mario Guzman_1_(8,)_1', 'Christopher FisherXZg_(7,)_2_'), ('Douglas BrockIKw_(7,)_2_', 'Amber Vasquez_0_(8,)_1'), ('Nicholas Haynes_0_(7,)_1', 'Jonathan ChaseNTS_(1,)_2_'), ('Nicholas Haynes_2_(7,)_1', 'Keith MayergyU_(0,)_2_'), ('Amber Vasquez_2_(8,)_1', 'Nicholas HaynesuBR_(1,)_2_')} 

---- Timing for Matching (Done on the graph constructed with the treshold constraint) ----
0.008359 seconds
The number of edges in the graph is: 300 



## Minimal Matching (minimum-weight matching via inverted edge weights)

In [5]:
def minimal_matching(sum_weighted_graph):
    """Return a minimum-total-weight matching of maximum cardinality.

    The edge weights of a copy of the graph are inverted to
    ``max_weight - weight`` and a maximum-weight matching is computed on
    that copy, which corresponds to a minimum-weight matching on the
    original weights.

    ``maxcardinality=True`` is required: after inversion the heaviest
    original edges have weight 0, and without the flag the solver may leave
    them (and the nodes they would cover) out of the matching.

    Args:
        sum_weighted_graph: Graph whose edges all carry a numeric
            ``'weight'`` attribute. The inversion is applied to
            ``sum_weighted_graph.copy()``.
            NOTE(review): this relies on ``copy()`` giving independent edge
            attribute dicts (as networkx ``Graph.copy`` documents) so the
            caller's weights stay untouched.

    Returns:
        A set of ``(u, v)`` node pairs forming the matching; an empty set
        when the graph has no edges.
    """
    new_graph = sum_weighted_graph.copy()

    weights = [d['weight'] for _, _, d in new_graph.edges(data=True)]
    if not weights:
        # Nothing to match; also avoids max() raising on an empty sequence.
        return set()

    max_weight = max(weights)
    for _, _, d in new_graph.edges(data=True):
        d['weight'] = max_weight - d['weight']

    return nx.algorithms.matching.max_weight_matching(new_graph, maxcardinality=True)

In [6]:
# Confirm the graph is bipartite, then time minimal_matching on it and report.
print("\n\n 'SUM' MINIMAL MATCHING RESULTS:")
print(nx.bipartite.is_bipartite(sum_weighted_graph))

now = datetime.datetime.now()
matching_set_minimal = minimal_matching(sum_weighted_graph)
elapsed = datetime.datetime.now() - now
timing_match = elapsed.total_seconds()

print("The Minimal Matching Set is:", matching_set_minimal, "\n")
print("---- Timing for Matching (Done on the graph constructed with the treshold constraint) ----")
print(timing_match, "seconds")
print("The number of edges in the graph is:", sum_weighted_graph.number_of_edges(), "\n")



 'SUM' MINIMAL MATCHING RESULTS:
True
The Minimal Matching Set is: {('Nicholas HaynesuBR_(1,)_2_', 'Keith Mayer_2_(1,)_1'), ('Colleen WaltersqLf_(8,)_2_', 'Keith Mayer_0_(1,)_1'), ('Keith Mayer_1_(1,)_1', 'Tina MooneyCj_(4,)_2_'), ('Jonathan Chase_2_(2,)_1', 'Christopher FisherXZg_(7,)_2_'), ('Jonathan Chase_0_(2,)_1', 'Mario GuzmanAJu_(2,)_2_'), ('Tina MooneyM_(4,)_2_', 'Christopher Fisher_0_(3,)_1'), ('Kenneth Thompson_1_(2,)_1', 'Keith MayergyU_(0,)_2_'), ('Jonathan ChaseNTS_(1,)_2_', 'Kenneth Thompson_0_(2,)_1'), ('Kenneth ThompsonxtX_(2,)_2_', 'Jonathan Chase_1_(2,)_1'), ('Douglas BrockIKw_(7,)_2_', 'Kenneth Thompson_2_(2,)_1')} 

---- Timing for Matching (Done on the graph constructed with the treshold constraint) ----
0.010139 seconds
The number of edges in the graph is: 300 

