### Maximal Matching (maximum-weight matching on the SUM-weighted graph)

In [None]:
import sys
from os import path
sys.path.insert(0, '../src')
import one_to_n

import datetime
import textdistance
import editdistance
import pandas as pd
import networkx as nx
import re

In [None]:
# Load the Amazon and Google product tables through the project's loader.
table_a = one_to_n.lat_convert_df("../Amazon-GoogleProducts/Amazon.csv")
table_b = one_to_n.lat_convert_df("../Amazon-GoogleProducts/GoogleProducts.csv")

# Build the bipartite graph on the "title" column (0.5 is passed as the
# threshold argument) and measure the wall-clock time the construction takes.
now = datetime.datetime.now()
bipartite_graph_result = one_to_n.valcomp_treshold_updated_maximal_construct_graph(
    table_a,
    table_b,
    "title",
    0.5,
)
timing_tresh = (datetime.datetime.now() - now).total_seconds()

print("---- Timing for Graph Construction with Treshold Constraint ----")
print(timing_tresh, "seconds")

In [None]:
def SUM_edit_edge_weight(bip_graph):
    """Set every edge weight to the sum of the numbers embedded in its endpoints.

    Each endpoint identifier is split on "_" and its third field is used.
    All non-digit characters are removed from that field before it is
    converted to ``float``; note that "." is removed as well, so a field
    such as "12.5" is read as 125.0.

    Args:
        bip_graph: Graph-like object whose ``edges(data=True)`` yields
            ``(u, v, attrs)`` triples with string node identifiers.

    Returns:
        The same ``bip_graph`` object; its edge attribute dicts are updated
        in place.

    Raises:
        IndexError: If an identifier has fewer than three "_"-separated fields.
        ValueError: If the selected field contains no digits.
    """
    strip_non_digits = re.compile("[^0-9]").sub

    for left_node, right_node, attrs in bip_graph.edges(data=True):
        left_fields = left_node.split("_")
        right_fields = right_node.split("_")

        left_digits = strip_non_digits("", left_fields[2])
        right_digits = strip_non_digits("", right_fields[2])

        attrs['weight'] = float(left_digits) + float(right_digits)

    return bip_graph

# Re-weight the graph built earlier. SUM_edit_edge_weight updates the edge
# attributes of its argument and returns that same object, so
# `sum_weighted_graph` and `bipartite_graph_result` refer to one graph here.
sum_weighted_graph = SUM_edit_edge_weight(bipartite_graph_result)

In [None]:
# Run networkx's maximum-weight matching on the SUM-weighted graph and report
# how long the matching took together with the size of the graph.
print("\n\n 'SUM' MAXIMAL MATCHING:")

now = datetime.datetime.now()
matching_set_maximal = nx.algorithms.matching.max_weight_matching(
    sum_weighted_graph
)
timing_match = (datetime.datetime.now() - now).total_seconds()

print("---- Timing for Matching (Done on the graph constructed with the treshold constraint) ----")
print(timing_match, "seconds")
print(
    "The number of edges in the graph is:",
    sum_weighted_graph.number_of_edges(),
    "\n",
)


# print("The Maximal Matching Set is:", matching_set_maximal, "\n")

### Minimal Matching

In [None]:
# For every edge, print the second "_"-separated field of both endpoint
# identifiers as a tuple.
data_edge = sum_weighted_graph.edges()
for i in data_edge:
    first, second = i[0].split("_")[1], i[1].split("_")[1]
    print((first, second))

# ------------------------------------------------------------------------------

# Minimal Matching (computed by inverting the edge weights and running maximum-weight matching)

In [None]:
def minimal_matching(sum_weighted_graph):
    """Compute a low-weight matching by inverting weights and maximising.

    The function works on ``sum_weighted_graph.copy()``. Every edge weight
    ``w`` in the copy is replaced by ``max_weight - w`` (where ``max_weight``
    is the largest weight in the graph), and networkx's maximum-weight
    matching is run on the result.

    NOTE(review): edges that carry the largest original weight end up with
    weight 0, and ``max_weight_matching`` is called with its default
    arguments (no ``maxcardinality``), so the result is not guaranteed to be
    a maximum-cardinality matching - confirm this is the intended notion of
    "minimal".

    Args:
        sum_weighted_graph: Graph whose edges all carry a numeric ``'weight'``
            attribute.

    Returns:
        A set of matched node pairs as returned by
        ``nx.algorithms.matching.max_weight_matching``; an empty set when the
        graph has no edges.

    Raises:
        KeyError: If an edge has no ``'weight'`` attribute.
    """
    new_graph = sum_weighted_graph.copy()

    weights = [attrs['weight'] for _, _, attrs in new_graph.edges(data=True)]
    if not weights:
        # Without edges nothing can be matched; max() would raise ValueError
        # on the empty sequence, so answer with the empty matching directly.
        return set()

    max_weight = max(weights)
    for _, _, attrs in new_graph.edges(data=True):
        attrs['weight'] = max_weight - attrs['weight']

    return nx.algorithms.matching.max_weight_matching(new_graph)

In [None]:
# Report whether the weighted graph is bipartite, then time the
# inverted-weight matching and print the matching and the graph size.
print("\n\n 'SUM' MINIMAL MATCHING RESULTS:")
print(nx.bipartite.is_bipartite(sum_weighted_graph))

now = datetime.datetime.now()
matching_set_minimal = minimal_matching(sum_weighted_graph)
timing_match = (datetime.datetime.now() - now).total_seconds()

print("The Minimal Matching Set is:", matching_set_minimal, "\n")
print("---- Timing for Matching (Done on the graph constructed with the treshold constraint) ----")
print(timing_match, "seconds")
print(
    "The number of edges in the graph is:",
    sum_weighted_graph.number_of_edges(),
    "\n",
)

# ------------------------------------------------------------------------------

In [None]:
# Bare expression: the connectivity of the weighted graph is evaluated but not
# stored or printed, so it is only visible as the notebook cell's output.
nx.is_connected(sum_weighted_graph)

In [None]:
"""

Constructs a bipartite graph between the rows of the given two tables.
Unlike the threshold-based construction, this edited version does not filter by
similarity: the threshold check is disabled, so every row of the first table is
connected to every row of the second table.
The edge weights take into account the **values** of the rows in this implementation

Input: Any 2 files in any format
Output: A Bipartite Graph with Maximal Weights
"""
def edited_valcomp_construct_graph(file_one, file_n, col_to_dup, treshold_decimal, n_duplicates=3):
    """Build a complete weighted bipartite graph between the rows of two tables.

    Every row of the first table (after duplication) is connected to every
    row of the second table. No similarity threshold is applied in this
    variant, so the result contains ``len(table_a) * len(table_b)`` edges.

    Node identifiers are ``"<key>_<comparison value>_1"`` for the first table
    and ``"<key>_<comparison value>_2"`` for the second; the trailing 1/2
    keeps entries of the two tables apart. For the first table the comparison
    value is the part of ``val[0]`` before its first "_".

    The weight of an edge is the sum of the two rows' numeric values, read
    from index 3 of each row's value sequence after removing every non-digit
    character.

    NOTE(review): "." is removed as well, so "12.5" is read as 125.0 while
    "12.50" is read as 1250.0 - confirm that the values are integer-like.

    Args:
        file_one: Input for the first table; passed to ``one_to_n.convert_df``.
        file_n: Input for the second table; passed to ``one_to_n.convert_df``.
        col_to_dup: Column name forwarded to ``one_to_n.create_duplicates``
            for the first table.
        treshold_decimal: Unused in this variant (the threshold check is
            disabled); kept so existing callers keep working.
        n_duplicates: Third argument forwarded to
            ``one_to_n.create_duplicates``. Defaults to 3, the value that was
            previously hard-coded.

    Returns:
        A ``networkx.Graph`` with one weighted edge per pair of rows.

    Raises:
        ValueError: If a row's value at index 3 contains no digits.
    """
    table_a_unprocessed = one_to_n.convert_df(file_one)
    table_b_unprocessed = one_to_n.convert_df(file_n)
    table_a_unprocessed = one_to_n.create_duplicates(
        table_a_unprocessed, col_to_dup, n_duplicates
    )

    table_a = one_to_n.make_dict(table_a_unprocessed)
    table_b = one_to_n.make_dict(table_b_unprocessed)

    strip_non_digits = re.compile("[^0-9]").sub

    # Identifier and numeric value of a second-table row do not depend on the
    # first-table row, so compute them once instead of once per pair.
    right_nodes = [
        (
            str(key2) + '_' + str(val2[0]) + '_2',
            float(strip_non_digits("", str(val2[3]))),
        )
        for key2, val2 in table_b.items()
    ]

    # Progress is reported against the number of pairs actually visited.
    total_pairs = len(table_a) * len(right_nodes)
    pairs_done = 0

    bipartite_graph = nx.Graph()

    for key1, val1 in table_a.items():
        comp_point_1 = val1[0].split("_")[0]
        id1 = str(key1) + '_' + str(comp_point_1) + '_1'
        num1 = float(strip_non_digits("", str(val1[3])))

        for id2, num2 in right_nodes:
            pairs_done += 1
            if pairs_done % 100000 == 0:
                print(str(round(100 * pairs_done / total_pairs, 2)) + '% complete')

            bipartite_graph.add_edge(id1, id2, weight=num1 + num2)

    return bipartite_graph


In [None]:
# Time the construction of the unfiltered graph; the result replaces the
# graph previously stored in `bipartite_graph_result`.
now = datetime.datetime.now()
bipartite_graph_result = edited_valcomp_construct_graph(
    table_a,
    table_b,
    "title",
    0.5,
)
timing_tresh = (datetime.datetime.now() - now).total_seconds()

print("---- Timing for Graph Construction with Treshold Constraint ----")
print(timing_tresh, "seconds")

In [None]:
# Debugging aids that used to sit here (edge listings, connected-component
# counts, a bipartiteness check and an edge-count print) were all disabled
# and are omitted.

# Print whether the graph is connected, then time networkx's minimum-weight
# full matching on it. No `top_nodes` argument is passed to the matcher.
print(nx.is_connected(bipartite_graph_result))

now = datetime.datetime.now()
matching_set_minimal = nx.algorithms.bipartite.matching.minimum_weight_full_matching(
    bipartite_graph_result
)
timing_match = (datetime.datetime.now() - now).total_seconds()

print("The Minimal Matching Set is:", matching_set_minimal, "\n")
print("---- Timing for Matching (Done on the graph constructed with the treshold constraint) ----")
print(timing_match, "seconds")


In [None]:
# NOTE(review): neither `remaining` nor `inside` is assigned by any executable
# statement in the visible part of this file, so running this cell as-is
# raises NameError - confirm where these names are meant to come from.
print(len(remaining))
print(inside)

In [None]:
# print(list(bipartite_graph_result.edges()))