In [25]:
import os
import pandas as pd
from valentine import valentine_match, valentine_metrics
from valentine.algorithms import Coma, Cupid,DistributionBased,JaccardLevenMatcher,SimilarityFlooding

from improve_tool import neg_entropy, generate_matchings, inconsistent_or_consistent, p_ans, p_ans_v, cost_func, p_v_ans
import pprint
import math
from abc import ABCMeta, abstractmethod

# Input tables to align. Swap in the commented pair to run on the authors data.
path1 = "./data/purchase1.csv"
path2 = "./data/purchase2.csv"
# path1 = "./data/authors1.csv"
# path2 = "./data/authors2.csv"

# Minimum similarity a Cupid correspondence must reach to be kept.
CUPID_MIN_SCORE = 0.82

df1 = pd.read_csv(path1)
df2 = pd.read_csv(path2)
c_matcher = Coma(max_n=3, strategy="COMA_OPT")
cu_matcher = Cupid()
distribution_based_matcher = DistributionBased()
jl_matcher = JaccardLevenMatcher()
sf_matcher = SimilarityFlooding()
# Only these three matchers are run below; the DistributionBased and
# SimilarityFlooding instances are created but not used in this cell.
matcher_list = [c_matcher, cu_matcher, jl_matcher]

# One dict of {((table, column), (table, column)): score} per matcher, in
# matcher_list order, with surrounding whitespace stripped from column names.
match_list = []
for idx, matcher in enumerate(matcher_list):
    matches = valentine_match(df1, df2, matcher)
    # Select the thresholded matcher by type rather than by list position, so
    # reordering matcher_list cannot silently move the filter to another matcher.
    apply_threshold = isinstance(matcher, Cupid)
    # Snapshot the keys first: the dict is re-keyed in place while we walk it.
    iter_keys = list(matches.keys())
    for c in iter_keys:
        clean_key = ((c[0][0], c[0][1].strip()), (c[1][0], c[1][1].strip()))
        matches[clean_key] = matches.pop(c)
        if apply_threshold and matches[clean_key] < CUPID_MIN_SCORE:
            matches.pop(clean_key)
    match_list.append(matches)

In [26]:
# Inspect the correspondences kept for the second matcher in matcher_list (Cupid).
match_list[1]

{(('table_1', 'Delivery Date'), ('table_2', 'delivery date')): 1.0,
 (('table_1', 'Total Price'),
  ('table_2', 'product price')): 0.9555555555555555,
 (('table_1', 'Order status'),
  ('table_2', 'current status')): 0.9356643356643357,
 (('table_1', 'Unit Price'), ('table_2', 'product price')): 0.9111111111111112,
 (('table_1', 'Application Date'),
  ('table_2', 'delivery date')): 0.9076923076923078,
 (('table_1', 'Product'), ('table_2', 'product name')): 0.8857142857142857,
 (('table_1', 'Product ID'), ('table_2', 'ID')): 0.8588235294117648,
 (('table_1', 'Purchasing Department'),
  ('table_2', 'Application Department')): 0.8500000000000001,
 (('table_1', 'Product'),
  ('table_2', 'product requirement')): 0.8400000000000001,
 (('table_1', 'Product'), ('table_2', 'product price')): 0.8400000000000001,
 (('table_1', 'Supplier Name'),
  ('table_2', 'Supplier Company')): 0.8400000000000001,
 (('table_1', 'Order ID'), ('table_2', 'ID')): 0.8400000000000001,
 (('table_1', 'Purchasing Manage

In [27]:
from collections import Counter

def gen_info(matches):
    """Build the candidate matchings for one matcher's correspondences.

    Counts how many correspondences share each source column (the first
    element of every key in ``matches``), multiplies those counts together,
    prints the product, and hands everything to ``generate_matchings``.

    Args:
        matches: mapping whose keys are ``(source, target)`` pairs.

    Returns:
        The ``(matchings, prob, correspondences)`` triple produced by
        ``generate_matchings``.
    """
    source_counts = Counter(pair[0] for pair in matches.keys())

    # Product of the per-source candidate counts; presumably the number of
    # matchings when every source column picks exactly one candidate
    # (TODO confirm against generate_matchings).
    num_match = 1
    for count in source_counts.values():
        num_match *= count
    print("num of matches: ",num_match)

    matchings, prob, correspondences = generate_matchings(matches, source_counts, num_match)
    return matchings, prob, correspondences

In [28]:
def merge_matchings(matchings, prob):
    """Flatten per-matcher matchings and probabilities into two parallel lists.

    The probabilities are renormalised so that the merged list sums to 1.
    New lists are returned; the nested input lists are left untouched, so the
    function can be called repeatedly on the same arguments.

    Args:
        matchings: list of lists, one inner list of matchings per matcher.
        prob: list of lists of probabilities, parallel to ``matchings``.

    Returns:
        ``(merged_matchings, normalised_probabilities)``. Both are empty lists
        when the inputs are empty.

    Raises:
        ZeroDivisionError: if the probabilities are non-empty but sum to zero.
    """
    merged = [matching for group in matchings for matching in group]
    flat_prob = [p for group in prob for p in group]

    total = sum(flat_prob)
    normalised = [p / total for p in flat_prob]
    return merged, normalised

In [29]:
def correspondence_info(matchings, prob):
    """Collect the distinct correspondences and their total probability.

    Args:
        matchings: list of matchings, each an iterable of correspondences.
        prob: probabilities, indexed in parallel with ``matchings``.

    Returns:
        ``(correspondences, totals)``: the distinct correspondences in order of
        first appearance, and for each one the sum of ``prob[i]`` over every
        matching ``i`` that contains it.
    """
    # Membership is tested by equality so unhashable correspondences still work.
    distinct = []
    for matching in matchings:
        for item in matching:
            if item in distinct:
                continue
            distinct.append(item)

    def _total_for(item):
        # A matching contributes once, however often it repeats the item.
        acc = 0
        for pos in range(len(matchings)):
            if item in matchings[pos]:
                acc += prob[pos]
        return acc

    totals = [_total_for(item) for item in distinct]

    assert len(distinct)==len(totals)
    return distinct, totals

In [30]:
# Run gen_info on each matcher's correspondences, collecting the per-matcher
# matchings and probabilities, then merge them into one flat pair of lists
# whose probabilities are renormalised by merge_matchings.
m_l,p_l = [],[]
for i in match_list:
    # c_s (the correspondences returned by gen_info) is not used in this cell.
    matchings,prob,c_s = gen_info(i)
    m_l.append(matchings)
    p_l.append(prob)
matches_all,prob_all = merge_matchings(m_l, p_l)
# Sanity check: exactly one probability per merged matching.
assert len(matches_all) == len(prob_all)

num of matches:  4
num of matches:  18
num of matches:  1


In [31]:
# Distinct correspondences across all merged matchings, each paired with the
# summed probability of the matchings that contain it.
c, c_prob = correspondence_info(matches_all, prob_all)

In [32]:
import json
def save(file_name,save_dic, data_dir="./data"):
    """Write ``save_dic`` as indented JSON to ``<data_dir>/<file_name>``.

    Args:
        file_name: name of the file to create inside ``data_dir``.
        save_dic: JSON-serialisable object to write.
        data_dir: target directory, ``./data`` by default. It is not created
            here, so it must already exist.

    Raises:
        OSError: if the file cannot be opened for writing.
        TypeError: if ``save_dic`` contains values ``json.dump`` cannot encode.
    """
    path = f"{data_dir}/{file_name}"
    # ensure_ascii=False emits non-ASCII characters verbatim, so the encoding is
    # pinned to UTF-8 instead of relying on the platform's locale default.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(save_dic,f, ensure_ascii=False, indent=2)

# Bundle the correspondence-level results (c, c_prob) and the matching-level
# results (matches_all, prob_all) and write them to ./data/purchase.json.
data = {"correspondence_set":c, "c_prob":c_prob, "matchings":matches_all,"prob_all":prob_all}
save("purchase.json", data)