In [None]:
import os
import pandas as pd
from valentine import valentine_match, valentine_metrics
from valentine.algorithms import Coma, Cupid, DistributionBased, SimilarityFlooding, JaccardLevenMatcher

In [34]:
# Match the columns of a single table pair and save the similarity scores as a CSV matrix.
name = 'pair_10'
data_path = f'./Data/{name}/'
df1 = pd.read_csv(os.path.join(data_path, 'Table1.csv'))
df2 = pd.read_csv(os.path.join(data_path, 'Table2.csv'))

matcher = SimilarityFlooding()
matches = valentine_match(df1, df2, matcher)

# Pivot the flat {(left, right): score} mapping into a nested dict keyed by the
# second element of each side (presumably the column name -- confirm against
# valentine's key layout). Outer keys become DataFrame columns, inner keys rows.
result = {}
for pair, score in matches.items():
    result.setdefault(pair[0][1], {})[pair[1][1]] = score

result_path = f'./Result/{name}.csv'
# to_csv does not create missing parent directories; make sure the output
# folder exists so the (slow) matching above is not wasted on a write error.
os.makedirs(os.path.dirname(result_path), exist_ok=True)
df = pd.DataFrame(result)
df.to_csv(result_path)

In [39]:
# Parse the ground-truth mapping: one tuple per line, with surrounding '<', '>'
# and newline characters stripped and fields split on ', '.
# The file is opened in a `with` block so the handle is closed deterministically,
# and blank lines are skipped so they do not become bogus ('',) entries.
with open(os.path.join(data_path, 'mapping.txt')) as mapping_file:
    ground_truth = [
        tuple(line.strip('<>\n').split(', '))
        for line in mapping_file
        if line.strip()
    ]

def one_to_one_matches(matches: dict):
    """Greedily reduce a set of scored matches to a one-to-one mapping.

    Matches are visited from highest to lowest similarity. A match is kept when
    neither of its two sides has been used by an already-kept match and its
    similarity is at least the cutoff; the cutoff is the element at position
    ceil(n / 2) of the n distinct similarity values sorted in descending order.

    Args:
        matches: Mapping from a 2-element key ``(left, right)`` to a similarity
            score. Both key elements must be hashable.

    Returns:
        A new dict with the retained matches. If ``matches`` contains fewer
        than two distinct scores, the input mapping itself is returned unchanged.
    """
    distinct_scores = set(matches.values())

    # With fewer than two distinct scores there is no meaningful cutoff.
    if len(distinct_scores) < 2:
        return matches

    # A set has no defined order, so sort before picking a positional cutoff.
    ranked_scores = sorted(distinct_scores, reverse=True)
    # (n + 1) // 2 == ceil(n / 2); integer arithmetic avoids needing `math`.
    median = ranked_scores[(len(ranked_scores) + 1) // 2]

    used = set()
    matches1to1 = dict()

    # Stable descending sort: ties keep their original relative order, and the
    # early exit below is valid regardless of how the caller ordered the input.
    ranked_matches = sorted(matches.items(), key=lambda item: item[1], reverse=True)
    for key, similarity in ranked_matches:
        if similarity < median:
            # Everything after this point scores lower still.
            break
        if key[0] in used or key[1] in used:
            continue
        matches1to1[key] = similarity
        used.add(key[0])
        used.add(key[1])
    return matches1to1

# Score the matches of the single pair against its ground truth and display the result.
# NOTE(review): one_to_one_matches defined above is not called here -- confirm whether
# all_metrics performs an equivalent filtering step internally.
metrics = valentine_metrics.all_metrics(matches, ground_truth)
metrics

{'precision': 0.8333333333333334,
 'recall': 0.625,
 'f1_score': 0.7142857142857143,
 'precision_at_10_percent': 0.5,
 'precision_at_30_percent': 0.2,
 'precision_at_50_percent': 0.14,
 'precision_at_70_percent': 0.1,
 'precision_at_90_percent': 0.08888888888888889,
 'recall_at_sizeof_ground_truth': 0.625}

# Evaluation Methods

In [None]:
# Benchmark every matcher on every training pair.
# metrics[p][m] is the metrics dict of matcher m (in `matchers` order) on the p-th pair.
matchers = [Coma, Cupid, DistributionBased, SimilarityFlooding, JaccardLevenMatcher]
metrics = []
for i in range(1, 15):
    path = f'./dataset/raw/Training Data/pair_{i}/'
    df1 = pd.read_csv(os.path.join(path, 'Table1.csv'))
    df2 = pd.read_csv(os.path.join(path, 'Table2.csv'))
    # Close the mapping file deterministically and ignore blank lines, which
    # would otherwise be parsed as bogus ('',) ground-truth entries.
    with open(os.path.join(path, 'mapping.txt')) as mapping_file:
        ground_truth = [
            tuple(line.strip('<>\n').split(', '))
            for line in mapping_file
            if line.strip()
        ]

    sub_metrics = []
    for matcher in matchers:
        # Instantiate a fresh matcher (default settings) for each pair.
        matches = valentine_match(df1, df2, matcher())
        sub_metrics.append(valentine_metrics.all_metrics(matches, ground_truth))

    metrics.append(sub_metrics)

In [14]:
# Average every metric over all evaluated pairs, producing one dict per matcher
# in the same order as `matchers`. The matcher count is taken from `matchers`
# rather than hard-coded, so adding or removing a matcher keeps this cell correct.
average_metrics = []
for i in range(len(matchers)):
    totals = {}
    for pair_metrics in metrics:
        for key, value in pair_metrics[i].items():
            totals[key] = totals.get(key, 0) + value

    # When `metrics` is empty, `totals` is empty too, so no division happens.
    average_metrics.append({key: total / len(metrics) for key, total in totals.items()})

average_metrics

[{'precision': 0.6904761904761905,
  'recall': 0.5291666666666666,
  'f1_score': 0.5891434307583376,
  'precision_at_10_percent': 0.7142857142857143,
  'precision_at_30_percent': 0.7142857142857143,
  'precision_at_50_percent': 0.6904761904761906,
  'precision_at_70_percent': 0.6726190476190476,
  'precision_at_90_percent': 0.6321428571428571,
  'recall_at_sizeof_ground_truth': 0.5791666666666666},
 {'precision': 0.6678571428571428,
  'recall': 0.40535714285714286,
  'f1_score': 0.4799863670164422,
  'precision_at_10_percent': 0.7142857142857143,
  'precision_at_30_percent': 0.6857142857142857,
  'precision_at_50_percent': 0.6547619047619048,
  'precision_at_70_percent': 0.5416666666666666,
  'precision_at_90_percent': 0.4979591836734694,
  'recall_at_sizeof_ground_truth': 0.4583333333333333},
 {'precision': 0.21428571428571427,
  'recall': 0.04107142857142857,
  'f1_score': 0.06825396825396826,
  'precision_at_10_percent': 0.21428571428571427,
  'precision_at_30_percent': 0.2142857142