### Load libraries

In [227]:
# standard libraries
import pandas as pd
import numpy as np
import shutil
import os
from tqdm import tqdm
import glob
import pickle
import torch

### Utility functions

In [228]:
def load_all_data(dataset_path):
    """Read the three CSV files that make up one dataset.

    Args:
        dataset_path: Directory containing ``tableA.csv``, ``tableB.csv``
            and ``test.csv``.

    Returns:
        A ``(tableA, tableB, test)`` tuple of DataFrames, in that order.
    """
    csv_names = ('tableA.csv', 'tableB.csv', 'test.csv')
    # Files are read in the listed order, so the tuple positions are fixed.
    return tuple(
        pd.read_csv(os.path.join(dataset_path, csv_name))
        for csv_name in csv_names
    )

def retrieve_testing_sample_only(dataset_path):
    """Return only the rows of tableA and tableB referenced by the test pairs.

    Args:
        dataset_path: Directory passed on to ``load_all_data``.

    Returns:
        A ``(testing_tableA, testing_tableB)`` tuple. Each table keeps the
        rows whose ``id`` appears in the test file's ``ltable_id`` (table A)
        or ``rtable_id`` (table B) column, with the index reset to 0..n-1.
    """
    left_table, right_table, pairs = load_all_data(dataset_path)

    def _rows_with_ids(table, wanted_ids):
        # Keep the original row order; only the index is renumbered.
        keep_mask = table['id'].isin(wanted_ids)
        return table[keep_mask].reset_index(drop=True)

    left_ids = pairs['ltable_id'].to_list()
    right_ids = pairs['rtable_id'].to_list()

    return _rows_with_ids(left_table, left_ids), _rows_with_ids(right_table, right_ids)

def prepare_dataset(table, with_labels=True):
    """Return a copy of ``table`` with an extra ``attribute`` text column.

    For every row the first column (treated as the row id) is skipped,
    missing values are dropped, and the remaining values are joined with
    single spaces.

    Args:
        table: DataFrame whose first column is the row identifier.
        with_labels: When true, every value is wrapped in tags named after
            its column (``<col> value </col>``); when false the bare values
            are joined.

    Returns:
        A new DataFrame; the input table is not modified.
    """
    def _serialize(sample):
        # Skip the id column by position *before* dropping NaNs. Dropping
        # first would make the slice remove the first real attribute
        # whenever the id itself is missing.
        values = sample.iloc[1:].dropna()
        if with_labels:
            return ' '.join(f'<{k}> {v} </{k}>' for k, v in values.to_dict().items())
        return ' '.join(str(x) for x in values)

    new_table = table.copy()
    new_table['attribute'] = new_table.apply(_serialize, axis=1)
    return new_table

### Import and perform testing

In [234]:
### paths
## csv files
dataset_table_dir_path = "data/datasets"
# Each entry of the dataset directory is one dataset, and its directory name
# doubles as the dataset name. Taking the names straight from the listing
# (instead of splitting the joined path on a backslash) works on POSIX as
# well as on Windows. os.listdir returns entries in arbitrary order, so sort
# them to make the processing order reproducible.
datasets_name = sorted(os.listdir(dataset_table_dir_path))
# Path of each dataset's CSV directory, relative to dataset_table_dir_path.
datasets_path = [os.path.join(dataset_name, 'exp_data') for dataset_name in datasets_name]

In [235]:
# Echo the dataset names derived from the dataset directory listing.
datasets_name

['abt_buy_exp_data',
 'dirty_dblp_acm_exp_data',
 'dirty_dblp_scholar_exp_data',
 'dirty_itunes_amazon_exp_data',
 'dirty_walmart_amazon_exp_data']

In [237]:
# Make sure the output directory exists before any pickle is written.
os.makedirs('data/datasets_with_embeddings', exist_ok=True)

# For every dataset, attach the stored record embeddings of each model type
# (with and without separators) as list-valued columns and pickle the tables.
for dataset_name, dataset_path in zip(datasets_name, datasets_path):
    ## retrieve tables
    testing_tableA, testing_tableB = retrieve_testing_sample_only(os.path.join(dataset_table_dir_path, dataset_path))
    for type_of_model in ['simce', 'supervised']:
        embedding_dir_path = f'embeddings2/{type_of_model}_embeddings'
        for labels in [0,1]:
            ## retrieve embeddings
            embedding_dir_path_final = os.path.join(embedding_dir_path, dataset_name, 'with_separator') if labels else os.path.join(embedding_dir_path, dataset_name)
            # glob returns matches in arbitrary order; sort so that the file
            # used for table A vs. table B does not depend on the filesystem.
            # NOTE(review): assumes table A's file name sorts before table B's
            # -- confirm against the files on disk.
            files = sorted(glob.glob(f'{embedding_dir_path_final}/*.pt'))
            if len(files) < 2:
                raise FileNotFoundError(
                    f'expected two .pt files in {embedding_dir_path_final}, found {len(files)}'
                )
            record_embeddings_A = torch.load(files[0])
            record_embeddings_B = torch.load(files[1])
            ## add to dataframe
            # NOTE(review): assignment is positional, so the embedding rows are
            # assumed to be in the same order as the filtered table rows -- confirm.
            testing_tableA[f'{type_of_model}_{labels}'] = record_embeddings_A.tolist()
            testing_tableB[f'{type_of_model}_{labels}'] = record_embeddings_B.tolist()
    ## saves
    testing_tableA.to_pickle(f'data/datasets_with_embeddings/{dataset_name}_tableA.pickle')
    testing_tableB.to_pickle(f'data/datasets_with_embeddings/{dataset_name}_tableB.pickle')

In [None]:
# Make sure the output directory exists before any pickle is written.
os.makedirs('data/datasets_with_embeddings2', exist_ok=True)

# Same as the multi-model loop, but reads the embeddings directly from
# 'embeddings2/<dataset>' and stores them under the 'supervised_*' columns.
for dataset_name, dataset_path in zip(datasets_name, datasets_path):
    ## retrieve tables
    testing_tableA, testing_tableB = retrieve_testing_sample_only(os.path.join(dataset_table_dir_path, dataset_path))
    embedding_dir_path = 'embeddings2'
    for labels in [0,1]:
        ## retrieve embeddings
        embedding_dir_path_final = os.path.join(embedding_dir_path, dataset_name, 'with_separator') if labels else os.path.join(embedding_dir_path, dataset_name)
        # glob returns matches in arbitrary order; sort so that the file used
        # for table A vs. table B does not depend on the filesystem.
        # NOTE(review): assumes table A's file name sorts before table B's
        # -- confirm against the files on disk.
        files = sorted(glob.glob(f'{embedding_dir_path_final}/*.pt'))
        if len(files) < 2:
            raise FileNotFoundError(
                f'expected two .pt files in {embedding_dir_path_final}, found {len(files)}'
            )
        record_embeddings_A = torch.load(files[0])
        record_embeddings_B = torch.load(files[1])
        ## add to dataframe
        # NOTE(review): assignment is positional, so the embedding rows are
        # assumed to be in the same order as the filtered table rows -- confirm.
        testing_tableA[f'supervised_{labels}'] = record_embeddings_A.tolist()
        testing_tableB[f'supervised_{labels}'] = record_embeddings_B.tolist()
    ## saves
    testing_tableA.to_pickle(f'data/datasets_with_embeddings2/{dataset_name}_tableA.pickle')
    testing_tableB.to_pickle(f'data/datasets_with_embeddings2/{dataset_name}_tableB.pickle')

#### 1. Cosine Similarity

In [238]:
from numpy import dot
from numpy.linalg import norm
from sklearn.metrics import confusion_matrix

def cosine_similarity(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Args:
        a: First vector (any array-like accepted by numpy's ``dot``/``norm``).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``, or ``0.0`` when either vector
        has zero norm (the quotient would otherwise be NaN).
    """
    denominator = norm(a) * norm(b)
    if denominator == 0:
        # A zero vector has no direction: report "no similarity" rather
        # than dividing by zero and propagating NaN.
        return 0.0
    return dot(a, b) / denominator

In [239]:
# results[dataset_name][modality] -> {'precision', 'recall', 'f1'} obtained by
# predicting "match" whenever the cosine similarity of a pair exceeds 0.5.
# NOTE(review): the recorded outputs show recall 1.0 on every dataset, i.e.
# every test pair clears the 0.5 threshold -- the threshold probably needs tuning.
results = {}
for dataset_name, dataset_path in zip(datasets_name, datasets_path):
    results[dataset_name] = {}
    # load the dataset
    tableA = pd.read_pickle(f'data/datasets_with_embeddings2/{dataset_name}_tableA.pickle')
    tableB = pd.read_pickle(f'data/datasets_with_embeddings2/{dataset_name}_tableB.pickle')
    test = pd.read_csv(os.path.join('data/datasets', dataset_path, 'test.csv'))

    ## merge
    # The 'simce_*' columns are only written to data/datasets_with_embeddings;
    # the datasets_with_embeddings2 pickles read here hold 'supervised_*' only.
    #modalities = ['simce_0','simce_1','supervised_0','supervised_1']
    modalities = ['supervised_0','supervised_1']
    for modality in modalities:
        output = test.copy()
        output = pd.merge(tableA[['id',modality]].rename(columns = {'id': 'ltable_id'}), output, on = 'ltable_id')
        output = pd.merge(tableB[['id',modality]].rename(columns = {'id': 'rtable_id'}), output, on = 'rtable_id')
        # Both sides of the second merge carry a `modality` column, so pandas
        # suffixes them: `_x` is table B's embedding, `_y` is table A's.

        preds = pd.Series(
            [
                int(cosine_similarity(emb_x, emb_y) > 0.5)
                for emb_x, emb_y in zip(output[f'{modality}_x'], output[f'{modality}_y'])
            ],
            index=output.index,
            dtype=int,
        )
        # labels=[0, 1] forces a 2x2 matrix even when only one class occurs;
        # without it ravel() yields a single value and the unpacking fails.
        tn, fp, fn, tp = confusion_matrix(output.label, preds, labels=[0, 1], normalize = 'all').ravel()
        # Guard the ratios so a missing class gives 0.0 instead of NaN.
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2*(precision*recall)/(precision+recall) if (precision + recall) > 0 else 0.0

        results[dataset_name][modality] = {
            'precision': precision,
            'recall': recall,
            'f1': f1
        }

In [240]:
# Flatten results[dataset][modality] into a dict keyed by
# (dataset, modality) tuples, which pandas turns into MultiIndex columns.
reform = {
    (dataset, modality): metrics
    for dataset in results
    for modality, metrics in results[dataset].items()
}

In [226]:
# Report only the 'supervised_0' column of each dataset.
key_to_keep = [
    (dataset, 'supervised_0')
    for dataset in (
        'abt_buy_exp_data',
        'dirty_dblp_acm_exp_data',
        'dirty_dblp_scholar_exp_data',
        'dirty_itunes_amazon_exp_data',
        'dirty_walmart_amazon_exp_data',
    )
]

# Render the selected (dataset, modality) columns as a LaTeX table.
pd.DataFrame({key: reform[key] for key in key_to_keep}).to_latex()

'\\begin{tabular}{lrrrrr}\n\\toprule\n & abt_buy_exp_data & dirty_dblp_acm_exp_data & dirty_dblp_scholar_exp_data & dirty_itunes_amazon_exp_data & dirty_walmart_amazon_exp_data \\\\\n & supervised_0 & supervised_0 & supervised_0 & supervised_0 & supervised_0 \\\\\n\\midrule\nprecision & 0.107516 & 0.179612 & 0.186671 & 0.247706 & 0.094192 \\\\\nrecall & 1.000000 & 1.000000 & 1.000000 & 1.000000 & 1.000000 \\\\\nf1 & 0.194156 & 0.304527 & 0.314613 & 0.397059 & 0.172168 \\\\\n\\bottomrule\n\\end{tabular}\n'

In [241]:
# Show every (dataset, modality) column with its precision / recall / f1 rows.
pd.DataFrame(data=reform)

Unnamed: 0_level_0,abt_buy_exp_data,abt_buy_exp_data,dirty_dblp_acm_exp_data,dirty_dblp_acm_exp_data,dirty_dblp_scholar_exp_data,dirty_dblp_scholar_exp_data,dirty_itunes_amazon_exp_data,dirty_itunes_amazon_exp_data,dirty_walmart_amazon_exp_data,dirty_walmart_amazon_exp_data
Unnamed: 0_level_1,supervised_0,supervised_1,supervised_0,supervised_1,supervised_0,supervised_1,supervised_0,supervised_1,supervised_0,supervised_1
precision,0.107516,0.107516,0.179539,0.179539,0.186574,0.186346,0.247706,0.247706,0.094192,0.094192
recall,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
f1,0.194156,0.194156,0.304422,0.304422,0.314475,0.314151,0.397059,0.397059,0.172168,0.172168
