# Defining the final model
Once we have obtained the continuous quality metric for join detection, we will use it to generate a predictive model that can efficiently, and in a very lightweight fashion, detect similarities between columns of a data lake.

In [187]:
import numpy as np
import pandas as pd
import math
import os
from pathlib import Path

from tqdm import tqdm
from collections import defaultdict
from itertools import product

import joblib

# Preparing the data

We will:
- Load the ground truth, which contains a subset of semantic joins, a subset of syntactic joins and a sample of the rest of joins.
- Merge it with the distances. That is, for each selected pair "add" the distances between the metrics of their respective profiles
- Remove unnecessary columns for the models (e.g. dataset and attribute names)

**Important**: the `ground_truth_models.csv` file contains all the semantic and syntactic joins detected in the data lake + a sample of joins that do not have a relationship (i.e. containment < 0.1 and no semantic link), indicated with a *null* value in the relationships cell.

In [16]:
ground_truth = pd.read_csv(f'C:/Projects/data/ground_truths/ground_truth_for_training_models.csv')
ground_truth['relationship'] = ground_truth['relationship'].fillna('unrelated') # Pairs that are neither semantic or syntactic have a NaN. We change it by unrelated to prevent problems.

count_syntactic = (ground_truth['relationship'] == 'syntactic').sum()
count_semantic = (ground_truth['relationship'] == 'semantic').sum()
count_unrelated = (ground_truth['relationship'] == 'unrelated').sum()

print(f"Number of syntactic joins: {count_syntactic}")
print(f"Number of semantic joins: {count_semantic}")
print(f"Number of unrelated pairs: {count_unrelated}")

ground_truth.describe()

Number of syntactic joins: 2703
Number of semantic joins: 1701
Number of unrelated pairs: 18206


Unnamed: 0,containment,cardinality_proportion,jaccard,multiset_jaccard,quality
count,22610.0,22610.0,22610.0,22610.0,22610.0
mean,0.107225,0.2154501,0.043204,0.010597,0.004229
std,0.242172,0.2893796,0.14975,0.050149,0.033486
min,0.0,4.251912e-07,0.0,0.0,0.0
25%,0.0,0.01015228,0.0,0.0,0.0
50%,0.0,0.06189559,0.0,0.0,0.0
75%,0.045455,0.3333333,0.00238,0.000219,2e-06
max,1.0,1.0,1.0,0.5,0.494872


In [17]:
ground_truth

Unnamed: 0,dataset_name_1,attribute_name_1,dataset_name_2,attribute_name_2,relationship,containment,cardinality_proportion,jaccard,multiset_jaccard,quality
0,AdventureWorks2014_stateprovince.csv,Name,world_country.csv,Name,unrelated,0.044199,0.757322,0.019417,0.019048,1.381513e-03
1,Distributions_data_2016.csv,demographics,Tech_sector_diversity_demographics_2016.csv,raceEthnicity,syntactic,0.230769,0.461538,0.187500,0.000186,1.041278e-05
2,USA_cars_datasets.csv,country,world_country.csv,Name,semantic,0.500000,0.008368,0.004167,0.000365,4.659192e-07
3,World_countries_env_vars.csv,Country,world_city.csv,District,unrelated,0.053498,0.177892,0.006250,0.002314,6.364830e-05
4,books_updated.csv,languageCode,countries_metadatacountries.csv,CountryCode,syntactic,0.360000,0.101215,0.034091,0.000878,1.381148e-05
...,...,...,...,...,...,...,...,...,...,...
22605,pte_sulfo.csv,Set,AdventureWorks2014_shift.csv,Name,unrelated,0.000000,0.120000,0.000000,0.000000,0.000000e+00
22606,dataSpotifyClass.csv,song_title,netflix_titles.csv,description,unrelated,0.000000,0.313414,0.000000,0.000000,0.000000e+00
22607,pte_methoxy.csv,Arg0,countries_data.csv,1997,unrelated,0.000000,0.004015,0.000000,0.000000,0.000000e+00
22608,student-mat.csv,internet,AdventureWorks2014_stateprovince.csv,Name,unrelated,0.000000,0.011050,0.000000,0.000000,0.000000e+00


In [18]:
distances = pd.read_csv(f'C:/Projects/data/ground_truths/distances_for_training_models.csv')
distances

Unnamed: 0,attribute_name_1,dataset_name_1,attribute_name_2,dataset_name_2,cardinality,incompleteness,uniqueness,entropy,frequency_avg,frequency_min,...,number_words,words_cnt_max,words_cnt_min,words_cnt_avg,words_cnt_sd,first_word,last_word,is_empty,is_binary,name_dist
0,Name,AdventureWorks2014_stateprovince.csv,Name,world_country.csv,-0.001747,0.000000,0.000000,-0.108910,0.000000,0.000000,...,-0.001025,-0.080856,0.000000,-0.002658,-0.040543,8,7,0,0,0
1,demographics,Distributions_data_2016.csv,raceEthnicity,Tech_sector_diversity_demographics_2016.csv,0.000211,0.000000,-0.135553,0.312345,0.460409,0.760277,...,0.180148,-0.020214,0.000000,-0.016396,-0.055986,4,13,0,0,12
2,country,USA_cars_datasets.csv,Name,world_country.csv,-0.007140,0.000000,-0.999200,-2.138181,0.468604,0.003709,...,0.024177,-0.121284,0.000000,-0.043013,-0.141035,8,7,0,1,7
3,Country,World_countries_env_vars.csv,District,world_city.csv,-0.033833,-0.000981,0.665114,-0.391313,-0.000744,0.000000,...,-0.055464,0.020214,0.000000,0.019129,0.053482,14,8,0,0,7
4,languageCode,books_updated.csv,CountryCode,countries_metadatacountries.csv,-0.006688,0.108400,-0.997500,-1.826932,0.133484,0.000000,...,0.097667,0.000000,0.000000,0.000000,0.000000,2,2,0,0,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22605,Set,pte_sulfo.csv,Name,AdventureWorks2014_shift.csv,0.000663,0.000000,0.000000,0.830738,0.000000,0.000000,...,0.000248,0.000000,0.000000,0.000000,0.000000,39,36,0,0,4
22606,song_title,dataSpotifyClass.csv,description,netflix_titles.csv,-0.128763,0.000000,-0.030943,-0.459097,0.000012,0.000000,...,-1.594809,-0.525564,-14.020669,-1.844184,-0.086287,104,135,0,0,9
22607,Arg0,pte_methoxy.csv,1997,countries_data.csv,-0.261564,0.000000,0.148744,-0.609951,-0.000220,0.000000,...,-0.225303,0.000000,0.000000,0.000000,0.000000,22,4,0,0,4
22608,internet,student-mat.csv,Name,AdventureWorks2014_stateprovince.csv,-0.005393,0.000000,-0.994937,-1.860018,0.073753,0.040177,...,0.001498,-0.040428,0.000000,-0.040356,-0.100492,4,6,0,1,7


In [19]:
joined = pd.merge(ground_truth, distances, on=['dataset_name_1', 'dataset_name_2', 'attribute_name_1', 'attribute_name_2'])
joined

Unnamed: 0,dataset_name_1,attribute_name_1,dataset_name_2,attribute_name_2,relationship,containment,cardinality_proportion,jaccard,multiset_jaccard,quality,...,number_words,words_cnt_max,words_cnt_min,words_cnt_avg,words_cnt_sd,first_word,last_word,is_empty,is_binary,name_dist
0,AdventureWorks2014_stateprovince.csv,Name,world_country.csv,Name,unrelated,0.044199,0.757322,0.019417,0.019048,1.381513e-03,...,-0.001025,-0.080856,0.000000,-0.002658,-0.040543,8,7,0,0,0
1,Distributions_data_2016.csv,demographics,Tech_sector_diversity_demographics_2016.csv,raceEthnicity,syntactic,0.230769,0.461538,0.187500,0.000186,1.041278e-05,...,0.180148,-0.020214,0.000000,-0.016396,-0.055986,4,13,0,0,12
2,USA_cars_datasets.csv,country,world_country.csv,Name,semantic,0.500000,0.008368,0.004167,0.000365,4.659192e-07,...,0.024177,-0.121284,0.000000,-0.043013,-0.141035,8,7,0,1,7
3,World_countries_env_vars.csv,Country,world_city.csv,District,unrelated,0.053498,0.177892,0.006250,0.002314,6.364830e-05,...,-0.055464,0.020214,0.000000,0.019129,0.053482,14,8,0,0,7
4,books_updated.csv,languageCode,countries_metadatacountries.csv,CountryCode,syntactic,0.360000,0.101215,0.034091,0.000878,1.381148e-05,...,0.097667,0.000000,0.000000,0.000000,0.000000,2,2,0,0,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22607,pte_sulfo.csv,Set,AdventureWorks2014_shift.csv,Name,unrelated,0.000000,0.120000,0.000000,0.000000,0.000000e+00,...,0.000248,0.000000,0.000000,0.000000,0.000000,39,36,0,0,4
22608,dataSpotifyClass.csv,song_title,netflix_titles.csv,description,unrelated,0.000000,0.313414,0.000000,0.000000,0.000000e+00,...,-1.594809,-0.525564,-14.020669,-1.844184,-0.086287,104,135,0,0,9
22609,pte_methoxy.csv,Arg0,countries_data.csv,1997,unrelated,0.000000,0.004015,0.000000,0.000000,0.000000e+00,...,-0.225303,0.000000,0.000000,0.000000,0.000000,22,4,0,0,4
22610,student-mat.csv,internet,AdventureWorks2014_stateprovince.csv,Name,unrelated,0.000000,0.011050,0.000000,0.000000,0.000000e+00,...,0.001498,-0.040428,0.000000,-0.040356,-0.100492,4,6,0,1,7


In [22]:
print(joined.columns)

joined.drop(['dataset_name_1', 'attribute_name_1', 'dataset_name_2', 'attribute_name_2', 
             'relationship', 'containment', 'cardinality_proportion', 'jaccard', 'multiset_jaccard'],
             axis='columns', inplace=True)

Index(['dataset_name_1', 'attribute_name_1', 'dataset_name_2',
       'attribute_name_2', 'relationship', 'containment',
       'cardinality_proportion', 'jaccard', 'multiset_jaccard', 'quality',
       'cardinality', 'incompleteness', 'uniqueness', 'entropy',
       'frequency_avg', 'frequency_min', 'frequency_max', 'frequency_sd',
       'frequency_iqr', 'val_pct_min', 'val_pct_max', 'val_pct_std',
       'constancy', 'frequency_1qo', 'frequency_2qo', 'frequency_3qo',
       'frequency_4qo', 'frequency_5qo', 'frequency_6qo', 'frequency_7qo',
       'freq_word_containment', 'freq_word_soundex_containment',
       'len_max_word', 'len_min_word', 'len_avg_word', 'number_words',
       'words_cnt_max', 'words_cnt_min', 'words_cnt_avg', 'words_cnt_sd',
       'first_word', 'last_word', 'is_empty', 'is_binary', 'name_dist'],
      dtype='object')


# Model selection

Our goal is to define the best regressor model that can approximate the true value of the joinability metric defined ($MJ$ & $K$) by using profiles.

We define four base models to do so: (i) all profile metrics, (ii) all profile metrics + lightweight feature selection, (iii) all profile metrics + lightweight feature selection and (iv) a "custom" set of features obtained after some preliminary testing.

**Important note:** we have remove the "datatype" metrics from the profiles. These metrics measured whether the column fit into an URL, email, etc. These were removed due to their elevated cost of computation (much higher than all the other metrics combined) and given that they did not contribute to the models in meaningful ways.


### Model evaluation methodology

We want to define a regression model. To do so, we will employ 17 base regressors (listed below) evaluated over a 10-split CV (test size = 30%) and 4 different metrics, of which we will primarily focus on the RMSE.

In [23]:
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.model_selection import cross_validate
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

split = ShuffleSplit(n_splits=10, test_size=0.3, random_state=211199)

def scoring(estimator, X, y):
    y_pred = estimator.predict(X)
    return {"R2": r2_score(y, y_pred),
            "MAE": mean_absolute_error(y, y_pred),
            "RMSE": math.sqrt(mean_squared_error(y, y_pred)),
            "MedAE": median_absolute_error(y, y_pred),}

names = [
    "Linear Regression",
    "Ridge Regression",
    "Lasso Regression",
    "ElasticNet Regression",
    "Random Forests",
    "Gradient Boosting",
    "AdaBoost",
    "Extra Trees",
    "Histogram Gradient Boosting",
    "XGBoosting",
    "Light GBM",
    "CatBoost",
    "MLP Relu",
    "MLP logistic",
    "MLP Tanh",
    "SVR Poly",
    "SVR Rbf"
]

regressors = [
    LinearRegression(),
    Ridge(random_state=211199),
    Lasso(random_state=211199),
    ElasticNet(random_state=211199),
    RandomForestRegressor(random_state=211199, n_estimators=50),
    GradientBoostingRegressor(random_state=211199, n_estimators=50),
    AdaBoostRegressor(random_state=211199, n_estimators=50),
    ExtraTreesRegressor(random_state=211199, n_estimators=50),
    HistGradientBoostingRegressor(random_state=211199),
    XGBRegressor(random_state=211199, n_estimators=50),
    LGBMRegressor(random_state=211199, n_estimators=50, verbose=0),
    CatBoostRegressor(random_state=211199, verbose=0),
    MLPRegressor(random_state=211199, activation = 'relu'),
    MLPRegressor(random_state=211199, activation = 'logistic'),
    MLPRegressor(random_state=211199, activation = 'tanh'),
    SVR(kernel="poly"),
    SVR(kernel="rbf"),
]

def test_list_of_regressors(predictors, target, names, regressors):
  for name, regressor in zip(names, regressors):
    print(name)
    test_regressor(regressor, predictors, target)

def test_regressor(regressor, predictors, target):
  scores = cross_validate(regressor, predictors, target, cv=split, scoring=scoring, verbose=0)
  print(f"Fit time: {scores['fit_time'].mean():.6f} | "
      f"Score time: {scores['score_time'].mean():.6f}")

  print(f"R2: {scores['test_R2'].mean():.6f} | "
      f"MAE: {scores['test_MAE'].mean():.6f} | "
      f"RMSE: {scores['test_RMSE'].mean():.6f} | "
      f"MedAE: {scores['test_MedAE'].mean():.6f}")
  print("------------------------------")

After testing, the regressors below are the ones that perform best in average. Hence, we define some functions to directly store the models associated with these base regressors.

In [24]:
models_path = 'C:/Projects/results_freyja/models'

In [25]:
best_names = [
        "gradient_boosting",
        "extra_trees",
        "xgboosting",
        "catboost"
    ]

best_regressors = [
    GradientBoostingRegressor(random_state=211199, n_estimators=50),
    ExtraTreesRegressor(random_state=211199, n_estimators=50),
    XGBRegressor(random_state=211199, n_estimators=50),
    CatBoostRegressor(random_state=211199, verbose=0),
]

def store_models(target, predictors, model_typology):
    for name, regressor in tqdm(zip(best_names, best_regressors), total=len(best_names)):
        regressor.fit(predictors, target)
        folder_path = Path(f"{models_path}/{model_typology}")
        folder_path.mkdir(parents=True, exist_ok=True)
        joblib.dump(regressor, folder_path / f"{name}_{model_typology}.pkl")

### Model 1: All metrics

Model that includes all metrics (likely to overfit and contain redundancy).

In [26]:
y_MJ = joined['quality']
predictors = joined.drop(columns=['quality'], axis=1)

test_list_of_regressors(predictors, y_MJ, names, regressors)

Linear Regression
Fit time: 0.015084 | Score time: 0.003200
R2: 0.245259 | MAE: 0.007351 | RMSE: 0.030250 | MedAE: 0.001155
------------------------------
Ridge Regression
Fit time: 0.009687 | Score time: 0.003168
R2: 0.245202 | MAE: 0.007344 | RMSE: 0.030251 | MedAE: 0.001145
------------------------------
Lasso Regression
Fit time: 0.011201 | Score time: 0.003002
R2: -0.000131 | MAE: 0.008125 | RMSE: 0.034828 | MedAE: 0.004196
------------------------------
ElasticNet Regression
Fit time: 0.010195 | Score time: 0.003102
R2: -0.000131 | MAE: 0.008125 | RMSE: 0.034828 | MedAE: 0.004196
------------------------------
Random Forests
Fit time: 21.715015 | Score time: 0.048598
R2: 0.880686 | MAE: 0.001303 | RMSE: 0.011876 | MedAE: 0.000002
------------------------------
Gradient Boosting
Fit time: 4.717526 | Score time: 0.005962
R2: 0.834697 | MAE: 0.002286 | RMSE: 0.014073 | MedAE: 0.000063
------------------------------
AdaBoost
Fit time: 0.475718 | Score time: 0.007401
R2: 0.752848 | MA

In [27]:
target = joined['quality']
predictors = joined.drop(columns=['quality'], axis=1)

store_models(target, predictors, "all")

100%|██████████| 4/4 [00:14<00:00,  3.71s/it]


### Model 2: All metrics + Lightweight Feature Selection

Model 1 + lightweight feature selection process to reduce overfitting and redundancy.

In [32]:
for name, regressor in zip(best_names, best_regressors):
  model_all = joblib.load(f"{models_path}/all/{name}_all.pkl")

  feature_importances = model_all.feature_importances_
  predictors = joined.drop(columns=['quality'], axis=1)

  # Match feature importances with corresponding feature names
  feature_names = list(predictors.columns)
  feature_importance_dict = dict(zip(feature_names, feature_importances))

  # Sort the feature importances in descending order
  sorted_feature_importances = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

  selected_metrics_fs_all = []
  for feature, importance in sorted_feature_importances:
    if importance > 0.0001:
      selected_metrics_fs_all.append(feature)

  target = joined['quality']
  predictors = joined[selected_metrics_fs_all]
  print(f"Number of features for {name} -> Original = {len(feature_names)}, new = {len(selected_metrics_fs_all)}")
  test_list_of_regressors(predictors, target, [name], [regressor])

  model = regressor.fit(predictors, target)
  folder_path = Path(f"{models_path}/all_fs_simple")
  folder_path.mkdir(parents=True, exist_ok=True)
  joblib.dump(regressor, f"{folder_path}/{name}_all_fs_simple.pkl")

Number of features for gradient_boosting -> Original = 35, new = 19
gradient_boosting
Fit time: 2.621250 | Score time: 0.005131
R2: 0.835561 | MAE: 0.002280 | RMSE: 0.014037 | MedAE: 0.000065
------------------------------
Number of features for extra_trees -> Original = 35, new = 33
extra_trees
Fit time: 2.975584 | Score time: 0.054783
R2: 0.901720 | MAE: 0.001226 | RMSE: 0.010735 | MedAE: 0.000002
------------------------------
Number of features for xgboosting -> Original = 35, new = 31
xgboosting
Fit time: 0.123266 | Score time: 0.011395
R2: 0.888968 | MAE: 0.001443 | RMSE: 0.011468 | MedAE: 0.000064
------------------------------
Number of features for catboost -> Original = 35, new = 33
catboost
Fit time: 3.008903 | Score time: 0.005605
R2: 0.900992 | MAE: 0.001417 | RMSE: 0.010765 | MedAE: 0.000154
------------------------------


### Model 3: In-depth feature selection

Model 1 + multi-layer feature selection.

In [34]:
from sklearn.feature_selection import VarianceThreshold, mutual_info_regression, RFECV
from sklearn.model_selection import KFold

def feature_selection_pipeline(dataset, model_typology_name):
    y = dataset["quality"]
    X = dataset.drop(columns=["quality"], axis=1)
    original_features = X.columns.tolist()

    print(f"Original feature count: {len(original_features)}")

    # 1.1 Variance Threshold
    var_thresh = VarianceThreshold(threshold = 0.01)
    var_thresh.fit(X)
    low_variance_removed = X.columns[~var_thresh.get_support()].tolist()
    X = X[X.columns[var_thresh.get_support()]]
    print(f"Removed low-variance features: {low_variance_removed}")
    print(f"Remaining features: {len(X.columns)}")

    # 1.2 Correlation Filter
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    high_corr_removed = [column for column in upper.columns if any(upper[column] > 0.9)]
    X = X.drop(columns=high_corr_removed)
    print(f"Removed highly correlated features: {high_corr_removed}")
    print(f"Remaining features: {len(X.columns)}")

    # 1.3 Mutual Information
    mi_scores = mutual_info_regression(X, y)
    mi_scores = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)
    low_mi_removed = mi_scores[mi_scores < 0.01].index.tolist()
    X_post_filter_methods = X[mi_scores[mi_scores >= 0.01].index]
    print(f"Removed low mutual information features: {low_mi_removed}")
    print(f"Remaining features: {len(X_post_filter_methods.columns)}")

    # Utility function for importance + RFECV steps
    def run_model_selection(model, model_name):
        nonlocal X_post_filter_methods, y

        # Feature Importance Filter
        model.fit(X_post_filter_methods, y)
        feature_importances = model.feature_importances_
        feature_names = list(X_post_filter_methods.columns)
        feature_importance_dict = dict(zip(feature_names, feature_importances))
        sorted_importances = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
        selected_metrics = [f for f, imp in sorted_importances if imp > 0.0001]

        X_temp = X_post_filter_methods[selected_metrics]
        print(f"{model_name} - Selected {len(X_temp.columns)} features after importance filtering")

        # RFECV
        rfecv = RFECV(estimator=model, step=1, cv=KFold(5), scoring='r2',verbose=3)
        rfecv.fit(X_temp, y)
        selected = X_temp.columns[rfecv.support_]
        print(f"{model_name} - Optimal number of features: {rfecv.n_features_}")
        print(f"{model_name} - Selected features: {list(selected)}")

        # Train final model
        model.fit(X_temp[selected], y)
        folder_path = Path(models_path) / model_typology_name
        folder_path.mkdir(parents=True, exist_ok=True)
        model_file = folder_path / f"{model_name}_{model_typology_name}.pkl"
        joblib.dump(model, model_file)
        print(f"Saved model to: {model_file}")

        return list(selected)

    run_model_selection(GradientBoostingRegressor(random_state=21111999, n_estimators=50), "gradient_boosting")
    run_model_selection(ExtraTreesRegressor(random_state=21111999, n_estimators=50), "extra_trees")
    run_model_selection(XGBRegressor(random_state=211199, n_estimators=50), "xgboosting")
    run_model_selection(CatBoostRegressor(random_state=211199, verbose=0), "catboost")

In [35]:
feature_selection_pipeline(joined, "all_fs_deep")

Original feature count: 35
Removed low-variance features: ['is_empty']
Remaining features: 34
Removed highly correlated features: ['constancy', 'frequency_1qo', 'frequency_2qo', 'frequency_3qo', 'frequency_4qo', 'frequency_6qo', 'frequency_7qo', 'words_cnt_sd']
Remaining features: 26
Removed low mutual information features: []
Remaining features: 26
gradient_boosting - Selected 18 features after importance filtering
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Fitting est

### Model 4: Custom
Model defined via a custom set of metrics obtained after initial testing.

In [36]:
selected_metrics_fs_no_syntactic = ['name_dist', 'frequency_max', 'uniqueness', 'first_word', 'frequency_4qo', 'freq_word_containment', 'len_avg_word', 'words_cnt_max',
                                      'frequency_6qo', 'len_max_word', 'frequency_min', 'frequency_3qo', 'is_empty', 'frequency_iqr', 'entropy', 'val_pct_std',
                                      'words_cnt_min', 'cardinality', 'words_cnt_sd', 'val_pct_max', 'len_min_word', 'words_cnt_avg']

for name, regressor in zip(best_names, best_regressors):
  target = joined['quality']
  predictors = joined[selected_metrics_fs_no_syntactic]


  model = regressor.fit(predictors, target)
  folder_path = Path(f"{models_path}/custom")
  folder_path.mkdir(parents=True, exist_ok=True)
  joblib.dump(regressor, folder_path / f"{name}_custom.pkl")

# Benchmark evaluation

Once we have define all the models, we will evaluate each of the seven selected benchmarks with all of them, with the goal of discerning which is the best one. To do so, all the distances for all query columns of each benchmark have been obtained and stored.



In [None]:
benchmark_results = {} # variable to store the individual results of the benchmarks, to later decide the best model

In [None]:
import app.core.model.evaluate_benchmark_performance as ebp

models_path = 'C:/Projects/results_freyja/models'

def evaluate_models(k, step, ground_truth_path, distances_folder_path, benchmark_name):
    results = {}

    typologies = ["all", "all_fs_simple", "all_fs_deep", "custom"]

    for name, type in tqdm(product(best_names, typologies), total=len(best_names)*len(typologies), desc="Processing models"):
        try:
            config = ebp.ModelExecutionConfig(
                k = k,
                step = step,
                ground_truth_path = Path(ground_truth_path),
                distances_folder_path = Path(distances_folder_path),
                model_path = Path(f"{models_path}/{type}/{name}_{type}.pkl")
            )
            model_execution = ebp.ModelExecution(config)
            model_results = model_execution.evaluate_benchmark(use_tqdm=False)

            results[f"{name}_{type}"] = model_results['precision']
            # print(f"Model {name}_{type} --------------")
            # print(f"Precision scores: {model_results['precision']}")
            # print(f"Recall scores: {model_results['recall']}")
            # print(f"MAP scores: {model_results['map']}")
        except FileNotFoundError:
            pass
    
    # Compute average precisions
    avg_precisions = {model_name: sum(precisions) / len(precisions) for model_name, precisions in results.items()}

    # Sort models from best to worst
    ranked_models = sorted(avg_precisions.items(), key=lambda x: x[1], reverse=True)

    # Print ranked results
    print("\n==================== MODEL RANKINGS (top 5) ====================")
    for rank, (model_name, avg_precision) in enumerate(ranked_models[:5], start=1):
        print(f"{rank:2d}. Model: {model_name:20s} | Avg Precision: {avg_precision:.4f}")
    
    benchmark_results[benchmark_name] = ranked_models


## Deciding the best base model

### Santos Small

In [167]:
k = 10
step = 1
ground_truth_path = 'C:/Projects/benchmarks/santos_small/santos_small_ground_truth.csv'
distances_folder_path = 'C:/Projects/results_freyja/distances/santos_small'

evaluate_models(k, step, ground_truth_path, distances_folder_path, "santos_small")

Processing models: 100%|██████████| 16/16 [00:33<00:00,  2.09s/it]


 1. Model: gradient_boosting_custom | Avg Precision: 0.9586
 2. Model: gradient_boosting_all | Avg Precision: 0.9490
 3. Model: gradient_boosting_all_fs_simple | Avg Precision: 0.9443
 4. Model: extra_trees_all      | Avg Precision: 0.9386
 5. Model: extra_trees_all_fs_simple | Avg Precision: 0.9369





### TUS Small

In [168]:
k = 60
step = 10
ground_truth_path = 'C:/Projects/benchmarks/tus_small/tus_small_ground_truth.csv'
distances_folder_path = 'C:/Projects/results_freyja/distances/tus_small'

evaluate_models(k, step, ground_truth_path, distances_folder_path, "tus_small")

Processing models: 100%|██████████| 16/16 [02:28<00:00,  9.26s/it]


 1. Model: gradient_boosting_custom | Avg Precision: 0.8875
 2. Model: catboost_all_fs_deep | Avg Precision: 0.8676
 3. Model: catboost_custom      | Avg Precision: 0.8425
 4. Model: gradient_boosting_all | Avg Precision: 0.8356
 5. Model: catboost_all_fs_simple | Avg Precision: 0.8272





### TUS Big

In [169]:
k = 60
step = 10
ground_truth_path = 'C:/Projects/benchmarks/tus_big/tus_big_ground_truth_sample.csv'
distances_folder_path = 'C:/Projects/results_freyja/distances/tus_big'

evaluate_models(k, step, ground_truth_path, distances_folder_path, "tus_big")

Processing models: 100%|██████████| 16/16 [08:18<00:00, 31.17s/it]


 1. Model: gradient_boosting_custom | Avg Precision: 0.9267
 2. Model: gradient_boosting_all | Avg Precision: 0.9155
 3. Model: gradient_boosting_all_fs_simple | Avg Precision: 0.9143
 4. Model: extra_trees_all      | Avg Precision: 0.9012
 5. Model: extra_trees_all_fs_deep | Avg Precision: 0.9011





### D3L

In [170]:
k = 100
step = 10
ground_truth_path = 'C:/Projects/benchmarks/d3l/d3l_ground_truth_sample.csv'
distances_folder_path = 'C:/Projects/results_freyja/distances/d3l'

evaluate_models(k, step, ground_truth_path, distances_folder_path, "d3l")

Processing models: 100%|██████████| 16/16 [01:26<00:00,  5.40s/it]


 1. Model: gradient_boosting_custom | Avg Precision: 0.7788
 2. Model: catboost_custom      | Avg Precision: 0.6327
 3. Model: catboost_all_fs_deep | Avg Precision: 0.6283
 4. Model: xgboosting_custom    | Avg Precision: 0.6259
 5. Model: gradient_boosting_all_fs_simple | Avg Precision: 0.6054





### Freyja

In [171]:
k = 10
step = 1
ground_truth_path = 'C:/Projects/benchmarks/freyja/freyja_ground_truth.csv'
distances_folder_path = 'C:/Projects/results_freyja/distances/freyja'

evaluate_models(k, step, ground_truth_path, distances_folder_path, "freyja")

Processing models: 100%|██████████| 16/16 [00:15<00:00,  1.03it/s]


 1. Model: gradient_boosting_custom | Avg Precision: 0.9624
 2. Model: gradient_boosting_all | Avg Precision: 0.9387
 3. Model: gradient_boosting_all_fs_simple | Avg Precision: 0.9363
 4. Model: gradient_boosting_all_fs_deep | Avg Precision: 0.9213
 5. Model: xgboosting_custom    | Avg Precision: 0.8898





### OM CG

In [172]:
k = 30
step = 5
ground_truth_path = 'C:/Projects/benchmarks/omnimatch_city_government/omnimatch_city_government_ground_truth.csv'
distances_folder_path = 'C:/Projects/results_freyja/distances/omnimatch_city_government'

evaluate_models(k, step, ground_truth_path, distances_folder_path, "omnimatch_city_government")

Processing models: 100%|██████████| 16/16 [00:22<00:00,  1.43s/it]


 1. Model: gradient_boosting_custom | Avg Precision: 0.5763
 2. Model: catboost_custom      | Avg Precision: 0.5552
 3. Model: extra_trees_all_fs_deep | Avg Precision: 0.5526
 4. Model: extra_trees_all      | Avg Precision: 0.5483
 5. Model: extra_trees_custom   | Avg Precision: 0.5448





### OM CR

In [173]:
k = 30
step = 5
ground_truth_path = 'C:/Projects/benchmarks/omnimatch_culture_recreation/omnimatch_culture_recreation_ground_truth.csv'
distances_folder_path = 'C:/Projects/results_freyja/distances/omnimatch_culture_recreation'

evaluate_models(k, step, ground_truth_path, distances_folder_path, "omnimatch_culture_recreation")

Processing models: 100%|██████████| 16/16 [00:23<00:00,  1.45s/it]


 1. Model: gradient_boosting_custom | Avg Precision: 0.5996
 2. Model: extra_trees_all      | Avg Precision: 0.5490
 3. Model: extra_trees_all_fs_simple | Avg Precision: 0.5444
 4. Model: gradient_boosting_all_fs_simple | Avg Precision: 0.5439
 5. Model: gradient_boosting_all | Avg Precision: 0.5436





In [174]:
def obtain_best_overall_model(benchmark_results):
    # Accumulate scores
    scores_sum = defaultdict(float)
    scores_count = defaultdict(int)

    for benchmark, results in benchmark_results.items():
        for model, score in results:
            scores_sum[model] += score
            scores_count[model] += 1

    # Compute average score per model
    avg_scores = {
        model: scores_sum[model] / scores_count[model]
        for model in scores_sum
    }

    # Get top 3 models
    top_3 = sorted(avg_scores.items(), key=lambda x: x[1], reverse=True)[:3]

    print("Top 3 models with average scores:")
    for model, score in top_3:
        print(f"{model:35s} {score:.4f}")

    print("\nScores per benchmark for top 3 models:")
    for benchmark, results in benchmark_results.items():
        print(f"\nBenchmark: {benchmark}")
        results_dict = dict(results)
        for model, _ in top_3:
            score = results_dict.get(model, None)
            print(f"{model:35s} {score}")

obtain_best_overall_model(benchmark_results)

Top 3 models with average scores:
gradient_boosting_custom            0.8128
gradient_boosting_all               0.7505
gradient_boosting_all_fs_simple     0.7456

Scores per benchmark for top 3 models:

Benchmark: santos_small
gradient_boosting_custom            0.9585700000000001
gradient_boosting_all               0.94899
gradient_boosting_all_fs_simple     0.9443400000000001

Benchmark: tus_small
gradient_boosting_custom            0.8875333333333333
gradient_boosting_all               0.8356166666666667
gradient_boosting_all_fs_simple     0.8213666666666666

Benchmark: tus_big
gradient_boosting_custom            0.9267166666666666
gradient_boosting_all               0.9154833333333333
gradient_boosting_all_fs_simple     0.9142833333333334

Benchmark: d3l
gradient_boosting_custom            0.7788399999999999
gradient_boosting_all               0.5924699999999999
gradient_boosting_all_fs_simple     0.60544

Benchmark: freyja
gradient_boosting_custom            0.9624200000000002
gr

## Fine-tuning
The best base model is clearly the gradient boosting, specially that with the custom metrics. The most likely explanation is its reduced overfitting, which makes it able to adapt to many benchmarks, as opposed to, for example, xgboost.

Now, we will fine-tune the model to extract even more effectivenes out of the join detection process. We will generate 48 models from the initial gradient boosting, following a grid search over relevant parameters. Each model will be evaluated on each of the seven benchmarks, and we will obtain the best overall model (same as before).

In [113]:
models_path = "C:/Projects/results_freyja/models/fine_tuning"
folder_path = Path(models_path)
folder_path.mkdir(parents=True, exist_ok=True)

selected_metrics = [
    'name_dist', 'frequency_max', 'uniqueness', 'first_word',
    'frequency_4qo', 'freq_word_containment', 'len_avg_word',
    'words_cnt_max', 'frequency_6qo', 'len_max_word',
    'frequency_min', 'frequency_3qo', 'is_empty',
    'frequency_iqr', 'entropy', 'val_pct_std',
    'words_cnt_min', 'cardinality', 'words_cnt_sd',
    'val_pct_max', 'len_min_word', 'words_cnt_avg'
]

predictors = joined[selected_metrics]
target = joined['quality']

param_grid = {
    "n_estimators": [25, 50, 100],
    "learning_rate": [0.05, 0.1],
    "max_depth": [3, 5],
    "subsample": [0.8, 1.0],
    "min_samples_leaf": [1, 10]
}

# Wrap the product iterator with tqdm
for values in tqdm(list(product(*param_grid.values())), total=len(list(product(*param_grid.values())))):
    params = dict(zip(param_grid.keys(), values))

    regressor = GradientBoostingRegressor(random_state=211199, **params)
    regressor.fit(predictors, target)

    model_name = (
        f"gradient_boosting_"
        f"ne{params['n_estimators']}_"
        f"lr{params['learning_rate']}_"
        f"md{params['max_depth']}_"
        f"ss{params['subsample']}_"
        f"msl{params['min_samples_leaf']}.pkl"
    )

    joblib.dump(regressor, folder_path / model_name)

100%|██████████| 48/48 [04:31<00:00,  5.65s/it]


In [118]:
benchmark_results_fine_tuning = {}

In [None]:
def evaluate_models_fine_tuned(k, step, ground_truth_path, distances_folder_path, benchmark_name):
    results = {}
    fine_tuned_models_path = 'C:/Projects/results_freyja/models/fine_tuning'

    for filename in tqdm(os.listdir(fine_tuned_models_path), desc="Processing models"):
        model_name = os.path.splitext(filename)[0]
        file_path = os.path.join(fine_tuned_models_path, filename)
        try:
            config = ebp.ModelExecutionConfig(
                k = k,
                step = step,
                ground_truth_path = Path(ground_truth_path),
                distances_folder_path = Path(distances_folder_path),
                model_path = Path(file_path)
            )
            model_execution = ebp.ModelExecution(config)

            model_results = model_execution.evaluate_benchmark(use_tqdm=False)
            results[model_name] = model_results['precision']
            # print(f"Model {model_name} --------------")
            # print(f"Precision scores: {model_results['precision']}")
            # print(f"Recall scores: {model_results['recall']}")
            # print(f"MAP scores: {model_results['map']}")
        except FileNotFoundError:
            pass
    
    # Compute average precisions
    avg_precisions = {model_name: sum(precisions) / len(precisions) for model_name, precisions in results.items()}

    # Sort models from best to worst
    ranked_models = sorted(avg_precisions.items(), key=lambda x: x[1], reverse=True)

    # Print ranked results
    print("\n==================== MODEL RANKINGS (top 5) ====================")
    for rank, (model_name, avg_precision) in enumerate(ranked_models[:5], start=1):
        print(f"{rank:2d}. Model: {model_name:20s} | Avg Precision: {avg_precision:.4f}")
    
    benchmark_results_fine_tuning[benchmark_name] = ranked_models


In [161]:
k = 10
step = 1
ground_truth_path = 'C:/Projects/benchmarks/santos_small/santos_small_ground_truth.csv'
distances_folder_path = 'C:/Projects/results_freyja/distances/santos_small'

evaluate_models_fine_tuned(k, step, ground_truth_path, distances_folder_path, "santos_small")

100%|██████████| 48/48 [01:21<00:00,  1.69s/it]


 1. Model: gradient_boosting_ne25_lr0.1_md3_ss1.0_msl1 | Avg Precision: 0.9719
 2. Model: gradient_boosting_ne50_lr0.05_md3_ss1.0_msl1 | Avg Precision: 0.9714
 3. Model: gradient_boosting_ne50_lr0.05_md3_ss1.0_msl10 | Avg Precision: 0.9712
 4. Model: gradient_boosting_ne25_lr0.1_md3_ss1.0_msl10 | Avg Precision: 0.9706
 5. Model: gradient_boosting_ne50_lr0.05_md3_ss0.8_msl1 | Avg Precision: 0.9659





In [176]:
k = 60
step = 10
ground_truth_path = 'C:/Projects/benchmarks/tus_small/tus_small_ground_truth.csv'
distances_folder_path = 'C:/Projects/results_freyja/distances/tus_small'

evaluate_models_fine_tuned(k, step, ground_truth_path, distances_folder_path, "tus_small")

Processing models: 100%|██████████| 48/48 [06:16<00:00,  7.85s/it]


 1. Model: gradient_boosting_ne50_lr0.1_md3_ss0.8_msl10 | Avg Precision: 0.8958
 2. Model: gradient_boosting_ne50_lr0.05_md5_ss0.8_msl1 | Avg Precision: 0.8950
 3. Model: gradient_boosting_ne50_lr0.05_md5_ss0.8_msl10 | Avg Precision: 0.8944
 4. Model: gradient_boosting_ne100_lr0.1_md3_ss0.8_msl10 | Avg Precision: 0.8936
 5. Model: gradient_boosting_ne25_lr0.1_md5_ss1.0_msl1 | Avg Precision: 0.8936





In [177]:
k = 60
step = 10
ground_truth_path = 'C:/Projects/benchmarks/tus_big/tus_big_ground_truth_sample.csv'
distances_folder_path = 'C:/Projects/results_freyja/distances/tus_big'

evaluate_models_fine_tuned(k, step, ground_truth_path, distances_folder_path, "tus_big")

Processing models: 100%|██████████| 48/48 [22:46<00:00, 28.46s/it]


 1. Model: gradient_boosting_ne25_lr0.1_md3_ss1.0_msl10 | Avg Precision: 0.9381
 2. Model: gradient_boosting_ne50_lr0.1_md3_ss0.8_msl10 | Avg Precision: 0.9358
 3. Model: gradient_boosting_ne50_lr0.05_md3_ss1.0_msl10 | Avg Precision: 0.9351
 4. Model: gradient_boosting_ne25_lr0.1_md3_ss1.0_msl1 | Avg Precision: 0.9350
 5. Model: gradient_boosting_ne50_lr0.05_md3_ss1.0_msl1 | Avg Precision: 0.9326





In [178]:
k = 100
step = 10
ground_truth_path = 'C:/Projects/benchmarks/d3l/d3l_ground_truth_sample.csv'
distances_folder_path = 'C:/Projects/results_freyja/distances/d3l'

evaluate_models_fine_tuned(k, step, ground_truth_path, distances_folder_path, "d3l")

Processing models: 100%|██████████| 48/48 [03:31<00:00,  4.40s/it]


 1. Model: gradient_boosting_ne50_lr0.1_md3_ss0.8_msl10 | Avg Precision: 0.8005
 2. Model: gradient_boosting_ne50_lr0.05_md5_ss0.8_msl1 | Avg Precision: 0.7993
 3. Model: gradient_boosting_ne100_lr0.05_md3_ss0.8_msl10 | Avg Precision: 0.7980
 4. Model: gradient_boosting_ne50_lr0.1_md3_ss1.0_msl10 | Avg Precision: 0.7930
 5. Model: gradient_boosting_ne100_lr0.05_md3_ss1.0_msl10 | Avg Precision: 0.7843





In [179]:
k = 10
step = 1
ground_truth_path = 'C:/Projects/benchmarks/freyja/freyja_ground_truth.csv'
distances_folder_path = 'C:/Projects/results_freyja/distances/freyja'

evaluate_models_fine_tuned(k, step, ground_truth_path, distances_folder_path, "freyja")

Processing models: 100%|██████████| 48/48 [00:34<00:00,  1.39it/s]


 1. Model: gradient_boosting_ne50_lr0.1_md3_ss1.0_msl1 | Avg Precision: 0.9624
 2. Model: gradient_boosting_ne25_lr0.1_md3_ss0.8_msl1 | Avg Precision: 0.9623
 3. Model: gradient_boosting_ne25_lr0.1_md3_ss1.0_msl1 | Avg Precision: 0.9578
 4. Model: gradient_boosting_ne100_lr0.05_md3_ss0.8_msl1 | Avg Precision: 0.9576
 5. Model: gradient_boosting_ne100_lr0.05_md3_ss1.0_msl10 | Avg Precision: 0.9570





In [180]:
k = 30
step = 5
ground_truth_path = 'C:/Projects/benchmarks/omnimatch_city_government/omnimatch_city_government_ground_truth.csv'
distances_folder_path = 'C:/Projects/results_freyja/distances/omnimatch_city_government'

evaluate_models_fine_tuned(k, step, ground_truth_path, distances_folder_path, "omnimatch_city_government")

Processing models: 100%|██████████| 48/48 [00:53<00:00,  1.11s/it]


 1. Model: gradient_boosting_ne100_lr0.05_md3_ss0.8_msl10 | Avg Precision: 0.6111
 2. Model: gradient_boosting_ne100_lr0.1_md3_ss0.8_msl10 | Avg Precision: 0.6018
 3. Model: gradient_boosting_ne50_lr0.1_md3_ss0.8_msl1 | Avg Precision: 0.5990
 4. Model: gradient_boosting_ne100_lr0.1_md3_ss1.0_msl10 | Avg Precision: 0.5947
 5. Model: gradient_boosting_ne100_lr0.1_md3_ss1.0_msl1 | Avg Precision: 0.5942





In [181]:
k = 30
step = 5
ground_truth_path = 'C:/Projects/benchmarks/omnimatch_culture_recreation/omnimatch_culture_recreation_ground_truth.csv'
distances_folder_path = 'C:/Projects/results_freyja/distances/omnimatch_culture_recreation'

evaluate_models_fine_tuned(k, step, ground_truth_path, distances_folder_path, "omnimatch_culture_recreation")

Processing models: 100%|██████████| 48/48 [00:53<00:00,  1.11s/it]


 1. Model: gradient_boosting_ne100_lr0.05_md3_ss1.0_msl10 | Avg Precision: 0.6089
 2. Model: gradient_boosting_ne100_lr0.05_md3_ss0.8_msl1 | Avg Precision: 0.6034
 3. Model: gradient_boosting_ne50_lr0.05_md3_ss1.0_msl10 | Avg Precision: 0.5997
 4. Model: gradient_boosting_ne50_lr0.1_md3_ss1.0_msl1 | Avg Precision: 0.5996
 5. Model: gradient_boosting_ne25_lr0.1_md3_ss1.0_msl10 | Avg Precision: 0.5994





In [183]:
obtain_best_overall_model(benchmark_results_fine_tuning)

Top 3 models with average scores:
gradient_boosting_ne100_lr0.05_md3_ss0.8_msl10 0.8176
gradient_boosting_ne100_lr0.05_md3_ss1.0_msl10 0.8163
gradient_boosting_ne50_lr0.1_md3_ss0.8_msl10 0.8157

Scores per benchmark for top 3 models:

Benchmark: santos_small
gradient_boosting_ne100_lr0.05_md3_ss0.8_msl10 0.9577000000000002
gradient_boosting_ne100_lr0.05_md3_ss1.0_msl10 0.9558500000000001
gradient_boosting_ne50_lr0.1_md3_ss0.8_msl10 0.9512700000000001

Benchmark: tus_small
gradient_boosting_ne100_lr0.05_md3_ss0.8_msl10 0.8925500000000001
gradient_boosting_ne100_lr0.05_md3_ss1.0_msl10 0.88755
gradient_boosting_ne50_lr0.1_md3_ss0.8_msl10 0.8957833333333333

Benchmark: tus_big
gradient_boosting_ne100_lr0.05_md3_ss0.8_msl10 0.9279999999999999
gradient_boosting_ne100_lr0.05_md3_ss1.0_msl10 0.9293166666666667
gradient_boosting_ne50_lr0.1_md3_ss0.8_msl10 0.9357666666666667

Benchmark: d3l
gradient_boosting_ne100_lr0.05_md3_ss0.8_msl10 0.7980099999999999
gradient_boosting_ne100_lr0.05_md3_ss1.0

The `gradient_boosting_ne100_lr0.05_md3_ss0.8_msl1` model is the best performing model across all benchmarks. Next, we print the specific results for this model for all benchmarks.

In [185]:
def evaluate_single_model(k, step, ground_truth_path, distances_folder_path, benchmark_name):
    try:
        config = ebp.ModelExecutionConfig(
            k = k,
            step = step,
            ground_truth_path = Path(ground_truth_path),
            distances_folder_path = Path(distances_folder_path),
            model_path = Path("C:/Projects/FREYJA/app/core/model/gradient_boosting_ne100_lr0.05_md3_ss0.8_msl10.pkl")
        )
        model_execution = ebp.ModelExecution(config)

        model_results = model_execution.evaluate_benchmark()
        print(f"Model {model_name}, benchmark {benchmark_name}")
        print(f"Precision scores: {model_results['precision']}")
        print(f"Recall scores: {model_results['recall']}")
        print(f"MAP scores: {model_results['map']}")
        print(f"-----------------------")
    except FileNotFoundError:
        pass


ground_truth_path = 'C:/Projects/benchmarks/santos_small/santos_small_ground_truth.csv'
distances_folder_path = 'C:/Projects/results_freyja/distances/santos_small'
evaluate_single_model(10, 1, ground_truth_path, distances_folder_path, "santos_small")

ground_truth_path = 'C:/Projects/benchmarks/tus_small/tus_small_ground_truth.csv'
distances_folder_path = 'C:/Projects/results_freyja/distances/tus_small'
evaluate_single_model(60, 10, ground_truth_path, distances_folder_path, "tus_small")

ground_truth_path = 'C:/Projects/benchmarks/tus_big/tus_big_ground_truth_sample.csv'
distances_folder_path = 'C:/Projects/results_freyja/distances/tus_big'
evaluate_single_model(60, 10, ground_truth_path, distances_folder_path, "tus_big")

ground_truth_path = 'C:/Projects/benchmarks/d3l/d3l_ground_truth_sample.csv'
distances_folder_path = 'C:/Projects/results_freyja/distances/d3l'
evaluate_single_model(100, 10, ground_truth_path, distances_folder_path, "d3l")

ground_truth_path = 'C:/Projects/benchmarks/freyja/freyja_ground_truth.csv'
distances_folder_path = 'C:/Projects/results_freyja/distances/freyja'
evaluate_single_model(10, 1, ground_truth_path, distances_folder_path, "freyja")

ground_truth_path = 'C:/Projects/benchmarks/omnimatch_city_government/omnimatch_city_government_ground_truth.csv'
distances_folder_path = 'C:/Projects/results_freyja/distances/omnimatch_city_government'
evaluate_single_model(30, 5, ground_truth_path, distances_folder_path, "omnimatch_city_government")

ground_truth_path = 'C:/Projects/benchmarks/omnimatch_culture_recreation/omnimatch_culture_recreation_ground_truth.csv'
distances_folder_path = 'C:/Projects/results_freyja/distances/omnimatch_culture_recreation'
evaluate_single_model(30, 5, ground_truth_path, distances_folder_path, "omnimatch_culture_recreation")

100%|██████████| 50/50 [00:01<00:00, 28.78it/s]


Model gradient_boosting_ne50_lr0.1_md5_ss1.0_msl10, benchmark santos_small
Precision scores: [1.0, 0.99, 0.9667, 0.965, 0.956, 0.95, 0.9457, 0.94, 0.9356, 0.928]
Recall scores: [1.0, 0.988, 0.968, 0.9659, 0.9583, 0.951, 0.9466, 0.9398, 0.9346, 0.9261]
MAP scores: [1.0, 1.0, 1.0, 0.9967, 0.9961, 0.9952, 0.9931, 0.9917, 0.9909, 0.9905]
-----------------------


100%|██████████| 100/100 [00:07<00:00, 12.50it/s]


Model gradient_boosting_ne50_lr0.1_md5_ss1.0_msl10, benchmark tus_small
Precision scores: [0.973, 0.902, 0.839, 0.8682, 0.8834, 0.8897]
Recall scores: [0.9787, 0.9091, 0.847, 0.8758, 0.8906, 0.897]
MAP scores: [0.9803, 0.9778, 0.9692, 0.928, 0.9155, 0.9117]
-----------------------


100%|██████████| 100/100 [00:28<00:00,  3.49it/s]


Model gradient_boosting_ne50_lr0.1_md5_ss1.0_msl10, benchmark tus_big
Precision scores: [0.962, 0.9535, 0.935, 0.9198, 0.9042, 0.8935]
Recall scores: [0.9151, 0.9099, 0.876, 0.8448, 0.8101, 0.7849]
MAP scores: [0.9879, 0.9836, 0.9831, 0.9812, 0.9697, 0.9628]
-----------------------


100%|██████████| 100/100 [00:04<00:00, 22.59it/s]


Model gradient_boosting_ne50_lr0.1_md5_ss1.0_msl10, benchmark d3l
Precision scores: [0.827, 0.8265, 0.824, 0.8108, 0.791, 0.7973, 0.7987, 0.7854, 0.7681, 0.7513]
Recall scores: [0.8276, 0.826, 0.8218, 0.8079, 0.7877, 0.7921, 0.7914, 0.777, 0.7593, 0.7419]
MAP scores: [0.7438, 0.8069, 0.8156, 0.8223, 0.8237, 0.8209, 0.8225, 0.8264, 0.8297, 0.8332]
-----------------------


100%|██████████| 50/50 [00:00<00:00, 69.40it/s]


Model gradient_boosting_ne50_lr0.1_md5_ss1.0_msl10, benchmark freyja
Precision scores: [1.0, 0.98, 0.9667, 0.955, 0.948, 0.9467, 0.9457, 0.94, 0.9267, 0.924]
Recall scores: [1.0, 0.9843, 0.9755, 0.9633, 0.9539, 0.9507, 0.949, 0.9424, 0.9276, 0.9246]
MAP scores: [1.0, 1.0, 1.0, 0.99, 0.9853, 0.9797, 0.9764, 0.9745, 0.9736, 0.9712]
-----------------------


100%|██████████| 50/50 [00:01<00:00, 44.44it/s]


Model gradient_boosting_ne50_lr0.1_md5_ss1.0_msl10, benchmark omnimatch_city_government
Precision scores: [0.624, 0.618, 0.5867, 0.607, 0.6184, 0.6127]
Recall scores: [0.6226, 0.6181, 0.5868, 0.6073, 0.619, 0.6132]
MAP scores: [0.622, 0.654, 0.6523, 0.6357, 0.6335, 0.6342]
-----------------------


100%|██████████| 50/50 [00:01<00:00, 43.88it/s]

Model gradient_boosting_ne50_lr0.1_md5_ss1.0_msl10, benchmark omnimatch_culture_recreation
Precision scores: [0.508, 0.558, 0.6027, 0.604, 0.6128, 0.6087]
Recall scores: [0.5042, 0.5559, 0.6, 0.6011, 0.6101, 0.6048]
MAP scores: [0.5016, 0.5569, 0.5762, 0.5889, 0.598, 0.6053]
-----------------------



