In [None]:
import numpy as np
import pandas as pd
import math
from pathlib import Path

from tqdm import tqdm
import time

import joblib

# Preparing the data

We will:
- Load the ground truth, which contains a subset of semantic joins, a subset of syntactic joins and a sample of the rest of joins.
- Merge it with the distances. That is, for each selected pair "add" the distances between the metrics of their respective profiles
- Remove unnecessary columns for the models (e.g. dataset and attribute names)
- Transform categorical variables into dummies

**Important**: the `ground_truth_models.csv` file contains all the semantic and syntactic joins detected in the data lake + a sample of joins that do not have a relationship (i.e. containment < 0.1 and no semantic link), indicated with a *null* value in the relationships cell.

In [57]:
ground_truth = pd.read_csv('C:/Projects/freyja_repo/data/ground_truth_models.csv')
ground_truth['relationship'] = ground_truth['relationship'].fillna('unrelated') # Pairs that are neither semantic or syntactic have a NaN. We change it by unrelated to prevent problems.

count_syntactic = (ground_truth['relationship'] == 'syntactic').sum()
count_semantic = (ground_truth['relationship'] == 'semantic').sum()
count_unrelated = (ground_truth['relationship'] == 'unrelated').sum()

print(f"Number of syntactic joins: {count_syntactic}")
print(f"Number of semantic joins: {count_semantic}")
print(f"Number of unrelated pairs: {count_unrelated}")

ground_truth.describe()

Number of syntactic joins: 2703
Number of semantic joins: 1701
Number of unrelated pairs: 18206


Unnamed: 0,containment,cardinality_proportion,jaccard,multiset_jaccard,quality
count,22610.0,22610.0,22610.0,22610.0,22610.0
mean,0.107225,0.2154501,0.043204,0.010597,0.004229
std,0.242172,0.2893796,0.14975,0.050149,0.033486
min,0.0,4.251912e-07,0.0,0.0,0.0
25%,0.0,0.01015228,0.0,0.0,0.0
50%,0.0,0.06189559,0.0,0.0,0.0
75%,0.045455,0.3333333,0.00238,0.000219,2e-06
max,1.0,1.0,1.0,0.5,0.494872


In [58]:
ground_truth

Unnamed: 0,ds_name,att_name,ds_name_2,att_name_2,relationship,containment,cardinality_proportion,jaccard,multiset_jaccard,quality
0,AdventureWorks2014_stateprovince.csv,Name,world_country.csv,Name,unrelated,0.044199,0.757322,0.019417,0.019048,1.381513e-03
1,Distributions_data_2016.csv,demographics,Tech_sector_diversity_demographics_2016.csv,raceEthnicity,syntactic,0.230769,0.461538,0.187500,0.000186,1.041278e-05
2,USA_cars_datasets.csv,country,world_country.csv,Name,semantic,0.500000,0.008368,0.004167,0.000365,4.659192e-07
3,World_countries_env_vars.csv,Country,world_city.csv,District,unrelated,0.053498,0.177892,0.006250,0.002314,6.364830e-05
4,books_updated.csv,languageCode,countries_metadatacountries.csv,CountryCode,syntactic,0.360000,0.101215,0.034091,0.000878,1.381148e-05
...,...,...,...,...,...,...,...,...,...,...
22605,pte_sulfo.csv,Set,AdventureWorks2014_shift.csv,Name,unrelated,0.000000,0.120000,0.000000,0.000000,0.000000e+00
22606,dataSpotifyClass.csv,song_title,netflix_titles.csv,description,unrelated,0.000000,0.313414,0.000000,0.000000,0.000000e+00
22607,pte_methoxy.csv,Arg0,countries_data.csv,1997,unrelated,0.000000,0.004015,0.000000,0.000000,0.000000e+00
22608,student-mat.csv,internet,AdventureWorks2014_stateprovince.csv,Name,unrelated,0.000000,0.011050,0.000000,0.000000,0.000000e+00


In [59]:
distances = pd.read_csv('C:/Projects/freyja_repo/data/distances.csv')
distances

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
joined = pd.merge(ground_truth, distances, left_on=['ds_name', 'ds_name_2', 'att_name', 'att_name_2'], right_on=['dataset_name', 'dataset_name_2', 'attribute_name', 'attribute_name_2'])
joined_2 = pd.merge(ground_truth, distances, left_on=['ds_name', 'ds_name_2', 'att_name', 'att_name_2'], right_on=['dataset_name_2', 'dataset_name', 'attribute_name_2', 'attribute_name'])

merged = pd.concat([joined, joined_2], ignore_index=True)
merged

Unnamed: 0,ds_name,att_name,ds_name_2,att_name_2,relationship,containment,cardinality_proportion,jaccard,multiset_jaccard,quality,...,dataset_name,cardinality,pct_date_time,uniqueness,frequency_5qo,frequency_7qo,words_cnt_min,words_cnt_avg,frequency_sd,pct_phones
0,AdventureWorks2014_stateprovince.csv,Name,world_country.csv,Name,unrelated,0.044199,0.757322,0.019417,0.019048,1.381513e-03,...,AdventureWorks2014_stateprovince.csv,-0.071857,0.0,0.000000,0.001341,0.001341,0.000000,1.371837,0.004977,0.0
1,Distributions_data_2016.csv,demographics,Tech_sector_diversity_demographics_2016.csv,raceEthnicity,syntactic,0.230769,0.461538,0.187500,0.000186,1.041278e-05,...,Distributions_data_2016.csv,0.143498,0.0,-0.135553,-0.104895,-0.104895,0.000000,-1.294709,-0.389624,0.0
2,USA_cars_datasets.csv,country,world_country.csv,Name,semantic,0.500000,0.008368,0.004167,0.000365,4.659192e-07,...,USA_cars_datasets.csv,-1.538034,0.0,-0.999200,0.993015,0.993015,-0.409673,-0.830778,3.014091,0.0
3,books_updated.csv,languageCode,countries_metadatacountries.csv,CountryCode,syntactic,0.360000,0.101215,0.034091,0.000878,1.381148e-05,...,books_updated.csv,-1.979207,0.0,-0.997500,-0.003349,0.002351,0.000000,-0.363103,2.702889,0.0
4,cars.csv,color,colors.csv,name,semantic,0.750000,0.088889,0.065217,0.000233,3.200301e-06,...,cars.csv,-0.860310,0.0,-0.999689,0.089943,0.170423,-0.182574,-1.696849,-0.202640,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22605,us_companies.csv,fulltimeemployees,genes_interactions.csv,Type,unrelated,0.000000,0.333333,0.000000,0.000000,0.000000e+00,...,genes_interactions.csv,-0.478577,0.0,-0.013717,0.215019,0.385905,0.311248,0.456492,1.088152,0.0
22606,pte_sulfo.csv,Set,AdventureWorks2014_shift.csv,Name,unrelated,0.000000,0.120000,0.000000,0.000000,0.000000e+00,...,AdventureWorks2014_shift.csv,-0.707107,0.0,0.000000,0.293333,0.293333,0.000000,0.000000,0.707107,0.0
22607,pte_methoxy.csv,Arg0,countries_data.csv,1997,unrelated,0.000000,0.004015,0.000000,0.000000,0.000000e+00,...,countries_data.csv,1.633221,0.0,-0.148744,-0.033283,-0.033283,-0.133631,-0.147237,-1.312925,0.0
22608,student-mat.csv,internet,AdventureWorks2014_stateprovince.csv,Name,unrelated,0.000000,0.011050,0.000000,0.000000,0.000000e+00,...,AdventureWorks2014_stateprovince.csv,1.290401,0.0,0.994937,-0.827387,-0.827387,0.000000,1.788854,-1.897268,0.0


In [None]:
print(merged.columns)

merged.drop(['ds_name', 'ds_name_2', 'att_name', 'att_name_2', 
             'relationship', 'containment', 'cardinality_proportion', 'jaccard', 'multiset_jaccard', 
             'dataset_name', 'attribute_name', 'dataset_name_2', 'attribute_name_2', 
             'cardinalityRaw','cardinalityRaw_2', 'K'],  
             axis='columns', inplace=True)

Index(['ds_name', 'att_name', 'ds_name_2', 'att_name_2', 'relationship',
       'containment', 'cardinality_proportion', 'jaccard', 'multiset_jaccard',
       'quality', 'pct_alphabetic', 'pct_time', 'len_avg_word',
       'dataset_name_2', 'pct_email', 'is_empty_2', 'attribute_name_2',
       'pct_username', 'val_pct_max', 'pct_ip', 'pct_others', 'last_word',
       'constancy', 'pct_alphanumeric', 'len_max_word', 'frequency_iqr',
       'pct_date', 'datatype_2', 'cardinalityRaw', 'frequency_2qo', 'pct_url',
       'frequency_avg', 'frequency_min', 'frequency_4qo', 'val_pct_min',
       'frequency_6qo', 'pct_general', 'freq_word_containment', 'number_words',
       'cardinalityRaw_2', 'val_pct_std', 'freq_word_soundex_containment',
       'words_cnt_sd', 'frequency_max', 'K', 'pct_phrases', 'first_word',
       'entropy', 'pct_date_time_specific', 'datatype', 'len_min_word',
       'attribute_name', 'pct_non_alphanumeric', 'is_empty', 'name_dist',
       'pct_unknown', 'pct_numeric', 

In [None]:
# 2 merged, one with the dummies of the datatypes (merged_all), another without them (merged_no_dummies)
merged_all = merged
merged_all = pd.concat([merged_all.drop('datatype', axis=1), pd.get_dummies(merged_all['datatype'], prefix='datatype_', dtype=int)], axis=1)
merged_all = pd.concat([merged_all.drop('datatype_2', axis=1), pd.get_dummies(merged_all['datatype_2'], prefix='datatype_2_', dtype=int)], axis=1)
merged_all = pd.concat([merged_all.drop('specific_type', axis=1), pd.get_dummies(merged_all['specific_type'], prefix='specific_type_', dtype=int)], axis=1)
merged_all = pd.concat([merged_all.drop('specific_type_2', axis=1), pd.get_dummies(merged_all['specific_type_2'], prefix='specific_type_2_', dtype=int)], axis=1)

merged_no_dummies = merged.drop(['datatype', "datatype_2", "specific_type", "specific_type_2"],  axis='columns')

# Model selection

Our goal is to define the best regressor model that can approximate the true value of the joinability metric defined (MJ & K) by using profiles.

We define four base models to do so, whose metrics vary. The first point of variation is the inclusion of "datatypes" metrics (i.e. semantic types/characteristics of each column: names, URIs, etc. / alphabetical, numerical etc. ). These metrics are the most time consuming to compute, which implies that the already lightweight profile-based approach can be made much faster. The second point of variation is the execution (or lack thereof) of feature selection tasks, which further reduce the number of features while, ideally, keeping, or improving, the evaluation scores.

**Result**: for all models, the best performing regressor has been the Random Forest. Nonetheless, further testing with the benchmarks has shown that the Gradient Booster predictor (with no fine-tuning) works best. This might be due to overfitting produced by the Random Forest.


### Model evaluation methodology

We want to define a regression model. To do so, we will employ 12 base regressors (listed below) evaluated over a 10-split CV (test size = 30%) and 4 different metrics, of which we will primarily focus on the RMSE.

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.model_selection import cross_validate
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

split = ShuffleSplit(n_splits=10, test_size=0.3, random_state=211199)

def scoring(estimator, X, y):
    y_pred = estimator.predict(X)
    return {"R2": r2_score(y, y_pred),
            "MAE": mean_absolute_error(y, y_pred),
            "RMSE": math.sqrt(mean_squared_error(y, y_pred)),
            "MedAE": median_absolute_error(y, y_pred),}

names = [
    "Linear Regression",
    "Ridge Regression",
    "Lasso Regression",
    "ElasticNet Regression",
    "Random Forests",
    "Gradient Boosting",
    "AdaBoost",
    "Extra Trees",
    "Histogram Gradient Boosting",
    "XGBoosting",
    "Light GBM",
    "CatBoost",
    "MLP Relu",
    "MLP logistic",
    "MLP Tanh",
    "SVR Poly",
    "SVR Rbf"
]

regressors = [
    LinearRegression(),
    Ridge(random_state=211199),
    Lasso(random_state=211199),
    ElasticNet(random_state=211199),
    RandomForestRegressor(random_state=211199, n_estimators=50),
    GradientBoostingRegressor(random_state=211199, n_estimators=50),
    AdaBoostRegressor(random_state=211199, n_estimators=50),
    ExtraTreesRegressor(random_state=211199, n_estimators=50),
    HistGradientBoostingRegressor(random_state=211199),
    XGBRegressor(random_state=211199, n_estimators=50),
    LGBMRegressor(random_state=211199, n_estimators=50, verbose=0),
    CatBoostRegressor(random_state=211199, verbose=0),
    MLPRegressor(random_state=211199, activation = 'relu'),
    MLPRegressor(random_state=211199, activation = 'logistic'),
    MLPRegressor(random_state=211199, activation = 'tanh'),
    SVR(kernel="poly"),
    SVR(kernel="rbf"),
]

def test_list_of_regressors(predictors, target, names, regressors):
  for name, regressor in zip(names, regressors):
    print(name)
    test_regressor(regressor, predictors, target)

def test_regressor(regressor, predictors, target):
  scores = cross_validate(regressor, predictors, target, cv=split, scoring=scoring, verbose=0)
  print(f"Fit time: {scores['fit_time'].mean():.6f} | "
      f"Score time: {scores['score_time'].mean():.6f}")

  print(f"R2: {scores['test_R2'].mean():.6f} | "
      f"MAE: {scores['test_MAE'].mean():.6f} | "
      f"RMSE: {scores['test_RMSE'].mean():.6f} | "
      f"MedAE: {scores['test_MedAE'].mean():.6f}")
  print("------------------------------")

In [None]:
best_names = [
        "gradient_boosting",
        "extra_trees",
        "xgboosting",
        "catboost"
    ]

best_regressors = [
    GradientBoostingRegressor(random_state=211199, n_estimators=50),
    ExtraTreesRegressor(random_state=211199, n_estimators=50),
    XGBRegressor(random_state=211199, n_estimators=50),
    CatBoostRegressor(random_state=211199, verbose=0),
]

models_path = Path(f'C:/Projects/freyja_repo/data/models/')

def store_models(target, predictors, model_typology):
    for name, regressor in tqdm(zip(best_names, best_regressors), total=len(best_names)):
        regressor.fit(predictors, target)
        folder_path = models_path / model_typology
        folder_path.mkdir(parents=True, exist_ok=True)
        joblib.dump(regressor, folder_path / f"{name}_{model_typology}.pkl")

### Model 1: All metrics

Model that includes all metrics (likely to overfit and contain redundancy).

In [None]:
y_MJ = merged_all['quality']
predictors = merged_all.drop(columns=['quality'], axis=1)

test_list_of_regressors(predictors, y_MJ, names, regressors)

# Gradient Boosting -> 0.016195
# Extra Trees       -> 0.009650
# XGBoosting        -> 0.008856
# CatBoost          -> 0.008206

Linear Regression
Fit time: 0.046146 | Score time: 0.004562
R2: 0.238525 | MAE: 0.009050 | RMSE: 0.029562 | MedAE: 0.003237
------------------------------
Ridge Regression
Fit time: 0.020905 | Score time: 0.004700
R2: 0.238603 | MAE: 0.009024 | RMSE: 0.029561 | MedAE: 0.003208
------------------------------
Lasso Regression
Fit time: 0.021186 | Score time: 0.004564
R2: -0.000082 | MAE: 0.007977 | RMSE: 0.033888 | MedAE: 0.004208
------------------------------
ElasticNet Regression
Fit time: 0.025123 | Score time: 0.004704
R2: -0.000082 | MAE: 0.007977 | RMSE: 0.033888 | MedAE: 0.004208
------------------------------
Random Forests
Fit time: 37.921386 | Score time: 0.056000
R2: 0.908023 | MAE: 0.001247 | RMSE: 0.009982 | MedAE: 0.000001
------------------------------
Gradient Boosting
Fit time: 6.471234 | Score time: 0.008255
R2: 0.769585 | MAE: 0.002931 | RMSE: 0.016195 | MedAE: 0.000048
------------------------------
AdaBoost
Fit time: 1.352709 | Score time: 0.034578
R2: 0.627723 | MA

In [None]:
target = merged_all['quality']
predictors = merged_all.drop(columns=['quality'], axis=1)

store_models(target, predictors, "all")

100%|██████████| 4/4 [00:20<00:00,  5.16s/it]


### Model 2: All metrics + Feature Selection

Model 1 + feature selection process to reduce overfitting and redundancy.

In [None]:
for name, regressor in zip(best_names, best_regressors):
  model_all = joblib.load(models_path / "all" / f"{name}_all.pkl")

  feature_importances = model_all.feature_importances_
  predictors = merged_all.drop(columns=['quality'], axis=1)

  # Match feature importances with corresponding feature names
  feature_names = list(predictors.columns)
  feature_importance_dict = dict(zip(feature_names, feature_importances))

  # Sort the feature importances in descending order
  sorted_feature_importances = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

  selected_metrics_fs_all = []
  for feature, importance in sorted_feature_importances:
    if importance > 0.0001:
      selected_metrics_fs_all.append(feature)

  target = merged_all['quality']
  predictors = merged_all[selected_metrics_fs_all]
  print(f"Number of features for {name} -> Original = {len(feature_names)}, new = {len(selected_metrics_fs_all)}")
  test_list_of_regressors(predictors, target, [name], [regressor])

  # selected_metrics_fs_all = ['freq_word_containment', 'frequency_7qo', 'uniqueness', 'frequency_min', 'first_word', 'frequency_4qo', 'constancy', 'name_dist', 'frequency_iqr',
  #                     'specific_type_2__pct_username', 'words_cnt_min', 'pct_alphabetic', 'freq_word_soundex_containment', 'len_max_word', 'pct_username', 'last_word',
  #                     'frequency_max', 'pct_general', 'specific_type__pct_username', 'datatype_2__pct_alphanumeric', 'val_pct_min', 'frequency_2qo', 'len_min_word',
  #                     'pct_phrases', 'frequency_sd', 'words_cnt_max', 'val_pct_max', 'frequency_avg', 'len_avg_word', 'frequency_6qo', 'cardinality', 'frequency_3qo',
  #                     'frequency_5qo', 'incompleteness', 'datatype_2__pct_alphabetic', 'pct_unknown', 'specific_type_2__pct_phrases', 'datatype__pct_unknown', 'pct_non_alphanumeric']

  model = regressor.fit(predictors, target)
  folder_path = models_path / "all_fs"
  folder_path.mkdir(parents=True, exist_ok=True)
  joblib.dump(regressor, folder_path / f"{name}_all_fs.pkl")


# Gradient Boosting -> from 0.016195 to 0.016114, 81 to 39 features
# Extra Trees       -> from 0.009650 to 0.009142, 81 to 54 features
# XGBoosting        -> from 0.008856 to 0.008714, 81 to 50 features
# CatBoost          -> from 0.008206 to 0.007993, 81 to 58 features

Number of features for gradient_boosting -> Original = 81, new = 39
gradient_boosting
Fit time: 4.815652 | Score time: 0.006177
R2: 0.772057 | MAE: 0.002926 | RMSE: 0.016114 | MedAE: 0.000055
------------------------------
Number of features for extra_trees -> Original = 81, new = 54
extra_trees
Fit time: 4.250001 | Score time: 0.055315
R2: 0.919671 | MAE: 0.000720 | RMSE: 0.009142 | MedAE: 0.000000
------------------------------
Number of features for xgboosting -> Original = 81, new = 50
xgboosting
Fit time: 0.137683 | Score time: 0.016001
R2: 0.926385 | MAE: 0.001030 | RMSE: 0.008714 | MedAE: 0.000054
------------------------------
Number of features for catboost -> Original = 81, new = 58
catboost
Fit time: 3.690403 | Score time: 0.006600
R2: 0.938836 | MAE: 0.001200 | RMSE: 0.007993 | MedAE: 0.000238
------------------------------


### Model 3: No datatypes (i.e. no datatypes, specific datatypes or datatype percentages)

Removal of "datatypes" metrics, which are highly costly and their contribution to the predictions of the model is not ensured.

In [None]:
quality = merged_no_dummies['quality']
predictors = merged_no_dummies.drop(columns=['quality', "pct_numeric", "pct_alphanumeric", "pct_alphabetic", "pct_non_alphanumeric", "pct_date_time", "pct_unknown",
                                      "pct_phones", "pct_email", "pct_url", "pct_ip", "pct_username", "pct_phrases", "pct_general", "pct_date", "pct_time",
                                      "pct_date_time_specific", "pct_others"], axis=1)

test_list_of_regressors(predictors, quality, names, regressors)

# Gradient Boosting -> models 1-2 -> from 0.016195 to 0.016114, 81 to 39 features
#                   -> model 3         -> 0.016299, 53 features
# Extra Trees       -> models 1-2 -> from 0.009650 to 0.009142, 81 to 54 features
#                   -> model 3         -> 0.008569, 53 features
# XGBoosting        -> models 1-2 -> from 0.008856 to 0.008714, 81 to 50 features
#                   -> model 3         -> 0.008252, 53 features
# CatBoost          -> models 1-2 -> from 0.008206 to 0.007993, 81 to 58 features
#                   -> model 3         -> 0.008011, 53 features

Linear Regression
Fit time: 0.014426 | Score time: 0.003031
R2: 0.230721 | MAE: 0.008271 | RMSE: 0.029715 | MedAE: 0.002194
------------------------------
Ridge Regression
Fit time: 0.009501 | Score time: 0.003205
R2: 0.230752 | MAE: 0.008253 | RMSE: 0.029714 | MedAE: 0.002175
------------------------------
Lasso Regression
Fit time: 0.009896 | Score time: 0.003100
R2: -0.000082 | MAE: 0.007977 | RMSE: 0.033888 | MedAE: 0.004208
------------------------------
ElasticNet Regression
Fit time: 0.010391 | Score time: 0.003209
R2: -0.000082 | MAE: 0.007977 | RMSE: 0.033888 | MedAE: 0.004208
------------------------------
Random Forests
Fit time: 28.625246 | Score time: 0.052337
R2: 0.915626 | MAE: 0.001210 | RMSE: 0.009588 | MedAE: 0.000001
------------------------------
Gradient Boosting
Fit time: 5.295446 | Score time: 0.005917
R2: 0.767448 | MAE: 0.002973 | RMSE: 0.016299 | MedAE: 0.000062
------------------------------
AdaBoost
Fit time: 0.912972 | Score time: 0.015812
R2: 0.647918 | MA

In [None]:
target = merged_no_dummies['quality']
predictors = merged_no_dummies.drop(columns=['quality', "pct_numeric", "pct_alphanumeric", "pct_alphabetic", "pct_non_alphanumeric", "pct_date_time", "pct_unknown",
                                      "pct_phones", "pct_email", "pct_url", "pct_ip", "pct_username", "pct_phrases", "pct_general", "pct_date", "pct_time",
                                      "pct_date_time_specific", "pct_others"], axis=1)

store_models(target, predictors, "no_syntactic")

100%|██████████| 4/4 [00:15<00:00,  3.86s/it]


### Model 4: No Syntactic + Feature Selection

Model 3 + feature selection process to further trim down the group of metrics, which might cause underfitting.

In [None]:
for name, regressor in zip(best_names, best_regressors):
  model_no_syntactic = joblib.load(models_path / "no_syntactic" / f"{name}_no_syntactic.pkl")

  feature_importances = model_no_syntactic.feature_importances_
  predictors = merged_no_dummies.drop(columns=['quality', "pct_numeric", "pct_alphanumeric", "pct_alphabetic", "pct_non_alphanumeric", "pct_date_time", "pct_unknown",
                                      "pct_phones", "pct_email", "pct_url", "pct_ip", "pct_username", "pct_phrases", "pct_general", "pct_date", "pct_time",
                                      "pct_date_time_specific", "pct_others"], axis=1)

  # Match feature importances with corresponding feature names
  feature_names = list(predictors.columns)
  feature_importance_dict = dict(zip(feature_names, feature_importances))

  # Sort the feature importances in descending order
  sorted_feature_importances = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

  selected_metrics_fs_no_syntactic = []
  for feature, importance in sorted_feature_importances:
    if importance > 0.0001:
      selected_metrics_fs_no_syntactic.append(feature)

  target = merged_no_dummies['quality']
  predictors = merged_no_dummies[selected_metrics_fs_no_syntactic]
  print(feature_names)
  print(f"Number of features for {name} -> Original = {len(feature_names)}, new = {len(selected_metrics_fs_no_syntactic)}")
  test_list_of_regressors(predictors, target, [name], [regressor])

  # selected_metrics_fs_no_syntactic = ['name_dist', 'frequency_max', 'uniqueness', 'first_word', 'frequency_4qo', 'freq_word_containment', 'len_avg_word', 'words_cnt_max',
  #                                     'frequency_6qo', 'len_max_word', 'frequency_min', 'frequency_3qo', 'is_empty', 'frequency_iqr', 'entropy', 'val_pct_std',
  #                                     'words_cnt_min', 'cardinality', 'words_cnt_sd', 'val_pct_max', 'len_min_word', 'words_cnt_avg']

  model = regressor.fit(predictors, target)
  folder_path = models_path / "no_syntactic_fs"
  folder_path.mkdir(parents=True, exist_ok=True)
  joblib.dump(regressor, folder_path / f"{name}_no_syntactic_fs.pkl")


# Gradient Boosting -> models 1-2 -> from 0.016195 to 0.016114, 81 to 39 features
#                   -> models 3-4 -> from 0.016299 to 0.016298, 36 to 30 features
# Extra Trees       -> models 1-2 -> from 0.009650 to 0.009142, 81 to 54 features
#                   -> models 3-4 -> from 0.008569 to 0.008568, 36 to 33 features
# XGBoosting        -> models 1-2 -> from 0.008856 to 0.008714, 81 to 50 features
#                   -> models 3-4 -> from 0.008252 to 0.008197, 36 to 32 features
# CatBoost          -> models 1-2 -> from 0.008206 to 0.007993, 81 to 58 features
#                   -> models 3-4 -> from 0.008011 to 0.008055, 36 to 33 features

['len_avg_word', 'is_empty_2', 'val_pct_max', 'last_word', 'constancy', 'len_max_word', 'frequency_iqr', 'frequency_2qo', 'frequency_avg', 'frequency_min', 'frequency_4qo', 'val_pct_min', 'frequency_6qo', 'freq_word_containment', 'number_words', 'val_pct_std', 'freq_word_soundex_containment', 'words_cnt_sd', 'frequency_max', 'first_word', 'entropy', 'len_min_word', 'is_empty', 'name_dist', 'is_binary', 'frequency_1qo', 'words_cnt_max', 'incompleteness', 'frequency_3qo', 'cardinality', 'uniqueness', 'frequency_5qo', 'frequency_7qo', 'words_cnt_min', 'words_cnt_avg', 'frequency_sd']
Number of features for gradient_boosting -> Original = 36, new = 30
gradient_boosting
Fit time: 4.792233 | Score time: 0.005807
R2: 0.767436 | MAE: 0.002981 | RMSE: 0.016298 | MedAE: 0.000064
------------------------------
['len_avg_word', 'is_empty_2', 'val_pct_max', 'last_word', 'constancy', 'len_max_word', 'frequency_iqr', 'frequency_2qo', 'frequency_avg', 'frequency_min', 'frequency_4qo', 'val_pct_min', '

### Model 5: Custom Metrics

In [113]:
y = merged_all['quality']
X = merged_all.drop(columns=['quality'], axis=1)
original_features = X.columns.tolist()

In [114]:
from sklearn.feature_selection import VarianceThreshold, mutual_info_regression

# 1.1 Threshold: 0.01 => keep features with variance > 1%
var_thresh = VarianceThreshold(threshold=0.01)
var_thresh.fit(X)

low_variance_removed = X.columns[~var_thresh.get_support()].tolist()
print("Removed low-variance features:", low_variance_removed)
print(len(X.columns))

X = X[X.columns[var_thresh.get_support()]] # Update X

# 1.2 Compute absolute correlation matrix
corr_matrix = X.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)) # Upper triangle mask (to avoid duplicate pairs)

high_corr_removed = [column for column in upper.columns if any(upper[column] > 0.9)] # Find features with correlation > 0.9
print("Removed highly correlated features:", high_corr_removed)
print(len(X.columns))

X = X.drop(columns=high_corr_removed)

# 1.3 Compute mutual information scores
mi_scores = mutual_info_regression(X, y)
mi_scores = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)

low_mi_removed = mi_scores[mi_scores < 0.01].index.tolist() # Threshold: MI score must be greater than 0.01
print("Removed low mutual information features:", low_mi_removed)
print(len(X.columns))

X_post_filter_methods = X[mi_scores[mi_scores >= 0.01].index]
print(X.columns)

Removed low-variance features: ['pct_time', 'pct_email', 'is_empty_2', 'pct_ip', 'pct_date', 'pct_url', 'pct_general', 'pct_date_time_specific', 'is_empty', 'is_binary', 'pct_date_time', 'pct_phones', 'datatype__pct_date_time', 'datatype_2__pct_date_time', 'specific_type__pct_date', 'specific_type__pct_email', 'specific_type__pct_general', 'specific_type__pct_time', 'specific_type__pct_url', 'specific_type_2__pct_date', 'specific_type_2__pct_email', 'specific_type_2__pct_general', 'specific_type_2__pct_time', 'specific_type_2__pct_url']
81
Removed highly correlated features: ['constancy', 'pct_alphanumeric', 'frequency_4qo', 'val_pct_min', 'frequency_1qo', 'words_cnt_max', 'incompleteness', 'frequency_3qo', 'frequency_5qo', 'frequency_7qo', 'words_cnt_avg', 'specific_type__pct_others', 'specific_type_2__pct_others']
57
Removed low mutual information features: []
44
Index(['pct_alphabetic', 'len_avg_word', 'pct_username', 'val_pct_max',
       'pct_others', 'last_word', 'len_max_word', 

In [None]:
# 2
gb_model = GradientBoostingRegressor(random_state=211199, n_estimators=50)
gb_model.fit(X, y)

feature_importances = gb_model.feature_importances_
feature_names = list(X.columns) # Match feature importances with corresponding feature names
feature_importance_dict = dict(zip(feature_names, feature_importances))

sorted_feature_importances = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True) # Sort the feature importances in descending order

selected_metrics = []
for feature, importance in sorted_feature_importances:
    if importance > 0.0001:
      selected_metrics.append(feature)

X = X[selected_metrics]
print(len(X.columns))
print(X.columns)

30
Index(['freq_word_containment', 'uniqueness', 'frequency_2qo', 'first_word',
       'frequency_min', 'val_pct_max', 'frequency_iqr', 'name_dist',
       'val_pct_std', 'frequency_6qo', 'words_cnt_min', 'pct_username',
       'specific_type_2__pct_username', 'freq_word_soundex_containment',
       'pct_alphabetic', 'datatype_2__pct_alphanumeric', 'frequency_max',
       'specific_type__pct_username', 'len_max_word', 'last_word',
       'len_min_word', 'pct_phrases', 'frequency_avg', 'len_avg_word',
       'cardinality', 'pct_non_alphanumeric', 'datatype_2__pct_alphabetic',
       'pct_unknown', 'entropy', 'datatype__pct_unknown'],
      dtype='object')


In [None]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold

# 3
gb_model = GradientBoostingRegressor(random_state=211199, n_estimators=50)
rfecv = RFECV(estimator=gb_model, step=1, cv=KFold(5), scoring='r2', verbose=3)
rfecv.fit(X, y)

# Selected features
selected = X.columns[rfecv.support_]
print(f"Optimal number of features: {rfecv.n_features_}")
print(f"Selected features: {list(selected)}")

gb_model = GradientBoostingRegressor(random_state=211199, n_estimators=50)
gb_model.fit(X[selected], y)
folder_path = models_path / "fs_deep"
folder_path.mkdir(parents=True, exist_ok=True)
joblib.dump(gb_model, folder_path / f"gradient_boosting_fs_deep.pkl")

Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.

In [115]:
# 2
et_model = ExtraTreesRegressor(random_state=211199, n_estimators=50)
et_model.fit(X_post_filter_methods, y)

feature_importances = et_model.feature_importances_
feature_names = list(X_post_filter_methods.columns) # Match feature importances with corresponding feature names
feature_importance_dict = dict(zip(feature_names, feature_importances))

sorted_feature_importances = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True) # Sort the feature importances in descending order

selected_metrics = []
for feature, importance in sorted_feature_importances:
    if importance > 0.0001:
      selected_metrics.append(feature)

X = X_post_filter_methods[selected_metrics]
print(len(X.columns))
print(X.columns)

40
Index(['freq_word_containment', 'uniqueness', 'freq_word_soundex_containment',
       'name_dist', 'frequency_6qo', 'datatype_2__pct_alphanumeric',
       'frequency_2qo', 'len_max_word', 'first_word',
       'specific_type_2__pct_username', 'frequency_min', 'val_pct_std',
       'val_pct_max', 'words_cnt_min', 'frequency_iqr', 'last_word',
       'pct_username', 'frequency_avg', 'frequency_sd', 'frequency_max',
       'datatype_2__pct_alphabetic', 'len_avg_word', 'pct_alphabetic',
       'datatype__pct_alphabetic', 'len_min_word', 'pct_unknown', 'entropy',
       'cardinality', 'words_cnt_sd', 'number_words',
       'specific_type__pct_username', 'pct_non_alphanumeric',
       'specific_type__pct_phrases', 'pct_phrases', 'pct_others',
       'specific_type_2__pct_phrases', 'datatype__pct_unknown',
       'datatype__pct_non_alphanumeric', 'datatype_2__pct_unknown',
       'datatype__pct_alphanumeric'],
      dtype='object')


In [116]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold

# 3
et_model = ExtraTreesRegressor(random_state=211199, n_estimators=50)
rfecv = RFECV(estimator=et_model, step=1, cv=KFold(5), scoring='r2', verbose=3)
rfecv.fit(X, y)

# Selected features
selected = X.columns[rfecv.support_]
print(f"Optimal number of features: {rfecv.n_features_}")
print(f"Selected features: {list(selected)}")

et_model = ExtraTreesRegressor(random_state=211199, n_estimators=50)
et_model.fit(X[selected], y)
folder_path = models_path / "fs_deep"
folder_path.mkdir(parents=True, exist_ok=True)
joblib.dump(et_model, folder_path / f"extra_trees_fs_deep.pkl")

Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 fe

['C:\\Projects\\freyja_repo\\data\\models\\fs_deep\\extra_trees_fs_deep.pkl']

In [117]:
from sklearn.feature_selection import VarianceThreshold, mutual_info_regression, RFECV
from sklearn.model_selection import KFold

def feature_selection_pipeline(dataset, model_typology_name):

    y = dataset["quality"]
    X = dataset.drop(columns=["quality"], axis=1)
    original_features = X.columns.tolist()

    print(f"Original feature count: {len(original_features)}")

    # 1.1 Variance Threshold
    var_thresh = VarianceThreshold(threshold = 0.01)
    var_thresh.fit(X)
    low_variance_removed = X.columns[~var_thresh.get_support()].tolist()
    X = X[X.columns[var_thresh.get_support()]]
    print(f"Removed low-variance features: {low_variance_removed}")
    print(f"Remaining features: {len(X.columns)}")

    # 1.2 Correlation Filter
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    high_corr_removed = [column for column in upper.columns if any(upper[column] > 0.9)]
    X = X.drop(columns=high_corr_removed)
    print(f"Removed highly correlated features: {high_corr_removed}")
    print(f"Remaining features: {len(X.columns)}")

    # 1.3 Mutual Information
    mi_scores = mutual_info_regression(X, y)
    mi_scores = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)
    low_mi_removed = mi_scores[mi_scores < 0.01].index.tolist()
    X_post_filter_methods = X[mi_scores[mi_scores >= 0.01].index]
    print(f"Removed low mutual information features: {low_mi_removed}")
    print(f"Remaining features: {len(X_post_filter_methods.columns)}")

    # Utility function for importance + RFECV steps
    def run_model_selection(model, model_name):
        nonlocal X_post_filter_methods, y

        # Feature Importance Filter
        model.fit(X_post_filter_methods, y)
        feature_importances = model.feature_importances_
        feature_names = list(X_post_filter_methods.columns)
        feature_importance_dict = dict(zip(feature_names, feature_importances))
        sorted_importances = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
        selected_metrics = [f for f, imp in sorted_importances if imp > 0.0001]

        X_temp = X_post_filter_methods[selected_metrics]
        print(f"{model_name} - Selected {len(X_temp.columns)} features after importance filtering")

        # RFECV
        rfecv = RFECV(estimator=model, step=1, cv=KFold(5), scoring='r2',verbose=3)
        rfecv.fit(X_temp, y)
        selected = X_temp.columns[rfecv.support_]
        print(f"{model_name} - Optimal number of features: {rfecv.n_features_}")
        print(f"{model_name} - Selected features: {list(selected)}")

        # Train final model
        model.fit(X_temp[selected], y)
        folder_path = Path(models_path) / model_typology_name
        folder_path.mkdir(parents=True, exist_ok=True)
        model_file = folder_path / f"{model_name}_{model_typology_name}.pkl"
        joblib.dump(model, model_file)
        print(f"Saved model to: {model_file}")

        return list(selected)

    run_model_selection(GradientBoostingRegressor(random_state=21111999, n_estimators=50), "gradient_boosting")
    run_model_selection(ExtraTreesRegressor(random_state=21111999, n_estimators=50), "extra_trees")


In [135]:
no_syntactic_features = merged_no_dummies[['quality', 'name_dist', 'frequency_max', 'uniqueness', 'first_word', 'frequency_4qo', 'freq_word_containment', 'len_avg_word', 'words_cnt_max',
                                'frequency_6qo', 'len_max_word', 'frequency_min', 'frequency_3qo', 'is_empty', 'frequency_iqr', 'entropy', 'val_pct_std',
                                'words_cnt_min', 'cardinality', 'words_cnt_sd', 'val_pct_max', 'len_min_word', 'words_cnt_avg']]

feature_selection_pipeline(merged_all, "all_fs_deep")
feature_selection_pipeline(no_syntactic_features, "no_syntactic_fs_deep")

Original feature count: 22
Removed low-variance features: ['is_empty']
Remaining features: 21
Removed highly correlated features: ['frequency_3qo', 'words_cnt_sd']
Remaining features: 19
Removed low mutual information features: []
Remaining features: 19
gradient_boosting - Selected 18 features after importance filtering
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Fitting estimator with 2 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fi

In [158]:
target = merged_no_dummies['quality']
predictors = merged_no_dummies[['name_dist', 'frequency_max', 'uniqueness', 'first_word', 'frequency_4qo', 'freq_word_containment', 'len_avg_word', 'words_cnt_max',
                                'frequency_6qo', 'len_max_word', 'frequency_min', 'frequency_3qo', 'is_empty', 'frequency_iqr', 'entropy', 'val_pct_std',
                                'words_cnt_min', 'cardinality', 'words_cnt_sd', 'val_pct_max', 'len_min_word', 'words_cnt_avg']]

gb_model = GradientBoostingRegressor(random_state=21111999, n_estimators=50)
gb_model.fit(predictors, y)
folder_path = Path(models_path) / "no_syntactic_fs_deep"
folder_path.mkdir(parents=True, exist_ok=True)
model_file = folder_path / f"gradient_boosting_no_syntactic_fs_deep.pkl"
joblib.dump(gb_model, model_file)

['C:\\Projects\\freyja_repo\\data\\models\\no_syntactic_fs_deep\\gradient_boosting_no_syntactic_fs_deep.pkl']

# Benchmark evaluation

Once we have define all the models, we will evaluate each of the five selected benchmarks with all of them, with the goal of discerning which is the best one. To do so, all the distances for all query columns of each benchmark have been obtained and stored.



## Preparation
Load all the model and define the functions to prepare the data for the models. This data preparation depends on the features defined for each model. We also present the function used to obtain the metrics from the benchmark.

In [159]:
models = {}
typologies = ["all", "all_fs", "no_syntactic", "no_syntactic_fs", "all_fs_deep", "no_syntactic_fs_deep"]

for name in best_names:
    for type in typologies:
        try:
            models[f"{name}_{type}"] = joblib.load(models_path / type / f"{name}_{type}.pkl")
        except FileNotFoundError:
            pass

print(models.keys())
print(len(models.keys()))

dict_keys(['gradient_boosting_all', 'gradient_boosting_all_fs', 'gradient_boosting_no_syntactic', 'gradient_boosting_no_syntactic_fs', 'gradient_boosting_all_fs_deep', 'gradient_boosting_no_syntactic_fs_deep', 'extra_trees_all', 'extra_trees_all_fs', 'extra_trees_no_syntactic', 'extra_trees_no_syntactic_fs', 'extra_trees_all_fs_deep', 'extra_trees_no_syntactic_fs_deep', 'xgboosting_all', 'xgboosting_all_fs', 'xgboosting_no_syntactic', 'xgboosting_no_syntactic_fs', 'catboost_all', 'catboost_all_fs', 'catboost_no_syntactic', 'catboost_no_syntactic_fs'])
20


In [160]:
def prepare_data_for_model(distances, model, type_of_model):
  distances = distances.drop(columns=['dataset_name', 'dataset_name_2', 'attribute_name', 'attribute_name_2'], axis=1)

  maybe_missing_columns = ["datatype__pct_alphabetic", "datatype__pct_date_time", "datatype__pct_non_alphanumeric", "datatype__pct_numeric", "datatype__pct_unknown", "datatype__pct_alphanumeric",
                            "specific_type__pct_date", "specific_type__pct_email", "specific_type__pct_phrases", 'specific_type_2__pct_username', 'datatype_2__pct_alphanumeric', 'datatype_2__pct_alphabetic',
                            'specific_type_2__pct_phrases', "specific_type__pct_general", "specific_type__pct_others", "specific_type__pct_time", "specific_type__pct_url", "specific_type__pct_username",
                            'datatype_2__pct_date_time', 'specific_type_2__pct_date', 'specific_type_2__pct_email', 'specific_type_2__pct_general', 'specific_type_2__pct_url', 'datatype_2__pct_non_alphanumeric',
                            'datatype_2__pct_numeric', 'datatype_2__pct_unknown', 'specific_type_2__pct_others', 'specific_type_2__pct_time']

  if "no_syntactic_fs" in type_of_model or "custom" in type_of_model:
    pass
  elif ("all" in type_of_model):
    distances = pd.concat([distances.drop('datatype', axis=1), pd.get_dummies(distances['datatype'], prefix='datatype_', dtype=int)], axis=1)
    distances = pd.concat([distances.drop('datatype_2', axis=1), pd.get_dummies(distances['datatype_2'], prefix='datatype_2_', dtype=int)], axis=1)
    distances = pd.concat([distances.drop('specific_type', axis=1), pd.get_dummies(distances['specific_type'], prefix='specific_type_', dtype=int)], axis=1)
    distances = pd.concat([distances.drop('specific_type_2', axis=1), pd.get_dummies(distances['specific_type_2'], prefix='specific_type_2_', dtype=int)], axis=1)

    for column in maybe_missing_columns:
      if column not in distances.columns:
        distances[column] = 0
  else:
    distances = distances.drop(columns=['datatype', "datatype_2", "specific_type", "specific_type_2"], axis=1, errors='ignore')
    distances = distances.drop(columns=["pct_numeric", "pct_alphanumeric", "pct_alphabetic", "pct_non_alphanumeric", "pct_date_time", "pct_unknown", "pct_phones", "pct_email", "pct_url", "pct_ip",
                                        "pct_username", "pct_phrases", "pct_general", "pct_date", "pct_time", "pct_date_time_specific", "pct_others"], axis=1, errors='ignore')

  if 'is_empty_2' not in distances.columns:
    distances['is_empty_2'] = 0

  # Arrange the columns as in the model
  if "catboost" in type_of_model:
    distances = distances[model.feature_names_] 
  else:
    distances = distances[model.feature_names_in_] 
  return distances

In [None]:
def compute_and_evaluate_ranking(model, model_type, k, step, ground_truth_path, distances_folder_path):
  # Read the ground truth and obtain, for every target column, the amount of candidate columns that it has a join with. This will allow us to calculate the recall,
  # as it indicates the maximum possible joins, regardless of the value of k
  ground_truth = pd.read_csv(ground_truth_path, header = 0)
  pair_counts = ground_truth.groupby(['target_ds', 'target_attr']).size().reset_index(name='joins_count')

  # Initialize the matrix of metrics
  num_observations = int(k / step)
  precision = [0] * num_observations
  recall = [0] * num_observations
  max_recall = [0] * num_observations
  MAP = [0] * num_observations

  # Initialize execution time
  total_time = 0

  for _, row in tqdm(pair_counts.iterrows(), total=len(pair_counts)):
      dataset = row['target_ds']
      attribute = row['target_attr']
      count = row['joins_count']

      st = time.time()

      # Read the distances and do some preprocessing
      distances = pd.read_csv(distances_folder_path + 'distances_' + dataset.replace(".csv", "_profile_") + attribute.replace("/", "_").replace(": ","_").replace("'","_") + ".csv", header = 0, encoding='latin1', on_bad_lines="skip")

      dataset_names = distances["dataset_name_2"] # We store dataset and attribute names to be used to evaluate the ranking
      attribute_names = distances["attribute_name_2"]
      distances = prepare_data_for_model(distances, model, model_type)

      # # Use the model to predict
      # y_pred = model.predict(distances)
      # distances["predictions"] = y_pred

      # Use the model to predict (preventing some weird lines that might have slipped in)
      distances_numeric = distances.apply(pd.to_numeric, errors='coerce') # Convert everything to float, invalid parsing becomes NaN
      valid_rows = distances_numeric.dropna(axis=0, how='any') # Keep track of valid rows
      y_pred = model.predict(valid_rows) # Predict only on valid rows
      distances.loc[valid_rows.index, "predictions"] = y_pred # Assign predictions back only to the valid rows

      distances["target_ds"] = dataset_names
      distances["target_attr"] = attribute_names

      total_time += (time.time() - st) # In the time assessment we do not consider the evaluation of the ranking

      # Precompute a lookup set of valid (candidate_ds, candidate_attr) for this query
      valid_pairs = set(
          ground_truth.loc[
              (ground_truth['target_ds'] == dataset) &
              (ground_truth['target_attr'] == attribute),
              ['candidate_ds', 'candidate_attr']
          ].itertuples(index=False, name=None)
      )

      # For every k that we want to assess the ranking of, we get the top k joins and check how many appear in the grpund truth
      for k_iter in range(1, num_observations + 1):
        count_sem = 0
        ap = 0
        count_positions = 0

        top_k_joins = distances.sort_values(by='predictions', ascending=False).head(k_iter * step)

        for position in top_k_joins.itertuples(index=False):
            pair = (position.target_ds, position.target_attr)
            if pair in valid_pairs: 
                count_sem += 1
                ap += count_sem / (count_positions + 1)
            count_positions += 1


        precision[k_iter - 1] += count_sem / (k_iter * step)
        if count_sem != 0:
            MAP[k_iter - 1] += ap / count_sem
        recall[k_iter - 1] += count_sem / count
        max_recall[k_iter - 1] += (k_iter * step) / count

  print("AVERAGE time to load the distances and execute the model:")
  print("----%.2f----" % (total_time / len(pair_counts)))

  print("Precisions:", [round(element / len(pair_counts), 4) for element in precision])
  print("Recall:", [round(element / len(pair_counts), 4) for element in recall])
  print("Max recall:", [round(element / len(pair_counts), 4) for element in max_recall])
  print("Recall percentage:", [round((recall_iter / len(pair_counts)) / (max_recall_iter / len(pair_counts)), 4) for recall_iter, max_recall_iter in zip(recall, max_recall)])
  print("MAP:", [round(element / len(pair_counts), 4) for element in MAP])

  return [round(element / len(pair_counts), 4) for element in precision]

## Benchmarks

In [162]:
def evaluate_models(k, step, ground_truth_path, distances_folder_path, models=models):
    results = {}

    for model_type, model in models.items():
        print(f"Model {model_type}")
        precision_scores = compute_and_evaluate_ranking(
            model, model_type, k, step, ground_truth_path, distances_folder_path
        )
        results[model_type] = precision_scores
        print("------------------------------------------------------")

    # Compute average precisions
    avg_precisions = {
        model_name: sum(precisions) / len(precisions)
        for model_name, precisions in results.items()
    }

    # Sort models from best to worst
    ranked_models = sorted(avg_precisions.items(), key=lambda x: x[1], reverse=True)

    # Print ranked results
    print("\n==================== MODEL RANKINGS ====================")
    for rank, (model_name, avg_precision) in enumerate(ranked_models, start=1):
        print(f"{rank:2d}. Model: {model_name:20s} | Avg Precision: {avg_precision:.4f}")

    print("========================================================")

    # return ranked_models, results


### Santos Small

In [163]:
k = 10
step = 1
ground_truth_path = 'C:/Projects/benchmarks/santos_small/santos_small_ground_truth.csv'
distances_folder_path = 'C:/Projects/benchmarks/santos_small/distances_all_metrics/'

evaluate_models(k, step, ground_truth_path, distances_folder_path)


Model gradient_boosting_all


100%|██████████| 50/50 [00:05<00:00,  8.76it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [1.0, 0.98, 0.96, 0.955, 0.948, 0.9467, 0.9429, 0.9375, 0.9333, 0.928]
Recall: [0.0757, 0.1484, 0.2186, 0.2893, 0.3589, 0.4296, 0.4988, 0.5671, 0.6342, 0.7001]
Max recall: [0.0757, 0.1514, 0.2271, 0.3028, 0.3786, 0.4543, 0.53, 0.6057, 0.6814, 0.7571]
Recall percentage: [1.0, 0.9797, 0.9625, 0.9551, 0.9482, 0.9457, 0.9412, 0.9362, 0.9307, 0.9247]
MAP: [1.0, 1.0, 1.0, 0.9967, 0.9955, 0.9944, 0.9939, 0.9937, 0.9856, 0.9825]
------------------------------------------------------
Model gradient_boosting_all_fs


100%|██████████| 50/50 [00:04<00:00, 12.29it/s]


AVERAGE time to load the distances and execute the model:
----0.05----
Precisions: [1.0, 0.98, 0.96, 0.955, 0.948, 0.9467, 0.9429, 0.9375, 0.9333, 0.928]
Recall: [0.0757, 0.1484, 0.2186, 0.2893, 0.3589, 0.4296, 0.4988, 0.5671, 0.6342, 0.7001]
Max recall: [0.0757, 0.1514, 0.2271, 0.3028, 0.3786, 0.4543, 0.53, 0.6057, 0.6814, 0.7571]
Recall percentage: [1.0, 0.9797, 0.9625, 0.9551, 0.9482, 0.9457, 0.9412, 0.9362, 0.9307, 0.9247]
MAP: [1.0, 1.0, 1.0, 0.9967, 0.9955, 0.9944, 0.9939, 0.9937, 0.9856, 0.9825]
------------------------------------------------------
Model gradient_boosting_no_syntactic


100%|██████████| 50/50 [00:03<00:00, 14.26it/s]


AVERAGE time to load the distances and execute the model:
----0.04----
Precisions: [1.0, 0.97, 0.94, 0.935, 0.932, 0.9267, 0.9229, 0.9175, 0.9111, 0.9]
Recall: [0.0757, 0.1467, 0.2133, 0.2821, 0.3509, 0.4187, 0.4863, 0.5524, 0.6164, 0.6768]
Max recall: [0.0757, 0.1514, 0.2271, 0.3028, 0.3786, 0.4543, 0.53, 0.6057, 0.6814, 0.7571]
Recall percentage: [1.0, 0.9687, 0.939, 0.9314, 0.9269, 0.9218, 0.9175, 0.912, 0.9046, 0.894]
MAP: [1.0, 1.0, 1.0, 0.9933, 0.9918, 0.9916, 0.9909, 0.9905, 0.9898, 0.9896]
------------------------------------------------------
Model gradient_boosting_no_syntactic_fs


100%|██████████| 50/50 [00:03<00:00, 15.67it/s]


AVERAGE time to load the distances and execute the model:
----0.04----
Precisions: [1.0, 0.97, 0.94, 0.935, 0.932, 0.9267, 0.9229, 0.9175, 0.9111, 0.9]
Recall: [0.0757, 0.1467, 0.2133, 0.2821, 0.3509, 0.4187, 0.4863, 0.5524, 0.6164, 0.6768]
Max recall: [0.0757, 0.1514, 0.2271, 0.3028, 0.3786, 0.4543, 0.53, 0.6057, 0.6814, 0.7571]
Recall percentage: [1.0, 0.9687, 0.939, 0.9314, 0.9269, 0.9218, 0.9175, 0.912, 0.9046, 0.894]
MAP: [1.0, 1.0, 1.0, 0.9933, 0.9918, 0.9916, 0.9909, 0.9905, 0.9898, 0.9896]
------------------------------------------------------
Model gradient_boosting_all_fs_deep


100%|██████████| 50/50 [00:03<00:00, 15.12it/s]


AVERAGE time to load the distances and execute the model:
----0.05----
Precisions: [1.0, 0.98, 0.9733, 0.965, 0.96, 0.9567, 0.9543, 0.95, 0.9467, 0.938]
Recall: [0.0757, 0.1487, 0.2211, 0.2921, 0.3619, 0.4318, 0.5017, 0.5702, 0.6397, 0.7039]
Max recall: [0.0757, 0.1514, 0.2271, 0.3028, 0.3786, 0.4543, 0.53, 0.6057, 0.6814, 0.7571]
Recall percentage: [1.0, 0.9817, 0.9735, 0.9644, 0.9561, 0.9506, 0.9467, 0.9414, 0.9388, 0.9297]
MAP: [1.0, 1.0, 0.9967, 0.9944, 0.9928, 0.9923, 0.9921, 0.9922, 0.9906, 0.9895]
------------------------------------------------------
Model gradient_boosting_no_syntactic_fs_deep


100%|██████████| 50/50 [00:02<00:00, 16.85it/s]


AVERAGE time to load the distances and execute the model:
----0.04----
Precisions: [1.0, 0.98, 0.9733, 0.97, 0.964, 0.96, 0.96, 0.96, 0.9556, 0.952]
Recall: [0.0757, 0.1484, 0.221, 0.2936, 0.3653, 0.4363, 0.5082, 0.5801, 0.6497, 0.719]
Max recall: [0.0757, 0.1514, 0.2271, 0.3028, 0.3786, 0.4543, 0.53, 0.6057, 0.6814, 0.7571]
Recall percentage: [1.0, 0.9797, 0.973, 0.9696, 0.9651, 0.9604, 0.9588, 0.9577, 0.9535, 0.9496]
MAP: [1.0, 1.0, 1.0, 1.0, 1.0, 0.9933, 0.9906, 0.9895, 0.9891, 0.9887]
------------------------------------------------------
Model extra_trees_all


100%|██████████| 50/50 [00:06<00:00,  7.99it/s]


AVERAGE time to load the distances and execute the model:
----0.07----
Precisions: [1.0, 0.97, 0.9667, 0.96, 0.956, 0.9533, 0.9514, 0.9475, 0.94, 0.932]
Recall: [0.0757, 0.1462, 0.2181, 0.2883, 0.3585, 0.4287, 0.499, 0.5688, 0.6343, 0.6983]
Max recall: [0.0757, 0.1514, 0.2271, 0.3028, 0.3786, 0.4543, 0.53, 0.6057, 0.6814, 0.7571]
Recall percentage: [1.0, 0.9654, 0.9601, 0.952, 0.9471, 0.9438, 0.9415, 0.9391, 0.9309, 0.9223]
MAP: [1.0, 1.0, 0.9967, 0.9961, 0.9961, 0.9962, 0.9964, 0.989, 0.9889, 0.9887]
------------------------------------------------------
Model extra_trees_all_fs


100%|██████████| 50/50 [00:05<00:00,  9.28it/s]


AVERAGE time to load the distances and execute the model:
----0.07----
Precisions: [1.0, 0.96, 0.9533, 0.935, 0.928, 0.93, 0.9286, 0.9275, 0.9222, 0.912]
Recall: [0.0757, 0.1445, 0.2147, 0.2816, 0.3495, 0.4197, 0.4885, 0.5573, 0.6228, 0.6844]
Max recall: [0.0757, 0.1514, 0.2271, 0.3028, 0.3786, 0.4543, 0.53, 0.6057, 0.6814, 0.7571]
Recall percentage: [1.0, 0.9543, 0.9454, 0.9299, 0.9232, 0.9239, 0.9217, 0.9201, 0.9139, 0.904]
MAP: [1.0, 1.0, 0.9967, 0.9967, 0.9957, 0.9913, 0.9903, 0.9886, 0.9879, 0.9877]
------------------------------------------------------
Model extra_trees_no_syntactic


100%|██████████| 50/50 [00:04<00:00, 12.04it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [1.0, 0.96, 0.9467, 0.94, 0.932, 0.9267, 0.9257, 0.925, 0.9222, 0.916]
Recall: [0.0757, 0.1456, 0.2154, 0.2842, 0.352, 0.4199, 0.4887, 0.5579, 0.6253, 0.6895]
Max recall: [0.0757, 0.1514, 0.2271, 0.3028, 0.3786, 0.4543, 0.53, 0.6057, 0.6814, 0.7571]
Recall percentage: [1.0, 0.9613, 0.9483, 0.9384, 0.93, 0.9243, 0.9221, 0.9211, 0.9177, 0.9107]
MAP: [1.0, 1.0, 0.9967, 0.9944, 0.9939, 0.9932, 0.9922, 0.9844, 0.9765, 0.9744]
------------------------------------------------------
Model extra_trees_no_syntactic_fs


100%|██████████| 50/50 [00:04<00:00, 12.33it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [1.0, 0.98, 0.96, 0.955, 0.952, 0.9433, 0.94, 0.935, 0.9267, 0.916]
Recall: [0.0757, 0.149, 0.2194, 0.29, 0.3603, 0.4281, 0.4969, 0.5643, 0.6283, 0.6901]
Max recall: [0.0757, 0.1514, 0.2271, 0.3028, 0.3786, 0.4543, 0.53, 0.6057, 0.6814, 0.7571]
Recall percentage: [1.0, 0.9843, 0.966, 0.9577, 0.9517, 0.9424, 0.9376, 0.9316, 0.9221, 0.9114]
MAP: [1.0, 1.0, 1.0, 0.9933, 0.9857, 0.9856, 0.985, 0.9848, 0.9849, 0.9851]
------------------------------------------------------
Model extra_trees_all_fs_deep


100%|██████████| 50/50 [00:04<00:00, 10.14it/s]


AVERAGE time to load the distances and execute the model:
----0.07----
Precisions: [0.96, 0.93, 0.9133, 0.91, 0.908, 0.9133, 0.9143, 0.915, 0.9133, 0.91]
Recall: [0.0738, 0.1421, 0.209, 0.2773, 0.3447, 0.4149, 0.4837, 0.5525, 0.6194, 0.685]
Max recall: [0.0757, 0.1514, 0.2271, 0.3028, 0.3786, 0.4543, 0.53, 0.6057, 0.6814, 0.7571]
Recall percentage: [0.9748, 0.9386, 0.9203, 0.9158, 0.9106, 0.9134, 0.9127, 0.9122, 0.909, 0.9047]
MAP: [0.96, 0.96, 0.96, 0.9583, 0.9663, 0.9671, 0.9687, 0.9702, 0.9706, 0.9713]
------------------------------------------------------
Model extra_trees_no_syntactic_fs_deep


100%|██████████| 50/50 [00:03<00:00, 14.00it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.98, 0.97, 0.9533, 0.945, 0.94, 0.94, 0.94, 0.94, 0.9378, 0.93]
Recall: [0.0748, 0.1477, 0.2175, 0.2868, 0.3556, 0.4258, 0.4968, 0.567, 0.6365, 0.7011]
Max recall: [0.0757, 0.1514, 0.2271, 0.3028, 0.3786, 0.4543, 0.53, 0.6057, 0.6814, 0.7571]
Recall percentage: [0.9874, 0.9754, 0.9578, 0.9471, 0.9394, 0.9374, 0.9374, 0.9361, 0.9341, 0.9261]
MAP: [0.98, 0.99, 0.9867, 0.985, 0.9841, 0.9831, 0.9793, 0.9791, 0.9776, 0.977]
------------------------------------------------------
Model xgboosting_all


100%|██████████| 50/50 [00:07<00:00,  6.55it/s]


AVERAGE time to load the distances and execute the model:
----0.08----
Precisions: [1.0, 0.96, 0.9333, 0.92, 0.904, 0.8933, 0.8857, 0.88, 0.8756, 0.868]
Recall: [0.0757, 0.145, 0.2124, 0.2783, 0.3408, 0.4031, 0.4651, 0.5275, 0.5893, 0.6484]
Max recall: [0.0757, 0.1514, 0.2271, 0.3028, 0.3786, 0.4543, 0.53, 0.6057, 0.6814, 0.7571]
Recall percentage: [1.0, 0.9577, 0.9352, 0.919, 0.9002, 0.8874, 0.8776, 0.8708, 0.8649, 0.8564]
MAP: [1.0, 1.0, 1.0, 0.9967, 0.9955, 0.9879, 0.9852, 0.9839, 0.9821, 0.9813]
------------------------------------------------------
Model xgboosting_all_fs


100%|██████████| 50/50 [00:06<00:00,  8.28it/s]


AVERAGE time to load the distances and execute the model:
----0.07----
Precisions: [1.0, 0.96, 0.9333, 0.915, 0.896, 0.8833, 0.8686, 0.8475, 0.8356, 0.824]
Recall: [0.0757, 0.1447, 0.2105, 0.2754, 0.3369, 0.3974, 0.4548, 0.5072, 0.562, 0.6153]
Max recall: [0.0757, 0.1514, 0.2271, 0.3028, 0.3786, 0.4543, 0.53, 0.6057, 0.6814, 0.7571]
Recall percentage: [1.0, 0.9555, 0.9266, 0.9093, 0.8899, 0.8748, 0.8581, 0.8374, 0.8247, 0.8128]
MAP: [1.0, 1.0, 0.9967, 0.9944, 0.9929, 0.9911, 0.9901, 0.99, 0.989, 0.9886]
------------------------------------------------------
Model xgboosting_no_syntactic


100%|██████████| 50/50 [00:04<00:00, 10.47it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.98, 0.95, 0.9333, 0.925, 0.912, 0.9033, 0.8886, 0.875, 0.8556, 0.842]
Recall: [0.0748, 0.1443, 0.2129, 0.2804, 0.3452, 0.4098, 0.4692, 0.5268, 0.5796, 0.6333]
Max recall: [0.0757, 0.1514, 0.2271, 0.3028, 0.3786, 0.4543, 0.53, 0.6057, 0.6814, 0.7571]
Recall percentage: [0.9874, 0.953, 0.9373, 0.9261, 0.9119, 0.9021, 0.8853, 0.8698, 0.8506, 0.8364]
MAP: [0.98, 0.99, 0.9917, 0.9911, 0.9845, 0.9836, 0.9832, 0.9758, 0.9758, 0.973]
------------------------------------------------------
Model xgboosting_no_syntactic_fs


100%|██████████| 50/50 [00:04<00:00, 10.93it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [1.0, 0.95, 0.9333, 0.915, 0.9, 0.8867, 0.8743, 0.8575, 0.84, 0.822]
Recall: [0.0757, 0.1443, 0.2122, 0.2761, 0.3383, 0.3995, 0.4597, 0.5145, 0.5665, 0.6154]
Max recall: [0.0757, 0.1514, 0.2271, 0.3028, 0.3786, 0.4543, 0.53, 0.6057, 0.6814, 0.7571]
Recall percentage: [1.0, 0.953, 0.9342, 0.9117, 0.8938, 0.8793, 0.8675, 0.8495, 0.8313, 0.8128]
MAP: [1.0, 1.0, 0.9933, 0.9911, 0.9911, 0.9889, 0.9867, 0.9859, 0.9856, 0.9852]
------------------------------------------------------
Model catboost_all


100%|██████████| 50/50 [00:05<00:00,  8.74it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [1.0, 0.96, 0.9467, 0.925, 0.92, 0.9033, 0.8943, 0.8875, 0.88, 0.868]
Recall: [0.0757, 0.1456, 0.2149, 0.2788, 0.3456, 0.4073, 0.4707, 0.5335, 0.5948, 0.6521]
Max recall: [0.0757, 0.1514, 0.2271, 0.3028, 0.3786, 0.4543, 0.53, 0.6057, 0.6814, 0.7571]
Recall percentage: [1.0, 0.9613, 0.9462, 0.9206, 0.9129, 0.8966, 0.8881, 0.8809, 0.8729, 0.8613]
MAP: [1.0, 1.0, 0.9933, 0.9928, 0.9841, 0.9841, 0.9799, 0.9755, 0.9743, 0.9661]
------------------------------------------------------
Model catboost_all_fs


100%|██████████| 50/50 [00:04<00:00, 10.21it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [1.0, 0.94, 0.92, 0.91, 0.9, 0.89, 0.8829, 0.8675, 0.8533, 0.838]
Recall: [0.0757, 0.1431, 0.2094, 0.2758, 0.3398, 0.4037, 0.466, 0.5228, 0.5781, 0.6308]
Max recall: [0.0757, 0.1514, 0.2271, 0.3028, 0.3786, 0.4543, 0.53, 0.6057, 0.6814, 0.7571]
Recall percentage: [1.0, 0.945, 0.922, 0.9106, 0.8977, 0.8887, 0.8792, 0.8631, 0.8485, 0.8332]
MAP: [1.0, 1.0, 0.9967, 0.9961, 0.9901, 0.9835, 0.9798, 0.9793, 0.9774, 0.9764]
------------------------------------------------------
Model catboost_no_syntactic


100%|██████████| 50/50 [00:03<00:00, 14.17it/s]


AVERAGE time to load the distances and execute the model:
----0.04----
Precisions: [1.0, 0.98, 0.9533, 0.94, 0.932, 0.9267, 0.9086, 0.8975, 0.8844, 0.872]
Recall: [0.0757, 0.1479, 0.2163, 0.2835, 0.3507, 0.4183, 0.4776, 0.538, 0.5955, 0.6522]
Max recall: [0.0757, 0.1514, 0.2271, 0.3028, 0.3786, 0.4543, 0.53, 0.6057, 0.6814, 0.7571]
Recall percentage: [1.0, 0.977, 0.9525, 0.9361, 0.9265, 0.9208, 0.9011, 0.8883, 0.874, 0.8615]
MAP: [1.0, 1.0, 0.9967, 0.9933, 0.9912, 0.9898, 0.9891, 0.9878, 0.9874, 0.9863]
------------------------------------------------------
Model catboost_no_syntactic_fs


100%|██████████| 50/50 [00:03<00:00, 14.63it/s]

AVERAGE time to load the distances and execute the model:
----0.04----
Precisions: [1.0, 0.97, 0.9467, 0.93, 0.928, 0.92, 0.9057, 0.9025, 0.8911, 0.876]
Recall: [0.0757, 0.1469, 0.2152, 0.2822, 0.351, 0.4161, 0.4769, 0.5419, 0.6016, 0.6564]
Max recall: [0.0757, 0.1514, 0.2271, 0.3028, 0.3786, 0.4543, 0.53, 0.6057, 0.6814, 0.7571]
Recall percentage: [1.0, 0.9703, 0.9477, 0.9318, 0.9271, 0.9161, 0.8999, 0.8947, 0.8829, 0.867]
MAP: [1.0, 1.0, 0.9933, 0.99, 0.9842, 0.9821, 0.9813, 0.9795, 0.9791, 0.9785]
------------------------------------------------------

 1. Model: gradient_boosting_no_syntactic_fs_deep | Avg Precision: 0.9675
 2. Model: gradient_boosting_all_fs_deep | Avg Precision: 0.9624
 3. Model: extra_trees_all      | Avg Precision: 0.9577
 4. Model: gradient_boosting_all | Avg Precision: 0.9531
 5. Model: gradient_boosting_all_fs | Avg Precision: 0.9531
 6. Model: extra_trees_no_syntactic_fs | Avg Precision: 0.9508
 7. Model: extra_trees_no_syntactic_fs_deep | Avg Precision: 0.




### TUS Small

In [165]:
k = 60
step = 10
ground_truth_path = 'C:/Projects/benchmarks/tus_small/tus_small_ground_truth.csv'
distances_folder_path = 'C:/Projects/benchmarks/tus_small/distances_all_metrics/'

evaluate_models(k, step, ground_truth_path, distances_folder_path)

Model gradient_boosting_all


  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [00:17<00:00,  5.61it/s]


AVERAGE time to load the distances and execute the model:
----0.13----
Precisions: [0.974, 0.8885, 0.816, 0.8105, 0.815, 0.819]
Recall: [0.1072, 0.1944, 0.2667, 0.3524, 0.4421, 0.5323]
Max recall: [0.1094, 0.2188, 0.3282, 0.4377, 0.5471, 0.6565]
Recall percentage: [0.9798, 0.8885, 0.8126, 0.8053, 0.8082, 0.8109]
MAP: [0.9859, 0.9807, 0.9742, 0.9491, 0.932, 0.9244]
------------------------------------------------------
Model gradient_boosting_all_fs


100%|██████████| 100/100 [00:14<00:00,  6.78it/s]


AVERAGE time to load the distances and execute the model:
----0.12----
Precisions: [0.974, 0.8885, 0.816, 0.8105, 0.815, 0.819]
Recall: [0.1072, 0.1944, 0.2667, 0.3524, 0.4421, 0.5323]
Max recall: [0.1094, 0.2188, 0.3282, 0.4377, 0.5471, 0.6565]
Recall percentage: [0.9798, 0.8885, 0.8126, 0.8053, 0.8082, 0.8109]
MAP: [0.9859, 0.9807, 0.9742, 0.9491, 0.932, 0.9244]
------------------------------------------------------
Model gradient_boosting_no_syntactic


100%|██████████| 100/100 [00:12<00:00,  7.82it/s]


AVERAGE time to load the distances and execute the model:
----0.10----
Precisions: [0.977, 0.8365, 0.68, 0.5818, 0.519, 0.4787]
Recall: [0.1074, 0.1783, 0.2128, 0.2372, 0.2597, 0.2831]
Max recall: [0.1094, 0.2188, 0.3282, 0.4377, 0.5471, 0.6565]
Recall percentage: [0.9816, 0.8149, 0.6482, 0.5421, 0.4748, 0.4312]
MAP: [0.9893, 0.9837, 0.9806, 0.979, 0.9781, 0.9758]
------------------------------------------------------
Model gradient_boosting_no_syntactic_fs


100%|██████████| 100/100 [00:12<00:00,  8.23it/s]


AVERAGE time to load the distances and execute the model:
----0.10----
Precisions: [0.977, 0.8365, 0.68, 0.5818, 0.519, 0.4787]
Recall: [0.1074, 0.1783, 0.2128, 0.2372, 0.2597, 0.2831]
Max recall: [0.1094, 0.2188, 0.3282, 0.4377, 0.5471, 0.6565]
Recall percentage: [0.9816, 0.8149, 0.6482, 0.5421, 0.4748, 0.4312]
MAP: [0.9893, 0.9837, 0.9806, 0.979, 0.9781, 0.9758]
------------------------------------------------------
Model gradient_boosting_all_fs_deep


100%|██████████| 100/100 [00:12<00:00,  7.83it/s]


AVERAGE time to load the distances and execute the model:
----0.12----
Precisions: [0.973, 0.9135, 0.8603, 0.8732, 0.8894, 0.8967]
Recall: [0.1071, 0.2015, 0.2849, 0.3853, 0.4903, 0.5931]
Max recall: [0.1094, 0.2188, 0.3282, 0.4377, 0.5471, 0.6565]
Recall percentage: [0.9786, 0.9207, 0.868, 0.8804, 0.8963, 0.9035]
MAP: [0.9862, 0.9802, 0.9735, 0.9461, 0.928, 0.9206]
------------------------------------------------------
Model gradient_boosting_no_syntactic_fs_deep


100%|██████████| 100/100 [00:11<00:00,  8.50it/s]


AVERAGE time to load the distances and execute the model:
----0.10----
Precisions: [0.976, 0.911, 0.8543, 0.865, 0.8734, 0.8837]
Recall: [0.1073, 0.2008, 0.2831, 0.3819, 0.482, 0.5849]
Max recall: [0.1094, 0.2188, 0.3282, 0.4377, 0.5471, 0.6565]
Recall percentage: [0.9811, 0.9177, 0.8625, 0.8726, 0.8811, 0.891]
MAP: [0.9863, 0.9814, 0.973, 0.9434, 0.9261, 0.918]
------------------------------------------------------
Model extra_trees_all


100%|██████████| 100/100 [00:20<00:00,  4.86it/s]


AVERAGE time to load the distances and execute the model:
----0.16----
Precisions: [0.977, 0.849, 0.7313, 0.665, 0.6082, 0.564]
Recall: [0.1074, 0.1827, 0.2305, 0.2731, 0.3072, 0.3383]
Max recall: [0.1094, 0.2188, 0.3282, 0.4377, 0.5471, 0.6565]
Recall percentage: [0.982, 0.835, 0.7023, 0.6241, 0.5616, 0.5154]
MAP: [0.9842, 0.9802, 0.978, 0.9732, 0.9696, 0.9668]
------------------------------------------------------
Model extra_trees_all_fs


100%|██████████| 100/100 [00:18<00:00,  5.34it/s]


AVERAGE time to load the distances and execute the model:
----0.15----
Precisions: [0.972, 0.849, 0.7317, 0.6485, 0.5948, 0.553]
Recall: [0.107, 0.1824, 0.2301, 0.2658, 0.301, 0.3311]
Max recall: [0.1094, 0.2188, 0.3282, 0.4377, 0.5471, 0.6565]
Recall percentage: [0.9778, 0.8337, 0.7011, 0.6074, 0.5502, 0.5043]
MAP: [0.98, 0.977, 0.9748, 0.9731, 0.9702, 0.9675]
------------------------------------------------------
Model extra_trees_no_syntactic


100%|██████████| 100/100 [00:15<00:00,  6.43it/s]


AVERAGE time to load the distances and execute the model:
----0.13----
Precisions: [0.974, 0.8285, 0.7007, 0.6358, 0.595, 0.5568]
Recall: [0.1072, 0.1766, 0.2168, 0.2556, 0.2939, 0.3253]
Max recall: [0.1094, 0.2188, 0.3282, 0.4377, 0.5471, 0.6565]
Recall percentage: [0.9794, 0.8071, 0.6604, 0.5841, 0.5372, 0.4956]
MAP: [0.9771, 0.9759, 0.9751, 0.9676, 0.9638, 0.963]
------------------------------------------------------
Model extra_trees_no_syntactic_fs


100%|██████████| 100/100 [00:14<00:00,  6.70it/s]


AVERAGE time to load the distances and execute the model:
----0.13----
Precisions: [0.978, 0.828, 0.6997, 0.631, 0.5882, 0.5468]
Recall: [0.1075, 0.1765, 0.2165, 0.2537, 0.2905, 0.3194]
Max recall: [0.1094, 0.2188, 0.3282, 0.4377, 0.5471, 0.6565]
Recall percentage: [0.9828, 0.8068, 0.6596, 0.5796, 0.531, 0.4865]
MAP: [0.9827, 0.9793, 0.9775, 0.9691, 0.9647, 0.9642]
------------------------------------------------------
Model extra_trees_all_fs_deep


100%|██████████| 100/100 [00:17<00:00,  5.65it/s]


AVERAGE time to load the distances and execute the model:
----0.15----
Precisions: [0.98, 0.866, 0.7543, 0.6688, 0.6072, 0.5602]
Recall: [0.1077, 0.1872, 0.2392, 0.2749, 0.3054, 0.3324]
Max recall: [0.1094, 0.2188, 0.3282, 0.4377, 0.5471, 0.6565]
Recall percentage: [0.984, 0.8555, 0.7287, 0.6282, 0.5582, 0.5064]
MAP: [0.9882, 0.983, 0.9801, 0.9786, 0.978, 0.9775]
------------------------------------------------------
Model extra_trees_no_syntactic_fs_deep


100%|██████████| 100/100 [00:13<00:00,  7.15it/s]


AVERAGE time to load the distances and execute the model:
----0.13----
Precisions: [0.973, 0.9025, 0.8293, 0.7963, 0.7612, 0.7155]
Recall: [0.1071, 0.199, 0.275, 0.3521, 0.4168, 0.4614]
Max recall: [0.1094, 0.2188, 0.3282, 0.4377, 0.5471, 0.6565]
Recall percentage: [0.9785, 0.9096, 0.8379, 0.8045, 0.7619, 0.7029]
MAP: [0.9815, 0.9779, 0.9767, 0.9707, 0.9667, 0.9659]
------------------------------------------------------
Model xgboosting_all


100%|██████████| 100/100 [00:20<00:00,  4.83it/s]


AVERAGE time to load the distances and execute the model:
----0.14----
Precisions: [0.977, 0.848, 0.7333, 0.665, 0.6146, 0.5855]
Recall: [0.1075, 0.1824, 0.231, 0.2752, 0.3124, 0.3528]
Max recall: [0.1094, 0.2188, 0.3282, 0.4377, 0.5471, 0.6565]
Recall percentage: [0.9821, 0.8334, 0.7039, 0.6289, 0.5711, 0.5374]
MAP: [0.9825, 0.9787, 0.9771, 0.9742, 0.9726, 0.9539]
------------------------------------------------------
Model xgboosting_all_fs


100%|██████████| 100/100 [00:18<00:00,  5.53it/s]


AVERAGE time to load the distances and execute the model:
----0.14----
Precisions: [0.978, 0.815, 0.6583, 0.5743, 0.527, 0.497]
Recall: [0.1075, 0.172, 0.2016, 0.2287, 0.2572, 0.2865]
Max recall: [0.1094, 0.2188, 0.3282, 0.4377, 0.5471, 0.6565]
Recall percentage: [0.9824, 0.7858, 0.6143, 0.5225, 0.4701, 0.4364]
MAP: [0.9765, 0.9764, 0.9747, 0.9734, 0.9608, 0.9537]
------------------------------------------------------
Model xgboosting_no_syntactic


100%|██████████| 100/100 [00:15<00:00,  6.57it/s]


AVERAGE time to load the distances and execute the model:
----0.12----
Precisions: [0.973, 0.86, 0.7013, 0.6195, 0.5438, 0.4848]
Recall: [0.107, 0.1866, 0.2293, 0.2704, 0.294, 0.3102]
Max recall: [0.1094, 0.2188, 0.3282, 0.4377, 0.5471, 0.6565]
Recall percentage: [0.9783, 0.8528, 0.6986, 0.6178, 0.5375, 0.4725]
MAP: [0.9765, 0.9751, 0.9737, 0.9704, 0.9682, 0.9678]
------------------------------------------------------
Model xgboosting_no_syntactic_fs


100%|██████████| 100/100 [00:14<00:00,  6.86it/s]


AVERAGE time to load the distances and execute the model:
----0.12----
Precisions: [0.976, 0.853, 0.6767, 0.5885, 0.5224, 0.4678]
Recall: [0.1073, 0.1848, 0.2213, 0.2575, 0.283, 0.3003]
Max recall: [0.1094, 0.2188, 0.3282, 0.4377, 0.5471, 0.6565]
Recall percentage: [0.9808, 0.8444, 0.6742, 0.5883, 0.5174, 0.4574]
MAP: [0.9849, 0.9799, 0.9772, 0.9732, 0.9686, 0.967]
------------------------------------------------------
Model catboost_all


100%|██████████| 100/100 [00:17<00:00,  5.80it/s]


AVERAGE time to load the distances and execute the model:
----0.12----
Precisions: [0.974, 0.856, 0.7323, 0.7113, 0.7016, 0.6887]
Recall: [0.1071, 0.1857, 0.233, 0.2936, 0.3557, 0.4136]
Max recall: [0.1094, 0.2188, 0.3282, 0.4377, 0.5471, 0.6565]
Recall percentage: [0.9791, 0.8488, 0.71, 0.6707, 0.6502, 0.63]
MAP: [0.9756, 0.9753, 0.9675, 0.9264, 0.9132, 0.9107]
------------------------------------------------------
Model catboost_all_fs


100%|██████████| 100/100 [00:15<00:00,  6.37it/s]


AVERAGE time to load the distances and execute the model:
----0.12----
Precisions: [0.976, 0.8425, 0.7133, 0.6463, 0.6124, 0.5908]
Recall: [0.1073, 0.1812, 0.2239, 0.2628, 0.3054, 0.3504]
Max recall: [0.1094, 0.2188, 0.3282, 0.4377, 0.5471, 0.6565]
Recall percentage: [0.9811, 0.8282, 0.6821, 0.6004, 0.5582, 0.5337]
MAP: [0.9783, 0.9765, 0.9755, 0.9614, 0.9506, 0.9388]
------------------------------------------------------
Model catboost_no_syntactic


100%|██████████| 100/100 [00:12<00:00,  8.02it/s]


AVERAGE time to load the distances and execute the model:
----0.10----
Precisions: [0.972, 0.8625, 0.7487, 0.6758, 0.6092, 0.5683]
Recall: [0.107, 0.1869, 0.2393, 0.2822, 0.3132, 0.3465]
Max recall: [0.1094, 0.2188, 0.3282, 0.4377, 0.5471, 0.6565]
Recall percentage: [0.9781, 0.8543, 0.729, 0.6448, 0.5725, 0.5278]
MAP: [0.9757, 0.9758, 0.975, 0.9682, 0.9608, 0.9534]
------------------------------------------------------
Model catboost_no_syntactic_fs


100%|██████████| 100/100 [00:12<00:00,  8.33it/s]

AVERAGE time to load the distances and execute the model:
----0.10----
Precisions: [0.974, 0.867, 0.762, 0.705, 0.6524, 0.6115]
Recall: [0.1072, 0.1881, 0.2447, 0.2976, 0.3363, 0.3711]
Max recall: [0.1094, 0.2188, 0.3282, 0.4377, 0.5471, 0.6565]
Recall percentage: [0.9794, 0.8597, 0.7456, 0.6799, 0.6147, 0.5653]
MAP: [0.9768, 0.9759, 0.9745, 0.9641, 0.9575, 0.953]
------------------------------------------------------

 1. Model: gradient_boosting_all_fs_deep | Avg Precision: 0.9010
 2. Model: gradient_boosting_no_syntactic_fs_deep | Avg Precision: 0.8939
 3. Model: gradient_boosting_all | Avg Precision: 0.8538
 4. Model: gradient_boosting_all_fs | Avg Precision: 0.8538
 5. Model: extra_trees_no_syntactic_fs_deep | Avg Precision: 0.8296
 6. Model: catboost_all         | Avg Precision: 0.7773
 7. Model: catboost_no_syntactic_fs | Avg Precision: 0.7620
 8. Model: catboost_no_syntactic | Avg Precision: 0.7394
 9. Model: extra_trees_all_fs_deep | Avg Precision: 0.7394
10. Model: xgboosting




### TUS Big

In [166]:
k = 60
step = 10
ground_truth_path = 'C:/Projects/benchmarks/tus_big/tus_big_ground_truth_sample.csv'
distances_folder_path = 'C:/Projects/benchmarks/tus_big/distances_all_metrics/'

evaluate_models(k, step, ground_truth_path, distances_folder_path)

Model gradient_boosting_all


100%|██████████| 100/100 [00:56<00:00,  1.78it/s]


AVERAGE time to load the distances and execute the model:
----0.44----
Precisions: [0.97, 0.9615, 0.942, 0.9245, 0.915, 0.906]
Recall: [0.05, 0.0981, 0.1404, 0.179, 0.2182, 0.2545]
Max recall: [0.0528, 0.1055, 0.1583, 0.2111, 0.2638, 0.3166]
Recall percentage: [0.9471, 0.9296, 0.8871, 0.8483, 0.827, 0.804]
MAP: [0.983, 0.9787, 0.9745, 0.9717, 0.9671, 0.9645]
------------------------------------------------------
Model gradient_boosting_all_fs


100%|██████████| 100/100 [00:48<00:00,  2.08it/s]


AVERAGE time to load the distances and execute the model:
----0.41----
Precisions: [0.972, 0.9615, 0.9417, 0.924, 0.9128, 0.903]
Recall: [0.0501, 0.0981, 0.1404, 0.1791, 0.2181, 0.2545]
Max recall: [0.0528, 0.1055, 0.1583, 0.2111, 0.2638, 0.3166]
Recall percentage: [0.9489, 0.9296, 0.8868, 0.8485, 0.8268, 0.8039]
MAP: [0.985, 0.9804, 0.9755, 0.9727, 0.9682, 0.9657]
------------------------------------------------------
Model gradient_boosting_no_syntactic


100%|██████████| 100/100 [00:42<00:00,  2.34it/s]


AVERAGE time to load the distances and execute the model:
----0.36----
Precisions: [0.984, 0.9705, 0.9437, 0.907, 0.8766, 0.8513]
Recall: [0.051, 0.0996, 0.1425, 0.1768, 0.2085, 0.2377]
Max recall: [0.0528, 0.1055, 0.1583, 0.2111, 0.2638, 0.3166]
Recall percentage: [0.9667, 0.944, 0.9003, 0.8378, 0.7903, 0.7509]
MAP: [0.9901, 0.9858, 0.9826, 0.9806, 0.9751, 0.9721]
------------------------------------------------------
Model gradient_boosting_no_syntactic_fs


100%|██████████| 100/100 [00:40<00:00,  2.46it/s]


AVERAGE time to load the distances and execute the model:
----0.34----
Precisions: [0.985, 0.972, 0.9433, 0.9073, 0.8772, 0.8515]
Recall: [0.051, 0.0997, 0.1425, 0.1769, 0.2084, 0.2376]
Max recall: [0.0528, 0.1055, 0.1583, 0.2111, 0.2638, 0.3166]
Recall percentage: [0.9668, 0.9446, 0.9, 0.8381, 0.7901, 0.7506]
MAP: [0.9917, 0.9875, 0.9841, 0.9819, 0.9758, 0.9729]
------------------------------------------------------
Model gradient_boosting_all_fs_deep


100%|██████████| 100/100 [00:43<00:00,  2.28it/s]


AVERAGE time to load the distances and execute the model:
----0.40----
Precisions: [0.957, 0.956, 0.9507, 0.946, 0.9344, 0.9217]
Recall: [0.0486, 0.0963, 0.1421, 0.1869, 0.2257, 0.2623]
Max recall: [0.0528, 0.1055, 0.1583, 0.2111, 0.2638, 0.3166]
Recall percentage: [0.9214, 0.9125, 0.8978, 0.8856, 0.8554, 0.8285]
MAP: [0.9747, 0.9679, 0.9638, 0.9626, 0.9612, 0.9591]
------------------------------------------------------
Model gradient_boosting_no_syntactic_fs_deep


100%|██████████| 100/100 [00:39<00:00,  2.52it/s]


AVERAGE time to load the distances and execute the model:
----0.34----
Precisions: [0.987, 0.986, 0.9833, 0.9688, 0.957, 0.949]
Recall: [0.0513, 0.1024, 0.1512, 0.1951, 0.2366, 0.2777]
Max recall: [0.0528, 0.1055, 0.1583, 0.2111, 0.2638, 0.3166]
Recall percentage: [0.9721, 0.9708, 0.9553, 0.9244, 0.8967, 0.8771]
MAP: [0.9926, 0.9898, 0.989, 0.9889, 0.9868, 0.9846]
------------------------------------------------------
Model extra_trees_all


100%|██████████| 100/100 [01:08<00:00,  1.46it/s]


AVERAGE time to load the distances and execute the model:
----0.55----
Precisions: [0.971, 0.964, 0.939, 0.9108, 0.8884, 0.8645]
Recall: [0.0498, 0.0964, 0.1383, 0.1737, 0.2081, 0.2395]
Max recall: [0.0528, 0.1055, 0.1583, 0.2111, 0.2638, 0.3166]
Recall percentage: [0.9434, 0.9133, 0.8739, 0.8229, 0.7888, 0.7565]
MAP: [0.974, 0.9777, 0.9751, 0.9743, 0.9709, 0.9687]
------------------------------------------------------
Model extra_trees_all_fs


100%|██████████| 100/100 [01:03<00:00,  1.58it/s]


AVERAGE time to load the distances and execute the model:
----0.54----
Precisions: [0.981, 0.974, 0.949, 0.9185, 0.8882, 0.8663]
Recall: [0.049, 0.0945, 0.1351, 0.1699, 0.2009, 0.2324]
Max recall: [0.0528, 0.1055, 0.1583, 0.2111, 0.2638, 0.3166]
Recall percentage: [0.9294, 0.8955, 0.8533, 0.8049, 0.7617, 0.7339]
MAP: [0.9867, 0.9862, 0.9868, 0.9871, 0.9859, 0.9838]
------------------------------------------------------
Model extra_trees_no_syntactic


100%|██████████| 100/100 [00:53<00:00,  1.88it/s]


AVERAGE time to load the distances and execute the model:
----0.46----
Precisions: [0.981, 0.97, 0.954, 0.9317, 0.913, 0.898]
Recall: [0.0498, 0.0959, 0.1393, 0.1744, 0.2094, 0.2436]
Max recall: [0.0528, 0.1055, 0.1583, 0.2111, 0.2638, 0.3166]
Recall percentage: [0.9435, 0.9083, 0.88, 0.8263, 0.7939, 0.7693]
MAP: [0.9935, 0.9864, 0.9837, 0.9811, 0.9761, 0.9718]
------------------------------------------------------
Model extra_trees_no_syntactic_fs


100%|██████████| 100/100 [00:51<00:00,  1.96it/s]


AVERAGE time to load the distances and execute the model:
----0.44----
Precisions: [0.97, 0.962, 0.931, 0.9, 0.8668, 0.8382]
Recall: [0.0489, 0.0959, 0.1348, 0.1672, 0.1946, 0.2204]
Max recall: [0.0528, 0.1055, 0.1583, 0.2111, 0.2638, 0.3166]
Recall percentage: [0.9276, 0.9086, 0.8515, 0.7921, 0.7375, 0.6961]
MAP: [0.99, 0.9835, 0.9813, 0.98, 0.9777, 0.9745]
------------------------------------------------------
Model extra_trees_all_fs_deep


100%|██████████| 100/100 [00:59<00:00,  1.69it/s]


AVERAGE time to load the distances and execute the model:
----0.51----
Precisions: [0.959, 0.947, 0.9183, 0.8942, 0.8662, 0.8387]
Recall: [0.0505, 0.0951, 0.1329, 0.1681, 0.2004, 0.2278]
Max recall: [0.0528, 0.1055, 0.1583, 0.2111, 0.2638, 0.3166]
Recall percentage: [0.9565, 0.9013, 0.8397, 0.7964, 0.7595, 0.7196]
MAP: [0.9657, 0.9667, 0.9673, 0.9651, 0.9636, 0.9631]
------------------------------------------------------
Model extra_trees_no_syntactic_fs_deep


100%|██████████| 100/100 [00:47<00:00,  2.09it/s]


AVERAGE time to load the distances and execute the model:
----0.43----
Precisions: [0.979, 0.9675, 0.9473, 0.9172, 0.8952, 0.8797]
Recall: [0.0492, 0.0949, 0.1369, 0.1718, 0.2043, 0.2366]
Max recall: [0.0528, 0.1055, 0.1583, 0.2111, 0.2638, 0.3166]
Recall percentage: [0.9328, 0.8997, 0.8651, 0.8138, 0.7743, 0.7472]
MAP: [0.9945, 0.9921, 0.9908, 0.9898, 0.9842, 0.9798]
------------------------------------------------------
Model xgboosting_all


100%|██████████| 100/100 [00:57<00:00,  1.74it/s]


AVERAGE time to load the distances and execute the model:
----0.41----
Precisions: [0.988, 0.975, 0.9403, 0.9105, 0.8858, 0.8547]
Recall: [0.0523, 0.1011, 0.1405, 0.175, 0.2087, 0.2358]
Max recall: [0.0528, 0.1055, 0.1583, 0.2111, 0.2638, 0.3166]
Recall percentage: [0.9909, 0.9582, 0.8879, 0.8292, 0.7909, 0.7448]
MAP: [0.9998, 0.9977, 0.9954, 0.9865, 0.9835, 0.9803]
------------------------------------------------------
Model xgboosting_all_fs


100%|██████████| 100/100 [00:52<00:00,  1.90it/s]


AVERAGE time to load the distances and execute the model:
----0.40----
Precisions: [0.984, 0.9625, 0.9247, 0.8878, 0.8586, 0.8303]
Recall: [0.0521, 0.0978, 0.1347, 0.1664, 0.1981, 0.2271]
Max recall: [0.0528, 0.1055, 0.1583, 0.2111, 0.2638, 0.3166]
Recall percentage: [0.9866, 0.9271, 0.8508, 0.7887, 0.751, 0.7173]
MAP: [0.9956, 0.9929, 0.9901, 0.9826, 0.9768, 0.9716]
------------------------------------------------------
Model xgboosting_no_syntactic


100%|██████████| 100/100 [00:44<00:00,  2.25it/s]


AVERAGE time to load the distances and execute the model:
----0.34----
Precisions: [0.984, 0.9645, 0.943, 0.92, 0.9018, 0.8818]
Recall: [0.0519, 0.0986, 0.1412, 0.1789, 0.215, 0.2465]
Max recall: [0.0528, 0.1055, 0.1583, 0.2111, 0.2638, 0.3166]
Recall percentage: [0.984, 0.9339, 0.8917, 0.8475, 0.815, 0.7785]
MAP: [0.9935, 0.9935, 0.9864, 0.9804, 0.9775, 0.9749]
------------------------------------------------------
Model xgboosting_no_syntactic_fs


100%|██████████| 100/100 [00:43<00:00,  2.32it/s]


AVERAGE time to load the distances and execute the model:
----0.34----
Precisions: [0.989, 0.9685, 0.944, 0.923, 0.907, 0.8823]
Recall: [0.052, 0.0989, 0.1412, 0.1799, 0.2178, 0.2491]
Max recall: [0.0528, 0.1055, 0.1583, 0.2111, 0.2638, 0.3166]
Recall percentage: [0.9859, 0.9369, 0.8922, 0.8523, 0.8256, 0.7868]
MAP: [0.9984, 0.9976, 0.9924, 0.987, 0.98, 0.9775]
------------------------------------------------------
Model catboost_all


100%|██████████| 100/100 [00:53<00:00,  1.87it/s]


AVERAGE time to load the distances and execute the model:
----0.40----
Precisions: [0.991, 0.9725, 0.954, 0.9385, 0.9208, 0.909]
Recall: [0.0514, 0.099, 0.1425, 0.1821, 0.2201, 0.2583]
Max recall: [0.0528, 0.1055, 0.1583, 0.2111, 0.2638, 0.3166]
Recall percentage: [0.9734, 0.9386, 0.9001, 0.8629, 0.8345, 0.8158]
MAP: [0.9974, 0.9933, 0.9883, 0.9842, 0.9803, 0.977]
------------------------------------------------------
Model catboost_all_fs


100%|██████████| 100/100 [00:50<00:00,  1.99it/s]


AVERAGE time to load the distances and execute the model:
----0.40----
Precisions: [0.983, 0.967, 0.947, 0.9318, 0.9182, 0.8987]
Recall: [0.0511, 0.0986, 0.1411, 0.1819, 0.2221, 0.2562]
Max recall: [0.0528, 0.1055, 0.1583, 0.2111, 0.2638, 0.3166]
Recall percentage: [0.9687, 0.934, 0.8914, 0.8621, 0.8419, 0.8092]
MAP: [0.9962, 0.9911, 0.9862, 0.9818, 0.9736, 0.9704]
------------------------------------------------------
Model catboost_no_syntactic


100%|██████████| 100/100 [00:41<00:00,  2.41it/s]


AVERAGE time to load the distances and execute the model:
----0.34----
Precisions: [0.981, 0.961, 0.935, 0.9108, 0.879, 0.8548]
Recall: [0.0509, 0.0965, 0.1378, 0.1759, 0.2072, 0.239]
Max recall: [0.0528, 0.1055, 0.1583, 0.2111, 0.2638, 0.3166]
Recall percentage: [0.9651, 0.9142, 0.8703, 0.8334, 0.7853, 0.755]
MAP: [0.9969, 0.9918, 0.9842, 0.9784, 0.9736, 0.9676]
------------------------------------------------------
Model catboost_no_syntactic_fs


100%|██████████| 100/100 [00:40<00:00,  2.49it/s]

AVERAGE time to load the distances and execute the model:
----0.33----
Precisions: [0.984, 0.9715, 0.9457, 0.9193, 0.89, 0.8643]
Recall: [0.0507, 0.0979, 0.1378, 0.1732, 0.2053, 0.2373]
Max recall: [0.0528, 0.1055, 0.1583, 0.2111, 0.2638, 0.3166]
Recall percentage: [0.96, 0.9274, 0.8707, 0.8207, 0.7782, 0.7497]
MAP: [0.9978, 0.9905, 0.9867, 0.9846, 0.9831, 0.9772]
------------------------------------------------------

 1. Model: gradient_boosting_no_syntactic_fs_deep | Avg Precision: 0.9718
 2. Model: catboost_all         | Avg Precision: 0.9476
 3. Model: gradient_boosting_all_fs_deep | Avg Precision: 0.9443
 4. Model: extra_trees_no_syntactic | Avg Precision: 0.9413
 5. Model: catboost_all_fs      | Avg Precision: 0.9409
 6. Model: gradient_boosting_all | Avg Precision: 0.9365
 7. Model: gradient_boosting_all_fs | Avg Precision: 0.9358
 8. Model: xgboosting_no_syntactic_fs | Avg Precision: 0.9356
 9. Model: xgboosting_no_syntactic | Avg Precision: 0.9325
10. Model: extra_trees_no_sy




### D3L

In [167]:
k = 100
step = 10
ground_truth_path = 'C:/Projects/benchmarks/d3l/d3l_ground_truth_sample.csv'
distances_folder_path = 'C:/Projects/benchmarks/d3l/distances/'

custom_models = {}
custom_typologies = ["no_syntactic", "no_syntactic_fs", "no_syntactic_fs_deep"]

for name in best_names:
    for type in custom_typologies:
        try:
            custom_models[f"{name}_{type}"] = joblib.load(models_path / type / f"{name}_{type}.pkl")
        except FileNotFoundError:
            pass

evaluate_models(k, step, ground_truth_path, distances_folder_path, custom_models)

Model gradient_boosting_no_syntactic


100%|██████████| 100/100 [00:07<00:00, 14.25it/s]


AVERAGE time to load the distances and execute the model:
----0.04----
Precisions: [0.733, 0.673, 0.5917, 0.5038, 0.4414, 0.3947, 0.3597, 0.3319, 0.3106, 0.2931]
Recall: [0.0516, 0.0949, 0.1252, 0.1423, 0.1558, 0.1671, 0.1776, 0.1872, 0.1969, 0.2064]
Max recall: [0.07, 0.14, 0.2099, 0.2799, 0.3499, 0.4199, 0.4899, 0.5598, 0.6298, 0.6998]
Recall percentage: [0.7375, 0.6783, 0.5965, 0.5084, 0.4452, 0.3979, 0.3625, 0.3343, 0.3127, 0.295]
MAP: [0.724, 0.7601, 0.7701, 0.7709, 0.7698, 0.7702, 0.7701, 0.7707, 0.7705, 0.7709]
------------------------------------------------------
Model gradient_boosting_no_syntactic_fs


100%|██████████| 100/100 [00:06<00:00, 15.60it/s]


AVERAGE time to load the distances and execute the model:
----0.04----
Precisions: [0.733, 0.6725, 0.5913, 0.5035, 0.4418, 0.3948, 0.3596, 0.3318, 0.3103, 0.2927]
Recall: [0.0516, 0.0949, 0.1252, 0.1422, 0.156, 0.1672, 0.1775, 0.1871, 0.1968, 0.2062]
Max recall: [0.07, 0.14, 0.2099, 0.2799, 0.3499, 0.4199, 0.4899, 0.5598, 0.6298, 0.6998]
Recall percentage: [0.7375, 0.6778, 0.5962, 0.5081, 0.4458, 0.3981, 0.3624, 0.3342, 0.3125, 0.2947]
MAP: [0.724, 0.7599, 0.77, 0.7707, 0.7697, 0.7702, 0.7702, 0.7708, 0.7705, 0.7709]
------------------------------------------------------
Model gradient_boosting_no_syntactic_fs_deep


100%|██████████| 100/100 [00:05<00:00, 16.71it/s]


AVERAGE time to load the distances and execute the model:
----0.04----
Precisions: [0.814, 0.821, 0.842, 0.8155, 0.7896, 0.7953, 0.8014, 0.7894, 0.7709, 0.7546]
Recall: [0.057, 0.1151, 0.1769, 0.228, 0.2755, 0.3322, 0.3898, 0.438, 0.4806, 0.522]
Max recall: [0.07, 0.14, 0.2099, 0.2799, 0.3499, 0.4199, 0.4899, 0.5598, 0.6298, 0.6998]
Recall percentage: [0.815, 0.8227, 0.8426, 0.8147, 0.7874, 0.7911, 0.7957, 0.7824, 0.7631, 0.7459]
MAP: [0.7453, 0.7972, 0.8129, 0.8219, 0.8246, 0.8233, 0.8254, 0.8288, 0.8325, 0.8361]
------------------------------------------------------
Model extra_trees_no_syntactic


100%|██████████| 100/100 [00:08<00:00, 11.24it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.797, 0.7435, 0.6803, 0.6165, 0.568, 0.55, 0.5477, 0.541, 0.531, 0.5258]
Recall: [0.0558, 0.1043, 0.1434, 0.1734, 0.1998, 0.2315, 0.2681, 0.3019, 0.3325, 0.3651]
Max recall: [0.07, 0.14, 0.2099, 0.2799, 0.3499, 0.4199, 0.4899, 0.5598, 0.6298, 0.6998]
Recall percentage: [0.7969, 0.7453, 0.6829, 0.6196, 0.5711, 0.5513, 0.5474, 0.5393, 0.5279, 0.5218]
MAP: [0.798, 0.8132, 0.8037, 0.796, 0.7841, 0.7585, 0.7406, 0.7327, 0.7278, 0.7237]
------------------------------------------------------
Model extra_trees_no_syntactic_fs


100%|██████████| 100/100 [00:08<00:00, 11.98it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.744, 0.689, 0.6183, 0.568, 0.5478, 0.5435, 0.5496, 0.5444, 0.5359, 0.5317]
Recall: [0.0521, 0.0966, 0.1302, 0.1592, 0.1915, 0.2275, 0.268, 0.3028, 0.3347, 0.3684]
Max recall: [0.07, 0.14, 0.2099, 0.2799, 0.3499, 0.4199, 0.4899, 0.5598, 0.6298, 0.6998]
Recall percentage: [0.7451, 0.6901, 0.62, 0.5686, 0.5472, 0.5418, 0.547, 0.5409, 0.5315, 0.5265]
MAP: [0.7558, 0.7698, 0.7611, 0.7468, 0.7295, 0.7093, 0.6941, 0.6902, 0.6889, 0.6861]
------------------------------------------------------
Model extra_trees_no_syntactic_fs_deep


100%|██████████| 100/100 [00:07<00:00, 13.96it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.653, 0.712, 0.709, 0.7033, 0.698, 0.6625, 0.6454, 0.6361, 0.6238, 0.6152]
Recall: [0.0461, 0.1003, 0.1497, 0.1976, 0.2442, 0.2771, 0.314, 0.3529, 0.3887, 0.4255]
Max recall: [0.07, 0.14, 0.2099, 0.2799, 0.3499, 0.4199, 0.4899, 0.5598, 0.6298, 0.6998]
Recall percentage: [0.6592, 0.7164, 0.7133, 0.7059, 0.6978, 0.66, 0.641, 0.6304, 0.6171, 0.6081]
MAP: [0.6067, 0.6752, 0.7023, 0.7208, 0.733, 0.7374, 0.7412, 0.7434, 0.7459, 0.7456]
------------------------------------------------------
Model xgboosting_no_syntactic


100%|██████████| 100/100 [00:09<00:00, 10.18it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.762, 0.6905, 0.652, 0.5943, 0.5394, 0.5033, 0.461, 0.4269, 0.393, 0.3669]
Recall: [0.0535, 0.0974, 0.138, 0.1678, 0.1904, 0.2125, 0.2268, 0.2399, 0.2484, 0.2576]
Max recall: [0.07, 0.14, 0.2099, 0.2799, 0.3499, 0.4199, 0.4899, 0.5598, 0.6298, 0.6998]
Recall percentage: [0.7641, 0.6956, 0.6575, 0.5996, 0.5442, 0.5061, 0.4629, 0.4286, 0.3944, 0.368]
MAP: [0.7655, 0.7793, 0.7721, 0.7652, 0.7554, 0.7416, 0.7348, 0.7292, 0.7265, 0.7195]
------------------------------------------------------
Model xgboosting_no_syntactic_fs


100%|██████████| 100/100 [00:09<00:00, 10.89it/s]


AVERAGE time to load the distances and execute the model:
----0.05----
Precisions: [0.75, 0.6775, 0.6357, 0.5743, 0.5276, 0.4995, 0.4571, 0.424, 0.3894, 0.3625]
Recall: [0.0527, 0.0955, 0.1345, 0.162, 0.186, 0.2109, 0.2248, 0.2382, 0.246, 0.2544]
Max recall: [0.07, 0.14, 0.2099, 0.2799, 0.3499, 0.4199, 0.4899, 0.5598, 0.6298, 0.6998]
Recall percentage: [0.7526, 0.6824, 0.6406, 0.5787, 0.5316, 0.5022, 0.459, 0.4255, 0.3906, 0.3636]
MAP: [0.7581, 0.7743, 0.7647, 0.7557, 0.7404, 0.7229, 0.7193, 0.7136, 0.7117, 0.7073]
------------------------------------------------------
Model catboost_no_syntactic


100%|██████████| 100/100 [00:07<00:00, 13.78it/s]


AVERAGE time to load the distances and execute the model:
----0.04----
Precisions: [0.759, 0.696, 0.5897, 0.492, 0.4346, 0.4072, 0.3971, 0.3891, 0.3843, 0.3742]
Recall: [0.0533, 0.098, 0.1249, 0.1391, 0.1533, 0.1721, 0.1953, 0.2181, 0.2418, 0.261]
Max recall: [0.07, 0.14, 0.2099, 0.2799, 0.3499, 0.4199, 0.4899, 0.5598, 0.6298, 0.6998]
Recall percentage: [0.7621, 0.7001, 0.5947, 0.4968, 0.4381, 0.4098, 0.3987, 0.3896, 0.3839, 0.373]
MAP: [0.7261, 0.7646, 0.7697, 0.7686, 0.7564, 0.7349, 0.7124, 0.6978, 0.6869, 0.679]
------------------------------------------------------
Model catboost_no_syntactic_fs


100%|██████████| 100/100 [00:06<00:00, 14.63it/s]

AVERAGE time to load the distances and execute the model:
----0.04----
Precisions: [0.777, 0.7075, 0.6273, 0.5443, 0.4966, 0.481, 0.4779, 0.4714, 0.4654, 0.4567]
Recall: [0.0546, 0.0996, 0.1329, 0.154, 0.1753, 0.2029, 0.2343, 0.2633, 0.2917, 0.3174]
Max recall: [0.07, 0.14, 0.2099, 0.2799, 0.3499, 0.4199, 0.4899, 0.5598, 0.6298, 0.6998]
Recall percentage: [0.7797, 0.7116, 0.633, 0.55, 0.5009, 0.4832, 0.4783, 0.4702, 0.4632, 0.4536]
MAP: [0.7888, 0.8067, 0.8019, 0.796, 0.7807, 0.7505, 0.7293, 0.7205, 0.7156, 0.7139]
------------------------------------------------------

 1. Model: gradient_boosting_no_syntactic_fs_deep | Avg Precision: 0.7994
 2. Model: extra_trees_no_syntactic_fs_deep | Avg Precision: 0.6658
 3. Model: extra_trees_no_syntactic | Avg Precision: 0.6101
 4. Model: extra_trees_no_syntactic_fs | Avg Precision: 0.5872
 5. Model: catboost_no_syntactic_fs | Avg Precision: 0.5505
 6. Model: xgboosting_no_syntactic | Avg Precision: 0.5389
 7. Model: xgboosting_no_syntactic_fs |




### Freyja

In [168]:
k = 10
step = 1
ground_truth_path = 'C:/Projects/benchmarks/freyja/freyja_ground_truth.csv'
distances_folder_path = 'C:/Projects/benchmarks/freyja/distances/'

custom_models = {}
custom_typologies = ["no_syntactic", "no_syntactic_fs", "no_syntactic_fs_deep"]

for name in best_names:
    for type in custom_typologies:
        try:
            custom_models[f"{name}_{type}"] = joblib.load(models_path / type / f"{name}_{type}.pkl")
        except FileNotFoundError:
            pass

evaluate_models(k, step, ground_truth_path, distances_folder_path, custom_models)

Model gradient_boosting_no_syntactic


100%|██████████| 50/50 [00:01<00:00, 32.86it/s]


AVERAGE time to load the distances and execute the model:
----0.01----
Precisions: [1.0, 0.94, 0.9333, 0.925, 0.92, 0.9233, 0.9086, 0.89, 0.86, 0.842]
Recall: [0.0309, 0.0579, 0.0863, 0.1139, 0.1419, 0.1709, 0.1962, 0.2199, 0.2395, 0.261]
Max recall: [0.0309, 0.0618, 0.0928, 0.1237, 0.1546, 0.1855, 0.2164, 0.2473, 0.2783, 0.3092]
Recall percentage: [1.0, 0.9359, 0.9301, 0.9213, 0.9182, 0.9213, 0.9066, 0.8889, 0.8606, 0.8441]
MAP: [1.0, 1.0, 0.9833, 0.975, 0.9649, 0.9578, 0.9535, 0.9513, 0.9489, 0.9439]
------------------------------------------------------
Model gradient_boosting_no_syntactic_fs


100%|██████████| 50/50 [00:01<00:00, 38.89it/s]


AVERAGE time to load the distances and execute the model:
----0.01----
Precisions: [1.0, 0.94, 0.9267, 0.92, 0.916, 0.9233, 0.9086, 0.89, 0.86, 0.842]
Recall: [0.0309, 0.0579, 0.0856, 0.1132, 0.1412, 0.1709, 0.1962, 0.2199, 0.2395, 0.261]
Max recall: [0.0309, 0.0618, 0.0928, 0.1237, 0.1546, 0.1855, 0.2164, 0.2473, 0.2783, 0.3092]
Recall percentage: [1.0, 0.9359, 0.9227, 0.9157, 0.9137, 0.9213, 0.9066, 0.8889, 0.8606, 0.8441]
MAP: [1.0, 1.0, 0.9867, 0.9739, 0.9628, 0.9555, 0.9517, 0.9498, 0.9476, 0.9428]
------------------------------------------------------
Model gradient_boosting_no_syntactic_fs_deep


100%|██████████| 50/50 [00:01<00:00, 44.21it/s]


AVERAGE time to load the distances and execute the model:
----0.01----
Precisions: [1.0, 0.97, 0.96, 0.95, 0.952, 0.9467, 0.9429, 0.93, 0.9267, 0.926]
Recall: [0.0309, 0.0598, 0.0888, 0.1171, 0.147, 0.1757, 0.2039, 0.23, 0.2581, 0.2868]
Max recall: [0.0309, 0.0618, 0.0928, 0.1237, 0.1546, 0.1855, 0.2164, 0.2473, 0.2783, 0.3092]
Recall percentage: [1.0, 0.9673, 0.957, 0.9469, 0.9512, 0.947, 0.9423, 0.9299, 0.9275, 0.9276]
MAP: [1.0, 1.0, 0.9967, 0.9944, 0.9858, 0.9837, 0.9812, 0.9792, 0.9769, 0.9665]
------------------------------------------------------
Model extra_trees_no_syntactic


100%|██████████| 50/50 [00:01<00:00, 26.68it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.95, 0.9467, 0.935, 0.92, 0.9167, 0.9086, 0.9, 0.8844, 0.868]
Recall: [0.0309, 0.0586, 0.0876, 0.1153, 0.142, 0.1697, 0.1961, 0.2227, 0.2464, 0.2691]
Max recall: [0.0309, 0.0618, 0.0928, 0.1237, 0.1546, 0.1855, 0.2164, 0.2473, 0.2783, 0.3092]
Recall percentage: [1.0, 0.9474, 0.9447, 0.9322, 0.9184, 0.9147, 0.9063, 0.9002, 0.8853, 0.8704]
MAP: [1.0, 1.0, 0.99, 0.9828, 0.9736, 0.9689, 0.9663, 0.9564, 0.954, 0.9503]
------------------------------------------------------
Model extra_trees_no_syntactic_fs


100%|██████████| 50/50 [00:01<00:00, 30.73it/s]


AVERAGE time to load the distances and execute the model:
----0.01----
Precisions: [1.0, 0.97, 0.94, 0.935, 0.932, 0.93, 0.9257, 0.9125, 0.8889, 0.87]
Recall: [0.0309, 0.06, 0.0873, 0.1156, 0.1442, 0.1726, 0.2006, 0.2264, 0.2482, 0.27]
Max recall: [0.0309, 0.0618, 0.0928, 0.1237, 0.1546, 0.1855, 0.2164, 0.2473, 0.2783, 0.3092]
Recall percentage: [1.0, 0.9697, 0.9412, 0.9346, 0.9329, 0.9305, 0.9271, 0.9155, 0.892, 0.8733]
MAP: [1.0, 1.0, 0.9933, 0.9878, 0.9737, 0.9673, 0.9639, 0.9629, 0.9615, 0.9585]
------------------------------------------------------
Model extra_trees_no_syntactic_fs_deep


100%|██████████| 50/50 [00:01<00:00, 38.70it/s]


AVERAGE time to load the distances and execute the model:
----0.01----
Precisions: [1.0, 0.95, 0.92, 0.925, 0.936, 0.93, 0.9143, 0.915, 0.9067, 0.898]
Recall: [0.0309, 0.0586, 0.0853, 0.1142, 0.1444, 0.1721, 0.1977, 0.2264, 0.2529, 0.2784]
Max recall: [0.0309, 0.0618, 0.0928, 0.1237, 0.1546, 0.1855, 0.2164, 0.2473, 0.2783, 0.3092]
Recall percentage: [1.0, 0.9481, 0.9191, 0.9235, 0.9343, 0.928, 0.9136, 0.9155, 0.9087, 0.9003]
MAP: [1.0, 1.0, 0.9967, 0.9861, 0.9659, 0.9617, 0.9598, 0.9546, 0.9514, 0.9491]
------------------------------------------------------
Model xgboosting_no_syntactic


100%|██████████| 50/50 [00:02<00:00, 22.49it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.95, 0.9533, 0.94, 0.912, 0.9033, 0.9029, 0.8625, 0.8489, 0.834]
Recall: [0.0309, 0.0586, 0.0883, 0.1161, 0.141, 0.1675, 0.1953, 0.2132, 0.2365, 0.2586]
Max recall: [0.0309, 0.0618, 0.0928, 0.1237, 0.1546, 0.1855, 0.2164, 0.2473, 0.2783, 0.3092]
Recall percentage: [1.0, 0.9484, 0.9521, 0.9388, 0.912, 0.9031, 0.9025, 0.862, 0.8501, 0.8365]
MAP: [1.0, 1.0, 0.9867, 0.9828, 0.9741, 0.9692, 0.9637, 0.9618, 0.9567, 0.9526]
------------------------------------------------------
Model xgboosting_no_syntactic_fs


100%|██████████| 50/50 [00:02<00:00, 24.64it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.96, 0.9467, 0.925, 0.912, 0.8933, 0.8857, 0.855, 0.8356, 0.826]
Recall: [0.0309, 0.0593, 0.0877, 0.1142, 0.1409, 0.1655, 0.1914, 0.211, 0.2324, 0.2558]
Max recall: [0.0309, 0.0618, 0.0928, 0.1237, 0.1546, 0.1855, 0.2164, 0.2473, 0.2783, 0.3092]
Recall percentage: [1.0, 0.9585, 0.9454, 0.9231, 0.9117, 0.892, 0.8843, 0.853, 0.8353, 0.8272]
MAP: [1.0, 1.0, 0.99, 0.985, 0.9738, 0.9648, 0.9575, 0.9544, 0.9511, 0.9462]
------------------------------------------------------
Model catboost_no_syntactic


100%|██████████| 50/50 [00:01<00:00, 32.65it/s]


AVERAGE time to load the distances and execute the model:
----0.01----
Precisions: [1.0, 0.97, 0.9467, 0.935, 0.92, 0.91, 0.8971, 0.875, 0.8689, 0.858]
Recall: [0.0309, 0.0599, 0.0875, 0.1152, 0.142, 0.1688, 0.1943, 0.2167, 0.2423, 0.2663]
Max recall: [0.0309, 0.0618, 0.0928, 0.1237, 0.1546, 0.1855, 0.2164, 0.2473, 0.2783, 0.3092]
Recall percentage: [1.0, 0.9686, 0.9433, 0.9317, 0.9184, 0.91, 0.898, 0.876, 0.8707, 0.8613]
MAP: [1.0, 1.0, 0.9967, 0.9878, 0.9749, 0.969, 0.9649, 0.9636, 0.9565, 0.9524]
------------------------------------------------------
Model catboost_no_syntactic_fs


100%|██████████| 50/50 [00:01<00:00, 35.29it/s]


AVERAGE time to load the distances and execute the model:
----0.01----
Precisions: [1.0, 0.94, 0.9, 0.88, 0.872, 0.8467, 0.8343, 0.8275, 0.8067, 0.798]
Recall: [0.0309, 0.058, 0.0832, 0.1084, 0.1343, 0.1563, 0.1803, 0.2047, 0.2249, 0.2475]
Max recall: [0.0309, 0.0618, 0.0928, 0.1237, 0.1546, 0.1855, 0.2164, 0.2473, 0.2783, 0.3092]
Recall percentage: [1.0, 0.9383, 0.8968, 0.8766, 0.8685, 0.8426, 0.8332, 0.8274, 0.8083, 0.8006]
MAP: [1.0, 1.0, 0.9967, 0.9911, 0.9597, 0.9471, 0.939, 0.929, 0.9236, 0.9139]
------------------------------------------------------

 1. Model: gradient_boosting_no_syntactic_fs_deep | Avg Precision: 0.9504
 2. Model: extra_trees_no_syntactic_fs | Avg Precision: 0.9304
 3. Model: extra_trees_no_syntactic_fs_deep | Avg Precision: 0.9295
 4. Model: extra_trees_no_syntactic | Avg Precision: 0.9229
 5. Model: catboost_no_syntactic | Avg Precision: 0.9181
 6. Model: gradient_boosting_no_syntactic | Avg Precision: 0.9142
 7. Model: gradient_boosting_no_syntactic_fs | A

### OM CG

In [172]:
k = 30
step = 5
ground_truth_path = 'C:/Projects/benchmarks/omnimatch_city_government/omnimatch_city_government_ground_truth.csv'
distances_folder_path = 'C:/Projects/benchmarks/omnimatch_city_government/distances/'

custom_models = {}
custom_typologies = ["no_syntactic", "no_syntactic_fs", "no_syntactic_fs_deep"]

for name in best_names:
    for type in custom_typologies:
        try:
            custom_models[f"{name}_{type}"] = joblib.load(models_path / type / f"{name}_{type}.pkl")
        except FileNotFoundError:
            pass

evaluate_models(k, step, ground_truth_path, distances_folder_path, custom_models)

Model gradient_boosting_no_syntactic


100%|██████████| 50/50 [00:02<00:00, 21.31it/s]


AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [0.46, 0.4, 0.348, 0.328, 0.304, 0.2813]
Recall: [0.0722, 0.1254, 0.164, 0.206, 0.2384, 0.2647]
Max recall: [0.1568, 0.3137, 0.4705, 0.6274, 0.7842, 0.941]
Recall percentage: [0.4602, 0.3998, 0.3486, 0.3284, 0.304, 0.2813]
MAP: [0.538, 0.5533, 0.5486, 0.54, 0.5386, 0.5382]
------------------------------------------------------
Model gradient_boosting_no_syntactic_fs


100%|██████████| 50/50 [00:01<00:00, 28.52it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.452, 0.396, 0.3467, 0.331, 0.304, 0.2793]
Recall: [0.0709, 0.1242, 0.1635, 0.2079, 0.2385, 0.2627]
Max recall: [0.1568, 0.3137, 0.4705, 0.6274, 0.7842, 0.941]
Recall percentage: [0.4523, 0.3958, 0.3474, 0.3313, 0.3041, 0.2792]
MAP: [0.5471, 0.5613, 0.5526, 0.5433, 0.5433, 0.5434]
------------------------------------------------------
Model gradient_boosting_no_syntactic_fs_deep


100%|██████████| 50/50 [00:01<00:00, 31.26it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.552, 0.558, 0.5467, 0.539, 0.5272, 0.5267]
Recall: [0.0864, 0.1748, 0.2567, 0.3378, 0.4133, 0.4953]
Max recall: [0.1568, 0.3137, 0.4705, 0.6274, 0.7842, 0.941]
Recall percentage: [0.5507, 0.5572, 0.5455, 0.5385, 0.527, 0.5263]
MAP: [0.6074, 0.6191, 0.6005, 0.593, 0.5884, 0.5839]
------------------------------------------------------
Model extra_trees_no_syntactic


100%|██████████| 50/50 [00:02<00:00, 20.86it/s]


AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [0.472, 0.496, 0.4947, 0.496, 0.4864, 0.4747]
Recall: [0.074, 0.1557, 0.233, 0.3115, 0.3817, 0.4469]
Max recall: [0.1568, 0.3137, 0.4705, 0.6274, 0.7842, 0.941]
Recall percentage: [0.4717, 0.4962, 0.4953, 0.4965, 0.4868, 0.4749]
MAP: [0.5368, 0.5472, 0.545, 0.543, 0.5436, 0.5447]
------------------------------------------------------
Model extra_trees_no_syntactic_fs


100%|██████████| 50/50 [00:02<00:00, 21.55it/s]


AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [0.496, 0.506, 0.512, 0.524, 0.5072, 0.488]
Recall: [0.0776, 0.1584, 0.2411, 0.3288, 0.3971, 0.4586]
Max recall: [0.1568, 0.3137, 0.4705, 0.6274, 0.7842, 0.941]
Recall percentage: [0.4948, 0.5049, 0.5123, 0.524, 0.5064, 0.4873]
MAP: [0.5536, 0.5592, 0.5512, 0.5551, 0.5554, 0.5585]
------------------------------------------------------
Model extra_trees_no_syntactic_fs_deep


100%|██████████| 50/50 [00:02<00:00, 24.71it/s]


AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [0.496, 0.508, 0.484, 0.476, 0.4576, 0.4407]
Recall: [0.0777, 0.1593, 0.2277, 0.2982, 0.3583, 0.4137]
Max recall: [0.1568, 0.3137, 0.4705, 0.6274, 0.7842, 0.941]
Recall percentage: [0.4955, 0.5077, 0.4839, 0.4753, 0.4569, 0.4396]
MAP: [0.5696, 0.5736, 0.5701, 0.559, 0.5545, 0.5524]
------------------------------------------------------
Model xgboosting_no_syntactic


100%|██████████| 50/50 [00:03<00:00, 16.60it/s]


AVERAGE time to load the distances and execute the model:
----0.04----
Precisions: [0.524, 0.508, 0.5067, 0.492, 0.4648, 0.4393]
Recall: [0.082, 0.1593, 0.2389, 0.3087, 0.3644, 0.414]
Max recall: [0.1568, 0.3137, 0.4705, 0.6274, 0.7842, 0.941]
Recall percentage: [0.5228, 0.508, 0.5078, 0.492, 0.4647, 0.44]
MAP: [0.5881, 0.5879, 0.5732, 0.5706, 0.566, 0.5646]
------------------------------------------------------
Model xgboosting_no_syntactic_fs


100%|██████████| 50/50 [00:02<00:00, 18.38it/s]


AVERAGE time to load the distances and execute the model:
----0.04----
Precisions: [0.488, 0.524, 0.4973, 0.475, 0.4424, 0.4127]
Recall: [0.0765, 0.1643, 0.2345, 0.2985, 0.3476, 0.3896]
Max recall: [0.1568, 0.3137, 0.4705, 0.6274, 0.7842, 0.941]
Recall percentage: [0.4874, 0.5237, 0.4984, 0.4758, 0.4432, 0.414]
MAP: [0.5507, 0.5647, 0.5642, 0.5621, 0.5572, 0.5512]
------------------------------------------------------
Model catboost_no_syntactic


100%|██████████| 50/50 [00:02<00:00, 24.55it/s]


AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [0.516, 0.522, 0.476, 0.461, 0.4392, 0.4113]
Recall: [0.0811, 0.1641, 0.2241, 0.2894, 0.3444, 0.3873]
Max recall: [0.1568, 0.3137, 0.4705, 0.6274, 0.7842, 0.941]
Recall percentage: [0.5171, 0.5232, 0.4764, 0.4614, 0.4391, 0.4115]
MAP: [0.5929, 0.5772, 0.5695, 0.5638, 0.5586, 0.5508]
------------------------------------------------------
Model catboost_no_syntactic_fs


100%|██████████| 50/50 [00:01<00:00, 26.10it/s]

AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [0.468, 0.39, 0.356, 0.329, 0.3064, 0.28]
Recall: [0.0735, 0.1225, 0.1677, 0.2068, 0.2409, 0.2642]
Max recall: [0.1568, 0.3137, 0.4705, 0.6274, 0.7842, 0.941]
Recall percentage: [0.4687, 0.3906, 0.3565, 0.3296, 0.3072, 0.2807]
MAP: [0.5874, 0.5766, 0.5748, 0.5696, 0.5686, 0.5675]
------------------------------------------------------

 1. Model: gradient_boosting_no_syntactic_fs_deep | Avg Precision: 0.5416
 2. Model: extra_trees_no_syntactic_fs | Avg Precision: 0.5055
 3. Model: xgboosting_no_syntactic | Avg Precision: 0.4891
 4. Model: extra_trees_no_syntactic | Avg Precision: 0.4866
 5. Model: extra_trees_no_syntactic_fs_deep | Avg Precision: 0.4770
 6. Model: xgboosting_no_syntactic_fs | Avg Precision: 0.4732
 7. Model: catboost_no_syntactic | Avg Precision: 0.4709
 8. Model: catboost_no_syntactic_fs | Avg Precision: 0.3549
 9. Model: gradient_boosting_no_syntactic | Avg Precision: 0.3536
10. Model: 




### OM CR

In [174]:
k = 30
step = 5
ground_truth_path = 'C:/Projects/benchmarks/omnimatch_culture_recreation/omnimatch_culture_recreation_ground_truth.csv'
distances_folder_path = 'C:/Projects/benchmarks/omnimatch_culture_recreation/distances/'

custom_models = {}
custom_typologies = ["no_syntactic", "no_syntactic_fs", "no_syntactic_fs_deep"]

for name in best_names:
    for type in custom_typologies:
        try:
            custom_models[f"{name}_{type}"] = joblib.load(models_path / type / f"{name}_{type}.pkl")
        except FileNotFoundError:
            pass

evaluate_models(k, step, ground_truth_path, distances_folder_path, custom_models)

Model gradient_boosting_no_syntactic


100%|██████████| 50/50 [00:02<00:00, 21.32it/s]


AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [0.448, 0.436, 0.416, 0.403, 0.4152, 0.4207]
Recall: [0.0645, 0.1256, 0.1791, 0.2316, 0.2983, 0.3626]
Max recall: [0.1444, 0.2888, 0.4332, 0.5776, 0.7221, 0.8665]
Recall percentage: [0.4467, 0.4348, 0.4133, 0.401, 0.4131, 0.4185]
MAP: [0.5071, 0.5224, 0.5244, 0.522, 0.5081, 0.5088]
------------------------------------------------------
Model gradient_boosting_no_syntactic_fs


100%|██████████| 50/50 [00:01<00:00, 27.46it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.456, 0.43, 0.4133, 0.391, 0.4072, 0.4147]
Recall: [0.0658, 0.1238, 0.1781, 0.2245, 0.2925, 0.3577]
Max recall: [0.1444, 0.2888, 0.4332, 0.5776, 0.7221, 0.8665]
Recall percentage: [0.4556, 0.4288, 0.4111, 0.3887, 0.4051, 0.4128]
MAP: [0.5062, 0.52, 0.5151, 0.5112, 0.5039, 0.5026]
------------------------------------------------------
Model gradient_boosting_no_syntactic_fs_deep


100%|██████████| 50/50 [00:01<00:00, 28.02it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.564, 0.568, 0.5733, 0.567, 0.5648, 0.5573]
Recall: [0.0809, 0.1623, 0.2457, 0.3246, 0.404, 0.4773]
Max recall: [0.1444, 0.2888, 0.4332, 0.5776, 0.7221, 0.8665]
Recall percentage: [0.5599, 0.5619, 0.5672, 0.5619, 0.5595, 0.5508]
MAP: [0.5724, 0.6127, 0.6097, 0.6086, 0.6097, 0.6065]
------------------------------------------------------
Model extra_trees_no_syntactic


100%|██████████| 50/50 [00:02<00:00, 20.37it/s]


AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [0.536, 0.54, 0.496, 0.482, 0.4824, 0.4647]
Recall: [0.0765, 0.1549, 0.2139, 0.2779, 0.3479, 0.4023]
Max recall: [0.1444, 0.2888, 0.4332, 0.5776, 0.7221, 0.8665]
Recall percentage: [0.5294, 0.5365, 0.4937, 0.4811, 0.4819, 0.4642]
MAP: [0.5771, 0.6062, 0.6016, 0.5859, 0.5747, 0.5671]
------------------------------------------------------
Model extra_trees_no_syntactic_fs


100%|██████████| 50/50 [00:02<00:00, 19.46it/s]


AVERAGE time to load the distances and execute the model:
----0.04----
Precisions: [0.512, 0.544, 0.5347, 0.529, 0.504, 0.4767]
Recall: [0.0725, 0.1555, 0.2298, 0.3039, 0.3617, 0.4105]
Max recall: [0.1444, 0.2888, 0.4332, 0.5776, 0.7221, 0.8665]
Recall percentage: [0.502, 0.5384, 0.5304, 0.526, 0.5009, 0.4737]
MAP: [0.5562, 0.5884, 0.5845, 0.5823, 0.5724, 0.5665]
------------------------------------------------------
Model extra_trees_no_syntactic_fs_deep


100%|██████████| 50/50 [00:01<00:00, 25.06it/s]


AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [0.492, 0.492, 0.4973, 0.509, 0.5152, 0.516]
Recall: [0.0699, 0.1398, 0.2128, 0.2906, 0.3678, 0.4415]
Max recall: [0.1444, 0.2888, 0.4332, 0.5776, 0.7221, 0.8665]
Recall percentage: [0.4842, 0.4841, 0.4912, 0.5031, 0.5094, 0.5095]
MAP: [0.5311, 0.551, 0.5528, 0.5503, 0.5498, 0.5535]
------------------------------------------------------
Model xgboosting_no_syntactic


100%|██████████| 50/50 [00:02<00:00, 16.96it/s]


AVERAGE time to load the distances and execute the model:
----0.04----
Precisions: [0.448, 0.438, 0.4147, 0.401, 0.3976, 0.3867]
Recall: [0.0645, 0.1259, 0.1788, 0.2309, 0.2865, 0.3347]
Max recall: [0.1444, 0.2888, 0.4332, 0.5776, 0.7221, 0.8665]
Recall percentage: [0.4463, 0.436, 0.4126, 0.3996, 0.3968, 0.3862]
MAP: [0.4843, 0.5019, 0.4929, 0.4872, 0.4846, 0.4859]
------------------------------------------------------
Model xgboosting_no_syntactic_fs


100%|██████████| 50/50 [00:02<00:00, 18.28it/s]


AVERAGE time to load the distances and execute the model:
----0.04----
Precisions: [0.412, 0.404, 0.4067, 0.397, 0.4008, 0.394]
Recall: [0.0588, 0.1157, 0.175, 0.2283, 0.2886, 0.3404]
Max recall: [0.1444, 0.2888, 0.4332, 0.5776, 0.7221, 0.8665]
Recall percentage: [0.4075, 0.4006, 0.4039, 0.3952, 0.3997, 0.3929]
MAP: [0.4619, 0.488, 0.4762, 0.4665, 0.4623, 0.4643]
------------------------------------------------------
Model catboost_no_syntactic


100%|██████████| 50/50 [00:02<00:00, 24.53it/s]


AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [0.524, 0.5, 0.4667, 0.452, 0.4232, 0.398]
Recall: [0.0753, 0.1433, 0.2012, 0.2603, 0.3055, 0.3452]
Max recall: [0.1444, 0.2888, 0.4332, 0.5776, 0.7221, 0.8665]
Recall percentage: [0.5215, 0.4963, 0.4645, 0.4507, 0.4231, 0.3984]
MAP: [0.5574, 0.5763, 0.5773, 0.5561, 0.5472, 0.5417]
------------------------------------------------------
Model catboost_no_syntactic_fs


100%|██████████| 50/50 [00:01<00:00, 25.87it/s]

AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [0.488, 0.466, 0.4467, 0.422, 0.4024, 0.3767]
Recall: [0.0701, 0.1337, 0.1926, 0.2426, 0.2898, 0.3259]
Max recall: [0.1444, 0.2888, 0.4332, 0.5776, 0.7221, 0.8665]
Recall percentage: [0.4854, 0.4629, 0.4446, 0.4199, 0.4013, 0.3762]
MAP: [0.5473, 0.5705, 0.5753, 0.5585, 0.5535, 0.5451]
------------------------------------------------------

 1. Model: gradient_boosting_no_syntactic_fs_deep | Avg Precision: 0.5657
 2. Model: extra_trees_no_syntactic_fs | Avg Precision: 0.5167
 3. Model: extra_trees_no_syntactic_fs_deep | Avg Precision: 0.5036
 4. Model: extra_trees_no_syntactic | Avg Precision: 0.5002
 5. Model: catboost_no_syntactic | Avg Precision: 0.4607
 6. Model: catboost_no_syntactic_fs | Avg Precision: 0.4336
 7. Model: gradient_boosting_no_syntactic | Avg Precision: 0.4231
 8. Model: gradient_boosting_no_syntactic_fs | Avg Precision: 0.4187
 9. Model: xgboosting_no_syntactic | Avg Precision: 0.4143


