# Defining the final model
Once we have obtained the continuous quality metric for join detection, we will use it to generate a predictive model that can efficiently, and in a very lightweight fashion, detect similarities between columns of a data lake.

In [514]:
import numpy as np
import pandas as pd
import math
from pathlib import Path

from tqdm import tqdm
import time

import joblib

In [515]:
data_folder = 'C:/Projects/freyja/data/'
models_path = 'C:/Projects/freyja_plus_more/models_prenormalized_base'
distances = pd.read_csv(f'C:/Projects/freyja_plus_more/distances/distances_model_prenormalized_base.csv')

# Preparing the data

We will:
- Load the ground truth, which contains a subset of semantic joins, a subset of syntactic joins and a sample of the rest of joins.
- Merge it with the distances. That is, for each selected pair "add" the distances between the metrics of their respective profiles
- Remove unnecessary columns for the models (e.g. dataset and attribute names)
- Transform categorical variables into dummies

**Important**: the `ground_truth_models.csv` file contains all the semantic and syntactic joins detected in the data lake + a sample of joins that do not have a relationship (i.e. containment < 0.1 and no semantic link), indicated with a *null* value in the relationships cell.

In [516]:
# ground_truth = pd.read_csv(f'D:/Work/research/freyja/drive_repo/data/ground_truths/ground_truth_models.csv')
ground_truth = pd.read_csv(f'C:/Projects/freyja_plus_more/distances/ground_truth_models.csv')
ground_truth['relationship'] = ground_truth['relationship'].fillna('unrelated') # Pairs that are neither semantic or syntactic have a NaN. We change it by unrelated to prevent problems.

count_syntactic = (ground_truth['relationship'] == 'syntactic').sum()
count_semantic = (ground_truth['relationship'] == 'semantic').sum()
count_unrelated = (ground_truth['relationship'] == 'unrelated').sum()

print(f"Number of syntactic joins: {count_syntactic}")
print(f"Number of semantic joins: {count_semantic}")
print(f"Number of unrelated pairs: {count_unrelated}")

ground_truth.describe()

Number of syntactic joins: 2703
Number of semantic joins: 1701
Number of unrelated pairs: 18206


Unnamed: 0,containment,cardinality_proportion,jaccard,multiset_jaccard,quality
count,22610.0,22610.0,22610.0,22610.0,22610.0
mean,0.107225,0.2154501,0.043204,0.010597,0.004229
std,0.242172,0.2893796,0.14975,0.050149,0.033486
min,0.0,4.251912e-07,0.0,0.0,0.0
25%,0.0,0.01015228,0.0,0.0,0.0
50%,0.0,0.06189559,0.0,0.0,0.0
75%,0.045455,0.3333333,0.00238,0.000219,2e-06
max,1.0,1.0,1.0,0.5,0.494872


In [517]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 50) 
ground_truth

Unnamed: 0,ds_name,att_name,ds_name_2,att_name_2,relationship,containment,cardinality_proportion,jaccard,multiset_jaccard,quality
0,AdventureWorks2014_stateprovince.csv,Name,world_country.csv,Name,unrelated,0.044199,0.757322,0.019417,0.019048,1.381513e-03
1,Distributions_data_2016.csv,demographics,Tech_sector_diversity_demographics_2016.csv,raceEthnicity,syntactic,0.230769,0.461538,0.187500,0.000186,1.041278e-05
2,USA_cars_datasets.csv,country,world_country.csv,Name,semantic,0.500000,0.008368,0.004167,0.000365,4.659192e-07
3,World_countries_env_vars.csv,Country,world_city.csv,District,unrelated,0.053498,0.177892,0.006250,0.002314,6.364830e-05
4,books_updated.csv,languageCode,countries_metadatacountries.csv,CountryCode,syntactic,0.360000,0.101215,0.034091,0.000878,1.381148e-05
...,...,...,...,...,...,...,...,...,...,...
22605,pte_sulfo.csv,Set,AdventureWorks2014_shift.csv,Name,unrelated,0.000000,0.120000,0.000000,0.000000,0.000000e+00
22606,dataSpotifyClass.csv,song_title,netflix_titles.csv,description,unrelated,0.000000,0.313414,0.000000,0.000000,0.000000e+00
22607,pte_methoxy.csv,Arg0,countries_data.csv,1997,unrelated,0.000000,0.004015,0.000000,0.000000,0.000000e+00
22608,student-mat.csv,internet,AdventureWorks2014_stateprovince.csv,Name,unrelated,0.000000,0.011050,0.000000,0.000000,0.000000e+00


In [518]:
# distances = pd.read_csv(f'C:/Projects/freyja_plus_more/distances/distances_model_prenormalized_all.csv')
distances

Unnamed: 0,cardinality,uniqueness,entropy,incompleteness,frequency_avg,frequency_min,frequency_max,frequency_sd,val_pct_min,val_pct_max,val_pct_std,constancy,freq_word_containment,freq_word_soundex_containment,frequency_1qo,frequency_2qo,frequency_3qo,frequency_4qo,frequency_5qo,frequency_6qo,frequency_7qo,len_max_word,len_min_word,len_avg_word,words_cnt_max,words_cnt_min,words_cnt_avg,number_words,words_cnt_sd,is_empty,is_binary,frequency_iqr,first_word,last_word,name_dist,attribute_name_1,dataset_name_1,attribute_name_2,dataset_name_2
0,-0.001612,0.000000,-0.108842,0.000000,0.000000,0.000000,0.000000,0.000000,0.001341,0.001341,0.000000,0.001341,0.1,0.250000,0.001341,0.001341,0.001341,0.001341,0.001341,0.001341,0.001341,0.391746,0.000000,-0.038572,-0.080854,0.000000,-0.002658,-0.000977,-0.040542,0,0,0.000000,8,7,0,Name,AdventureWorks2014_stateprovince.csv,Name,world_country.csv
1,0.000195,-0.135553,0.312151,0.000000,0.462026,0.760268,0.130824,-0.000765,-0.013986,-0.104895,-0.033880,-0.104895,0.2,0.428571,-0.013986,-0.104895,-0.104895,-0.104895,-0.104895,-0.104895,-0.104895,1.044655,0.505290,1.680113,-0.020214,0.000000,-0.016396,0.171629,-0.055985,0,0,0.000000,4,13,12,demographics,Distributions_data_2016.csv,raceEthnicity,Tech_sector_diversity_demographics_2016.csv
2,-0.006587,-0.999200,-2.136850,0.000000,0.470250,0.003709,0.265810,0.637923,-0.001383,0.993015,0.497199,0.993015,0.0,0.000000,-0.001383,-0.001383,-0.001383,-0.001383,0.993015,0.993015,0.993015,-0.457037,0.252645,-0.651360,-0.121281,0.000000,-0.043013,0.023034,-0.141034,0,1,0.994398,8,7,7,country,USA_cars_datasets.csv,Name,world_country.csv
3,-0.031214,0.665114,-0.391069,-0.000981,-0.000747,0.000000,-0.007470,-0.002878,0.003870,-0.013291,-0.001374,-0.013291,0.0,0.125000,0.003870,0.003870,0.003870,0.003870,0.003625,0.003380,0.002889,-0.457037,0.252645,-0.095685,0.020214,0.000000,0.019129,-0.052841,0.053482,0,0,-0.000490,14,8,7,Country,World_countries_env_vars.csv,District,world_city.csv
4,-0.006171,-0.997500,-1.825795,0.108400,0.133952,0.000000,0.676529,0.660607,-0.003949,0.630051,0.128668,0.630051,0.0,0.125000,-0.003949,-0.003949,-0.003749,-0.003449,-0.003349,-0.001949,0.002351,0.130582,-0.252645,0.091715,0.000000,0.000000,0.000000,0.093049,0.000000,0,0,0.002000,2,2,8,languageCode,books_updated.csv,CountryCode,countries_metadatacountries.csv
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22605,0.000611,0.000000,0.830221,0.000000,0.000000,0.000000,0.000000,0.000000,-0.293333,-0.293333,0.000000,-0.293333,0.0,0.000000,-0.293333,-0.293333,-0.293333,-0.293333,-0.293333,-0.293333,-0.293333,2.219892,8.084643,5.951487,0.000000,0.000000,0.000000,0.000236,0.000000,0,0,0.000000,39,36,4,Set,pte_sulfo.csv,Name,AdventureWorks2014_shift.csv
22606,-0.118797,-0.030943,-0.458812,0.000000,0.000012,0.000000,0.000000,0.000074,0.000335,0.001006,0.000085,0.001006,0.0,0.000000,0.000335,0.000335,0.000335,0.000335,0.000335,0.000335,0.000335,-0.587618,0.252645,-0.035152,-0.525552,-14.027789,-1.844182,-1.519398,-0.086287,0,0,0.000000,104,135,9,song_title,dataSpotifyClass.csv,description,netflix_titles.csv
22607,-0.241318,0.148744,-0.609571,0.000000,-0.000221,0.000000,-1.022797,-0.052781,0.016617,-0.378163,0.010038,-0.378163,0.0,0.000000,0.016617,0.016617,0.016617,0.033283,0.033283,0.033283,0.033283,-1.240528,0.252645,-0.264055,0.000000,0.000000,0.000000,-0.214649,0.000000,0,0,0.016667,22,2,4,Arg0,pte_methoxy.csv,1997,countries_data.csv
22608,-0.004975,-0.994937,-1.858861,0.000000,0.074012,0.040177,0.035000,0.067515,0.161564,0.827387,0.332911,0.827387,0.0,0.000000,0.161564,0.161564,0.161564,0.161564,0.827387,0.827387,0.827387,-1.044655,0.000000,-0.642904,-0.040427,0.000000,-0.040355,0.001428,-0.100492,0,1,0.665823,3,5,7,internet,student-mat.csv,Name,AdventureWorks2014_stateprovince.csv


In [519]:
joined = pd.merge(ground_truth, distances, left_on=['ds_name', 'ds_name_2', 'att_name', 'att_name_2'], right_on=['dataset_name_1', 'dataset_name_2', 'attribute_name_1', 'attribute_name_2'])
# joined_2 = pd.merge(ground_truth, distances, left_on=['ds_name', 'ds_name_2', 'att_name', 'att_name_2'], right_on=['dataset_name_2', 'dataset_name_1', 'attribute_name_2', 'attribute_name_1'])

# merged = pd.concat([joined, joined_2], ignore_index=True)
joined

Unnamed: 0,ds_name,att_name,ds_name_2,att_name_2,relationship,containment,cardinality_proportion,jaccard,multiset_jaccard,quality,cardinality,uniqueness,entropy,incompleteness,frequency_avg,frequency_min,frequency_max,frequency_sd,val_pct_min,val_pct_max,val_pct_std,constancy,freq_word_containment,freq_word_soundex_containment,frequency_1qo,frequency_2qo,frequency_3qo,frequency_4qo,frequency_5qo,frequency_6qo,frequency_7qo,len_max_word,len_min_word,len_avg_word,words_cnt_max,words_cnt_min,words_cnt_avg,number_words,words_cnt_sd,is_empty,is_binary,frequency_iqr,first_word,last_word,name_dist,attribute_name_1,dataset_name_1,attribute_name_2,dataset_name_2
0,AdventureWorks2014_stateprovince.csv,Name,world_country.csv,Name,unrelated,0.044199,0.757322,0.019417,0.019048,1.381513e-03,-0.001612,0.000000,-0.108842,0.000000,0.000000,0.000000,0.000000,0.000000,0.001341,0.001341,0.000000,0.001341,0.1,0.250000,0.001341,0.001341,0.001341,0.001341,0.001341,0.001341,0.001341,0.391746,0.000000,-0.038572,-0.080854,0.000000,-0.002658,-0.000977,-0.040542,0,0,0.000000,8,7,0,Name,AdventureWorks2014_stateprovince.csv,Name,world_country.csv
1,Distributions_data_2016.csv,demographics,Tech_sector_diversity_demographics_2016.csv,raceEthnicity,syntactic,0.230769,0.461538,0.187500,0.000186,1.041278e-05,0.000195,-0.135553,0.312151,0.000000,0.462026,0.760268,0.130824,-0.000765,-0.013986,-0.104895,-0.033880,-0.104895,0.2,0.428571,-0.013986,-0.104895,-0.104895,-0.104895,-0.104895,-0.104895,-0.104895,1.044655,0.505290,1.680113,-0.020214,0.000000,-0.016396,0.171629,-0.055985,0,0,0.000000,4,13,12,demographics,Distributions_data_2016.csv,raceEthnicity,Tech_sector_diversity_demographics_2016.csv
2,USA_cars_datasets.csv,country,world_country.csv,Name,semantic,0.500000,0.008368,0.004167,0.000365,4.659192e-07,-0.006587,-0.999200,-2.136850,0.000000,0.470250,0.003709,0.265810,0.637923,-0.001383,0.993015,0.497199,0.993015,0.0,0.000000,-0.001383,-0.001383,-0.001383,-0.001383,0.993015,0.993015,0.993015,-0.457037,0.252645,-0.651360,-0.121281,0.000000,-0.043013,0.023034,-0.141034,0,1,0.994398,8,7,7,country,USA_cars_datasets.csv,Name,world_country.csv
3,World_countries_env_vars.csv,Country,world_city.csv,District,unrelated,0.053498,0.177892,0.006250,0.002314,6.364830e-05,-0.031214,0.665114,-0.391069,-0.000981,-0.000747,0.000000,-0.007470,-0.002878,0.003870,-0.013291,-0.001374,-0.013291,0.0,0.125000,0.003870,0.003870,0.003870,0.003870,0.003625,0.003380,0.002889,-0.457037,0.252645,-0.095685,0.020214,0.000000,0.019129,-0.052841,0.053482,0,0,-0.000490,14,8,7,Country,World_countries_env_vars.csv,District,world_city.csv
4,books_updated.csv,languageCode,countries_metadatacountries.csv,CountryCode,syntactic,0.360000,0.101215,0.034091,0.000878,1.381148e-05,-0.006171,-0.997500,-1.825795,0.108400,0.133952,0.000000,0.676529,0.660607,-0.003949,0.630051,0.128668,0.630051,0.0,0.125000,-0.003949,-0.003949,-0.003749,-0.003449,-0.003349,-0.001949,0.002351,0.130582,-0.252645,0.091715,0.000000,0.000000,0.000000,0.093049,0.000000,0,0,0.002000,2,2,8,languageCode,books_updated.csv,CountryCode,countries_metadatacountries.csv
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22607,pte_sulfo.csv,Set,AdventureWorks2014_shift.csv,Name,unrelated,0.000000,0.120000,0.000000,0.000000,0.000000e+00,0.000611,0.000000,0.830221,0.000000,0.000000,0.000000,0.000000,0.000000,-0.293333,-0.293333,0.000000,-0.293333,0.0,0.000000,-0.293333,-0.293333,-0.293333,-0.293333,-0.293333,-0.293333,-0.293333,2.219892,8.084643,5.951487,0.000000,0.000000,0.000000,0.000236,0.000000,0,0,0.000000,39,36,4,Set,pte_sulfo.csv,Name,AdventureWorks2014_shift.csv
22608,dataSpotifyClass.csv,song_title,netflix_titles.csv,description,unrelated,0.000000,0.313414,0.000000,0.000000,0.000000e+00,-0.118797,-0.030943,-0.458812,0.000000,0.000012,0.000000,0.000000,0.000074,0.000335,0.001006,0.000085,0.001006,0.0,0.000000,0.000335,0.000335,0.000335,0.000335,0.000335,0.000335,0.000335,-0.587618,0.252645,-0.035152,-0.525552,-14.027789,-1.844182,-1.519398,-0.086287,0,0,0.000000,104,135,9,song_title,dataSpotifyClass.csv,description,netflix_titles.csv
22609,pte_methoxy.csv,Arg0,countries_data.csv,1997,unrelated,0.000000,0.004015,0.000000,0.000000,0.000000e+00,-0.241318,0.148744,-0.609571,0.000000,-0.000221,0.000000,-1.022797,-0.052781,0.016617,-0.378163,0.010038,-0.378163,0.0,0.000000,0.016617,0.016617,0.016617,0.033283,0.033283,0.033283,0.033283,-1.240528,0.252645,-0.264055,0.000000,0.000000,0.000000,-0.214649,0.000000,0,0,0.016667,22,2,4,Arg0,pte_methoxy.csv,1997,countries_data.csv
22610,student-mat.csv,internet,AdventureWorks2014_stateprovince.csv,Name,unrelated,0.000000,0.011050,0.000000,0.000000,0.000000e+00,-0.004975,-0.994937,-1.858861,0.000000,0.074012,0.040177,0.035000,0.067515,0.161564,0.827387,0.332911,0.827387,0.0,0.000000,0.161564,0.161564,0.161564,0.161564,0.827387,0.827387,0.827387,-1.044655,0.000000,-0.642904,-0.040427,0.000000,-0.040355,0.001428,-0.100492,0,1,0.665823,3,5,7,internet,student-mat.csv,Name,AdventureWorks2014_stateprovince.csv


In [520]:
print(joined.columns)

joined.drop(['ds_name', 'ds_name_2', 'att_name', 'att_name_2', 
             'relationship', 'containment', 'cardinality_proportion', 'jaccard', 'multiset_jaccard', 
             'dataset_name_1', 'attribute_name_1', 'dataset_name_2', 'attribute_name_2'],  
             axis='columns', inplace=True)

Index(['ds_name', 'att_name', 'ds_name_2', 'att_name_2', 'relationship',
       'containment', 'cardinality_proportion', 'jaccard', 'multiset_jaccard',
       'quality', 'cardinality', 'uniqueness', 'entropy', 'incompleteness',
       'frequency_avg', 'frequency_min', 'frequency_max', 'frequency_sd',
       'val_pct_min', 'val_pct_max', 'val_pct_std', 'constancy',
       'freq_word_containment', 'freq_word_soundex_containment',
       'frequency_1qo', 'frequency_2qo', 'frequency_3qo', 'frequency_4qo',
       'frequency_5qo', 'frequency_6qo', 'frequency_7qo', 'len_max_word',
       'len_min_word', 'len_avg_word', 'words_cnt_max', 'words_cnt_min',
       'words_cnt_avg', 'number_words', 'words_cnt_sd', 'is_empty',
       'is_binary', 'frequency_iqr', 'first_word', 'last_word', 'name_dist',
       'attribute_name_1', 'dataset_name_1', 'attribute_name_2',
       'dataset_name_2'],
      dtype='object')


# Model selection

Our goal is to define the best regressor model that can approximate the true value of the joinability metric defined ($MJ$ & $K$) by using profiles.

We define four base models to do so, whose metrics vary. The first point of variation is the inclusion of "datatypes" metrics (i.e. semantic types/characteristics of each column: names, URIs, etc. / alphabetical, numerical etc. ). These metrics are the most time consuming to compute, which implies that the already lightweight profile-based approach can be made much faster. The second point of variation is the execution (or lack thereof) of feature selection tasks, which further reduce the number of features while, ideally, keeping, or improving, the evaluation scores.

**Result**: for all models, the best performing regressor has been the Random Forest. Nonetheless, further testing with the benchmarks has shown that the Gradient Booster predictor (with no fine-tuning) works best. This might be due to overfitting produced by the Random Forest.


### Model evaluation methodology

We want to define a regression model. To do so, we will employ 17 base regressors (listed below) evaluated over a 10-split CV (test size = 30%) and 4 different metrics, of which we will primarily focus on the RMSE.

In [521]:
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.model_selection import cross_validate
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

split = ShuffleSplit(n_splits=10, test_size=0.3, random_state=211199)

def scoring(estimator, X, y):
    y_pred = estimator.predict(X)
    return {"R2": r2_score(y, y_pred),
            "MAE": mean_absolute_error(y, y_pred),
            "RMSE": math.sqrt(mean_squared_error(y, y_pred)),
            "MedAE": median_absolute_error(y, y_pred),}

names = [
    "Linear Regression",
    "Ridge Regression",
    "Lasso Regression",
    "ElasticNet Regression",
    "Random Forests",
    "Gradient Boosting",
    "AdaBoost",
    "Extra Trees",
    "Histogram Gradient Boosting",
    "XGBoosting",
    "Light GBM",
    "CatBoost",
    "MLP Relu",
    "MLP logistic",
    "MLP Tanh",
    "SVR Poly",
    "SVR Rbf"
]

regressors = [
    LinearRegression(),
    Ridge(random_state=211199),
    Lasso(random_state=211199),
    ElasticNet(random_state=211199),
    RandomForestRegressor(random_state=211199, n_estimators=50),
    GradientBoostingRegressor(random_state=211199, n_estimators=50),
    AdaBoostRegressor(random_state=211199, n_estimators=50),
    ExtraTreesRegressor(random_state=211199, n_estimators=50),
    HistGradientBoostingRegressor(random_state=211199),
    XGBRegressor(random_state=211199, n_estimators=50),
    LGBMRegressor(random_state=211199, n_estimators=50, verbose=0),
    CatBoostRegressor(random_state=211199, verbose=0),
    MLPRegressor(random_state=211199, activation = 'relu'),
    MLPRegressor(random_state=211199, activation = 'logistic'),
    MLPRegressor(random_state=211199, activation = 'tanh'),
    SVR(kernel="poly"),
    SVR(kernel="rbf"),
]

def test_list_of_regressors(predictors, target, names, regressors):
  for name, regressor in zip(names, regressors):
    print(name)
    test_regressor(regressor, predictors, target)

def test_regressor(regressor, predictors, target):
  scores = cross_validate(regressor, predictors, target, cv=split, scoring=scoring, verbose=0)
  print(f"Fit time: {scores['fit_time'].mean():.6f} | "
      f"Score time: {scores['score_time'].mean():.6f}")

  print(f"R2: {scores['test_R2'].mean():.6f} | "
      f"MAE: {scores['test_MAE'].mean():.6f} | "
      f"RMSE: {scores['test_RMSE'].mean():.6f} | "
      f"MedAE: {scores['test_MedAE'].mean():.6f}")
  print("------------------------------")

After testing, the regressors below are the ones that perform best in average. Hence, we define some functions to directly store the models associated with these base regressors.

In [522]:
best_names = [
        "gradient_boosting",
        "extra_trees",
        "xgboosting",
        "catboost"
    ]

best_regressors = [
    GradientBoostingRegressor(random_state=211199, n_estimators=50),
    ExtraTreesRegressor(random_state=211199, n_estimators=50),
    XGBRegressor(random_state=211199, n_estimators=50),
    CatBoostRegressor(random_state=211199, verbose=0),
]

def store_models(target, predictors, model_typology):
    for name, regressor in tqdm(zip(best_names, best_regressors), total=len(best_names)):
        regressor.fit(predictors, target)
        folder_path = Path(f"{models_path}/{model_typology}")
        folder_path.mkdir(parents=True, exist_ok=True)
        joblib.dump(regressor, folder_path / f"{name}_{model_typology}.pkl")

### Model 1: All metrics

Model that includes all metrics (likely to overfit and contain redundancy).

In [138]:
y_MJ = joined['quality']
predictors = joined.drop(columns=['quality'], axis=1)

test_list_of_regressors(predictors, y_MJ, names, regressors)

# Gradient Boosting -> 0.016195
# Extra Trees       -> 0.009650
# XGBoosting        -> 0.008856
# CatBoost          -> 0.008206

Linear Regression
Fit time: 0.065528 | Score time: 0.003650
R2: 0.317465 | MAE: 0.007168 | RMSE: 0.028759 | MedAE: 0.001372
------------------------------
Ridge Regression
Fit time: 0.013035 | Score time: 0.003015
R2: 0.317335 | MAE: 0.007165 | RMSE: 0.028762 | MedAE: 0.001371
------------------------------
Lasso Regression
Fit time: 0.020819 | Score time: 0.003402
R2: -0.000131 | MAE: 0.008125 | RMSE: 0.034828 | MedAE: 0.004196
------------------------------
ElasticNet Regression
Fit time: 0.021970 | Score time: 0.003218
R2: -0.000131 | MAE: 0.008126 | RMSE: 0.034828 | MedAE: 0.004196
------------------------------
Random Forests
Fit time: 32.212540 | Score time: 0.058382
R2: 0.836534 | MAE: 0.002037 | RMSE: 0.013992 | MedAE: 0.000003
------------------------------
Gradient Boosting
Fit time: 6.259399 | Score time: 0.007513
R2: 0.724397 | MAE: 0.003287 | RMSE: 0.018245 | MedAE: 0.000089
------------------------------
AdaBoost
Fit time: 0.835050 | Score time: 0.010550
R2: 0.575613 | MA

In [466]:
target = joined['quality']
predictors = joined.drop(columns=['quality'], axis=1)

store_models(target, predictors, "all")

100%|██████████| 4/4 [00:21<00:00,  5.37s/it]


### Model 2: All metrics + Feature Selection

Model 1 + feature selection process to reduce overfitting and redundancy.

In [467]:
for name, regressor in zip(best_names, best_regressors):
  model_all = joblib.load(f"{models_path}/all/{name}_all.pkl")

  feature_importances = model_all.feature_importances_
  predictors = joined.drop(columns=['quality'], axis=1)

  # Match feature importances with corresponding feature names
  feature_names = list(predictors.columns)
  feature_importance_dict = dict(zip(feature_names, feature_importances))

  # Sort the feature importances in descending order
  sorted_feature_importances = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

  selected_metrics_fs_all = []
  for feature, importance in sorted_feature_importances:
    if importance > 0.0001:
      selected_metrics_fs_all.append(feature)

  target = joined['quality']
  predictors = joined[selected_metrics_fs_all]
  print(f"Number of features for {name} -> Original = {len(feature_names)}, new = {len(selected_metrics_fs_all)}")
  # test_list_of_regressors(predictors, target, [name], [regressor])

  model = regressor.fit(predictors, target)
  folder_path = Path(f"{models_path}/all_fs")
  folder_path.mkdir(parents=True, exist_ok=True)
  joblib.dump(regressor, folder_path / f"{name}_all_fs.pkl")


# Gradient Boosting -> from 0.016195 to 0.016114, 81 to 39 features
# Extra Trees       -> from 0.009650 to 0.009142, 81 to 54 features
# XGBoosting        -> from 0.008856 to 0.008714, 81 to 50 features
# CatBoost          -> from 0.008206 to 0.007993, 81 to 58 features

Number of features for gradient_boosting -> Original = 35, new = 23
Number of features for extra_trees -> Original = 35, new = 33
Number of features for xgboosting -> Original = 35, new = 31
Number of features for catboost -> Original = 35, new = 32


### Model 5: In-depth feature selection

Multi-layer feature selection. We will only do so for the two best performing models: gradient boosting and extra trees

In [468]:
from sklearn.feature_selection import VarianceThreshold, mutual_info_regression, RFECV
from sklearn.model_selection import KFold

def feature_selection_pipeline(dataset, model_typology_name):
    y = dataset["quality"]
    X = dataset.drop(columns=["quality"], axis=1)
    original_features = X.columns.tolist()

    print(f"Original feature count: {len(original_features)}")

    # 1.1 Variance Threshold
    var_thresh = VarianceThreshold(threshold = 0.01)
    var_thresh.fit(X)
    low_variance_removed = X.columns[~var_thresh.get_support()].tolist()
    X = X[X.columns[var_thresh.get_support()]]
    print(f"Removed low-variance features: {low_variance_removed}")
    print(f"Remaining features: {len(X.columns)}")

    # 1.2 Correlation Filter
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    high_corr_removed = [column for column in upper.columns if any(upper[column] > 0.9)]
    X = X.drop(columns=high_corr_removed)
    print(f"Removed highly correlated features: {high_corr_removed}")
    print(f"Remaining features: {len(X.columns)}")

    # 1.3 Mutual Information
    mi_scores = mutual_info_regression(X, y)
    mi_scores = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)
    low_mi_removed = mi_scores[mi_scores < 0.01].index.tolist()
    X_post_filter_methods = X[mi_scores[mi_scores >= 0.01].index]
    print(f"Removed low mutual information features: {low_mi_removed}")
    print(f"Remaining features: {len(X_post_filter_methods.columns)}")

    # Utility function for importance + RFECV steps
    def run_model_selection(model, model_name):
        nonlocal X_post_filter_methods, y

        # Feature Importance Filter
        model.fit(X_post_filter_methods, y)
        feature_importances = model.feature_importances_
        feature_names = list(X_post_filter_methods.columns)
        feature_importance_dict = dict(zip(feature_names, feature_importances))
        sorted_importances = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
        selected_metrics = [f for f, imp in sorted_importances if imp > 0.0001]

        X_temp = X_post_filter_methods[selected_metrics]
        print(f"{model_name} - Selected {len(X_temp.columns)} features after importance filtering")

        # RFECV
        rfecv = RFECV(estimator=model, step=1, cv=KFold(5), scoring='r2',verbose=3)
        rfecv.fit(X_temp, y)
        selected = X_temp.columns[rfecv.support_]
        print(f"{model_name} - Optimal number of features: {rfecv.n_features_}")
        print(f"{model_name} - Selected features: {list(selected)}")

        # Train final model
        model.fit(X_temp[selected], y)
        folder_path = Path(models_path) / model_typology_name
        folder_path.mkdir(parents=True, exist_ok=True)
        model_file = folder_path / f"{model_name}_{model_typology_name}.pkl"
        joblib.dump(model, model_file)
        print(f"Saved model to: {model_file}")

        return list(selected)

    run_model_selection(GradientBoostingRegressor(random_state=21111999, n_estimators=50), "gradient_boosting")
    run_model_selection(ExtraTreesRegressor(random_state=21111999, n_estimators=50), "extra_trees")
    run_model_selection(XGBRegressor(random_state=211199, n_estimators=50), "xgboosting")
    run_model_selection(CatBoostRegressor(random_state=211199, verbose=0), "catboost")


In [469]:
# no_syntactic_features = merged_no_dummies[['quality', 'name_dist', 'frequency_max', 'uniqueness', 'first_word', 'frequency_4qo', 'freq_word_containment', 'len_avg_word', 'words_cnt_max',
#                                 'frequency_6qo', 'len_max_word', 'frequency_min', 'frequency_3qo', 'is_empty', 'frequency_iqr', 'entropy', 'val_pct_std',
#                                 'words_cnt_min', 'cardinality', 'words_cnt_sd', 'val_pct_max', 'len_min_word', 'words_cnt_avg']]

feature_selection_pipeline(joined, "all_fs_deep")
# feature_selection_pipeline(no_syntactic_features, "no_syntactic_fs_deep")

Original feature count: 35
Removed low-variance features: ['is_empty']
Remaining features: 34
Removed highly correlated features: ['constancy', 'frequency_1qo', 'frequency_2qo', 'frequency_3qo', 'frequency_4qo', 'frequency_6qo', 'frequency_7qo', 'words_cnt_sd']
Remaining features: 26
Removed low mutual information features: []
Remaining features: 26
gradient_boosting - Selected 17 features after importance filtering
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Fitting estimator with 2 features.
Fitting esti

### Custom Model

In [470]:
selected_metrics_fs_no_syntactic = ['name_dist', 'frequency_max', 'uniqueness', 'first_word', 'frequency_4qo', 'freq_word_containment', 'len_avg_word', 'words_cnt_max',
                                      'frequency_6qo', 'len_max_word', 'frequency_min', 'frequency_3qo', 'is_empty', 'frequency_iqr', 'entropy', 'val_pct_std',
                                      'words_cnt_min', 'cardinality', 'words_cnt_sd', 'val_pct_max', 'len_min_word', 'words_cnt_avg']

for name, regressor in zip(best_names, best_regressors):
  target = joined['quality']
  predictors = joined[selected_metrics_fs_no_syntactic]


  model = regressor.fit(predictors, target)
  folder_path = Path(f"{models_path}/custom")
  folder_path.mkdir(parents=True, exist_ok=True)
  joblib.dump(regressor, folder_path / f"{name}_custom.pkl")


# Gradient Boosting -> models 1-2 -> from 0.016195 to 0.016114, 81 to 39 features
#                   -> models 3-4 -> from 0.016299 to 0.016298, 36 to 30 features
# Extra Trees       -> models 1-2 -> from 0.009650 to 0.009142, 81 to 54 features
#                   -> models 3-4 -> from 0.008569 to 0.008568, 36 to 33 features
# XGBoosting        -> models 1-2 -> from 0.008856 to 0.008714, 81 to 50 features
#                   -> models 3-4 -> from 0.008252 to 0.008197, 36 to 32 features
# CatBoost          -> models 1-2 -> from 0.008206 to 0.007993, 81 to 58 features
#                   -> models 3-4 -> from 0.008011 to 0.008055, 36 to 33 features

In [523]:
from itertools import product
from pathlib import Path
import joblib
from sklearn.ensemble import GradientBoostingRegressor

# -----------------------------
# Paths and data
# -----------------------------
models_path = "C:/Projects/freyja_plus_more/models_prenormalized_base/"
folder_path = Path(models_path) / "custom_more"
folder_path.mkdir(parents=True, exist_ok=True)

selected_metrics_fs_no_syntactic = [
    'name_dist', 'frequency_max', 'uniqueness', 'first_word',
    'frequency_4qo', 'freq_word_containment', 'len_avg_word',
    'words_cnt_max', 'frequency_6qo', 'len_max_word',
    'frequency_min', 'frequency_3qo', 'is_empty',
    'frequency_iqr', 'entropy', 'val_pct_std',
    'words_cnt_min', 'cardinality', 'words_cnt_sd',
    'val_pct_max', 'len_min_word', 'words_cnt_avg'
]

predictors = joined[selected_metrics_fs_no_syntactic]
target = joined['quality']

# -----------------------------
# Hyperparameter grid
# -----------------------------
param_grid = {
    "n_estimators": [25, 50, 100],
    "learning_rate": [0.05, 0.1],
    "max_depth": [3, 5],
    "subsample": [0.8, 1.0],
    "min_samples_leaf": [1, 10]
}

# -----------------------------
# Training loop
# -----------------------------
for values in product(*param_grid.values()):
    params = dict(zip(param_grid.keys(), values))

    regressor = GradientBoostingRegressor(
        random_state=211199,
        **params
    )

    regressor.fit(predictors, target)

    # Descriptive model filename
    model_name = (
        f"gradient_boosting_prenormalized_base_"
        f"ne{params['n_estimators']}_"
        f"lr{params['learning_rate']}_"
        f"md{params['max_depth']}_"
        f"ss{params['subsample']}_"
        f"msl{params['min_samples_leaf']}.pkl"
    )

    joblib.dump(regressor, folder_path / model_name)

# -----------------------------
# Optional: save the hyperparameter grid for reference
# -----------------------------
import json
with open(folder_path / "model_configs.json", "w") as f:
    json.dump(param_grid, f, indent=2)


# Benchmark evaluation

Once we have define all the models, we will evaluate each of the seven selected benchmarks with all of them, with the goal of discerning which is the best one. To do so, all the distances for all query columns of each benchmark have been obtained and stored.



## Preparation
Load all the model and define the functions to prepare the data for the models. This data preparation depends on the features defined for each model. We also present the function used to obtain the metrics from the benchmark.

In [537]:
import os

models = {}
typologies = ["custom_more"]

models_path = 'C:/Projects/freyja_plus_more/models_prenormalized_base/custom_more'

for filename in os.listdir(models_path):
    if filename.endswith("gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss1.0_msl10.pkl"):  # Only load joblib files
        model_name = os.path.splitext(filename)[0]
        file_path = os.path.join(models_path, filename)
        models[model_name] = joblib.load(file_path)

print(models.keys())
print(len(models.keys()))

dict_keys(['gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss1.0_msl10'])
1


In [471]:
best_names = [
        "gradient_boosting",
        "extra_trees",
        # "xgboosting",
        # "catboost"
    ]

models = {}
# typologies = ["all", "all_fs", "no_syntactic", "no_syntactic_fs", "all_fs_deep", "no_syntactic_fs_deep"]
typologies = ["all", "all_fs", "all_fs_deep", "custom"]
# typologies = ["all_fs_deep", "custom"]
# typologies = ["custom_more"]

models_path = 'C:/Projects/freyja_plus_more/models_postnormalized_all/'

for name in best_names:
    for type in typologies:
        try:
            models[f"{name}_{type}"] = joblib.load(f'{models_path}/{type}/{name}_{type}.pkl')
        except FileNotFoundError:
            pass

print(models.keys())
print(len(models.keys()))

dict_keys(['gradient_boosting_all', 'gradient_boosting_all_fs', 'gradient_boosting_all_fs_deep', 'gradient_boosting_custom', 'extra_trees_all', 'extra_trees_all_fs', 'extra_trees_all_fs_deep', 'extra_trees_custom'])
8


In [538]:
def prepare_data_for_model(distances, model, type_of_model):
  distances = distances.drop(columns=['dataset_name', 'dataset_name_2', 'attribute_name', 'attribute_name_2'], axis=1)

  maybe_missing_columns = ["datatype__pct_alphabetic", "datatype__pct_date_time", "datatype__pct_non_alphanumeric", "datatype__pct_numeric", "datatype__pct_unknown", "datatype__pct_alphanumeric",
                            "specific_type__pct_date", "specific_type__pct_email", "specific_type__pct_phrases", 'specific_type_2__pct_username', 'datatype_2__pct_alphanumeric', 'datatype_2__pct_alphabetic',
                            'specific_type_2__pct_phrases', "specific_type__pct_general", "specific_type__pct_others", "specific_type__pct_time", "specific_type__pct_url", "specific_type__pct_username",
                            'datatype_2__pct_date_time', 'specific_type_2__pct_date', 'specific_type_2__pct_email', 'specific_type_2__pct_general', 'specific_type_2__pct_url', 'datatype_2__pct_non_alphanumeric',
                            'datatype_2__pct_numeric', 'datatype_2__pct_unknown', 'specific_type_2__pct_others', 'specific_type_2__pct_time']

  # if "no_syntactic_fs" in type_of_model or "custom" in type_of_model:
  #   pass
  # elif ("all" in type_of_model):
  #   distances = pd.concat([distances.drop('datatype', axis=1), pd.get_dummies(distances['datatype'], prefix='datatype_', dtype=int)], axis=1)
  #   distances = pd.concat([distances.drop('datatype_2', axis=1), pd.get_dummies(distances['datatype_2'], prefix='datatype_2_', dtype=int)], axis=1)
  #   distances = pd.concat([distances.drop('specific_type', axis=1), pd.get_dummies(distances['specific_type'], prefix='specific_type_', dtype=int)], axis=1)
  #   distances = pd.concat([distances.drop('specific_type_2', axis=1), pd.get_dummies(distances['specific_type_2'], prefix='specific_type_2_', dtype=int)], axis=1)

  #   for column in maybe_missing_columns:
  #     if column not in distances.columns:
  #       distances[column] = 0
  # else:
  #   distances = distances.drop(columns=['datatype', "datatype_2", "specific_type", "specific_type_2"], axis=1, errors='ignore')
  #   distances = distances.drop(columns=["pct_numeric", "pct_alphanumeric", "pct_alphabetic", "pct_non_alphanumeric", "pct_date_time", "pct_unknown", "pct_phones", "pct_email", "pct_url", "pct_ip",
  #                                       "pct_username", "pct_phrases", "pct_general", "pct_date", "pct_time", "pct_date_time_specific", "pct_others"], axis=1, errors='ignore')

  if 'is_empty_2' not in distances.columns:
    distances['is_empty_2'] = 0

  # Arrange the columns as in the model
  if "catboost" in type_of_model:
    distances = distances[model.feature_names_] 
  else:
    distances = distances[model.feature_names_in_] 
  return distances

In [539]:
def compute_and_evaluate_ranking(model, model_type, k, step, ground_truth_path, distances_folder_path):
  # Read the ground truth and obtain, for every target column, the amount of candidate columns that it has a join with. This will allow us to calculate the recall,
  # as it indicates the maximum possible joins, regardless of the value of k
  ground_truth = pd.read_csv(ground_truth_path, header = 0)
  pair_counts = ground_truth.groupby(['target_ds', 'target_attr']).size().reset_index(name='joins_count')

  # Initialize the matrix of metrics
  num_observations = int(k / step)
  precision = [0] * num_observations
  recall = [0] * num_observations
  max_recall = [0] * num_observations
  MAP = [0] * num_observations

  # Initialize execution time
  total_time = 0

  for _, row in tqdm(pair_counts.iterrows(), total=len(pair_counts)):
      dataset = row['target_ds']
      attribute = row['target_attr']
      count = row['joins_count']

      st = time.time()

      # Read the distances and do some preprocessing
      distances = pd.read_csv(distances_folder_path + 'distances_' + dataset.replace(".csv", "_profile_") + attribute.replace("/", "_").replace(": ","_") + ".csv", header = 0, encoding='latin1', on_bad_lines="skip")

      dataset_names = distances["dataset_name_2"] # We store dataset and attribute names to be used to evaluate the ranking
      attribute_names = distances["attribute_name_2"]
      distances = prepare_data_for_model(distances, model, model_type)

      # # Use the model to predict
      # y_pred = model.predict(distances)
      # distances["predictions"] = y_pred

      # Use the model to predict (preventing some weird lines that might have slipped in)
      distances_numeric = distances.apply(pd.to_numeric, errors='coerce') # Convert everything to float, invalid parsing becomes NaN
      valid_rows = distances_numeric.dropna(axis=0, how='any') # Keep track of valid rows
      y_pred = model.predict(valid_rows) # Predict only on valid rows
      distances.loc[valid_rows.index, "predictions"] = y_pred # Assign predictions back only to the valid rows

      distances["target_ds"] = dataset_names
      distances["target_attr"] = attribute_names

      total_time += (time.time() - st) # In the time assessment we do not consider the evaluation of the ranking

      # Precompute a lookup set of valid (candidate_ds, candidate_attr) for this query
      valid_pairs = set(
          ground_truth.loc[
              (ground_truth['target_ds'] == dataset) &
              (ground_truth['target_attr'] == attribute),
              ['candidate_ds', 'candidate_attr']
          ].itertuples(index=False, name=None)
      )

      # For every k that we want to assess the ranking of, we get the top k joins and check how many appear in the grpund truth
      for k_iter in range(1, num_observations + 1):
        count_sem = 0
        ap = 0
        count_positions = 0

        top_k_joins = distances.sort_values(by='predictions', ascending=False).head(k_iter * step)

        for position in top_k_joins.itertuples(index=False):
            pair = (position.target_ds, position.target_attr)
            if pair in valid_pairs: 
                count_sem += 1
                ap += count_sem / (count_positions + 1)
            count_positions += 1


        precision[k_iter - 1] += count_sem / (k_iter * step)
        if count_sem != 0:
            MAP[k_iter - 1] += ap / count_sem
        recall[k_iter - 1] += count_sem / count
        max_recall[k_iter - 1] += (k_iter * step) / count

  print("AVERAGE time to load the distances and execute the model:")
  print("----%.2f----" % (total_time / len(pair_counts)))

  print("Precisions:", [round(element / len(pair_counts), 4) for element in precision])
#   print("Recall:", [round(element / len(pair_counts), 4) for element in recall])
#   print("Max recall:", [round(element / len(pair_counts), 4) for element in max_recall])
#   print("Recall percentage:", [round((recall_iter / len(pair_counts)) / (max_recall_iter / len(pair_counts)), 4) for recall_iter, max_recall_iter in zip(recall, max_recall)])
#   print("MAP:", [round(element / len(pair_counts), 4) for element in MAP])

  return [round(element / len(pair_counts), 4) for element in precision]

## Benchmarks

In [540]:
def evaluate_models(k, step, ground_truth_path, distances_folder_path, models=models):
    results = {}

    for model_type, model in models.items():
        print(f"Model {model_type}")
        precision_scores = compute_and_evaluate_ranking(
            model, model_type, k, step, ground_truth_path, distances_folder_path
        )
        results[model_type] = precision_scores
        print("------------------------------------------------------")

    # Compute average precisions
    avg_precisions = {
        model_name: sum(precisions) / len(precisions)
        for model_name, precisions in results.items()
    }

    # Sort models from best to worst
    ranked_models = sorted(avg_precisions.items(), key=lambda x: x[1], reverse=True)

    # Print ranked results
    print("\n==================== MODEL RANKINGS ====================")
    for rank, (model_name, avg_precision) in enumerate(ranked_models, start=1):
        print(f"{rank:2d}. Model: {model_name:20s} | Avg Precision: {avg_precision:.4f}")

    print("========================================================")

    # return ranked_models, results


### Santos Small

In [None]:
k = 10
step = 1
ground_truth_path = 'C:/Projects/benchmarks/santos_small/santos_small_ground_truth.csv'
distances_folder_path = 'C:/Projects/freyja_plus_more/distances/santos_small/distances_prenormalized_base/'

evaluate_models(k, step, ground_truth_path, distances_folder_path)

Model gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss1.0_msl10


100%|██████████| 50/50 [00:03<00:00, 14.73it/s]

AVERAGE time to load the distances and execute the model:
----0.05----
Precisions: [1.0, 0.97, 0.96, 0.955, 0.94, 0.9333, 0.9286, 0.925, 0.9222, 0.914]
------------------------------------------------------

 1. Model: gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss1.0_msl10 | Avg Precision: 0.9448





In [543]:
k = 10
step = 1
ground_truth_path = 'C:/Projects/benchmarks/santos_small/santos_small_ground_truth.csv'
distances_folder_path = 'C:/Projects/test/'

evaluate_models(k, step, ground_truth_path, distances_folder_path)

Model gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss1.0_msl10


100%|██████████| 50/50 [00:03<00:00, 13.79it/s]

AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [1.0, 0.97, 0.96, 0.955, 0.94, 0.9333, 0.9286, 0.925, 0.9222, 0.914]
------------------------------------------------------

 1. Model: gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss1.0_msl10 | Avg Precision: 0.9448





In [None]:
k = 10
step = 1
ground_truth_path = 'C:/Projects/benchmarks/santos_small/santos_small_ground_truth.csv'
distances_folder_path = 'C:/Projects/freyja_plus_more/distances/santos_small/distances_prenormalized_base/'

evaluate_models(k, step, ground_truth_path, distances_folder_path)

In [530]:
k = 60
step = 10
ground_truth_path = 'C:/Projects/benchmarks/tus_small/tus_small_ground_truth.csv'
# distances_folder_path = 'C:/Projects/freyja_plus_more/distances/tus_small/'
distances_folder_path = 'C:/Projects/freyja_plus_more/distances/tus_small/distances_prenormalized_base/'

evaluate_models(k, step, ground_truth_path, distances_folder_path)

#Original ->  2. Model: gradient_boosting_no_syntactic_fs_deep | Avg Precision: 0.8939
#Supernormalize ->  1. Model: gradient_boosting_all_fs_deep | Avg Precision: 0.8974

# Prenormalized_all:  1. Model: gradient_boosting_custom | Avg Precision: 0.9075
                    # 2. Model: gradient_boosting_all_fs_deep | Avg Precision: 0.8966

# Prenormalized_base:  1. Model: gradient_boosting_custom | Avg Precision: 0.9085
                    #  2. Model: gradient_boosting_all_fs_deep | Avg Precision: 0.8975

# Base                   . Model: gradient_boosting_custom | Avg Precision: 0.8942
                    #   2. Model: gradient_boosting_all_fs_deep | Avg Precision: 0.8841

# postnormalized_base:  1. Model: gradient_boosting_all_fs_deep | Avg Precision: 0.8837
                    #   2. Model: gradient_boosting_custom | Avg Precision: 0.8780

# postnormalized_all:

Model gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss0.8_msl1


100%|██████████| 100/100 [00:16<00:00,  5.88it/s]


AVERAGE time to load the distances and execute the model:
----0.15----
Precisions: [0.975, 0.901, 0.835, 0.865, 0.887, 0.901]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss0.8_msl10


100%|██████████| 100/100 [00:13<00:00,  7.19it/s]


AVERAGE time to load the distances and execute the model:
----0.12----
Precisions: [0.97, 0.903, 0.8353, 0.8662, 0.8874, 0.9022]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss1.0_msl1


100%|██████████| 100/100 [00:14<00:00,  6.92it/s]


AVERAGE time to load the distances and execute the model:
----0.12----
Precisions: [0.969, 0.9155, 0.8597, 0.8712, 0.8864, 0.9008]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss1.0_msl10


100%|██████████| 100/100 [00:14<00:00,  7.09it/s]


AVERAGE time to load the distances and execute the model:
----0.12----
Precisions: [0.972, 0.915, 0.8603, 0.8842, 0.903, 0.9143]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md5_ss0.8_msl1


100%|██████████| 100/100 [00:14<00:00,  6.79it/s]


AVERAGE time to load the distances and execute the model:
----0.12----
Precisions: [0.967, 0.9275, 0.888, 0.9055, 0.9084, 0.8907]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md5_ss0.8_msl10


100%|██████████| 100/100 [00:14<00:00,  7.12it/s]


AVERAGE time to load the distances and execute the model:
----0.12----
Precisions: [0.969, 0.911, 0.853, 0.874, 0.8896, 0.8727]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md5_ss1.0_msl1


100%|██████████| 100/100 [00:14<00:00,  6.92it/s]


AVERAGE time to load the distances and execute the model:
----0.12----
Precisions: [0.966, 0.888, 0.8327, 0.8432, 0.8608, 0.865]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md5_ss1.0_msl10


100%|██████████| 100/100 [00:15<00:00,  6.50it/s]


AVERAGE time to load the distances and execute the model:
----0.13----
Precisions: [0.972, 0.8625, 0.738, 0.6863, 0.6628, 0.6375]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md3_ss0.8_msl1


100%|██████████| 100/100 [00:14<00:00,  7.06it/s]


AVERAGE time to load the distances and execute the model:
----0.12----
Precisions: [0.97, 0.9225, 0.8657, 0.8747, 0.866, 0.8678]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md3_ss0.8_msl10


100%|██████████| 100/100 [00:14<00:00,  7.03it/s]


AVERAGE time to load the distances and execute the model:
----0.12----
Precisions: [0.969, 0.9145, 0.848, 0.8365, 0.8234, 0.8038]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md3_ss1.0_msl1


100%|██████████| 100/100 [00:14<00:00,  6.97it/s]


AVERAGE time to load the distances and execute the model:
----0.12----
Precisions: [0.968, 0.9185, 0.8677, 0.861, 0.8574, 0.8253]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md3_ss1.0_msl10


100%|██████████| 100/100 [00:14<00:00,  6.90it/s]


AVERAGE time to load the distances and execute the model:
----0.12----
Precisions: [0.969, 0.9065, 0.8347, 0.835, 0.819, 0.7895]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md5_ss0.8_msl1


100%|██████████| 100/100 [00:15<00:00,  6.45it/s]


AVERAGE time to load the distances and execute the model:
----0.13----
Precisions: [0.971, 0.87, 0.7793, 0.7895, 0.7596, 0.7298]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md5_ss0.8_msl10


100%|██████████| 100/100 [00:14<00:00,  6.94it/s]


AVERAGE time to load the distances and execute the model:
----0.12----
Precisions: [0.974, 0.9035, 0.839, 0.843, 0.8302, 0.7977]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md5_ss1.0_msl1


100%|██████████| 100/100 [00:12<00:00,  7.85it/s]


AVERAGE time to load the distances and execute the model:
----0.11----
Precisions: [0.971, 0.855, 0.7177, 0.6493, 0.5832, 0.5265]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md5_ss1.0_msl10


100%|██████████| 100/100 [00:13<00:00,  7.66it/s]


AVERAGE time to load the distances and execute the model:
----0.11----
Precisions: [0.973, 0.8685, 0.762, 0.7085, 0.6718, 0.623]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md3_ss0.8_msl1


100%|██████████| 100/100 [00:12<00:00,  7.92it/s]


AVERAGE time to load the distances and execute the model:
----0.11----
Precisions: [0.974, 0.902, 0.8357, 0.8517, 0.8702, 0.8813]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md3_ss0.8_msl10


100%|██████████| 100/100 [00:13<00:00,  7.30it/s]


AVERAGE time to load the distances and execute the model:
----0.12----
Precisions: [0.975, 0.902, 0.836, 0.8517, 0.8692, 0.879]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md3_ss1.0_msl1


100%|██████████| 100/100 [00:13<00:00,  7.68it/s]


AVERAGE time to load the distances and execute the model:
----0.11----
Precisions: [0.975, 0.902, 0.835, 0.8517, 0.869, 0.8805]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md3_ss1.0_msl10


100%|██████████| 100/100 [00:11<00:00,  8.46it/s]


AVERAGE time to load the distances and execute the model:
----0.10----
Precisions: [0.976, 0.902, 0.8363, 0.8515, 0.8688, 0.8803]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md5_ss0.8_msl1


100%|██████████| 100/100 [00:12<00:00,  7.80it/s]


AVERAGE time to load the distances and execute the model:
----0.11----
Precisions: [0.974, 0.905, 0.8407, 0.8685, 0.8866, 0.8907]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md5_ss0.8_msl10


100%|██████████| 100/100 [00:12<00:00,  7.86it/s]


AVERAGE time to load the distances and execute the model:
----0.11----
Precisions: [0.964, 0.9065, 0.8453, 0.872, 0.8896, 0.9002]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md5_ss1.0_msl1


100%|██████████| 100/100 [00:12<00:00,  7.90it/s]


AVERAGE time to load the distances and execute the model:
----0.11----
Precisions: [0.974, 0.906, 0.8473, 0.8722, 0.8896, 0.901]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md5_ss1.0_msl10


100%|██████████| 100/100 [00:11<00:00,  8.47it/s]


AVERAGE time to load the distances and execute the model:
----0.10----
Precisions: [0.971, 0.9025, 0.8377, 0.8632, 0.8834, 0.8937]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md3_ss0.8_msl1


100%|██████████| 100/100 [00:11<00:00,  8.49it/s]


AVERAGE time to load the distances and execute the model:
----0.10----
Precisions: [0.977, 0.9025, 0.837, 0.8537, 0.875, 0.891]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md3_ss0.8_msl10


100%|██████████| 100/100 [00:11<00:00,  8.62it/s]


AVERAGE time to load the distances and execute the model:
----0.10----
Precisions: [0.969, 0.902, 0.8387, 0.8685, 0.889, 0.9033]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md3_ss1.0_msl1


100%|██████████| 100/100 [00:11<00:00,  8.67it/s]


AVERAGE time to load the distances and execute the model:
----0.10----
Precisions: [0.976, 0.902, 0.838, 0.8537, 0.8738, 0.8903]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md3_ss1.0_msl10


100%|██████████| 100/100 [00:11<00:00,  8.62it/s]


AVERAGE time to load the distances and execute the model:
----0.10----
Precisions: [0.976, 0.902, 0.8387, 0.8675, 0.8886, 0.9032]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md5_ss0.8_msl1


100%|██████████| 100/100 [00:13<00:00,  7.18it/s]


AVERAGE time to load the distances and execute the model:
----0.12----
Precisions: [0.968, 0.9065, 0.8477, 0.8757, 0.8952, 0.9067]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md5_ss0.8_msl10


100%|██████████| 100/100 [00:14<00:00,  7.08it/s]


AVERAGE time to load the distances and execute the model:
----0.12----
Precisions: [0.968, 0.919, 0.869, 0.8937, 0.9072, 0.9105]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md5_ss1.0_msl1


100%|██████████| 100/100 [00:12<00:00,  7.70it/s]


AVERAGE time to load the distances and execute the model:
----0.11----
Precisions: [0.974, 0.906, 0.848, 0.877, 0.8962, 0.9087]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md5_ss1.0_msl10


100%|██████████| 100/100 [00:14<00:00,  7.04it/s]


AVERAGE time to load the distances and execute the model:
----0.12----
Precisions: [0.97, 0.9035, 0.8387, 0.8672, 0.8884, 0.8943]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md3_ss0.8_msl1


100%|██████████| 100/100 [00:14<00:00,  6.91it/s]


AVERAGE time to load the distances and execute the model:
----0.12----
Precisions: [0.966, 0.902, 0.8363, 0.8537, 0.8748, 0.8912]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md3_ss0.8_msl10


100%|██████████| 100/100 [00:13<00:00,  7.48it/s]


AVERAGE time to load the distances and execute the model:
----0.11----
Precisions: [0.974, 0.9025, 0.8377, 0.8547, 0.8758, 0.8908]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md3_ss1.0_msl1


100%|██████████| 100/100 [00:12<00:00,  7.71it/s]


AVERAGE time to load the distances and execute the model:
----0.11----
Precisions: [0.978, 0.9025, 0.8357, 0.8532, 0.8742, 0.8905]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md3_ss1.0_msl10


100%|██████████| 100/100 [00:11<00:00,  8.52it/s]


AVERAGE time to load the distances and execute the model:
----0.10----
Precisions: [0.976, 0.902, 0.8393, 0.8677, 0.8888, 0.9022]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss0.8_msl1


100%|██████████| 100/100 [00:12<00:00,  8.30it/s]


AVERAGE time to load the distances and execute the model:
----0.10----
Precisions: [0.972, 0.918, 0.8657, 0.8905, 0.907, 0.9167]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss0.8_msl10


100%|██████████| 100/100 [00:12<00:00,  8.04it/s]


AVERAGE time to load the distances and execute the model:
----0.11----
Precisions: [0.971, 0.919, 0.8737, 0.8935, 0.909, 0.9182]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss1.0_msl1


100%|██████████| 100/100 [00:13<00:00,  7.38it/s]


AVERAGE time to load the distances and execute the model:
----0.12----
Precisions: [0.967, 0.914, 0.861, 0.8857, 0.9028, 0.9127]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss1.0_msl10


100%|██████████| 100/100 [00:13<00:00,  7.40it/s]


AVERAGE time to load the distances and execute the model:
----0.12----
Precisions: [0.971, 0.9025, 0.839, 0.8685, 0.8882, 0.8945]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss0.8_msl1


100%|██████████| 100/100 [00:15<00:00,  6.56it/s]


AVERAGE time to load the distances and execute the model:
----0.13----
Precisions: [0.966, 0.9275, 0.8833, 0.9012, 0.9156, 0.9247]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss0.8_msl10


100%|██████████| 100/100 [00:14<00:00,  6.71it/s]


AVERAGE time to load the distances and execute the model:
----0.12----
Precisions: [0.971, 0.9155, 0.8577, 0.884, 0.9016, 0.9138]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss1.0_msl1


100%|██████████| 100/100 [00:13<00:00,  7.33it/s]


AVERAGE time to load the distances and execute the model:
----0.11----
Precisions: [0.975, 0.915, 0.8613, 0.8842, 0.901, 0.9143]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss1.0_msl10


100%|██████████| 100/100 [00:13<00:00,  7.51it/s]


AVERAGE time to load the distances and execute the model:
----0.11----
Precisions: [0.97, 0.9145, 0.8617, 0.8837, 0.9016, 0.9133]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md5_ss0.8_msl1


100%|██████████| 100/100 [00:13<00:00,  7.37it/s]


AVERAGE time to load the distances and execute the model:
----0.11----
Precisions: [0.976, 0.918, 0.872, 0.887, 0.8852, 0.8757]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md5_ss0.8_msl10


100%|██████████| 100/100 [00:12<00:00,  8.23it/s]


AVERAGE time to load the distances and execute the model:
----0.10----
Precisions: [0.968, 0.9035, 0.8547, 0.8755, 0.8756, 0.8568]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md5_ss1.0_msl1


100%|██████████| 100/100 [00:12<00:00,  7.94it/s]


AVERAGE time to load the distances and execute the model:
----0.11----
Precisions: [0.973, 0.9315, 0.895, 0.9117, 0.9222, 0.9112]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md5_ss1.0_msl10


100%|██████████| 100/100 [00:13<00:00,  7.32it/s]

AVERAGE time to load the distances and execute the model:
----0.11----
Precisions: [0.976, 0.893, 0.8347, 0.843, 0.8288, 0.8163]
------------------------------------------------------

 1. Model: gradient_boosting_prenormalized_base_ne50_lr0.1_md5_ss1.0_msl1 | Avg Precision: 0.9241
 2. Model: gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss0.8_msl1 | Avg Precision: 0.9197
 3. Model: gradient_boosting_prenormalized_base_ne100_lr0.05_md5_ss0.8_msl1 | Avg Precision: 0.9145
 4. Model: gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss0.8_msl10 | Avg Precision: 0.9141
 5. Model: gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss0.8_msl1 | Avg Precision: 0.9116
 6. Model: gradient_boosting_prenormalized_base_ne25_lr0.1_md5_ss0.8_msl10 | Avg Precision: 0.9112
 7. Model: gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss1.0_msl1 | Avg Precision: 0.9085
 8. Model: gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss1.0_msl10 | Avg Precision: 0.9081
 9. Model: gradient_bo




In [531]:
k = 60
step = 10
ground_truth_path = 'C:/Projects/benchmarks/tus_big/tus_big_ground_truth_sample.csv'
distances_folder_path = 'C:/Projects/freyja_plus_more/distances/tus_big/distances_prenormalized_base/'

evaluate_models(k, step, ground_truth_path, distances_folder_path)

#Original -> 1. Model: gradient_boosting_no_syntactic_fs_deep | Avg Precision: 0.9718
#Supernormalized ->  1. Model: gradient_boosting_custom | Avg Precision: 0.9411


Model gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss0.8_msl1


100%|██████████| 100/100 [00:50<00:00,  1.99it/s]


AVERAGE time to load the distances and execute the model:
----0.42----
Precisions: [0.96, 0.958, 0.9417, 0.9205, 0.9, 0.8928]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss0.8_msl10


100%|██████████| 100/100 [00:47<00:00,  2.11it/s]


AVERAGE time to load the distances and execute the model:
----0.39----
Precisions: [0.958, 0.9575, 0.946, 0.929, 0.9098, 0.8962]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss1.0_msl1


100%|██████████| 100/100 [00:45<00:00,  2.19it/s]


AVERAGE time to load the distances and execute the model:
----0.38----
Precisions: [0.968, 0.961, 0.9457, 0.93, 0.9094, 0.8998]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss1.0_msl10


100%|██████████| 100/100 [00:43<00:00,  2.29it/s]


AVERAGE time to load the distances and execute the model:
----0.36----
Precisions: [0.971, 0.9675, 0.9597, 0.944, 0.9208, 0.907]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md5_ss0.8_msl1


100%|██████████| 100/100 [00:49<00:00,  2.00it/s]


AVERAGE time to load the distances and execute the model:
----0.41----
Precisions: [0.979, 0.963, 0.9407, 0.9103, 0.8868, 0.8687]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md5_ss0.8_msl10


100%|██████████| 100/100 [00:42<00:00,  2.35it/s]


AVERAGE time to load the distances and execute the model:
----0.35----
Precisions: [0.961, 0.945, 0.9187, 0.894, 0.8734, 0.8582]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md5_ss1.0_msl1


100%|██████████| 100/100 [00:50<00:00,  1.98it/s]


AVERAGE time to load the distances and execute the model:
----0.42----
Precisions: [0.967, 0.943, 0.9093, 0.8908, 0.8644, 0.847]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md5_ss1.0_msl10


100%|██████████| 100/100 [00:41<00:00,  2.39it/s]


AVERAGE time to load the distances and execute the model:
----0.35----
Precisions: [0.963, 0.9225, 0.8713, 0.8378, 0.8034, 0.7708]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md3_ss0.8_msl1


100%|██████████| 100/100 [00:47<00:00,  2.11it/s]


AVERAGE time to load the distances and execute the model:
----0.39----
Precisions: [0.933, 0.9035, 0.8687, 0.8505, 0.8314, 0.8052]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md3_ss0.8_msl10


100%|██████████| 100/100 [01:01<00:00,  1.63it/s]


AVERAGE time to load the distances and execute the model:
----0.51----
Precisions: [0.956, 0.95, 0.9407, 0.9252, 0.9042, 0.8877]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md3_ss1.0_msl1


100%|██████████| 100/100 [01:03<00:00,  1.58it/s]


AVERAGE time to load the distances and execute the model:
----0.53----
Precisions: [0.951, 0.927, 0.885, 0.8473, 0.8142, 0.7915]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md3_ss1.0_msl10


100%|██████████| 100/100 [01:01<00:00,  1.62it/s]


AVERAGE time to load the distances and execute the model:
----0.51----
Precisions: [0.969, 0.9505, 0.933, 0.9068, 0.8754, 0.8512]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md5_ss0.8_msl1


100%|██████████| 100/100 [01:03<00:00,  1.56it/s]


AVERAGE time to load the distances and execute the model:
----0.54----
Precisions: [0.961, 0.93, 0.8937, 0.8705, 0.8416, 0.8142]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md5_ss0.8_msl10


100%|██████████| 100/100 [00:55<00:00,  1.80it/s]


AVERAGE time to load the distances and execute the model:
----0.46----
Precisions: [0.947, 0.9215, 0.8967, 0.875, 0.8492, 0.8282]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md5_ss1.0_msl1


100%|██████████| 100/100 [00:46<00:00,  2.15it/s]


AVERAGE time to load the distances and execute the model:
----0.38----
Precisions: [0.958, 0.9245, 0.8643, 0.8148, 0.7706, 0.7272]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md5_ss1.0_msl10


100%|██████████| 100/100 [00:46<00:00,  2.14it/s]


AVERAGE time to load the distances and execute the model:
----0.38----
Precisions: [0.955, 0.9195, 0.8903, 0.8618, 0.8334, 0.8047]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md3_ss0.8_msl1


100%|██████████| 100/100 [00:40<00:00,  2.46it/s]


AVERAGE time to load the distances and execute the model:
----0.35----
Precisions: [0.964, 0.9535, 0.9403, 0.922, 0.8982, 0.8832]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md3_ss0.8_msl10


100%|██████████| 100/100 [00:39<00:00,  2.50it/s]


AVERAGE time to load the distances and execute the model:
----0.34----
Precisions: [0.961, 0.9525, 0.9403, 0.9225, 0.8986, 0.8833]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md3_ss1.0_msl1


100%|██████████| 100/100 [00:40<00:00,  2.47it/s]


AVERAGE time to load the distances and execute the model:
----0.35----
Precisions: [0.959, 0.9475, 0.9257, 0.9023, 0.8852, 0.87]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md3_ss1.0_msl10


100%|██████████| 100/100 [00:40<00:00,  2.48it/s]


AVERAGE time to load the distances and execute the model:
----0.35----
Precisions: [0.961, 0.9485, 0.9327, 0.9155, 0.896, 0.8828]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md5_ss0.8_msl1


100%|██████████| 100/100 [00:41<00:00,  2.43it/s]


AVERAGE time to load the distances and execute the model:
----0.34----
Precisions: [0.959, 0.951, 0.923, 0.8902, 0.858, 0.8362]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md5_ss0.8_msl10


100%|██████████| 100/100 [00:41<00:00,  2.41it/s]


AVERAGE time to load the distances and execute the model:
----0.35----
Precisions: [0.962, 0.9475, 0.924, 0.894, 0.867, 0.8478]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md5_ss1.0_msl1


100%|██████████| 100/100 [00:40<00:00,  2.47it/s]


AVERAGE time to load the distances and execute the model:
----0.35----
Precisions: [0.957, 0.946, 0.9197, 0.8925, 0.866, 0.8435]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md5_ss1.0_msl10


100%|██████████| 100/100 [00:40<00:00,  2.44it/s]


AVERAGE time to load the distances and execute the model:
----0.35----
Precisions: [0.965, 0.9545, 0.9327, 0.9045, 0.8782, 0.8622]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md3_ss0.8_msl1


100%|██████████| 100/100 [00:40<00:00,  2.46it/s]


AVERAGE time to load the distances and execute the model:
----0.35----
Precisions: [0.965, 0.952, 0.9463, 0.93, 0.9148, 0.9032]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md3_ss0.8_msl10


100%|██████████| 100/100 [00:40<00:00,  2.48it/s]


AVERAGE time to load the distances and execute the model:
----0.35----
Precisions: [0.965, 0.9535, 0.9433, 0.9272, 0.9126, 0.903]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md3_ss1.0_msl1


100%|██████████| 100/100 [00:40<00:00,  2.49it/s]


AVERAGE time to load the distances and execute the model:
----0.34----
Precisions: [0.961, 0.955, 0.9433, 0.9227, 0.9046, 0.8933]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md3_ss1.0_msl10


100%|██████████| 100/100 [00:40<00:00,  2.49it/s]


AVERAGE time to load the distances and execute the model:
----0.34----
Precisions: [0.965, 0.958, 0.949, 0.9295, 0.9092, 0.8943]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md5_ss0.8_msl1


100%|██████████| 100/100 [00:41<00:00,  2.39it/s]


AVERAGE time to load the distances and execute the model:
----0.35----
Precisions: [0.969, 0.9575, 0.9443, 0.9287, 0.9088, 0.8907]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md5_ss0.8_msl10


100%|██████████| 100/100 [00:41<00:00,  2.38it/s]


AVERAGE time to load the distances and execute the model:
----0.35----
Precisions: [0.964, 0.9455, 0.9257, 0.8982, 0.8766, 0.8602]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md5_ss1.0_msl1


100%|██████████| 100/100 [00:40<00:00,  2.44it/s]


AVERAGE time to load the distances and execute the model:
----0.35----
Precisions: [0.969, 0.9525, 0.9317, 0.9112, 0.8884, 0.873]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md5_ss1.0_msl10


100%|██████████| 100/100 [00:41<00:00,  2.38it/s]


AVERAGE time to load the distances and execute the model:
----0.35----
Precisions: [0.964, 0.95, 0.9307, 0.9137, 0.8934, 0.8782]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md3_ss0.8_msl1


100%|██████████| 100/100 [02:09<00:00,  1.29s/it]


AVERAGE time to load the distances and execute the model:
----1.23----
Precisions: [0.963, 0.955, 0.944, 0.927, 0.9066, 0.8957]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md3_ss0.8_msl10


100%|██████████| 100/100 [00:41<00:00,  2.42it/s]


AVERAGE time to load the distances and execute the model:
----0.35----
Precisions: [0.966, 0.958, 0.9443, 0.924, 0.9038, 0.8957]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md3_ss1.0_msl1


100%|██████████| 100/100 [00:40<00:00,  2.44it/s]


AVERAGE time to load the distances and execute the model:
----0.35----
Precisions: [0.963, 0.9545, 0.9413, 0.9202, 0.9014, 0.8925]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md3_ss1.0_msl10


100%|██████████| 100/100 [00:40<00:00,  2.44it/s]


AVERAGE time to load the distances and execute the model:
----0.35----
Precisions: [0.966, 0.9585, 0.945, 0.9265, 0.9082, 0.8987]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss0.8_msl1


100%|██████████| 100/100 [00:42<00:00,  2.38it/s]


AVERAGE time to load the distances and execute the model:
----0.35----
Precisions: [0.966, 0.961, 0.941, 0.916, 0.895, 0.8792]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss0.8_msl10


100%|██████████| 100/100 [00:42<00:00,  2.36it/s]


AVERAGE time to load the distances and execute the model:
----0.35----
Precisions: [0.958, 0.9535, 0.9337, 0.909, 0.8864, 0.8723]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss1.0_msl1


100%|██████████| 100/100 [00:41<00:00,  2.40it/s]


AVERAGE time to load the distances and execute the model:
----0.36----
Precisions: [0.972, 0.9545, 0.9377, 0.9128, 0.8886, 0.8717]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss1.0_msl10


100%|██████████| 100/100 [00:41<00:00,  2.38it/s]


AVERAGE time to load the distances and execute the model:
----0.36----
Precisions: [0.962, 0.9455, 0.9293, 0.9102, 0.8912, 0.8745]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss0.8_msl1


100%|██████████| 100/100 [00:42<00:00,  2.34it/s]


AVERAGE time to load the distances and execute the model:
----0.35----
Precisions: [0.965, 0.9535, 0.9413, 0.9245, 0.911, 0.908]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss0.8_msl10


100%|██████████| 100/100 [00:42<00:00,  2.36it/s]


AVERAGE time to load the distances and execute the model:
----0.35----
Precisions: [0.959, 0.9455, 0.9373, 0.9222, 0.9018, 0.892]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss1.0_msl1


100%|██████████| 100/100 [00:42<00:00,  2.33it/s]


AVERAGE time to load the distances and execute the model:
----0.35----
Precisions: [0.969, 0.9585, 0.946, 0.9317, 0.9116, 0.9032]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss1.0_msl10


100%|██████████| 100/100 [00:42<00:00,  2.38it/s]


AVERAGE time to load the distances and execute the model:
----0.35----
Precisions: [0.969, 0.961, 0.9527, 0.9345, 0.915, 0.9007]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md5_ss0.8_msl1


100%|██████████| 100/100 [00:43<00:00,  2.30it/s]


AVERAGE time to load the distances and execute the model:
----0.36----
Precisions: [0.969, 0.956, 0.925, 0.9008, 0.8792, 0.8655]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md5_ss0.8_msl10


100%|██████████| 100/100 [00:43<00:00,  2.29it/s]


AVERAGE time to load the distances and execute the model:
----0.36----
Precisions: [0.953, 0.9275, 0.8877, 0.8605, 0.8336, 0.8122]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md5_ss1.0_msl1


100%|██████████| 100/100 [00:43<00:00,  2.29it/s]


AVERAGE time to load the distances and execute the model:
----0.36----
Precisions: [0.976, 0.9645, 0.937, 0.9125, 0.892, 0.8773]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md5_ss1.0_msl10


100%|██████████| 100/100 [00:43<00:00,  2.28it/s]

AVERAGE time to load the distances and execute the model:
----0.36----
Precisions: [0.962, 0.9335, 0.8983, 0.876, 0.8482, 0.8282]
------------------------------------------------------

 1. Model: gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss1.0_msl10 | Avg Precision: 0.9450
 2. Model: gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss1.0_msl10 | Avg Precision: 0.9388
 3. Model: gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss1.0_msl1 | Avg Precision: 0.9367
 4. Model: gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss1.0_msl1 | Avg Precision: 0.9356
 5. Model: gradient_boosting_prenormalized_base_ne25_lr0.1_md3_ss0.8_msl1 | Avg Precision: 0.9352
 6. Model: gradient_boosting_prenormalized_base_ne25_lr0.1_md3_ss1.0_msl10 | Avg Precision: 0.9342
 7. Model: gradient_boosting_prenormalized_base_ne25_lr0.1_md3_ss0.8_msl10 | Avg Precision: 0.9341
 8. Model: gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss0.8_msl1 | Avg Precision: 0.9339
 9. Model: gradient_bo




In [532]:
k = 100
step = 10
ground_truth_path = 'C:/Projects/benchmarks/d3l/d3l_ground_truth_sample.csv'
distances_folder_path = 'C:/Projects/freyja_plus_more/distances/d3l/distances_prenormalized_base/'

evaluate_models(k, step, ground_truth_path, distances_folder_path)

#Original -> 1. Model: gradient_boosting_no_syntactic_fs_deep | Avg Precision: 0.7994
#Supernormalized ->   1. Model: model                | Avg Precision: 0.7777

# Prenormalized_all:  1. Model: gradient_boosting_custom | Avg Precision: 0.7878
                    # 2. Model: gradient_boosting_all_fs_deep | Avg Precision: 0.7300

# Prenormalized_base:  1. Model: gradient_boosting_custom | Avg Precision: 0.7886
                    #  2. Model: gradient_boosting_all_fs_deep | Avg Precision: 0.7290

# Res                    1. Model: gradient_boosting_custom | Avg Precision: 0.7798
                    #   2. Model: gradient_boosting_all_fs_deep | Avg Precision: 0.7342

# postnormalized_base:  1. Model: gradient_boosting_custom | Avg Precision: 0.7502
                #       2. Model: gradient_boosting_all_fs_deep | Avg Precision: 0.7358


# postnormalized_all:

Model gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss0.8_msl1


100%|██████████| 100/100 [00:10<00:00,  9.84it/s]


AVERAGE time to load the distances and execute the model:
----0.08----
Precisions: [0.817, 0.8105, 0.817, 0.8033, 0.787, 0.7872, 0.783, 0.7707, 0.7519, 0.7328]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss0.8_msl10


100%|██████████| 100/100 [00:09<00:00, 11.01it/s]


AVERAGE time to load the distances and execute the model:
----0.07----
Precisions: [0.811, 0.8075, 0.796, 0.7788, 0.7634, 0.7728, 0.7814, 0.772, 0.7547, 0.7398]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss1.0_msl1


100%|██████████| 100/100 [00:09<00:00, 10.85it/s]


AVERAGE time to load the distances and execute the model:
----0.07----
Precisions: [0.822, 0.8205, 0.8107, 0.7873, 0.7648, 0.7687, 0.7733, 0.7634, 0.7482, 0.7364]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss1.0_msl10


100%|██████████| 100/100 [00:08<00:00, 11.48it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.825, 0.8165, 0.803, 0.7848, 0.762, 0.7665, 0.7743, 0.7669, 0.7512, 0.7387]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md5_ss0.8_msl1


100%|██████████| 100/100 [00:08<00:00, 11.38it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.808, 0.822, 0.8077, 0.765, 0.7458, 0.7492, 0.7417, 0.7291, 0.715, 0.6969]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md5_ss0.8_msl10


100%|██████████| 100/100 [00:09<00:00, 10.62it/s]


AVERAGE time to load the distances and execute the model:
----0.07----
Precisions: [0.754, 0.762, 0.7633, 0.738, 0.7242, 0.7293, 0.7216, 0.7097, 0.6944, 0.6777]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md5_ss1.0_msl1


100%|██████████| 100/100 [00:09<00:00, 10.63it/s]


AVERAGE time to load the distances and execute the model:
----0.07----
Precisions: [0.776, 0.782, 0.78, 0.7603, 0.7532, 0.7458, 0.7313, 0.7149, 0.6941, 0.6759]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md5_ss1.0_msl10


100%|██████████| 100/100 [00:09<00:00, 10.50it/s]


AVERAGE time to load the distances and execute the model:
----0.07----
Precisions: [0.593, 0.5705, 0.5583, 0.5705, 0.564, 0.5603, 0.5634, 0.5711, 0.5739, 0.5732]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md3_ss0.8_msl1


100%|██████████| 100/100 [00:09<00:00, 11.00it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.608, 0.6705, 0.6817, 0.7042, 0.7198, 0.7308, 0.7413, 0.7365, 0.7228, 0.708]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md3_ss0.8_msl10


100%|██████████| 100/100 [00:08<00:00, 11.70it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.803, 0.789, 0.7957, 0.771, 0.74, 0.7435, 0.7421, 0.7356, 0.7231, 0.709]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md3_ss1.0_msl1


100%|██████████| 100/100 [00:08<00:00, 11.27it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.735, 0.7175, 0.7333, 0.722, 0.703, 0.7072, 0.7056, 0.6977, 0.6891, 0.6826]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md3_ss1.0_msl10


100%|██████████| 100/100 [00:08<00:00, 11.57it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.783, 0.7685, 0.7467, 0.7153, 0.6864, 0.6928, 0.6951, 0.6907, 0.6807, 0.6714]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md5_ss0.8_msl1


100%|██████████| 100/100 [00:08<00:00, 11.33it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.753, 0.724, 0.686, 0.6305, 0.615, 0.6058, 0.5927, 0.5766, 0.558, 0.5385]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md5_ss0.8_msl10


100%|██████████| 100/100 [00:08<00:00, 11.35it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.781, 0.762, 0.743, 0.718, 0.6946, 0.6727, 0.6536, 0.6357, 0.6202, 0.6004]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md5_ss1.0_msl1


100%|██████████| 100/100 [00:09<00:00, 10.92it/s]


AVERAGE time to load the distances and execute the model:
----0.07----
Precisions: [0.692, 0.676, 0.6523, 0.6268, 0.6144, 0.593, 0.5797, 0.5636, 0.5476, 0.535]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md5_ss1.0_msl10


100%|██████████| 100/100 [00:08<00:00, 11.25it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.742, 0.685, 0.6393, 0.5983, 0.5682, 0.5615, 0.552, 0.5434, 0.529, 0.5168]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md3_ss0.8_msl1


100%|██████████| 100/100 [00:07<00:00, 12.55it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.723, 0.7345, 0.737, 0.7395, 0.7502, 0.7593, 0.7676, 0.7599, 0.7472, 0.738]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md3_ss0.8_msl10


100%|██████████| 100/100 [00:08<00:00, 12.23it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.721, 0.7345, 0.7363, 0.7395, 0.7502, 0.7598, 0.7679, 0.7601, 0.7471, 0.7368]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md3_ss1.0_msl1


100%|██████████| 100/100 [00:08<00:00, 12.42it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.725, 0.7375, 0.734, 0.736, 0.7436, 0.752, 0.76, 0.7507, 0.7381, 0.7289]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md3_ss1.0_msl10


100%|██████████| 100/100 [00:08<00:00, 12.39it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.79, 0.769, 0.7533, 0.737, 0.732, 0.7372, 0.7423, 0.7277, 0.7118, 0.6977]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md5_ss0.8_msl1


100%|██████████| 100/100 [00:08<00:00, 12.25it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.766, 0.784, 0.7887, 0.7863, 0.7862, 0.7868, 0.7876, 0.7704, 0.7541, 0.7398]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md5_ss0.8_msl10


100%|██████████| 100/100 [00:08<00:00, 12.23it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.746, 0.7635, 0.7607, 0.7653, 0.7792, 0.7833, 0.7846, 0.7724, 0.7567, 0.7422]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md5_ss1.0_msl1


100%|██████████| 100/100 [00:08<00:00, 12.31it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.76, 0.777, 0.786, 0.7895, 0.791, 0.7927, 0.7941, 0.78, 0.7623, 0.748]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md5_ss1.0_msl10


100%|██████████| 100/100 [00:08<00:00, 12.31it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.762, 0.784, 0.795, 0.7998, 0.801, 0.801, 0.8024, 0.789, 0.7728, 0.7578]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md3_ss0.8_msl1


100%|██████████| 100/100 [00:08<00:00, 12.39it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.802, 0.79, 0.7833, 0.7645, 0.7552, 0.7642, 0.771, 0.7607, 0.7459, 0.7357]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md3_ss0.8_msl10


100%|██████████| 100/100 [00:08<00:00, 12.30it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.803, 0.7875, 0.7823, 0.766, 0.758, 0.7652, 0.7726, 0.7627, 0.7481, 0.738]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md3_ss1.0_msl1


100%|██████████| 100/100 [00:07<00:00, 12.67it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.824, 0.806, 0.7853, 0.7658, 0.757, 0.7657, 0.7706, 0.7612, 0.7464, 0.7362]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md3_ss1.0_msl10


100%|██████████| 100/100 [00:07<00:00, 12.55it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.821, 0.801, 0.787, 0.766, 0.7568, 0.7648, 0.7704, 0.7602, 0.7458, 0.7352]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md5_ss0.8_msl1


100%|██████████| 100/100 [00:08<00:00, 11.96it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.801, 0.7985, 0.7953, 0.772, 0.757, 0.7553, 0.7589, 0.7457, 0.7281, 0.7174]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md5_ss0.8_msl10


100%|██████████| 100/100 [00:08<00:00, 11.57it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.799, 0.7955, 0.795, 0.761, 0.7364, 0.7327, 0.7289, 0.7082, 0.688, 0.6712]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md5_ss1.0_msl1


100%|██████████| 100/100 [00:08<00:00, 11.41it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.811, 0.8185, 0.8113, 0.7685, 0.7472, 0.7537, 0.76, 0.7484, 0.7334, 0.7217]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md5_ss1.0_msl10


100%|██████████| 100/100 [00:08<00:00, 12.03it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.805, 0.804, 0.811, 0.7903, 0.7702, 0.7732, 0.7741, 0.7632, 0.746, 0.7277]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md3_ss0.8_msl1


100%|██████████| 100/100 [00:08<00:00, 11.76it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.809, 0.806, 0.7883, 0.7685, 0.7576, 0.7655, 0.7739, 0.765, 0.753, 0.7437]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md3_ss0.8_msl10


100%|██████████| 100/100 [00:08<00:00, 11.27it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.805, 0.7975, 0.779, 0.7623, 0.755, 0.7642, 0.7706, 0.7605, 0.7458, 0.7344]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md3_ss1.0_msl1


100%|██████████| 100/100 [00:08<00:00, 11.48it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.816, 0.8015, 0.7873, 0.7663, 0.7574, 0.7667, 0.7721, 0.7622, 0.7473, 0.7371]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md3_ss1.0_msl10


100%|██████████| 100/100 [00:08<00:00, 11.16it/s]


AVERAGE time to load the distances and execute the model:
----0.07----
Precisions: [0.821, 0.804, 0.7847, 0.7655, 0.7568, 0.7662, 0.7714, 0.7617, 0.747, 0.7366]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss0.8_msl1


100%|██████████| 100/100 [13:31<00:00,  8.11s/it]


AVERAGE time to load the distances and execute the model:
----8.09----
Precisions: [0.814, 0.8245, 0.8243, 0.7928, 0.773, 0.78, 0.7797, 0.7592, 0.7391, 0.7217]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss0.8_msl10


100%|██████████| 100/100 [00:10<00:00,  9.59it/s]


AVERAGE time to load the distances and execute the model:
----0.07----
Precisions: [0.741, 0.7885, 0.8047, 0.7965, 0.7932, 0.7918, 0.7906, 0.7689, 0.7484, 0.7313]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss1.0_msl1


100%|██████████| 100/100 [00:08<00:00, 11.45it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.826, 0.8055, 0.7947, 0.7853, 0.7902, 0.7942, 0.7959, 0.7829, 0.7667, 0.7525]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss1.0_msl10


100%|██████████| 100/100 [00:08<00:00, 12.02it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.693, 0.7505, 0.7573, 0.7595, 0.7532, 0.7555, 0.7599, 0.7502, 0.7351, 0.7221]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss0.8_msl1


100%|██████████| 100/100 [00:08<00:00, 11.72it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.808, 0.79, 0.792, 0.7745, 0.7672, 0.7772, 0.782, 0.7721, 0.7553, 0.7381]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss0.8_msl10


100%|██████████| 100/100 [00:08<00:00, 11.81it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.81, 0.8025, 0.8047, 0.7853, 0.7642, 0.7645, 0.769, 0.7619, 0.7478, 0.7337]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss1.0_msl1


100%|██████████| 100/100 [00:08<00:00, 12.23it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.824, 0.8215, 0.8117, 0.7953, 0.7824, 0.7868, 0.7894, 0.7756, 0.7561, 0.7434]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss1.0_msl10


100%|██████████| 100/100 [00:08<00:00, 11.65it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.819, 0.8145, 0.807, 0.7865, 0.7682, 0.7718, 0.7741, 0.7659, 0.7533, 0.7422]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md5_ss0.8_msl1


100%|██████████| 100/100 [00:08<00:00, 11.28it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.79, 0.7865, 0.7713, 0.7535, 0.741, 0.7282, 0.7143, 0.6991, 0.6834, 0.6637]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md5_ss0.8_msl10


100%|██████████| 100/100 [00:09<00:00, 10.81it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.748, 0.7405, 0.745, 0.723, 0.6996, 0.697, 0.6909, 0.6809, 0.6683, 0.6522]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md5_ss1.0_msl1


100%|██████████| 100/100 [00:09<00:00, 10.76it/s]


AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.818, 0.823, 0.8217, 0.789, 0.7636, 0.758, 0.7489, 0.7346, 0.7153, 0.6969]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md5_ss1.0_msl10


100%|██████████| 100/100 [00:08<00:00, 11.37it/s]

AVERAGE time to load the distances and execute the model:
----0.06----
Precisions: [0.731, 0.6865, 0.7077, 0.6833, 0.6392, 0.6273, 0.6321, 0.6326, 0.6288, 0.6153]
------------------------------------------------------

 1. Model: gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss1.0_msl1 | Avg Precision: 0.7894
 2. Model: gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss1.0_msl1 | Avg Precision: 0.7886
 3. Model: gradient_boosting_prenormalized_base_ne25_lr0.05_md5_ss1.0_msl10 | Avg Precision: 0.7865
 4. Model: gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss0.8_msl1 | Avg Precision: 0.7860
 5. Model: gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss0.8_msl1 | Avg Precision: 0.7808
 6. Model: gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss1.0_msl10 | Avg Precision: 0.7802
 7. Model: gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss1.0_msl1 | Avg Precision: 0.7795
 8. Model: gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss1.0_msl10 | Avg Pre




In [533]:
k = 10
step = 1
ground_truth_path = 'C:/Projects/benchmarks/freyja/freyja_ground_truth.csv'
distances_folder_path = 'C:/Projects/freyja_plus_more/distances/freyja/distances_prenormalized_base/'

evaluate_models(k, step, ground_truth_path, distances_folder_path)

# Original -> 1. Model: gradient_boosting_no_syntactic_fs_deep | Avg Precision: 0.9504
# Supernormalized -> 1. Model: model                | Avg Precision: 0.8942

# Prenormalized_all:  1. Model: gradient_boosting_custom | Avg Precision: 0.9411
                    # 2. Model: gradient_boosting_all_fs_deep | Avg Precision: 0.9300

# prenormalized_base:  1. Model: gradient_boosting_custom | Avg Precision: 0.9411
                    #  2. Model: gradient_boosting_all_fs_deep | Avg Precision: 0.9300

# Res                    1. Model: gradient_boosting_custom | Avg Precision: 0.9411
                    #   2. Model: gradient_boosting_all_fs_deep | Avg Precision: 0.9300       


# postnormalized_base:  1. Model: gradient_boosting_all_fs_deep | Avg Precision: 0.9392
                    #   2. Model: extra_trees_all_fs_deep | Avg Precision: 0.8882


# postnormalized_all:

Model gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 25.57it/s]


AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [1.0, 0.98, 0.96, 0.95, 0.94, 0.93, 0.9286, 0.9275, 0.9178, 0.908]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 35.98it/s]


AVERAGE time to load the distances and execute the model:
----0.01----
Precisions: [1.0, 0.98, 0.9667, 0.945, 0.928, 0.9267, 0.9286, 0.9275, 0.9244, 0.926]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 31.42it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.97, 0.96, 0.96, 0.952, 0.9467, 0.94, 0.9325, 0.92, 0.916]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 33.80it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.96, 0.94, 0.935, 0.924, 0.9233, 0.92, 0.9125, 0.9067, 0.904]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md5_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 34.03it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.97, 0.9267, 0.915, 0.912, 0.91, 0.9086, 0.8975, 0.8911, 0.89]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md5_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 32.61it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.94, 0.92, 0.895, 0.884, 0.8833, 0.8886, 0.885, 0.8733, 0.872]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md5_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 36.02it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.94, 0.9267, 0.915, 0.912, 0.91, 0.9086, 0.91, 0.8911, 0.88]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md5_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 34.74it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.91, 0.9, 0.885, 0.876, 0.8667, 0.8543, 0.845, 0.8422, 0.846]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md3_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 36.02it/s]


AVERAGE time to load the distances and execute the model:
----0.01----
Precisions: [1.0, 0.94, 0.9267, 0.935, 0.94, 0.9367, 0.92, 0.9175, 0.8978, 0.898]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md3_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 34.87it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.97, 0.9533, 0.945, 0.94, 0.9267, 0.9286, 0.925, 0.9156, 0.906]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md3_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 33.80it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.95, 0.9267, 0.925, 0.908, 0.9, 0.8943, 0.885, 0.8689, 0.858]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md3_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 34.50it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.96, 0.94, 0.95, 0.94, 0.9367, 0.9257, 0.915, 0.9022, 0.904]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md5_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 33.92it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.93, 0.9, 0.9, 0.9, 0.89, 0.88, 0.87, 0.8644, 0.848]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md5_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 34.57it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.9, 0.8733, 0.865, 0.86, 0.8533, 0.8486, 0.8375, 0.8267, 0.814]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md5_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 34.13it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.97, 0.9333, 0.91, 0.9, 0.8867, 0.8743, 0.8675, 0.86, 0.848]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md5_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 29.56it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.94, 0.9, 0.88, 0.868, 0.8567, 0.8514, 0.8275, 0.8133, 0.798]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md3_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 32.00it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.98, 0.96, 0.96, 0.965, 0.956, 0.9467, 0.9429, 0.935, 0.9311, 0.932]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md3_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 29.19it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.98, 0.96, 0.96, 0.965, 0.956, 0.9467, 0.9429, 0.935, 0.9311, 0.932]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md3_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 29.06it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.98, 0.97, 0.96, 0.97, 0.968, 0.9567, 0.94, 0.935, 0.9311, 0.928]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md3_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 31.09it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.98, 0.9667, 0.975, 0.964, 0.9567, 0.9486, 0.9425, 0.9311, 0.922]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md5_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 33.51it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.96, 0.9467, 0.925, 0.92, 0.92, 0.9114, 0.915, 0.9044, 0.888]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md5_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 32.10it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.99, 0.9667, 0.94, 0.936, 0.9333, 0.9257, 0.9275, 0.9156, 0.898]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md5_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 32.98it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.97, 0.94, 0.93, 0.924, 0.9267, 0.9229, 0.9225, 0.9089, 0.898]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md5_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 32.18it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.98, 0.9533, 0.935, 0.924, 0.9267, 0.9257, 0.92, 0.9067, 0.898]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md3_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 32.70it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.98, 0.9733, 0.95, 0.932, 0.9367, 0.9343, 0.9325, 0.9289, 0.928]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md3_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 34.00it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.99, 0.98, 0.98, 0.964, 0.9533, 0.9486, 0.945, 0.9422, 0.936]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md3_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 32.40it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.99, 0.9867, 0.98, 0.956, 0.95, 0.9457, 0.935, 0.9311, 0.932]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md3_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 33.28it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.99, 0.9867, 0.98, 0.976, 0.96, 0.9543, 0.9425, 0.9422, 0.934]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md5_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 33.69it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.98, 0.9333, 0.915, 0.904, 0.9033, 0.8914, 0.89, 0.8822, 0.884]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md5_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 29.43it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.98, 0.9333, 0.93, 0.924, 0.92, 0.9171, 0.91, 0.8978, 0.896]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md5_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 32.63it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.98, 0.9333, 0.93, 0.928, 0.92, 0.9086, 0.8925, 0.8911, 0.886]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md5_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 32.98it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.99, 0.9533, 0.935, 0.916, 0.9167, 0.9086, 0.9025, 0.9, 0.89]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md3_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 30.83it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.98, 0.9667, 0.965, 0.96, 0.9567, 0.9514, 0.9475, 0.9356, 0.928]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md3_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 33.59it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.98, 0.9733, 0.95, 0.936, 0.9367, 0.9343, 0.935, 0.9311, 0.928]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md3_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 33.65it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.99, 0.98, 0.98, 0.964, 0.9567, 0.9543, 0.945, 0.9333, 0.932]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md3_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 34.39it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.99, 0.9867, 0.975, 0.968, 0.96, 0.9514, 0.9375, 0.9311, 0.93]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 32.97it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.98, 0.9467, 0.935, 0.92, 0.9133, 0.9086, 0.9025, 0.8956, 0.89]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 33.89it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.96, 0.94, 0.925, 0.908, 0.9067, 0.8943, 0.8875, 0.8822, 0.884]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 32.41it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.98, 0.9467, 0.92, 0.904, 0.91, 0.9143, 0.9075, 0.9022, 0.898]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 31.32it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.97, 0.94, 0.92, 0.912, 0.92, 0.92, 0.9075, 0.8933, 0.89]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 30.63it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.98, 0.9533, 0.945, 0.94, 0.9367, 0.9343, 0.9325, 0.9289, 0.924]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 30.06it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.98, 0.96, 0.955, 0.948, 0.9433, 0.9314, 0.9175, 0.9133, 0.91]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 30.39it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.97, 0.9533, 0.95, 0.932, 0.93, 0.9286, 0.9175, 0.9156, 0.914]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 29.97it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.97, 0.9533, 0.945, 0.94, 0.9367, 0.9286, 0.9225, 0.92, 0.914]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md5_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 30.79it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.93, 0.9067, 0.905, 0.896, 0.9, 0.9029, 0.895, 0.8889, 0.884]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md5_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 31.45it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.93, 0.8733, 0.865, 0.868, 0.86, 0.86, 0.8425, 0.8311, 0.83]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md5_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 31.74it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.96, 0.9333, 0.935, 0.916, 0.9, 0.8971, 0.8925, 0.8778, 0.874]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md5_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 31.00it/s]

AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [1.0, 0.93, 0.8933, 0.895, 0.892, 0.8833, 0.8829, 0.885, 0.8778, 0.874]
------------------------------------------------------

 1. Model: gradient_boosting_prenormalized_base_ne25_lr0.1_md3_ss1.0_msl10 | Avg Precision: 0.9666
 2. Model: gradient_boosting_prenormalized_base_ne25_lr0.1_md3_ss0.8_msl10 | Avg Precision: 0.9639
 3. Model: gradient_boosting_prenormalized_base_ne50_lr0.05_md3_ss1.0_msl1 | Avg Precision: 0.9635
 4. Model: gradient_boosting_prenormalized_base_ne50_lr0.05_md3_ss1.0_msl10 | Avg Precision: 0.9630
 5. Model: gradient_boosting_prenormalized_base_ne25_lr0.1_md3_ss1.0_msl1 | Avg Precision: 0.9607
 6. Model: gradient_boosting_prenormalized_base_ne50_lr0.05_md3_ss0.8_msl1 | Avg Precision: 0.9591
 7. Model: gradient_boosting_prenormalized_base_ne25_lr0.05_md3_ss1.0_msl10 | Avg Precision: 0.9587
 8. Model: gradient_boosting_prenormalized_base_ne25_lr0.05_md3_ss1.0_msl1 | Avg Precision: 0.9




In [534]:
k = 30
step = 5
ground_truth_path = 'C:/Projects/benchmarks/omnimatch_city_government/omnimatch_city_government_ground_truth.csv'
distances_folder_path = 'C:/Projects/freyja_plus_more/distances/omnimatch_city_government/distances_prenormalized_base/'

evaluate_models(k, step, ground_truth_path, distances_folder_path)

# Originial -> 1. Model: gradient_boosting_no_syntactic_fs_deep | Avg Precision: 0.5416
# Supernormalized -> 1. Model: gradient_boosting_custom | Avg Precision: 0.5725

# prenormalized_all:  1. Model: gradient_boosting_custom | Avg Precision: 0.5824
                    # 4. Model: gradient_boosting_all_fs_deep | Avg Precision: 0.5554

# prenormalized_base:  2. Model: gradient_boosting_custom | Avg Precision: 0.5731
                    #  4. gradient_boosting_all_fs_deep | Avg Precision: 0.5560

# Res:                  2. Model: gradient_boosting_custom | Avg Precision: 0.5713
                    #   4. Model: gradient_boosting_all_fs_deep | Avg Precision: 0.5356

# postnormalized_base:   1. Model: gradient_boosting_custom | Avg Precision: 0.5805
                    #    2. Model: gradient_boosting_all_fs_deep | Avg Precision: 0.5696


# postnormalized_all:

Model gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss0.8_msl1


  0%|          | 0/50 [00:00<?, ?it/s]

100%|██████████| 50/50 [00:02<00:00, 20.16it/s]


AVERAGE time to load the distances and execute the model:
----0.04----
Precisions: [0.604, 0.62, 0.6013, 0.585, 0.5688, 0.566]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 28.78it/s]


AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [0.612, 0.604, 0.5893, 0.577, 0.576, 0.574]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 29.77it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.576, 0.594, 0.58, 0.573, 0.5752, 0.584]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 29.85it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.564, 0.61, 0.584, 0.585, 0.5736, 0.57]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md5_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 28.42it/s]


AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [0.584, 0.6, 0.5933, 0.579, 0.5712, 0.5713]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md5_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 28.86it/s]


AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [0.608, 0.628, 0.6013, 0.583, 0.5776, 0.568]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md5_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 27.16it/s]


AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [0.556, 0.608, 0.6013, 0.571, 0.564, 0.566]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md5_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 26.84it/s]


AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [0.52, 0.544, 0.508, 0.503, 0.52, 0.5227]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md3_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 26.38it/s]


AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [0.544, 0.568, 0.54, 0.539, 0.5392, 0.5453]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md3_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 26.90it/s]


AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [0.616, 0.602, 0.5907, 0.569, 0.5624, 0.5767]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md3_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 27.11it/s]


AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [0.584, 0.602, 0.5787, 0.568, 0.5688, 0.5733]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md3_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 25.92it/s]


AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [0.62, 0.616, 0.592, 0.583, 0.5736, 0.582]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md5_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 25.08it/s]


AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [0.564, 0.592, 0.584, 0.567, 0.5528, 0.548]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md5_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 26.09it/s]


AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [0.636, 0.624, 0.584, 0.562, 0.5496, 0.5447]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md5_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 26.41it/s]


AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [0.592, 0.584, 0.572, 0.566, 0.5568, 0.544]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md5_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 26.36it/s]


AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [0.56, 0.574, 0.5707, 0.541, 0.532, 0.5253]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md3_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 27.55it/s]


AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [0.532, 0.552, 0.56, 0.54, 0.5408, 0.5473]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md3_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 29.66it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.532, 0.554, 0.5613, 0.539, 0.5408, 0.5473]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md3_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 30.21it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.564, 0.548, 0.5453, 0.525, 0.5256, 0.5307]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md3_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 33.93it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.552, 0.566, 0.56, 0.534, 0.5392, 0.5507]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md5_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 33.18it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.552, 0.56, 0.5387, 0.541, 0.552, 0.56]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md5_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 33.10it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.56, 0.578, 0.5747, 0.576, 0.5696, 0.5673]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md5_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 33.60it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.56, 0.584, 0.56, 0.552, 0.5592, 0.5607]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md5_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 26.97it/s]


AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [0.56, 0.58, 0.56, 0.562, 0.5736, 0.574]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md3_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 32.31it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.596, 0.612, 0.5893, 0.567, 0.5664, 0.5647]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md3_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 32.92it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.588, 0.582, 0.568, 0.548, 0.5528, 0.5553]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md3_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 32.76it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.548, 0.558, 0.5627, 0.539, 0.5408, 0.548]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md3_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 33.16it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.568, 0.576, 0.556, 0.539, 0.5456, 0.548]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md5_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 32.15it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.58, 0.614, 0.6147, 0.612, 0.62, 0.616]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md5_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 32.16it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.612, 0.632, 0.608, 0.597, 0.5872, 0.5867]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md5_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 32.23it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.588, 0.61, 0.6173, 0.606, 0.588, 0.594]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md5_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 32.84it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.584, 0.634, 0.6027, 0.585, 0.592, 0.61]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md3_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 32.89it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.604, 0.576, 0.5493, 0.536, 0.5328, 0.5287]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md3_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 32.50it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.568, 0.578, 0.5747, 0.558, 0.5632, 0.5653]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md3_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 33.32it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.596, 0.582, 0.5627, 0.542, 0.5456, 0.5493]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md3_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 32.37it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.576, 0.572, 0.5613, 0.539, 0.5376, 0.544]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 31.93it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.592, 0.606, 0.5867, 0.566, 0.5768, 0.5753]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 32.12it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.596, 0.644, 0.6107, 0.586, 0.5952, 0.5993]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 31.86it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.588, 0.626, 0.624, 0.609, 0.6072, 0.5993]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 32.91it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.576, 0.6, 0.5787, 0.579, 0.5776, 0.57]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 32.82it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.576, 0.598, 0.5813, 0.575, 0.5728, 0.57]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 33.18it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.572, 0.596, 0.5933, 0.584, 0.5832, 0.586]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 32.43it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.556, 0.58, 0.5867, 0.576, 0.576, 0.564]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 32.46it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.604, 0.6, 0.58, 0.57, 0.5792, 0.5753]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md5_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 31.35it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.616, 0.614, 0.592, 0.572, 0.5672, 0.564]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md5_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 32.47it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.596, 0.586, 0.5587, 0.534, 0.5408, 0.542]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md5_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 31.96it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.592, 0.614, 0.5893, 0.592, 0.5872, 0.5833]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md5_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 31.82it/s]

AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.584, 0.594, 0.5813, 0.573, 0.5752, 0.568]
------------------------------------------------------

 1. Model: gradient_boosting_prenormalized_base_ne25_lr0.1_md5_ss0.8_msl1 | Avg Precision: 0.6095
 2. Model: gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss1.0_msl1 | Avg Precision: 0.6089
 3. Model: gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss0.8_msl10 | Avg Precision: 0.6052
 4. Model: gradient_boosting_prenormalized_base_ne25_lr0.1_md5_ss0.8_msl10 | Avg Precision: 0.6038
 5. Model: gradient_boosting_prenormalized_base_ne25_lr0.1_md5_ss1.0_msl10 | Avg Precision: 0.6013
 6. Model: gradient_boosting_prenormalized_base_ne25_lr0.1_md5_ss1.0_msl1 | Avg Precision: 0.6006
 7. Model: gradient_boosting_prenormalized_base_ne100_lr0.1_md3_ss1.0_msl10 | Avg Precision: 0.5944
 8. Model: gradient_boosting_prenormalized_base_ne100_lr0.05_md5_ss0.8_msl10 | Avg Precision: 0.5943
 9. Model: gradient_bo




In [535]:
k = 30
step = 5
ground_truth_path = 'C:/Projects/benchmarks/omnimatch_culture_recreation/omnimatch_culture_recreation_ground_truth.csv'
distances_folder_path = 'C:/Projects/freyja_plus_more/distances/omnimatch_culture_recreation/distances_prenormalized_base/'

evaluate_models(k, step, ground_truth_path, distances_folder_path)

# Originail -> 1. Model: gradient_boosting_no_syntactic_fs_deep | Avg Precision: 0.5657
# Supernormalized -> 1. Model: gradient_boosting_all_fs_deep | Avg Precision: 0.6275

# Prenormalized_all:  1. Model: gradient_boosting_all_fs_deep | Avg Precision: 0.6259
                    # 2. Model: gradient_boosting_custom | Avg Precision: 0.5956

# prenormalized_base:  1. Model: gradient_boosting_all_fs_deep | Avg Precision: 0.6230
                    #  2. Model: gradient_boosting_custom | Avg Precision: 0.5918

# res:                  1. Model: gradient_boosting_custom | Avg Precision: 0.5796
                    #   3. Model: gradient_boosting_all_fs_deep | Avg Precision: 0.5585     

# postnormalized_base:  1. Model: gradient_boosting_all_fs_deep | Avg Precision: 0.6088
                    #   2. Model: gradient_boosting_custom | Avg Precision: 0.6010


# postnormalized_all:

Model gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss0.8_msl1


100%|██████████| 50/50 [00:02<00:00, 23.38it/s]


AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [0.576, 0.596, 0.588, 0.579, 0.5808, 0.578]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 32.27it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.544, 0.6, 0.6227, 0.618, 0.6136, 0.6167]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 32.26it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.54, 0.596, 0.6173, 0.606, 0.6056, 0.6027]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 32.44it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.556, 0.612, 0.644, 0.628, 0.608, 0.6073]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md5_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 31.51it/s]


AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [0.548, 0.61, 0.604, 0.609, 0.6, 0.5987]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md5_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 31.30it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.544, 0.602, 0.5987, 0.597, 0.5904, 0.5833]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md5_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 31.54it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.536, 0.578, 0.576, 0.582, 0.572, 0.574]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.05_md5_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 31.73it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.504, 0.572, 0.5627, 0.565, 0.5704, 0.5727]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md3_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 32.23it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.504, 0.58, 0.5933, 0.594, 0.5896, 0.5853]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md3_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 32.45it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.56, 0.62, 0.6133, 0.619, 0.6072, 0.5947]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md3_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 31.12it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.496, 0.568, 0.5627, 0.57, 0.5704, 0.5667]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md3_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 31.72it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.556, 0.612, 0.6013, 0.593, 0.576, 0.5653]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md5_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 30.60it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.524, 0.564, 0.5707, 0.569, 0.5496, 0.536]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md5_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 31.78it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.548, 0.552, 0.544, 0.554, 0.5496, 0.5393]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md5_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 31.80it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.512, 0.534, 0.5013, 0.472, 0.4552, 0.4413]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne100_lr0.1_md5_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 31.49it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.5, 0.548, 0.5467, 0.531, 0.5192, 0.504]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md3_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 33.88it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.508, 0.564, 0.5667, 0.57, 0.5704, 0.5627]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md3_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 33.73it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.508, 0.564, 0.5667, 0.57, 0.5704, 0.5627]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md3_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 33.24it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.496, 0.554, 0.5693, 0.574, 0.568, 0.5613]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md3_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 33.60it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.512, 0.564, 0.564, 0.566, 0.5656, 0.5567]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md5_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 33.61it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.532, 0.582, 0.5933, 0.582, 0.576, 0.5933]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md5_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 33.75it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.564, 0.576, 0.5893, 0.594, 0.5904, 0.5993]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md5_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 33.19it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.568, 0.582, 0.5773, 0.589, 0.5936, 0.604]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.05_md5_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 33.85it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.552, 0.582, 0.5787, 0.591, 0.596, 0.6033]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md3_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 29.94it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.552, 0.596, 0.6253, 0.626, 0.6216, 0.6213]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md3_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 34.01it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.5, 0.56, 0.5707, 0.569, 0.564, 0.566]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md3_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 33.42it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.512, 0.562, 0.5707, 0.567, 0.5616, 0.5693]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md3_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 32.21it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.488, 0.562, 0.5813, 0.574, 0.564, 0.5653]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md5_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 31.27it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.584, 0.612, 0.612, 0.617, 0.6096, 0.61]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md5_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 31.45it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.584, 0.59, 0.5987, 0.597, 0.6, 0.6073]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md5_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 30.77it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.544, 0.576, 0.5707, 0.576, 0.5888, 0.6007]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne25_lr0.1_md5_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 30.50it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.56, 0.576, 0.572, 0.585, 0.5848, 0.592]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md3_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 31.57it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.496, 0.568, 0.5587, 0.541, 0.5376, 0.5427]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md3_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 31.81it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.54, 0.594, 0.6213, 0.621, 0.6176, 0.6227]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md3_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 30.40it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.516, 0.566, 0.5733, 0.566, 0.5616, 0.566]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md3_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 30.16it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.504, 0.56, 0.5693, 0.566, 0.5608, 0.5713]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 29.60it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.544, 0.578, 0.5747, 0.592, 0.6056, 0.6107]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 27.05it/s]


AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [0.528, 0.576, 0.5827, 0.606, 0.6016, 0.608]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 28.48it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.568, 0.59, 0.58, 0.583, 0.5856, 0.5947]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 26.98it/s]


AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [0.544, 0.588, 0.5907, 0.588, 0.5808, 0.5927]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 28.26it/s]


AVERAGE time to load the distances and execute the model:
----0.03----
Precisions: [0.524, 0.604, 0.6133, 0.606, 0.6064, 0.612]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 30.87it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.528, 0.59, 0.62, 0.614, 0.6048, 0.5987]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 31.67it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.564, 0.6, 0.588, 0.596, 0.6008, 0.602]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 31.75it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.564, 0.612, 0.6107, 0.609, 0.6112, 0.6087]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md5_ss0.8_msl1


100%|██████████| 50/50 [00:01<00:00, 31.04it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.552, 0.59, 0.5973, 0.592, 0.5896, 0.5833]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md5_ss0.8_msl10


100%|██████████| 50/50 [00:01<00:00, 30.80it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.512, 0.568, 0.56, 0.564, 0.5568, 0.5573]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md5_ss1.0_msl1


100%|██████████| 50/50 [00:01<00:00, 30.90it/s]


AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.568, 0.612, 0.612, 0.607, 0.6048, 0.6133]
------------------------------------------------------
Model gradient_boosting_prenormalized_base_ne50_lr0.1_md5_ss1.0_msl10


100%|██████████| 50/50 [00:01<00:00, 31.39it/s]

AVERAGE time to load the distances and execute the model:
----0.02----
Precisions: [0.56, 0.58, 0.58, 0.583, 0.5808, 0.5847]
------------------------------------------------------

 1. Model: gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss1.0_msl10 | Avg Precision: 0.6092
 2. Model: gradient_boosting_prenormalized_base_ne25_lr0.1_md5_ss0.8_msl1 | Avg Precision: 0.6074
 3. Model: gradient_boosting_prenormalized_base_ne25_lr0.1_md3_ss0.8_msl1 | Avg Precision: 0.6070
 4. Model: gradient_boosting_prenormalized_base_ne50_lr0.1_md5_ss1.0_msl1 | Avg Precision: 0.6028
 5. Model: gradient_boosting_prenormalized_base_ne50_lr0.05_md3_ss0.8_msl10 | Avg Precision: 0.6028
 6. Model: gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss1.0_msl10 | Avg Precision: 0.6026
 7. Model: gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss0.8_msl10 | Avg Precision: 0.6025
 8. Model: gradient_boosting_prenormalized_base_ne100_lr0.1_md3_ss0.8_msl10 | Avg Precision: 0.6024
 9. Model: gradient_boos




In [513]:
# Prenormalized all

import re
from collections import defaultdict

# Path to your file
file_path = "final_tests_prenormalized_all.txt"

# Dictionary: model_name -> list of scores
model_scores = defaultdict(list)

# Regex to capture model name and Avg Precision
pattern = re.compile(
    r"Model:\s+(.*?)\s+\|\s+Avg Precision:\s+([0-9.]+)"
)

with open(file_path, "r") as f:
    for line in f:
        match = pattern.search(line)
        if match:
            model_name = match.group(1).strip()
            score = float(match.group(2))
            model_scores[model_name].append(score)

# Compute average score per model
model_avg = {
    model: sum(scores) / len(scores)
    for model, scores in model_scores.items()
}

# Sort models by average score (descending)
sorted_models = sorted(
    model_avg.items(),
    key=lambda x: x[1],
    reverse=True
)

# Print top 3 models with individual scores
print("Top 3 models with individual benchmark scores:\n")

for rank, (model, avg_score) in enumerate(sorted_models[:3], start=1):
    scores = model_scores[model]
    print(f"{rank}. {model}")
    print(f"   Average score: {avg_score:.4f}")
    print(f"   Individual scores: {['{:.4f}'.format(s) for s in scores]}")
    print()


Top 3 models with individual benchmark scores:

1. gradient_boosting_prenormalized_all_ne100_lr0.05_md3_ss1.0_msl10
   Average score: 0.8148
   Individual scores: ['0.9448', '0.9074', '0.7824', '0.9325', '0.5821', '0.6105', '0.9439']

2. gradient_boosting_prenormalized_all_ne50_lr0.05_md5_ss1.0_msl1
   Average score: 0.8139
   Individual scores: ['0.9547', '0.9074', '0.7895', '0.9283', '0.6060', '0.5883', '0.9229']

3. gradient_boosting_prenormalized_all_ne100_lr0.05_md3_ss1.0_msl1
   Average score: 0.8139
   Individual scores: ['0.9444', '0.9001', '0.7808', '0.9497', '0.5794', '0.6011', '0.9415']



In [None]:
# Santos_small ->   Original -> 1. Model: gradient_boosting_no_syntactic_fs_deep | Avg Precision: 0.9675
# TUS_small ->      Original -> 2. Model: gradient_boosting_no_syntactic_fs_deep | Avg Precision: 0.8939
# D3L ->            Original -> 1. Model: gradient_boosting_no_syntactic_fs_deep | Avg Precision: 0.7994
# Freyja ->         Original -> 1. Model: gradient_boosting_no_syntactic_fs_deep | Avg Precision: 0.9504
# OM_CG ->          Original -> 1. Model: gradient_boosting_no_syntactic_fs_deep | Avg Precision: 0.5416
# OM_CR ->          Original -> 1. Model: gradient_boosting_no_syntactic_fs_deep | Avg Precision: 0.5657

# 1. gradient_boosting_prenormalized_all_ne100_lr0.05_md3_ss1.0_msl10
#    Average score: 0.8148
#    Individual scores: ['0.9448', '0.9074', '0.7824', '0.9325', '0.5821', '0.6105', '0.9439']

# 2. gradient_boosting_prenormalized_all_ne50_lr0.05_md5_ss1.0_msl1
#    Average score: 0.8139
#    Individual scores: ['0.9547', '0.9074', '0.7895', '0.9283', '0.6060', '0.5883', '0.9229']

# 3. gradient_boosting_prenormalized_all_ne100_lr0.05_md3_ss1.0_msl1
#    Average score: 0.8139
#    Individual scores: ['0.9444', '0.9001', '0.7808', '0.9497', '0.5794', '0.6011', '0.9415']

# Top 3 models with individual benchmark scores:

# 1. gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss1.0_msl10
#    Average score: 0.8145
#    Individual scores: ['0.9448', '0.9075', '0.7802', '0.9430', '0.5847', '0.6026', '0.9388']

# 2. gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss1.0_msl10
#    Average score: 0.8142
#    Individual scores: ['0.9448', '0.9081', '0.7789', '0.9325', '0.5811', '0.6092', '0.9450']

# 3. gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss1.0_msl1
#    Average score: 0.8134
#    Individual scores: ['0.9537', '0.9072', '0.7894', '0.9283', '0.6089', '0.5835', '0.9229']

In [536]:
# Prenormalized all

import re
from collections import defaultdict

# Path to your file
file_path = "final_tests_prenormalized_base.txt"

# Dictionary: model_name -> list of scores
model_scores = defaultdict(list)

# Regex to capture model name and Avg Precision
pattern = re.compile(
    r"Model:\s+(.*?)\s+\|\s+Avg Precision:\s+([0-9.]+)"
)

with open(file_path, "r") as f:
    for line in f:
        match = pattern.search(line)
        if match:
            model_name = match.group(1).strip()
            score = float(match.group(2))
            model_scores[model_name].append(score)

# Compute average score per model
model_avg = {
    model: sum(scores) / len(scores)
    for model, scores in model_scores.items()
}

# Sort models by average score (descending)
sorted_models = sorted(
    model_avg.items(),
    key=lambda x: x[1],
    reverse=True
)

# Print top 3 models with individual scores
print("Top 3 models with individual benchmark scores:\n")

for rank, (model, avg_score) in enumerate(sorted_models[:3], start=1):
    scores = model_scores[model]
    print(f"{rank}. {model}")
    print(f"   Average score: {avg_score:.4f}")
    print(f"   Individual scores: {['{:.4f}'.format(s) for s in scores]}")
    print()


Top 3 models with individual benchmark scores:

1. gradient_boosting_prenormalized_base_ne50_lr0.1_md3_ss1.0_msl10
   Average score: 0.8145
   Individual scores: ['0.9448', '0.9075', '0.7802', '0.9430', '0.5847', '0.6026', '0.9388']

2. gradient_boosting_prenormalized_base_ne100_lr0.05_md3_ss1.0_msl10
   Average score: 0.8142
   Individual scores: ['0.9448', '0.9081', '0.7789', '0.9325', '0.5811', '0.6092', '0.9450']

3. gradient_boosting_prenormalized_base_ne50_lr0.05_md5_ss1.0_msl1
   Average score: 0.8134
   Individual scores: ['0.9537', '0.9072', '0.7894', '0.9283', '0.6089', '0.5835', '0.9229']



In [5]:
import pandas as pd

def compare_datasets(path_1, path_2):
    # Load datasets
    df1 = pd.read_csv(path_1)
    df2 = pd.read_csv(path_2)

    # 1. Check column equality
    if set(df1.columns) != set(df2.columns):
        print("❌ Column names do not match.")
        print("Only in dataset 1:", set(df1.columns) - set(df2.columns))
        print("Only in dataset 2:", set(df2.columns) - set(df1.columns))
        return
    else:
        print("✅ Column names match.")

    # Ensure same column order
    df2 = df2[df1.columns]

    # Composite key
    key_cols = ["dataset_name", "attribute_name"]

    # Set index for comparison
    df1 = df1.set_index(key_cols)
    df2 = df2.set_index(key_cols)

    # 2. Check for missing rows
    only_in_df1 = df1.index.difference(df2.index)
    only_in_df2 = df2.index.difference(df1.index)

    if len(only_in_df1) > 0:
        print("\n⚠️ Rows only in dataset 1:")
        print(only_in_df1.tolist())

    if len(only_in_df2) > 0:
        print("\n⚠️ Rows only in dataset 2:")
        print(only_in_df2.tolist())

    # 3. Compare values for common rows
    common_index = df1.index.intersection(df2.index)

    print("\n🔍 Value differences:")
    differences_found = False

    for col in df1.columns:
        diff_mask = df1.loc[common_index, col] != df2.loc[common_index, col]

        if diff_mask.any():
            differences_found = True
            for idx in common_index[diff_mask]:
                print(
                    f"Key={idx} | Column='{col}' | "
                    f"Dataset1='{df1.loc[idx, col]}' | "
                    f"Dataset2='{df2.loc[idx, col]}'"
                )

    if not differences_found:
        print("✅ No value differences found.")

if __name__ == "__main__":
    compare_datasets("C:/Projects/freyja_plus_more/profiles/profiles_santos_small.csv", 
                     "C:/Projects/FREYJA2/profiles/santos_small.csv")


✅ Column names match.

🔍 Value differences:
Key=('animal_tag_data_a.csv', 'serial_no') | Column='first_word' | Dataset1='nan' | Dataset2='nan'
Key=('civic_building_locations_1.csv', 'address2') | Column='first_word' | Dataset1='nan' | Dataset2='nan'
Key=('complaint_by_practice_0.csv', 'BusinessState') | Column='first_word' | Dataset1='nan' | Dataset2='nan'
Key=('complaint_by_practice_0.csv', 'BusinessZip') | Column='first_word' | Dataset1='nan' | Dataset2='nan'
Key=('complaint_by_practice_1.csv', 'BusinessStreetLine1') | Column='first_word' | Dataset1='nan' | Dataset2='nan'
Key=('complaint_by_practice_1.csv', 'BusinessState') | Column='first_word' | Dataset1='nan' | Dataset2='nan'
Key=('complaint_by_practice_2.csv', 'BusinessState') | Column='first_word' | Dataset1='nan' | Dataset2='nan'
Key=('complaint_by_practice_2.csv', 'BusinessZip') | Column='first_word' | Dataset1='nan' | Dataset2='nan'
Key=('complaint_by_practice_3.csv', 'BusinessState') | Column='first_word' | Dataset1='nan' | 

In [10]:
from pathlib import Path

path = Path('profiles/omnimatch_culture_recreation.csv')
path_with_suffix = path.with_stem(path.stem + "_normalized")
output_profiles_path_preprocessed = path_with_suffix.with_suffix(".pkl")

print(path_with_suffix)
print(output_profiles_path_preprocessed)

profiles\omnimatch_culture_recreation_normalized.csv
profiles\omnimatch_culture_recreation_normalized.pkl
