# Double-feature Models

In [1]:
import datetime
from pathlib import Path
import pickle
import sys
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import make_column_selector, make_column_transformer, ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import rand_score

dir_generic_files = Path("/home/enrico/shared_virtualbox/phd_projects_Enrico_Gandini/phd_project_similarity_prediction/generic_input_files/")
sys.path.append(dir_generic_files.as_posix())
import machine_learning_helpers as mlh

## Define directories and files

In [2]:
#The current notebook should be
#in the main directory containing queried results.
dir_results = Path.cwd()

Select the date when the survey ended, and define the directory containing survey results up to that date.

In [3]:
date_end_survey = datetime.date(year=2021, month=6, day=28)
dir_queries = Path(dir_results, f"queried_heroku_{date_end_survey}")

Load DataFrame containing aggregated survey answer. The dataset was produced by the "retrieve answers" script, using SQLAlchemy.

In [4]:
file_agg = Path(dir_queries, "aggregated_survey_answers.csv")
df_agg = pd.read_csv(file_agg, index_col='id_chosenPair')
df_agg

Unnamed: 0_level_0,id_subsetPair,smiles_molecule_a,smiles_molecule_b,tanimoto_cdk_Extended,TanimotoCombo,pchembl_distance,target_name,simil_2D,simil_3D,dissimil_2D,dissimil_3D,id_molecule_a,id_molecule_b,id_randPair,pair_type,n_answers,n_similar,frac_similar
id_chosenPair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,1,Cc1cscc1-c1cccnc1,Cc1ccsc1-c1cccnc1,0.567010,1.989,0.31,CYP2D6,0,1,1,0,84,127,1493,"dis2D,sim3D",22,18,0.818182
2,8,O=S(=O)(c1ccccc1)c1ccc(/C=C/c2ccc(F)cc2F)cc1,O=S(=O)(c1ccc(F)cc1)c1ccc(/C=C/c2ccc(F)cc2)nc1,0.532051,1.782,0.34,HERG,0,1,1,0,491,903,1902,"dis2D,sim3D",16,9,0.562500
3,10,NCc1ccc(-c2ccccc2)o1,CNCc1ccc(-c2cccnc2)o1,0.549206,1.778,0.49,CYP2D6,0,1,1,0,22,23,3000,"dis2D,sim3D",21,8,0.380952
4,11,CCCCCCCN(CC)CC#CCc1ccc(Cl)cc1,CCCCCCCN(CC)CC#Cc1ccc(C)cc1,0.558952,1.764,1.30,HERG,0,1,1,0,479,1233,136,"dis2D,sim3D",20,15,0.750000
5,13,CN(C)Cc1ccc(-c2cccnc2)s1,CNCc1ccc(-c2cccnc2)o1,0.452685,1.757,0.50,CYP2D6,0,1,1,0,15,23,945,"dis2D,sim3D",23,15,0.652174
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,7205,OCC1CC2(c3ccccc3)NC1CCC2NCc1cc(OC(F)(F)F)ccc1O...,C[C@@H]1CCCN1CCc1ccc2nc(-c3csc(-c4cc(Cl)nc(Cl)...,0.399689,0.741,1.28,HERG,0,0,1,1,886,1243,271,"dis2D,dis3D",25,1,0.040000
97,7233,Cc1ccc2c(c1C)N1[C@H](CNC[C@H]1C)C2,COc1cc2c(cc1C(F)(F)F)N(C(=O)Nc1cncc(-c3ccc(F)c...,0.400709,0.625,0.03,5HT2B,0,0,1,1,289,612,2057,"dis2D,dis3D",18,0,0.000000
98,7344,COc1ccc(N2Cc3c(c4cc(Cl)c(Cl)cc4n3C)C2=O)cc1OCC...,O=C1NC2CCNCCN2c2ccccc21,0.405573,0.657,0.47,5HT2B,0,0,1,1,56,305,2413,"dis2D,dis3D",24,4,0.166667
99,7500,Cc1nc2ccccn2c1-c1ccc2cc(CCN3CCC[C@H]3C)ccc2n1,CN(C)CCCn1nc(C2=C(c3cn(-c4ccc5ccccc5c4)c4ccccc...,0.413889,0.762,1.90,HERG,0,0,1,1,480,1004,1596,"dis2D,dis3D",16,2,0.125000


In [5]:
df_agg.columns

Index(['id_subsetPair', 'smiles_molecule_a', 'smiles_molecule_b',
       'tanimoto_cdk_Extended', 'TanimotoCombo', 'pchembl_distance',
       'target_name', 'simil_2D', 'simil_3D', 'dissimil_2D', 'dissimil_3D',
       'id_molecule_a', 'id_molecule_b', 'id_randPair', 'pair_type',
       'n_answers', 'n_similar', 'frac_similar'],
      dtype='object')

Directory that will contain fitted models.

In [6]:
dir_models = Path(dir_results, "models_3Classes")
dir_models.mkdir(exist_ok=True)

Directory that will contain analyses and visualization of models.

In [7]:
dir_models_analysis = Path(dir_models, "analysis")
dir_models_analysis.mkdir(exist_ok=True)

## Define variables
Contained in the input data, and necessary to create nice figures.

In [8]:
colname_score_2d = "tanimoto_cdk_Extended"
colname_score_3d = "TanimotoCombo"

colname_dist = "pchembl_distance"

colname_target = "target_name"

colname_subset = "pair_type"

colname_pair = "id_surveyPair"

colname_n_ans = "n_answers"

colname_n_simil = "n_similar"

colname_frac_simil = "frac_similar"

In [9]:
categories_subset = ["dis2D,dis3D",
                     "dis2D,sim3D",
                     "sim2D,dis3D",
                     "sim2D,sim3D",
                     ]

In [10]:
targets_names = ["HERG",
                 "5HT2B",
                 "CYP2D6",
                 ]

In [11]:
nicename_score_2d = (colname_score_2d
                     .replace("_", " ")
                     .title()
                     .replace("Cdk", "CDK")
                     )
nicename_score_3d = colname_score_3d
nicenames_scores = {colname_score_2d: nicename_score_2d,
                    colname_score_3d: nicename_score_3d,
                    }

nicename_dist = "pChEMBL Distance"

nicename_target = "Target"

nicename_subset = "Pair Type"

nicename_similar = "Similar"

nicename_experience = "Academic Qualification"

nicename_n_ans = "Number of Answers"

nicename_ans_percent = "Answer Percentage"

nicename_simil_percent = "Similarity Percentage"

nicename_pair = "Pair ID"

nicename_n_pairs = "Number of Pairs"

nicename_percent_pairs_subset = "Pair Percentage in each subset"

In [12]:
FIGSIZE_GOLD = (9.556, 5.906) # Golden Rectangle / (in, in)
FIGSIZE_PAGE = (8.27, 11.69) #A4 page without margins.
FIGSIZE_SQUARE = (FIGSIZE_GOLD[0], FIGSIZE_GOLD[0])

FONTSIZE = 14

kwargs_fig_basic = {"constrained_layout": True, "figsize": FIGSIZE_GOLD}

In [13]:
n_pairs_each_subset = 25

In [14]:
lim_score_2d = (0, 1)
lim_score_3d = (0, 2)
lims_scores = {colname_score_2d: lim_score_2d,
               colname_score_3d: lim_score_3d,
               }

lim_percent = (0, 100)

## Define Training set

In [15]:
colnames_features = [colname_score_2d, colname_score_3d]

X_train = df_agg.loc[:, colnames_features].copy()
n_train = X_train.shape[0]

Training set `y` labels will also be added to the original DataFrame, for easier analyses.

In [16]:
human_similarity = df_agg[colname_frac_simil] # fraction of human experts that considered a pair of molecules to be similar

thres_low = 0.4
thres_high = 0.6
#(!)The last break is `1.0001` instead of `1.0` since
#I need non-overlapping intervals
#(so, `closed="both"` option in `from_breaks` method is not possible),
#and I need to include the right extreme `1.0`
#(otherwise, pairs with `frac_similar` equals `1.0` would be in a NaN bin).
breaks = [0, thres_low, thres_high, 1.00001]
bins_classes = pd.IntervalIndex.from_breaks(breaks,
                                            closed="left",
                                            )
labels_bins_classes = ["Not Similar", "Uncertain", "Similar"]

y_train = pd.cut(human_similarity,
                 bins=bins_classes,
                 )
y_train = y_train.cat.rename_categories(labels_bins_classes)

colname_judged = "judged_similar"
y_train.name = colname_judged
df_agg[colname_judged] = y_train

#Check that all items in `human_similarity` were assigned to a bin.
if y_train.isna().any():
    raise ValueError("NaN values in `y_train`! Check the binning!")

y_train

id_chosenPair
1          Similar
2        Uncertain
3      Not Similar
4          Similar
5          Similar
          ...     
96     Not Similar
97     Not Similar
98     Not Similar
99     Not Similar
100    Not Similar
Name: judged_similar, Length: 100, dtype: category
Categories (3, object): ['Not Similar' < 'Uncertain' < 'Similar']

Counts of new multi-class labelling.

In [17]:
counts_y_train = (pd.value_counts(y_train)
                  .rename("train_labels")
                  .loc[labels_bins_classes]
                  .to_frame()
                  )
counts_y_train

Unnamed: 0,train_labels
Not Similar,35
Uncertain,22
Similar,43


## Define all Logistic Regression models

Define `kwargs` of Logistic Regression models with all penalty types.

In [18]:
seed_rand = 1
max_iter = 2000
solver = "saga" # Supports all logistic regression penalties.
class_weight = "balanced" #Balance an unbalanced multi-class dataset.


kwargs_base = {"random_state": seed_rand,
               "max_iter": max_iter,
               "solver": solver,
               "class_weight": class_weight,
               }

kwargs_noreg = kwargs_base.copy()
kwargs_noreg["penalty"]: "none"

kwargs_l1 = kwargs_base.copy()
kwargs_l1["penalty"] = "l1"

kwargs_l2 = kwargs_base.copy()
kwargs_l2["penalty"] = "l2"

kwargs_enet = kwargs_base.copy()
kwargs_enet["penalty"] = "elasticnet"
#Elasticnet also requires specification of `l1_ratio` hyperparameter.
ratio_default = 0.5
kwargs_enet_default = kwargs_enet.copy()
kwargs_enet_default["l1_ratio"] = ratio_default

### Define Decision Tree and Random Forest Models

In [19]:
kwargs_decisiontree = {"random_state": seed_rand,
                       "class_weight": class_weight,
                       }

kwargs_randomforest = kwargs_decisiontree.copy()

Define models that will be trained with default hyperparameters

In [20]:
initial_models_default = {"noreg": LogisticRegression(**kwargs_noreg),
                          "l1": LogisticRegression(**kwargs_l1),
                          "l2": LogisticRegression(**kwargs_l2),
                          "enet": LogisticRegression(**kwargs_enet_default),
                          "decisiontree": DecisionTreeClassifier(**kwargs_decisiontree),
                          "randomforest": RandomForestClassifier(**kwargs_randomforest),
                          }

## Fit Logistic Regression models with default hyperparameters

In [21]:
fitted_models_default = {}

for name, model in initial_models_default.items():
    model.fit(X_train, y_train)
    
    fitted_models_default[name] = model
    print(f"fitted {name}")

fitted noreg
fitted l1
fitted l2
fitted enet
fitted decisiontree
fitted randomforest


### Evaluate models based on default hyperparameters
On training-set.

In [22]:
df_evals_default = [] #will contain results of all scores for default models
merged_counts_pred_default = []
df_fracs_correct_default = []
for name, model in fitted_models_default.items():
    pred = model.predict(X_train)
    df_agg[f"pred_default_{name}"] = pred
    
    #Count classes predicted by the model.
    counts_pred = (pd.value_counts(pred)
                   .rename(name)
                   .loc[labels_bins_classes]
                   )
    merged_counts_pred_default.append(counts_pred)
    
    #Fractions of correct predictions
    fracs_correct = mlh.fracs_correct_predictions(model,
                                                  X_train,
                                                  y_train,
                                                  )
    fracs_correct = (fracs_correct
                     .loc[labels_bins_classes]
                     .rename(name)
                     )
    df_fracs_correct_default.append(fracs_correct)
    
    correct = (y_train == pred).astype(int)
    n_correct = correct.sum()
    rand_index = rand_score(y_train, pred)
    print(f"Model `{name}` correctly predicted {n_correct}/{n_train} pairs.")
    df_agg[f"correct_default_{name}"] = correct
    
    tmp = {"model": name,
           "n_correct": n_correct,
           "rand_index": rand_index,
           }
    metrics_dict = mlh.various_metrics_multi_classification(model=model,
                                                            metrics=mlh.METRICS_MULTI_PROBA,
                                                            X=X_train,
                                                            y_true=y_train,
                                                            )
    tmp.update(metrics_dict)
    df_evals_default.append(tmp)


merged_counts_pred_default = pd.DataFrame(merged_counts_pred_default).T
df_fracs_correct_default = pd.DataFrame(df_fracs_correct_default).T
df_evals_default = pd.DataFrame(df_evals_default)
df_evals_default

Model `noreg` correctly predicted 77/100 pairs.
Model `l1` correctly predicted 77/100 pairs.
Model `l2` correctly predicted 77/100 pairs.
Model `enet` correctly predicted 76/100 pairs.
Model `decisiontree` correctly predicted 100/100 pairs.
Model `randomforest` correctly predicted 100/100 pairs.


Unnamed: 0,model,n_correct,rand_index,roc_auc_score,average_precision_score,log_loss,brier_multi_score_loss
0,noreg,77,0.756768,0.912677,0.83983,0.6189014,0.344406
1,l1,77,0.766061,0.916768,0.844511,0.5299102,0.298519
2,l2,77,0.756768,0.912677,0.83983,0.6189014,0.344406
3,enet,76,0.747475,0.917173,0.843682,0.5894334,0.327995
4,decisiontree,100,1.0,1.0,1.0,2.109424e-15,0.0
5,randomforest,100,1.0,1.0,1.0,0.1274677,0.050574


In [23]:
df_latex_default = (df_evals_default
                    .set_index("model")
                    .rename(columns=mlh.MAP_CLASSIF_METRICS_LATEX)
                    .drop(columns="rand_index")
                    #.T
                    )
print(df_latex_default.to_latex(float_format="%.3f"))

\begin{tabular}{lrrrrr}
\toprule
{} &  N\$\_\{correct\}\$ &  AUROC &  AveP &  L\$\_\{log\}\$ &  L\$\_\{Brier\}\$ \\
model        &                &        &       &            &              \\
\midrule
noreg        &             77 &  0.913 & 0.840 &      0.619 &        0.344 \\
l1           &             77 &  0.917 & 0.845 &      0.530 &        0.299 \\
l2           &             77 &  0.913 & 0.840 &      0.619 &        0.344 \\
enet         &             76 &  0.917 & 0.844 &      0.589 &        0.328 \\
decisiontree &            100 &  1.000 & 1.000 &      0.000 &        0.000 \\
randomforest &            100 &  1.000 & 1.000 &      0.127 &        0.051 \\
\bottomrule
\end{tabular}



Add the original true training labels to the prediction counts DataFrame, for clarity.

In [24]:
merged_counts_pred_default = pd.merge(left=counts_y_train,
                                      right=merged_counts_pred_default,
                                      left_index=True,
                                      right_index=True,
                                      )
merged_counts_pred_default

Unnamed: 0,train_labels,noreg,l1,l2,enet,decisiontree,randomforest
Not Similar,35,29,33,29,28,35,35
Uncertain,22,36,34,36,37,22,22
Similar,43,35,33,35,35,43,43


Correct predictions for each class, normalized by number of true instances of each class.

In [25]:
df_fracs_correct_default

Unnamed: 0,noreg,l1,l2,enet,decisiontree,randomforest
Not Similar,0.771429,0.857143,0.771429,0.742857,1.0,1.0
Uncertain,0.818182,0.772727,0.818182,0.818182,1.0,1.0
Similar,0.744186,0.697674,0.744186,0.744186,1.0,1.0


In [26]:
print(df_fracs_correct_default.mul(100).T.to_latex(float_format="%.1f"))

\begin{tabular}{lrrr}
\toprule
{} &  Not Similar &  Uncertain &  Similar \\
\midrule
noreg        &         77.1 &       81.8 &     74.4 \\
l1           &         85.7 &       77.3 &     69.8 \\
l2           &         77.1 &       81.8 &     74.4 \\
enet         &         74.3 &       81.8 &     74.4 \\
decisiontree &        100.0 &      100.0 &    100.0 \\
randomforest &        100.0 &      100.0 &    100.0 \\
\bottomrule
\end{tabular}



In [27]:
file_evals_default = Path(dir_models_analysis, "metrics_double_feature_default_models.csv")
df_evals_default.to_csv(file_evals_default, index=False)

file_counts_default = Path(dir_models_analysis, "pred_counts_double_feature_default_models.csv")
merged_counts_pred_default.to_csv(file_counts_default)

file_fracs_correct_default = Path(dir_models_analysis, "fracs_correct_double_feature_default_models.csv")
df_fracs_correct_default.to_csv(file_fracs_correct_default)

### Save default models

In [28]:
dir_default = Path(dir_models, "double_feature_default_models")
dir_default.mkdir(exist_ok=True)

In [29]:
for name, model in fitted_models_default.items():
    file_model = Path(dir_default, f"{name}.pickle")
    with open(file_model, "wb") as f:
        pickle.dump(model, f)

Define grids of hyperparameters, that will be later used for hyperparameter optimization.

All hyperparameters of Logistic Regression are about penalty, so the model with no penalty does not have hyperparameters, and hyperparameter optimization is not needed.

In [30]:
reg_strengths = np.geomspace(0.01, 100, num=20)
ratios = np.geomspace(0.01, 1, num=25)


grid_l1 = {"C": reg_strengths}

grid_l2 = grid_l1

grid_enet = {"C": reg_strengths,
             "l1_ratio": ratios,
             }


#Hyperparameters for Decision Tree and Random Forest
grid_decisiontree = {"min_samples_leaf": [1, 2, 5, 10],
                     "max_depth": [2, 3, 5, 10],
                     "min_samples_split": [2, 5, 10],
                     }
grid_randomforest = grid_decisiontree.copy()
grid_randomforest["n_estimators"] = [5, 10, 50, 100]


grids_hyperparams = {"l1": grid_l1,
                     "l2": grid_l2,
                     "enet": grid_enet,
                     "decisiontree": grid_decisiontree,
                     "randomforest": grid_randomforest,
                     }

In [31]:
kwargs_for_optim = {"l1": kwargs_l1,
                    "l2": kwargs_l2,
                    "enet": kwargs_enet,
                    "decisiontree": kwargs_decisiontree,
                    "randomforest": kwargs_randomforest,
                    }

n_folds_cv = 10


n_jobs = 5
initial_optimizers = {}
for name, grid in grids_hyperparams.items():
    kwargs_model = kwargs_for_optim[name]
    
    #Select estimator
    if name == "decisiontree":
        estimator = DecisionTreeClassifier
    elif name == "randomforest":
        estimator = RandomForestClassifier
    elif name in ["l1", "l2", "enet"]:
        estimator = LogisticRegression
    else:
        raise ValueError(f"`{name}` model not valid!")
    
    optimizer = GridSearchCV(estimator=estimator(**kwargs_model),
                             param_grid=grid,
                             cv=n_folds_cv,
                             n_jobs=n_jobs,
                             )
    initial_optimizers[name] = optimizer

In [32]:
fitted_optimizers = {}
for name, optimizer in initial_optimizers.items():
    start = time.perf_counter()
    optimizer.fit(X_train, y_train)
    delta_time = time.perf_counter() - start
    delta_time = datetime.timedelta(seconds=delta_time)
    print(f"Completed hyperparameter search of `{name}` "
          f"using {n_folds_cv}-fold CV in {delta_time} hours with {n_jobs} CPUs.")
    fitted_optimizers[name] = optimizer

Completed hyperparameter search of `l1` using 10-fold CV in 0:00:02.452887 hours with 5 CPUs.
Completed hyperparameter search of `l2` using 10-fold CV in 0:00:00.650883 hours with 5 CPUs.
Completed hyperparameter search of `enet` using 10-fold CV in 0:00:14.279661 hours with 5 CPUs.
Completed hyperparameter search of `decisiontree` using 10-fold CV in 0:00:01.164736 hours with 5 CPUs.
Completed hyperparameter search of `randomforest` using 10-fold CV in 0:00:44.504444 hours with 5 CPUs.


In [33]:
df_evals_optim = [] #will contain results of all scores for optim models
merged_counts_pred_optim = []
df_fracs_correct_optim = []
for name, model in fitted_optimizers.items():
    pred = model.predict(X_train)
    df_agg[f"pred_optim_{name}"] = pred
    
    #Count classes predicted by the model.
    counts_pred = (pd.value_counts(pred)
                   .rename(name)
                   .loc[labels_bins_classes]
                   )
    merged_counts_pred_optim.append(counts_pred)
    
    #Fractions of correct predictions
    fracs_correct = mlh.fracs_correct_predictions(model,
                                                  X_train,
                                                  y_train,
                                                  )
    fracs_correct = (fracs_correct
                     .loc[labels_bins_classes]
                     .rename(name)
                     )
    df_fracs_correct_optim.append(fracs_correct)
    
    correct = (y_train == pred).astype(int)
    n_correct = correct.sum()
    rand_index = rand_score(y_train, pred)
    print(f"Optimized `{name}` correctly predicted {n_correct}/{n_train} pairs.")
    df_agg[f"correct_optim_{name}"] = correct
    
    tmp = {"model": name,
           "n_correct": n_correct,
           "rand_index": rand_index,
           }
    metrics_dict = mlh.various_metrics_multi_classification(model=model,
                                                            metrics=mlh.METRICS_MULTI_PROBA,
                                                            X=X_train,
                                                            y_true=y_train,
                                                            )
    tmp.update(metrics_dict)
    df_evals_optim.append(tmp)


merged_counts_pred_optim = pd.DataFrame(merged_counts_pred_optim).T
df_fracs_correct_optim = pd.DataFrame(df_fracs_correct_optim).T
df_evals_optim = pd.DataFrame(df_evals_optim)
df_evals_optim

Optimized `l1` correctly predicted 78/100 pairs.
Optimized `l2` correctly predicted 76/100 pairs.
Optimized `enet` correctly predicted 76/100 pairs.
Optimized `decisiontree` correctly predicted 82/100 pairs.
Optimized `randomforest` correctly predicted 86/100 pairs.


Unnamed: 0,model,n_correct,rand_index,roc_auc_score,average_precision_score,log_loss,brier_multi_score_loss
0,l1,78,0.772727,0.888243,0.820946,0.691374,0.388863
1,l2,76,0.747475,0.911043,0.842784,0.639714,0.356719
2,enet,76,0.747475,0.911043,0.842784,0.639281,0.356467
3,decisiontree,82,0.801616,0.953035,0.894116,0.372318,0.227147
4,randomforest,86,0.839596,0.97412,0.956237,0.342734,0.198345


In [34]:
df_latex_optim = (df_evals_optim
                  .set_index("model")
                  .rename(columns=mlh.MAP_CLASSIF_METRICS_LATEX)
                  .drop(columns="rand_index")
                  #.T
                  )
print(df_latex_optim.to_latex(float_format="%.3f"))

\begin{tabular}{lrrrrr}
\toprule
{} &  N\$\_\{correct\}\$ &  AUROC &  AveP &  L\$\_\{log\}\$ &  L\$\_\{Brier\}\$ \\
model        &                &        &       &            &              \\
\midrule
l1           &             78 &  0.888 & 0.821 &      0.691 &        0.389 \\
l2           &             76 &  0.911 & 0.843 &      0.640 &        0.357 \\
enet         &             76 &  0.911 & 0.843 &      0.639 &        0.356 \\
decisiontree &             82 &  0.953 & 0.894 &      0.372 &        0.227 \\
randomforest &             86 &  0.974 & 0.956 &      0.343 &        0.198 \\
\bottomrule
\end{tabular}



In [35]:
merged_counts_pred_optim = pd.merge(left=counts_y_train,
                                    right=merged_counts_pred_optim,
                                    left_index=True,
                                    right_index=True,
                                    )
merged_counts_pred_optim

Unnamed: 0,train_labels,l1,l2,enet,decisiontree,randomforest
Not Similar,35,32,28,28,35,29
Uncertain,22,26,37,37,32,26
Similar,43,42,35,35,33,45


Correct predictions for each class, normalized by number of true instances of each class.

In [36]:
df_fracs_correct_optim

Unnamed: 0,l1,l2,enet,decisiontree,randomforest
Not Similar,0.857143,0.742857,0.742857,0.885714,0.8
Uncertain,0.636364,0.818182,0.818182,0.863636,0.818182
Similar,0.790698,0.744186,0.744186,0.744186,0.930233


In [37]:
print(df_fracs_correct_optim.mul(100).T.to_latex(float_format="%.1f"))

\begin{tabular}{lrrr}
\toprule
{} &  Not Similar &  Uncertain &  Similar \\
\midrule
l1           &         85.7 &       63.6 &     79.1 \\
l2           &         74.3 &       81.8 &     74.4 \\
enet         &         74.3 &       81.8 &     74.4 \\
decisiontree &         88.6 &       86.4 &     74.4 \\
randomforest &         80.0 &       81.8 &     93.0 \\
\bottomrule
\end{tabular}



In [38]:
file_evals_optim = Path(dir_models_analysis, "metrics_double_feature_optimized_models.csv")
df_evals_optim.to_csv(file_evals_optim, index=False)

file_counts_optim = Path(dir_models_analysis, "pred_counts_double_feature_optimized_models.csv")
merged_counts_pred_optim.to_csv(file_counts_optim)

file_fracs_correct_optim = Path(dir_models_analysis, "fracs_correct_double_feature_optimized_models.csv")
df_fracs_correct_optim.to_csv(file_fracs_correct_optim)

Concatenate the two evaluation DataFrames for an easier visual inspection.

In [39]:
#Before concatenation, add prefix to `model` column,
#to specify if models in concatenated DataFrame are default or optimized.
df_evals_default["model"] = "default_" + df_evals_default["model"]
df_evals_optim["model"] = "optimized_" + df_evals_optim["model"]


pd.concat([df_evals_default, df_evals_optim], ignore_index=True)

Unnamed: 0,model,n_correct,rand_index,roc_auc_score,average_precision_score,log_loss,brier_multi_score_loss
0,default_noreg,77,0.756768,0.912677,0.83983,0.6189014,0.344406
1,default_l1,77,0.766061,0.916768,0.844511,0.5299102,0.298519
2,default_l2,77,0.756768,0.912677,0.83983,0.6189014,0.344406
3,default_enet,76,0.747475,0.917173,0.843682,0.5894334,0.327995
4,default_decisiontree,100,1.0,1.0,1.0,2.109424e-15,0.0
5,default_randomforest,100,1.0,1.0,1.0,0.1274677,0.050574
6,optimized_l1,78,0.772727,0.888243,0.820946,0.6913743,0.388863
7,optimized_l2,76,0.747475,0.911043,0.842784,0.6397141,0.356719
8,optimized_enet,76,0.747475,0.911043,0.842784,0.6392807,0.356467
9,optimized_decisiontree,82,0.801616,0.953035,0.894116,0.3723178,0.227147


***Notes***:
* For optimized models, Elastic Net became identical to L1 (I also checked parameters and hyperparameters) $\Rightarrow$ **do not use optimized Elastic Net**!
* Best model is Random Forest, followed by Decision Tree.
* Decision Tree and Random Forest with default parameters are probably overfit (a lot): all correct predictions; they probably memorized each sample in the training-set.
* After hyperparameter optimization, Decision Tree and Random Forest got worse than default models, but they are now probably not overfit. And they are better than models based on Logistic Regression.
* Random Forest, after optimization, is quite significantly better than Decision Tree. Let's hope that it is not overfit (and it should not be overfit, since Random Forests are meant to reduce overfitting of Decision Trees).

### Save Optimized Models

In [40]:
dir_optim = Path(dir_models, "double_feature_optimized_models")
dir_optim.mkdir(exist_ok=True)

In [41]:
for name, optimizer in fitted_optimizers.items():
    #Select final model with best hyperparameters,
    #refitted on whole dataset.
    model = optimizer.best_estimator_
    file_model = Path(dir_optim, f"{name}.pickle")
    with open(file_model, "wb") as f:
        pickle.dump(model, f)