In [67]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, f1_score, precision_score, matthews_corrcoef
from joblib import dump
from sklearn.model_selection import train_test_split
from json import load
import os

In [68]:
def get_metrics(y_true, y_pred, model_name, pca_option):

    return {
        "model_name" : model_name,
        "apply_pca":pca_option,
        "precision" : precision_score(y_true, y_pred),
        "recall" : recall_score(y_true, y_pred),
        "f1_score" : f1_score(y_true, y_pred),
        "mcc" : matthews_corrcoef(y_true, y_pred)
    }

In [69]:
with open("config_hyp_params.json", "r") as doc_import:
    dict_configs = load(doc_import)
dict_configs

{'hist_model': {'random_state': 42, 'loss': 'log_loss'},
 'RF_model': {'random_state': 42, 'n_estimators': 500},
 'SVC': {'kernel': 'poly', 'degree': 5},
 'LR': {'penalty': 'l2', 'random_state': 42}}

In [70]:
data = load_breast_cancer()
X = data.data
y = data.target

In [71]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42, shuffle=True)

In [72]:
df_test = pd.DataFrame(data=X_test, columns=data.feature_names)
df_test["label"] = y_test
df_test.to_csv("my_test_data.csv", index=False)

In [73]:
dict_models = {
    ("hist_model", HistGradientBoostingClassifier),
    ("RF_model", RandomForestClassifier),
    ("SVC", SVC),
    ("LR", LogisticRegression)
}

In [74]:
list_metrics = []

for pca_option in [True, False]:
    for element in dict_models:

        algorithm = element[1]
        name_algorithm = element[0]

        print("Processing: ", name_algorithm, pca_option)

        if pca_option:
            my_pipeline = Pipeline(
                [("scaler", StandardScaler()),
                 ("pca", PCA(n_components=10, random_state=42)),
                ("model", algorithm(**dict_configs[name_algorithm]))]
            )
        else:
            my_pipeline = Pipeline(
                [("scaler", StandardScaler()),
                ("model", algorithm(**dict_configs[name_algorithm]))]
            )

        my_pipeline.fit(X_train, y_train)
        y_pred = my_pipeline.predict(X_test)
        list_metrics.append(get_metrics(y_test, y_pred, name_algorithm, pca_option))

        os.makedirs(f"models/{name_algorithm}_{pca_option}", exist_ok=True)

        dump(my_pipeline, f"models/{name_algorithm}_{pca_option}/my_model.joblib")
df_metrics = pd.DataFrame(list_metrics)
df_metrics

Processing:  hist_model True
Processing:  SVC True
Processing:  LR True
Processing:  RF_model True
Processing:  hist_model False
Processing:  SVC False
Processing:  LR False
Processing:  RF_model False


Unnamed: 0,model_name,apply_pca,precision,recall,f1_score,mcc
0,hist_model,True,0.985714,0.971831,0.978723,0.944408
1,SVC,True,0.771739,1.0,0.871166,0.628366
2,LR,True,0.985915,0.985915,0.985915,0.96266
3,RF_model,True,0.957746,0.957746,0.957746,0.887979
4,hist_model,False,0.972222,0.985915,0.979021,0.943898
5,SVC,False,0.771739,1.0,0.871166,0.628366
6,LR,False,0.972222,0.985915,0.979021,0.943898
7,RF_model,False,0.958904,0.985915,0.972222,0.925285


In [75]:
df_metrics.sort_values(by="mcc", ascending=False)

Unnamed: 0,model_name,apply_pca,precision,recall,f1_score,mcc
2,LR,True,0.985915,0.985915,0.985915,0.96266
0,hist_model,True,0.985714,0.971831,0.978723,0.944408
6,LR,False,0.972222,0.985915,0.979021,0.943898
4,hist_model,False,0.972222,0.985915,0.979021,0.943898
7,RF_model,False,0.958904,0.985915,0.972222,0.925285
3,RF_model,True,0.957746,0.957746,0.957746,0.887979
1,SVC,True,0.771739,1.0,0.871166,0.628366
5,SVC,False,0.771739,1.0,0.871166,0.628366
