In [40]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import time
import warnings
import xgboost
#import catboost
import lightgbm

from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.utils import all_estimators
from sklearn.base import RegressorMixin
from sklearn.base import ClassifierMixin
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    precision_score, 
    recall_score,
    confusion_matrix,
    roc_auc_score,
    f1_score,
    r2_score,
    mean_squared_error,
    classification_report
)
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    StratifiedKFold,
    RandomizedSearchCV,
    GridSearchCV,
    learning_curve,
    validation_curve
)
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

from scipy import stats
from pca import pca
from IPython.display import display
import dataframe_image as dfi

from src.visualization import feature_importances_plot

# Silence every Python warning for the rest of the session, whatever its category.
warnings.filterwarnings("ignore")
# Show floats in pandas output with two decimals.
pd.set_option("display.precision", 2)
pd.set_option("display.float_format", lambda x: "%.2f" % x)

# Names of scikit-learn classifiers that are left out of the automatic sweep.
removed_classifiers = [
    "ClassifierChain",
    "ComplementNB",
    "GradientBoostingClassifier",
    "GaussianProcessClassifier",
    "HistGradientBoostingClassifier",
    "MLPClassifier",
    "LogisticRegressionCV",
    "MultiOutputClassifier",
    "MultinomialNB",
    "OneVsOneClassifier",
    "OneVsRestClassifier",
    "OutputCodeClassifier",
    "RadiusNeighborsClassifier",
    "VotingClassifier",
]

# Names of scikit-learn regressors that are left out of the automatic sweep.
removed_regressors = [
    "TheilSenRegressor",
    "ARDRegression",
    "CCA",
    "IsotonicRegression",
    "StackingRegressor",
    "MultiOutputRegressor",
    "MultiTaskElasticNet",
    "MultiTaskElasticNetCV",
    "MultiTaskLasso",
    "MultiTaskLassoCV",
    "PLSCanonical",
    "PLSRegression",
    "RadiusNeighborsRegressor",
    "RegressorChain",
    "VotingRegressor",
]

# (name, class) pairs of every scikit-learn classifier that is not excluded above.
CLASSIFIERS = [
    (est_name, est_class)
    for est_name, est_class in all_estimators()
    if est_name not in removed_classifiers and issubclass(est_class, ClassifierMixin)
]

# (name, class) pairs of every scikit-learn regressor that is not excluded above.
REGRESSORS = [
    (est_name, est_class)
    for est_name, est_class in all_estimators()
    if est_name not in removed_regressors and issubclass(est_class, RegressorMixin)
]

# Gradient-boosting libraries are registered next to the scikit-learn estimators.
# CatBoost is not registered: its import is commented out at the top of the file.
REGRESSORS.extend(
    [
        ("XGBRegressor", xgboost.XGBRegressor),
        ("LGBMRegressor", lightgbm.LGBMRegressor),
    ]
)

CLASSIFIERS.extend(
    [
        ("XGBClassifier", xgboost.XGBClassifier),
        ("LGBMClassifier", lightgbm.LGBMClassifier),
    ]
)

# Numeric columns: constant imputation (no fill_value given, so scikit-learn's
# default constant for numeric data is used), then standardisation.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant")),
        ("scaler", StandardScaler()),
    ]
)

# Low-cardinality categorical columns: missing values become the literal
# category "missing", then one-hot encoding; categories unseen during fit are
# encoded as all zeros.
categorical_transformer_low = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("encoding", OneHotEncoder(handle_unknown="ignore", sparse=False)),
    ]
)

# High-cardinality categorical columns: same imputation, then integer codes.
categorical_transformer_high = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        # A plain OrdinalEncoder raises a ValueError when transform() meets a
        # category that was not present during fit (e.g. one that only occurs
        # in the test split). Unknown categories are mapped to -1 instead, a
        # code that can never collide with a fitted category (those are >= 0).
        (
            "encoding",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
        ),
    ]
)

In [41]:
# Seed for the randomised steps of the notebook (e.g. train_test_split).
RANDOM_STATE = 42
# Parallelism setting; presumably follows the scikit-learn n_jobs convention
# where -1 means "all cores" -- confirm where it is consumed.
N_JOBS = -1
# Short display labels for the three taxa; presumably in the same order as the
# integer codes of map_target -- confirm.
class_names = ["Canis", "Dysg. Equisimilis", "Dysg. Dysgalactiae"]

# Full taxon name -> integer class code.
map_target = {
    "Streptococcus canis": 0,
    "Streptococcus dysgalactiae subsp. equisimilis": 1,
    "Streptococcus dysgalactiae subsp. dysgalactiae": 2
}

# Integer class code -> abbreviated taxon name (for display).
map_target_inv = {
    0: "Strept. canis",
    1: "Strept. dysg. equisimilis",
    2: "Strept. dysg. dysgalactiae"
}

# Antibiotic test result -> binary label; applied with Series.map in the
# experiment loop when the 'antibiotici' target group is enabled.
map_target_antibiotici = {
    "S" : 1,
    "NS" : 0
}
# Column layout of the input CSV: the peak columns begin at position `start`,
# followed by n_antibiotici, n_geni and n_virulenza columns, in that order.
start = 9
n_antibiotici = 9
n_geni = 27
n_virulenza = 18
# Peak-count variants to process; each value is inserted into the CSV file
# name and also gives the number of peak columns to read.
#n_picchi = ['46','306']
n_picchi = ['46']
# NOTE(review): the experiment loop calls makeDictBest with its own default
# n_best and does not pass N_BEST -- confirm which value is intended.
N_BEST = 3

In [58]:
# Class labels assumed when the grid below is built (binary targets).
# NOTE(review): the experiment loop later rebinds `n_classes`, but the grid is
# already built by then, so BernoulliNB's explicit `class_prior` always has two
# entries and cannot match a target with more than two classes.
n_classes = [0,1]

# Hyper-parameter search spaces, keyed by estimator class name. makeTuning
# looks an estimator up by name and searches the matching space; estimators
# without an entry are not tuned.
# NOTE(review): several spaces are very large (e.g. RandomForestClassifier)
# and some combinations are not valid together (e.g. LogisticRegression
# penalty/solver pairs), so an exhaustive search spends a lot of time on fits
# that fail -- consider a randomised search or smaller spaces.
param_grid = {
    'LogisticRegression': {
        'C': np.logspace(-4, 4, 20),
        'penalty': ['l1', 'l2', 'elasticnet', 'none'],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'fit_intercept': [True, False],
        'intercept_scaling': [0.5, 1, 2],
        'class_weight': [None, 'balanced'],
    },
    'RidgeClassifier': {'alpha': np.logspace(-5, 5, 100)},
    'DecisionTreeClassifier': {
        'criterion': ['gini', 'entropy', 'log_loss'],
        'splitter': ['best', 'random'],
        'max_depth': [2*n for n in range(1,10)],
        # 'auto' was dropped: for classifiers it is the same as 'sqrt', and it
        # is rejected by scikit-learn >= 1.3.
        'max_features': ['sqrt', 'log2'],
        'min_samples_leaf': [1, 2, 4],
        'min_samples_split': [2, 5, 10],
        'class_weight': [None, 'balanced'],
    },
    'KNeighborsClassifier': {
        'n_neighbors': list(range(1, 20)),
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'p': [1,2],
    },
    'RandomForestClassifier': {
        'n_estimators': range(10, 100),
        # 'auto' was dropped for the same reason as in DecisionTreeClassifier.
        'max_features': ['sqrt', 'log2'],
        'max_depth': [2*n for n in range(1,10)],
        'min_samples_split': range(2, 15),
        'class_weight': [None, 'balanced'],
        'criterion': ['gini', 'entropy', 'log_loss'],
    },
    'BernoulliNB': {
        'alpha': [0.01, 0.1, 0.5, 1.0, 10.0],
        'fit_prior': [True, False],
        # NOTE(review): the explicit prior is not normalised (it sums to 0.2);
        # confirm that equal priors are what is intended.
        'class_prior': [None, [0.1,]* len(n_classes)],
        'binarize': [None, -5, 0.0, 5, 10.0],
    },
    'GaussianNB': {'var_smoothing': np.logspace(0,-9, num=20)},
    'NearestCentroid': {
        'shrink_threshold': np.logspace(0, 1, 20),
        'metric': ['euclidean', 'manhattan'],
    },
    'SVC': {
        'C': np.logspace(-3, 3, 10),
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'degree': range(2,5),
        'gamma': np.logspace(-3, 1, 10),
    },
    'SGDClassifier': {
        'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
        'penalty': ['l1', 'l2', 'elasticnet'],
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
        # The weight dictionaries only name the labels 0 and 1.
        'class_weight': [{1:0.5, 0:0.5}, {1:0.4, 0:0.6}, {1:0.6, 0:0.4}, {1:0.7, 0:0.3}],
        'eta0': [1, 10, 100],
    },
}

In [52]:
def get_card_split(df, cols, n=11):
    """
    Partition categorical columns by cardinality (number of distinct values).

    Parameters
    ----------
    df : Pandas DataFrame
        Frame whose columns are inspected.
    cols : pandas Index (or other object supporting boolean-mask indexing)
        Names of the categorical columns to partition.
    n : int, optional (default=11)
        Cardinality threshold.

    Returns
    -------
    card_low : same type as ``cols``
        Columns with at most ``n`` distinct values.
    card_high : same type as ``cols``
        Columns with more than ``n`` distinct values.
    """
    distinct_counts = df[cols].nunique()
    exceeds_threshold = distinct_counts > n
    low_cardinality = cols[~exceeds_threshold]
    high_cardinality = cols[exceeds_threshold]
    return low_cardinality, high_cardinality

def makeTuning(model, model_name, X_train, y_train, cv=None, n_iter=None):
    """
    Search the hyper-parameter space registered for ``model_name``.

    Parameters
    ----------
    model : class
        Estimator class; it is instantiated with no arguments for the search.
    model_name : str
        Key looked up in the module-level ``param_grid``.
    X_train, y_train : array-like
        Training data passed to the search as-is.
    cv : cross-validation splitter, optional (default=None)
        When None, a 5-fold shuffled ``StratifiedKFold`` seeded with
        ``RANDOM_STATE`` is created, so the function no longer depends on a
        global ``skfold`` having been defined by another cell first.
    n_iter : int, optional (default=None)
        When None the whole space is searched exhaustively (``GridSearchCV``).
        When given, ``n_iter`` candidates are sampled with
        ``RandomizedSearchCV``, which keeps large spaces tractable.

    Returns
    -------
    dict or None
        Best parameters by cross-validated accuracy, or None when no search
        space is registered for ``model_name``.
    """
    params = param_grid.get(model_name)
    if params is None:
        return None

    if cv is None:
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

    if n_iter is None:
        search = GridSearchCV(
            estimator=model(),
            param_grid=params,
            scoring="accuracy",
            n_jobs=N_JOBS,
            cv=cv,
            verbose=0,
        )
    else:
        search = RandomizedSearchCV(
            estimator=model(),
            param_distributions=params,
            n_iter=n_iter,
            scoring="accuracy",
            n_jobs=N_JOBS,
            cv=cv,
            random_state=RANDOM_STATE,
            verbose=0,
        )

    search.fit(X_train, y_train)
    return search.best_params_

In [54]:
class LazyClassifierCustom:
    """
    Fit a collection of classifiers with their default settings and, where a
    search space is registered, with tuned hyper-parameters, then rank all of
    them on a held-out split.

    Parameters
    ----------
    verbose : int, optional (default=0)
        When positive, the metrics of each model are printed as soon as they
        are available.
    ignore_warnings : bool, optional (default=True)
        When False, the reason a model failed (or its ROC AUC could not be
        computed) is printed; when True the failure is skipped silently.
    custom_metric : callable, optional (default=None)
        Called as ``custom_metric(y_test, y_pred)``; the result is added to the
        score table under the callable's ``__name__``.
    predictions : bool, optional (default=False)
        When True, ``fit`` also returns the test predictions of every model.
    random_state : int, optional (default=42)
        Passed to every classifier that exposes a ``random_state`` parameter.
    classifiers : "all" or list, optional (default="all")
        "all" trains every entry of the module-level ``CLASSIFIERS`` list;
        otherwise a list of classifier classes (``(name, class)`` pairs are
        accepted as well).
    """

    def __init__(
        self,
        verbose=0,
        ignore_warnings=True,
        custom_metric=None,
        predictions=False,
        random_state=42,
        classifiers="all",
    ):
        self.verbose = verbose
        self.ignore_warnings = ignore_warnings
        self.custom_metric = custom_metric
        self.predictions = predictions
        self.models = {}
        self.random_state = random_state
        self.classifiers = classifiers

    def _resolve_classifiers(self):
        """Return the ``(name, class)`` pairs to train without modifying ``self.classifiers``."""
        if isinstance(self.classifiers, str) and self.classifiers == "all":
            return CLASSIFIERS
        try:
            return [
                entry if isinstance(entry, tuple) else (entry.__name__, entry)
                for entry in self.classifiers
            ]
        except Exception as exception:
            print(exception)
            print("Invalid Classifier(s)")
            # Same fallback as before: hand the value over unchanged.
            return self.classifiers

    def _build_pipeline(self, model, preprocessor, params=None):
        """Return a new preprocessing + classifier pipeline, optionally with ``params`` applied."""
        if "random_state" in model().get_params().keys():
            classifier = model(random_state=self.random_state)
        else:
            classifier = model()
        if params is not None:
            classifier.set_params(**params)
        return Pipeline(
            steps=[("preprocessor", preprocessor), ("classifier", classifier)]
        )

    def _fit_and_score(self, name, pipe, X_train, X_test, y_train, y_test, started):
        """Fit ``pipe``, register it under ``name`` and return ``(metrics_row, y_pred)``."""
        pipe.fit(X_train, y_train)
        self.models[name] = pipe
        y_pred = pipe.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred, normalize=True)
        b_accuracy = balanced_accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average="weighted")
        try:
            roc_auc = roc_auc_score(y_test, y_pred)
        except Exception as exception:
            roc_auc = None
            if self.ignore_warnings is False:
                print("ROC AUC couldn't be calculated for " + name)
                print(exception)

        row = {
            "Model": name,
            "Accuracy": accuracy,
            "Balanced Accuracy": b_accuracy,
            "ROC AUC": roc_auc,
            "F1 Score": f1,
        }
        if self.custom_metric is not None:
            row[self.custom_metric.__name__] = self.custom_metric(y_test, y_pred)

        elapsed = time.time() - started
        if self.verbose > 0:
            print({**row, "Time taken": elapsed})
        row["Time Taken"] = elapsed
        return row, y_pred

    def fit(self, X_train, X_test, y_train, y_test):
        """Fit every classifier on the training split and score it on the test split.

        For a classifier with a registered search space two entries are
        produced: ``<name>`` (default settings) and ``<name>_Best`` (tuned
        settings). Each entry gets its own pipeline object in ``self.models``.

        Parameters
        ----------
        X_train : array-like,
            Training features, one row per sample.
        X_test : array-like,
            Test features, one row per sample.
        y_train : array-like,
            Training labels.
        y_test : array-like,
            Test labels.

        Returns
        -------
        scores : Pandas DataFrame
            One row per model, indexed by model name and sorted by balanced
            accuracy (best first).
        predictions : Pandas DataFrame
            Test predictions, one column per model, when ``predictions`` is
            True. Otherwise ``scores`` is returned a second time, so callers
            can always unpack two values.
        """
        # Column -> list of values; a row is appended only once it is complete,
        # so the columns can never end up with different lengths.
        results = {
            "Model": [],
            "Accuracy": [],
            "Balanced Accuracy": [],
            "ROC AUC": [],
            "F1 Score": [],
        }
        if self.custom_metric is not None:
            results[self.custom_metric.__name__] = []
        results["Time Taken"] = []
        predictions = {}

        if isinstance(X_train, np.ndarray):
            X_train = pd.DataFrame(X_train)
            X_test = pd.DataFrame(X_test)

        numeric_features = X_train.select_dtypes(include=[np.number]).columns
        categorical_features = X_train.select_dtypes(include=["object"]).columns

        categorical_low, categorical_high = get_card_split(
            X_train, categorical_features
        )

        preprocessor = ColumnTransformer(
            transformers=[
                ("numeric", numeric_transformer, numeric_features),
                ("categorical_low", categorical_transformer_low, categorical_low),
                ("categorical_high", categorical_transformer_high, categorical_high),
            ]
        )

        for name, model in tqdm(self._resolve_classifiers()):
            # The clock starts before tuning, so the reported times include it.
            started = time.time()
            # NOTE(review): tuning sees the raw X_train, while the fitted
            # pipelines feed the classifier preprocessed features.
            tuned_params = makeTuning(model, name, X_train, y_train)
            if tuned_params is not None:
                print(model().get_params())

            # Baseline first; the tuned variant only runs if the baseline worked.
            stages = [(name, None)]
            if tuned_params is not None:
                stages.append((name + '_Best', tuned_params))

            stage_name = name
            try:
                for stage_name, stage_params in stages:
                    # A new pipeline per stage: the tuned model must not
                    # overwrite the baseline stored in self.models.
                    pipe = self._build_pipeline(model, preprocessor, stage_params)
                    if stage_params is not None:
                        print(pipe['classifier'].get_params())
                    row, y_pred = self._fit_and_score(
                        stage_name, pipe, X_train, X_test, y_train, y_test, started
                    )
                    for column, value in row.items():
                        results[column].append(value)
                    if self.predictions:
                        predictions[stage_name] = y_pred
            except Exception as exception:
                if self.ignore_warnings is False:
                    print(stage_name + " model failed to execute")
                    print(exception)

        scores = (
            pd.DataFrame(results)
            .sort_values(by="Balanced Accuracy", ascending=False)
            .set_index("Model")
        )

        if self.predictions is True:
            return scores, pd.DataFrame.from_dict(predictions)
        return scores, scores

    def provide_models(self, X_train, X_test, y_train, y_test):
        """
        Return the fitted pipelines, calling ``fit`` first if none exist yet.

        Parameters
        ----------
        X_train : array-like,
            Training features, one row per sample.
        X_test : array-like,
            Test features, one row per sample.
        y_train : array-like,
            Training labels.
        y_test : array-like,
            Test labels.

        Returns
        -------
        models: dict-object,
            Model name -> fitted pipeline.
        """
        if len(self.models.keys()) == 0:
            self.fit(X_train, X_test, y_train, y_test)

        return self.models


def adjusted_rsquared(r2, n, p):
    """Return R-squared adjusted for ``p`` predictors fitted on ``n`` samples."""
    dof_ratio = (n - 1) / (n - p - 1)
    unexplained = 1 - r2
    return 1 - unexplained * dof_ratio

# Alias: ``Classification`` is a second name for LazyClassifierCustom.
Classification = LazyClassifierCustom

In [55]:
def makeDictBest(models, modelli, n_best = 5):
    """
    Add ranking points for the top models of one run to a running tally.

    The first row of ``models`` earns ``n_best`` points, the second
    ``n_best - 1``, and so on down to 1 point for row ``n_best``.

    Parameters
    ----------
    models : Pandas DataFrame
        Score table indexed by model name, best model first.
    modelli : dict
        Model name -> accumulated points; updated in place.
    n_best : int, optional (default=5)
        How many of the top rows earn points.

    Returns
    -------
    dict
        The same ``modelli`` object, after the update.
    """
    # Slicing instead of positional lookup: a table with fewer than n_best
    # rows (models that failed are missing from it) no longer raises IndexError.
    top_names = models.index[:max(n_best, 0)]
    for position, best in enumerate(top_names):
        modelli[best] = modelli.get(best, 0) + (n_best - position)
    return modelli

In [56]:
# Experiment driver: for every peak-count variant, load the matching CSV,
# build the feature and target frames, run the classifier sweep on each target
# column and accumulate ranking points per model.
for n in n_picchi:
    print('DATAFRAME CON '+n+' PICCHI')
    df = pd.read_csv("data/Dati_Matemaldomics_"+n+"picchi.csv",
                    delimiter=';', index_col='ID Strain')
    # `n` is a string up to here (file name); from now on it is the number of
    # peak columns.
    n = int(n)
    # Model name -> accumulated ranking points (filled by makeDictBest).
    modelli = {}
    # Single-column frames with the metadata of each strain.
    animal  = df[['Animal species of origin']]
    lancefield = df[['LANCEFIELD GROUP']]
    haemolysis = df[['Haemolysis']]
    subspecies = df[['Putative Subspecies']]

    # Column at position 4, selected by position rather than by name.
    # NOTE(review): presumably the sequence type -- confirm against the CSV.
    st = df[[df.columns[4]]]
    # Consecutive column ranges: peaks, then antibiotics, then resistance
    # genes, then virulence factors.
    maldi = df[df.columns[start:start+n]]
    antibiotici = df[df.columns[start+n:start+n+n_antibiotici]]
    geni_antibiotici = df[df.columns[start+n+n_antibiotici:start+n+n_antibiotici+n_geni]]
    virulenza = df[df.columns[start+n+n_antibiotici+n_geni:start+n+n_antibiotici+n_geni+n_virulenza]]
    
    # Peaks: missing values become 0, decimal commas become dots, then every
    # column is cast to float.
    maldi.fillna(0, inplace=True)
    maldi = maldi.replace(',', '.', regex=True)
    columns = maldi.columns
    for column in columns:
        maldi[column] = maldi[column].astype(float)
    display(maldi)
    
    # Target groups to predict; only the virulence group is enabled.
    targets = {#'antibiotici' : antibiotici,
                #'geni_antibiotici' : geni_antibiotici,
                'virulenza' : virulenza}
    
    # Metadata frames appended to the peaks as extra features further below.
    feats_agg = {'lancefield' : lancefield,
                'haemolysis' : haemolysis,
                'subspecies' : subspecies,        
                'animal' : animal}
    
    # Drop target columns that are too unbalanced: `rapporto` is the share of
    # rows equal to 0, and a column is removed (in place, from the frame stored
    # in `targets`) when that share is below 0.15 or above 0.85.
    for str_target,target in targets.items():
        columns = target.columns
        for column in columns:
            if str_target == 'antibiotici':
                target[column] = df[column].map(map_target_antibiotici)
            rapporto = (target[column] == 0).sum() / target.shape[0]
            #if (antibiotici[column] == 0).all() or (antibiotici[column] == 1).all():
            print(column+" : "+str(rapporto))
            if rapporto < 0.15 or rapporto > 0.85:
                target.drop([column], axis=1, inplace=True)
        
        display(target)
        
    # NOTE(review): these two frames are created empty and only displayed at
    # the end of the loop; nothing in this cell fills them.
    metrics_df = pd.DataFrame(columns=['Target','Model','Accuracy','Precision','Recall','F1-Score','CV'])
    metrics_df_agg = pd.DataFrame(columns=['Target','Model','Accuracy','Precision','Recall','F1-Score','CV'])
    # Module-level name: makeTuning reads `skfold` as a global.
    skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    # Subspecies and the position-4 column are added as targets for the first pass.
    targets['subspecies'] = subspecies
    targets['st'] = st
    # First pass: peaks only as features.
    X = maldi
    for str_target, target in targets.items():
        columns = target.columns
        for column in columns:    
            y = target[column]
            # Rebinds the module-level `n_classes`; param_grid was built
            # earlier and is not affected.
            n_classes = np.unique(y)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
            clf = LazyClassifierCustom(predictions=True)
            models, predictions = clf.fit(X_train, X_test, y_train, y_test)
            print("Colonna:"+column)
            display(models)
            print("\n")
            modelli = makeDictBest(models, modelli)
                    
            #models.to_csv('Risultati/LazyPredictor/model_'+str(n)+column+'.csv')
    # Second pass: subspecies is one of the extra features in feats_agg, so it
    # is removed from the targets first.
    del targets['subspecies']
    for str_feat, feat_agg in feats_agg.items():
        display(feat_agg)
        # X grows cumulatively: each iteration keeps the metadata columns
        # added by the previous ones.
        X = pd.concat([X, feat_agg], axis=1)
        for str_target, target in targets.items():
            columns = target.columns
            for column in columns:    
                y = target[column]
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
                clf = LazyClassifierCustom(predictions=True)
                models, predictions = clf.fit(X_train, X_test, y_train, y_test)
                print("Colonna: "+column+" con feat agg: "+str_feat)
                display(models)
                print("\n")
                modelli = makeDictBest(models, modelli)
                
    # Ascending sort by points: the best-ranked model is the last element.
    # `modelli` becomes a list of (name, points) pairs here.
    modelli = sorted(modelli.items(), key=lambda x:x[1])
    print(modelli)
    display(metrics_df)
    display(metrics_df_agg)

DATAFRAME CON 46 PICCHI


Unnamed: 0_level_0,"2223,140967","2241,073989","2262,75751","2679,802856","2978,296408","3159,441237","3354,28405","3364,608472","3397,909861","3418,174965",...,"9030,351844","9073,208159","9487,183195","10103,20284","10400,80576","10491,16654","10930,54833","13276,73249","14943,03835","15048,89449"
ID Strain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
V13,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
V142,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
V151,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
V160,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
V161,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
V800,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
V82,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
V90,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
V91,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


fbp54 : 0.0
gbs0630 : 0.9935064935064936
gbs0631 : 0.9935064935064936
gbs0632 : 0.9935064935064936
hasC : 0.0
lmb : 0.9935064935064936
mf2 : 0.961038961038961
mf3 : 0.6753246753246753
scpA : 0.9935064935064936
sda : 0.8766233766233766
ska : 0.9935064935064936
slo : 0.9935064935064936
smeZ : 0.9935064935064936
spec : 0.974025974025974
speg : 0.9090909090909091
spek : 0.961038961038961
spel : 0.974025974025974
spem : 0.948051948051948


Unnamed: 0_level_0,mf3
ID Strain,Unnamed: 1_level_1
V13,0
V142,1
V151,0
V160,0
V161,1
...,...
V800,0
V82,1
V90,0
V91,1


 10%|█         | 3/29 [00:09<01:31,  3.50s/it]

{'alpha': 1.0, 'binarize': 0.0, 'class_prior': None, 'fit_prior': True}
{'alpha': 0.01, 'binarize': None, 'class_prior': None, 'fit_prior': True}


 21%|██        | 6/29 [00:21<01:27,  3.81s/it]

{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}
{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': 42, 'splitter': 'best'}


 34%|███▍      | 10/29 [00:24<00:41,  2.20s/it]

{'priors': None, 'var_smoothing': 1e-09}
{'priors': None, 'var_smoothing': 0.1}


 48%|████▊     | 14/29 [00:28<00:20,  1.35s/it]

{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 8, 'p': 2, 'weights': 'distance'}


 48%|████▊     | 14/29 [00:44<00:20,  1.35s/it]

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
{'C': 10000.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 2, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l1', 'random_state': 42, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


 59%|█████▊    | 17/29 [03:05<04:12, 21.00s/it]

{'metric': 'euclidean', 'shrink_threshold': None}
{'metric': 'manhattan', 'shrink_threshold': 1.0}


 72%|███████▏  | 21/29 [15:19<05:50, 43.78s/it]


KeyboardInterrupt: 