In [2]:
from LazyClassifierCustom import LazyClassifierCustom
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report
from sklearn.inspection import DecisionBoundaryDisplay

from scipy import stats
from pca import pca
from IPython.display import display
import dataframe_image as dfi

from src.visualization import feature_importances_plot

import warnings
    
# warnings -> to silence warnings

warnings.filterwarnings("ignore")
np.set_printoptions(precision=5, suppress=True)


RANDOM_STATE = 42
N_JOBS = -1
class_names = ["Canis", "Dysg. Equisimilis", "Dysg. Dysgalactiae"]

map_target = {
    "Streptococcus canis": 0,
    "Streptococcus dysgalactiae subsp. equisimilis": 1,
    "Streptococcus dysgalactiae subsp. dysgalactiae": 2
}

map_target_inv = {
    0: "Strept. canis",
    1: "Strept. dysg. equisimilis",
    2: "Strept. dysg. dysgalactiae"
}

map_target_antibiotici = {
    "S" : 1,
    "NS" : 0
}
start = 9
n_antibiotici = 9
n_geni = 27
n_virulenza = 18
#n_picchi = ['46','306']
n_picchi = ['46']
N_BEST = 3

ModuleNotFoundError: No module named 'LazyClassifierCustom'

In [14]:
def makeDictBest(models, modelli, n_best = 5):
    for i in range(n_best):
        best = models.index[i]
        score = modelli.get(best)
        if score != None:
            modelli[best] = score + (n_best-i)
        else:
            modelli[best] = (n_best-i)
    return modelli

In [28]:
n_classes = [0,1]
# Hyperparameter tuning using RandomizedSearchCV
param_grid = {'LogisticRegression': {'C': np.logspace(-4, 4, 20), 
                                    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
                                    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                                    'fit_intercept': [True, False],
                                    'intercept_scaling': [0.5, 1, 2],
                                    'class_weight': [None, 'balanced']},
              'RidgeClassifier' : {'alpha': np.logspace(-5, 5, 100)},
              'DecisionTreeClassifier': {'criterion': ['gini', 'entropy', 'log_loss'],
                                        'splitter': ['best', 'random'],
                                        'max_depth': [2*n for n in range(1,10)],
                                        'max_features': ['auto', 'sqrt', 'log2'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'class_weight': [None, 'balanced']},
              'KNeighborsClassifier': {'n_neighbors': list(range(1, 20)),
                                        'weights': ['uniform', 'distance'],
                                        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                                        'p': [1,2]},
              'RandomForestClassifier': {'n_estimators': range(10, 100), 
                                        'max_features': ['auto', 'sqrt', 'log2'],
                                        'max_depth': [2*n for n in range(1,10)],
                                        'min_samples_split': range(2, 15), 
                                        'class_weight': [None, 'balanced'], 
                                        'criterion': ['gini', 'entropy', 'log_loss']},
              'BernoulliNB': {'alpha': [0.01, 0.1, 0.5, 1.0, 10.0],
                            'fit_prior': [True, False],
                            'class_prior': [None, [0.1,]* len(n_classes)],
                            'binarize': [None, -5, 0.0, 5, 10.0]
                            },
              'GaussianNB': {'var_smoothing': np.logspace(0,-9, num=20)},
              'NearestCentroid':  {'shrink_threshold': np.logspace(0, 1, 20),
                                   'metric': ['euclidean', 'manhattan']},
              'SVC': {'C': np.logspace(-3, 3, 10),
                          'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                          'degree': range(2,5),
                          'gamma': np.logspace(-3, 1, 10)}
              }

In [30]:
for n in n_picchi:
    print('DATAFRAME CON '+n+' PICCHI')
    df = pd.read_csv("data/Dati_Matemaldomics_"+n+"picchi.csv",
                    delimiter=';', index_col='ID Strain')
    n = int(n)
    modelli = {}
    animal  = df[['Animal species of origin']]
    lancefield = df[['LANCEFIELD GROUP']]
    haemolysis = df[['Haemolysis']]
    subspecies = df[['Putative Subspecies']]

    st = df[[df.columns[4]]]
    maldi = df[df.columns[start:start+n]]
    antibiotici = df[df.columns[start+n:start+n+n_antibiotici]]
    geni_antibiotici = df[df.columns[start+n+n_antibiotici:start+n+n_antibiotici+n_geni]]
    virulenza = df[df.columns[start+n+n_antibiotici+n_geni:start+n+n_antibiotici+n_geni+n_virulenza]]
    
    maldi.fillna(0, inplace=True)
    maldi = maldi.replace(',', '.', regex=True)
    columns = maldi.columns
    for column in columns:
        maldi[column] = maldi[column].astype(float)
    display(maldi)
    
    targets = {#'antibiotici' : antibiotici,
                #'geni_antibiotici' : geni_antibiotici,
                'virulenza' : virulenza}
    
    feats_agg = {'lancefield' : lancefield,
                'haemolysis' : haemolysis,
                'subspecies' : subspecies,        
                'animal' : animal}
    
    for str_target,target in targets.items():
        columns = target.columns
        for column in columns:
            if str_target == 'antibiotici':
                target[column] = df[column].map(map_target_antibiotici)
            rapporto = (target[column] == 0).sum() / target.shape[0]
            #if (antibiotici[column] == 0).all() or (antibiotici[column] == 1).all():
            print(column+" : "+str(rapporto))
            if rapporto < 0.15 or rapporto > 0.85:
                target.drop([column], axis=1, inplace=True)
        
        display(target)
        
    metrics_df = pd.DataFrame(columns=['Target','Model','Accuracy','Precision','Recall','F1-Score','CV'])
    metrics_df_agg = pd.DataFrame(columns=['Target','Model','Accuracy','Precision','Recall','F1-Score','CV'])
    skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    targets['subspecies'] = subspecies
    targets['st'] = st
    X = maldi
    for str_target, target in targets.items():
        columns = target.columns
        for column in columns:    
            y = target[column]
            n_classes = np.unique(y)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
            clf = LazyClassifier(predictions=True)
            models, predictions = clf.fit(X_train, X_test, y_train, y_test)
            print("Colonna:"+column)
            display(models)
            print("\n")
            modelli = makeDictBest(models, modelli)
            for i in range(N_BEST):
                models_obj = clf.provide_models(X_train, X_test, y_train, y_test)
                model_name = models.index[i]
                print(model_name)
                model = models_obj[model_name]['classifier']
                print(model)
                params = param_grid.get(model_name)
                print(params)
                y_pred_lazy = predictions[model_name]
                print()
                acc = accuracy_score(y_test, y_pred_lazy)
                prec = precision_score(y_test, y_pred_lazy, average='weighted')
                rec = recall_score(y_test, y_pred_lazy, average='weighted')
                f1 = f1_score(y_test, y_pred_lazy, average='weighted')
                cv = cross_val_score(estimator=model, X=X_train, y=y_train,
                                                    scoring="accuracy", cv=skfold, n_jobs=N_JOBS, verbose=0).mean()
                ris = {'Target': column,
                        'Model': model_name,
                        'Accuracy' : acc,
                        'Precision' : prec,
                        'Recall' : rec,
                        'F1-Score' : f1,
                        'CV' : cv} 
                display(ris)
                metrics_df = metrics_df.append(ris, ignore_index=True)
                
                if params != None:
                    rs = RandomizedSearchCV(estimator=model, param_distributions=params,
                                        scoring="accuracy", n_jobs=-1, cv=skfold, verbose=1)
                    rs.fit(X_train, y_train)
                    parametri = rs.best_params_
                    best_model = rs.best_estimator_
                    cv_best = rs.best_score_
                else:
                    best_model = model
                    parametri = best_model.get_params()
                    cv_best = cross_val_score(estimator=model, X=X_train, y=y_train,
                                                    scoring="accuracy", cv=skfold, n_jobs=N_JOBS, verbose=0).mean()
                
                print(parametri)
                y_pred = best_model.predict(X_test)
                
                acc_best = accuracy_score(y_test, y_pred)
                prec_best = precision_score(y_test, y_pred, average='weighted')
                rec_best = recall_score(y_test, y_pred, average='weighted')
                f1_best = f1_score(y_test, y_pred, average='weighted')
                
                ris = {'Target': column,
                        'Model': model_name+'_Best',
                        'Accuracy' : acc_best,
                        'Precision' : prec_best,
                        'Recall' : rec_best,
                        'F1-Score' : f1_best,
                        'CV' : cv_best} 
                display(ris)
                metrics_df = metrics_df.append(ris, ignore_index=True)
            
            #models.to_csv('Risultati/LazyPredictor/model_'+str(n)+column+'.csv')
    del targets['subspecies']
    for str_feat, feat_agg in feats_agg.items():
        display(feat_agg)
        X = pd.concat([X, feat_agg], axis=1)
        for str_target, target in targets.items():
            columns = target.columns
            for column in columns:    
                y = target[column]
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
                clf = LazyClassifier(predictions=True)
                models, predictions = clf.fit(X_train, X_test, y_train, y_test)
                print("Colonna: "+column+" con feat agg: "+str_feat)
                display(models)
                print("\n")
                modelli = makeDictBest(models, modelli)
                for i in range(N_BEST):
                    models_obj = clf.provide_models(X_train, X_test, y_train, y_test)
                    model_name = models.index[i]
                    print(model_name)
                    model = models_obj[model_name]['classifier']
                    preprocessing = models_obj[model_name]['preprocessing']
                    print(model)
                    params = param_grid.get(model_name)
                    print(params)
                    if params != None:
                        rs = RandomizedSearchCV(estimator=model, param_distributions=params,
                                            scoring="accuracy", n_jobs=-1, cv=skfold, verbose=1)
                        rs.fit(X_train, y_train)
                        parametri = rs.best_params_
                        print(parametri)
                        best_model = rs.best_estimator_
                        cv_best = rs.best_score_
                    else:
                        best_model = model
                        cv_best = cross_val_score(estimator=model, X=X_train, y=y_train,
                                                        scoring="accuracy", cv=skfold, n_jobs=N_JOBS, verbose=0).mean()
                    y_pred = best_model.predict(X_test)
                    
                    acc_best = accuracy_score(y_test, y_pred)
                    prec_best = precision_score(y_test, y_pred, average='weighted')
                    rec_best = recall_score(y_test, y_pred, average='weighted')
                    f1_best = f1_score(y_test, y_pred, average='weighted')
                    
                    ris = {'Target': column,
                            'Model': model_name,
                            'Accuracy' : acc_best,
                            'Precision' : prec_best,
                            'Recall' : rec_best,
                            'F1-Score' : f1_best,
                            'CV' : cv_best} 
                    display(ris)
                    metrics_df_agg = metrics_df_agg.append(ris, ignore_index=True)
                #models.to_csv('Risultati/LazyPredictor/model_'+str(n)+column+'_morefeat'+str_feat+'.csv')
    modelli = sorted(modelli.items(), key=lambda x:x[1])
    print(modelli)

DATAFRAME CON 46 PICCHI


Unnamed: 0_level_0,"2223,140967","2241,073989","2262,75751","2679,802856","2978,296408","3159,441237","3354,28405","3364,608472","3397,909861","3418,174965",...,"9030,351844","9073,208159","9487,183195","10103,20284","10400,80576","10491,16654","10930,54833","13276,73249","14943,03835","15048,89449"
ID Strain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
V13,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
V142,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
V151,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
V160,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
V161,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
V800,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
V82,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
V90,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
V91,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


fbp54 : 0.0
gbs0630 : 0.9935064935064936
gbs0631 : 0.9935064935064936
gbs0632 : 0.9935064935064936
hasC : 0.0
lmb : 0.9935064935064936
mf2 : 0.961038961038961
mf3 : 0.6753246753246753
scpA : 0.9935064935064936
sda : 0.8766233766233766
ska : 0.9935064935064936
slo : 0.9935064935064936
smeZ : 0.9935064935064936
spec : 0.974025974025974
speg : 0.9090909090909091
spek : 0.961038961038961
spel : 0.974025974025974
spem : 0.948051948051948


Unnamed: 0_level_0,mf3
ID Strain,Unnamed: 1_level_1
V13,0
V142,1
V151,0
V160,0
V161,1
...,...
V800,0
V82,1
V90,0
V91,1


100%|██████████| 29/29 [00:01<00:00, 27.59it/s]

Colonna:mf3





Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Perceptron,0.81,0.83,0.83,0.81,0.01
NearestCentroid,0.81,0.83,0.83,0.81,0.01
LogisticRegression,0.84,0.82,0.82,0.84,0.02
SGDClassifier,0.84,0.82,0.82,0.84,0.01
RidgeClassifierCV,0.84,0.82,0.82,0.84,0.02
RidgeClassifier,0.84,0.82,0.82,0.84,0.02
PassiveAggressiveClassifier,0.84,0.82,0.82,0.84,0.01
LabelPropagation,0.84,0.82,0.82,0.84,0.02
LabelSpreading,0.84,0.82,0.82,0.84,0.01
BernoulliNB,0.81,0.8,0.8,0.81,0.01




Perceptron
Perceptron(random_state=42)
None



{'Target': 'mf3',
 'Model': 'Perceptron',
 'Accuracy': 0.8064516129032258,
 'Precision': 0.8489109456851394,
 'Recall': 0.8064516129032258,
 'F1-Score': 0.8143695014662756,
 'CV': 0.5653333333333335}

{'alpha': 0.0001, 'class_weight': None, 'early_stopping': False, 'eta0': 1.0, 'fit_intercept': True, 'l1_ratio': 0.15, 'max_iter': 1000, 'n_iter_no_change': 5, 'n_jobs': None, 'penalty': None, 'random_state': 42, 'shuffle': True, 'tol': 0.001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}


{'Target': 'mf3',
 'Model': 'Perceptron_Best',
 'Accuracy': 0.7096774193548387,
 'Precision': 0.5036420395421436,
 'Recall': 0.7096774193548387,
 'F1-Score': 0.5891661594643944,
 'CV': 0.5653333333333335}

NearestCentroid
NearestCentroid()
{'shrink_threshold': array([ 1.     ,  1.02353,  1.04762,  1.07227,  1.0975 ,  1.12332,
        1.14976,  1.17681,  1.2045 ,  1.23285,  1.26186,  1.29155,
        1.32194,  1.35305,  1.38489,  1.41747,  1.45083,  1.48497,
        1.51991,  1.55568,  1.59228,  1.62975,  1.6681 ,  1.70735,
        1.74753,  1.78865,  1.83074,  1.87382,  1.91791,  1.96304,
        2.00923,  2.05651,  2.1049 ,  2.15443,  2.20513,  2.25702,
        2.31013,  2.36449,  2.42013,  2.47708,  2.53536,  2.59502,
        2.65609,  2.71859,  2.78256,  2.84804,  2.91505,  2.98365,
        3.05386,  3.12572,  3.19927,  3.27455,  3.3516 ,  3.43047,
        3.51119,  3.59381,  3.67838,  3.76494,  3.85353,  3.94421,
        4.03702,  4.13201,  4.22924,  4.32876,  4.43062,  4.53488,
        4.64159,  4.75081,  4.8626 ,  4.97702,  5.09414,  5.21401,
        5.3367 ,  5.46228,  5.59081,  5.72237,  5.85702,  5.99484,
        6.13591,  6.28029,  6.42807,  6.57933,  6.73415,  6.89261,
       

{'Target': 'mf3',
 'Model': 'NearestCentroid',
 'Accuracy': 0.8064516129032258,
 'Precision': 0.8489109456851394,
 'Recall': 0.8064516129032258,
 'F1-Score': 0.8143695014662756,
 'CV': 0.674}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'shrink_threshold': 1.0722672220103233, 'metric': 'manhattan'}


{'Target': 'mf3',
 'Model': 'NearestCentroid_Best',
 'Accuracy': 0.6451612903225806,
 'Precision': 0.7841191066997519,
 'Recall': 0.6451612903225806,
 'F1-Score': 0.6586789554531489,
 'CV': 0.6743333333333333}

LogisticRegression
LogisticRegression(random_state=42)
{'C': array([    0.0001 ,     0.00026,     0.0007 ,     0.00183,     0.00483,
           0.01274,     0.0336 ,     0.08859,     0.23357,     0.61585,
           1.62378,     4.28133,    11.28838,    29.76351,    78.476  ,
         206.91381,   545.55948,  1438.44989,  3792.69019, 10000.     ]), 'penalty': ['l1', 'l2', 'elasticnet', 'none'], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 'fit_intercept': [True, False], 'intercept_scaling': [0.5, 1, 2], 'class_weight': [None, 'balanced']}



{'Target': 'mf3',
 'Model': 'LogisticRegression',
 'Accuracy': 0.8387096774193549,
 'Precision': 0.8453149001536099,
 'Recall': 0.8387096774193549,
 'F1-Score': 0.8410786907253127,
 'CV': 0.6666666666666666}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'solver': 'newton-cg', 'penalty': 'none', 'intercept_scaling': 2, 'fit_intercept': False, 'class_weight': 'balanced', 'C': 0.012742749857031334}


{'Target': 'mf3',
 'Model': 'LogisticRegression_Best',
 'Accuracy': 0.8387096774193549,
 'Precision': 0.8963133640552995,
 'Recall': 0.8387096774193549,
 'F1-Score': 0.8459021109792498,
 'CV': 0.7553333333333334}

100%|██████████| 29/29 [00:01<00:00, 21.86it/s]

Colonna:Putative Subspecies





Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LinearSVC,1.0,1.0,,1.0,0.02
NuSVC,1.0,1.0,,1.0,0.02
BernoulliNB,1.0,1.0,,1.0,0.02
CalibratedClassifierCV,1.0,1.0,,1.0,0.11
SVC,1.0,1.0,,1.0,0.02
SGDClassifier,1.0,1.0,,1.0,0.02
RidgeClassifierCV,1.0,1.0,,1.0,0.02
RidgeClassifier,1.0,1.0,,1.0,0.02
GaussianNB,1.0,1.0,,1.0,0.01
RandomForestClassifier,1.0,1.0,,1.0,0.22




LinearSVC
LinearSVC(random_state=42)
None



{'Target': 'Putative Subspecies',
 'Model': 'LinearSVC',
 'Accuracy': 1.0,
 'Precision': 1.0,
 'Recall': 1.0,
 'F1-Score': 1.0,
 'CV': 0.5393333333333333}

{'C': 1.0, 'class_weight': None, 'dual': True, 'fit_intercept': True, 'intercept_scaling': 1, 'loss': 'squared_hinge', 'max_iter': 1000, 'multi_class': 'ovr', 'penalty': 'l2', 'random_state': 42, 'tol': 0.0001, 'verbose': 0}


{'Target': 'Putative Subspecies',
 'Model': 'LinearSVC_Best',
 'Accuracy': 0.3225806451612903,
 'Precision': 0.1040582726326743,
 'Recall': 0.3225806451612903,
 'F1-Score': 0.15735641227380015,
 'CV': 0.5393333333333333}

NuSVC
NuSVC(random_state=42)
None



{'Target': 'Putative Subspecies',
 'Model': 'NuSVC',
 'Accuracy': 1.0,
 'Precision': 1.0,
 'Recall': 1.0,
 'F1-Score': 1.0,
 'CV': 0.9506666666666665}

{'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'nu': 0.5, 'probability': False, 'random_state': 42, 'shrinking': True, 'tol': 0.001, 'verbose': False}


{'Target': 'Putative Subspecies',
 'Model': 'NuSVC_Best',
 'Accuracy': 0.22580645161290322,
 'Precision': 0.0509885535900104,
 'Recall': 0.22580645161290322,
 'F1-Score': 0.0831918505942275,
 'CV': 0.9506666666666665}

BernoulliNB
BernoulliNB()
{'alpha': [0.01, 0.1, 0.5, 1.0, 10.0], 'fit_prior': [True, False], 'class_prior': [None, [0.1, 0.1]], 'binarize': [None, -5, 0.0, 5, 10.0]}



{'Target': 'Putative Subspecies',
 'Model': 'BernoulliNB',
 'Accuracy': 1.0,
 'Precision': 1.0,
 'Recall': 1.0,
 'F1-Score': 1.0,
 'CV': 0.35}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'fit_prior': False, 'class_prior': None, 'binarize': None, 'alpha': 0.01}


{'Target': 'Putative Subspecies',
 'Model': 'BernoulliNB_Best',
 'Accuracy': 0.9032258064516129,
 'Precision': 0.9255583126550869,
 'Recall': 0.9032258064516129,
 'F1-Score': 0.8963406859620044,
 'CV': 0.7826666666666666}

100%|██████████| 29/29 [00:04<00:00,  7.16it/s]

Colonna:ST





Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
PassiveAggressiveClassifier,0.45,0.34,,0.47,0.09
RidgeClassifierCV,0.48,0.32,,0.41,0.02
LinearDiscriminantAnalysis,0.42,0.31,,0.41,0.02
LogisticRegression,0.45,0.31,,0.48,0.12
RidgeClassifier,0.45,0.3,,0.4,0.03
CalibratedClassifierCV,0.39,0.29,,0.33,0.66
RandomForestClassifier,0.42,0.29,,0.34,0.43
SGDClassifier,0.39,0.29,,0.36,0.06
LabelSpreading,0.42,0.28,,0.4,0.02
LabelPropagation,0.42,0.28,,0.4,0.02




PassiveAggressiveClassifier
PassiveAggressiveClassifier(random_state=42)
None



{'Target': 'ST',
 'Model': 'PassiveAggressiveClassifier',
 'Accuracy': 0.45161290322580644,
 'Precision': 0.5268817204301076,
 'Recall': 0.45161290322580644,
 'F1-Score': 0.4698924731182796,
 'CV': 0.057333333333333326}

{'C': 1.0, 'average': False, 'class_weight': None, 'early_stopping': False, 'fit_intercept': True, 'loss': 'hinge', 'max_iter': 1000, 'n_iter_no_change': 5, 'n_jobs': None, 'random_state': 42, 'shuffle': True, 'tol': 0.001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}


{'Target': 'ST',
 'Model': 'PassiveAggressiveClassifier_Best',
 'Accuracy': 0.0,
 'Precision': 0.0,
 'Recall': 0.0,
 'F1-Score': 0.0,
 'CV': 0.057333333333333326}

RidgeClassifierCV
RidgeClassifierCV()
None



{'Target': 'ST',
 'Model': 'RidgeClassifierCV',
 'Accuracy': 0.4838709677419355,
 'Precision': 0.3897849462365591,
 'Recall': 0.4838709677419355,
 'F1-Score': 0.4139784946236559,
 'CV': 0.13}

{'alphas': (0.1, 1.0, 10.0), 'class_weight': None, 'cv': None, 'fit_intercept': True, 'normalize': 'deprecated', 'scoring': None, 'store_cv_values': False}


{'Target': 'ST',
 'Model': 'RidgeClassifierCV_Best',
 'Accuracy': 0.12903225806451613,
 'Precision': 0.016649323621227886,
 'Recall': 0.12903225806451613,
 'F1-Score': 0.02949308755760369,
 'CV': 0.13}

LinearDiscriminantAnalysis
LinearDiscriminantAnalysis()
None



{'Target': 'ST',
 'Model': 'LinearDiscriminantAnalysis',
 'Accuracy': 0.41935483870967744,
 'Precision': 0.4268817204301075,
 'Recall': 0.41935483870967744,
 'F1-Score': 0.4096774193548387,
 'CV': 0.1943333333333333}

{'covariance_estimator': None, 'n_components': None, 'priors': None, 'shrinkage': None, 'solver': 'svd', 'store_covariance': False, 'tol': 0.0001}


{'Target': 'ST',
 'Model': 'LinearDiscriminantAnalysis_Best',
 'Accuracy': 0.03225806451612903,
 'Precision': 0.0010405827263267429,
 'Recall': 0.03225806451612903,
 'F1-Score': 0.0020161290322580645,
 'CV': 0.1943333333333333}

Unnamed: 0_level_0,LANCEFIELD GROUP
ID Strain,Unnamed: 1_level_1
V13,G
V142,G
V151,G
V160,G
V161,G
...,...
V800,C
V82,G
V90,G
V91,G


100%|██████████| 29/29 [00:01<00:00, 15.23it/s]

Colonna: mf3 con feat agg: lancefield





Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SGDClassifier,0.84,0.85,0.85,0.84,0.02
NearestCentroid,0.81,0.83,0.83,0.81,0.03
LabelSpreading,0.84,0.82,0.82,0.84,0.04
RidgeClassifierCV,0.84,0.82,0.82,0.84,0.04
RidgeClassifier,0.84,0.82,0.82,0.84,0.02
PassiveAggressiveClassifier,0.84,0.82,0.82,0.84,0.03
LogisticRegression,0.84,0.82,0.82,0.84,0.04
LabelPropagation,0.84,0.82,0.82,0.84,0.02
XGBClassifier,0.84,0.79,0.79,0.84,0.11
AdaBoostClassifier,0.81,0.77,0.77,0.81,0.41




SGDClassifier
SGDClassifier(random_state=42)
None


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "d:\PycharmProjects\Thesis-Streptococcus-Classification\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\PycharmProjects\Thesis-Streptococcus-Classification\venv\lib\site-packages\sklearn\linear_model\_stochastic_gradient.py", line 890, in fit
    return self._fit(
  File "d:\PycharmProjects\Thesis-Streptococcus-Classification\venv\lib\site-packages\sklearn\linear_model\_stochastic_gradient.py", line 686, in _fit
    self._partial_fit(
  File "d:\PycharmProjects\Thesis-Streptococcus-Classification\venv\lib\site-packages\sklearn\linear_model\_stochastic_gradient.py", line 581, in _partial_fit
    X, y = self._validate_data(
  File "d:\PycharmProjects\Thesis-Streptococcus-Classification\venv\lib\site-packages\sklearn\base.py", line 596, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "d:\PycharmProjects\Thesis-Streptococcus-Classification\venv\lib\site-packages\sklearn\utils\validation.py", line 1074, in check_X_y
    X = check_array(
  File "d:\PycharmProjects\Thesis-Streptococcus-Classification\venv\lib\site-packages\sklearn\utils\validation.py", line 856, in check_array
    array = np.asarray(array, order=order, dtype=dtype)
  File "d:\PycharmProjects\Thesis-Streptococcus-Classification\venv\lib\site-packages\pandas\core\generic.py", line 2070, in __array__
    return np.asarray(self._values, dtype=dtype)
ValueError: could not convert string to float: 'C'
