In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.base import clone
import warnings
    
# warnings -> to silence warnings

warnings.filterwarnings("ignore")
np.set_printoptions(precision=5, suppress=True)


RANDOM_STATE = 42
N_JOBS = -1

class_names = ["Canis", "Dysg. Equisimilis", "Dysg. Dysgalactiae"]

map_target = {
    "Streptococcus canis": 0,
    "Streptococcus dysgalactiae subsp. equisimilis": 1,
    "Streptococcus dysgalactiae subsp. dysgalactiae": 2
}

map_target_inv = {
    0: "Strept. canis",
    1: "Strept. dysg. equisimilis",
    2: "Strept. dysg. dysgalactiae"
}
map_target_antibiotici = {
    "S" : 1,
    "NS" : 0
}

start = 9
n_antibiotici = 9
n_geni = 27
n_virulenza = 18

In [9]:
# Load the data
n = 46
df = pd.read_csv("data/Dati_Matemaldomics_"+str(n)+"picchi.csv",
                    delimiter=';', index_col='ID Strain')

In [10]:
df['subspecies'] = df["Putative Subspecies"].map(map_target)

feat_agg = df.iloc[:,[7,8]]
display(feat_agg)
st = df.iloc[:,[4]]
display(st)
subspecies = df[['subspecies']]
maldi = df.iloc[:,start:start+n]
antibiotici = df.iloc[:,start+n:start+n+n_antibiotici]
geni_antibiotici = df.iloc[:,start+n+n_antibiotici:start+n+n_antibiotici+n_geni]
virulenza = df.iloc[:,start+n+n_antibiotici+n_geni:start+n+n_antibiotici+n_geni+n_virulenza]

maldi.fillna(0, inplace=True)
maldi = maldi.replace(',', '.', regex=True)
columns = maldi.columns
for column in columns:
    maldi[column] = maldi[column].astype(float)
display(maldi)

targets = {'antibiotici' : antibiotici,
            'geni_antibiotici' : geni_antibiotici,
            'virulenza' : virulenza}

for str_target,target in targets.items():
    columns = target.columns
    for column in columns:
        if str_target == 'antibiotici':
            target[column] = df[column].map(map_target_antibiotici)
        rapporto = (target[column] == 0).sum() / target.shape[0]
        #if (antibiotici[column] == 0).all() or (antibiotici[column] == 1).all():
        print(column+" : "+str(rapporto))
        if rapporto < 0.15 or rapporto > 0.85:
            target.drop([column], axis=1, inplace=True)
            
    display(target)
    
targets['st'] = st
target['subspecies'] = subspecies

Unnamed: 0_level_0,LANCEFIELD GROUP,Haemolysis
ID Strain,Unnamed: 1_level_1,Unnamed: 2_level_1
V13,G,b
V142,G,b
V151,G,b
V160,G,b
V161,G,b
...,...,...
V800,C,a
V82,G,b
V90,G,b
V91,G,b


Unnamed: 0_level_0,ST
ID Strain,Unnamed: 1_level_1
V13,ST13
V142,ST23
V151,ST95
V160,ST15
V161,ST9
...,...
V800,ST307
V82,ST9
V90,ST13
V91,ST9


Unnamed: 0_level_0,"2223,140967","2241,073989","2262,75751","2679,802856","2978,296408","3159,441237","3354,28405","3364,608472","3397,909861","3418,174965",...,"9030,351844","9073,208159","9487,183195","10103,20284","10400,80576","10491,16654","10930,54833","13276,73249","14943,03835","15048,89449"
ID Strain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
V13,0.000665,0.000180,0.000198,0.000059,0.000865,0.000497,0.000197,0.000272,0.000578,0.000198,...,0.000206,0.000133,0.000587,0.000180,0.000065,0.000065,0.000172,0.000076,0.000058,0.000142
V142,0.000648,0.000156,0.000192,0.000400,0.000698,0.000414,0.000684,0.000349,0.000617,0.000494,...,0.000219,0.000205,0.000465,0.000193,0.000142,0.000116,0.000165,0.000067,0.000067,0.000194
V151,0.000545,0.000331,0.000204,0.000277,0.000613,0.000464,0.000279,0.001031,0.000624,0.000409,...,0.000261,0.000263,0.000585,0.000235,0.000198,0.000152,0.000208,0.000084,0.000092,0.000069
V160,0.000509,0.000191,0.000193,0.000266,0.000489,0.000358,0.000628,0.000692,0.000564,0.000611,...,0.000311,0.000312,0.000083,0.000241,0.000242,0.000187,0.000245,0.000082,0.000075,0.000136
V161,0.000517,0.000118,0.000158,0.000163,0.000610,0.000502,0.000621,0.000315,0.000507,0.000638,...,0.000206,0.000224,0.000711,0.000242,0.000156,0.000116,0.000154,0.000093,0.000072,0.000203
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
V800,0.000456,0.000253,0.000125,0.000061,0.000586,0.000365,0.000258,0.001095,0.000537,0.000214,...,0.000102,0.000195,0.000843,0.000250,0.000088,0.000109,0.000298,0.000060,0.000073,0.000059
V82,0.000459,0.000121,0.000147,0.000270,0.000571,0.000338,0.000596,0.000321,0.000537,0.000668,...,0.000263,0.000277,0.000452,0.000254,0.000211,0.000162,0.000203,0.000059,0.000070,0.000144
V90,0.000505,0.000154,0.000158,0.000181,0.000610,0.000473,0.000374,0.000525,0.000513,0.000377,...,0.000288,0.000310,0.000671,0.000237,0.000224,0.000185,0.000230,0.000089,0.000080,0.000137
V91,0.000520,0.000124,0.000169,0.000174,0.000597,0.000414,0.000572,0.000310,0.000532,0.000646,...,0.000237,0.000259,0.000514,0.000237,0.000202,0.000150,0.000175,0.000092,0.000078,0.000214


Eritromicina : 0.461038961038961
Ceftiofur : 0.0
Tetraciclina : 0.5194805194805194
Gentamicina : 0.6233766233766234
Penicillina : 0.0
Ampicillina : 0.0
Sulfametossazolo_trimethoprim : 0.01948051948051948
Clindamicina : 0.2662337662337662
Enrofloxacin : 0.6688311688311688


Unnamed: 0_level_0,Eritromicina,Tetraciclina,Gentamicina,Clindamicina,Enrofloxacin
ID Strain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
V13,0,0,0,1,0
V142,0,1,1,1,0
V151,1,1,0,1,0
V160,1,0,0,1,0
V161,1,1,0,1,0
...,...,...,...,...,...
V800,1,0,1,1,0
V82,1,1,0,1,1
V90,1,0,0,1,0
V91,1,1,0,1,0


aad(6) : 0.935064935064935
ANT(6)-Ia : 0.8246753246753247
APH(2'')-IIIa : 1.0
APH(3')-IIIa : 0.9025974025974026
catS : 0.9675324675324676
dfrF : 0.9805194805194806
E. faecalis chloramphenicol acetyltransferase : 0.9935064935064936
Erm(47) : 0.987012987012987
ErmB : 0.8181818181818182
fexA : 0.9935064935064936
L._reuteri cat-TC : 1.0
lmrP : 0.006493506493506494
lnuC : 0.987012987012987
lnuD : 0.9935064935064936
lsaC : 0.961038961038961
lsaE : 0.7857142857142857
mefE : 0.8506493506493507
optrA : 0.9935064935064936
poxtA : 0.9935064935064936
SAT-4 : 0.922077922077922
tet(40) : 0.987012987012987
tet(L) : 0.9935064935064936
tetM : 0.8181818181818182
tetO : 0.7402597402597403
tetS : 0.9805194805194806
tetT : 0.974025974025974
vatE : 0.9935064935064936


Unnamed: 0_level_0,ANT(6)-Ia,ErmB,lsaE,tetM,tetO
ID Strain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
V13,0,0,0,0,0
V142,0,0,0,0,0
V151,0,0,0,0,0
V160,0,0,0,0,0
V161,0,0,0,0,0
...,...,...,...,...,...
V800,0,0,0,1,0
V82,0,0,0,0,0
V90,0,0,0,0,0
V91,0,0,0,0,0


fbp54 : 0.0
gbs0630 : 0.9935064935064936
gbs0631 : 0.9935064935064936
gbs0632 : 0.9935064935064936
hasC : 0.0
lmb : 0.9935064935064936
mf2 : 0.961038961038961
mf3 : 0.6753246753246753
scpA : 0.9935064935064936
sda : 0.8766233766233766
ska : 0.9935064935064936
slo : 0.9935064935064936
smeZ : 0.9935064935064936
spec : 0.974025974025974
speg : 0.9090909090909091
spek : 0.961038961038961
spel : 0.974025974025974
spem : 0.948051948051948


Unnamed: 0_level_0,mf3
ID Strain,Unnamed: 1_level_1
V13,0
V142,1
V151,0
V160,0
V161,1
...,...
V800,0
V82,1
V90,0
V91,1


In [11]:
# Define a function for standard scaling
def standard_scaler(X_train, X_test):
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled

# Define a function for dimensionality reduction using PCA
def dimensionality_reduction(X_train, X_test, n_components):
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    X_train_pca = pd.DataFrame(X_train_pca)
    X_test_pca = pd.DataFrame(X_test_pca)
    
    return X_train_pca, X_test_pca

In [14]:
# define the models
models = {'Logistic Regression': LogisticRegression(),
          'Decision Tree': DecisionTreeClassifier(),
          'K-nn': KNeighborsClassifier(),
          'Random Forest': RandomForestClassifier()}

# Hyperparameter tuning using RandomizedSearchCV
param_grid = {'Logistic Regression': {'C': np.logspace(-4, 4, 20), 
                                    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
                                    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                                    'fit_intercept': [True, False],
                                    'intercept_scaling': [0.5, 1, 2],
                                    'class_weight': [None, 'balanced']},
              'Decision Tree': {'max_depth': range(1, 15), 'min_samples_split': range(2, 15), 'class_weight': [None, 'balanced']},
              'K-nn': {'n_neighbors': range(1, 15), 'weights': ['uniform', 'distance'], 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']},
              'Random Forest': {'n_estimators': range(10, 100), 'max_depth': range(1, 15), 'min_samples_split': range(2, 15), 'class_weight': [None, 'balanced'], 'criterion': ['gini', 'entropy']}}

# create an empty dataframe to store the metrics
metrics_df = pd.DataFrame(columns=['Target','Model','Accuracy','Precision','Recall','F1-Score','CV'])
metrics_pca = pd.DataFrame(columns=['Target','Model','Accuracy','Precision','Recall','F1-Score','CV'])

metrics_best = pd.DataFrame(columns=['Target','Model','Accuracy','Precision','Recall','F1-Score','CV'])
metrics_best_pca = pd.DataFrame(columns=['Target','Model','Accuracy','Precision','Recall','F1-Score','CV'])

skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
X = maldi
for str_target, target in targets.items():
    columns = target.columns
    for column in columns:    
        y = target[column]

        # split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        X_pca_train, X_pca_test = dimensionality_reduction(X_train, X_test, n_components=0.95)
        print("Colonna:"+column)
        
        '''print("X_TRAIN:")
        display(X_train)
        print("X_PCA_TRAIN:")
        display(X_pca_train)
        print("X_TEST:")
        display(X_test)
        print("X_PCA_TEST:")
        display(X_pca_test)
        print("y_TRAIN:")
        display(y_train)
        print("y_TEST:")
        display(y_test)'''

        # evaluate the models on the original dataset
        for name, model in models.items():
            model_base = model
            model_base_pca = model
            #Modello base
            print("Modello:"+name)
            model_base.fit(X_train, y_train)
            y_pred = model_base.predict(X_test)
            
            acc = accuracy_score(y_test, y_pred)
            prec = precision_score(y_test, y_pred, average='weighted')
            rec = recall_score(y_test, y_pred, average='weighted')
            f1 = f1_score(y_test, y_pred, average='weighted')          
            cv = cross_val_score(estimator=model_base, X=X_train, y=y_train,
                                                    scoring="accuracy", cv=skfold, n_jobs=N_JOBS, verbose=0).mean()
            
            ris = {'Target': column,
                   'Model': name,
                   'Accuracy' : acc,
                   'Precision' : prec,
                   'Recall' : rec,
                   'F1-Score' : f1,
                   'CV' : cv} 
            #display(ris)
            metrics_df.append(ris, ignore_index=True)  
            
            #Tuning iperparametri
            params = param_grid[name]
            rs = RandomizedSearchCV(estimator=model, param_distributions=params,
                                scoring="accuracy", n_jobs=-1, cv=skfold, verbose=1)
            rs.fit(X_train, y_train)
            
            print(rs.best_params_)
            parametri = rs.best_params_
            best_model = rs.best_estimator_
            y_pred = best_model.predict(X_test)
             
            cv_best = rs.best_score_
            acc_best = accuracy_score(y_test, y_pred)
            prec_best = precision_score(y_test, y_pred, average='weighted')
            rec_best = recall_score(y_test, y_pred, average='weighted')
            f1_best = f1_score(y_test, y_pred, average='weighted')
            
            ris = {'Target': column,
                   'Model': name+'_Best',
                   'Accuracy' : acc_best,
                   'Precision' : prec_best,
                   'Recall' : rec_best,
                   'F1-Score' : f1_best,
                   'CV' : cv_best} 
            #display(ris)
            metrics_df.append(ris, ignore_index=True)  
            
            # PCA base
            model_base_pca.fit(X_pca_train, y_train)
            y_pred_pca = model_base_pca.predict(X_pca_test)
            
            acc_pca = accuracy_score(y_test, y_pred_pca)
            prec_pca = precision_score(y_test, y_pred_pca, average='weighted')
            rec_pca = recall_score(y_test, y_pred_pca, average='weighted')
            f1_pca = f1_score(y_test, y_pred_pca, average='weighted')
            cv_pca = cross_val_score(estimator=model_base_pca, X=X_pca_train, y=y_train,
                                                    scoring="accuracy", cv=skfold, n_jobs=N_JOBS, verbose=0).mean()
            
            ris = {'Target': column,
                   'Model': name+'_PCA',
                   'Accuracy' : acc_pca,
                   'Precision' : prec_pca,
                   'Recall' : rec_pca,
                   'F1-Score' : f1_pca,
                   'CV' : cv_pca} 
            display(ris)
            metrics_pca.append(ris, ignore_index=True)  
            
            #PCA con Tuning
            rs = RandomizedSearchCV(estimator=model, param_distributions=params,
                                scoring="accuracy", n_jobs=-1, cv=skfold, verbose=1)
            rs.fit(X_pca_train, y_train)
            print(rs.best_params_)
            parametri = rs.best_params_           
            best_model_pca = rs.best_estimator_
            y_pred = best_model_pca.predict(X_pca_test)
            
            cv_best_pca = rs.best_score_
            acc_best_pca = accuracy_score(y_test, y_pred)
            prec_best_pca = precision_score(y_test, y_pred, average='weighted')
            rec_best_pca = recall_score(y_test, y_pred, average='weighted')
            f1_best_pca = f1_score(y_test, y_pred, average='weighted')
            
            ris = {'Target': column,
                   'Model': name+'_Best_PCA',
                   'Accuracy' : acc_best_pca,
                   'Precision' : prec_best_pca,
                   'Recall' : rec_best_pca,
                   'F1-Score' : f1_best_pca,
                   'CV' : cv_best_pca} 
            display(ris)
            metrics_pca.append(ris, ignore_index=True) 
            
display(metrics_df)
display(metrics_pca)
metrics_df.to_csv('Results.csv')
metrics_pca.to_csv('Results_pca.csv')

Colonna:Eritromicina
Modello:Logistic Regression
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'solver': 'liblinear', 'penalty': 'l2', 'intercept_scaling': 1, 'fit_intercept': False, 'class_weight': 'balanced', 'C': 0.03359818286283781}


{'Target': 'Eritromicina',
 'Model': 'Logistic Regression_PCA',
 'Accuracy': 0.6451612903225806,
 'Precision': 0.4162330905306972,
 'Recall': 0.6451612903225806,
 'F1-Score': 0.5060088551549652,
 'CV': 0.512}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'solver': 'liblinear', 'penalty': 'l1', 'intercept_scaling': 1, 'fit_intercept': False, 'class_weight': None, 'C': 545.5594781168514}


{'Target': 'Eritromicina',
 'Model': 'Logistic Regression_Best_PCA',
 'Accuracy': 0.7096774193548387,
 'Precision': 0.7926661152467604,
 'Recall': 0.7096774193548387,
 'F1-Score': 0.7139245626453636,
 'CV': 0.5933333333333333}

Modello:Decision Tree
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'min_samples_split': 9, 'max_depth': 8, 'class_weight': None}


{'Target': 'Eritromicina',
 'Model': 'Decision Tree_PCA',
 'Accuracy': 0.6129032258064516,
 'Precision': 0.6494623655913978,
 'Recall': 0.6129032258064516,
 'F1-Score': 0.6211745244003308,
 'CV': 0.5536666666666668}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'min_samples_split': 14, 'max_depth': 11, 'class_weight': None}


{'Target': 'Eritromicina',
 'Model': 'Decision Tree_Best_PCA',
 'Accuracy': 0.6129032258064516,
 'Precision': 0.7057159026598755,
 'Recall': 0.6129032258064516,
 'F1-Score': 0.6161290322580646,
 'CV': 0.6100000000000001}

Modello:K-nn
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'weights': 'uniform', 'n_neighbors': 14, 'algorithm': 'kd_tree'}


{'Target': 'Eritromicina',
 'Model': 'K-nn_PCA',
 'Accuracy': 0.4838709677419355,
 'Precision': 0.5034463744141164,
 'Recall': 0.4838709677419355,
 'F1-Score': 0.4917940011318619,
 'CV': 0.5116666666666667}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'weights': 'distance', 'n_neighbors': 1, 'algorithm': 'brute'}


{'Target': 'Eritromicina',
 'Model': 'K-nn_Best_PCA',
 'Accuracy': 0.6451612903225806,
 'Precision': 0.6707779886148008,
 'Recall': 0.6451612903225806,
 'F1-Score': 0.6520662598081952,
 'CV': 0.5843333333333333}

Modello:Random Forest
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 48, 'min_samples_split': 7, 'max_depth': 12, 'criterion': 'entropy', 'class_weight': 'balanced'}


{'Target': 'Eritromicina',
 'Model': 'Random Forest_PCA',
 'Accuracy': 0.6129032258064516,
 'Precision': 0.6494623655913978,
 'Recall': 0.6129032258064516,
 'F1-Score': 0.6211745244003308,
 'CV': 0.528}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 87, 'min_samples_split': 3, 'max_depth': 7, 'criterion': 'gini', 'class_weight': None}


{'Target': 'Eritromicina',
 'Model': 'Random Forest_Best_PCA',
 'Accuracy': 0.6774193548387096,
 'Precision': 0.74085117918135,
 'Recall': 0.6774193548387096,
 'F1-Score': 0.6835185687178096,
 'CV': 0.5693333333333334}

Colonna:Tetraciclina
Modello:Logistic Regression
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'solver': 'lbfgs', 'penalty': 'none', 'intercept_scaling': 0.5, 'fit_intercept': True, 'class_weight': 'balanced', 'C': 0.012742749857031334}


{'Target': 'Tetraciclina',
 'Model': 'Logistic Regression_PCA',
 'Accuracy': 0.3548387096774194,
 'Precision': 0.1259105098855359,
 'Recall': 0.3548387096774194,
 'F1-Score': 0.1858678955453149,
 'CV': 0.512}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'solver': 'liblinear', 'penalty': 'l2', 'intercept_scaling': 2, 'fit_intercept': False, 'class_weight': None, 'C': 206.913808111479}


{'Target': 'Tetraciclina',
 'Model': 'Logistic Regression_Best_PCA',
 'Accuracy': 0.7419354838709677,
 'Precision': 0.7419354838709677,
 'Recall': 0.7419354838709677,
 'F1-Score': 0.7419354838709679,
 'CV': 0.6413333333333333}

Modello:Decision Tree
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'min_samples_split': 4, 'max_depth': 7, 'class_weight': None}


{'Target': 'Tetraciclina',
 'Model': 'Decision Tree_PCA',
 'Accuracy': 0.6129032258064516,
 'Precision': 0.6297215329473395,
 'Recall': 0.6129032258064516,
 'F1-Score': 0.6188455008488963,
 'CV': 0.6096666666666666}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'min_samples_split': 13, 'max_depth': 1, 'class_weight': None}


{'Target': 'Tetraciclina',
 'Model': 'Decision Tree_Best_PCA',
 'Accuracy': 0.5483870967741935,
 'Precision': 0.8012903225806451,
 'Recall': 0.5483870967741935,
 'F1-Score': 0.5146126275158533,
 'CV': 0.6900000000000001}

Modello:K-nn
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'weights': 'distance', 'n_neighbors': 3, 'algorithm': 'ball_tree'}


{'Target': 'Tetraciclina',
 'Model': 'K-nn_PCA',
 'Accuracy': 0.6774193548387096,
 'Precision': 0.74085117918135,
 'Recall': 0.6774193548387096,
 'F1-Score': 0.6835185687178096,
 'CV': 0.6743333333333335}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'weights': 'distance', 'n_neighbors': 10, 'algorithm': 'ball_tree'}


{'Target': 'Tetraciclina',
 'Model': 'K-nn_Best_PCA',
 'Accuracy': 0.6451612903225806,
 'Precision': 0.7233250620347395,
 'Recall': 0.6451612903225806,
 'F1-Score': 0.6503522432332222,
 'CV': 0.6736666666666667}

Modello:Random Forest
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 53, 'min_samples_split': 2, 'max_depth': 10, 'criterion': 'entropy', 'class_weight': 'balanced'}


{'Target': 'Tetraciclina',
 'Model': 'Random Forest_PCA',
 'Accuracy': 0.6451612903225806,
 'Precision': 0.6707779886148008,
 'Recall': 0.6451612903225806,
 'F1-Score': 0.6520662598081952,
 'CV': 0.6743333333333333}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 39, 'min_samples_split': 10, 'max_depth': 10, 'criterion': 'entropy', 'class_weight': None}


{'Target': 'Tetraciclina',
 'Model': 'Random Forest_Best_PCA',
 'Accuracy': 0.6129032258064516,
 'Precision': 0.6297215329473395,
 'Recall': 0.6129032258064516,
 'F1-Score': 0.6188455008488963,
 'CV': 0.683}

Colonna:Gentamicina
Modello:Logistic Regression
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'solver': 'saga', 'penalty': 'none', 'intercept_scaling': 1, 'fit_intercept': False, 'class_weight': None, 'C': 4.281332398719396}


{'Target': 'Gentamicina',
 'Model': 'Logistic Regression_PCA',
 'Accuracy': 0.5483870967741935,
 'Precision': 0.3007284079084287,
 'Recall': 0.5483870967741935,
 'F1-Score': 0.38844086021505375,
 'CV': 0.6423333333333333}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'solver': 'liblinear', 'penalty': 'l1', 'intercept_scaling': 1, 'fit_intercept': True, 'class_weight': 'balanced', 'C': 10000.0}


{'Target': 'Gentamicina',
 'Model': 'Logistic Regression_Best_PCA',
 'Accuracy': 0.6451612903225806,
 'Precision': 0.6727504244482173,
 'Recall': 0.6451612903225806,
 'F1-Score': 0.6414534668149797,
 'CV': 0.651}

Modello:Decision Tree
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'min_samples_split': 7, 'max_depth': 4, 'class_weight': None}


{'Target': 'Gentamicina',
 'Model': 'Decision Tree_PCA',
 'Accuracy': 0.45161290322580644,
 'Precision': 0.4548387096774194,
 'Recall': 0.45161290322580644,
 'F1-Score': 0.4527589577645195,
 'CV': 0.626}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'min_samples_split': 7, 'max_depth': 4, 'class_weight': None}


{'Target': 'Gentamicina',
 'Model': 'Decision Tree_Best_PCA',
 'Accuracy': 0.5161290322580645,
 'Precision': 0.49983708048224174,
 'Recall': 0.5161290322580645,
 'F1-Score': 0.4945517315783795,
 'CV': 0.659}

Modello:K-nn
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'weights': 'uniform', 'n_neighbors': 12, 'algorithm': 'auto'}


{'Target': 'Gentamicina',
 'Model': 'K-nn_PCA',
 'Accuracy': 0.6129032258064516,
 'Precision': 0.6097906055461234,
 'Recall': 0.6129032258064516,
 'F1-Score': 0.608767576509512,
 'CV': 0.6186666666666667}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'weights': 'uniform', 'n_neighbors': 13, 'algorithm': 'auto'}


{'Target': 'Gentamicina',
 'Model': 'K-nn_Best_PCA',
 'Accuracy': 0.7096774193548387,
 'Precision': 0.7123167155425218,
 'Recall': 0.7096774193548387,
 'F1-Score': 0.7040278988666085,
 'CV': 0.6676666666666666}

Modello:Random Forest
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 77, 'min_samples_split': 13, 'max_depth': 13, 'criterion': 'entropy', 'class_weight': None}


{'Target': 'Gentamicina',
 'Model': 'Random Forest_PCA',
 'Accuracy': 0.5483870967741935,
 'Precision': 0.5483870967741935,
 'Recall': 0.5483870967741935,
 'F1-Score': 0.5483870967741935,
 'CV': 0.6663333333333334}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 60, 'min_samples_split': 4, 'max_depth': 5, 'criterion': 'entropy', 'class_weight': None}


{'Target': 'Gentamicina',
 'Model': 'Random Forest_Best_PCA',
 'Accuracy': 0.5806451612903226,
 'Precision': 0.5783016266887234,
 'Recall': 0.5806451612903226,
 'F1-Score': 0.5788701143539854,
 'CV': 0.6910000000000001}

Colonna:Clindamicina
Modello:Logistic Regression
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'solver': 'lbfgs', 'penalty': 'none', 'intercept_scaling': 0.5, 'fit_intercept': True, 'class_weight': None, 'C': 4.281332398719396}


{'Target': 'Clindamicina',
 'Model': 'Logistic Regression_PCA',
 'Accuracy': 0.6774193548387096,
 'Precision': 0.4588969823100936,
 'Recall': 0.6774193548387096,
 'F1-Score': 0.5471464019851118,
 'CV': 0.748}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'solver': 'saga', 'penalty': 'l2', 'intercept_scaling': 0.5, 'fit_intercept': True, 'class_weight': 'balanced', 'C': 78.47599703514607}


{'Target': 'Clindamicina',
 'Model': 'Logistic Regression_Best_PCA',
 'Accuracy': 0.7096774193548387,
 'Precision': 0.6923963133640554,
 'Recall': 0.7096774193548387,
 'F1-Score': 0.6937381404174573,
 'CV': 0.8046666666666666}

Modello:Decision Tree
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'min_samples_split': 2, 'max_depth': 7, 'class_weight': 'balanced'}


{'Target': 'Clindamicina',
 'Model': 'Decision Tree_PCA',
 'Accuracy': 0.7741935483870968,
 'Precision': 0.7667050691244239,
 'Recall': 0.7741935483870968,
 'F1-Score': 0.7617963314358002,
 'CV': 0.7136666666666668}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'min_samples_split': 4, 'max_depth': 1, 'class_weight': 'balanced'}


{'Target': 'Clindamicina',
 'Model': 'Decision Tree_Best_PCA',
 'Accuracy': 0.7741935483870968,
 'Precision': 0.7667050691244239,
 'Recall': 0.7741935483870968,
 'F1-Score': 0.7617963314358002,
 'CV': 0.8783333333333333}

Modello:K-nn
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'weights': 'distance', 'n_neighbors': 10, 'algorithm': 'kd_tree'}


{'Target': 'Clindamicina',
 'Model': 'K-nn_PCA',
 'Accuracy': 0.7096774193548387,
 'Precision': 0.6923963133640554,
 'Recall': 0.7096774193548387,
 'F1-Score': 0.6937381404174573,
 'CV': 0.8859999999999999}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'weights': 'distance', 'n_neighbors': 13, 'algorithm': 'brute'}


{'Target': 'Clindamicina',
 'Model': 'K-nn_Best_PCA',
 'Accuracy': 0.7419354838709677,
 'Precision': 0.7437275985663083,
 'Recall': 0.7419354838709677,
 'F1-Score': 0.7027649769585254,
 'CV': 0.8939999999999999}

Modello:Random Forest
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 62, 'min_samples_split': 13, 'max_depth': 5, 'criterion': 'entropy', 'class_weight': 'balanced'}


{'Target': 'Clindamicina',
 'Model': 'Random Forest_PCA',
 'Accuracy': 0.7741935483870968,
 'Precision': 0.7791563275434243,
 'Recall': 0.7741935483870968,
 'F1-Score': 0.7485701212537176,
 'CV': 0.8696666666666667}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 75, 'min_samples_split': 4, 'max_depth': 14, 'criterion': 'entropy', 'class_weight': None}


{'Target': 'Clindamicina',
 'Model': 'Random Forest_Best_PCA',
 'Accuracy': 0.7741935483870968,
 'Precision': 0.7667050691244239,
 'Recall': 0.7741935483870968,
 'F1-Score': 0.7617963314358002,
 'CV': 0.8780000000000001}

Colonna:Enrofloxacin
Modello:Logistic Regression
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'solver': 'sag', 'penalty': 'l2', 'intercept_scaling': 1, 'fit_intercept': False, 'class_weight': None, 'C': 29.763514416313132}


{'Target': 'Enrofloxacin',
 'Model': 'Logistic Regression_PCA',
 'Accuracy': 0.6451612903225806,
 'Precision': 0.4162330905306972,
 'Recall': 0.6451612903225806,
 'F1-Score': 0.5060088551549652,
 'CV': 0.6746666666666666}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'solver': 'saga', 'penalty': 'l1', 'intercept_scaling': 0.5, 'fit_intercept': False, 'class_weight': 'balanced', 'C': 0.0001}


{'Target': 'Enrofloxacin',
 'Model': 'Logistic Regression_Best_PCA',
 'Accuracy': 0.6451612903225806,
 'Precision': 0.4162330905306972,
 'Recall': 0.6451612903225806,
 'F1-Score': 0.5060088551549652,
 'CV': 0.6746666666666666}

Modello:Decision Tree
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'min_samples_split': 4, 'max_depth': 1, 'class_weight': None}


{'Target': 'Enrofloxacin',
 'Model': 'Decision Tree_PCA',
 'Accuracy': 0.7419354838709677,
 'Precision': 0.7419354838709677,
 'Recall': 0.7419354838709677,
 'F1-Score': 0.7419354838709679,
 'CV': 0.5443333333333333}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'min_samples_split': 14, 'max_depth': 10, 'class_weight': None}


{'Target': 'Enrofloxacin',
 'Model': 'Decision Tree_Best_PCA',
 'Accuracy': 0.6451612903225806,
 'Precision': 0.6382488479262672,
 'Recall': 0.6451612903225806,
 'F1-Score': 0.6410400509535049,
 'CV': 0.6426666666666667}

Modello:K-nn
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'weights': 'distance', 'n_neighbors': 8, 'algorithm': 'kd_tree'}


{'Target': 'Enrofloxacin',
 'Model': 'K-nn_PCA',
 'Accuracy': 0.7096774193548387,
 'Precision': 0.7010752688172042,
 'Recall': 0.7096774193548387,
 'F1-Score': 0.683111954459203,
 'CV': 0.7083333333333334}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'weights': 'uniform', 'n_neighbors': 6, 'algorithm': 'ball_tree'}


{'Target': 'Enrofloxacin',
 'Model': 'K-nn_Best_PCA',
 'Accuracy': 0.6774193548387096,
 'Precision': 0.674347158218126,
 'Recall': 0.6774193548387096,
 'F1-Score': 0.6121351766513058,
 'CV': 0.716}

Modello:Random Forest
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 35, 'min_samples_split': 12, 'max_depth': 7, 'criterion': 'entropy', 'class_weight': None}


{'Target': 'Enrofloxacin',
 'Model': 'Random Forest_PCA',
 'Accuracy': 0.7096774193548387,
 'Precision': 0.7010752688172042,
 'Recall': 0.7096774193548387,
 'F1-Score': 0.683111954459203,
 'CV': 0.683}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 95, 'min_samples_split': 10, 'max_depth': 14, 'criterion': 'gini', 'class_weight': 'balanced'}


{'Target': 'Enrofloxacin',
 'Model': 'Random Forest_Best_PCA',
 'Accuracy': 0.6451612903225806,
 'Precision': 0.626227208976157,
 'Recall': 0.6451612903225806,
 'F1-Score': 0.6295258025032574,
 'CV': 0.7}

Colonna:ANT(6)-Ia
Modello:Logistic Regression
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'solver': 'saga', 'penalty': 'none', 'intercept_scaling': 0.5, 'fit_intercept': True, 'class_weight': 'balanced', 'C': 0.08858667904100823}


{'Target': 'ANT(6)-Ia',
 'Model': 'Logistic Regression_PCA',
 'Accuracy': 0.8064516129032258,
 'Precision': 0.6503642039542143,
 'Recall': 0.8064516129032258,
 'F1-Score': 0.7200460829493087,
 'CV': 0.8293333333333333}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'solver': 'sag', 'penalty': 'none', 'intercept_scaling': 1, 'fit_intercept': True, 'class_weight': 'balanced', 'C': 1.623776739188721}


{'Target': 'ANT(6)-Ia',
 'Model': 'Logistic Regression_Best_PCA',
 'Accuracy': 0.7741935483870968,
 'Precision': 0.7598014888337469,
 'Recall': 0.7741935483870968,
 'F1-Score': 0.7661434075096314,
 'CV': 0.8696666666666667}

Modello:Decision Tree
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'min_samples_split': 3, 'max_depth': 1, 'class_weight': None}


{'Target': 'ANT(6)-Ia',
 'Model': 'Decision Tree_PCA',
 'Accuracy': 0.7419354838709677,
 'Precision': 0.7419354838709677,
 'Recall': 0.7419354838709677,
 'F1-Score': 0.7419354838709676,
 'CV': 0.7886666666666667}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'min_samples_split': 14, 'max_depth': 9, 'class_weight': None}


{'Target': 'ANT(6)-Ia',
 'Model': 'Decision Tree_Best_PCA',
 'Accuracy': 0.7419354838709677,
 'Precision': 0.7419354838709677,
 'Recall': 0.7419354838709677,
 'F1-Score': 0.7419354838709676,
 'CV': 0.8463333333333333}

Modello:K-nn
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'weights': 'distance', 'n_neighbors': 9, 'algorithm': 'kd_tree'}


{'Target': 'ANT(6)-Ia',
 'Model': 'K-nn_PCA',
 'Accuracy': 0.7741935483870968,
 'Precision': 0.7269585253456221,
 'Recall': 0.7741935483870968,
 'F1-Score': 0.7429498884154999,
 'CV': 0.8946666666666667}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'weights': 'distance', 'n_neighbors': 7, 'algorithm': 'ball_tree'}


{'Target': 'ANT(6)-Ia',
 'Model': 'K-nn_Best_PCA',
 'Accuracy': 0.7741935483870968,
 'Precision': 0.7269585253456221,
 'Recall': 0.7741935483870968,
 'F1-Score': 0.7429498884154999,
 'CV': 0.8946666666666667}

Modello:Random Forest
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 95, 'min_samples_split': 2, 'max_depth': 2, 'criterion': 'gini', 'class_weight': None}


{'Target': 'ANT(6)-Ia',
 'Model': 'Random Forest_PCA',
 'Accuracy': 0.8064516129032258,
 'Precision': 0.764182424916574,
 'Recall': 0.8064516129032258,
 'F1-Score': 0.7652329749103943,
 'CV': 0.8373333333333333}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 84, 'min_samples_split': 10, 'max_depth': 1, 'criterion': 'entropy', 'class_weight': 'balanced'}


{'Target': 'ANT(6)-Ia',
 'Model': 'Random Forest_Best_PCA',
 'Accuracy': 0.7419354838709677,
 'Precision': 0.7419354838709677,
 'Recall': 0.7419354838709677,
 'F1-Score': 0.7419354838709676,
 'CV': 0.8700000000000001}

Colonna:ErmB
Modello:Logistic Regression
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'solver': 'liblinear', 'penalty': 'l2', 'intercept_scaling': 2, 'fit_intercept': True, 'class_weight': None, 'C': 206.913808111479}


{'Target': 'ErmB',
 'Model': 'Logistic Regression_PCA',
 'Accuracy': 0.8387096774193549,
 'Precision': 0.7034339229968783,
 'Recall': 0.8387096774193549,
 'F1-Score': 0.7651386530843237,
 'CV': 0.8133333333333335}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'solver': 'lbfgs', 'penalty': 'l2', 'intercept_scaling': 0.5, 'fit_intercept': True, 'class_weight': None, 'C': 0.615848211066026}


{'Target': 'ErmB',
 'Model': 'Logistic Regression_Best_PCA',
 'Accuracy': 0.8387096774193549,
 'Precision': 0.7034339229968783,
 'Recall': 0.8387096774193549,
 'F1-Score': 0.7651386530843237,
 'CV': 0.8133333333333335}

Modello:Decision Tree
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'min_samples_split': 9, 'max_depth': 1, 'class_weight': None}


{'Target': 'ErmB',
 'Model': 'Decision Tree_PCA',
 'Accuracy': 0.8387096774193549,
 'Precision': 0.8261648745519713,
 'Recall': 0.8387096774193549,
 'F1-Score': 0.8312707107594509,
 'CV': 0.8213333333333335}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'min_samples_split': 9, 'max_depth': 8, 'class_weight': None}


{'Target': 'ErmB',
 'Model': 'Decision Tree_Best_PCA',
 'Accuracy': 0.8709677419354839,
 'Precision': 0.8709677419354839,
 'Recall': 0.8709677419354839,
 'F1-Score': 0.8709677419354839,
 'CV': 0.8370000000000001}

Modello:K-nn
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'weights': 'distance', 'n_neighbors': 8, 'algorithm': 'brute'}


{'Target': 'ErmB',
 'Model': 'K-nn_PCA',
 'Accuracy': 0.7419354838709677,
 'Precision': 0.6889400921658987,
 'Recall': 0.7419354838709677,
 'F1-Score': 0.7144563918757467,
 'CV': 0.8460000000000001}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'weights': 'distance', 'n_neighbors': 9, 'algorithm': 'ball_tree'}


{'Target': 'ErmB',
 'Model': 'K-nn_Best_PCA',
 'Accuracy': 0.7419354838709677,
 'Precision': 0.6889400921658987,
 'Recall': 0.7419354838709677,
 'F1-Score': 0.7144563918757467,
 'CV': 0.8623333333333333}

Modello:Random Forest
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 43, 'min_samples_split': 12, 'max_depth': 2, 'criterion': 'entropy', 'class_weight': 'balanced'}


{'Target': 'ErmB',
 'Model': 'Random Forest_PCA',
 'Accuracy': 0.8387096774193549,
 'Precision': 0.8261648745519713,
 'Recall': 0.8387096774193549,
 'F1-Score': 0.8312707107594509,
 'CV': 0.8296666666666667}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 13, 'min_samples_split': 13, 'max_depth': 2, 'criterion': 'entropy', 'class_weight': 'balanced'}


{'Target': 'ErmB',
 'Model': 'Random Forest_Best_PCA',
 'Accuracy': 0.8387096774193549,
 'Precision': 0.8522580645161291,
 'Recall': 0.8387096774193549,
 'F1-Score': 0.8444597780461158,
 'CV': 0.8623333333333333}

Colonna:lsaE
Modello:Logistic Regression
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'solver': 'saga', 'penalty': 'none', 'intercept_scaling': 1, 'fit_intercept': False, 'class_weight': 'balanced', 'C': 0.004832930238571752}


{'Target': 'lsaE',
 'Model': 'Logistic Regression_PCA',
 'Accuracy': 0.7419354838709677,
 'Precision': 0.5504682622268471,
 'Recall': 0.7419354838709677,
 'F1-Score': 0.6320191158900836,
 'CV': 0.7966666666666666}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'solver': 'sag', 'penalty': 'none', 'intercept_scaling': 2, 'fit_intercept': True, 'class_weight': 'balanced', 'C': 0.0001}


{'Target': 'lsaE',
 'Model': 'Logistic Regression_Best_PCA',
 'Accuracy': 0.7419354838709677,
 'Precision': 0.7225806451612904,
 'Recall': 0.7419354838709677,
 'F1-Score': 0.7288786482334868,
 'CV': 0.8936666666666666}

Modello:Decision Tree
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'min_samples_split': 9, 'max_depth': 14, 'class_weight': None}


{'Target': 'lsaE',
 'Model': 'Decision Tree_PCA',
 'Accuracy': 0.8064516129032258,
 'Precision': 0.7980884109916367,
 'Recall': 0.8064516129032258,
 'F1-Score': 0.7819354838709678,
 'CV': 0.8283333333333334}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'min_samples_split': 3, 'max_depth': 1, 'class_weight': 'balanced'}


{'Target': 'lsaE',
 'Model': 'Decision Tree_Best_PCA',
 'Accuracy': 0.7741935483870968,
 'Precision': 0.7540942928039702,
 'Recall': 0.7741935483870968,
 'F1-Score': 0.7550514002126906,
 'CV': 0.8856666666666666}

Modello:K-nn
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'weights': 'distance', 'n_neighbors': 3, 'algorithm': 'ball_tree'}


{'Target': 'lsaE',
 'Model': 'K-nn_PCA',
 'Accuracy': 0.7096774193548387,
 'Precision': 0.6979646697388632,
 'Recall': 0.7096774193548387,
 'F1-Score': 0.7030885380919698,
 'CV': 0.9346666666666665}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'weights': 'uniform', 'n_neighbors': 5, 'algorithm': 'kd_tree'}


{'Target': 'lsaE',
 'Model': 'K-nn_Best_PCA',
 'Accuracy': 0.7096774193548387,
 'Precision': 0.6979646697388632,
 'Recall': 0.7096774193548387,
 'F1-Score': 0.7030885380919698,
 'CV': 0.9346666666666665}

Modello:Random Forest
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 80, 'min_samples_split': 3, 'max_depth': 10, 'criterion': 'gini', 'class_weight': None}


{'Target': 'lsaE',
 'Model': 'Random Forest_PCA',
 'Accuracy': 0.7741935483870968,
 'Precision': 0.7540942928039702,
 'Recall': 0.7741935483870968,
 'F1-Score': 0.7550514002126906,
 'CV': 0.9186666666666665}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 66, 'min_samples_split': 4, 'max_depth': 7, 'criterion': 'gini', 'class_weight': None}


{'Target': 'lsaE',
 'Model': 'Random Forest_Best_PCA',
 'Accuracy': 0.7419354838709677,
 'Precision': 0.7060931899641577,
 'Recall': 0.7419354838709677,
 'F1-Score': 0.7092473118279571,
 'CV': 0.9183333333333333}

Colonna:tetM
Modello:Logistic Regression
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'solver': 'saga', 'penalty': 'l2', 'intercept_scaling': 2, 'fit_intercept': False, 'class_weight': None, 'C': 3792.690190732246}


{'Target': 'tetM',
 'Model': 'Logistic Regression_PCA',
 'Accuracy': 0.7096774193548387,
 'Precision': 0.5036420395421436,
 'Recall': 0.7096774193548387,
 'F1-Score': 0.5891661594643944,
 'CV': 0.8456666666666667}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'solver': 'newton-cg', 'penalty': 'none', 'intercept_scaling': 1, 'fit_intercept': True, 'class_weight': None, 'C': 0.03359818286283781}


{'Target': 'tetM',
 'Model': 'Logistic Regression_Best_PCA',
 'Accuracy': 0.6451612903225806,
 'Precision': 0.607741935483871,
 'Recall': 0.6451612903225806,
 'F1-Score': 0.6210020590253946,
 'CV': 0.8620000000000001}

Modello:Decision Tree
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'min_samples_split': 11, 'max_depth': 1, 'class_weight': None}


{'Target': 'tetM',
 'Model': 'Decision Tree_PCA',
 'Accuracy': 0.7419354838709677,
 'Precision': 0.7277265745007679,
 'Recall': 0.7419354838709677,
 'F1-Score': 0.7314165497896213,
 'CV': 0.8296666666666667}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'min_samples_split': 6, 'max_depth': 14, 'class_weight': None}


{'Target': 'tetM',
 'Model': 'Decision Tree_Best_PCA',
 'Accuracy': 0.7419354838709677,
 'Precision': 0.7277265745007679,
 'Recall': 0.7419354838709677,
 'F1-Score': 0.7314165497896213,
 'CV': 0.8543333333333335}

Modello:K-nn
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'weights': 'uniform', 'n_neighbors': 12, 'algorithm': 'ball_tree'}


{'Target': 'tetM',
 'Model': 'K-nn_PCA',
 'Accuracy': 0.7096774193548387,
 'Precision': 0.6708482676224611,
 'Recall': 0.7096774193548387,
 'F1-Score': 0.6686585304096826,
 'CV': 0.8453333333333333}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'weights': 'uniform', 'n_neighbors': 11, 'algorithm': 'auto'}


{'Target': 'tetM',
 'Model': 'K-nn_Best_PCA',
 'Accuracy': 0.7419354838709677,
 'Precision': 0.810752688172043,
 'Recall': 0.7419354838709677,
 'F1-Score': 0.6585607940446649,
 'CV': 0.8456666666666667}

Modello:Random Forest
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 68, 'min_samples_split': 3, 'max_depth': 14, 'criterion': 'gini', 'class_weight': None}


{'Target': 'tetM',
 'Model': 'Random Forest_PCA',
 'Accuracy': 0.8064516129032258,
 'Precision': 0.847926267281106,
 'Recall': 0.8064516129032258,
 'F1-Score': 0.7696774193548387,
 'CV': 0.8459999999999999}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 22, 'min_samples_split': 8, 'max_depth': 2, 'criterion': 'entropy', 'class_weight': None}


{'Target': 'tetM',
 'Model': 'Random Forest_Best_PCA',
 'Accuracy': 0.7096774193548387,
 'Precision': 0.5036420395421436,
 'Recall': 0.7096774193548387,
 'F1-Score': 0.5891661594643944,
 'CV': 0.8539999999999999}

Colonna:tetO
Modello:Logistic Regression
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'solver': 'newton-cg', 'penalty': 'l2', 'intercept_scaling': 0.5, 'fit_intercept': False, 'class_weight': 'balanced', 'C': 10000.0}


{'Target': 'tetO',
 'Model': 'Logistic Regression_PCA',
 'Accuracy': 0.7741935483870968,
 'Precision': 0.5993756503642039,
 'Recall': 0.7741935483870968,
 'F1-Score': 0.6756598240469208,
 'CV': 0.732}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'solver': 'liblinear', 'penalty': 'l1', 'intercept_scaling': 0.5, 'fit_intercept': True, 'class_weight': 'balanced', 'C': 78.47599703514607}


{'Target': 'tetO',
 'Model': 'Logistic Regression_Best_PCA',
 'Accuracy': 0.7419354838709677,
 'Precision': 0.7419354838709677,
 'Recall': 0.7419354838709677,
 'F1-Score': 0.7419354838709677,
 'CV': 0.772}

Modello:Decision Tree
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'min_samples_split': 6, 'max_depth': 1, 'class_weight': 'balanced'}


{'Target': 'tetO',
 'Model': 'Decision Tree_PCA',
 'Accuracy': 0.6129032258064516,
 'Precision': 0.6809384164222875,
 'Recall': 0.6129032258064516,
 'F1-Score': 0.638318670576735,
 'CV': 0.747}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'min_samples_split': 5, 'max_depth': 13, 'class_weight': None}


{'Target': 'tetO',
 'Model': 'Decision Tree_Best_PCA',
 'Accuracy': 0.6774193548387096,
 'Precision': 0.7086999022482893,
 'Recall': 0.6774193548387096,
 'F1-Score': 0.6905680224403926,
 'CV': 0.747}

Modello:K-nn
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'weights': 'uniform', 'n_neighbors': 10, 'algorithm': 'ball_tree'}


{'Target': 'tetO',
 'Model': 'K-nn_PCA',
 'Accuracy': 0.6774193548387096,
 'Precision': 0.6774193548387096,
 'Recall': 0.6774193548387096,
 'F1-Score': 0.6774193548387096,
 'CV': 0.8206666666666667}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'weights': 'distance', 'n_neighbors': 10, 'algorithm': 'ball_tree'}


{'Target': 'tetO',
 'Model': 'K-nn_Best_PCA',
 'Accuracy': 0.7096774193548387,
 'Precision': 0.6946236559139786,
 'Recall': 0.7096774193548387,
 'F1-Score': 0.7014736415658076,
 'CV': 0.8286666666666667}

Modello:Random Forest
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 41, 'min_samples_split': 4, 'max_depth': 7, 'criterion': 'entropy', 'class_weight': None}


{'Target': 'tetO',
 'Model': 'Random Forest_PCA',
 'Accuracy': 0.6774193548387096,
 'Precision': 0.6774193548387096,
 'Recall': 0.6774193548387096,
 'F1-Score': 0.6774193548387096,
 'CV': 0.8046666666666666}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 98, 'min_samples_split': 13, 'max_depth': 7, 'criterion': 'entropy', 'class_weight': 'balanced'}


{'Target': 'tetO',
 'Model': 'Random Forest_Best_PCA',
 'Accuracy': 0.7096774193548387,
 'Precision': 0.7242286115007013,
 'Recall': 0.7096774193548387,
 'F1-Score': 0.7162663006177077,
 'CV': 0.8206666666666667}

Colonna:mf3
Modello:Logistic Regression
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'solver': 'lbfgs', 'penalty': 'none', 'intercept_scaling': 1, 'fit_intercept': True, 'class_weight': 'balanced', 'C': 1.623776739188721}


{'Target': 'mf3',
 'Model': 'Logistic Regression_PCA',
 'Accuracy': 0.7096774193548387,
 'Precision': 0.5036420395421436,
 'Recall': 0.7096774193548387,
 'F1-Score': 0.5891661594643944,
 'CV': 0.6666666666666666}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'solver': 'sag', 'penalty': 'none', 'intercept_scaling': 2, 'fit_intercept': True, 'class_weight': None, 'C': 0.00026366508987303583}


{'Target': 'mf3',
 'Model': 'Logistic Regression_Best_PCA',
 'Accuracy': 0.7096774193548387,
 'Precision': 0.5036420395421436,
 'Recall': 0.7096774193548387,
 'F1-Score': 0.5891661594643944,
 'CV': 0.6666666666666666}

Modello:Decision Tree
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'min_samples_split': 6, 'max_depth': 6, 'class_weight': None}


{'Target': 'mf3',
 'Model': 'Decision Tree_PCA',
 'Accuracy': 0.7096774193548387,
 'Precision': 0.684516129032258,
 'Recall': 0.7096774193548387,
 'F1-Score': 0.6899107755662321,
 'CV': 0.634}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'min_samples_split': 6, 'max_depth': 3, 'class_weight': None}


{'Target': 'mf3',
 'Model': 'Decision Tree_Best_PCA',
 'Accuracy': 0.6774193548387096,
 'Precision': 0.6774193548387096,
 'Recall': 0.6774193548387096,
 'F1-Score': 0.6774193548387096,
 'CV': 0.6506666666666667}

Modello:K-nn
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'weights': 'distance', 'n_neighbors': 7, 'algorithm': 'kd_tree'}


{'Target': 'mf3',
 'Model': 'K-nn_PCA',
 'Accuracy': 0.8064516129032258,
 'Precision': 0.8234604105571848,
 'Recall': 0.8064516129032258,
 'F1-Score': 0.811520737327189,
 'CV': 0.6423333333333333}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'weights': 'distance', 'n_neighbors': 11, 'algorithm': 'brute'}


{'Target': 'mf3',
 'Model': 'K-nn_Best_PCA',
 'Accuracy': 0.8064516129032258,
 'Precision': 0.8234604105571848,
 'Recall': 0.8064516129032258,
 'F1-Score': 0.811520737327189,
 'CV': 0.7316666666666667}

Modello:Random Forest
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 72, 'min_samples_split': 11, 'max_depth': 9, 'criterion': 'gini', 'class_weight': 'balanced'}


{'Target': 'mf3',
 'Model': 'Random Forest_PCA',
 'Accuracy': 0.8064516129032258,
 'Precision': 0.8054590570719603,
 'Recall': 0.8064516129032258,
 'F1-Score': 0.7868663594470046,
 'CV': 0.7153333333333334}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 95, 'min_samples_split': 12, 'max_depth': 6, 'criterion': 'entropy', 'class_weight': 'balanced'}


{'Target': 'mf3',
 'Model': 'Random Forest_Best_PCA',
 'Accuracy': 0.7741935483870968,
 'Precision': 0.7612903225806452,
 'Recall': 0.7741935483870968,
 'F1-Score': 0.7588194921070694,
 'CV': 0.7406666666666666}

Colonna:subspecies
Modello:Logistic Regression
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'solver': 'newton-cg', 'penalty': 'none', 'intercept_scaling': 1, 'fit_intercept': True, 'class_weight': None, 'C': 0.0006951927961775605}


{'Target': 'subspecies',
 'Model': 'Logistic Regression_PCA',
 'Accuracy': 0.3225806451612903,
 'Precision': 0.1040582726326743,
 'Recall': 0.3225806451612903,
 'F1-Score': 0.15735641227380015,
 'CV': 0.5393333333333333}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'solver': 'sag', 'penalty': 'none', 'intercept_scaling': 2, 'fit_intercept': False, 'class_weight': None, 'C': 0.0001}


{'Target': 'subspecies',
 'Model': 'Logistic Regression_Best_PCA',
 'Accuracy': 0.967741935483871,
 'Precision': 0.970674486803519,
 'Recall': 0.967741935483871,
 'F1-Score': 0.967269289849935,
 'CV': 0.9433333333333334}

Modello:Decision Tree
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'min_samples_split': 4, 'max_depth': 3, 'class_weight': None}


{'Target': 'subspecies',
 'Model': 'Decision Tree_PCA',
 'Accuracy': 0.9354838709677419,
 'Precision': 0.946236559139785,
 'Recall': 0.9354838709677419,
 'F1-Score': 0.9330400782013685,
 'CV': 0.9109999999999999}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'min_samples_split': 5, 'max_depth': 8, 'class_weight': None}


{'Target': 'subspecies',
 'Model': 'Decision Tree_Best_PCA',
 'Accuracy': 0.9354838709677419,
 'Precision': 0.946236559139785,
 'Recall': 0.9354838709677419,
 'F1-Score': 0.9330400782013685,
 'CV': 0.9276666666666668}

Modello:K-nn
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'weights': 'distance', 'n_neighbors': 3, 'algorithm': 'auto'}


{'Target': 'subspecies',
 'Model': 'K-nn_PCA',
 'Accuracy': 0.967741935483871,
 'Precision': 0.970674486803519,
 'Recall': 0.967741935483871,
 'F1-Score': 0.967269289849935,
 'CV': 0.9263333333333333}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'weights': 'uniform', 'n_neighbors': 1, 'algorithm': 'brute'}


{'Target': 'subspecies',
 'Model': 'K-nn_Best_PCA',
 'Accuracy': 1.0,
 'Precision': 1.0,
 'Recall': 1.0,
 'F1-Score': 1.0,
 'CV': 0.9513333333333334}

Modello:Random Forest
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 35, 'min_samples_split': 4, 'max_depth': 13, 'criterion': 'gini', 'class_weight': 'balanced'}


{'Target': 'subspecies',
 'Model': 'Random Forest_PCA',
 'Accuracy': 1.0,
 'Precision': 1.0,
 'Recall': 1.0,
 'F1-Score': 1.0,
 'CV': 0.9433333333333334}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 34, 'min_samples_split': 3, 'max_depth': 4, 'criterion': 'entropy', 'class_weight': 'balanced'}


{'Target': 'subspecies',
 'Model': 'Random Forest_Best_PCA',
 'Accuracy': 1.0,
 'Precision': 1.0,
 'Recall': 1.0,
 'F1-Score': 1.0,
 'CV': 0.9513333333333331}

Colonna:ST
Modello:Logistic Regression
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'solver': 'lbfgs', 'penalty': 'none', 'intercept_scaling': 0.5, 'fit_intercept': False, 'class_weight': 'balanced', 'C': 0.0018329807108324356}


{'Target': 'ST',
 'Model': 'Logistic Regression_PCA',
 'Accuracy': 0.12903225806451613,
 'Precision': 0.016649323621227886,
 'Recall': 0.12903225806451613,
 'F1-Score': 0.02949308755760369,
 'CV': 0.13}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'solver': 'sag', 'penalty': 'none', 'intercept_scaling': 2, 'fit_intercept': False, 'class_weight': None, 'C': 0.615848211066026}


{'Target': 'ST',
 'Model': 'Logistic Regression_Best_PCA',
 'Accuracy': 0.3548387096774194,
 'Precision': 0.3075268817204301,
 'Recall': 0.3548387096774194,
 'F1-Score': 0.31822836661546333,
 'CV': 0.2753333333333333}

Modello:Decision Tree
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'min_samples_split': 12, 'max_depth': 13, 'class_weight': None}


{'Target': 'ST',
 'Model': 'Decision Tree_PCA',
 'Accuracy': 0.22580645161290322,
 'Precision': 0.2043010752688172,
 'Recall': 0.22580645161290322,
 'F1-Score': 0.20967741935483872,
 'CV': 0.17099999999999999}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'min_samples_split': 14, 'max_depth': 9, 'class_weight': None}


{'Target': 'ST',
 'Model': 'Decision Tree_Best_PCA',
 'Accuracy': 0.3870967741935484,
 'Precision': 0.2790322580645161,
 'Recall': 0.3870967741935484,
 'F1-Score': 0.31505376344086017,
 'CV': 0.187}

Modello:K-nn
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'weights': 'distance', 'n_neighbors': 4, 'algorithm': 'auto'}


{'Target': 'ST',
 'Model': 'K-nn_PCA',
 'Accuracy': 0.3870967741935484,
 'Precision': 0.33064516129032256,
 'Recall': 0.3870967741935484,
 'F1-Score': 0.3405529953917051,
 'CV': 0.284}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'weights': 'uniform', 'n_neighbors': 1, 'algorithm': 'auto'}


{'Target': 'ST',
 'Model': 'K-nn_Best_PCA',
 'Accuracy': 0.3548387096774194,
 'Precision': 0.37634408602150543,
 'Recall': 0.3548387096774194,
 'F1-Score': 0.3471582181259601,
 'CV': 0.29933333333333334}

Modello:Random Forest
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 62, 'min_samples_split': 2, 'max_depth': 6, 'criterion': 'gini', 'class_weight': None}


{'Target': 'ST',
 'Model': 'Random Forest_PCA',
 'Accuracy': 0.3870967741935484,
 'Precision': 0.3548387096774194,
 'Recall': 0.3870967741935484,
 'F1-Score': 0.34715821812596004,
 'CV': 0.364}

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 72, 'min_samples_split': 3, 'max_depth': 14, 'criterion': 'gini', 'class_weight': None}


{'Target': 'ST',
 'Model': 'Random Forest_Best_PCA',
 'Accuracy': 0.3870967741935484,
 'Precision': 0.31648745519713256,
 'Recall': 0.3870967741935484,
 'F1-Score': 0.31903580290677064,
 'CV': 0.381}

Unnamed: 0,Target,Model,Accuracy,Precision,Recall,F1-Score,CV


Unnamed: 0,Target,Model,Accuracy,Precision,Recall,F1-Score,CV


Imports necessary libraries
Loads and preprocesses the data
Defines a function for standard scaling
Defines a function for dimensionality reduction using PCA
Defines functions for training and evaluating the following models:
Logistic Regression
Decision Trees
k-Nearest Neighbors
Random Forest
Lasso Regularization
Ridge Regularization
Defines a function for tuning the hyperparameters of the models using Random Search
Defines a function for saving the metric scores to a CSV file
Defines a function for creating a confusion matrix for each model
Initializes an empty dataframe to store the results
Iterates through the models, training and evaluating them both with and without dimensionality reduction using PCA and standard scaling
Saves the results to a CSV file

First, you will need to import the necessary libraries for data manipulation, model training, and evaluation. These can include pandas, numpy, sklearn and others.

Then, you will need to load the data and split it into training and testing sets. You can use the train_test_split function from sklearn to do this.

Next, you will need to implement the different models, such as logistic regression, decision trees, k-nn, and random forest. You can use the corresponding classes from sklearn to do this.

After that, you will need to implement the dimensionality reduction techniques, such as PCA, Lasso regularization, and Ridge regularization, and use the Standard Scaler to preprocess the data.

After that, you will need to train the models on the original data and the data reduced with PCA, and use the appropriate evaluation metric for classification problem.

Finally, you will need to use GridSearchCV or RandomizedSearchCV for tuning the hyperparameters for each model, and save the results of the evaluation in a CSV file for future reference.

Keep in mind that this is a high-level overview and you may need to add more detail and fine-tuning to the script for it to work correctly.

In [13]:
# Import Lasso from sklearn's linear_model module
from sklearn.linear_model import Lasso

# Initialize Lasso model
lasso = Lasso()

# Define a range of alpha values to test
alphas = np.logspace(-5, 5, 100)

# Create a dictionary containing Lasso hyperparameters to test
param_grid = {'alpha': alphas}

# Use RandomizedSearchCV to optimize Lasso hyperparameters
lasso_regressor = RandomizedSearchCV(lasso, param_grid, cv=5)

# Fit the Lasso model to the data
lasso_regressor.fit(X_train, y_train)

# Print the optimal alpha value
print("Optimal alpha value:", lasso_regressor.best_params_)

# Predict on the test set
y_pred_lasso = lasso_regressor.predict(X_test)

# Print the performance metrics
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred_lasso))
print("Mean Squared Error:", mean_squared_error(y_test, y_pred_lasso))
print("R2 Score:", r2_score(y_test, y_pred_lasso))


ValueError: 
All the 50 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "d:\PycharmProjects\Thesis-Streptococcus-Classification\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\PycharmProjects\Thesis-Streptococcus-Classification\venv\lib\site-packages\sklearn\linear_model\_coordinate_descent.py", line 955, in fit
    X, y = self._validate_data(
  File "d:\PycharmProjects\Thesis-Streptococcus-Classification\venv\lib\site-packages\sklearn\base.py", line 596, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "d:\PycharmProjects\Thesis-Streptococcus-Classification\venv\lib\site-packages\sklearn\utils\validation.py", line 1090, in check_X_y
    y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
  File "d:\PycharmProjects\Thesis-Streptococcus-Classification\venv\lib\site-packages\sklearn\utils\validation.py", line 1115, in _check_y
    y = y.astype(np.float64)
ValueError: could not convert string to float: 'ST231'

--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "d:\PycharmProjects\Thesis-Streptococcus-Classification\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\PycharmProjects\Thesis-Streptococcus-Classification\venv\lib\site-packages\sklearn\linear_model\_coordinate_descent.py", line 955, in fit
    X, y = self._validate_data(
  File "d:\PycharmProjects\Thesis-Streptococcus-Classification\venv\lib\site-packages\sklearn\base.py", line 596, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "d:\PycharmProjects\Thesis-Streptococcus-Classification\venv\lib\site-packages\sklearn\utils\validation.py", line 1090, in check_X_y
    y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
  File "d:\PycharmProjects\Thesis-Streptococcus-Classification\venv\lib\site-packages\sklearn\utils\validation.py", line 1115, in _check_y
    y = y.astype(np.float64)
ValueError: could not convert string to float: 'ST13'
