In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pickle

from scipy.spatial import distance
from sklearn.utils.multiclass import unique_labels
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_samples, silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.cluster import AgglomerativeClustering

from sklearn.metrics import balanced_accuracy_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, StratifiedKFold, KFold, GridSearchCV, RandomizedSearchCV, ParameterGrid
from sklearn.linear_model import LogisticRegression, RidgeClassifier, LassoCV, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.base import clone
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn import model_selection
from sklearn.semi_supervised import LabelPropagation, LabelSpreading

#from mlxtend.classifier import StackingCVClassifier
from sklearn.ensemble import StackingClassifier

import warnings

# Silence every warning for the rest of the session (this also hides pandas /
# scikit-learn deprecation and SettingWithCopy notices raised by later cells).
warnings.filterwarnings("ignore")
# Print numpy arrays with 5 decimals and without scientific notation.
np.set_printoptions(precision=5, suppress=True)


# Seed used by the StratifiedKFold splitter below and by RandomizedSearchCV in makeTuning.
RANDOM_STATE = 46
# Forwarded as n_jobs to cross_validate / RandomizedSearchCV (-1 = all processors in scikit-learn).
N_JOBS = -1

# NOTE(review): this order (Canis, Equisimilis, Dysgalactiae) does not follow the
# integer codes of map_target (0 = equisimilis, 1 = dysgalactiae, 2 = canis) —
# confirm how class_names is indexed where it is used.
class_names = ["Canis", "Dysg. Equisimilis", "Dysg. Dysgalactiae"]

# Text of the "Putative Subspecies" CSV column -> integer class code.
map_target = {
    "Streptococcus canis": 2,
    "Streptococcus dysgalactiae subsp. dysgalactiae": 1,
    "Streptococcus dysgalactiae subsp. equisimilis": 0
}

# Integer class code -> short display name.
map_target_inv = {
    2: "Canis",
    1: "Dysgalactiae",
    0: "Equisimilis"

}
# Value found in the antibiotic columns -> binary code.
# presumably "S" = susceptible and "NS" = non-susceptible — TODO confirm
map_target_antibiotici = {
    "S" : 1,
    "NS" : 0
}

# Binary code -> value found in the antibiotic columns.
map_target_antibiotici_inv = {
    1 : "S",
    0 : "NS"
}

# Applied to the integer returned by the clustering model's predict() before it is
# compared with the subspecies codes.
# NOTE(review): how this cluster-id -> class-code correspondence was derived is not
# visible here — confirm it matches the pickled models.
maps_cluster = {
    2 : 0, 
    1 : 2, 
    0 : 1
}

# scikit-learn scorer names requested by cross_validate and RandomizedSearchCV.
metrics = ['accuracy', 'recall_weighted', 'precision_weighted','f1_weighted']
# Same names as the keys of the dict returned by makeCrossValidationCluster.
metrics_cluster = ['Silhouette', 'Calinski', 'Davies']
# Shared 5-fold stratified, shuffled splitter.
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Positional layout used to slice the CSV with iloc: `start` leading columns, then
# `n` MALDI peak columns, then n_antibiotici, n_geni and n_virulenza columns.
# NOTE: `n` is reassigned by the cell that loads the test set.
start = 9
n_antibiotici = 9
n_geni = 27
n_virulenza = 18
n = 306

In [2]:
# Standardise train and test features with statistics learned on the training set
def standard_scaler(X_train, X_test):
    """Scale both splits with a StandardScaler fitted on the training data only.

    Returns the pair (X_train_scaled, X_test_scaled) exactly as produced by the
    scaler's transform().
    """
    fitted_scaler = StandardScaler().fit(X_train)
    return fitted_scaler.transform(X_train), fitted_scaler.transform(X_test)

# Define a function for dimensionality reduction using PCA
def dimensionality_reduction(X_train, X_test, n_components):
    """Project train and test features onto principal components fitted on the train split.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature tables. Their column labels are converted to ``str`` IN PLACE
        (the caller's frames are modified), presumably so that scikit-learn
        receives uniform string feature names.
    n_components : int | float | str
        Forwarded unchanged to ``PCA(n_components=...)``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Transformed train and test data. Both frames get a fresh ``RangeIndex``
        and integer column labels; the original row index is not carried over.
    """
    X_train.columns = X_train.columns.astype(str)
    X_test.columns = X_test.columns.astype(str)
    # Seed the decomposition like every other stochastic step of the notebook:
    # the randomized SVD solver, when scikit-learn selects it, is otherwise
    # not reproducible between runs.
    pca = PCA(n_components=n_components, random_state=RANDOM_STATE)
    X_train_pca = pd.DataFrame(pca.fit_transform(X_train))
    X_test_pca = pd.DataFrame(pca.transform(X_test))
    return X_train_pca, X_test_pca

def dimensionality_reduction_cluster(X, n_components):
    """Fit a PCA on ``X`` and return the projected data with the same row index.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Its column labels are converted to ``str`` IN PLACE.
    n_components : int | float | str
        Forwarded unchanged to ``PCA(n_components=...)``.

    Returns
    -------
    pandas.DataFrame
        Projected data indexed like ``X`` (the index name is not preserved
        because the index is rebuilt from a plain list), with string column
        labels.

    The shapes of the input and of the projection are printed as a side effect.
    """
    X.columns = X.columns.astype(str)
    print(X.shape)
    # Seeded for reproducibility whenever scikit-learn picks the randomized solver.
    pca = PCA(n_components=n_components, random_state=RANDOM_STATE)
    X_pca = pd.DataFrame(pca.fit_transform(X), index=X.index.to_list())
    print(X_pca.shape)
    X_pca.columns = X_pca.columns.astype(str)
    return X_pca

def makeScoreMeanWithoutNaN(metrics):
    """Replace every array in ``metrics`` by the mean of its non-NaN entries.

    ``metrics`` maps a metric name to a numpy array. The dict is updated in
    place and also returned. For each entry the name, the raw array, the
    NaN-filtered array and the resulting mean are printed, followed by the
    final dict.
    """
    for name in list(metrics):
        raw_values = metrics[name]
        print(name)
        print(raw_values)
        finite_values = raw_values[~np.isnan(raw_values)]
        print(finite_values)
        average = np.mean(finite_values)
        metrics[name] = average
        print(average)
    print(metrics)
    return metrics

In [3]:
def makeScore(y_test, y_pred):
    """Compute the classification scores of one set of predictions.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    dict
        Keys ``'acc'`` (accuracy), ``'b_acc'`` (balanced accuracy), ``'st'``,
        ``'prec'``, ``'rec'`` and ``'f1'`` (the last three weighted by class
        support).

    ``'st'`` is the standard deviation of the single accuracy value and is
    therefore always 0; it is kept so that the dict has the same keys as the
    one returned by makeCrossValidation.
    """
    acc = accuracy_score(y_test, y_pred)
    return {
        'acc': acc,
        'b_acc': balanced_accuracy_score(y_test, y_pred),
        # np.std accepts both numpy scalars and plain Python floats. The previous
        # `acc.std()` raised AttributeError whenever accuracy_score returned a
        # built-in float (recent scikit-learn releases do).
        'st': np.std(acc),
        'prec': precision_score(y_test, y_pred, average='weighted'),
        'rec': recall_score(y_test, y_pred, average='weighted'),
        'f1': f1_score(y_test, y_pred, average='weighted'),
    }

def makeCrossValidation(model, X_train, y_train):
    """Cross-validate ``model`` and summarise the fold scores.

    Uses the notebook-level ``metrics`` scorers, the ``skfold`` splitter and
    ``N_JOBS`` workers. Returns a dict with the mean accuracy (``'acc'``), its
    standard deviation across folds (``'st'``) and the mean weighted precision,
    recall and F1 (``'prec'``, ``'rec'``, ``'f1'``).
    """
    fold_scores = cross_validate(
        estimator=model,
        X=X_train,
        y=y_train,
        scoring=metrics,
        cv=skfold,
        n_jobs=N_JOBS,
        verbose=0,
    )
    fold_accuracy = fold_scores.get('test_accuracy')

    return {
        'acc': fold_accuracy.mean(),
        'st': fold_accuracy.std(),
        'prec': fold_scores.get('test_precision_weighted').mean(),
        'rec': fold_scores.get('test_recall_weighted').mean(),
        'f1': fold_scores.get('test_f1_weighted').mean(),
    }

def makeCrossValidationCluster(model, X):
    """Fit a clustering model on ``X`` and score the resulting partition.

    Despite its name, no cross-validation is performed: ``model`` is fitted
    once on the whole of ``X`` (the object passed in is refitted, so any
    previously learned state is overwritten) and its ``labels_`` are evaluated.
    The three indices are computed on a 2-component PCA projection of ``X``,
    not on the original feature space.

    Returns
    -------
    dict
        ``'Silhouette'``, ``'Calinski'`` (Calinski-Harabasz) and ``'Davies'``
        (Davies-Bouldin) scores.
    """
    model.fit(X)
    labels = model.labels_

    # Seeded so the projection, and hence the scores, are reproducible even when
    # scikit-learn selects the randomized SVD solver.
    X_pca = PCA(n_components=2, random_state=RANDOM_STATE).fit_transform(X)

    return {
        'Silhouette': silhouette_score(X_pca, labels),
        'Calinski': calinski_harabasz_score(X_pca, labels),
        'Davies': davies_bouldin_score(X_pca, labels),
    }

def makeTuning(model, X_train, y_train, name):
    """Tune ``model`` with a randomized search and cross-validate the winner.

    Parameters
    ----------
    model : estimator
        Base estimator to tune.
    X_train, y_train
        Training data handed to the search and to the final cross-validation.
    name : str
        Key of the notebook-level ``param_grid`` dict that holds the parameter
        distributions for this model.

    Returns
    -------
    (estimator, dict)
        An unfitted clone of the best estimator (selected on accuracy) and the
        score dict produced by makeCrossValidation for that estimator.
    """
    rs = RandomizedSearchCV(estimator=model, param_distributions=param_grid[name],
                            scoring=metrics, refit="accuracy", cv=skfold,
                            n_jobs=N_JOBS, random_state=RANDOM_STATE, verbose=0)
    rs.fit(X_train, y_train)

    # The search results, the base model's parameters and the best search score
    # were previously stored in locals that were never read; they are dropped.
    model_best = rs.best_estimator_
    score = makeCrossValidation(model_best, X_train, y_train)

    return clone(model_best), score

def makeTuningCluster(model, X, y, name):
    """Try every parameter set of a clustering model and keep the most accurate one.

    Parameters
    ----------
    model : estimator
        Clustering estimator exposing ``labels_`` after ``fit``.
    X
        Data to cluster.
    y
        Reference labels the cluster labels are compared with.
    name : str
        Key of the notebook-level ``param_grid_cluster`` dict; the value must be
        an iterable of parameter dicts.

    Returns
    -------
    (estimator, dict, dict, array | None)
        * unfitted clone of the best configuration (a clone of ``model`` itself
          when no candidate reaches an accuracy above 0);
        * the makeScore dict of the best candidate (only ``{'acc': 0}`` when no
          candidate reaches an accuracy above 0);
        * the makeCrossValidationCluster scores of the best configuration;
        * the labels of the best candidate, or ``None`` when no candidate
          reaches an accuracy above 0.

    NOTE(review): the raw cluster ids are compared with ``y`` without any
    cluster-to-class remapping — confirm that this is intended.
    """
    params = param_grid_cluster[name]
    max_score = {'acc': 0}
    model_best = clone(model)
    # Defined up front: it used to be assigned only inside the `if` below, so the
    # return statement raised UnboundLocalError when no candidate beat 0 accuracy
    # (or the grid was empty).
    y_pred_max = None

    for param in params:
        model_cl = clone(model)
        model_cl.set_params(**param)
        model_cl.fit(X)
        y_pred = model_cl.labels_
        score = makeScore(y, y_pred)
        if score['acc'] > max_score['acc']:
            max_score = score
            model_best = clone(model_cl)
            y_pred_max = y_pred

    score_cluster = makeCrossValidationCluster(model_best, X)
    return clone(model_best), max_score, score_cluster, y_pred_max

In [4]:
# Number of clusters (one per subspecies code in map_target).
N_CLUSTERS = 3
# Column order imposed on the one-hot encoded "animal species" table of the test
# set; categories missing from the test set are added as all-zero columns.
# presumably the column order seen when the pickled models were fitted — TODO confirm
list_animals = ['Dog', 'Cat', 'Bovine', 'Swine', 'Ovine', 'Goat', 'Hedgehog',
       'Horse', 'Donkey', 'Wolf', 'Reference strain (CCUG)',
       'Water buffalo','Wild boar']
# Same role as list_animals for the one-hot encoded (animal species, haemolysis) table.
list_agg = ['Animal species of origin_Bovine', 'Animal species of origin_Cat',
       'Animal species of origin_Dog', 'Animal species of origin_Donkey',
       'Animal species of origin_Goat', 'Animal species of origin_Hedgehog',
       'Animal species of origin_Horse', 'Animal species of origin_Ovine',
       'Animal species of origin_Reference strain (CCUG)',
       'Animal species of origin_Swine',
       'Animal species of origin_Water buffalo',
       'Animal species of origin_Wolf',
       'Animal species of origin_Wild boar',
       #'LANCEFIELD GROUP_A', 'LANCEFIELD GROUP_C', 'LANCEFIELD GROUP_G',
       'Haemolysis_a', 'Haemolysis_b']
# NOTE(review): not referenced in the visible cells — looks like per-subspecies
# column names for the K-means output; confirm against later cells.
list_subs = ["K-means_Canis", "K-means_Dysgalactiae", "K-means_Equisimilis"]

# Names of the clustering models to evaluate; each name is also part of the
# pickle file name loaded in the evaluation loop.
models_cluster = [
  'K-means',
  #'AgglomerativeClustering'
]

In [5]:
# Load the training table. Forward slashes keep the relative path valid on every
# operating system and avoid the invalid "\d" / "\D" escape sequences of a
# backslash-separated literal.
df = pd.read_csv("../data/Dati_Matemaldomics_" + str(n) + "picchi.csv",
                 delimiter=';', index_col='ID Strain')
df['subspecies'] = df["Putative Subspecies"].map(map_target)

# Metadata columns, selected by position.
animals = df.iloc[:, 2]
feat_agg = df.iloc[:, [2, 8]]
display(feat_agg)
st = df.iloc[:, [4]]
display(st)
subspecies = df[['subspecies']]

# Consecutive column groups: MALDI peaks, antibiotic phenotypes, resistance genes,
# virulence genes. Explicit copies make the later in-place edits act on
# independent frames instead of slices of `df`.
maldi_end = start + n
antibiotici_end = maldi_end + n_antibiotici
geni_end = antibiotici_end + n_geni
virulenza_end = geni_end + n_virulenza

maldi = df.iloc[:, start:maldi_end].copy()
antibiotici = df.iloc[:, maldi_end:antibiotici_end].copy()
geni_antibiotici = df.iloc[:, antibiotici_end:geni_end].copy()
virulenza = df.iloc[:, geni_end:virulenza_end].copy()

# Missing peaks count as 0; intensities use a decimal comma in the CSV.
maldi = maldi.fillna(0).replace(',', '.', regex=True).astype(float)
display(maldi)

targets = {'antibiotici' : antibiotici,
            'geni_antibiotici' : geni_antibiotici,
            'virulenza' : virulenza}

for str_target, target in targets.items():
    columns = target.columns
    unbalanced_columns = []
    for column in columns:
        if str_target == 'antibiotici':
            # Antibiotic phenotypes are stored as "S"/"NS" and need to be encoded.
            target[column] = df[column].map(map_target_antibiotici)
        # Share of samples whose value is 0 for this target.
        rapporto = (target[column] == 0).sum() / target.shape[0]
        print(column+" : "+str(rapporto))
        if rapporto < 0.15 or rapporto > 0.85:
            unbalanced_columns.append(column)
    # Drop heavily unbalanced targets in one go, after the scan; in place so that
    # `antibiotici`, `geni_antibiotici` and `virulenza` reflect the filtering too.
    target.drop(columns=unbalanced_columns, inplace=True)

    display(target)

targets['subspecies'] = subspecies

Unnamed: 0_level_0,Animal species of origin,Haemolysis
ID Strain,Unnamed: 1_level_1,Unnamed: 2_level_1
V13,Dog,b
V142,Dog,b
V151,Dog,b
V160,Dog,b
V161,Cat,b
...,...,...
V800,Bovine,a
V82,Cat,b
V90,Dog,b
V91,Dog,b


Unnamed: 0_level_0,ST
ID Strain,Unnamed: 1_level_1
V13,ST13
V142,ST23
V151,ST95
V160,ST15
V161,ST9
...,...
V800,ST307
V82,ST9
V90,ST13
V91,ST9


Unnamed: 0_level_0,"2021,944237","2043,278686","2057,143278","2064,798679","2071,138797","2085,647901","2103,986922","2117,879078","2129,309534","2143,905333",...,"13617,65054","14104,26499","14945,23828","15048,68998","15154,39575","15353,52046","15399,07159","15495,16655","16076,29338","16202,09535"
ID Strain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
V13,0.000000,0.000066,0.000080,0.0,0.000000,0.000000,0.000086,0.000000,0.000052,0.000047,...,0.0,0.000000,0.000066,0.000142,0.0,0.0,0.000000,0.0,0.0,0.0
V142,0.000000,0.000052,0.000000,0.0,0.000050,0.000045,0.000078,0.000000,0.000047,0.000000,...,0.0,0.000000,0.000085,0.000194,0.0,0.0,0.000000,0.0,0.0,0.0
V151,0.000000,0.000058,0.000000,0.0,0.000060,0.000048,0.000100,0.000000,0.000000,0.000047,...,0.0,0.000000,0.000092,0.000070,0.0,0.0,0.000000,0.0,0.0,0.0
V160,0.000000,0.000059,0.000000,0.0,0.000066,0.000000,0.000089,0.000000,0.000051,0.000000,...,0.0,0.000000,0.000098,0.000136,0.0,0.0,0.000000,0.0,0.0,0.0
V161,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000077,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000089,0.000203,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
V800,0.000056,0.000000,0.000055,0.0,0.000000,0.000060,0.000105,0.000000,0.000000,0.000000,...,0.0,0.000036,0.000073,0.000060,0.0,0.0,0.000000,0.0,0.0,0.0
V82,0.000000,0.000050,0.000043,0.0,0.000000,0.000000,0.000066,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000090,0.000144,0.0,0.0,0.000000,0.0,0.0,0.0
V90,0.000000,0.000052,0.000000,0.0,0.000051,0.000000,0.000089,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000104,0.000137,0.0,0.0,0.000051,0.0,0.0,0.0
V91,0.000000,0.000055,0.000000,0.0,0.000048,0.000000,0.000085,0.000046,0.000000,0.000000,...,0.0,0.000000,0.000093,0.000214,0.0,0.0,0.000000,0.0,0.0,0.0


Eritromicina : 0.461038961038961
Ceftiofur : 0.0
Tetraciclina : 0.5194805194805194
Gentamicina : 0.6233766233766234
Penicillina : 0.0
Ampicillina : 0.0
Sulfametossazolo_trimethoprim : 0.01948051948051948
Clindamicina : 0.2662337662337662
Enrofloxacin : 0.6688311688311688


Unnamed: 0_level_0,Eritromicina,Tetraciclina,Gentamicina,Clindamicina,Enrofloxacin
ID Strain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
V13,0,0,0,1,0
V142,0,1,1,1,0
V151,1,1,0,1,0
V160,1,0,0,1,0
V161,1,1,0,1,0
...,...,...,...,...,...
V800,1,0,1,1,0
V82,1,1,0,1,1
V90,1,0,0,1,0
V91,1,1,0,1,0


aad(6) : 0.935064935064935
ANT(6)-Ia : 0.8246753246753247
APH(2'')-IIIa : 1.0
APH(3')-IIIa : 0.9025974025974026
catS : 0.9675324675324676
dfrF : 0.9805194805194806
E. faecalis chloramphenicol acetyltransferase : 0.9935064935064936
Erm(47) : 0.987012987012987
ErmB : 0.8181818181818182
fexA : 0.9935064935064936
L._reuteri cat-TC : 1.0
lmrP : 0.006493506493506494
lnuC : 0.987012987012987
lnuD : 0.9935064935064936
lsaC : 0.961038961038961
lsaE : 0.7857142857142857
mefE : 0.8506493506493507
optrA : 0.9935064935064936
poxtA : 0.9935064935064936
SAT-4 : 0.922077922077922
tet(40) : 0.987012987012987
tet(L) : 0.9935064935064936
tetM : 0.8181818181818182
tetO : 0.7402597402597403
tetS : 0.9805194805194806
tetT : 0.974025974025974
vatE : 0.9935064935064936


Unnamed: 0_level_0,ANT(6)-Ia,ErmB,lsaE,tetM,tetO
ID Strain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
V13,0,0,0,0,0
V142,0,0,0,0,0
V151,0,0,0,0,0
V160,0,0,0,0,0
V161,0,0,0,0,0
...,...,...,...,...,...
V800,0,0,0,1,0
V82,0,0,0,0,0
V90,0,0,0,0,0
V91,0,0,0,0,0


fbp54 : 0.0
gbs0630 : 0.9935064935064936
gbs0631 : 0.9935064935064936
gbs0632 : 0.9935064935064936
hasC : 0.0
lmb : 0.9935064935064936
mf2 : 0.961038961038961
mf3 : 0.6753246753246753
scpA : 0.9935064935064936
sda : 0.8766233766233766
ska : 0.9935064935064936
slo : 0.9935064935064936
smeZ : 0.9935064935064936
spec : 0.974025974025974
speg : 0.9090909090909091
spek : 0.961038961038961
spel : 0.974025974025974
spem : 0.948051948051948


Unnamed: 0_level_0,mf3
ID Strain,Unnamed: 1_level_1
V13,0
V142,1
V151,0
V160,0
V161,1
...,...
V800,0
V82,1
V90,0
V91,1


In [6]:
# Number of MALDI peak columns in the test CSV.
# NOTE(review): this overwrites the notebook-level `n` set in the first cell;
# the new value is also what ends up in the prediction file names written by the
# evaluation loop — confirm that is intended.
n = 56
# NOTE(review): absolute, machine-specific path — the training CSV is read from a
# relative "data" folder; consider doing the same here.
df_test = pd.read_csv(r'D:\PycharmProjects\Thesis-Streptococcus-Classification\data\test_14record\Nuovi_spettri_iterations100_SNR3_SuperSmooth_0.csv',
                      delimiter=';', index_col='ID Strain')

df_test['subspecies'] = df_test["Putative Subspecies"].map(map_target)
# Remove spaces from the haemolysis labels so the dummy columns get clean names.
df_test['Haemolysis'] = df_test['Haemolysis'].str.replace(" ", "")
# Same positional column layout as the training table.
animals_test = df_test.iloc[:,2]
feat_agg_test = df_test.iloc[:,[2,8]]
display(feat_agg_test)
st_test = df_test.iloc[:,[4]]
display(st_test)
subspecies_test = df_test[['subspecies']]
maldi_test = df_test.iloc[:,start:start+n]
antibiotici_test = df_test.iloc[:,start+n:start+n+n_antibiotici]
geni_antibiotici_test = df_test.iloc[:,start+n+n_antibiotici:start+n+n_antibiotici+n_geni]
virulenza_test = df_test.iloc[:,start+n+n_antibiotici+n_geni:start+n+n_antibiotici+n_geni+n_virulenza]

# Missing peaks count as 0; decimal commas are turned into dots before the float cast.
maldi_test.fillna(0, inplace=True)
maldi_test = maldi_test.replace(',', '.', regex=True)
columns = maldi_test.columns
for column in columns:
    maldi_test[column] = maldi_test[column].astype(float)
display(maldi_test)

# Only the virulence targets are processed for the test set.
targets_test = {#'antibiotici' : antibiotici_test,
            #'geni_antibiotici' : geni_antibiotici_test,
            'virulenza' : virulenza_test}

for str_target,target in targets_test.items():
    columns = target.columns
    for column in columns:
        if str_target == 'antibiotici':
            target[column] = df_test[column].map(map_target_antibiotici)
        # Share of samples whose value is 0 for this target.
        rapporto = (target[column] == 0).sum() / target.shape[0]
        #if (antibiotici[column] == 0).all() or (antibiotici[column] == 1).all():
        print(column+" : "+str(rapporto))
        '''if rapporto < 0.15 or rapporto > 0.85:
            target.drop([column], axis=1, inplace=True)'''
        # NOTE(review): fills the whole table on every iteration, so the ratio of
        # the first column is computed before NaNs become 0 and the ratios of the
        # following columns after — confirm whether the fill should precede the loop.
        target.fillna(0, inplace = True)
    display(target)

targets_test['subspecies'] = subspecies_test

Unnamed: 0_level_0,Animal species of origin,Haemolysis
ID Strain,Unnamed: 1_level_1,Unnamed: 2_level_1
V1046,Swine,b
V1203,Swine,b
V1226,Swine,b
V1285,Wild boar,b
V1398,Swine,b
V1450,Swine,b
V1451,Swine,b
V1524,Swine,b
V234,Bovine,b
V257,Swine,b


Unnamed: 0_level_0,ST
ID Strain,Unnamed: 1_level_1
V1046,ST580
V1203,ST338
V1226,ST685
V1285,ST259
V1398,ST684
V1450,ST397
V1451,ST686
V1524,ST634
V234,ST338
V257,ST-NA


Unnamed: 0_level_0,"2225,446106","2237,371524","2245,272151","2263,964901","2271,177324","2284,555455","2958,224222","2971,796638","2979,800302","2993,666074",...,"7263,911218","7387,713575","7409,087763","7910,100883","7932,755395","8187,083885","8210,870156","9491,927409","9516,723663","10935,95302"
ID Strain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
V1046,0.000327,0.000241,0.000199,0.000139,1.2e-05,6.6e-05,9.2e-05,0.000114,0.000397,0.000286,...,0.000344,0.000409,0.000312,0.000368,0.000264,0.000283,0.000209,0.000308,0.000225,8.7e-05
V1203,0.000316,0.000246,0.000208,0.000163,3.2e-05,0.000113,9.2e-05,0.000111,0.000388,0.000277,...,0.000321,0.000359,0.000271,0.000406,0.000299,0.000343,0.000262,0.000341,0.000257,0.000119
V1226,0.000309,0.000218,0.000203,0.000127,2.3e-05,7.4e-05,8.7e-05,0.000109,0.000383,0.000272,...,0.000292,0.000292,0.000206,0.000326,0.00023,0.000317,0.000233,0.000239,0.000168,8.6e-05
V1285,0.000313,0.000209,0.000206,8.1e-05,7.7e-05,8.1e-05,6.4e-05,8.4e-05,0.000362,0.00025,...,0.000162,0.00042,0.00031,0.000309,0.000224,0.000281,0.000204,0.000225,0.000161,6.9e-05
V1398,0.000283,0.000183,0.000184,0.000124,7.1e-05,9.2e-05,7.2e-05,8.4e-05,0.000361,0.000244,...,0.000261,0.000365,0.000262,0.000298,0.000216,0.000307,0.00023,0.000258,0.00019,9.9e-05
V1450,0.000406,0.000271,0.000261,8e-05,3.8e-05,0.000102,0.000377,0.000267,0.000224,0.000123,...,0.000263,0.000228,0.000159,0.000345,0.000247,0.000288,0.000213,0.000252,0.000184,7.7e-05
V1451,0.000273,0.00018,0.000188,0.000134,1.7e-05,7.2e-05,0.000346,0.000243,0.000207,0.000114,...,0.000215,0.000401,0.000297,0.000332,0.000242,0.000329,0.000244,0.000316,0.000227,0.00011
V1524,0.0004,0.000271,0.000261,9.5e-05,0.000149,0.000156,8.3e-05,0.000103,0.00043,0.0003,...,0.000167,0.00015,9.6e-05,0.000325,0.000238,0.000221,0.000162,0.000176,0.000127,6.4e-05
V234,0.000283,0.000217,0.000172,0.000143,2.6e-05,9.2e-05,8.2e-05,0.000101,0.000352,0.000243,...,0.00034,0.000346,0.00026,0.000393,0.00028,0.000241,0.000173,0.00041,0.000303,7.3e-05
V257,0.000311,0.000206,0.000198,7.1e-05,0.000112,0.000102,6.3e-05,7.5e-05,0.000334,0.000221,...,0.000234,0.000183,0.000134,0.000261,0.000189,0.000169,0.000128,0.000132,9.3e-05,5.5e-05


fbp54 : 0.0
gbs0630 : 1.0
gbs0631 : 1.0
gbs0632 : 1.0
hasC : 0.0
lmb : 1.0
mf2 : 1.0
mf3 : 0.9285714285714286
scpA : 1.0
sda : 0.7857142857142857
ska : 1.0
slo : 1.0
smeZ : 0.9285714285714286
spec : 1.0
speg : 0.9285714285714286
spek : 1.0
spel : 1.0
spem : 1.0


Unnamed: 0_level_0,fbp54,gbs0630,gbs0631,gbs0632,hasC,lmb,mf2,mf3,scpA,sda,ska,slo,smeZ,spec,speg,spek,spel,spem
ID Strain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
V1046,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
V1203,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
V1226,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
V1285,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0
V1398,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
V1450,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
V1451,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
V1524,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0
V234,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
V257,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [7]:
# One-hot encode the categorical metadata of the test set.
animals_dummies = pd.get_dummies(animals_test)
feat_agg_dummies = pd.get_dummies(feat_agg_test)

# Expected categories that never occur in the test set.
missing_cols_animals = set(list_animals).difference(animals_dummies.columns)
print(len(missing_cols_animals))

missing_cols_agg = set(list_agg).difference(feat_agg_dummies.columns)
print(len(missing_cols_agg))

# Add every missing category as an all-zero column, then impose the reference
# column order.
animals_dummies = animals_dummies.assign(
    **{str(label): 0 for label in missing_cols_animals}
)[list_animals]

feat_agg_dummies = feat_agg_dummies.assign(
    **{str(label): 0 for label in missing_cols_agg}
)[list_agg]

display(animals_dummies)
display(feat_agg_dummies)

8
9


Unnamed: 0_level_0,Dog,Cat,Bovine,Swine,Ovine,Goat,Hedgehog,Horse,Donkey,Wolf,Reference strain (CCUG),Water buffalo,Wild boar
ID Strain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
V1046,0,0,0,1,0,0,0,0,0,0,0,0,0
V1203,0,0,0,1,0,0,0,0,0,0,0,0,0
V1226,0,0,0,1,0,0,0,0,0,0,0,0,0
V1285,0,0,0,0,0,0,0,0,0,0,0,0,1
V1398,0,0,0,1,0,0,0,0,0,0,0,0,0
V1450,0,0,0,1,0,0,0,0,0,0,0,0,0
V1451,0,0,0,1,0,0,0,0,0,0,0,0,0
V1524,0,0,0,1,0,0,0,0,0,0,0,0,0
V234,0,0,1,0,0,0,0,0,0,0,0,0,0
V257,0,0,0,1,0,0,0,0,0,0,0,0,0


Unnamed: 0_level_0,Animal species of origin_Bovine,Animal species of origin_Cat,Animal species of origin_Dog,Animal species of origin_Donkey,Animal species of origin_Goat,Animal species of origin_Hedgehog,Animal species of origin_Horse,Animal species of origin_Ovine,Animal species of origin_Reference strain (CCUG),Animal species of origin_Swine,Animal species of origin_Water buffalo,Animal species of origin_Wolf,Animal species of origin_Wild boar,Haemolysis_a,Haemolysis_b
ID Strain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
V1046,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
V1203,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
V1226,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
V1285,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
V1398,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
V1450,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
V1451,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
V1524,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
V234,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
V257,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1


In [8]:
# Peak labels use a decimal comma in the CSV header; switch them to a dot.
col = [str(label.replace(',', '.')) for label in maldi.columns.to_list()]
col_test = [str(label.replace(',', '.')) for label in maldi_test.columns.to_list()]

maldi.columns = col
maldi_test.columns = col_test

# Side-by-side concatenation: rows are the union of the two strain indexes,
# columns are the train peaks followed by the test peaks. Cells with no value
# for a strain are set to 0.
maldi_all = pd.concat([maldi, maldi_test], axis=1)
maldi_all.fillna(0, inplace = True)
maldi_all

Unnamed: 0_level_0,2021.944237,2043.278686,2057.143278,2064.798679,2071.138797,2085.647901,2103.986922,2117.879078,2129.309534,2143.905333,...,7263.911218,7387.713575,7409.087763,7910.100883,7932.755395,8187.083885,8210.870156,9491.927409,9516.723663,10935.95302
ID Strain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
V13,0.0,0.000066,0.00008,0.0,0.000000,0.000000,0.000086,0.0,0.000052,0.000047,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
V142,0.0,0.000052,0.00000,0.0,0.000050,0.000045,0.000078,0.0,0.000047,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
V151,0.0,0.000058,0.00000,0.0,0.000060,0.000048,0.000100,0.0,0.000000,0.000047,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
V160,0.0,0.000059,0.00000,0.0,0.000066,0.000000,0.000089,0.0,0.000051,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
V161,0.0,0.000000,0.00000,0.0,0.000000,0.000000,0.000077,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
V257,0.0,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000234,0.000183,0.000134,0.000261,0.000189,0.000169,0.000128,0.000132,0.000093,0.000055
V767,0.0,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000059,0.000333,0.000239,0.000368,0.000267,0.000288,0.000204,0.000145,0.000101,0.000062
V912,0.0,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000020,0.000097,0.000314,0.000418,0.000313,0.000273,0.000200,0.000204,0.000141,0.000069
V946,0.0,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000068,0.000365,0.000263,0.000376,0.000276,0.000319,0.000242,0.000384,0.000285,0.000104


In [9]:
# Standardise every peak column of the combined (train + test) table.
ss = StandardScaler().fit(maldi_all)
maldi_ss = pd.DataFrame(ss.transform(maldi_all), index = maldi_all.index)

In [10]:
# PCA is fitted on the combined (train + test) table ...
maldi_pca = dimensionality_reduction_cluster(maldi_ss,100)
# ... and only the test strains are kept. They are selected by label rather than
# with the positional slice `[maldi.shape[0]:]`, which silently picks the wrong
# rows if the combined table is ever not ordered "train rows first". A plain
# list is used as key so the resulting index stays unnamed, as before.
maldi_pca = maldi_pca.loc[maldi_test.index.to_list()]

(168, 362)
(168, 100)


In [11]:
# PCA components of the test strains plus, respectively, the animal-species
# dummies and the (animal species, haemolysis) dummies.
df_animals, df_agg = (
    pd.concat([maldi_pca, extra_features], axis=1)
    for extra_features in (animals_dummies, feat_agg_dummies)
)
display(df_animals)
display(df_agg)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Swine,Ovine,Goat,Hedgehog,Horse,Donkey,Wolf,Reference strain (CCUG),Water buffalo,Wild boar
V1046,27.655895,-2.022817,-0.877445,-1.185949,-0.015429,-0.129214,-0.290826,-0.351878,-0.107726,0.115971,...,1,0,0,0,0,0,0,0,0,0
V1203,29.245761,-2.291795,-1.063802,-1.48201,-0.020036,-0.182866,-0.419796,-0.52459,-0.162699,0.179009,...,1,0,0,0,0,0,0,0,0,0
V1226,27.351716,-1.971502,-0.841989,-1.129686,-0.014554,-0.11905,-0.266395,-0.319148,-0.097305,0.104009,...,1,0,0,0,0,0,0,0,0,0
V1285,26.575237,-1.833959,-0.741584,-0.965936,-0.011911,-0.085044,-0.182336,-0.200591,-0.058667,0.057678,...,0,0,0,0,0,0,0,0,0,1
V1398,27.08324,-1.92093,-0.802706,-1.06377,-0.01345,-0.103544,-0.227229,-0.261818,-0.078323,0.080599,...,1,0,0,0,0,0,0,0,0,0
V1450,26.928568,-1.895445,-0.785619,-1.037101,-0.013046,-0.09922,-0.217122,-0.24905,-0.074379,0.076352,...,1,0,0,0,0,0,0,0,0,0
V1451,27.506947,-1.997524,-0.859902,-1.158102,-0.014996,-0.124282,-0.279081,-0.336497,-0.102891,0.110579,...,1,0,0,0,0,0,0,0,0,0
V1524,27.494017,-1.987135,-0.8458,-1.129897,-0.014426,-0.113066,-0.248763,-0.287105,-0.085831,0.087983,...,1,0,0,0,0,0,0,0,0,0
V234,27.299875,-1.963174,-0.836595,-1.121436,-0.014433,-0.117889,-0.263785,-0.316107,-0.096403,0.103116,...,0,0,0,0,0,0,0,0,0,0
V257,24.662523,-1.50958,-0.51614,-0.607159,-0.006312,-0.019305,-0.023877,0.012805,0.009446,-0.020876,...,1,0,0,0,0,0,0,0,0,0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Animal species of origin_Hedgehog,Animal species of origin_Horse,Animal species of origin_Ovine,Animal species of origin_Reference strain (CCUG),Animal species of origin_Swine,Animal species of origin_Water buffalo,Animal species of origin_Wolf,Animal species of origin_Wild boar,Haemolysis_a,Haemolysis_b
V1046,27.655895,-2.022817,-0.877445,-1.185949,-0.015429,-0.129214,-0.290826,-0.351878,-0.107726,0.115971,...,0,0,0,0,1,0,0,0,0,1
V1203,29.245761,-2.291795,-1.063802,-1.48201,-0.020036,-0.182866,-0.419796,-0.52459,-0.162699,0.179009,...,0,0,0,0,1,0,0,0,0,1
V1226,27.351716,-1.971502,-0.841989,-1.129686,-0.014554,-0.11905,-0.266395,-0.319148,-0.097305,0.104009,...,0,0,0,0,1,0,0,0,0,1
V1285,26.575237,-1.833959,-0.741584,-0.965936,-0.011911,-0.085044,-0.182336,-0.200591,-0.058667,0.057678,...,0,0,0,0,0,0,0,1,0,1
V1398,27.08324,-1.92093,-0.802706,-1.06377,-0.01345,-0.103544,-0.227229,-0.261818,-0.078323,0.080599,...,0,0,0,0,1,0,0,0,0,1
V1450,26.928568,-1.895445,-0.785619,-1.037101,-0.013046,-0.09922,-0.217122,-0.24905,-0.074379,0.076352,...,0,0,0,0,1,0,0,0,0,1
V1451,27.506947,-1.997524,-0.859902,-1.158102,-0.014996,-0.124282,-0.279081,-0.336497,-0.102891,0.110579,...,0,0,0,0,1,0,0,0,0,1
V1524,27.494017,-1.987135,-0.8458,-1.129897,-0.014426,-0.113066,-0.248763,-0.287105,-0.085831,0.087983,...,0,0,0,0,1,0,0,0,0,1
V234,27.299875,-1.963174,-0.836595,-1.121436,-0.014433,-0.117889,-0.263785,-0.316107,-0.096403,0.103116,...,0,0,0,0,0,0,0,0,0,1
V257,24.662523,-1.50958,-0.51614,-0.607159,-0.006312,-0.019305,-0.023877,0.012805,0.009446,-0.020876,...,0,0,0,0,1,0,0,0,0,1


In [12]:
# Predictions of every model, one DataFrame per feature table
pred_ensemble_cluster = {}

dfs_cluster = {'maldi' : maldi_pca,
       'animals' : df_animals,
       'agg' : df_agg}

# One row of metrics per (feature table, model). Rows are collected in a list and
# the DataFrame is built once at the end: DataFrame.append was removed in
# pandas 2.0 and growing a frame row by row is quadratic.
metrics_columns = ['Target', 'Dataframe', 'Model', 'Accuracy', 'St. Dev.',
                   'Precision', 'Recall', 'F1-Score', 'Bal. Accuracy',
                   'Silhouette', 'Calinski', 'Davies']
metrics_rows = []
column = 'subspecies'
y = subspecies_test.values
for str_df, X in dfs_cluster.items():
  print('Dataframe: '+str_df)
  pred_ensemble_cluster[str_df] = pd.DataFrame()
  display(X)
  # Iterate over the clustering models under evaluation
  for name in models_cluster:
    print("Modello "+name)
    # SECURITY NOTE: unpickling executes arbitrary code; only load model files
    # that were produced by this project.
    # The context manager closes the file handle, which was previously leaked.
    with open('models_obj/prova_cluster_'+str_df+'_'+name+'.pkl', "rb") as model_file:
      model = pickle.load(model_file)
    # Cluster ids predicted by the stored model, remapped to subspecies codes
    y_pred = model.predict(X)
    y_pred = pd.DataFrame(y_pred,X.index)
    y_pred = y_pred.iloc[:,0].map(maps_cluster)
    pred_ensemble_cluster[str_df][name] = y_pred
    # NOTE: this call refits `model` on X; it runs after predict(), so the
    # predictions above still come from the stored model.
    score_cluster = makeCrossValidationCluster(model, X)
    print(y)
    print(y_pred.values)
    score = makeScore(y, y_pred.values)
    metrics_rows.append({'Target': column,
                         'Dataframe' : str_df,
                         'Model': name,
                         'Accuracy' : score['acc'],
                         'St. Dev.' : score['st'],
                         'Precision' : score['prec'],
                         'Recall' : score['rec'],
                         'F1-Score' : score['f1'],
                         'Bal. Accuracy' : score['b_acc'],
                         'Silhouette' : score_cluster['Silhouette'],
                         'Calinski' : score_cluster['Calinski'],
                         'Davies' : score_cluster['Davies']})
  # Index the predictions by strain id and save them
  pred_ensemble_cluster[str_df].index = maldi_pca.index
  pred_ensemble_cluster[str_df].to_csv('../Risultati/prediction/cluster_'+str_df+'_picchi'+str(n)+'.csv', index = True)

metrics_df_cluster = pd.DataFrame(metrics_rows, columns=metrics_columns)
display(metrics_df_cluster)

Dataframe: maldi


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
V1046,27.655895,-2.022817,-0.877445,-1.185949,-0.015429,-0.129214,-0.290826,-0.351878,-0.107726,0.115971,...,-0.044846,0.37631,0.112413,0.115528,0.062653,-0.09163,0.0045,0.12679,0.044606,0.067876
V1203,29.245761,-2.291795,-1.063802,-1.48201,-0.020036,-0.182866,-0.419796,-0.52459,-0.162699,0.179009,...,0.018554,-0.128528,-0.029731,-0.010256,0.003915,-0.023046,0.00209,0.067286,0.025854,0.04408
V1226,27.351716,-1.971502,-0.841989,-1.129686,-0.014554,-0.11905,-0.266395,-0.319148,-0.097305,0.104009,...,0.118015,-1.02254,-0.313906,-0.336536,-0.185978,0.272793,-0.013005,-0.359119,-0.123783,-0.180414
V1285,26.575237,-1.833959,-0.741584,-0.965936,-0.011911,-0.085044,-0.182336,-0.200591,-0.058667,0.057678,...,-0.02622,0.229809,0.071126,0.076817,0.04231,-0.06108,0.0028,0.075807,0.025639,0.03597
V1398,27.08324,-1.92093,-0.802706,-1.06377,-0.01345,-0.103544,-0.227229,-0.261818,-0.078323,0.080599,...,-0.042799,0.369206,0.112779,0.119362,0.065077,-0.0934,0.004292,0.116589,0.03958,0.056003
V1450,26.928568,-1.895445,-0.785619,-1.037101,-0.013046,-0.09922,-0.217122,-0.24905,-0.074379,0.076352,...,0.054415,-0.42677,-0.11827,-0.100744,-0.045211,0.049098,-0.001454,-0.032209,-0.008966,-0.008127
V1451,27.506947,-1.997524,-0.859902,-1.158102,-0.014996,-0.124282,-0.279081,-0.336497,-0.102891,0.110579,...,-0.018514,0.156622,0.047135,0.049116,0.026905,-0.039829,0.00199,0.056487,0.020013,0.03089
V1524,27.494017,-1.987135,-0.8458,-1.129897,-0.014426,-0.113066,-0.248763,-0.287105,-0.085831,0.087983,...,0.010011,-0.029772,0.007231,0.041585,0.036703,-0.07651,0.004808,0.142984,0.051995,0.081903
V234,27.299875,-1.963174,-0.836595,-1.121436,-0.014433,-0.117889,-0.263785,-0.316107,-0.096403,0.103116,...,-0.006808,0.028147,-0.000137,-0.018075,-0.017198,0.037166,-0.0024,-0.072073,-0.02644,-0.042382
V257,24.662523,-1.50958,-0.51614,-0.607159,-0.006312,-0.019305,-0.023877,0.012805,0.009446,-0.020876,...,-0.007211,0.030766,-0.000134,-0.021501,-0.021401,0.048719,-0.003329,-0.10174,-0.037821,-0.061806


Modello K-means
[[0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [2]
 [2]
 [0]
 [0]]
[2 2 2 2 2 2 2 2 2 2 2 2 2 2]
Dataframe: animals


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Swine,Ovine,Goat,Hedgehog,Horse,Donkey,Wolf,Reference strain (CCUG),Water buffalo,Wild boar
V1046,27.655895,-2.022817,-0.877445,-1.185949,-0.015429,-0.129214,-0.290826,-0.351878,-0.107726,0.115971,...,1,0,0,0,0,0,0,0,0,0
V1203,29.245761,-2.291795,-1.063802,-1.48201,-0.020036,-0.182866,-0.419796,-0.52459,-0.162699,0.179009,...,1,0,0,0,0,0,0,0,0,0
V1226,27.351716,-1.971502,-0.841989,-1.129686,-0.014554,-0.11905,-0.266395,-0.319148,-0.097305,0.104009,...,1,0,0,0,0,0,0,0,0,0
V1285,26.575237,-1.833959,-0.741584,-0.965936,-0.011911,-0.085044,-0.182336,-0.200591,-0.058667,0.057678,...,0,0,0,0,0,0,0,0,0,1
V1398,27.08324,-1.92093,-0.802706,-1.06377,-0.01345,-0.103544,-0.227229,-0.261818,-0.078323,0.080599,...,1,0,0,0,0,0,0,0,0,0
V1450,26.928568,-1.895445,-0.785619,-1.037101,-0.013046,-0.09922,-0.217122,-0.24905,-0.074379,0.076352,...,1,0,0,0,0,0,0,0,0,0
V1451,27.506947,-1.997524,-0.859902,-1.158102,-0.014996,-0.124282,-0.279081,-0.336497,-0.102891,0.110579,...,1,0,0,0,0,0,0,0,0,0
V1524,27.494017,-1.987135,-0.8458,-1.129897,-0.014426,-0.113066,-0.248763,-0.287105,-0.085831,0.087983,...,1,0,0,0,0,0,0,0,0,0
V234,27.299875,-1.963174,-0.836595,-1.121436,-0.014433,-0.117889,-0.263785,-0.316107,-0.096403,0.103116,...,0,0,0,0,0,0,0,0,0,0
V257,24.662523,-1.50958,-0.51614,-0.607159,-0.006312,-0.019305,-0.023877,0.012805,0.009446,-0.020876,...,1,0,0,0,0,0,0,0,0,0


Modello K-means
[[0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [2]
 [2]
 [0]
 [0]]
[2 2 2 2 2 2 2 2 2 2 2 2 2 2]
Dataframe: agg


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Animal species of origin_Hedgehog,Animal species of origin_Horse,Animal species of origin_Ovine,Animal species of origin_Reference strain (CCUG),Animal species of origin_Swine,Animal species of origin_Water buffalo,Animal species of origin_Wolf,Animal species of origin_Wild boar,Haemolysis_a,Haemolysis_b
V1046,27.655895,-2.022817,-0.877445,-1.185949,-0.015429,-0.129214,-0.290826,-0.351878,-0.107726,0.115971,...,0,0,0,0,1,0,0,0,0,1
V1203,29.245761,-2.291795,-1.063802,-1.48201,-0.020036,-0.182866,-0.419796,-0.52459,-0.162699,0.179009,...,0,0,0,0,1,0,0,0,0,1
V1226,27.351716,-1.971502,-0.841989,-1.129686,-0.014554,-0.11905,-0.266395,-0.319148,-0.097305,0.104009,...,0,0,0,0,1,0,0,0,0,1
V1285,26.575237,-1.833959,-0.741584,-0.965936,-0.011911,-0.085044,-0.182336,-0.200591,-0.058667,0.057678,...,0,0,0,0,0,0,0,1,0,1
V1398,27.08324,-1.92093,-0.802706,-1.06377,-0.01345,-0.103544,-0.227229,-0.261818,-0.078323,0.080599,...,0,0,0,0,1,0,0,0,0,1
V1450,26.928568,-1.895445,-0.785619,-1.037101,-0.013046,-0.09922,-0.217122,-0.24905,-0.074379,0.076352,...,0,0,0,0,1,0,0,0,0,1
V1451,27.506947,-1.997524,-0.859902,-1.158102,-0.014996,-0.124282,-0.279081,-0.336497,-0.102891,0.110579,...,0,0,0,0,1,0,0,0,0,1
V1524,27.494017,-1.987135,-0.8458,-1.129897,-0.014426,-0.113066,-0.248763,-0.287105,-0.085831,0.087983,...,0,0,0,0,1,0,0,0,0,1
V234,27.299875,-1.963174,-0.836595,-1.121436,-0.014433,-0.117889,-0.263785,-0.316107,-0.096403,0.103116,...,0,0,0,0,0,0,0,0,0,1
V257,24.662523,-1.50958,-0.51614,-0.607159,-0.006312,-0.019305,-0.023877,0.012805,0.009446,-0.020876,...,0,0,0,0,1,0,0,0,0,1


Modello K-means
[[0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [2]
 [2]
 [0]
 [0]]
[2 2 2 2 2 2 2 2 2 2 2 2 2 2]


Unnamed: 0,Target,Dataframe,Model,Accuracy,St. Dev.,Precision,Recall,F1-Score,Bal. Accuracy,Silhouette,Calinski,Davies
0,subspecies,maldi,K-means,0.142857,0.0,0.020408,0.142857,0.035714,0.5,0.438181,15.771463,0.650648
1,subspecies,animals,K-means,0.142857,0.0,0.020408,0.142857,0.035714,0.5,0.438789,15.843221,0.650905
2,subspecies,agg,K-means,0.142857,0.0,0.020408,0.142857,0.035714,0.5,0.438789,15.843221,0.650905


In [13]:
# For every feature set, turn the cluster predictions into one-hot columns
# laid out exactly as list_subs, so they can be appended as model features.
clusters = {}
clusters_str = {}
clusters_dummies = {}
for dfs in dfs_cluster:
  # Numeric predictions produced by the cluster evaluation cell
  clusters[dfs] = pred_ensemble_cluster[dfs]
  display(clusters[dfs])
  # Translate numeric codes to readable names column by column; equivalent to
  # the deprecated DataFrame.applymap(map_target_inv.get)
  clusters_str[dfs] = clusters[dfs].apply(lambda labels: labels.map(map_target_inv.get))
  display(clusters_str[dfs])
  # One-hot encode; only labels actually present in the data get a column
  observed_dummies = pd.get_dummies(clusters_str[dfs])
  display(observed_dummies)
  missing_cols_cluster = set(list_subs) - set(observed_dummies.columns)
  print(len(missing_cols_cluster))

  # Add the columns that are missing (filled with 0) and enforce the column
  # order of list_subs in a single step
  clusters_dummies[dfs] = observed_dummies.reindex(columns=list_subs, fill_value=0)

  display(clusters_dummies[dfs])

Unnamed: 0,K-means
V1046,2
V1203,2
V1226,2
V1285,2
V1398,2
V1450,2
V1451,2
V1524,2
V234,2
V257,2


Unnamed: 0,K-means
V1046,Canis
V1203,Canis
V1226,Canis
V1285,Canis
V1398,Canis
V1450,Canis
V1451,Canis
V1524,Canis
V234,Canis
V257,Canis


Unnamed: 0,K-means_Canis
V1046,1
V1203,1
V1226,1
V1285,1
V1398,1
V1450,1
V1451,1
V1524,1
V234,1
V257,1


2


Unnamed: 0,K-means_Canis,K-means_Dysgalactiae,K-means_Equisimilis
V1046,1,0,0
V1203,1,0,0
V1226,1,0,0
V1285,1,0,0
V1398,1,0,0
V1450,1,0,0
V1451,1,0,0
V1524,1,0,0
V234,1,0,0
V257,1,0,0


Unnamed: 0,K-means
V1046,2
V1203,2
V1226,2
V1285,2
V1398,2
V1450,2
V1451,2
V1524,2
V234,2
V257,2


Unnamed: 0,K-means
V1046,Canis
V1203,Canis
V1226,Canis
V1285,Canis
V1398,Canis
V1450,Canis
V1451,Canis
V1524,Canis
V234,Canis
V257,Canis


Unnamed: 0,K-means_Canis
V1046,1
V1203,1
V1226,1
V1285,1
V1398,1
V1450,1
V1451,1
V1524,1
V234,1
V257,1


2


Unnamed: 0,K-means_Canis,K-means_Dysgalactiae,K-means_Equisimilis
V1046,1,0,0
V1203,1,0,0
V1226,1,0,0
V1285,1,0,0
V1398,1,0,0
V1450,1,0,0
V1451,1,0,0
V1524,1,0,0
V234,1,0,0
V257,1,0,0


Unnamed: 0,K-means
V1046,2
V1203,2
V1226,2
V1285,2
V1398,2
V1450,2
V1451,2
V1524,2
V234,2
V257,2


Unnamed: 0,K-means
V1046,Canis
V1203,Canis
V1226,Canis
V1285,Canis
V1398,Canis
V1450,Canis
V1451,Canis
V1524,Canis
V234,Canis
V257,Canis


Unnamed: 0,K-means_Canis
V1046,1
V1203,1
V1226,1
V1285,1
V1398,1
V1450,1
V1451,1
V1524,1
V234,1
V257,1


2


Unnamed: 0,K-means_Canis,K-means_Dysgalactiae,K-means_Equisimilis
V1046,1,0,0
V1203,1,0,0
V1226,1,0,0
V1285,1,0,0
V1398,1,0,0
V1450,1,0,0
V1451,1,0,0
V1524,1,0,0
V234,1,0,0
V257,1,0,0


In [15]:
# Append the one-hot cluster columns to the MALDI components and to the
# aggregated feature frame (column-wise, aligned on the row index).
df_clusters = pd.concat((maldi_pca, clusters_dummies['maldi']), axis='columns')
df_cluster_agg = pd.concat((df_agg, clusters_dummies['agg']), axis='columns')

display(df_clusters)
display(df_cluster_agg)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,93,94,95,96,97,98,99,K-means_Canis,K-means_Dysgalactiae,K-means_Equisimilis
V1046,27.655895,-2.022817,-0.877445,-1.185949,-0.015429,-0.129214,-0.290826,-0.351878,-0.107726,0.115971,...,0.115528,0.062653,-0.09163,0.0045,0.12679,0.044606,0.067876,1,0,0
V1203,29.245761,-2.291795,-1.063802,-1.48201,-0.020036,-0.182866,-0.419796,-0.52459,-0.162699,0.179009,...,-0.010256,0.003915,-0.023046,0.00209,0.067286,0.025854,0.04408,1,0,0
V1226,27.351716,-1.971502,-0.841989,-1.129686,-0.014554,-0.11905,-0.266395,-0.319148,-0.097305,0.104009,...,-0.336536,-0.185978,0.272793,-0.013005,-0.359119,-0.123783,-0.180414,1,0,0
V1285,26.575237,-1.833959,-0.741584,-0.965936,-0.011911,-0.085044,-0.182336,-0.200591,-0.058667,0.057678,...,0.076817,0.04231,-0.06108,0.0028,0.075807,0.025639,0.03597,1,0,0
V1398,27.08324,-1.92093,-0.802706,-1.06377,-0.01345,-0.103544,-0.227229,-0.261818,-0.078323,0.080599,...,0.119362,0.065077,-0.0934,0.004292,0.116589,0.03958,0.056003,1,0,0
V1450,26.928568,-1.895445,-0.785619,-1.037101,-0.013046,-0.09922,-0.217122,-0.24905,-0.074379,0.076352,...,-0.100744,-0.045211,0.049098,-0.001454,-0.032209,-0.008966,-0.008127,1,0,0
V1451,27.506947,-1.997524,-0.859902,-1.158102,-0.014996,-0.124282,-0.279081,-0.336497,-0.102891,0.110579,...,0.049116,0.026905,-0.039829,0.00199,0.056487,0.020013,0.03089,1,0,0
V1524,27.494017,-1.987135,-0.8458,-1.129897,-0.014426,-0.113066,-0.248763,-0.287105,-0.085831,0.087983,...,0.041585,0.036703,-0.07651,0.004808,0.142984,0.051995,0.081903,1,0,0
V234,27.299875,-1.963174,-0.836595,-1.121436,-0.014433,-0.117889,-0.263785,-0.316107,-0.096403,0.103116,...,-0.018075,-0.017198,0.037166,-0.0024,-0.072073,-0.02644,-0.042382,1,0,0
V257,24.662523,-1.50958,-0.51614,-0.607159,-0.006312,-0.019305,-0.023877,0.012805,0.009446,-0.020876,...,-0.021501,-0.021401,0.048719,-0.003329,-0.10174,-0.037821,-0.061806,1,0,0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Animal species of origin_Reference strain (CCUG),Animal species of origin_Swine,Animal species of origin_Water buffalo,Animal species of origin_Wolf,Animal species of origin_Wild boar,Haemolysis_a,Haemolysis_b,K-means_Canis,K-means_Dysgalactiae,K-means_Equisimilis
V1046,27.655895,-2.022817,-0.877445,-1.185949,-0.015429,-0.129214,-0.290826,-0.351878,-0.107726,0.115971,...,0,1,0,0,0,0,1,1,0,0
V1203,29.245761,-2.291795,-1.063802,-1.48201,-0.020036,-0.182866,-0.419796,-0.52459,-0.162699,0.179009,...,0,1,0,0,0,0,1,1,0,0
V1226,27.351716,-1.971502,-0.841989,-1.129686,-0.014554,-0.11905,-0.266395,-0.319148,-0.097305,0.104009,...,0,1,0,0,0,0,1,1,0,0
V1285,26.575237,-1.833959,-0.741584,-0.965936,-0.011911,-0.085044,-0.182336,-0.200591,-0.058667,0.057678,...,0,0,0,0,1,0,1,1,0,0
V1398,27.08324,-1.92093,-0.802706,-1.06377,-0.01345,-0.103544,-0.227229,-0.261818,-0.078323,0.080599,...,0,1,0,0,0,0,1,1,0,0
V1450,26.928568,-1.895445,-0.785619,-1.037101,-0.013046,-0.09922,-0.217122,-0.24905,-0.074379,0.076352,...,0,1,0,0,0,0,1,1,0,0
V1451,27.506947,-1.997524,-0.859902,-1.158102,-0.014996,-0.124282,-0.279081,-0.336497,-0.102891,0.110579,...,0,1,0,0,0,0,1,1,0,0
V1524,27.494017,-1.987135,-0.8458,-1.129897,-0.014426,-0.113066,-0.248763,-0.287105,-0.085831,0.087983,...,0,1,0,0,0,0,1,1,0,0
V234,27.299875,-1.963174,-0.836595,-1.121436,-0.014433,-0.117889,-0.263785,-0.316107,-0.096403,0.103116,...,0,0,0,0,0,0,1,1,0,0
V257,24.662523,-1.50958,-0.51614,-0.607159,-0.006312,-0.019305,-0.023877,0.012805,0.009446,-0.020876,...,0,1,0,0,0,0,1,1,0,0


In [20]:
# Feature sets on which the pickled stacking classifiers are evaluated
dfs_test = {'maldi' : maldi_pca,
       'clusters' : df_clusters,
       #'animals' : df_animals,
       'agg' : df_agg,
       'clusters+agg' : df_cluster_agg}

# Target groups; only the column names of each frame are used below, the
# true test labels are read from targets_test
targets = {#'antibiotici' : antibiotici,
            #'geni' : geni_antibiotici,
            'virulenza' : virulenza,
            'subspecies' : subspecies
}
# Column layout of the metrics table (one row per target / feature set)
metrics_test_columns = ['Target', 'Dataframe', 'Model', 'Accuracy','Bal. Accuracy',
                        'St. Dev.', 'Precision', 'Recall','F1-Score']
# Rows are accumulated in a list and converted to a DataFrame once after the
# loop: DataFrame.append was removed in pandas 2.0 and copies the whole frame
# on every call.
metrics_test_rows = []
prediction = {}

for str_df,X in dfs_test.items():
  print('Dataframe: '+str_df)
  display(X)
  prediction[str_df] = pd.DataFrame(index = df_cluster_agg.index)
  for str_target, target in targets.items():
    columns = target.columns
    # Every target group may hold several target columns; evaluate each one
    for column in columns:
      y = targets_test[str_target][column]
      display(y)
      print('Model: '+column+'_'+str_df+'_Stack')
      # Context manager so the file handle is closed right after loading.
      # NOTE(review): pickle.load can execute arbitrary code; only load model
      # files produced by this project.
      with open('models_obj/prova_'+column+'_'+str_df+'_Stack.pkl', "rb") as model_file:
        model = pickle.load(model_file)
      print(model)
      y_pred = model.predict(X)
      display(y_pred)
      score = makeScore(y, y_pred)
      prediction[str_df][column+'_pred'] = y_pred
      if (str_target == 'antibiotici'):
        # Antibiotic predictions are stored with their S / NS labels
        prediction[str_df][column+'_pred'] = prediction[str_df][column+'_pred'].map(map_target_antibiotici_inv)
      # Keep the reference value from df_test next to the prediction
      prediction[str_df][column] = df_test[column]
      ris = {'Target': column,
            'Dataframe' : str_df,
            'Model': 'Stack',
            'Accuracy' : score['acc'],
            'Bal. Accuracy' : score['b_acc'],
            'St. Dev.' : score['st'],
            'Precision' : score['prec'],
            'Recall' : score['rec'],
            'F1-Score' : score['f1']}

      metrics_test_rows.append(ris)
  # Replace the numeric subspecies codes with readable names; guarded so the
  # cell still runs when the 'subspecies' target is disabled in `targets`
  for label_column in ('subspecies', 'subspecies_pred'):
    if label_column in prediction[str_df].columns:
      prediction[str_df][label_column] = prediction[str_df][label_column].map(map_target_inv)
  prediction[str_df].to_csv('../Risultati/prediction/predizioni_'+str_df+'_picchi'+str(n)+'.csv', index = True)
  display(prediction[str_df])

# Metrics of every (target, feature set) pair, in evaluation order
metrics_test = pd.DataFrame(metrics_test_rows, columns=metrics_test_columns)
metrics_test.to_csv('../Risultati/results/stack_test_picchi'+str(n)+'.csv', index = True)
display(metrics_test)


Dataframe: maldi


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
V1046,27.655895,-2.022817,-0.877445,-1.185949,-0.015429,-0.129214,-0.290826,-0.351878,-0.107726,0.115971,...,-0.044846,0.37631,0.112413,0.115528,0.062653,-0.09163,0.0045,0.12679,0.044606,0.067876
V1203,29.245761,-2.291795,-1.063802,-1.48201,-0.020036,-0.182866,-0.419796,-0.52459,-0.162699,0.179009,...,0.018554,-0.128528,-0.029731,-0.010256,0.003915,-0.023046,0.00209,0.067286,0.025854,0.04408
V1226,27.351716,-1.971502,-0.841989,-1.129686,-0.014554,-0.11905,-0.266395,-0.319148,-0.097305,0.104009,...,0.118015,-1.02254,-0.313906,-0.336536,-0.185978,0.272793,-0.013005,-0.359119,-0.123783,-0.180414
V1285,26.575237,-1.833959,-0.741584,-0.965936,-0.011911,-0.085044,-0.182336,-0.200591,-0.058667,0.057678,...,-0.02622,0.229809,0.071126,0.076817,0.04231,-0.06108,0.0028,0.075807,0.025639,0.03597
V1398,27.08324,-1.92093,-0.802706,-1.06377,-0.01345,-0.103544,-0.227229,-0.261818,-0.078323,0.080599,...,-0.042799,0.369206,0.112779,0.119362,0.065077,-0.0934,0.004292,0.116589,0.03958,0.056003
V1450,26.928568,-1.895445,-0.785619,-1.037101,-0.013046,-0.09922,-0.217122,-0.24905,-0.074379,0.076352,...,0.054415,-0.42677,-0.11827,-0.100744,-0.045211,0.049098,-0.001454,-0.032209,-0.008966,-0.008127
V1451,27.506947,-1.997524,-0.859902,-1.158102,-0.014996,-0.124282,-0.279081,-0.336497,-0.102891,0.110579,...,-0.018514,0.156622,0.047135,0.049116,0.026905,-0.039829,0.00199,0.056487,0.020013,0.03089
V1524,27.494017,-1.987135,-0.8458,-1.129897,-0.014426,-0.113066,-0.248763,-0.287105,-0.085831,0.087983,...,0.010011,-0.029772,0.007231,0.041585,0.036703,-0.07651,0.004808,0.142984,0.051995,0.081903
V234,27.299875,-1.963174,-0.836595,-1.121436,-0.014433,-0.117889,-0.263785,-0.316107,-0.096403,0.103116,...,-0.006808,0.028147,-0.000137,-0.018075,-0.017198,0.037166,-0.0024,-0.072073,-0.02644,-0.042382
V257,24.662523,-1.50958,-0.51614,-0.607159,-0.006312,-0.019305,-0.023877,0.012805,0.009446,-0.020876,...,-0.007211,0.030766,-0.000134,-0.021501,-0.021401,0.048719,-0.003329,-0.10174,-0.037821,-0.061806


ID Strain
V1046    1
V1203    0
V1226    0
V1285    0
V1398    0
V1450    0
V1451    0
V1524    0
V234     0
V257     0
V767     0
V912     0
V946     0
V971     0
Name: mf3, dtype: int64

Model: mf3_maldi_Stack
StackingClassifier(estimators=[('virulenza_mf3_maldi_LogisticRegression',
                                LogisticRegression(random_state=46)),
                               ('virulenza_mf3_maldi_K-nn_Best',
                                KNeighborsClassifier(n_neighbors=8,
                                                     weights='distance')),
                               ('virulenza_mf3_maldi_LogisticRegression_Best',
                                LogisticRegression(C=0.00046415888336127773,
                                                   class_weight='balanced',
                                                   intercept_scaling=2,
                                                   penalty='none',
                                                   random_state=46,
                                                   solver='sag')),
                               ('virulenza_mf3_maldi_BernoulliNB_Best',
                                BernoulliNB(alp

array([0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0], dtype=int64)

ID Strain
V1046    0
V1203    0
V1226    0
V1285    0
V1398    0
V1450    0
V1451    0
V1524    0
V234     0
V257     0
V767     2
V912     2
V946     0
V971     0
Name: subspecies, dtype: int64

Model: subspecies_maldi_Stack
StackingClassifier(estimators=[('subspecies_subspecies_maldi_LogisticRegression',
                                LogisticRegression(random_state=46)),
                               ('subspecies_subspecies_maldi_LogisticRegression_Best',
                                LogisticRegression(C=0.00046415888336127773,
                                                   class_weight='balanced',
                                                   intercept_scaling=2,
                                                   penalty='none',
                                                   random_state=46,
                                                   solver='sag')),
                               ('subspecies_subspecies_maldi_Ridge_Best',
                                RidgeClassifier(alpha=16.451905877536642,
                                                random_state=46)),
                               ('subspecies_subspecies_maldi_GaussianNB_Best',
          

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

Unnamed: 0,mf3_pred,mf3,subspecies_pred,subspecies
V1046,0,1,Canis,Equisimilis
V1203,0,0,Canis,Equisimilis
V1226,0,0,Canis,Equisimilis
V1285,0,0,Canis,Equisimilis
V1398,0,0,Canis,Equisimilis
V1450,1,0,Canis,Equisimilis
V1451,0,0,Canis,Equisimilis
V1524,1,0,Canis,Equisimilis
V234,0,0,Canis,Equisimilis
V257,0,0,Canis,Equisimilis


Dataframe: clusters


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,93,94,95,96,97,98,99,K-means_Canis,K-means_Dysgalactiae,K-means_Equisimilis
V1046,27.655895,-2.022817,-0.877445,-1.185949,-0.015429,-0.129214,-0.290826,-0.351878,-0.107726,0.115971,...,0.115528,0.062653,-0.09163,0.0045,0.12679,0.044606,0.067876,1,0,0
V1203,29.245761,-2.291795,-1.063802,-1.48201,-0.020036,-0.182866,-0.419796,-0.52459,-0.162699,0.179009,...,-0.010256,0.003915,-0.023046,0.00209,0.067286,0.025854,0.04408,1,0,0
V1226,27.351716,-1.971502,-0.841989,-1.129686,-0.014554,-0.11905,-0.266395,-0.319148,-0.097305,0.104009,...,-0.336536,-0.185978,0.272793,-0.013005,-0.359119,-0.123783,-0.180414,1,0,0
V1285,26.575237,-1.833959,-0.741584,-0.965936,-0.011911,-0.085044,-0.182336,-0.200591,-0.058667,0.057678,...,0.076817,0.04231,-0.06108,0.0028,0.075807,0.025639,0.03597,1,0,0
V1398,27.08324,-1.92093,-0.802706,-1.06377,-0.01345,-0.103544,-0.227229,-0.261818,-0.078323,0.080599,...,0.119362,0.065077,-0.0934,0.004292,0.116589,0.03958,0.056003,1,0,0
V1450,26.928568,-1.895445,-0.785619,-1.037101,-0.013046,-0.09922,-0.217122,-0.24905,-0.074379,0.076352,...,-0.100744,-0.045211,0.049098,-0.001454,-0.032209,-0.008966,-0.008127,1,0,0
V1451,27.506947,-1.997524,-0.859902,-1.158102,-0.014996,-0.124282,-0.279081,-0.336497,-0.102891,0.110579,...,0.049116,0.026905,-0.039829,0.00199,0.056487,0.020013,0.03089,1,0,0
V1524,27.494017,-1.987135,-0.8458,-1.129897,-0.014426,-0.113066,-0.248763,-0.287105,-0.085831,0.087983,...,0.041585,0.036703,-0.07651,0.004808,0.142984,0.051995,0.081903,1,0,0
V234,27.299875,-1.963174,-0.836595,-1.121436,-0.014433,-0.117889,-0.263785,-0.316107,-0.096403,0.103116,...,-0.018075,-0.017198,0.037166,-0.0024,-0.072073,-0.02644,-0.042382,1,0,0
V257,24.662523,-1.50958,-0.51614,-0.607159,-0.006312,-0.019305,-0.023877,0.012805,0.009446,-0.020876,...,-0.021501,-0.021401,0.048719,-0.003329,-0.10174,-0.037821,-0.061806,1,0,0


ID Strain
V1046    1
V1203    0
V1226    0
V1285    0
V1398    0
V1450    0
V1451    0
V1524    0
V234     0
V257     0
V767     0
V912     0
V946     0
V971     0
Name: mf3, dtype: int64

Model: mf3_clusters_Stack
StackingClassifier(estimators=[('virulenza_mf3_clusters_BernoulliNB',
                                BernoulliNB()),
                               ('virulenza_mf3_clusters_K-nn_Best',
                                KNeighborsClassifier(n_neighbors=8,
                                                     weights='distance')),
                               ('virulenza_mf3_clusters_SGDClassifier_Best',
                                SGDClassifier(alpha=1000.0,
                                              class_weight={0: 0.4, 1: 0.6},
                                              eta0=10, learning_rate='constant',
                                              loss='squared_hinge',
                                              penalty='l1')),
                               ('virulenza_mf3_clusters_LogisticRegression_Best',
                                LogisticRegression(C=0.21544346900318823,
                                                   intercept_sca

array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0], dtype=int64)

ID Strain
V1046    0
V1203    0
V1226    0
V1285    0
V1398    0
V1450    0
V1451    0
V1524    0
V234     0
V257     0
V767     2
V912     2
V946     0
V971     0
Name: subspecies, dtype: int64

Model: subspecies_clusters_Stack
StackingClassifier(estimators=[('subspecies_subspecies_clusters_LogisticRegression',
                                LogisticRegression(random_state=46)),
                               ('subspecies_subspecies_clusters_LogisticRegression_Best',
                                LogisticRegression(C=0.00046415888336127773,
                                                   class_weight='balanced',
                                                   intercept_scaling=2,
                                                   penalty='none',
                                                   random_state=46,
                                                   solver='sag')),
                               ('subspecies_subspecies_clusters_Ridge_Best',
                                RidgeClassifier(alpha=16.451905877536642,
                                                random_state=46)),
                               ('subspecies_subspecies_clusters_DecisionTree'

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

Unnamed: 0,mf3_pred,mf3,subspecies_pred,subspecies
V1046,0,1,Canis,Equisimilis
V1203,1,0,Canis,Equisimilis
V1226,1,0,Canis,Equisimilis
V1285,1,0,Canis,Equisimilis
V1398,1,0,Canis,Equisimilis
V1450,1,0,Canis,Equisimilis
V1451,1,0,Canis,Equisimilis
V1524,1,0,Canis,Equisimilis
V234,1,0,Canis,Equisimilis
V257,1,0,Canis,Equisimilis


Dataframe: agg


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Animal species of origin_Hedgehog,Animal species of origin_Horse,Animal species of origin_Ovine,Animal species of origin_Reference strain (CCUG),Animal species of origin_Swine,Animal species of origin_Water buffalo,Animal species of origin_Wolf,Animal species of origin_Wild boar,Haemolysis_a,Haemolysis_b
V1046,27.655895,-2.022817,-0.877445,-1.185949,-0.015429,-0.129214,-0.290826,-0.351878,-0.107726,0.115971,...,0,0,0,0,1,0,0,0,0,1
V1203,29.245761,-2.291795,-1.063802,-1.48201,-0.020036,-0.182866,-0.419796,-0.52459,-0.162699,0.179009,...,0,0,0,0,1,0,0,0,0,1
V1226,27.351716,-1.971502,-0.841989,-1.129686,-0.014554,-0.11905,-0.266395,-0.319148,-0.097305,0.104009,...,0,0,0,0,1,0,0,0,0,1
V1285,26.575237,-1.833959,-0.741584,-0.965936,-0.011911,-0.085044,-0.182336,-0.200591,-0.058667,0.057678,...,0,0,0,0,0,0,0,1,0,1
V1398,27.08324,-1.92093,-0.802706,-1.06377,-0.01345,-0.103544,-0.227229,-0.261818,-0.078323,0.080599,...,0,0,0,0,1,0,0,0,0,1
V1450,26.928568,-1.895445,-0.785619,-1.037101,-0.013046,-0.09922,-0.217122,-0.24905,-0.074379,0.076352,...,0,0,0,0,1,0,0,0,0,1
V1451,27.506947,-1.997524,-0.859902,-1.158102,-0.014996,-0.124282,-0.279081,-0.336497,-0.102891,0.110579,...,0,0,0,0,1,0,0,0,0,1
V1524,27.494017,-1.987135,-0.8458,-1.129897,-0.014426,-0.113066,-0.248763,-0.287105,-0.085831,0.087983,...,0,0,0,0,1,0,0,0,0,1
V234,27.299875,-1.963174,-0.836595,-1.121436,-0.014433,-0.117889,-0.263785,-0.316107,-0.096403,0.103116,...,0,0,0,0,0,0,0,0,0,1
V257,24.662523,-1.50958,-0.51614,-0.607159,-0.006312,-0.019305,-0.023877,0.012805,0.009446,-0.020876,...,0,0,0,0,1,0,0,0,0,1


ID Strain
V1046    1
V1203    0
V1226    0
V1285    0
V1398    0
V1450    0
V1451    0
V1524    0
V234     0
V257     0
V767     0
V912     0
V946     0
V971     0
Name: mf3, dtype: int64

Model: mf3_agg_Stack
StackingClassifier(estimators=[('virulenza_mf3_agg_LogisticRegression',
                                LogisticRegression(random_state=46)),
                               ('virulenza_mf3_agg_K-nn_Best',
                                KNeighborsClassifier(n_neighbors=8,
                                                     weights='distance')),
                               ('virulenza_mf3_agg_LinearSVC', LinearSVC()),
                               ('virulenza_mf3_agg_LogisticRegression_Best',
                                LogisticRegression(C=2.154434690031882,
                                                   penalty='l1',
                                                   random_state=46,
                                                   solver='saga'))],
                   final_estimator=LogisticRegression())


array([0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0], dtype=int64)

ID Strain
V1046    0
V1203    0
V1226    0
V1285    0
V1398    0
V1450    0
V1451    0
V1524    0
V234     0
V257     0
V767     2
V912     2
V946     0
V971     0
Name: subspecies, dtype: int64

Model: subspecies_agg_Stack
StackingClassifier(estimators=[('subspecies_subspecies_agg_LogisticRegression',
                                LogisticRegression(random_state=46)),
                               ('subspecies_subspecies_agg_LogisticRegression_Best',
                                LogisticRegression(C=0.00046415888336127773,
                                                   class_weight='balanced',
                                                   intercept_scaling=2,
                                                   penalty='none',
                                                   random_state=46,
                                                   solver='sag')),
                               ('subspecies_subspecies_agg_RandomForest_Best',
                                RandomForestClassifier(ccp_alpha=0.01,
                                                       class_weight='balanced',
                                                       max_depth=4,
            

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

Unnamed: 0,mf3_pred,mf3,subspecies_pred,subspecies
V1046,0,1,Canis,Equisimilis
V1203,0,0,Canis,Equisimilis
V1226,0,0,Canis,Equisimilis
V1285,0,0,Canis,Equisimilis
V1398,0,0,Canis,Equisimilis
V1450,1,0,Canis,Equisimilis
V1451,1,0,Canis,Equisimilis
V1524,1,0,Canis,Equisimilis
V234,0,0,Canis,Equisimilis
V257,1,0,Canis,Equisimilis


Dataframe: clusters+agg


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Animal species of origin_Reference strain (CCUG),Animal species of origin_Swine,Animal species of origin_Water buffalo,Animal species of origin_Wolf,Animal species of origin_Wild boar,Haemolysis_a,Haemolysis_b,K-means_Canis,K-means_Dysgalactiae,K-means_Equisimilis
V1046,27.655895,-2.022817,-0.877445,-1.185949,-0.015429,-0.129214,-0.290826,-0.351878,-0.107726,0.115971,...,0,1,0,0,0,0,1,1,0,0
V1203,29.245761,-2.291795,-1.063802,-1.48201,-0.020036,-0.182866,-0.419796,-0.52459,-0.162699,0.179009,...,0,1,0,0,0,0,1,1,0,0
V1226,27.351716,-1.971502,-0.841989,-1.129686,-0.014554,-0.11905,-0.266395,-0.319148,-0.097305,0.104009,...,0,1,0,0,0,0,1,1,0,0
V1285,26.575237,-1.833959,-0.741584,-0.965936,-0.011911,-0.085044,-0.182336,-0.200591,-0.058667,0.057678,...,0,0,0,0,1,0,1,1,0,0
V1398,27.08324,-1.92093,-0.802706,-1.06377,-0.01345,-0.103544,-0.227229,-0.261818,-0.078323,0.080599,...,0,1,0,0,0,0,1,1,0,0
V1450,26.928568,-1.895445,-0.785619,-1.037101,-0.013046,-0.09922,-0.217122,-0.24905,-0.074379,0.076352,...,0,1,0,0,0,0,1,1,0,0
V1451,27.506947,-1.997524,-0.859902,-1.158102,-0.014996,-0.124282,-0.279081,-0.336497,-0.102891,0.110579,...,0,1,0,0,0,0,1,1,0,0
V1524,27.494017,-1.987135,-0.8458,-1.129897,-0.014426,-0.113066,-0.248763,-0.287105,-0.085831,0.087983,...,0,1,0,0,0,0,1,1,0,0
V234,27.299875,-1.963174,-0.836595,-1.121436,-0.014433,-0.117889,-0.263785,-0.316107,-0.096403,0.103116,...,0,0,0,0,0,0,1,1,0,0
V257,24.662523,-1.50958,-0.51614,-0.607159,-0.006312,-0.019305,-0.023877,0.012805,0.009446,-0.020876,...,0,1,0,0,0,0,1,1,0,0


ID Strain
V1046    1
V1203    0
V1226    0
V1285    0
V1398    0
V1450    0
V1451    0
V1524    0
V234     0
V257     0
V767     0
V912     0
V946     0
V971     0
Name: mf3, dtype: int64

Model: mf3_clusters+agg_Stack
StackingClassifier(estimators=[('virulenza_mf3_clusters+agg_LogisticRegression',
                                LogisticRegression(random_state=46)),
                               ('virulenza_mf3_clusters+agg_K-nn_Best',
                                KNeighborsClassifier(n_neighbors=8,
                                                     weights='distance')),
                               ('virulenza_mf3_clusters+agg_LinearSVC',
                                LinearSVC()),
                               ('virulenza_mf3_clusters+agg_LogisticRegression_Best',
                                LogisticRegression(C=2.154434690031882,
                                                   penalty='l1',
                                                   random_state=46,
                                                   solver='saga'))],
                   final_estimator=LogisticRegression())


array([0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0], dtype=int64)

ID Strain
V1046    0
V1203    0
V1226    0
V1285    0
V1398    0
V1450    0
V1451    0
V1524    0
V234     0
V257     0
V767     2
V912     2
V946     0
V971     0
Name: subspecies, dtype: int64

Model: subspecies_clusters+agg_Stack
StackingClassifier(estimators=[('subspecies_subspecies_clusters+agg_LogisticRegression',
                                LogisticRegression(random_state=46)),
                               ('subspecies_subspecies_clusters+agg_LogisticRegression_Best',
                                LogisticRegression(C=0.00046415888336127773,
                                                   class_weight='balanced',
                                                   intercept_scaling=2,
                                                   penalty='none',
                                                   random_state=46,
                                                   solver='sag')),
                               ('subspecies_subspecies_clusters+agg_Ridge_Best',
                                RidgeClassifier(alpha=16.451905877536642,
                                                random_state=46)),
                               ('subspecies_subspecies_cluste

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

Unnamed: 0,mf3_pred,mf3,subspecies_pred,subspecies
V1046,0,1,Canis,Equisimilis
V1203,0,0,Canis,Equisimilis
V1226,0,0,Canis,Equisimilis
V1285,0,0,Canis,Equisimilis
V1398,0,0,Canis,Equisimilis
V1450,1,0,Canis,Equisimilis
V1451,1,0,Canis,Equisimilis
V1524,1,0,Canis,Equisimilis
V234,0,0,Canis,Equisimilis
V257,1,0,Canis,Equisimilis


Unnamed: 0,Target,Dataframe,Model,Accuracy,Bal. Accuracy,St. Dev.,Precision,Recall,F1-Score
0,mf3,maldi,Stack,0.714286,0.384615,0.0,0.844156,0.714286,0.77381
1,subspecies,maldi,Stack,0.142857,0.5,0.0,0.020408,0.142857,0.035714
2,mf3,clusters,Stack,0.214286,0.115385,0.0,0.696429,0.214286,0.327731
3,subspecies,clusters,Stack,0.142857,0.5,0.0,0.020408,0.142857,0.035714
4,mf3,agg,Stack,0.571429,0.307692,0.0,0.825397,0.571429,0.675325
5,subspecies,agg,Stack,0.142857,0.5,0.0,0.020408,0.142857,0.035714
6,mf3,clusters+agg,Stack,0.571429,0.307692,0.0,0.825397,0.571429,0.675325
7,subspecies,clusters+agg,Stack,0.142857,0.5,0.0,0.020408,0.142857,0.035714


In [None]:
# Test metrics for the "subspecies" target on the 'maldi' feature set.
# NOTE(review): `filter` shadows the Python builtin of the same name.
filter = metrics_test["Target"].eq("subspecies")
filter_df = metrics_test["Dataframe"].eq("maldi")

# where() blanks every non-matching row to NaN and dropna() then removes them
# (together with any matching row that itself contains a missing value).
subs_df = metrics_test.where(filter & filter_df).dropna()
subs_df

In [None]:
# Test-set metrics for the "subspecies" target on every feature set whose
# name starts with "cluster".
# Plain boolean indexing is used instead of `.where(mask).dropna()`: it keeps
# the original dtypes and does not discard matching rows that merely contain
# a NaN in one of the metric columns. The masks also no longer shadow the
# builtin `filter`.
target_mask = metrics_test["Target"] == "subspecies"
# na=False: a missing Dataframe name counts as "no match" so the mask stays boolean
dataframe_mask = metrics_test["Dataframe"].str.startswith("cluster", na=False)

subs_df = metrics_test.loc[target_mask & dataframe_mask]
subs_df.head(20)

In [None]:
# Test-set metrics for the "subspecies" target on every feature set whose
# name starts with "animals".
# NOTE(review): the metrics table displayed earlier in this notebook only
# lists the feature sets "maldi", "clusters", "agg" and "clusters+agg", so
# this selection is probably empty - confirm the intended prefix.
# Plain boolean indexing is used instead of `.where(mask).dropna()`: it keeps
# the original dtypes and does not discard matching rows that merely contain
# a NaN in one of the metric columns. The masks also no longer shadow the
# builtin `filter`.
target_mask = metrics_test["Target"] == "subspecies"
# na=False: a missing Dataframe name counts as "no match" so the mask stays boolean
dataframe_mask = metrics_test["Dataframe"].str.startswith("animals", na=False)

subs_df = metrics_test.loc[target_mask & dataframe_mask]
subs_df

In [None]:
# Test-set metrics for the "mf3" target on the "maldi" feature set.
# Plain boolean indexing is used instead of `.where(mask).dropna()`: it keeps
# the original dtypes and does not discard matching rows that merely contain
# a NaN in one of the metric columns. The masks also no longer shadow the
# builtin `filter`.
target_mask = metrics_test["Target"] == "mf3"
dataframe_mask = metrics_test["Dataframe"] == "maldi"

subs_df = metrics_test.loc[target_mask & dataframe_mask]
subs_df

In [None]:
# Test-set metrics for the "mf3" target on every feature set whose name
# starts with "cluster".
# Plain boolean indexing is used instead of `.where(mask).dropna()`: it keeps
# the original dtypes and does not discard matching rows that merely contain
# a NaN in one of the metric columns. The masks also no longer shadow the
# builtin `filter`.
target_mask = metrics_test["Target"] == "mf3"
# na=False: a missing Dataframe name counts as "no match" so the mask stays boolean
dataframe_mask = metrics_test["Dataframe"].str.startswith("cluster", na=False)

subs_df = metrics_test.loc[target_mask & dataframe_mask]
subs_df.head(20)

In [None]:
# Test-set metrics for the "mf3" target on every feature set whose name
# starts with "animals".
# NOTE(review): the metrics table displayed earlier in this notebook only
# lists the feature sets "maldi", "clusters", "agg" and "clusters+agg", so
# this selection is probably empty - confirm the intended prefix.
# Plain boolean indexing is used instead of `.where(mask).dropna()`: it keeps
# the original dtypes and does not discard matching rows that merely contain
# a NaN in one of the metric columns. The masks also no longer shadow the
# builtin `filter`.
target_mask = metrics_test["Target"] == "mf3"
# na=False: a missing Dataframe name counts as "no match" so the mask stays boolean
dataframe_mask = metrics_test["Dataframe"].str.startswith("animals", na=False)

subs_df = metrics_test.loc[target_mask & dataframe_mask]
subs_df

In [None]:
# Persist the complete test-metrics table as results.csv (relative path),
# leaving the row index out of the file.
metrics_test.to_csv(path_or_buf='results.csv', index=False)

In [None]:
# For every target column, locate the row of `metrics_test` with the highest
# test Accuracy and record it in three parallel lists:
#   target_list - the target column name
#   name_best   - the index label of the winning row in `metrics_test`
#                 (what idxmax() returns; use metrics_test.loc[name_best] to
#                 see the corresponding Dataframe/Model)
#   score_best  - the Accuracy value of that row
name_best = []
score_best = []
target_list = []
for target in targets.values():
  for column in target.columns:
    # Boolean indexing instead of `.where(mask).dropna()`; the mask is not
    # called `filter` so the builtin stays usable.
    subs_df = metrics_test.loc[metrics_test["Target"] == column]
    accuracy = subs_df["Accuracy"].dropna()
    if accuracy.empty:
      # idxmax() raises on an empty Series; keep the three lists aligned by
      # skipping targets that have no usable Accuracy value.
      print(f"No Accuracy available for target {column!r}; skipped")
      continue
    target_list.append(column)
    name_best.append(accuracy.idxmax())
    score_best.append(accuracy.max())

print(target_list)
print(name_best)
print(score_best)

In [None]:
# Horizontal bar chart with one bar per entry of `target_list`; the bar
# length is the matching value in `score_best`.
y_pos = np.arange(len(target_list))

fig, ax = plt.subplots()
ax.barh(y_pos, score_best, align='center')
# put the target names at the bar centres
ax.set_yticks(y_pos)
ax.set_yticklabels(target_list)
# `score_best` is read from the 'Accuracy' column (values such as 0.71, i.e.
# fractions rather than percentages), so the label and title name that metric.
# NOTE(review): if balanced accuracy was the intended metric, change the
# column used when `score_best` is built rather than this title.
ax.set_xlabel("Accuracy")
ax.set_title("Risultati di Accuracy sul miglior modello ensemble sui targets")
plt.show()