In [1]:
import tkinter as tk
from tkinter import filedialog
from tkinter import messagebox
from tkinter import TclError, ttk

import openpyxl

In [2]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pickle
from scipy.spatial import distance
from sklearn.utils.multiclass import unique_labels
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_samples, silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import balanced_accuracy_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, StratifiedKFold, KFold, GridSearchCV, RandomizedSearchCV, ParameterGrid
from sklearn.linear_model import LogisticRegression, RidgeClassifier, LassoCV, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.base import clone
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn import model_selection
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from mlxtend.classifier import StackingCVClassifier
from sklearn.ensemble import StackingClassifier
import warnings

# ---------------------------------------------------------------------------
# Notebook-wide configuration.
# ---------------------------------------------------------------------------

# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")
# Print NumPy floats with 5 decimals and without scientific notation.
np.set_printoptions(precision=5, suppress=True)
# Seed used by the cross-validation splitter and the seeded estimators below.
RANDOM_STATE = 46
# Forwarded as n_jobs to cross_validate.
N_JOBS = -1
# NOTE(review): this ordering (Canis, Equisimilis, Dysgalactiae) does not match
# the label order of map_target_inv (0=Equisimilis, 1=Dysgalactiae, 2=Canis);
# confirm which one consumers of class_names expect.
class_names = ["Canis", "Dysg. Equisimilis", "Dysg. Dysgalactiae"]
# Full species name -> integer class label.
map_target = {
    "Streptococcus canis": 2,
    "Streptococcus dysgalactiae subsp. dysgalactiae": 1,
    "Streptococcus dysgalactiae subsp. equisimilis": 0
}
# Integer class label -> short display name (applied to 'subspecies' predictions).
map_target_inv = {
    2: "Canis",
    1: "Dysgalactiae",
    0: "Equisimilis"
}
# Antibiotic outcome code -> integer label.
# presumably "S" = susceptible and "NS" = non-susceptible; TODO confirm.
map_target_antibiotici = {
    "S" : 1,
    "NS" : 0
}
# Integer label -> antibiotic outcome code (applied to 'Clindamicina' predictions).
map_target_antibiotici_inv = {
    1 : "S",
    0 : "NS"
}
# Integer -> integer remapping; not referenced by the code visible in this file.
# presumably translates cluster ids into class labels; TODO confirm.
maps_cluster = {
    2 : 0,
    1 : 2,
    0 : 1
}
# Scorer names handed to cross_validate in makeCrossValidation.
metrics = ['accuracy', 'recall_weighted', 'precision_weighted','f1_weighted']
# Keys of the score dictionary returned by makeCrossValidationCluster.
metrics_cluster = ['Silhouette', 'Calinski', 'Davies']
# Shuffled, seeded 5-fold stratified splitter used as the cv argument.
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
# Positional index of the first spectrum column sliced in AIModelApp.load_data.
start = 2
# The three counts below are not referenced by the code visible in this file.
n_antibiotici = 9
n_geni = 27
n_virulenza = 18
# String fragments concatenated into the model / prediction file names.
scaled = ''
scaler = ''
tutti_picchi = 'tutti_picchi_'
reduction = ''


In [3]:
# Standardise both splits with statistics learned from the training split
def standard_scaler(X_train, X_test):
    """Scale ``X_train`` and ``X_test`` with a StandardScaler fitted on ``X_train``.

    Parameters
    ----------
    X_train : array-like
        Training features; the scaler is fitted on these.
    X_test : array-like
        Held-out features; only transformed, never used for fitting.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled)`` as returned by the scaler.
    """
    fitted_scaler = StandardScaler().fit(X_train)
    return fitted_scaler.transform(X_train), fitted_scaler.transform(X_test)

# Project the train/test splits onto principal components learned from train
def dimensionality_reduction(X_train, X_test, n_components):
    """Reduce both splits with a PCA fitted on the training split.

    The column labels of both input frames are converted to ``str`` in place
    before fitting.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Training and test features.
    n_components
        Forwarded unchanged to ``PCA(n_components=...)``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train_pca, X_test_pca)``; both carry default integer row and
        column labels (the original index is not kept).
    """
    for frame in (X_train, X_test):
        frame.columns = frame.columns.astype(str)

    reducer = PCA(n_components=n_components)
    train_projected = pd.DataFrame(reducer.fit_transform(X_train))
    test_projected = pd.DataFrame(reducer.transform(X_test))
    return train_projected, test_projected

def dimensionality_reduction_cluster(X, n_components):
    """Fit a PCA on ``X`` and return the projection as a DataFrame.

    The column labels of ``X`` are converted to ``str`` in place. The shapes of
    the input and of the projection are printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Features to project.
    n_components
        Forwarded unchanged to ``PCA(n_components=...)``.

    Returns
    -------
    pandas.DataFrame
        Projection that keeps the row labels of ``X`` and has string column
        labels.
    """
    X.columns = X.columns.astype(str)
    print(X.shape)

    projected = pd.DataFrame(
        PCA(n_components=n_components).fit_transform(X),
        index=X.index.to_list(),
    )
    print(projected.shape)

    projected.columns = projected.columns.astype(str)
    return projected

def makeScoreMeanWithoutNaN(metrics, verbose=True):
    """Replace every entry of ``metrics`` with the mean of its non-NaN values.

    Parameters
    ----------
    metrics : dict
        Maps a metric name to an array-like of scores (NumPy array, list,
        tuple or scalar). The dictionary is updated in place.
    verbose : bool, default True
        When true, print the name, the raw values, the NaN-free values and the
        resulting mean for each metric, followed by the final dictionary.

    Returns
    -------
    dict
        The same ``metrics`` object, now mapping each name to a scalar mean.
        A metric whose values are all NaN (or empty) maps to NaN.
    """
    # Snapshot the keys so the dictionary can be updated while looping.
    for name in list(metrics):
        # Accept plain Python sequences as well as arrays; a list cannot be
        # indexed with a boolean mask directly.
        values = np.atleast_1d(np.asarray(metrics[name], dtype=float))
        valid = values[~np.isnan(values)]
        # Averaging an empty array emits a RuntimeWarning; return NaN explicitly.
        mean_value = np.mean(valid) if valid.size else np.float64(np.nan)

        if verbose:
            print(name)
            print(values)
            print(valid)
            print(mean_value)

        metrics[name] = mean_value

    if verbose:
        print(metrics)
    return metrics

def makeScore(y_test, y_pred):
    """Compute classification scores for one set of predictions.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    dict
        Keys ``'acc'``, ``'b_acc'``, ``'st'``, ``'prec'``, ``'rec'`` and
        ``'f1'``. Precision, recall and F1 use ``average='weighted'``.
        ``'st'`` is the standard deviation of the single accuracy value and is
        therefore always 0; it is kept so the dictionary has the same keys as
        the one returned by ``makeCrossValidation``.
    """
    score = {}

    score['acc'] = accuracy_score(y_test, y_pred)
    score['b_acc'] = balanced_accuracy_score(y_test, y_pred)
    # np.std works for both NumPy scalars and plain Python floats; calling
    # .std() on the value raises AttributeError when accuracy_score returns a
    # built-in float.
    score['st'] = np.std(score['acc'])
    score['prec'] = precision_score(y_test, y_pred, average='weighted')
    score['rec'] = recall_score(y_test, y_pred, average='weighted')
    score['f1'] = f1_score(y_test, y_pred, average='weighted')

    return score

def makeCrossValidation(model, X_train, y_train):
    """Cross-validate ``model`` and summarise the per-fold test scores.

    Uses the module-level ``metrics`` as scorers, ``skfold`` as splitter and
    ``N_JOBS`` as the degree of parallelism.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``cross_validate``.
    X_train, y_train
        Features and labels passed to ``cross_validate``.

    Returns
    -------
    dict
        ``'acc'`` / ``'st'`` are the mean and standard deviation of the fold
        accuracies; ``'prec'``, ``'rec'`` and ``'f1'`` are the means of the
        weighted precision, recall and F1 across folds.
    """
    cv_results = cross_validate(
        estimator=model,
        X=X_train,
        y=y_train,
        scoring=metrics,
        cv=skfold,
        n_jobs=N_JOBS,
        verbose=0,
    )

    fold_accuracy = cv_results.get('test_accuracy')
    return {
        'acc': fold_accuracy.mean(),
        'st': fold_accuracy.std(),
        'prec': cv_results.get('test_precision_weighted').mean(),
        'rec': cv_results.get('test_recall_weighted').mean(),
        'f1': cv_results.get('test_f1_weighted').mean(),
    }

def makeCrossValidationCluster(model, X):
    """Fit a clustering model and score its labels on a 2-component PCA view.

    ``model`` is fitted on the full ``X``; the three cluster-quality metrics
    are then computed on the projection of ``X`` onto its first two principal
    components, not on ``X`` itself.

    Parameters
    ----------
    model : estimator
        Clustering estimator exposing ``fit`` and ``labels_``.
    X : array-like
        Samples to cluster.

    Returns
    -------
    dict
        Keys ``'Silhouette'``, ``'Calinski'`` and ``'Davies'``.
    """
    model.fit(X)
    cluster_labels = model.labels_

    projection = PCA(n_components=2).fit(X).transform(X)

    scorers = (
        ('Silhouette', silhouette_score),
        ('Calinski', calinski_harabasz_score),
        ('Davies', davies_bouldin_score),
    )
    return {label: scorer(projection, cluster_labels) for label, scorer in scorers}

# Not referenced by the code visible in this file.
# presumably the number of clusters the saved K-means model was trained with; TODO confirm.
N_CLUSTERS = 3
# Expected one-hot column names (and order) for the animal species when it is
# the only additional feature selected in AIModelApp.load_data.
list_animals = ['Dog', 'Cat', 'Bovine', 'Swine', 'Ovine', 'Goat', 'Hedgehog',
       'Horse', 'Donkey', 'Wolf', 'Reference strain (CCUG)',
       'Water buffalo','Wild boar']
# Expected prefixed one-hot column names (and order) for the animal species
# when haemolysis is selected as well.
list_animals_agg = ['Animal species of origin_Bovine', 'Animal species of origin_Cat',
       'Animal species of origin_Dog', 'Animal species of origin_Donkey',
       'Animal species of origin_Goat', 'Animal species of origin_Hedgehog',
       'Animal species of origin_Horse', 'Animal species of origin_Ovine',
       'Animal species of origin_Reference strain (CCUG)',
       'Animal species of origin_Swine',
       'Animal species of origin_Water buffalo',
       'Animal species of origin_Wolf',
       'Animal species of origin_Wild boar']
# Expected one-hot column names (and order) for the haemolysis feature.
list_haem = ['Haemolysis_a', 'Haemolysis_b']
# Not referenced by the code visible in this file.
list_subs = ["K-means_Canis", "K-means_Dysgalactiae", "K-means_Equisimilis"]

# Base classifiers keyed by name. In the visible code only the keys are used
# (AIModelApp.predict builds the path of each pickled model from them); the
# estimator instances created here are not fitted in this file.
models = {
  'LogisticRegression': LogisticRegression(random_state=RANDOM_STATE),
  'Ridge' : RidgeClassifier(random_state=RANDOM_STATE),
  #'DecisionTree': DecisionTreeClassifier(random_state=RANDOM_STATE),
  #'K-nn': KNeighborsClassifier(),
  'RandomForest': RandomForestClassifier(random_state=RANDOM_STATE),
  'BernoulliNB': BernoulliNB(),
  'GaussianNB': GaussianNB(),
  #'NearestCentroid': NearestCentroid(),
  'SVC' : SVC(),
  'LinearSVC' : LinearSVC(),
  'LabelPropagation' : LabelPropagation(),
  'LabelSpreading' : LabelSpreading(),
  'SGDClassifier' : SGDClassifier()
  #'stack' : StackingCVClassifier
}

# Names of the pickled clustering models loaded by prediction_cluster.
models_cluster = [
  'K-means',
  #'AgglomerativeClustering'
]

In [4]:
def prediction_cluster(X, y, str_df):
  pred_cluster = pd.DataFrame()

  #Dataframe con risultati metriche per ogni modello
  metrics_df_cluster = pd.DataFrame(columns=['Target', 'Dataframe', 'Model',
                              'Silhouette', 'Calinski', 'Davies'])

  for name in models_cluster:
    print("Modello "+name)
    model = pickle.load(open('../models/cluster_'+tutti_picchi+str_df+'_'+name+'.pkl', "rb"))
    y_pred = model.predict(X)
    y_pred = pd.DataFrame(y_pred,X.index)
    pred_cluster[name] = y_pred
  pred_cluster.index = X.index 
  pred_cluster.to_csv('../new_prediction/cluster_'+str_df+'.csv', index = True)  
  display(pred_cluster)
  return pred_cluster

In [6]:
class AIModelApp:
    """Tkinter front-end that runs the saved models on a MALDI spreadsheet.

    Workflow: the user picks an ``.xlsx`` file, a form window lets them choose
    the number of peak columns, the additional features and the targets, a
    second window previews the sheet, and "Avvia previsione" loads the pickled
    models and shows / saves their predictions.

    All secondary windows are ``tk.Toplevel`` children of ``root``. Creating
    extra ``tk.Tk()`` roots would start separate Tcl interpreters, and the
    ``BooleanVar`` objects behind the check boxes would then live in a
    different interpreter than the widgets and never reflect the user's
    choice.
    """

    # Placeholder shown in the path entry (backslashes escaped explicitly).
    PATH_PLACEHOLDER = "C:\\...\\$filename.xlsx "
    # Integer m/z columns [MZ_MIN, MZ_MAX) that the feature matrix always has.
    MZ_MIN = 2000
    MZ_MAX = 16500

    def __init__(self, root):
        """Build the main window with the path entry and the load button.

        Parameters
        ----------
        root : tkinter.Tk
            Application root window.
        """
        self.root = root
        self.root.title("Streptococcus Subspecies Predictor")
        self.root.resizable(0, 0)

        # No spreadsheet selected yet; predict() checks this before working.
        self.file_path = None

        self.frame_path = ttk.Frame(self.root)
        self.frame_path.pack()

        self.path_frame = ttk.LabelFrame(self.frame_path, text='Posizione File:')
        self.path_frame.grid(column=0, row=0)

        self.path_entry = ttk.Entry(self.path_frame)
        self.path_entry.insert(0, self.PATH_PLACEHOLDER)
        # The entry is cleared every time it receives keyboard focus.
        self.path_entry.bind("<FocusIn>", lambda e: self.path_entry.delete('0', 'end'))
        self.path_entry.grid(column=0, row=0, sticky="ew")

        ttk.Button(self.frame_path, text='Carica file Maldi', command=self.load_file).grid(column=1, row=0)

    @staticmethod
    def _selected_names(variables):
        """Return the keys of ``variables`` whose value's ``get()`` is truthy."""
        return [name for name, variable in variables.items() if variable.get()]

    @staticmethod
    def _load_model(path):
        """Unpickle the model stored at ``path``, closing the file afterwards.

        SECURITY: unpickling runs arbitrary code; only load trusted files.
        """
        with open(path, 'rb') as model_file:
            return pickle.load(model_file)

    @staticmethod
    def _align_dummies(dummies, expected_columns):
        """Add missing one-hot columns as 0 and return them in the expected order."""
        for column in expected_columns:
            if column not in dummies.columns:
                dummies[column] = 0
        return dummies[expected_columns]

    def _add_checkbuttons(self, names, column, first_row=2):
        """Create one check box per name in ``option_frame``.

        Returns ``(variables, widgets)``, two dicts keyed by name.
        """
        variables, widgets = {}, {}
        for row, name in enumerate(names, start=first_row):
            # Bind the variable to the window that owns the check box.
            variables[name] = tk.BooleanVar(master=self.window)
            widgets[name] = ttk.Checkbutton(self.option_frame, text=name, variable=variables[name])
            widgets[name].grid(column=column, row=row, sticky='nsew')
        return variables, widgets

    def _close_child_windows(self):
        """Destroy the form / preview windows left over from a previous file."""
        for attribute in ('window', 'window_table'):
            previous = getattr(self, attribute, None)
            if previous is not None and previous.winfo_exists():
                previous.destroy()

    def create_frame_form(self):
        """Open the options form (peak count, features, targets) and the preview."""
        self.feature_vars = ["Specie animale", "Haemolysis"]
        self.target_vars = ["Sottospecie", "Clindamicina", "IsaE"]
        # A Toplevel shares the root's interpreter (and therefore its theme).
        self.window = tk.Toplevel(self.root)
        self.window.title('Elements table')

        self.option_frame = ttk.LabelFrame(self.window, text='Elementi tabella')
        self.option_frame.grid(row=0, column=0)

        self.picchi_spinbox = ttk.Spinbox(self.option_frame, from_=1, to=10000)
        self.picchi_spinbox.insert(0, "56")
        self.picchi_spinbox.grid(row=0, column=0, padx=5, pady=5, sticky="ew")

        ttk.Label(self.option_frame, text='Features aggiuntive:').grid(column=0, row=1, sticky="ew")
        self.feat_case, self.feat_case_check = self._add_checkbuttons(self.feature_vars, column=0)

        ttk.Label(self.option_frame, text='Target ricercati:').grid(column=1, row=1, sticky="ew")
        self.target_case, self.target_case_check = self._add_checkbuttons(self.target_vars, column=1)

        self.button_frame = ttk.Frame(self.window)
        self.button_frame.grid(row=1, column=0)
        # Keep the widgets themselves: .grid() returns None.
        self.button_modify = ttk.Button(self.button_frame, text='Modifica tabella', command=self.modify_table)
        self.button_modify.grid(column=0, row=6)
        self.button_predict = ttk.Button(self.button_frame, text='Avvia previsione', command=self.predict)
        self.button_predict.grid(column=1, row=6)

        # Apply uniform padding to everything placed in the two frames.
        for widget in self.option_frame.winfo_children():
            widget.grid(padx=5, pady=5)
        for widget in self.button_frame.winfo_children():
            widget.grid(padx=5, pady=5)

        self.preview_table()

    def preview_table(self):
        """Show the active sheet of the selected workbook in a Treeview window."""
        self.window_table = tk.Toplevel(self.root)
        self.window_table.geometry("600x400")
        self.window_table.title('Preview Table')

        self.treeFrame = ttk.Frame(self.window_table)
        self.treeFrame.grid(row=0, column=0)
        self.treeScrollX = ttk.Scrollbar(self.treeFrame, orient='horizontal')
        self.treeScrollX.pack(side="bottom", fill="x")

        self.workbook = openpyxl.load_workbook(self.file_path)
        self.sheet = self.workbook.active

        self.list_values = list(self.sheet.values)
        if not self.list_values:
            # Nothing to preview: without a header row there are no columns.
            messagebox.showerror("Errore", "Il file selezionato non contiene dati!")
            return
        cols = self.list_values[0]

        self.treeview = ttk.Treeview(self.treeFrame, show="headings", columns=cols,
                                 xscrollcommand=self.treeScrollX.set)
        # Only resize the descriptive columns that the sheet actually has.
        for col_name, width in (("ID Strain", 70),
                                ("Animal species of origin", 150),
                                ("Haemolysis", 70)):
            if col_name in cols:
                self.treeview.column(col_name, width=width)
        self.treeScrollX.config(command=self.treeview.xview)
        self.treeview.pack()

        for col_name in cols:
            self.treeview.heading(col_name, text = col_name)

        for value_tuple in self.list_values[1:]:
            self.treeview.insert('',tk.END, values=value_tuple)

    def load_file(self):
        """Ask for an ``.xlsx`` file and, if one is chosen, open the form."""
        selected_path = filedialog.askopenfilename(filetypes=[("Xlsx File", "*.xlsx")])
        if not selected_path:
            # The dialog was cancelled; keep the current state untouched.
            return

        self.file_path = selected_path
        # Replace the placeholder / previous path instead of prepending to it.
        self.path_entry.delete(0, 'end')
        self.path_entry.insert(0, self.file_path)
        self._close_child_windows()
        self.create_frame_form()

    def modify_table(self):
        """Placeholder for the "Modifica tabella" button; currently returns 0."""
        return 0

    def load_data(self, features, targets, n):
        """Read the spreadsheet and build the feature matrix for the models.

        Parameters
        ----------
        features : list of str
            Additional features to append; ``'Specie animale'`` and / or
            ``'Haemolysis'``, processed in the given order.
        targets : list of str
            Target names as shown in the form.
        n : int
            Number of spectrum columns to read, starting at the module-level
            positional offset ``start``.

        Returns
        -------
        tuple
            ``(data, str_df, targets_col)``: the feature DataFrame, the dataset
            tag used in model file names, and the target names with
            ``'Sottospecie'`` translated to ``'subspecies'``.
        """
        self.df = pd.read_excel(self.file_path,
                             index_col='ID Strain')

        # Work on a copy so the cleaning steps never write into self.df.
        maldi = self.df.iloc[:, start:start+n].copy()
        # Missing peaks become 0; decimal commas become dots before the cast.
        maldi = maldi.fillna(0).replace(',', '.', regex=True).astype(float)

        # Column labels are m/z values; truncate them to integers. Going
        # through str/float also accepts labels such as "2034,5" or "2034.5".
        maldi.columns = [int(float(str(label).replace(',', '.'))) for label in maldi.columns]

        # Make sure every integer m/z in [MZ_MIN, MZ_MAX) has a column (0 when
        # absent) and order the columns numerically, in a single reindex.
        all_columns = sorted(set(maldi.columns) | set(range(self.MZ_MIN, self.MZ_MAX)))
        maldi = maldi.reindex(columns=all_columns, fill_value=0)

        maldi.columns = maldi.columns.astype(str)
        data = maldi

        # Dataset tag: '' (spectrum only), 'animals', 'hae' or 'agg' (both).
        str_df = ''
        with_animals = 'Specie animale' in features
        with_haemolysis = 'Haemolysis' in features

        for feature in features:
            if feature == 'Specie animale':
                animals_dummies = pd.get_dummies(self.df.iloc[:, 0])
                if with_haemolysis:
                    str_df = 'agg'
                    animals_dummies = animals_dummies.add_prefix('Animal species of origin_')
                    expected_animals = list_animals_agg
                else:
                    str_df = 'animals'
                    expected_animals = list_animals
                # Match the column set and order of the expected list.
                animals_dummies = self._align_dummies(animals_dummies, expected_animals)
                data = pd.concat([data, animals_dummies], axis=1)
            elif feature == 'Haemolysis':
                if not with_animals:
                    str_df = 'hae'
                haem_dummies = pd.get_dummies(self.df.iloc[:, 1]).rename(
                    columns={'a': 'Haemolysis_a', 'b': 'Haemolysis_b'})
                haem_dummies = self._align_dummies(haem_dummies, list_haem)
                data = pd.concat([data, haem_dummies], axis=1)

        targets_col = ['subspecies' if target == 'Sottospecie' else target
                       for target in targets]

        # NOTE(review): the previous code reset str_df to '' right here, which
        # threw away the tag computed above; the tag is now kept. The suffix is
        # hard-coded and does not depend on n; confirm it matches the names of
        # the saved model files.
        str_df = str_df+'_npicchi306'
        return data, str_df, targets_col

    def predict(self):
        """Run the base and stacked models for the selected targets.

        Writes one CSV per target with the base-model predictions and one CSV
        with the stacked predictions under ``../new_prediction/``, then opens a
        window showing the stacked predictions.
        """
        if not self.file_path:
            messagebox.showerror("Errore", "Carica prima un file Excel (.xlsx)!")
            return

        try:
            num_picchi = int(self.picchi_spinbox.get())
        except ValueError:
            messagebox.showerror("Errore", "Inserisci un numero valido per i picchi!")
            return

        features = self._selected_names(self.feat_case)
        targets = self._selected_names(self.target_case)
        if not targets:
            messagebox.showwarning("Attenzione", "Seleziona almeno un target!")
            return

        data, str_df, targets_col = self.load_data(features, targets, num_picchi)

        if 'subspecies' in targets_col:
            # Called for its side effects (CSV + display of the cluster labels);
            # the cluster labels are not fed to the classifiers below.
            prediction_cluster(data, 'subspecies', str_df)

        X = data
        name_suffix = tutti_picchi+reduction+scaled+scaler

        # Base models: one prediction column per model, one CSV per target.
        base_predictions = {}
        for target in targets_col:
            base_predictions[target] = pd.DataFrame(index=X.index)
            for name in models:
                path = '../models/models_base/'+name+'_'+name_suffix+target+'_'+str_df+'.pkl'
                print(path)
                base_predictions[target][name] = self._load_model(path).predict(X)

            # NOTE(review): `name` is the last key of `models`, so the file name
            # carries that model's name although the file holds every base
            # model's predictions. Kept to avoid renaming the output file.
            base_predictions[target].to_csv('../new_prediction/'+target+name_suffix+'_basemodel_'+name+'_'+str_df+'.csv', index = True)
            display(base_predictions[target])

        # Stacked model: one prediction column per target.
        prediction = pd.DataFrame(index = X.index)
        for target in targets_col:
            model = self._load_model('../models/stack_'+name_suffix+target+'_'+str_df+'.pkl')
            prediction[target] = model.predict(X)
            # Translate integer labels back to readable names where a map exists.
            if target == 'Clindamicina':
                prediction[target] = prediction[target].map(map_target_antibiotici_inv)
            if target == 'subspecies':
                prediction[target] = prediction[target].map(map_target_inv)
        prediction.to_csv('../new_prediction/stack_'+name_suffix+str_df+'.csv', index = True)
        display(prediction)

        # Show the stacked predictions in a new window sized to its content.
        result_window = tk.Toplevel(self.root)
        result_window.title("Risultato previsione")

        # Iterate the column names (targets_col), not the form labels: the
        # 'Sottospecie' target is stored under the 'subspecies' column.
        for target in targets_col:
            result_label = tk.Label(result_window, text=f"Risultato {target} previsione: {prediction[target]}")
            result_label.pack(pady=20)

        # Button that closes the result window.
        close_button = tk.Button(result_window, text="Esci", command=result_window.destroy)
        close_button.pack()

# Launch the GUI when this cell / script is executed directly.
if __name__ == "__main__":
    root = tk.Tk()
    style = ttk.Style(root)
    # Load the theme definition from forest-dark.tcl (given as a relative
    # path) and make it the active ttk theme.
    root.tk.call("source", "forest-dark.tcl")
    style.theme_use("forest-dark")
    app = AIModelApp(root)
    # Blocks until the main window is closed.
    root.mainloop()
