In [2]:
import tkinter as tk
from tkinter import filedialog
from tkinter import messagebox
from tkinter import TclError, ttk

import openpyxl

In [3]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pickle
from scipy.spatial import distance
from sklearn.utils.multiclass import unique_labels
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_samples, silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import balanced_accuracy_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, StratifiedKFold, KFold, GridSearchCV, RandomizedSearchCV, ParameterGrid
from sklearn.linear_model import LogisticRegression, RidgeClassifier, LassoCV, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.base import clone
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn import model_selection
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from mlxtend.classifier import StackingCVClassifier
from sklearn.ensemble import StackingClassifier
import warnings

warnings.filterwarnings("ignore")
np.set_printoptions(precision=5, suppress=True)
RANDOM_STATE = 46
N_JOBS = -1
class_names = ["Canis", "Dysg. Equisimilis", "Dysg. Dysgalactiae"]
map_target = {
    "Streptococcus canis": 2,
    "Streptococcus dysgalactiae subsp. dysgalactiae": 1,
    "Streptococcus dysgalactiae subsp. equisimilis": 0
}
map_target_inv = {
    2: "Canis",
    1: "Dysgalactiae",
    0: "Equisimilis"
}
map_target_antibiotici = {
    "S" : 1,
    "NS" : 0
}
map_target_antibiotici_inv = {
    1 : "S",
    0 : "NS"
}
maps_cluster = {
    2 : 0,
    1 : 2,
    0 : 1
}
metrics = ['accuracy', 'recall_weighted', 'precision_weighted','f1_weighted']
metrics_cluster = ['Silhouette', 'Calinski', 'Davies']
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
start = 2
n_antibiotici = 9
n_geni = 27
n_virulenza = 18
scaled = ''
scaler = ''
tutti_picchi = 'tutti_picchi_'
reduction = ''


In [4]:
# Define a function for standard scaling
def standard_scaler(X_train, X_test):
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled

# Define a function for dimensionality reduction using PCA
def dimensionality_reduction(X_train, X_test, n_components):
    X_train.columns = X_train.columns.astype(str)
    X_test.columns = X_test.columns.astype(str)
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    X_train_pca = pd.DataFrame(X_train_pca)
    X_test_pca = pd.DataFrame(X_test_pca)
    #print(X_train_pca.shape)
    return X_train_pca, X_test_pca

def dimensionality_reduction_cluster(X, n_components):
    X.columns = X.columns.astype(str)
    print(X.shape)
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)
    X_pca = pd.DataFrame(X_pca, index=X.index.to_list())
    print(X_pca.shape)
    X_pca.columns = X_pca.columns.astype(str)
    return X_pca

def makeScoreMeanWithoutNaN(metrics):
    for name, metrica in metrics.items():
        print(name)
        print(metrics[name])
        metrics[name] = metrics[name][~np.isnan(metrics[name])]
        print(metrics[name])
        metrics[name] = np.mean(metrics[name])
        print(metrics[name])
    print(metrics)
    return metrics

def makeScore(y_test, y_pred):
    score = {}

    score['acc'] = accuracy_score(y_test, y_pred)
    score['b_acc'] = balanced_accuracy_score(y_test, y_pred)
    score['st'] = score['acc'].std()
    score['prec'] = precision_score(y_test, y_pred, average='weighted')
    score['rec'] = recall_score(y_test, y_pred, average='weighted')
    score['f1'] = f1_score(y_test, y_pred, average='weighted')

    return score

def makeCrossValidation(model, X_train, y_train):
    score = {}
    cv = cross_validate(estimator=model, X=X_train, y=y_train,
                        scoring=metrics, cv=skfold,
                        n_jobs=N_JOBS, verbose=0)

    score['acc'] = cv.get('test_accuracy').mean()
    score['st'] = cv.get('test_accuracy').std()
    score['prec'] = cv.get('test_precision_weighted').mean()
    score['rec'] = cv.get('test_recall_weighted').mean()
    score['f1'] = cv.get('test_f1_weighted').mean()

    return score

def makeCrossValidationCluster(model, X):
    model.fit(X)
    labels = model.labels_
    pca = PCA(n_components = 2)
    pca.fit(X)
    X_pca = pca.transform(X)
    avg_silhouette = silhouette_score(X_pca, labels)
    avg_calinski_harabasz = calinski_harabasz_score(X_pca, labels)
    avg_davies_bouldin = davies_bouldin_score(X_pca, labels)

    score = {}
    score['Silhouette'] = avg_silhouette
    score['Calinski'] = avg_calinski_harabasz
    score['Davies'] = avg_davies_bouldin

    return score

N_CLUSTERS = 3
list_animals = ['Dog', 'Cat', 'Bovine', 'Swine', 'Ovine', 'Goat', 'Hedgehog',
       'Horse', 'Donkey', 'Wolf', 'Reference strain (CCUG)',
       'Water buffalo','Wild boar']
list_animals_agg = ['Animal species of origin_Bovine', 'Animal species of origin_Cat',
       'Animal species of origin_Dog', 'Animal species of origin_Donkey',
       'Animal species of origin_Goat', 'Animal species of origin_Hedgehog',
       'Animal species of origin_Horse', 'Animal species of origin_Ovine',
       'Animal species of origin_Reference strain (CCUG)',
       'Animal species of origin_Swine',
       'Animal species of origin_Water buffalo',
       'Animal species of origin_Wolf',
       'Animal species of origin_Wild boar']
list_haem = ['Haemolysis_a', 'Haemolysis_b']
list_subs = ["K-means_Canis", "K-means_Dysgalactiae", "K-means_Equisimilis"]

models = {
  'LogisticRegression': LogisticRegression(random_state=RANDOM_STATE),
  'Ridge' : RidgeClassifier(random_state=RANDOM_STATE),
  #'DecisionTree': DecisionTreeClassifier(random_state=RANDOM_STATE),
  #'K-nn': KNeighborsClassifier(),
  'RandomForest': RandomForestClassifier(random_state=RANDOM_STATE),
  'BernoulliNB': BernoulliNB(),
  'GaussianNB': GaussianNB(),
  #'NearestCentroid': NearestCentroid(),
  'SVC' : SVC(),
  'LinearSVC' : LinearSVC(),
  'LabelPropagation' : LabelPropagation(),
  'LabelSpreading' : LabelSpreading(),
  'SGDClassifier' : SGDClassifier()
  #'stack' : StackingCVClassifier
}

models_cluster = [
  'K-means',
  #'AgglomerativeClustering'
]

In [5]:
def preview_data(path):
    window = tk.Tk()
    window.title("Excel Viewer")
    workbook = openpyxl.load_workbook(path)
    sheet = workbook.active

    list_values = list(sheet.values)
    cols = list_values[0]
    tree = ttk.Treeview(window, column= cols, show="headings")
    for col_name in cols:
        tree.heading(col_name, text = col_name)
    tree.pack(expand=True, fill='x')
    
    for value_tuple in list_values[1:]:
        tree.insert('',tk.END, values=value_tuple)

In [6]:
def load_data(self, features, targets, n):
    df = pd.read_csv(self.file_path,
                        delimiter=';', index_col='ID Strain')
    display(df)
    maldi = df.iloc[:,start:start+n]
    maldi.fillna(0, inplace=True)
    maldi = maldi.replace(',', '.', regex=True)
    columns = maldi.columns
    for column in columns:
        maldi[column] = maldi[column].astype(float)
    display(maldi)
    
    col = maldi.columns.to_list()
    col = [i.replace(',', '.') for i in col]
    col = [int(float(i)) for i in col]

    maldi.columns = col
    
    for i in range(2000,16500):
        if i not in maldi.columns:
            maldi[i] = 0
    maldi = maldi.reindex(sorted(maldi.columns), axis=1)
    maldi = maldi.fillna(0)
    
    maldi.columns = maldi.columns.astype(str)
    data = maldi
    
    for feature in features:
        if feature == 'Specie animale':
            animals = df.iloc[:,0]
            #display(animals)
            animals_dummies = pd.DataFrame.from_dict(pd.get_dummies(animals))
            if ('Specie animale' and 'Haemolysis') in features:
                str_df = 'agg'
                columns = animals_dummies.columns
                for column in columns:
                    animals_dummies.rename(columns = {column : 'Animal species of origin_'+column}, inplace=True)
                missing_cols_animals = set(list_animals_agg) - set(animals_dummies.columns)
                # Add a missing column in test set with default value equal to 0
                for c in missing_cols_animals:
                    animals_dummies[str(c)] = 0
                # Ensure the order of column in the test set is in the same order than in train set
                animals_dummies = animals_dummies[list_animals_agg]
            else:
                str_df = 'animals'
                missing_cols_animals = set(list_animals) - set(animals_dummies.columns)
                # Add a missing column in test set with default value equal to 0
                for c in missing_cols_animals:
                    animals_dummies[str(c)] = 0
                # Ensure the order of column in the test set is in the same order than in train set
                animals_dummies = animals_dummies[list_animals]
        if feature == 'Haemolysis':
            haem = df.iloc[:,1]
            #display(haem)
            if 'Specie animale' not in features:
                str_df = 'hae'
            haem_dummies = pd.DataFrame.from_dict(pd.get_dummies(haem))
            haem_dummies.rename(columns = {'a' : 'Haemolysis_a', 'b' : 'Haemolysis_b'}, inplace=True)
            missing_cols_haem = set(list_haem) - set(haem_dummies.columns)
            for c in missing_cols_haem:
                haem_dummies[str(c)] = 0
            haem_dummies = haem_dummies[list_haem]
    
            
    
    data = pd.concat([data,animals_dummies], axis=1)
    data = pd.concat([data,haem_dummies], axis=1)
    display(data)
    
    targets_col = list()
    for target in targets:
        if target == 'Sottospecie':
            targets_col.append('subspecies')
        else:
            targets_col.append(target)
    print(targets_col)
    
    str_df = str_df+'_npicchi306'
    return data, str_df, targets_col
            

In [7]:
def prediction_cluster(X, y, str_df):
  pred_cluster = pd.DataFrame()

  #Dataframe con risultati metriche per ogni modello
  metrics_df_cluster = pd.DataFrame(columns=['Target', 'Dataframe', 'Model',
                              'Silhouette', 'Calinski', 'Davies'])

  for name in models_cluster:
    print("Modello "+name)
    model = pickle.load(open('../models/cluster_'+tutti_picchi+str_df+'_'+name+'.pkl', "rb"))
    y_pred = model.predict(X)
    y_pred = pd.DataFrame(y_pred,X.index)
    pred_cluster[name] = y_pred
  pred_cluster.index = X.index 
  pred_cluster.to_csv('../new_prediction/cluster_'+str_df+'.csv', index = True)  
  display(pred_cluster)
  return pred_cluster

In [8]:
class AIModelApp:

    def __init__(self, root):
        self.root = root
        self.root.title("Streptococcus Subspecies Predictor")
        self.root.resizable(0, 0)

        '''try:
            # windows only (remove the minimize/maximize button)
            self.root.attributes('-toolwindow', True)
        except TclError:
            print('Not supported on your platform')'''
        
        self.root.columnconfigure(0, weight=4)
        self.root.columnconfigure(1, weight=1)
        self.file_path = None

        feature_vars = ["Specie animale", "Haemolysis"]
        target_vars = ["Sottospecie", "Clindamicina", "IsaE"]

        self.frame = ttk.Frame(root)

        # grid layout for the input frame
        self.frame.columnconfigure(0, weight=1)
        self.frame.columnconfigure(0, weight=3)
        # Find what
        ttk.Label(self.frame, text='Anteprima File:').grid(column=0, row=0, sticky=tk.W)

        # Replace with:
        ttk.Label(self.frame, text='Numero picchi:').grid(column=0, row=1, sticky=tk.W)
        self.num_picchi_entry = ttk.Entry(self.frame)
        self.num_picchi_entry.insert(0, "56")
        self.num_picchi_entry.grid(column=1, row=1, sticky=tk.W)
        
        ttk.Label(self.frame, text='Features aggiuntive:').grid(column=0, row=2, sticky=tk.W)
        # Match Case checkbox
        row = 3
        self.feat_case = {}
        self.feat_case_check = {}
        for var in feature_vars:
            self.feat_case[var] = tk.StringVar()
            self.feat_case_check[var] = ttk.Checkbutton(
                self.frame,
                text=var,
                variable=self.feat_case[var],
                onvalue=var,
                offvalue='')
            self.feat_case_check[var].grid(column=0, row=row, sticky=tk.W)
            row += 1

        ttk.Label(self.frame, text='Target ricercati:').grid(column=1, row=2, sticky=tk.W)
        # Match Case checkbox
        row = 3
        self.target_case = {}
        self.target_case_check = {}
        for var in target_vars:
            self.target_case[var] = tk.StringVar()
            self.target_case_check[var] = ttk.Checkbutton(
                self.frame,
                text=var,
                variable=self.target_case[var],
                command=lambda: print(self.target_case_check[var].get()))
            self.target_case_check[var].grid(column=1, row=row, sticky=tk.W)
            row += 1

        for widget in self.frame.winfo_children():
            widget.grid(padx=5, pady=5)

        self.frame.grid(column=0, row=0)

        button_frame = ttk.Frame(root)

        button_frame.columnconfigure(0, weight=1)

        ttk.Button(button_frame, text='Carica file XLSX', command=self.load_file).grid(column=0, row=0)
        ttk.Button(button_frame, text='Avvia previsione', command=self.predict).grid(column=0, row=1)
        ttk.Button(button_frame, text='Cancella').grid(column=0, row=2)
        ttk.Button(button_frame, text='Esci').grid(column=0, row=3)

        for widget in button_frame.winfo_children():
            widget.grid(padx=5, pady=5)
        button_frame.grid(column=1, row=0)
        '''self.predict_button = tk.Button(self.root, text="Avvia previsione", command=self.predict, height=2, width=20)
        self.predict_button.pack(pady=10)'''

    def load_file(self):
        self.file_path = filedialog.askopenfilename(filetypes=[("Xlsx File", "*.xlsx")])
        preview_data(self.file_path)

    def predict(self):
        if self.file_path is None:
            messagebox.showerror("Errore", "Carica prima un file CSV!")
            return

        try:
            num_picchi = int(self.num_picchi_entry.get())
        except ValueError:
            messagebox.showerror("Errore", "Inserisci un numero valido per i picchi!")
            return
        
        features = [var for var, checkbox in self.feature_vars_checkbox.items() if checkbox.get()]
        targets = [var for var, checkbox in self.target_vars_checkbox.items() if checkbox.get()]
        print(features)
        print(targets)
        data, str_df, targets_col = load_data(self, features, targets, num_picchi)
        print(data.columns)
        #data = pd.read_csv(self.file_path)
        if 'subspecies' in targets_col:
            y = 'subspecies'
            pred_cluster = prediction_cluster(data, y, str_df)
            data_cluster = pd.concat([data, pred_cluster], axis = 1)
        X = data
        prediction = {}
        for target in targets_col:
            prediction[target] = pd.DataFrame(index = X.index)
            '''if target == 'subspecies':
                X = data_cluster
            else:
                x = data'''
            for name in models:
                #print("Modello "+name)
                path = '../models/models_base/'+name+'_'+tutti_picchi+reduction+scaled+scaler+target+'_'+str_df+'.pkl'
                print(path)
                model = pickle.load(open(path, 'rb'))

                y_pred = model.predict(X)
                prediction[target][name] = y_pred
            prediction[target].index = X.index
            
            #Aggiunge i valori del target nei dizionari
            prediction[target].to_csv('../new_prediction/'+target+tutti_picchi+reduction+scaled+scaler+'_basemodel_'+name+'_'+str_df+'.csv', index = True)
            display(prediction[target])
        
        prediction = pd.DataFrame(index = X.index)
        for target in targets_col:
            model = pickle.load(open('../models/stack_'+tutti_picchi+reduction+scaled+scaler+target+'_'+str_df+'.pkl', "rb"))
            print(model)
            y_pred = model.predict(X)
            display(y_pred)
            prediction[target] = y_pred
            if (target == 'Clindamicina'):
                prediction[target] = prediction[target].map(map_target_antibiotici_inv)
            if (target == 'subspecies'):
                prediction[target] = prediction[target].map(map_target_inv)
        prediction.to_csv('../new_prediction/stack_'+tutti_picchi+reduction+scaled+scaler+str_df+'.csv', index = True)
        display(prediction)
        
        # Mostra i risultati della previsione in una nuova finestra
        result_window = tk.Toplevel(self.root)
        result_window.title("Risultato previsione")
        result_window.geometry("300x100")

        for target in targets:
            result_label = tk.Label(result_window, text=f"Risultato {target} previsione: {prediction[target]}")
            result_label.pack(pady=20)

        # Pulsante per chiudere la finestra
        close_button = tk.Button(result_window, text="Esci", command=result_window.destroy)
        close_button.pack()

if __name__ == "__main__":
    root = tk.Tk()
    style = ttk.Style(root)
    print(style.theme_names())
    root.tk.call("source", "forest-light.tcl")
    root.tk.call("source", "forest-dark.tcl")
    style.theme_use("forest-dark")
    app = AIModelApp(root)
    root.mainloop()


('winnative', 'clam', 'alt', 'default', 'classic', 'vista', 'xpnative')


Exception in Tkinter callback
Traceback (most recent call last):
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.1264.0_x64__qbz5n2kfra8p0\Lib\tkinter\__init__.py", line 1948, in __call__
    return self.func(*args)
           ^^^^^^^^^^^^^^^^
  File "C:\Users\gabri\AppData\Local\Temp\ipykernel_40576\4141399178.py", line 62, in <lambda>
    command=lambda: print(self.target_case_check[var].get()))
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'Checkbutton' object has no attribute 'get'
Exception in Tkinter callback
Traceback (most recent call last):
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.1264.0_x64__qbz5n2kfra8p0\Lib\tkinter\__init__.py", line 1948, in __call__
    return self.func(*args)
           ^^^^^^^^^^^^^^^^
  File "C:\Users\gabri\AppData\Local\Temp\ipykernel_40576\4141399178.py", line 62, in <lambda>
    command=lambda: print(self.target_case_check[var].get()))
                  