## Preprocessing

In [None]:
import os
from timeit import default_timer as timer

import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance
import lightgbm as lgb
from lightgbm import LGBMClassifier
import shap
import matplotlib.pyplot as plt
import tqdm

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder

In [None]:
# Switch to the project root so the relative data path used below resolves.
# The location can be overridden per machine through the BACTERIOLOGICAL_DIR
# environment variable; when it is unset, the original hardcoded path is used.
PROJECT_DIR = os.environ.get(
    'BACTERIOLOGICAL_DIR',
    'c:/users/fre_f/pythonprojects/bacteriological/',
)
os.chdir(PROJECT_DIR)

In [None]:
# Read the input CSV into a DataFrame. The path is relative, so it is
# resolved against the current working directory.
data_file = './data/Genus_DESL.csv'
df = pd.read_csv(data_file)

In [None]:
# Preview the first five rows of the loaded table.
df.head(n=5)

In [None]:
# Preview the positional slice that is later assigned to X:
# all rows, columns at positions 2 through 599.
df.iloc[:, 2:600]

In [None]:
# Preview the column at position 1, which is later assigned to y.
df.iloc[:, 1]

In [None]:
# Split the table by column position: X is passed to the classifiers below
# as the feature matrix, y as the class labels.
# NOTE(review): a positional slice is clipped silently if the frame has fewer
# than 600 columns - confirm the expected column count of the CSV.
feature_columns = slice(2, 600)
target_column = 1

X = df.iloc[:, feature_columns]
y = df.iloc[:, target_column]

## Train, predict and explain

### XGBoost

In [54]:
# XGBoost evaluated with leave-one-out cross-validation.

# The labels are integer-encoded with LabelEncoder before being handed to
# XGBClassifier.
le = LabelEncoder()
le.fit(y)
y_xgb = le.transform(y)

# Kept as a list so cross_validate reports the result under a key named after
# the scorer.
scoring = ['f1_weighted']

# NOTE(review): with LeaveOneOut every test fold holds a single sample, so
# each per-fold f1_weighted value is either 0.0 or 1.0 and their mean equals
# plain accuracy. Pooling the held-out predictions and scoring them once
# would give a real F1 - confirm which is intended.
looxgb = LeaveOneOut()
clfxgb = XGBClassifier(random_state=42)
xgb_scores = cross_validate(
    estimator=clfxgb,
    X=X,
    y=y_xgb,
    scoring=scoring,
    cv=looxgb,
)


In [55]:
# Display the raw cross_validate result for XGBoost: a dict of per-fold
# arrays (the stored output shows 'fit_time' and 'score_time').
xgb_scores

{'fit_time': array([0.35795212, 0.36647773, 0.37270188, 0.37232685, 0.3637383 ,
        0.3819623 , 0.42426085, 0.43257499, 0.47839713, 0.44162798,
        0.43728924, 0.44355345, 0.48326778, 0.45343781, 0.45857191,
        0.46914005, 0.44369864, 0.45211458, 0.45962214, 0.44283271,
        0.43702364, 0.44392109, 0.43498397, 0.42000175, 0.43300247,
        0.43833113, 0.45280743, 0.45444226, 0.44372296, 0.4342854 ,
        0.4471724 , 0.45114207, 0.46797371, 0.43994308, 0.45329404,
        0.44049406, 0.45454788, 0.45683432, 0.43274832, 0.44979191,
        0.44952273, 0.45472383, 0.45292521, 0.44776464, 0.4508419 ,
        0.47417283, 0.47305059, 0.48154092, 0.46846676, 0.46272635,
        0.4770515 , 0.47290754, 0.49470949, 0.45540524, 0.47462654,
        0.46225333, 0.46141696, 0.45608091, 0.45486021, 0.45693088,
        0.47331166]),
 'score_time': array([0.0181849 , 0.01808238, 0.025069  , 0.02424431, 0.01804996,
        0.02974224, 0.03280187, 0.03023982, 0.02169061, 0.03125763,


### LightGBM

In [56]:
# LightGBM evaluated with leave-one-out cross-validation.
# The label column y is passed as-is here (no LabelEncoder step, unlike the
# XGBoost cell).

# Kept as a list so cross_validate reports the result under a key named after
# the scorer.
scoring = ['f1_weighted']

# NOTE(review): with LeaveOneOut every test fold holds a single sample, so
# each per-fold f1_weighted value is either 0.0 or 1.0 and their mean equals
# plain accuracy. Pooling the held-out predictions and scoring them once
# would give a real F1 - confirm which is intended.
loolgbm = LeaveOneOut()
clflgbm = LGBMClassifier(random_state=42)
lgbm_scores = cross_validate(
    estimator=clflgbm,
    X=X,
    y=y,
    scoring=scoring,
    cv=loolgbm,
)

In [57]:
# Display the raw cross_validate result for LightGBM: a dict of per-fold
# arrays (the stored output shows 'fit_time' and 'score_time').
lgbm_scores

{'fit_time': array([0.06967068, 0.06085372, 0.03634334, 0.04246044, 0.04180646,
        0.04233122, 0.04391956, 0.04238629, 0.04237247, 0.03635931,
        0.04942203, 0.04825902, 0.05444622, 0.05959558, 0.07372069,
        0.06040144, 0.05767465, 0.0504806 , 0.06031919, 0.05178761,
        0.05065084, 0.04826188, 0.05837202, 0.0495522 , 0.04688787,
        0.05412745, 0.04688883, 0.04726362, 0.03437352, 0.04025722,
        0.03978801, 0.05338335, 0.04032826, 0.07151246, 0.04688644,
        0.05632186, 0.04688668, 0.0472734 , 0.03126264, 0.03163648,
        0.04047418, 0.03825927, 0.02422619, 0.04687405, 0.05143332,
        0.04888225, 0.0386219 , 0.03124857, 0.04727006, 0.03126454,
        0.03164744, 0.07157707, 0.05579782, 0.05277133, 0.05031848,
        0.06289911, 0.04029822, 0.06239152, 0.05270529, 0.05927968,
        0.04727221]),
 'score_time': array([0.        , 0.00606275, 0.00609207, 0.00561762, 0.00613284,
        0.00604796, 0.00461435, 0.00611663, 0.00607586, 0.00769162,


### Multilayer perceptron

In [58]:
# Multilayer perceptron evaluated with leave-one-out cross-validation.
# MLPClassifier is imported in the notebook's top import cell.
#
# The former `clfmlp.out_activation_ = "softmax"` assignment was removed.
# `out_activation_` is set by fit() itself, and cross_validate fits clones
# built from the constructor parameters only, so the manual assignment never
# reached any fitted model. It also left a trailing-underscore attribute on an
# estimator that had not been fitted, which is the convention scikit-learn
# uses to mark fitted state.
clfmlp = MLPClassifier(
    solver="adam",
    activation="relu",
    learning_rate="constant",
    random_state=42,
    max_iter=3000,
    verbose=False,
)

# Kept as a list so cross_validate reports the result under a key named after
# the scorer.
scoring = ['f1_weighted']

# NOTE(review): with LeaveOneOut every test fold holds a single sample, so
# each per-fold f1_weighted value is either 0.0 or 1.0 and their mean equals
# plain accuracy. Pooling the held-out predictions and scoring them once
# would give a real F1 - confirm which is intended.
loomlp = LeaveOneOut()
mlp_scores = cross_validate(clfmlp, X, y, cv=loomlp, scoring=scoring)

In [13]:
# Display the raw cross_validate result for the MLP: a dict of per-fold
# arrays (the stored output shows 'fit_time' and 'score_time').
mlp_scores

{'fit_time': array([1.15124893, 1.13180661, 1.12348795, 1.11253762, 1.0327301 ,
        1.08632827, 1.07356429, 1.13688278, 1.06864095, 1.10787559,
        1.1359446 , 1.13596559, 1.1419642 , 1.06195903, 1.10058141,
        1.21590781, 1.21515846, 1.16775227, 1.12964678, 1.18704486,
        1.03480268, 1.14733267, 1.14688277, 1.26409721, 1.05851269,
        1.07968211, 1.10201979, 1.32567525, 1.33324385, 1.26595473,
        1.27010584, 1.33705735, 1.38004303, 1.18911695, 1.28280592,
        1.40883517, 1.25776386, 1.23089933, 1.32003713, 1.13269424,
        1.18157077, 1.40812492, 1.49038196, 1.45721126, 1.45493889,
        1.3393631 , 1.23874021, 1.26300812, 1.17970824, 1.15119743,
        1.20248461, 1.26914668, 1.24925923, 1.30362558, 1.4275496 ,
        1.32264185, 1.23635983, 1.20800066, 1.16651869, 1.11043262,
        1.15139747]),
 'score_time': array([0.00622296, 0.0055244 , 0.00754285, 0.        , 0.0155766 ,
        0.        , 0.        , 0.        , 0.        , 0.01562405,
