## Preprocessing

In [14]:
import os
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import shap
import matplotlib.pyplot as plt
import tqdm
from timeit import default_timer as timer

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

In [15]:
os.chdir('c:/users/fre_f/pythonprojects/bacteriological/')

In [16]:
df = pd.read_csv('./data/Genus_DESL.csv')

In [17]:
df.head()

Unnamed: 0,Index,DESL,Corynebacterium,Staphylococcus,Propionibacterium,sp50014,Moraxella,Blautia,sp50017,sp5276,...,Light treatment,OSDI score,DEQ5 score,TBUT OD,TBUT OS,OSS OD,OSS OS,ST OD,ST OS,Demodicosis
0,P1,3,0.092816,0.001058,0.042075,0.0,0.0,0.0,0.0,0.0,...,0,,,3,3,0,0,23,23,0
1,P2,3,0.698862,0.001191,0.004366,0.0,0.0,0.0,0.0,0.0,...,0,31.25,17.0,2,4,1,3,14,18,0
2,P3,2,0.034017,0.259365,0.00364,0.0,0.0,0.0,0.0,0.0,...,1,33.3,16.0,8,8,3,3,11,8,0
3,P4,3,0.029042,0.177362,0.015877,0.0,0.0,0.127216,0.0,0.0,...,0,93.75,21.0,3,2,2,2,12,11,0
4,P5,1,0.307662,0.164086,0.050351,0.0,0.0,0.000331,0.0,0.000463,...,1,6.25,7.0,11,11,0,0,22,32,0


In [18]:
df.iloc[:,2:600]

Unnamed: 0,Corynebacterium,Staphylococcus,Propionibacterium,sp50014,Moraxella,Blautia,sp50017,sp5276,Finegoldia,Acinetobacter,...,Dyella,Actinopolymorpha,Trichococcus,melongena,Methylophilus,Murdochiella,Intrasporangium,sp10518,sp44151,Limnobacter
0,0.092816,0.001058,0.042075,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.006020,...,0,0.0,0.0,0,0.0,0.0,0.0,0,0,0
1,0.698862,0.001191,0.004366,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0,0.0,0.0,0,0.0,0.0,0.0,0,0,0
2,0.034017,0.259365,0.003640,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0,0.0,0.0,0,0.0,0.0,0.0,0,0,0
3,0.029042,0.177362,0.015877,0.000000,0.0,0.127216,0.0,0.000000,0.0,0.000000,...,0,0.0,0.0,0,0.0,0.0,0.0,0,0,0
4,0.307662,0.164086,0.050351,0.000000,0.0,0.000331,0.0,0.000463,0.0,0.002183,...,0,0.0,0.0,0,0.0,0.0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,0.358462,0.185519,0.108214,0.016877,0.0,0.000397,0.0,0.000000,0.0,0.000000,...,0,0.0,0.0,0,0.0,0.0,0.0,0,0,0
57,0.077600,0.355253,0.048227,0.000000,0.0,0.006814,0.0,0.036518,0.0,0.001125,...,0,0.0,0.0,0,0.0,0.0,0.0,0,0,0
58,0.296309,0.601019,0.004895,0.000000,0.0,0.004631,0.0,0.000000,0.0,0.000000,...,0,0.0,0.0,0,0.0,0.0,0.0,0,0,0
59,0.834877,0.046375,0.005028,0.000000,0.0,0.008997,0.0,0.004829,0.0,0.006285,...,0,0.0,0.0,0,0.0,0.0,0.0,0,0,0


In [19]:
df.iloc[:,1]

0     3
1     3
2     2
3     3
4     1
     ..
56    3
57    2
58    2
59    2
60    3
Name: DESL, Length: 61, dtype: int64

In [20]:
X = df.iloc[:,2:600]
y = df.iloc[:,1]

## Train, predict and explain

### XGBoost

In [21]:
le = LabelEncoder()
y_xgb = le.fit_transform(y)


clfxgb = XGBClassifier(random_state=42)
looxgb = LeaveOneOut()
xgb_scores = cross_validate(clfxgb, X, y_xgb, cv=looxgb, scoring='accuracy', return_estimator=True)


In [22]:
xgb_scores

{'fit_time': array([0.3502984 , 0.33567834, 0.33617973, 0.33873701, 0.33876991,
        0.33759999, 0.3442347 , 0.34208417, 0.33647633, 0.33900404,
        0.33948588, 0.3402791 , 0.34473228, 0.34388804, 0.34215546,
        0.33780193, 0.33466601, 0.33759356, 0.33825326, 0.33685732,
        0.33916259, 0.33708119, 0.34215117, 0.33966851, 0.33718657,
        0.3311193 , 0.32818985, 0.32700896, 0.32939029, 0.32574391,
        0.33784103, 0.33548331, 0.33569217, 0.33013916, 0.33644509,
        0.3340261 , 0.33540297, 0.3361237 , 0.33407855, 0.33933759,
        0.33105159, 0.33230329, 0.33625054, 0.32838869, 0.32660818,
        0.33617902, 0.33294964, 0.33822799, 0.34089351, 0.33254075,
        0.40274954, 0.33501697, 0.3362236 , 0.33684778, 0.33752155,
        0.33535099, 0.33653235, 0.33464694, 0.33811212, 0.33319426,
        0.33467174]),
 'score_time': array([0.06327653, 0.06360722, 0.06382895, 0.0656302 , 0.06565833,
        0.0672636 , 0.06562376, 0.0645628 , 0.06149364, 0.06510758,


### LightGBM

In [10]:
clflgbm = LGBMClassifier(random_state=42)
loolgbm = LeaveOneOut()
scoring = ['f1_weighted']
lgbm_scores = cross_validate(clflgbm, X, y, cv=loolgbm, scoring=scoring)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000731 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 156
[LightGBM] [Info] Number of data points in the train set: 60, number of used features: 11
[LightGBM] [Info] Start training from score -2.014903
[LightGBM] [Info] Start training from score -1.203973
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -2.708050
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000023 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 163
[LightGBM] [Info] Number of data points in the train set: 60, number of used features: 12
[LightGBM] [Info] Start training from score -2.014903
[LightGBM] [Info] Start training

In [11]:
lgbm_scores

{'fit_time': array([0.12350059, 0.02111578, 0.02708411, 0.02594805, 0.02836251,
        0.02845573, 0.03456187, 0.03678203, 0.01999378, 0.03265643,
        0.0219574 , 0.03363609, 0.02623153, 0.03304052, 0.02673602,
        0.02688456, 0.03370357, 0.02797103, 0.02237654, 0.02964997,
        0.03517556, 0.03209925, 0.02462578, 0.02266622, 0.03602505,
        0.02444625, 0.0277667 , 0.03052139, 0.03252053, 0.03535604,
        0.03339171, 0.02950335, 0.02624631, 0.03426933, 0.03041816,
        0.02939606, 0.0323174 , 0.02174687, 0.02432108, 0.0281899 ,
        0.02504635, 0.0262146 , 0.03529406, 0.02591228, 0.03181791,
        0.02785206, 0.02199054, 0.02157521, 0.02327657, 0.03066325,
        0.02736354, 0.02926707, 0.02403855, 0.02725887, 0.0239234 ,
        0.03498411, 0.02481103, 0.02302742, 0.033077  , 0.02530789,
        0.02871752]),
 'score_time': array([0.00436902, 0.00564098, 0.00649595, 0.00551629, 0.00376749,
        0.00721216, 0.00602126, 0.00454164, 0.00458813, 0.00858831,


### Multilayer perceptron 

In [12]:
from sklearn.neural_network import MLPClassifier
clfmlp = MLPClassifier(solver="adam", activation="relu", learning_rate="constant", random_state=42, max_iter=3000, verbose=False)
clfmlp.out_activation_="softmax"
scoring = ['f1_weighted']

loomlp = LeaveOneOut()
mlp_scores = cross_validate(clfmlp, X, y, cv=loomlp, scoring=scoring)

In [13]:
mlp_scores

{'fit_time': array([0.78245616, 0.80470777, 0.75782108, 0.79890633, 0.78826284,
        0.80951524, 0.77002215, 0.81368852, 0.77424836, 0.85178471,
        0.84682083, 0.79429817, 0.81956816, 0.7841754 , 0.75621581,
        0.79122901, 0.79242611, 0.77984667, 0.76619124, 0.81135726,
        0.72203755, 0.78317785, 0.78129578, 0.82806563, 0.73917937,
        0.75423598, 0.76799583, 0.78486705, 0.79518986, 0.78011036,
        0.78226542, 0.75724077, 0.78260207, 0.76543212, 0.73978662,
        0.76645827, 0.76233411, 0.7665329 , 0.75779843, 0.78260136,
        0.77470374, 0.78326058, 0.7835443 , 0.7752018 , 0.76888061,
        0.76549339, 0.76285148, 0.77026248, 0.78046346, 0.80844808,
        0.79991841, 0.74100184, 0.75185251, 0.7478807 , 0.77613115,
        0.76801276, 0.77076578, 0.74890947, 0.76818609, 0.76281309,
        0.78725171]),
 'score_time': array([0.00644493, 0.00559711, 0.00299978, 0.00522065, 0.00553489,
        0.0059371 , 0.00478601, 0.00553226, 0.0065217 , 0.00563526,
