# Preprocessing

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df1 = pd.read_csv("c:/users/fre_f/ARVOproject/BLclinicalremoved.csv", index_col = "ID", encoding="ansi")
df1.head()

Unnamed: 0_level_0,MeiboODinf-0,MeiboOSinf-0,Kjønn M=0,Alder,Synskorr.kir,Keratoconus,Cataract opr/linsebytte,Glaucom,Blepharoplastikk,Retinol,...,Kunstig tårevæske-0,RensVarme,Betennelsesdempende-0,Tåreplugger-0,Lysbehandling-0,Skjermbruk-0,Kontaktlinser-0,Mascara-0,Eyeliner-0,AC-0
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.0,2.0,1,56,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,1,1
2,4.0,4.0,1,74,0,0,0,0,1,0,...,3,0,0,0,0,1,0,1,1,1
3,2.0,2.0,1,52,0,0,0,0,0,0,...,0,0,3,0,0,1,0,0,0,0
4,3.0,2.0,1,48,0,0,0,0,0,0,...,5,0,0,0,0,1,1,1,1,0
5,2.0,2.0,1,68,0,0,0,0,0,0,...,6,0,0,0,0,1,0,0,1,1


In [3]:
df1.dtypes

MeiboODinf-0       float64
MeiboOSinf-0       float64
Kjønn M=0            int64
Alder                int64
Synskorr.kir         int64
                    ...   
Skjermbruk-0         int64
Kontaktlinser-0      int64
Mascara-0            int64
Eyeliner-0           int64
AC-0                 int64
Length: 92, dtype: object

In [4]:
df1["Vinter"].unique()
df1["RensVarme"].unique()

array(['0', '2', '7', '4', '5', '6', '3', '1', '8', '9', '10', ' '],
      dtype=object)

In [5]:
#Remove empty strings
df1['Vinter'] = df1['Vinter'].replace(" ", np.nan)
df1['RensVarme'] = df1['RensVarme'].replace(" ", np.nan)

In [6]:
df1["Vinter"].unique()
df1["RensVarme"].unique()

array(['0', '2', '7', '4', '5', '6', '3', '1', '8', '9', '10', nan],
      dtype=object)

In [7]:
#Convert dtype
df1['RensVarme']=df1['RensVarme'].astype('float')
df1['Vinter']=df1['Vinter'].astype('float')

In [8]:
df1['MeiboODinf-0'].unique()


array([ 2.,  4.,  3.,  1., nan,  0.])

In [9]:
df1["MeiboODinf-0"].value_counts()

2.0    215
3.0    183
4.0    129
1.0     44
0.0      2
Name: MeiboODinf-0, dtype: int64

In [10]:
#Implement class 0 in class 1
df1['MeiboODinf-0'] = df1['MeiboODinf-0'].replace(0, 1)
df1['MeiboOSinf-0'] = df1['MeiboOSinf-0'].replace(0, 1)

In [11]:
df1.shape

(582, 92)

In [12]:
df1 = df1.dropna(axis=0, subset=["MeiboODinf-0"])
df1 = df1.dropna(axis=0, subset=["MeiboOSinf-0"])

In [13]:
df1.shape

(573, 92)

In [14]:
df1['MeiboODinf-0'].unique()
df1['MeiboOSinf-0'].unique()

array([2., 4., 3., 1.])

In [15]:
df1.iloc[:,2:92]

Unnamed: 0_level_0,Kjønn M=0,Alder,Synskorr.kir,Keratoconus,Cataract opr/linsebytte,Glaucom,Blepharoplastikk,Retinol,Antidepressiva,SS,...,Kunstig tårevæske-0,RensVarme,Betennelsesdempende-0,Tåreplugger-0,Lysbehandling-0,Skjermbruk-0,Kontaktlinser-0,Mascara-0,Eyeliner-0,AC-0
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,56,0,0,0,0,0,0,0,0,...,0,0.0,0,0,0,1,0,1,1,1
2,1,74,0,0,0,0,1,0,0,0,...,3,0.0,0,0,0,1,0,1,1,1
3,1,52,0,0,0,0,0,0,0,0,...,0,0.0,3,0,0,1,0,0,0,0
4,1,48,0,0,0,0,0,0,0,0,...,5,0.0,0,0,0,1,1,1,1,0
5,1,68,0,0,0,0,0,0,0,0,...,6,0.0,0,0,0,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
578,0,44,0,0,0,0,0,0,0,0,...,6,5.0,7,0,0,1,0,0,0,0
579,1,54,0,0,0,0,0,0,0,0,...,5,0.0,0,0,0,0,0,0,0,0
580,1,44,0,0,0,0,0,0,0,0,...,5,5.0,7,0,6,1,1,1,0,1
581,0,28,0,0,0,0,0,0,0,0,...,3,0.0,0,0,0,1,0,0,0,0


In [16]:
#imp = SimpleImputer(strategy='mean')
#imp = IterativeImputer(random_state=42, max_iter=100)
imp = KNNImputer(n_neighbors=3, weights="uniform")
df_imputed = imp.fit_transform(df1)
df2 = pd.DataFrame(df_imputed, columns = df1.columns.tolist(), index = df1.index)
df2

Unnamed: 0_level_0,MeiboODinf-0,MeiboOSinf-0,Kjønn M=0,Alder,Synskorr.kir,Keratoconus,Cataract opr/linsebytte,Glaucom,Blepharoplastikk,Retinol,...,Kunstig tårevæske-0,RensVarme,Betennelsesdempende-0,Tåreplugger-0,Lysbehandling-0,Skjermbruk-0,Kontaktlinser-0,Mascara-0,Eyeliner-0,AC-0
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.0,2.0,1.0,56.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0
2,4.0,4.0,1.0,74.0,0.0,0.0,0.0,0.0,1.0,0.0,...,3.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0
3,2.0,2.0,1.0,52.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,3.0,2.0,1.0,48.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
5,2.0,2.0,1.0,68.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
578,4.0,4.0,0.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.0,5.0,7.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
579,2.0,2.0,1.0,54.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
580,3.0,4.0,1.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,5.0,7.0,0.0,6.0,1.0,1.0,1.0,0.0,1.0
581,2.0,2.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [17]:
type(df2)

pandas.core.frame.DataFrame

In [18]:
df2.iloc[:,0]

ID
1      2.0
2      4.0
3      2.0
4      3.0
5      2.0
      ... 
578    4.0
579    2.0
580    3.0
581    2.0
582    3.0
Name: MeiboODinf-0, Length: 573, dtype: float64

In [19]:
df2.iloc[:,2:] 

Unnamed: 0_level_0,Kjønn M=0,Alder,Synskorr.kir,Keratoconus,Cataract opr/linsebytte,Glaucom,Blepharoplastikk,Retinol,Antidepressiva,SS,...,Kunstig tårevæske-0,RensVarme,Betennelsesdempende-0,Tåreplugger-0,Lysbehandling-0,Skjermbruk-0,Kontaktlinser-0,Mascara-0,Eyeliner-0,AC-0
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,56.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0
2,1.0,74.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0
3,1.0,52.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,1.0,48.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
5,1.0,68.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
578,0.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.0,5.0,7.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
579,1.0,54.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
580,1.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,5.0,7.0,0.0,6.0,1.0,1.0,1.0,0.0,1.0
581,0.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [20]:
X = df2.iloc[:,2:]


In [21]:
y = df2.iloc[:,0]

In [22]:
#y_train for class 4
y_Class4OD = y.replace(1, 0)
y_Class4OD = y_Class4OD.replace(4, 1)
y_Class4OD = y_Class4OD.replace(2, 0)
y_Class4OD = y_Class4OD.replace(3, 0)

In [23]:
y_Class4OD.value_counts()

0.0    444
1.0    129
Name: MeiboODinf-0, dtype: int64

In [24]:
y.value_counts()

2.0    215
3.0    183
4.0    129
1.0     46
Name: MeiboODinf-0, dtype: int64

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y_Class4OD, test_size = 0.2, random_state=42)

In [26]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(458, 90)
(115, 90)
(458,)
(115,)


In [27]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_trainScaled = scaler.fit_transform(X_train)
X_testScaled = scaler.transform(X_test)
X_trainScaled = pd.DataFrame(X_trainScaled, columns = X_train.columns.tolist(), index=X_train.index)
X_testScaled = pd.DataFrame(X_testScaled, columns = X_test.columns.tolist(), index=X_test.index)

In [28]:
#X_trainScaled

In [29]:
print(X_trainScaled.shape)
print(X_testScaled.shape)
print(y_train.shape)
print(y_test.shape)

(458, 90)
(115, 90)
(458,)
(115,)


In [30]:
def metrics():
    y_pred = clf.predict(X_testScaled)
    bal_acc = balanced_accuracy_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    cf = confusion_matrix(y_test, y_pred)
    target_names = ['class 0', 'class 1']
    print(f"Balanced accuracy: {bal_acc}")
    print(f"MCC: {mcc}")
    print(f"F1: {f1}")
    print(f"Confusion matrix: \n{cf}")
    print(f"Classification report: \n{classification_report(y_test, y_pred, target_names=target_names)}")

# Training and predicting on the right eye

In [31]:
from sklearn.dummy import DummyClassifier
clf = DummyClassifier(strategy="most_frequent")
clf.fit(X_trainScaled, y_train)
clf.predict(X_testScaled)

metrics()


Balanced accuracy: 0.5
MCC: 0.0
F1: 0.45754716981132076
Confusion matrix: 
[[97  0]
 [18  0]]
Classification report: 
              precision    recall  f1-score   support

     class 0       0.84      1.00      0.92        97
     class 1       0.00      0.00      0.00        18

    accuracy                           0.84       115
   macro avg       0.42      0.50      0.46       115
weighted avg       0.71      0.84      0.77       115



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
from sklearn.svm import LinearSVC
clf = LinearSVC(max_iter=5000).fit(X_trainScaled, y_train)
clf.predict(X_testScaled)

metrics()

Balanced accuracy: 0.5873424971363116
MCC: 0.18846559943068314
F1: 0.593139516997385
Confusion matrix: 
[[87 10]
 [13  5]]
Classification report: 
              precision    recall  f1-score   support

     class 0       0.87      0.90      0.88        97
     class 1       0.33      0.28      0.30        18

    accuracy                           0.80       115
   macro avg       0.60      0.59      0.59       115
weighted avg       0.79      0.80      0.79       115



In [33]:
clf = RandomForestClassifier().fit(X_trainScaled, y_train)
clf.predict(X_testScaled)

metrics()

Balanced accuracy: 0.5575601374570447
MCC: 0.16441334129691496
F1: 0.5663650075414781
Confusion matrix: 
[[92  5]
 [15  3]]
Classification report: 
              precision    recall  f1-score   support

     class 0       0.86      0.95      0.90        97
     class 1       0.38      0.17      0.23        18

    accuracy                           0.83       115
   macro avg       0.62      0.56      0.57       115
weighted avg       0.78      0.83      0.80       115



In [34]:
clf = RandomForestClassifier(n_estimators=5000, criterion="gini", bootstrap=True, oob_score=False, warm_start=False, random_state=42).fit(X_trainScaled, y_train)
clf.predict(X_testScaled)

metrics()

Balanced accuracy: 0.5627147766323024
MCC: 0.1906167661590583
F1: 0.5736585365853659
Confusion matrix: 
[[93  4]
 [15  3]]
Classification report: 
              precision    recall  f1-score   support

     class 0       0.86      0.96      0.91        97
     class 1       0.43      0.17      0.24        18

    accuracy                           0.83       115
   macro avg       0.64      0.56      0.57       115
weighted avg       0.79      0.83      0.80       115



In [35]:
from sklearn.ensemble import ExtraTreesClassifier
clf = ExtraTreesClassifier(n_estimators=1000, bootstrap=True, oob_score=True, random_state=42).fit(X_trainScaled, y_train)
clf.predict(X_testScaled)

metrics()

Balanced accuracy: 0.5020045819014891
MCC: 0.006550689977064098
F1: 0.4882686084142395
Confusion matrix: 
[[92  5]
 [17  1]]
Classification report: 
              precision    recall  f1-score   support

     class 0       0.84      0.95      0.89        97
     class 1       0.17      0.06      0.08        18

    accuracy                           0.81       115
   macro avg       0.51      0.50      0.49       115
weighted avg       0.74      0.81      0.77       115



In [36]:
neigh = KNeighborsClassifier(n_neighbors=3)
clf = neigh.fit(X_trainScaled, y_train)
clf.predict(X_testScaled)

metrics()

Balanced accuracy: 0.5266323024054983
MCC: 0.05918833512737464
F1: 0.5280934343434344
Confusion matrix: 
[[86 11]
 [15  3]]
Classification report: 
              precision    recall  f1-score   support

     class 0       0.85      0.89      0.87        97
     class 1       0.21      0.17      0.19        18

    accuracy                           0.77       115
   macro avg       0.53      0.53      0.53       115
weighted avg       0.75      0.77      0.76       115



In [37]:
# lbfgs, sgd, adam
#logistic, tanh, relu
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver="adam", activation="tanh", learning_rate="constant", random_state=42, max_iter=3000, verbose=False).fit(X_trainScaled, y_train)
#clf.out_activation_="softmax"
clf.predict(X_testScaled)

metrics()

Balanced accuracy: 0.6377434135166093
MCC: 0.2754868270332188
F1: 0.6377434135166093
Confusion matrix: 
[[86 11]
 [11  7]]
Classification report: 
              precision    recall  f1-score   support

     class 0       0.89      0.89      0.89        97
     class 1       0.39      0.39      0.39        18

    accuracy                           0.81       115
   macro avg       0.64      0.64      0.64       115
weighted avg       0.81      0.81      0.81       115



In [38]:
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(n_estimators=500, random_state=42).fit(X_trainScaled, y_train)
clf.predict(X_testScaled)

metrics()

Balanced accuracy: 0.6202749140893471
MCC: 0.25952639921602266
F1: 0.628518689432395
Confusion matrix: 
[[88  9]
 [12  6]]
Classification report: 
              precision    recall  f1-score   support

     class 0       0.88      0.91      0.89        97
     class 1       0.40      0.33      0.36        18

    accuracy                           0.82       115
   macro avg       0.64      0.62      0.63       115
weighted avg       0.80      0.82      0.81       115



In [39]:
from sklearn.ensemble import HistGradientBoostingClassifier
clf = HistGradientBoostingClassifier().fit(X_trainScaled, y_train)
clf.predict(X_testScaled)

metrics()

Balanced accuracy: 0.5853379152348224
MCC: 0.23089788616708895
F1: 0.6013501185914979
Confusion matrix: 
[[92  5]
 [14  4]]
Classification report: 
              precision    recall  f1-score   support

     class 0       0.87      0.95      0.91        97
     class 1       0.44      0.22      0.30        18

    accuracy                           0.83       115
   macro avg       0.66      0.59      0.60       115
weighted avg       0.80      0.83      0.81       115



In [40]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=1000, learning_rate=1.0, max_depth=None, random_state=42).fit(X_trainScaled, y_train)
clf.predict(X_testScaled)

metrics()

Balanced accuracy: 0.5738831615120275
MCC: 0.13212082268063838
F1: 0.5630699088145897
Confusion matrix: 
[[79 18]
 [12  6]]
Classification report: 
              precision    recall  f1-score   support

     class 0       0.87      0.81      0.84        97
     class 1       0.25      0.33      0.29        18

    accuracy                           0.74       115
   macro avg       0.56      0.57      0.56       115
weighted avg       0.77      0.74      0.75       115



In [41]:
from xgboost import XGBClassifier
clf = XGBClassifier(max_depth=10, eta=0.3, reg_lambda=50, alpha=0).fit(X_trainScaled, y_train)
clf.predict(X_testScaled)

metrics()

Balanced accuracy: 0.5575601374570447
MCC: 0.16441334129691496
F1: 0.5663650075414781
Confusion matrix: 
[[92  5]
 [15  3]]
Classification report: 
              precision    recall  f1-score   support

     class 0       0.86      0.95      0.90        97
     class 1       0.38      0.17      0.23        18

    accuracy                           0.83       115
   macro avg       0.62      0.56      0.57       115
weighted avg       0.78      0.83      0.80       115



In [42]:
import lightgbm as lgb
from lightgbm import LGBMClassifier
clf = LGBMClassifier().fit(X_trainScaled, y_train)
clf.predict(X_testScaled)

metrics()

Balanced accuracy: 0.5904925544100802
MCC: 0.25848067586977674
F1: 0.6097285067873303
Confusion matrix: 
[[93  4]
 [14  4]]
Classification report: 
              precision    recall  f1-score   support

     class 0       0.87      0.96      0.91        97
     class 1       0.50      0.22      0.31        18

    accuracy                           0.84       115
   macro avg       0.68      0.59      0.61       115
weighted avg       0.81      0.84      0.82       115



# Training and predicting on the left eye

In [43]:
X_OS = df2.iloc[:,2:]


In [44]:
df2.iloc[:,1]

ID
1      2.0
2      4.0
3      2.0
4      2.0
5      2.0
      ... 
578    4.0
579    2.0
580    4.0
581    2.0
582    4.0
Name: MeiboOSinf-0, Length: 573, dtype: float64

In [45]:
y_OS = df2.iloc[:,1]

In [46]:
#y_train for class 4
y_Class4OS = y_OS.replace(2, 0)
y_Class4OS = y_Class4OS.replace(3, 0)
y_Class4OS = y_Class4OS.replace(4, 0)

In [47]:
y_Class4OS.value_counts()

0.0    538
1.0     35
Name: MeiboOSinf-0, dtype: int64

In [48]:
X_trainOS, X_testOS, y_trainOS, y_testOS = train_test_split(X_OS, y_Class4OS, test_size = 0.2, random_state=42)


In [49]:
scaler = StandardScaler()
X_trainScaledOS = scaler.fit_transform(X_trainOS)
X_testScaledOS = scaler.transform(X_testOS)
X_trainScaledOS = pd.DataFrame(X_trainScaledOS, columns = X_trainOS.columns.tolist(), index=X_trainOS.index)
X_testScaledOS = pd.DataFrame(X_testScaledOS, columns = X_testOS.columns.tolist(), index=X_testOS.index)

In [50]:
type(X_trainScaledOS)
type(X_testScaledOS)

pandas.core.frame.DataFrame

In [51]:
print(X_trainScaledOS.shape)
print(X_testScaledOS.shape)
print(y_trainOS.shape)
print(y_testOS.shape)

(458, 90)
(115, 90)
(458,)
(115,)


In [52]:
#X_trainScaledOS

In [53]:
def metricsOS():
    y_predOS = clf.predict(X_testScaledOS)
    bal_acc = balanced_accuracy_score(y_testOS, y_predOS)
    mcc = matthews_corrcoef(y_testOS, y_predOS)
    f1 = f1_score(y_testOS, y_predOS, average="macro")
    cf = confusion_matrix(y_testOS, y_predOS)
    target_names = ['class 0', 'class 1']
    print(f"Balanced accuracy: {bal_acc}")
    print(f"MCC: {mcc}")
    print(f"F1: {f1}")
    print(f"Confusion matrix: \n{cf}")
    print(f"Classification report: \n{classification_report(y_testOS, y_predOS, target_names=target_names)}")

In [54]:
from sklearn.dummy import DummyClassifier
clf = DummyClassifier(strategy="most_frequent")
clf.fit(X_trainScaledOS, y_trainOS)
clf.predict(X_testScaledOS)

metricsOS()

Balanced accuracy: 0.5
MCC: 0.0
F1: 0.48198198198198194
Confusion matrix: 
[[107   0]
 [  8   0]]
Classification report: 
              precision    recall  f1-score   support

     class 0       0.93      1.00      0.96       107
     class 1       0.00      0.00      0.00         8

    accuracy                           0.93       115
   macro avg       0.47      0.50      0.48       115
weighted avg       0.87      0.93      0.90       115



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [55]:
from sklearn.svm import LinearSVC
clf = LinearSVC(max_iter=5000).fit(X_trainScaledOS, y_trainOS)
clf.predict(X_testScaledOS)

metricsOS()

Balanced accuracy: 0.6314252336448598
MCC: 0.19856376833012096
F1: 0.5893719806763285
Confusion matrix: 
[[95 12]
 [ 5  3]]
Classification report: 
              precision    recall  f1-score   support

     class 0       0.95      0.89      0.92       107
     class 1       0.20      0.38      0.26         8

    accuracy                           0.85       115
   macro avg       0.57      0.63      0.59       115
weighted avg       0.90      0.85      0.87       115



In [56]:
clf = RandomForestClassifier().fit(X_trainScaledOS, y_trainOS)
clf.predict(X_testScaledOS)

metricsOS()

Balanced accuracy: 0.5
MCC: 0.0
F1: 0.48198198198198194
Confusion matrix: 
[[107   0]
 [  8   0]]
Classification report: 
              precision    recall  f1-score   support

     class 0       0.93      1.00      0.96       107
     class 1       0.00      0.00      0.00         8

    accuracy                           0.93       115
   macro avg       0.47      0.50      0.48       115
weighted avg       0.87      0.93      0.90       115



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [57]:
clf = RandomForestClassifier(n_estimators=5000, criterion="gini", bootstrap=True, oob_score=False, warm_start=True, random_state=42).fit(X_trainScaledOS, y_trainOS)
clf.predict(X_testScaledOS)

metricsOS()

Balanced accuracy: 0.5
MCC: 0.0
F1: 0.48198198198198194
Confusion matrix: 
[[107   0]
 [  8   0]]
Classification report: 
              precision    recall  f1-score   support

     class 0       0.93      1.00      0.96       107
     class 1       0.00      0.00      0.00         8

    accuracy                           0.93       115
   macro avg       0.47      0.50      0.48       115
weighted avg       0.87      0.93      0.90       115



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [58]:
neigh = KNeighborsClassifier(n_neighbors=3)
clf = neigh.fit(X_trainScaledOS, y_trainOS)
clf.predict(X_testScaledOS)

metricsOS()

Balanced accuracy: 0.49065420560747663
MCC: -0.03637716757892587
F1: 0.47727272727272724
Confusion matrix: 
[[105   2]
 [  8   0]]
Classification report: 
              precision    recall  f1-score   support

     class 0       0.93      0.98      0.95       107
     class 1       0.00      0.00      0.00         8

    accuracy                           0.91       115
   macro avg       0.46      0.49      0.48       115
weighted avg       0.86      0.91      0.89       115



In [59]:
# lbfgs, sgd, adam
#logistic, tanh, relu
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver="adam", activation="relu", learning_rate="constant", random_state=42, max_iter=3000, verbose=False).fit(X_trainScaledOS, y_trainOS)
clf.out_activation_="softmax"
clf.predict(X_testScaledOS)

metricsOS()

Balanced accuracy: 0.5
MCC: 0.0
F1: 0.06504065040650406
Confusion matrix: 
[[  0 107]
 [  0   8]]
Classification report: 
              precision    recall  f1-score   support

     class 0       0.00      0.00      0.00       107
     class 1       0.07      1.00      0.13         8

    accuracy                           0.07       115
   macro avg       0.03      0.50      0.07       115
weighted avg       0.00      0.07      0.01       115



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [60]:
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(n_estimators=50, random_state=42).fit(X_trainScaledOS, y_trainOS)
clf.predict(X_testScaledOS)

metricsOS()

Balanced accuracy: 0.6828271028037383
MCC: 0.5077102953881362
F1: 0.7362385321100917
Confusion matrix: 
[[106   1]
 [  5   3]]
Classification report: 
              precision    recall  f1-score   support

     class 0       0.95      0.99      0.97       107
     class 1       0.75      0.38      0.50         8

    accuracy                           0.95       115
   macro avg       0.85      0.68      0.74       115
weighted avg       0.94      0.95      0.94       115



In [61]:
from sklearn.ensemble import HistGradientBoostingClassifier
clf = HistGradientBoostingClassifier().fit(X_trainScaledOS, y_trainOS)
clf.predict(X_testScaledOS)

metricsOS()

Balanced accuracy: 0.5
MCC: 0.0
F1: 0.48198198198198194
Confusion matrix: 
[[107   0]
 [  8   0]]
Classification report: 
              precision    recall  f1-score   support

     class 0       0.93      1.00      0.96       107
     class 1       0.00      0.00      0.00         8

    accuracy                           0.93       115
   macro avg       0.47      0.50      0.48       115
weighted avg       0.87      0.93      0.90       115



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [62]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.5, max_depth=None, random_state=42).fit(X_trainScaledOS, y_trainOS)
clf.predict(X_testScaledOS)

metricsOS()

Balanced accuracy: 0.5391355140186915
MCC: 0.08954658875326628
F1: 0.5436507936507937
Confusion matrix: 
[[102   5]
 [  7   1]]
Classification report: 
              precision    recall  f1-score   support

     class 0       0.94      0.95      0.94       107
     class 1       0.17      0.12      0.14         8

    accuracy                           0.90       115
   macro avg       0.55      0.54      0.54       115
weighted avg       0.88      0.90      0.89       115



In [63]:
from xgboost import XGBClassifier
clf = XGBClassifier(max_depth=10, eta=0.3, reg_lambda=50, alpha=0).fit(X_trainScaledOS, y_trainOS)
clf.predict(X_testScaledOS)

metricsOS()

Balanced accuracy: 0.5
MCC: 0.0
F1: 0.48198198198198194
Confusion matrix: 
[[107   0]
 [  8   0]]
Classification report: 
              precision    recall  f1-score   support

     class 0       0.93      1.00      0.96       107
     class 1       0.00      0.00      0.00         8

    accuracy                           0.93       115
   macro avg       0.47      0.50      0.48       115
weighted avg       0.87      0.93      0.90       115



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [64]:
import lightgbm as lgb
from lightgbm import LGBMClassifier
clf = LGBMClassifier().fit(X_trainScaledOS, y_trainOS)
clf.predict(X_testScaled)

metricsOS()

Balanced accuracy: 0.6203271028037383
MCC: 0.3841144978851179
F1: 0.6658364466583644
Confusion matrix: 
[[106   1]
 [  6   2]]
Classification report: 
              precision    recall  f1-score   support

     class 0       0.95      0.99      0.97       107
     class 1       0.67      0.25      0.36         8

    accuracy                           0.94       115
   macro avg       0.81      0.62      0.67       115
weighted avg       0.93      0.94      0.93       115

