In [1]:
import numpy as np
import pandas as pd
import sklearn

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectPercentile 
from sklearn.feature_selection import RFECV

from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [3]:
np.set_printoptions(precision=2)

In [4]:
df = pd.read_csv('all_samples.csv')
df.drop(columns=['Unnamed: 0', 'name'], inplace=True)
df.head()

Unnamed: 0,ILMN_1651217,ILMN_1651229,ILMN_1651234,ILMN_1651236,ILMN_1651237,ILMN_1651254,ILMN_1651259,ILMN_1651260,ILMN_1651261,ILMN_1651262,...,ILMN_1815885,ILMN_1815908,ILMN_1815923,ILMN_1815924,ILMN_1815933,ILMN_1815937,ILMN_1815938,ILMN_1815941,ILMN_1815951,CELIAC
0,4.229567,4.802085,4.145582,4.274502,4.268115,6.853804,4.40135,4.123169,4.639975,7.136778,...,4.376735,4.395501,4.338936,5.198647,4.594269,4.264604,4.25631,4.821757,5.005588,1
1,4.197183,4.820311,4.171221,4.332524,4.186809,6.663657,4.559615,4.27886,4.994493,6.803521,...,4.732124,4.417266,4.656831,4.61544,4.594269,4.336589,4.317376,4.518347,4.308311,1
2,4.131493,4.640774,4.075849,4.233316,4.334549,6.694727,4.370504,4.169419,5.093272,6.720391,...,4.292552,4.379864,4.211071,5.530672,4.570808,4.379545,4.241886,4.680351,4.780989,1
3,4.20741,4.508425,4.100585,4.166837,4.530517,6.506971,4.483179,4.24286,5.138309,6.881151,...,4.37118,4.406084,4.186757,5.358646,4.632107,4.282658,4.237614,4.60268,4.637598,1
4,4.24523,4.538779,4.040637,4.266853,4.326313,6.774611,4.40994,4.22886,4.948306,6.847382,...,4.345227,4.488653,4.364008,5.6059,4.6242,4.275774,4.251683,4.686359,4.687048,1


In [5]:
input_cols = list(df.columns[:-1])
inputs = df[input_cols]
target = df['CELIAC']

In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(inputs, target, stratify=target)
# X inputs (genes), Y target (disease state)
# default test/train split 75/25

In [7]:
tree = DecisionTreeClassifier(random_state=0)

In [8]:
# Support vector machine classifier (recommended for high-dimensional data; can work with small sample size)
svc = SVC(kernel='linear') # only linear models have coef_ and feature_importance_ attributes
svc_rfecv = RFECV(estimator=svc,
              step=0.1, # remove x% of  features at each iteration
              scoring='balanced_accuracy')

In [9]:
gnb = GaussianNB()

In [10]:
sgd_log = SGDClassifier(loss='log', random_state=0)
sgd_hinge = SGDClassifier(loss='hinge', random_state=0) # default

sgd_l_rfecv = RFECV(estimator=sgd_log,
              step=0.1,
              scoring='balanced_accuracy')
sgd_h_rfecv = RFECV(estimator=sgd_hinge,
              step=0.1,
              scoring='balanced_accuracy')

In [11]:
forest = RandomForestClassifier(random_state=0)

In [12]:
mlp_adam = MLPClassifier(random_state=0)
mlp_lb = MLPClassifier(solver='lbfgs',
                      random_state=0) # limited memory: recommended for smaller sample size
                                        # hidden_layer_sizes: default (100,)

In [13]:
models = [('Decision Tree', tree), 
          ('Random Forest', forest), 
          ("Gaussian Naive Bayes", gnb), 
          ("Gradient Descent (logistic)", sgd_l_rfecv),
          ("Gradient Descent (hinge)", sgd_h_rfecv),
          ("Support Vector Machines", svc_rfecv),
          ("MLP (Adam)", mlp_adam),
          ("MLP (LBFGS)", mlp_lb)
         ]

In [14]:
# precision = TP / (TP + FP)
# recall = TP / (TP + FN)
# f1-score: harmonic mean of precision and recall

for name, model in models:
    print(name)
    model = model.fit(X_train, Y_train)
    prediction = model.predict(X_test)
    print(classification_report(Y_test, prediction,
                               zero_division=0))
    print(confusion_matrix(Y_test, prediction, 
                           labels=model.classes_,
                          normalize='all'))
    print('\n')

Decision Tree
              precision    recall  f1-score   support

           0       0.38      0.50      0.43         6
           1       0.88      0.81      0.85        27

    accuracy                           0.76        33
   macro avg       0.63      0.66      0.64        33
weighted avg       0.79      0.76      0.77        33

[[0.09 0.09]
 [0.15 0.67]]


Random Forest
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         6
           1       0.82      1.00      0.90        27

    accuracy                           0.82        33
   macro avg       0.41      0.50      0.45        33
weighted avg       0.67      0.82      0.74        33

[[0.   0.18]
 [0.   0.82]]


Gaussian Naive Bayes
              precision    recall  f1-score   support

           0       1.00      0.17      0.29         6
           1       0.84      1.00      0.92        27

    accuracy                           0.85        33
   macro avg       0.

In [None]:
# before RFECV: random forest performs best
# after: gradient descent with hinge & SVC perform best