In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow import feature_column

In [3]:
# feature selection: SelectPercentile or RFE/RFECV (recursive feature elimination)
# PCA (combines correlated features)
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA

In [4]:
np.set_printoptions(precision=3)

In [5]:
df = pd.read_csv('all_samples.csv')
df.drop(columns=['Unnamed: 0', 'name'], inplace=True)
df.head()

Unnamed: 0,ILMN_1651217,ILMN_1651229,ILMN_1651234,ILMN_1651236,ILMN_1651237,ILMN_1651254,ILMN_1651259,ILMN_1651260,ILMN_1651261,ILMN_1651262,...,ILMN_1815885,ILMN_1815908,ILMN_1815923,ILMN_1815924,ILMN_1815933,ILMN_1815937,ILMN_1815938,ILMN_1815941,ILMN_1815951,CELIAC
0,4.229567,4.802085,4.145582,4.274502,4.268115,6.853804,4.40135,4.123169,4.639975,7.136778,...,4.376735,4.395501,4.338936,5.198647,4.594269,4.264604,4.25631,4.821757,5.005588,1
1,4.197183,4.820311,4.171221,4.332524,4.186809,6.663657,4.559615,4.27886,4.994493,6.803521,...,4.732124,4.417266,4.656831,4.61544,4.594269,4.336589,4.317376,4.518347,4.308311,1
2,4.131493,4.640774,4.075849,4.233316,4.334549,6.694727,4.370504,4.169419,5.093272,6.720391,...,4.292552,4.379864,4.211071,5.530672,4.570808,4.379545,4.241886,4.680351,4.780989,1
3,4.20741,4.508425,4.100585,4.166837,4.530517,6.506971,4.483179,4.24286,5.138309,6.881151,...,4.37118,4.406084,4.186757,5.358646,4.632107,4.282658,4.237614,4.60268,4.637598,1
4,4.24523,4.538779,4.040637,4.266853,4.326313,6.774611,4.40994,4.22886,4.948306,6.847382,...,4.345227,4.488653,4.364008,5.6059,4.6242,4.275774,4.251683,4.686359,4.687048,1


In [6]:
input_cols = list(df.columns[:-1])
inputs = df[input_cols]
target = df['CELIAC']

In [7]:
# Take 10% best features (columns) based on ANOVA fit
reduced_inputs = SelectPercentile().fit_transform(inputs, target)

In [8]:
reduced_inputs.shape

(132, 1898)

In [45]:
reduced_df = pd.DataFrame(reduced_inputs)
reduced_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1888,1889,1890,1891,1892,1893,1894,1895,1896,1897
0,4.275539,4.306251,4.618896,5.867003,6.521506,5.298801,5.940992,8.906818,4.752583,4.545288,...,4.205682,5.666388,5.70603,6.184561,5.777488,5.416096,4.431379,4.775627,5.408801,5.492253
1,4.277917,4.210349,4.430293,6.3531,5.759897,5.670972,6.240782,9.213189,4.58674,4.307685,...,4.288189,5.344717,5.572897,6.34702,5.544384,5.666388,4.124898,4.249952,5.486251,5.244002
2,4.215328,4.197578,4.707465,5.749865,5.771616,5.053427,5.306891,8.434439,4.709823,4.559511,...,4.193437,6.042542,5.353712,6.239423,6.421545,4.855159,4.540678,4.342777,5.279572,5.143543
3,4.302285,4.27927,4.501162,5.544384,5.632207,4.922094,5.362721,8.47981,4.631826,4.613997,...,4.204624,6.099494,5.354935,6.127527,6.274439,5.072586,4.480567,4.334735,5.374753,5.241133
4,4.302685,4.251547,4.683031,5.954345,5.754295,5.154511,5.450355,8.273638,4.790827,4.592735,...,4.080225,6.106028,5.635927,6.312757,6.326601,4.929269,4.611365,4.183861,5.151075,5.134198


In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(reduced_inputs, target, test_size=0.4, stratify=target)

In [10]:
X_test, X_val, Y_test, Y_val = train_test_split(X_test, Y_test, stratify=Y_test)

print(len(X_train), 'training examples')
print(len(X_val), 'validation examples')
print(len(X_test), 'test examples')

79 training examples
14 validation examples
39 test examples


In [13]:
train_labels = np.array(Y_train)

In [15]:
train_features = np.array(X_train)
train_features[0]

array([4.231, 4.218, 4.613, ..., 4.327, 5.44 , 5.165])

In [16]:
val_labels = np.array(Y_val)
val_features = np.array(X_val)

In [17]:
test_labels = np.array(Y_test)
test_features = np.array(X_test)

In [46]:
reduced_input_cols = list(reduced_df.columns)
for i in range(len(reduced_input_cols)):
    reduced_input_cols[i] = str(reduced_input_cols[i])

In [47]:
feature_columns = [feature_column.numeric_column(c) for c in reduced_input_cols]

In [48]:
feature_columns[:5]

[NumericColumn(key='0', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='1', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='2', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='3', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='4', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]

In [49]:
feature_layer = keras.layers.DenseFeatures(feature_columns)

In [22]:
# https://www.tensorflow.org/tutorials/structured_data/imbalanced_data
metrics = [
      keras.metrics.TruePositives(name='tp'),
      keras.metrics.FalsePositives(name='fp'),
      keras.metrics.TrueNegatives(name='tn'),
      keras.metrics.FalseNegatives(name='fn'), 
      keras.metrics.BinaryAccuracy(name='bin-accuracy'),
      keras.metrics.Precision(name='precision'),
      keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc'),
]

In [54]:
# we will determine bias after training runs
def make_model(output_bias=None):
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)
        
    model = tf.keras.Sequential([
#        feature_layer,
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(1, activation='sigmoid',
                           bias_initializer=output_bias,
                           name='output')
        ])
    
    model.compile(optimizer='adam',
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                  metrics=metrics)
    return model

In [29]:
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', 
    verbose=1,
    patience=10,
    mode='auto',
    restore_best_weights=True)

In [55]:
model = make_model()
model.fit(train_features,
          train_labels,
          batch_size=8,
          epochs=100,
          callbacks=[early_stopping],
          validation_data=(val_features, val_labels))
model.summary()

Train on 79 samples, validate on 14 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
 8/79 [==>...........................] - ETA: 0s - loss: 0.5633 - tp: 6.0000 - fp: 2.0000 - tn: 0.0000e+00 - fn: 0.0000e+00 - bin-accuracy: 0.7500 - precision: 0.7500 - recall: 1.0000 - auc: 0.5000Restoring model weights from the end of the best epoch.
Epoch 00011: early stopping
Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             multiple                  60768     
_________________________________________________________________
dropout_8 (Dropout)          multiple                  0         
_________________________________________________________________
dense_13 (Dense)             multiple                  1056      
_________________________________________________________