In [22]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow import feature_column

In [2]:
# Print NumPy floating-point values with 2 digits of precision.
np.set_printoptions(precision=2)

In [3]:
# Read the sample table and discard the 'Unnamed: 0' and 'name' columns in
# one expression, then preview the first rows.
df = pd.read_csv('all_samples.csv').drop(columns=['Unnamed: 0', 'name'])
df.head()

Unnamed: 0,ILMN_1651217,ILMN_1651229,ILMN_1651234,ILMN_1651236,ILMN_1651237,ILMN_1651254,ILMN_1651259,ILMN_1651260,ILMN_1651261,ILMN_1651262,...,ILMN_1815885,ILMN_1815908,ILMN_1815923,ILMN_1815924,ILMN_1815933,ILMN_1815937,ILMN_1815938,ILMN_1815941,ILMN_1815951,CELIAC
0,4.229567,4.802085,4.145582,4.274502,4.268115,6.853804,4.40135,4.123169,4.639975,7.136778,...,4.376735,4.395501,4.338936,5.198647,4.594269,4.264604,4.25631,4.821757,5.005588,1
1,4.197183,4.820311,4.171221,4.332524,4.186809,6.663657,4.559615,4.27886,4.994493,6.803521,...,4.732124,4.417266,4.656831,4.61544,4.594269,4.336589,4.317376,4.518347,4.308311,1
2,4.131493,4.640774,4.075849,4.233316,4.334549,6.694727,4.370504,4.169419,5.093272,6.720391,...,4.292552,4.379864,4.211071,5.530672,4.570808,4.379545,4.241886,4.680351,4.780989,1
3,4.20741,4.508425,4.100585,4.166837,4.530517,6.506971,4.483179,4.24286,5.138309,6.881151,...,4.37118,4.406084,4.186757,5.358646,4.632107,4.282658,4.237614,4.60268,4.637598,1
4,4.24523,4.538779,4.040637,4.266853,4.326313,6.774611,4.40994,4.22886,4.948306,6.847382,...,4.345227,4.488653,4.364008,5.6059,4.6242,4.275774,4.251683,4.686359,4.687048,1


In [4]:
# The label is the 'CELIAC' column; every column except the last one is used
# as an input feature.
target = df['CELIAC']
input_cols = df.columns[:-1].tolist()
inputs = df[input_cols]

In [14]:
# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class ratio the same in both parts, and the fixed seed makes the split
# identical on every run of the notebook.
SPLIT_SEED = 42
train, test = train_test_split(
    df, test_size=0.2, stratify=target, random_state=SPLIT_SEED)
print(len(train), 'training examples')
print(len(test), 'test examples')

105 training examples
27 test examples


In [10]:
# Helper that wraps a pandas DataFrame in a tf.data dataset
# source: https://www.tensorflow.org/tutorials/structured_data/feature_columns
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  """Convert a DataFrame into a batched tf.data.Dataset of (features, label).

  Args:
    dataframe: DataFrame containing a 'CELIAC' column, which is used as the
      label. The caller's frame is not modified; a copy is taken first.
    shuffle: When true, shuffle with a buffer as large as the frame.
    batch_size: Number of rows per batch.

  Returns:
    A dataset whose elements are (dict of column name -> values, labels).
  """
  features = dataframe.copy()
  labels = features.pop('CELIAC')
  dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
  if not shuffle:
    return dataset.batch(batch_size)
  return dataset.shuffle(buffer_size=len(features)).batch(batch_size)

In [15]:
# Only the training batches are shuffled. The held-out set keeps its row
# order so that per-sample predictions line up with the rows of `test`.
train_ds = df_to_dataset(train)
test_ds = df_to_dataset(test, shuffle=False)

In [16]:
# Peek at a single training batch: the first ten feature names and the labels.
for feature_batch, label_batch in train_ds.take(1):
  print('10 features:', list(feature_batch)[:10])
  print('A batch of targets:', label_batch)

10 features: ['ILMN_1651217', 'ILMN_1651229', 'ILMN_1651234', 'ILMN_1651236', 'ILMN_1651237', 'ILMN_1651254', 'ILMN_1651259', 'ILMN_1651260', 'ILMN_1651261', 'ILMN_1651262']
A batch of targets: tf.Tensor([1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 0], shape=(32,), dtype=int32)


In [23]:
# One numeric feature column per input column name.
feature_columns = list(map(feature_column.numeric_column, input_cols))

In [24]:
# Display the first feature column as a sanity check.
feature_columns[0]

NumericColumn(key='ILMN_1651217', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)

In [25]:
# Layer built from the feature columns; make_model() uses it as the first
# layer of the network.
feature_layer = keras.layers.DenseFeatures(feature_columns)

In [None]:
# https://www.tensorflow.org/tutorials/structured_data/imbalanced_data
# Each entry pairs a Keras metric class with the name it is reported under:
# the four confusion-matrix counts first, then the summary scores.
metrics = [
    metric_cls(name=label)
    for metric_cls, label in (
        (keras.metrics.TruePositives, 'tp'),
        (keras.metrics.FalsePositives, 'fp'),
        (keras.metrics.TrueNegatives, 'tn'),
        (keras.metrics.FalseNegatives, 'fn'),
        (keras.metrics.BinaryAccuracy, 'accuracy'),
        (keras.metrics.Precision, 'precision'),
        (keras.metrics.Recall, 'recall'),
        (keras.metrics.AUC, 'auc'),
    )
]

In [None]:
# we will determine bias after training runs
def make_model(output_bias=None):
    """Build and compile the binary classifier.

    Args:
        output_bias: Optional constant used to initialise the bias of the
            output unit. When None, no bias initializer is supplied.

    Returns:
        A compiled `tf.keras.Sequential` model. It uses the notebook-level
        `feature_layer` as its first layer and reports the notebook-level
        `metrics`.
    """
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)

    model = tf.keras.Sequential([
        feature_layer,
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(1, activation='sigmoid',
                              bias_initializer=output_bias),
    ])

    # The output layer already applies a sigmoid, so the loss must treat the
    # predictions as probabilities rather than raw logits.
    model.compile(optimizer='adam',
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                  metrics=metrics)
    return model

In [None]:
# Stop once 'val_auc' has gone 10 epochs without improving (higher is
# better) and restore the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc',
    mode='max',
    patience=10,
    restore_best_weights=True,
    verbose=1)

In [None]:
# Keep the model that make_model() returns; the bare call discarded it.
model = make_model()

# NOTE(review): the first layer declares no static input shape, so the model
# is presumably unbuilt until it sees data. Running one batch through it
# builds the weights so summary() has something to report.
model.predict(train_ds.take(1), verbose=0)
model.summary()