In [1]:
import numpy as np
import pandas as pd
from keras import layers
from keras.layers import Input, Dense, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D
from keras.layers import AveragePooling2D, MaxPooling2D, Dropout, GlobalMaxPooling2D, GlobalAveragePooling2D
from keras.models import Model, Sequential
from keras.utils.vis_utils import model_to_dot
from keras.utils import plot_model
from tensorflow.keras.layers.experimental import preprocessing
import pydot
from IPython.display import SVG
from sklearn.model_selection import train_test_split
import keras.backend as K
K.set_image_data_format('channels_last')
import math

#import matplotlib.pyplot as plt
#from matplotlib.pyplot import imshow
#from plotly.offline import init_notebook_mode, iplot
#from plotly.graph_objs import *
# initiate notebook for offline plot
#init_notebook_mode(connected=True)

# Load and Normalize Data

In [2]:
def get_data(path="tumor_cycif.csv"):
    """
    Load the per-cell measurements from a CSV file and max-normalize them.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read. Its first column is treated as the cell ID and
        dropped; every remaining column must be numeric. Defaults to
        "tumor_cycif.csv" in the working directory.

    Returns
    -------
    numpy.ndarray
        Float array with one row per CSV row and one column per remaining
        CSV column, each column divided by its own maximum.

    Raises
    ------
    ValueError
        If a remaining column cannot be converted to float.
    """
    # Load Data
    # We load data into RAM since data is small and will fit in memory.
    table = pd.read_csv(path)

    # Drop the cell ID column (first column) *before* converting to an
    # array, and force a float dtype: a non-numeric ID column would
    # otherwise turn the whole array into dtype=object.
    cells = table.iloc[:, 1:].to_numpy(dtype=float)

    # Normalize Data
    # A simple max normalization; it may be worth trying
    # alternative normalization methods
    # (e.g., 0-1 normalization).
    col_max = cells.max(axis=0)
    # A column whose maximum is 0 would cause a division by zero
    # (NaN/inf values); divide such columns by 1 so they stay unchanged.
    col_max[col_max == 0] = 1.0
    cells = cells / col_max

    # NOTE
    # Plotting data (e.g., value distribution on boxplot)
    # before and after normalization is a good practice
    # to get yourself familiar with the data.
    # A good normalization will bring all the data to
    # a common scale and roughly similar distribution.

    return cells

# Test Loaded Data

In [3]:
# NOTE: `input` shadows Python's built-in input(); the name is kept because
# later cells in this notebook refer to it.
input = get_data()

# Number of cells
assert input.shape[0] == 12142, f"expected 12142 cells, got {input.shape[0]}"
# Number of features per input
assert input.shape[1] == 47, f"expected 47 features, got {input.shape[1]}"

# Define Pretext Tasks
The pretext tasks help the model learn latent features that describe the input.

## Pretext 1: Scale

In [4]:
def scale_input(input, scalar_pool):
    """
    Multiplies every measurement of a cell (i.e., marker intensities)
    by a scalar randomly selected from the given pool of scalars, and
    returns the manipulated input and the corresponding one-hot labels.

    Parameters
    ----------
    input : numpy.ndarray
        2-D array; each row is multiplied by a single scalar.
    scalar_pool : sequence or numpy.ndarray
        Candidate scalars (list, tuple or 1-D array). Drawn uniformly with
        replacement using NumPy's global random state.

    Returns
    -------
    X : numpy.ndarray
        `input` with every row multiplied by its drawn scalar.
    y : numpy.ndarray
        Integer one-hot matrix of shape (input.shape[0], len(scalar_pool)).
        Row i has a 1 in the column of the scalar applied to row i. If the
        pool contains duplicates, the column of the first occurrence is used.
    """
    pool = np.asarray(scalar_pool)
    pool_items = list(scalar_pool)
    num_classes = len(pool_items)

    # Class index for every pool position: the position of the first equal
    # entry, so duplicated scalars share a single label.
    class_of = np.array([pool_items.index(s) for s in pool_items], dtype=int)

    # Draw pool *positions* rather than values; this consumes the random
    # stream the same way as np.random.choice(scalar_pool, n) and gives us
    # the labels without looking values up afterwards.
    picks = np.random.choice(num_classes, input.shape[0])

    # Column vector so that each row of `input` is multiplied by its own
    # scalar through broadcasting.
    scalars = pool[picks][:, np.newaxis]
    X = input * scalars

    # Rows of the identity matrix are the one-hot vectors.
    y = np.eye(num_classes, dtype=int)[class_of[picks]]

    return X, y

def get_model(input_shape, num_output_classes):
    """
    Build and compile a small fully-connected classifier.

    The network is two ReLU hidden layers (128 and 64 units, named "d1" and
    "d2") followed by a softmax layer named "output" with
    `num_output_classes` units.

    Parameters
    ----------
    input_shape : tuple
        Shape of a single sample, passed to the first Dense layer.
    num_output_classes : int
        Number of units in the softmax output layer.

    Returns
    -------
    The compiled Sequential model (Adam optimizer, categorical
    cross-entropy loss, accuracy metric).
    """
    network = Sequential([
        # First hidden layer: fully connected, 128 neurons, ReLU.
        Dense(128, input_shape=input_shape, activation="relu", name="d1"),
        # Second hidden layer: 64 neurons, ReLU.
        Dense(64, activation="relu", name="d2"),
        # Output layer: one softmax unit per class.
        Dense(num_output_classes, activation="softmax", name="output"),
    ])

    # Docs on optimizers:     https://keras.io/api/optimizers/
    # Docs on loss functions: https://keras.io/api/losses/
    network.compile(
        loss="categorical_crossentropy",
        optimizer="adam",
        metrics=["accuracy"],
    )

    return network

In [5]:
# Pretext 1: Scale
# Seed NumPy's global RNG so the scalars drawn inside scale_input (and hence
# the labels) are identical on every Restart & Run All.
# NOTE: this does not seed Keras/TensorFlow, so the trained weights can
# still differ between runs.
np.random.seed(42)

scalar_pool = [1, 10, 100, 1000]
X, y = scale_input(input, scalar_pool)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42,
                                                    shuffle=True)

model = get_model((X_train.shape[1], ), len(scalar_pool))

# A good tip to get a sense of the model before training it.
model.summary()

# Train the model, keeping 5% of the training data for validation.
history = model.fit(
    X_train, y_train,
    # Epoch: one pass through all the training data.
    epochs=100,
    # Batch: the number of samples processed per weight update
    # (there are many batches in one epoch).
    batch_size=10,
    validation_split=0.05,
    # Set verbose to 1 if you want to see the individual steps.
    verbose=0)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
d1 (Dense)                   (None, 128)               6144      
_________________________________________________________________
d2 (Dense)                   (None, 64)                8256      
_________________________________________________________________
output (Dense)               (None, 4)                 260       
Total params: 14,660
Trainable params: 14,660
Non-trainable params: 0
_________________________________________________________________


## Evaluate the model

In [6]:
# model.evaluate returns the loss followed by the compiled metrics. The model
# is compiled with loss="categorical_crossentropy" and metrics=["accuracy"],
# so the two values are the cross-entropy loss and the accuracy
# (not a mean squared / mean absolute error).
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print('Categorical cross-entropy loss on test data: ', test_loss)
print('Accuracy on test data: ', test_accuracy)

# Legacy names kept only so that any later cell still referring to them
# keeps working; they hold the loss and the accuracy, respectively.
mse_nn, mae_nn = test_loss, test_accuracy

Mean squared error on test data:  0.0005934028304181993
Mean absolute error on test data:  0.9997504353523254


Plotting the model's training history usually gives good insight into its performance.

In [68]:
# `go` and `iplot` are imported here because the plotly imports in the first
# cell are commented out; without them this cell raises NameError on a fresh
# kernel.
# NOTE(review): depending on the plotly version / front end,
# init_notebook_mode(connected=True) may also be required — confirm.
import plotly.graph_objs as go
from plotly.offline import iplot

# Per-epoch training and validation loss recorded by model.fit.
plt_train = go.Scattergl(y=history.history['loss'], name='Train')
plt_valid = go.Scattergl(y=history.history['val_loss'], name='Valid')

# Title and axis labels so the figure can be read on its own.
iplot(go.Figure(
    data=[plt_train, plt_valid],
    layout=go.Layout(
        title='Training vs. validation loss',
        xaxis=dict(title='Epoch'),
        yaxis=dict(title='Categorical cross-entropy loss'),
    ),
))