In [1]:
import numpy as np
import pandas as pd

X_train, X_test = pd.read_csv('./train_features.csv'), pd.read_csv('./test_features.csv')
y_train, y_train_extra = pd.read_csv('./train_targets_scored.csv'), pd.read_csv('./train_targets_nonscored.csv')

## Data:
`train_features.csv-features for training`
- g-: specifies gene data, [4: 776]
- c-: shows cell viability data [776:]
- cp_type: samples treated with a compound/control vehicle
- cp_dose: treatment dose
- cp_time: duration

`train_targets_scored`
- They are 206 columns with unique values either 1 or 0 in each row
- It also has nothing in common with train_targets_nonscored

## Proprecessing:
1. There are no missing values
2. The categorical variables have been mapped using 1 and -1, since it is more effective than binary in the neural networks.
3. The other features (genes and cells) do not need normalizing since there is obviously some extreme values in them that will cause the model to train better

In [2]:
# The cp_time feature in both train and test are either 24, 48, or 72
print('X_train unqiue cp_time: ', X_train['cp_time'].unique())
print('X_test unqiue cp_time: ', X_test['cp_time'].unique())

# hence it would make sense to normalize the input with regard to mean to generate negative numbers in the column
X_train['cp_time'] = (X_train['cp_time'] - 48) / 48
X_test['cp_time'] = (X_test['cp_time'] - 48) / 48

X_train unqiue cp_time:  [24 72 48]
X_test unqiue cp_time:  [24 72 48]


In [3]:
# Encoding categorical variables

# Dictionaries
cp_type = {'trt_cp': 1, 'ctl_vehicle': -1}
cp_dose = {'D1': 1, 'D2': -1}

# Mapping dictionaries to columns
X_train['cp_type'], X_test['cp_type'] = X_train['cp_type'].map(cp_type), X_test['cp_type'].map(cp_type)
X_train['cp_dose'], X_test['cp_dose'] = X_train['cp_dose'].map(cp_dose), X_test['cp_dose'].map(cp_dose)

In [23]:
X_train.describe()

Unnamed: 0,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
count,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,...,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0
mean,0.843285,0.00042,0.020156,0.248366,-0.095684,0.152253,0.081971,0.057347,-0.138836,0.035961,...,-0.469244,-0.461411,-0.513256,-0.500142,-0.507093,-0.353726,-0.463485,-0.378241,-0.470252,-0.301505
std,0.537477,0.404225,0.999818,1.393399,0.812363,1.035731,0.950012,1.032091,1.179388,0.882395,...,2.000488,2.042475,2.001714,2.107105,2.159589,1.629291,2.059725,1.703615,1.834828,1.407918
min,-1.0,-0.5,-1.0,-5.513,-5.737,-9.104,-5.998,-6.369,-10.0,-10.0,...,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0
25%,1.0,-0.5,-1.0,-0.473075,-0.5622,-0.43775,-0.429575,-0.470925,-0.602225,-0.4939,...,-0.566175,-0.565975,-0.589975,-0.5687,-0.563775,-0.567975,-0.552575,-0.561,-0.5926,-0.5629
50%,1.0,0.0,1.0,-0.00885,-0.0466,0.0752,0.00805,-0.0269,-0.01565,-0.00065,...,-0.0099,0.00325,-0.0091,-0.01375,-0.0033,-0.01025,-0.00125,-0.0068,0.014,-0.0195
75%,1.0,0.5,1.0,0.5257,0.403075,0.663925,0.4634,0.465375,0.510425,0.528725,...,0.45775,0.4615,0.445675,0.4529,0.4709,0.44475,0.465225,0.4464,0.461275,0.43865
max,1.0,0.5,1.0,10.0,5.039,8.257,10.0,10.0,7.282,7.333,...,4.069,3.96,3.927,3.596,3.747,2.814,3.505,2.924,3.111,3.805


In [25]:
# The breakdowns for the dataset

# Training part
cells = X_train.iloc[:,776: ].copy()
genes = X_train.iloc[:,4:776].copy()
cells_image = cells.values.reshape(cells.shape[0], 10, 10, 1)

# Testing
cells_test = X_test.iloc[:,776: ].copy()
genes_test = X_test.iloc[:,4:776].copy()
cells_image_test = cells_test.values.reshape(cells_test.shape[0], 10, 10, 1)

Modeling Ideas
- Using CNNs on the cell columns: Make 10x10 dataframes and then feed it from a different part of the model and then combine the results.
- Partitioning input: In general it is possible to break data into parts and process it differently and then aggregiate the results.
- Instead of treating the problem as multi-label classification, get the index in the dataframe row.

In [34]:
from tensorflow.keras import Model, Input
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler, ReduceLROnPlateau
from tensorflow.keras.layers import BatchNormalization, MaxPooling2D, ReLU, Dropout, Flatten, Dense, InputLayer, Concatenate, SeparableConv2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.regularizers import l1, l2
from tensorflow.keras.initializers import TruncatedNormal, he_uniform, he_normal

import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling

In [37]:
# Inputs to the NN models
cells_input = Input(shape=(cells.shape[1]), name='Cells')
genes_input = Input(shape=(genes.shape[1]), name='Genes')
cell_image_input = Input(shape=(10,10,1), name='Cells_image')

In [57]:
# Some convolutional structures used with in the CNN
# The inception functions are inspired by the inception modules used in the inception model

def inception01(input_data, mean=0, std=1):
    # Layers 1:
    layer11 = SeparableConv2D(256, (1,1), padding='same',
            kernel_initializer=TruncatedNormal(mean=mean * 0.5, stddev=std * 0.5, seed=14)
            )(input_data)
    
    # Layers 2:
    layer21 =  SeparableConv2D(512, (3,3),padding='same',
            kernel_initializer=TruncatedNormal(mean=mean + 0.5, stddev=std * 1.5, seed=15)
            )(input_data)
    layer22 =  SeparableConv2D(256, (1,1),padding='same',
            kernel_initializer=TruncatedNormal(mean=mean + 0.5, stddev=std * 1.5, seed=15)
            )(layer21)
    
    # Layers 3:
    layer31 =  SeparableConv2D(512, (1,1),padding='same',
            kernel_initializer=TruncatedNormal(mean=mean + 0.5, stddev=std * 2.5 + 0.1,seed=16)
            )(input_data)
    layer32 =  SeparableConv2D(256, (3,3),padding='same',
            kernel_initializer=TruncatedNormal(mean=mean + 0.5, stddev=std * 2.5 + 0.1, seed=16)
            )(layer31)
    
    # Layers 4:
    layer41 =  SeparableConv2D(512, (1,1),padding='same',
            kernel_initializer=TruncatedNormal(mean=mean - 0.5, stddev=std + 0.6,seed=17)
            )(input_data)
    layer42 =  SeparableConv2D(256, (5,5),padding='same',
            kernel_initializer=TruncatedNormal(mean=mean - 0.5, stddev=std +0.6,seed=17)
            )(layer41)

    return Concatenate()([layer11, layer22, layer32, layer42])

def inception02(input_data, mean=0, std=1):
    # Layers 1:
    layer11 = SeparableConv2D(128, (1,1),
            kernel_initializer=TruncatedNormal(mean=mean * 0.5, stddev=std * 0.5, seed=14)
            )(input_data)
    
    # Layers 2:
    layer21 =  SeparableConv2D(512, (1,1), activation='relu',
            kernel_initializer=TruncatedNormal(mean=mean + 0.5, stddev=std * 1.5, seed=15)
            )(input_data)
    layer22 =  SeparableConv2D(128, (2, 2),
            kernel_initializer=TruncatedNormal(mean=mean + 0.5, stddev=std * 1.5, seed=15)
            )(layer21)
    
    # Layers 3:
    layer31 =  SeparableConv2D(512, (1,1),activation='relu',
            kernel_initializer=TruncatedNormal(mean=mean + 0.5, stddev=std * 2.5 + 0.1,seed=16)
            )(input_data)
    layer32 =  SeparableConv2D(256, (1, 1),activation='relu',
            kernel_initializer=TruncatedNormal(mean=mean + 0.5, stddev=std * 2.5 + 0.1, seed=16)
            )(layer31)
    layer32 =  SeparableConv2D(128, (3,3),
            kernel_initializer=TruncatedNormal(mean=mean + 0.5, stddev=std * 2.5 + 0.1, seed=16)
            )(layer32)
    
    # Layers 4:
    layer41 =  SeparableConv2D(512, (1,1),activation='relu',
            kernel_initializer=TruncatedNormal(mean=mean - 0.5, stddev=std + 0.6,seed=17)
            )(input_data)
    layer42 =  SeparableConv2D(256, (1,1),activation='relu',
            kernel_initializer=TruncatedNormal(mean=mean - 0.5, stddev=std +0.6,seed=17)
            )(layer41)
    layer42 =  SeparableConv2D(256, (1,1),activation='relu',
            kernel_initializer=TruncatedNormal(mean=mean - 0.5, stddev=std +0.6,seed=17)
            )(layer41)
    layer42 =  SeparableConv2D(128, (5,5),
            kernel_initializer=TruncatedNormal(mean=mean - 0.5, stddev=std +0.6,seed=17)
            )(layer41)

    return Concatenate()(
        [Flatten()(layer11), 
         Flatten()(layer22), 
         Flatten()(layer32), 
         Flatten()(layer42)]
    )

In [68]:
# CNN model:

def CNN_model_01(input_data):
    """ Simple """
    conv1 = SeparableConv2D(256, (1,1), 
        kernel_initializer=TruncatedNormal(mean=0, stddev=0.05,seed=117)
    )(input_data)
    conv1 = BatchNormalization()(SeparableConv2D(256, (1, 1),
        kernel_initializer=TruncatedNormal(mean=0, stddev=0.05,seed=118)
    )(conv1))
    conv1 = SeparableConv2D(512, (3,3),
        kernel_initializer=TruncatedNormal(mean=0, stddev=0.05,seed=117), padding='same'
    )(conv1)
    
    fl1 = BatchNormalization()(Dense(1000, activation='relu')(Flatten()(conv1)))
    fl1 = Dense(1000, activation='relu')(Flatten()(fl1))
    
    conv2_1 = SeparableConv2D(256, (1,1), 
        kernel_initializer=TruncatedNormal(mean=10, stddev=5,seed=7)
    )(input_data)
    conv2_2 = BatchNormalization()(SeparableConv2D(256, (1, 1),
        kernel_initializer=TruncatedNormal(mean=10, stddev=5,seed=8)
    )(conv2_1))
    conv2_3 = SeparableConv2D(512, (3,3),
        kernel_initializer=TruncatedNormal(mean=10, stddev=5,seed=7), padding='same'
    )(conv2_2)
    
    fl2 = BatchNormalization()(Dense(1000, activation='relu')(Flatten()(conv2)))
    fl2 = Dense(1000, activation='relu')(Flatten()(fl2))
    
    conv3_1 = SeparableConv2D(256, (1,1), 
        kernel_initializer=TruncatedNormal(mean=-10, stddev=5,seed=7)
    )(input_data)
    conv3_2 = BatchNormalization()(SeparableConv2D(256, (1, 1),
        kernel_initializer=TruncatedNormal(mean=-10, stddev=5,seed=8)
    )(conv3_1))
    conv3_3 = SeparableConv2D(512, (3,3),
        kernel_initializer=TruncatedNormal(mean=-10, stddev=5,seed=7), padding='same'
    )(conv3_2)
    
    fl3 = Dense(1000, activation='relu')(Flatten()(conv3))
    fl3 = Dense(1000, activation='relu')(Flatten()(fl3))
    
    # Concatenating and running the fully connected layers
    
    
    return main_conv

In [69]:
CNN_model_01(cell_image_input)

<tf.Tensor 'flatten_9/Reshape:0' shape=(None, 4608) dtype=float32>