In [157]:
import numpy as np
import pandas as pd

# Feature tables for the train and test splits.
X_train = pd.read_csv('./train_features.csv')
X_test = pd.read_csv('./test_features.csv')

# Target tables for the train split: the scored targets and the extra non-scored ones.
y_train = pd.read_csv('./train_targets_scored.csv')
y_train_extra = pd.read_csv('./train_targets_nonscored.csv')

### Data:
`train_features.csv` - features for training
- g-: gene data, columns [4:776]
- c-: cell viability data, columns [776:]
- cp_type: samples treated with a compound/control vehicle
- cp_dose: treatment dose
- cp_time: duration

`train_targets_scored`
- There are 206 columns, and every value in each row is either 1 or 0
- It also has nothing in common with train_targets_nonscored

### Modeling Ideas:
- Using CNNs on the cell columns: reshape the cell columns of each sample into a 10x10 grid, feed it through a separate branch of the model, and then combine the results with the rest of the model.
- Partitioning input: In general it is possible to break the data into parts, process each part differently, and then aggregate the results.
- Instead of treating the problem as multi-label classification, predict the index of the positive (1-valued) target column in each dataframe row.

In [158]:
# Show the distinct cp_time values in each split; both contain only 24, 48 and 72.
print('X_train unqiue cp_time: ', X_train['cp_time'].unique())
print('X_test unqiue cp_time: ', X_test['cp_time'].unique())

# Centre cp_time on its middle value (48) and divide by 48, so the three
# levels become -0.5, 0 and 0.5 (a zero-centred column with negative values).
# NOTE: this overwrites the column in place, so the cell should be run only once
# per freshly loaded dataframe.
X_train['cp_time'] = X_train['cp_time'].sub(48).div(48)
X_test['cp_time'] = X_test['cp_time'].sub(48).div(48)

X_train unqiue cp_time:  [24 72 48]
X_test unqiue cp_time:  [24 72 48]


In [159]:
# Encode the two categorical columns as +1 / -1.

# Lookup tables: raw category label -> numeric code.
cp_type = {
    'trt_cp': 1,
    'ctl_vehicle': -1,
}
cp_dose = {
    'D1': 1,
    'D2': -1,
}

# Replace each categorical column with its numeric code, one dataframe at a time.
# Series.map yields NaN for any label missing from the lookup table.
X_train['cp_type'] = X_train['cp_type'].map(cp_type)
X_train['cp_dose'] = X_train['cp_dose'].map(cp_dose)
X_test['cp_type'] = X_test['cp_type'].map(cp_type)
X_test['cp_dose'] = X_test['cp_dose'].map(cp_dose)

In [161]:
# Display summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of X_test, as a quick sanity check of the value ranges.
X_test.describe()

Unnamed: 0,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
count,3982.0,3982.0,3982.0,3982.0,3982.0,3982.0,3982.0,3982.0,3982.0,3982.0,...,3982.0,3982.0,3982.0,3982.0,3982.0,3982.0,3982.0,3982.0,3982.0,3982.0
mean,0.820191,0.000628,0.018081,0.197471,-0.063233,0.137516,0.049622,0.052434,-0.147852,0.010424,...,-0.42998,-0.413215,-0.475762,-0.462484,-0.479196,-0.34645,-0.423905,-0.316387,-0.404859,-0.269792
std,0.572162,0.404462,0.999962,1.373476,0.800573,1.066475,0.949972,0.994018,1.204934,0.859452,...,2.013142,1.995433,1.96344,2.128477,2.149705,1.645669,2.086895,1.655524,1.790883,1.355936
min,-1.0,-0.5,-1.0,-5.321,-4.142,-6.996,-3.765,-6.312,-8.717,-6.037,...,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0
25%,1.0,-0.5,-1.0,-0.502725,-0.552975,-0.4318,-0.46535,-0.44575,-0.576,-0.521325,...,-0.527725,-0.52185,-0.539375,-0.54045,-0.548175,-0.561525,-0.49705,-0.521125,-0.531075,-0.54195
50%,1.0,0.0,1.0,-0.0607,-0.0275,0.0734,-0.03645,-0.0478,-0.0125,-0.0092,...,0.01005,0.0109,0.00655,0.0175,0.01345,-0.02,0.0438,0.02045,0.03075,-0.00625
75%,1.0,0.5,1.0,0.45465,0.42995,0.641125,0.42315,0.43605,0.49895,0.495975,...,0.4789,0.479625,0.44255,0.4733,0.487825,0.4544,0.50105,0.478925,0.487,0.4228
max,1.0,0.5,1.0,10.0,4.536,8.227,10.0,7.615,6.356,6.19,...,4.936,5.3,3.268,4.087,5.29,5.053,5.276,3.074,3.859,4.149


In [None]:
from tensorflow.keras import Model

In [None]:
class Model01(Model):
    def __ini__()