In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and reload imported modules before each cell
# runs, so edits to the local helper modules are picked up without a restart.
%load_ext autoreload
%autoreload 2
# Ask the inline backend for high-resolution ("retina") figures.
%config InlineBackend.figure_format ='retina'

In [2]:
import tensorflow as tf
import numpy as np

In [3]:
# Pin TensorFlow to one physical GPU and cap how much of its memory it may claim.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # Prefer the second GPU (index 1), but clamp to the last available device so
    # a single-GPU host does not fail with an IndexError that the RuntimeError
    # handler below would not catch.
    gpu_num = min(1, len(gpus) - 1)
    try:
        tf.config.experimental.set_visible_devices(gpus[gpu_num], 'GPU')
        # Alternative to a fixed cap: let the allocation grow on demand.
        #tf.config.experimental.set_memory_growth(gpus[gpu_num], True)
        # Single virtual device with a fixed memory cap (memory_limit is given
        # in MB according to the TensorFlow documentation).
        tf.config.experimental.set_virtual_device_configuration(
            gpus[gpu_num],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4000)])
    except RuntimeError as e:
        # Device configuration must happen before the runtime is initialised;
        # report the problem instead of aborting the notebook.
        print(e)

In [4]:
from data_import_preprocessing import import_data_preprocessing
from model_define import CNNLSTM_model

In [5]:
# CSV files holding the training and the held-out test measurements.
train_data_name = 'data/nb_data_changed_HT_1-1.csv'
test_data_name = 'data/nb_data_changed_HT_1-2.csv'

# Build the preprocessing object from both files; it is called in a later cell
# to produce the actual data splits.
preprocessing = import_data_preprocessing(
    train_data_file_name=train_data_name,
    test_data_file_name=test_data_name,
)

# Show the column names found in each file. The second line reports the TEST
# file, so it is labelled accordingly (it was previously mislabelled as train).
print('train_data : ', preprocessing.train_data_file_sample_column)
print('test_data : ', preprocessing.test_data_file_sample_column)

train_data :  ['target', 'target_20bp', 'indel_percent', 'indel_rate']
train_data :  ['target', 'target_20bp', 'indel_percent', 'indel_rate']


In [6]:
# Run the preprocessing object on the chosen sequence and indel columns.
# split_data is presumably the validation fraction — confirm in
# data_import_preprocessing.
data = preprocessing(
    sgRNA_column='target',
    indel_column='indel_percent',
    split_data=0.1,
)

# First line: the available splits; second line: the fields of the train split.
print(data.keys(), data['train'].keys(), sep='\n')

dict_keys(['train', 'val', 'total', 'test'])
dict_keys(['seq', 'indel_rate', 'indel_class', 'read_cnt', 'info'])


In [7]:
# Pull the model input ('seq'), the class target ('indel_class') and the
# regression target ('indel_rate') out of each split.
X_train, class_train, rate_train = (
    data['train']['seq'],
    data['train']['indel_class'],
    data['train']['indel_rate'],
)
X_val, class_val, rate_val = (
    data['val']['seq'],
    data['val']['indel_class'],
    data['val']['indel_rate'],
)
X_test, class_test, rate_test = (
    data['test']['seq'],
    data['test']['indel_class'],
    data['test']['indel_rate'],
)

# Shape of a single sample: everything after the leading (sample) axis.
input_shape = X_train.shape[1:]

In [8]:
# Instantiate the model builder imported from model_define, telling it the
# shape of a single input sample.
CNNLSTM = CNNLSTM_model(input_shape=input_shape)

In [11]:
# Build the network via the builder's MTL_model() (presumably "multi-task
# learning" — confirm in model_define) and print its layer summary.
# NOTE(review): this assignment rebinds CNNLSTM_model, shadowing the class of
# the same name imported from model_define. Re-running the builder cell after
# this one will therefore call this object instead of the class; restart the
# kernel (or re-run the import cell) before rebuilding.
CNNLSTM_model = CNNLSTM.MTL_model()
CNNLSTM_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 34, 4)]      0                                            
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 34, 128)      640         input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 34, 128)      1152        input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 34, 128)      1664        input_1[0][0]                    
____________________________________________________________________________________________

In [13]:
# Configure training for the four named outputs: three classification heads
# scored with categorical cross-entropy and one 'rate' head scored with MSE.
# 'class_final' contributes to the total loss at half weight. Accuracy is
# tracked for the classification heads only. (A former 'num_mis' output is
# disabled and therefore has no loss, weight or metric here.)
CNNLSTM_model.compile(
    optimizer='adam',
    loss={'class_1': 'categorical_crossentropy',
          'class_2': 'categorical_crossentropy',
          'class_final': 'categorical_crossentropy',
          'rate': 'mean_squared_error'},
    loss_weights={'class_1': 1, 'class_2': 1, 'class_final': 0.5, 'rate': 1},
    metrics={'class_1': "accuracy",
             'class_2': "accuracy",
             'class_final': "accuracy"},
)

In [14]:
from sklearn.utils import class_weight

# Integer class label of every training sample. class_train is reduced along
# its last axis, so it is presumably one-hot encoded — confirm in the
# preprocessing module.
class_train_num = class_train.argmax(axis=-1)

# 'balanced' weights are inversely proportional to class frequency. The
# arguments are passed by keyword because recent scikit-learn releases no
# longer accept them positionally.
present_classes = np.unique(class_train_num)
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=present_classes,
    y=class_train_num,
)

# Key the weights by the actual class label rather than by position, so the
# mapping stays correct even when some class is absent from the training split.
class_weights_dict = dict(zip(present_classes.tolist(), class_weights))
print("class_weight")
print(class_weights_dict)

# Per-class weights used to build the per-sample weights of the regression
# head. This is a copy (not an alias) so that any future tuning cannot silently
# alter class_weights_dict, which is passed to fit() separately.
# Tuning is currently disabled, i.e. the values equal the balanced weights.
# The previously sketched alternative was: 1.0 for classes <= 5 and
# 2*np.tanh((w - 2)/2) + 2 for classes above 5.
class_wieghts_dict_tuned = dict(class_weights_dict)

print("sample_weight")
print(class_wieghts_dict_tuned)

# One weight per training sample, looked up by that sample's class label.
sample_weight = np.array([class_wieghts_dict_tuned[i] for i in class_train_num])

class_weight
{0: 0.4392529446215917, 1: 0.9573110197135158, 2: 1.1437770058459713, 3: 1.2115229291932155, 4: 1.149131767109295, 5: 1.0498483552375768, 6: 1.040943789035392, 7: 0.9701760689902983, 8: 1.0793955384984408, 9: 1.4303877940241576, 10: 1.8538862949739083}
sample_weight
{0: 0.4392529446215917, 1: 0.9573110197135158, 2: 1.1437770058459713, 3: 1.2115229291932155, 4: 1.149131767109295, 5: 1.0498483552375768, 6: 1.040943789035392, 7: 0.9701760689902983, 8: 1.0793955384984408, 9: 1.4303877940241576, 10: 1.8538862949739083}


In [17]:
# Stop training once val_loss has not improved by at least min_delta for
# `patience` consecutive epochs.
# NOTE(review): restore_best_weights is not set, so the model keeps the weights
# of the last epoch run rather than those of the best epoch — confirm intended.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0.0005,
    patience=50,
    verbose=0,
    mode='auto',
)

# All three classification heads are trained against the same class labels;
# the 'rate' head gets the regression target. (The 'num_mis' head is disabled.)
fit_targets_train = {
    'class_1': class_train,
    'class_2': class_train,
    'class_final': class_train,
    'rate': rate_train,
}
fit_targets_val = {
    'class_1': class_val,
    'class_2': class_val,
    'class_final': class_val,
    'rate': rate_val,
}

# Classification heads are re-weighted per class; the regression head is
# re-weighted per sample instead.
fit_class_weight = {
    'class_1': class_weights_dict,
    'class_2': class_weights_dict,
    'class_final': class_weights_dict,
}

CNNLSTM_model.fit(
    x=X_train,
    y=fit_targets_train,
    validation_data=(X_val, fit_targets_val),
    class_weight=fit_class_weight,
    sample_weight={'rate': sample_weight},
    shuffle=True,
    epochs=200,
    batch_size=128,
    verbose=1,
    callbacks=[early_stopping],
)

Train on 13500 samples, validate on 1500 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200


Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200


Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200


Epoch 51/200


<tensorflow.python.keras.callbacks.History at 0x7fbb5c4c6390>