In [3]:
import pandas as pd
import numpy as np
import random
import os

import tensorflow as tf
from keras import layers, Input, Model


In [4]:
def one_hot_seqs(seqs, rng=None) -> np.ndarray:
    """One-hot encode an iterable of equal-length nucleotide sequences.

    Each character is mapped to a length-4 integer vector in A, C, G, T order;
    upper- and lower-case letters are treated the same. Any other character
    (for example ``N``) is replaced by the encoding of a base drawn uniformly
    at random from A/C/G/T, so the output is only reproducible when the random
    source is seeded.

    Args:
        seqs: Iterable of sequences (e.g. a list of ``str`` or a pandas Series
            of strings). All sequences must have the same length, otherwise
            ``np.stack`` raises ``ValueError``.
        rng: Optional object with a ``choice(seq)`` method, such as a
            ``random.Random`` instance, used for unknown characters. When
            omitted, the module-level ``random`` functions are used, which is
            the original behavior.

    Returns:
        Integer array of shape ``(n_sequences, sequence_length, 4)``. An empty
        ``seqs`` yields an empty array of shape ``(0, 0, 4)`` instead of
        raising inside ``np.stack``.
    """
    chooser = random if rng is None else rng
    bases = ['A', 'C', 'G', 'T']

    # Build the lookup once per call; rows of the identity matrix are the
    # one-hot vectors, shared between the upper- and lower-case keys.
    identity = np.eye(len(bases), dtype=int)
    lookup = {}
    for row, base in zip(identity, bases):
        lookup[base] = row
        lookup[base.lower()] = row

    encoded = []
    for seq in seqs:
        encoded.append(
            [lookup[ch] if ch in lookup else lookup[chooser.choice(bases)] for ch in seq]
        )

    if not encoded:
        # np.stack refuses an empty list; return a well-formed empty batch.
        return np.zeros((0, 0, len(bases)), dtype=int)
    return np.stack(encoded)


In [5]:
# Load the activity table; the first CSV column becomes the index.
activity_df = pd.read_csv("Data/activity.csv", index_col=0)

# Training split: rows whose 'set' column equals 'TRAIN'.
train_df = activity_df.loc[activity_df['set'] == 'TRAIN']
x_train = one_hot_seqs(train_df['sequence'])

# Held-out split: rows whose 'set' column equals 'TEST'.
test_df = activity_df.loc[activity_df['set'] == 'TEST']
x_test = one_hot_seqs(test_df['sequence'])

# Regression targets come from the 'expression_log2' column of each split.
y_train = train_df['expression_log2'].values
y_test = test_df['expression_log2'].values

In [6]:


def identity_block(X, f, filters, stage, block):
    """Residual block whose shortcut branch is the unmodified input.

    Main path: Conv1D(kernel 1) -> BN -> ReLU -> Conv1D(kernel ``f``, 'same')
    -> BN -> ReLU -> Conv1D(kernel 1) -> BN. The main-path output is added to
    the block input and passed through a final ReLU.

    Args:
        X: Input tensor. It is added to the main-path output, so its shape has
            to be compatible with that output (i.e. ``filters[2]`` channels).
        f: Kernel size of the middle convolution.
        filters: Sequence of exactly three filter counts, one per convolution.
        stage: Value converted with ``str`` and embedded in the layer names.
        block: String appended after ``stage`` in the layer names.

    Returns:
        The tensor produced by the final ReLU activation.
    """
    name_tail = str(stage) + block + '_branch'
    conv_prefix = 'res' + name_tail
    bn_prefix = 'bn' + name_tail

    first_filters, middle_filters, last_filters = filters
    shortcut = X

    # (name suffix, filter count, kernel size, padding, ReLU afterwards?)
    main_path = (
        ('2a', first_filters, 1, 'valid', True),
        ('2b', middle_filters, f, 'same', True),
        ('2c', last_filters, 1, 'valid', False),
    )
    for suffix, n_filters, kernel, pad, relu_after in main_path:
        X = layers.Conv1D(
            filters=n_filters,
            kernel_size=kernel,
            strides=1,
            padding=pad,
            name=conv_prefix + suffix,
        )(X)
        X = layers.BatchNormalization(name=bn_prefix + suffix)(X)
        if relu_after:
            X = layers.Activation('relu')(X)

    # Skip connection: sum the main path with the untouched input.
    merged = layers.Add()([X, shortcut])
    return layers.Activation('relu')(merged)




In [7]:
def convolutional_block(X, f, filters, stage, block, s=2):
    """Residual block whose shortcut branch is a strided 1x convolution.

    Main path: Conv1D(kernel 1, stride ``s``) -> BN -> ReLU -> Conv1D(kernel
    ``f``, 'same') -> BN -> ReLU -> Conv1D(kernel 1) -> BN. The shortcut path
    applies Conv1D(kernel 1, stride ``s``, ``filters[2]`` filters) -> BN to the
    block input. Both paths are added and passed through a final ReLU.

    Args:
        X: Input tensor.
        f: Kernel size of the middle convolution.
        filters: Sequence of exactly three filter counts, one per main-path
            convolution; the last one is also used for the shortcut projection.
        stage: Value converted with ``str`` and embedded in the layer names.
        block: String appended after ``stage`` in the layer names.
        s: Stride of the first main-path convolution and of the shortcut
            convolution. Defaults to 2.

    Returns:
        The tensor produced by the final ReLU activation.
    """
    name_tail = str(stage) + block + '_branch'
    conv_prefix = 'res' + name_tail
    bn_prefix = 'bn' + name_tail

    first_filters, middle_filters, last_filters = filters
    shortcut = X

    # (name suffix, filter count, kernel size, stride, padding, ReLU afterwards?)
    main_path = (
        ('2a', first_filters, 1, s, 'valid', True),
        ('2b', middle_filters, f, 1, 'same', True),
        ('2c', last_filters, 1, 1, 'valid', False),
    )
    for suffix, n_filters, kernel, stride, pad, relu_after in main_path:
        X = layers.Conv1D(
            filters=n_filters,
            kernel_size=kernel,
            strides=stride,
            padding=pad,
            name=conv_prefix + suffix,
        )(X)
        X = layers.BatchNormalization(name=bn_prefix + suffix)(X)
        if relu_after:
            X = layers.Activation('relu')(X)

    # Project the input with the same stride so both branches can be summed.
    shortcut = layers.Conv1D(
        filters=last_filters,
        kernel_size=1,
        strides=s,
        padding='valid',
        name=conv_prefix + '1',
    )(shortcut)
    shortcut = layers.BatchNormalization(name=bn_prefix + '1')(shortcut)

    merged = layers.Add()([X, shortcut])
    return layers.Activation('relu')(merged)

In [8]:
def ResNet50(input_shape=(164,4)):
    """Build a 1D ResNet-50-style network with a single linear output unit.

    Layout: zero-padding and a strided 7-wide convolution stem with max
    pooling, four residual stages (3, 4, 6 and 3 blocks), average pooling,
    then Flatten -> Dense(256, relu) -> Dense(128, relu) -> Dense(1, linear).

    Args:
        input_shape: Shape of one input sample, without the batch dimension.
            Defaults to ``(164, 4)``.

    Returns:
        An uncompiled Keras ``Model`` named ``'ResNet50'``.
    """
    X_input = Input(input_shape)

    # Stem: pad, strided convolution, batch norm, ReLU, max pooling.
    X = layers.ZeroPadding1D(3)(X_input)
    X = layers.Conv1D(64, 7, strides=2, name='conv1')(X)
    X = layers.BatchNormalization(name='bn_conv1')(X)
    X = layers.Activation('relu')(X)
    X = layers.MaxPooling1D(3, strides=2)(X)

    # (stage number, filter counts, stride of the leading convolutional block,
    #  names of the identity blocks that follow block 'a')
    stage_specs = (
        (2, [64, 64, 256], 1, 'bc'),
        (3, [128, 128, 512], 2, 'bcd'),
        (4, [256, 256, 1024], 2, 'bcdef'),
        (5, [512, 512, 2048], 2, 'bc'),
    )
    for stage, stage_filters, stride, identity_names in stage_specs:
        X = convolutional_block(X, f=3, filters=stage_filters, stage=stage, block='a', s=stride)
        for block_name in identity_names:
            X = identity_block(X, 3, stage_filters, stage=stage, block=block_name)

    X = layers.AveragePooling1D(2, padding='same')(X)

    # Regression head.
    X = layers.Flatten()(X)
    for units in (256, 128):
        X = layers.Dense(units, activation='relu')(X)
    X = layers.Dense(1, activation='linear')(X)

    return Model(inputs=X_input, outputs=X, name='ResNet50')

In [9]:
# Build the network with its default input shape and print the layer table.
model = ResNet50()
model.summary()

2024-02-16 16:41:20.232119: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38374 MB memory:  -> device: 0, name: NVIDIA A100-PCIE-40GB, pci bus id: 0000:5e:00.0, compute capability: 8.0


Model: "ResNet50"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 164, 4)]             0         []                            
                                                                                                  
 zero_padding1d (ZeroPaddin  (None, 170, 4)               0         ['input_1[0][0]']             
 g1D)                                                                                             
                                                                                                  
 conv1 (Conv1D)              (None, 82, 64)               1856      ['zero_padding1d[0][0]']      
                                                                                                  
 bn_conv1 (BatchNormalizati  (None, 82, 64)               256       ['conv1[0][0]']        

In [10]:
# Configure training: Adam optimizer minimizing mean squared error.
model.compile(optimizer='adam', loss='mean_squared_error')

In [11]:
# Train for 5 epochs with mini-batches of 128 samples.
# NOTE: Keras takes the validation_split fraction from the END of the arrays
# passed in (before shuffling), so the last 20% of x_train/y_train is held out.
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=5,
    batch_size=128,
    validation_split=0.2,
)

Epoch 1/5


2024-02-16 16:41:42.061430: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
2024-02-16 16:41:42.169000: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2024-02-16 16:41:42.403945: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2024-02-16 16:41:44.000285: I external/local_xla/xla/service/service.cc:168] XLA service 0x14ac95d903a0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-02-16 16:41:44.000314: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA A100-PCIE-40GB, Compute Capability 8.0
2024-02-16 16:41:44.007788: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1708123304.107636 1367634 device_compiler.h:

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
