# CAEM-GBDT

CAEM-GBDT: a cancer subtype identification method using multi-omics data and a convolutional autoencoder network

# import packages

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Reshape
from tensorflow.keras import Input
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Conv1D,Conv1DTranspose,BatchNormalization,Activation,multiply,MaxPool1D,Dropout,GRU,Flatten,Dense,UpSampling1D
from tensorflow.keras import Model,datasets
import os
import tensorflow.keras as keras
from keras import losses
from sklearn.metrics import f1_score, precision_score, recall_score
import os
import tensorflow.keras.backend as kb
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import hamming_loss
import warnings
from sklearn.ensemble import GradientBoostingClassifier
import sklearn.svm as svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
#from deepforest import CascadeForestClassifier
warnings.filterwarnings("ignore")
from sklearn.model_selection import KFold

# Make only CUDA device 0 visible to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# loading data and data preprocessing

In [None]:
# Read the methylation matrix as text and convert it to float32.
# `np.str` was removed in NumPy 1.24; the builtin `str` is the drop-in
# replacement and behaves identically on older releases.
data1 = np.loadtxt('E:/多组学数据/datasets/txt/Breast/Breast_Methy.txt', dtype=str)
data1 = np.array(data1).astype(np.float32)

# Standardise every column to zero mean and unit variance.
data1 = data1 - data1.mean(axis=0, keepdims=True)
column_std = np.sqrt(data1.var(axis=0, keepdims=True))
# A constant column has zero variance; dividing by it would turn the whole
# column into NaN, so such columns are left at zero instead.
column_std[column_std == 0] = 1.0
data1 = data1 / column_std
data1.shape

(104,23094)

In [None]:
# Read the label file as text and convert it to float32.
# `np.str` was removed in NumPy 1.24; the builtin `str` is the drop-in
# replacement and behaves identically on older releases.
tag = np.loadtxt('E:/多组学数据/datasets/txt/Breast/Breast_Label.txt', dtype=str)
tag = np.array(tag).astype(np.float32)
tag.shape

(104,)

# GELU activation function

In [None]:
def gelu(x):
    """Tanh approximation of the GELU activation, applied element-wise.

    Evaluates ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
    with TensorFlow ops and returns the result.
    """
    inner = x + 0.044715 * tf.pow(x, 3)
    scale = tf.sqrt(2 / np.pi)
    return 0.5 * x * (1 + tf.tanh(scale * inner))

# convolutional block attention module

In [None]:
def cbam_block(cbam_feature, ratio=8, kernel_size=3):
    """Apply channel attention, then spatial attention, to a feature map.

    Args:
        cbam_feature: feature tensor to refine.
        ratio: reduction ratio forwarded to ``channel_attention``.
        kernel_size: convolution width forwarded to ``spatial_attention``.

    Returns:
        The output of ``spatial_attention`` applied to the channel-refined
        features.
    """
    channel_refined = channel_attention(cbam_feature, ratio)
    return spatial_attention(channel_refined, kernel_size)

def channel_attention(input_feature, ratio=8):
    """Re-weight the channels of ``input_feature`` with a learned sigmoid gate.

    Global average pooling and global max pooling each produce one descriptor
    per channel. Both descriptors go through the same two Dense layers
    (``channel // ratio`` ReLU units, at least one, then ``channel`` linear
    units), are summed, squashed with a sigmoid and multiplied with the input.

    New Keras layers are constructed every time this function is called.

    Args:
        input_feature: tensor whose last axis is the channel axis.
        ratio: reduction ratio of the hidden Dense layer.

    Returns:
        ``multiply([input_feature, gate])``.
    """
    layers = tf.keras.layers
    n_channels = input_feature.shape[-1]
    hidden_units = max(1, int(n_channels // ratio))

    # The same two Dense layers are shared by both pooled descriptors.
    squeeze = layers.Dense(
        hidden_units,
        activation='relu',
        kernel_initializer='he_normal',
        use_bias=True,
        bias_initializer='zeros',
    )
    excite = layers.Dense(
        n_channels,
        kernel_initializer='he_normal',
        use_bias=True,
        bias_initializer='zeros',
    )

    def _descriptor(pooling_layer):
        """Pool the input, reshape to (1, 1, channels) and run the shared MLP."""
        pooled = pooling_layer(input_feature)
        # NOTE(review): this adds two singleton axes, so the gate has one
        # more axis than a (steps, channels) input and the final multiply
        # relies on Keras broadcasting across ranks - confirm the output rank.
        pooled = layers.Reshape((1, 1, n_channels))(pooled)
        return excite(squeeze(pooled))

    avg_descriptor = _descriptor(layers.GlobalAveragePooling1D())
    max_descriptor = _descriptor(layers.GlobalMaxPooling1D())

    summed = layers.Add()([avg_descriptor, max_descriptor])
    gate = layers.Activation('sigmoid')(summed)

    return multiply([input_feature, gate])
def spatial_attention(input_feature, kernel_siz):
    """Re-weight positions of ``input_feature`` with a learned sigmoid gate.

    The mean and the maximum over axis 3 are concatenated along axis 3 and
    passed through a single-filter, bias-free Conv1D with a sigmoid
    activation; the result is multiplied with the input. Because axis 3 is
    reduced, the input needs at least four axes including the batch axis.

    New Keras layers are constructed every time this function is called.

    Args:
        input_feature: feature tensor to refine.
        kernel_siz: kernel size of the gating convolution.

    Returns:
        ``multiply([input_feature, gate])``.
    """
    layers = tf.keras.layers

    mean_map = layers.Lambda(
        lambda t: K.mean(t, axis=3, keepdims=True))(input_feature)
    peak_map = layers.Lambda(
        lambda t: K.max(t, axis=3, keepdims=True))(input_feature)
    stacked = layers.Concatenate(axis=3)([mean_map, peak_map])

    gate_conv = layers.Conv1D(
        filters=1,
        kernel_size=kernel_siz,
        strides=1,
        padding='same',
        activation='sigmoid',
        kernel_initializer='he_normal',
        use_bias=False,
    )
    gate = gate_conv(stacked)

    return multiply([input_feature, gate])


# Convolutional Autoencoder

In [None]:
class MyModel(tf.keras.Model):
    """Convolutional autoencoder returning the latent code and the reconstruction.

    The encoder maps the input to 512 units with a Dense layer, treats them
    as a 512-step single-channel sequence and applies two strided
    Conv1D + MaxPool1D stages. The pooled result is refined with
    ``cbam_block`` and flattened into a 64-value code. The decoder starts
    from the un-attended pooled result and mirrors the encoder with
    Conv1DTranspose + UpSampling1D stages followed by two Dense layers.

    NOTE(review): ``cbam_block`` constructs its Dense/Conv1D layers inside
    the call, so every forward pass creates new attention layers; they are
    not attributes of this model. Confirm that this is intended.
    NOTE(review): the output layer uses GELU, which is bounded below
    (about -0.17), while the input matrix is standardised to zero mean.
    Confirm that this is intended.

    Args:
        input_dim: number of input features, which is also the width of the
            reconstruction. Defaults to 23094, the value that was previously
            hard-coded.
    """

    def __init__(self, input_dim=23094):
        super().__init__()
        layers = tf.keras.layers
        l2 = tf.keras.regularizers.l2

        # Encoder.
        self.f1 = layers.Dense(512, activation=gelu, kernel_regularizer=l2())
        self.f2 = layers.Conv1D(filters=16, kernel_size=3, padding='same',
                                strides=2, activation=gelu)
        self.f3 = layers.MaxPool1D(pool_size=2)
        self.f4 = layers.Conv1D(filters=4, kernel_size=3, padding='same',
                                strides=2, activation=gelu)
        self.f5 = layers.MaxPool1D(pool_size=4)

        # Decoder.
        self.f6 = layers.Conv1DTranspose(filters=4, kernel_size=3,
                                         padding='same', strides=2,
                                         activation=gelu)
        self.f7 = layers.UpSampling1D(size=4)
        self.f8 = layers.Conv1DTranspose(filters=16, kernel_size=3,
                                         padding='same', strides=2,
                                         activation=gelu)
        self.f9 = layers.UpSampling1D(size=2)
        self.flatten = layers.Flatten()
        self.f10 = layers.Dense(512, activation=gelu, kernel_regularizer=l2())
        self.f11 = layers.Dense(input_dim, activation=gelu,
                                kernel_regularizer=l2())

        # Weight-free reshapes, created once instead of on every forward pass.
        self.reshape_sequence = layers.Reshape((512, 1))
        self.reshape_code = layers.Reshape((16, 4))

    def call(self, x):
        """Return ``concat([code, reconstruction], axis=1)`` for a batch ``x``.

        The first 64 output columns are the flattened attention-refined code;
        the remaining columns are the reconstruction.
        """
        # Encoder: (input_dim,) -> (512,) -> (512, 1) -> (256, 16) -> (128, 16)
        #          -> (64, 4) -> (16, 4).
        x = self.f1(x)
        x = self.reshape_sequence(x)
        x = self.f2(x)
        x = self.f3(x)
        x = self.f4(x)
        encode = self.f5(x)

        # Latent code: attention-refined features, brought back to (16, 4)
        # and flattened to 64 values.
        attended = cbam_block(encode)
        attended = self.reshape_code(attended)
        c = self.flatten(attended)

        # Decoder: (16, 4) -> (32, 4) -> (128, 4) -> (256, 16) -> (512, 16)
        #          -> (8192,) -> (512,) -> (input_dim,).
        x = self.f6(encode)
        x = self.f7(x)
        x = self.f8(x)
        x = self.f9(x)
        x = self.flatten(x)
        x = self.f10(x)
        decode = self.f11(x)

        return tf.concat([c, decode], axis=1)
    
    
# Instantiate the autoencoder and build it for inputs of shape (104, 23094).
model = MyModel()
model.build(input_shape=(104,23094))
# Run the forward pass once on a symbolic Keras input with 23094 features.
# NOTE(review): presumably done so that summary() can describe the model - confirm.
model.call(Input(shape=(23094,)))

model.summary()

# Training and testing

In [None]:
# Adam optimiser with default settings; train_step applies gradients with it.
optimizer = tf.keras.optimizers.Adam()
# NOTE(review): the loss object and the two accuracy metrics below are created
# but not referenced by any code visible in this file.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
val_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
def train_step(x, y):
    """Run one optimisation step on the reconstruction loss.

    Args:
        x: input batch; it is also the reconstruction target.
        y: labels for the batch. Not used by this function.

    Returns:
        A ``(loss_value, reconstruction)`` tuple, where ``loss_value`` is
        what ``losses.mean_squared_error`` returns for the batch and
        ``reconstruction`` is the model output from column 64 onwards.

    NOTE(review): only the reconstruction error is optimised; regularisation
    losses collected by the model (``model.losses``) are not added - confirm
    that this is intended.
    """
    with tf.GradientTape() as tape:
        outputs = model(x, training=True)
        # Columns 0-63 hold the flattened code; the rest is the reconstruction.
        reconstruction = outputs[:, 64:]
        loss_value = losses.mean_squared_error(x, reconstruction)

    weights = model.trainable_weights
    grads = tape.gradient(loss_value, weights)
    optimizer.apply_gradients(zip(grads, weights))
    return loss_value, reconstruction


def val_step(x, y):
    """Run the model in inference mode on a validation batch.

    Args:
        x: input batch.
        y: labels for the batch. Not used by this function.

    Returns:
        The model output for ``x``. Callers that ignore the return value
        are unaffected.
    """
    val_logits = model(x, training=False)
    return val_logits

In [None]:
# NOTE(review): `kf` is created but not used by the loop below.
kf = KFold(n_splits=4)
epos = 100
# Rows [0, n_train) are used for training; the remaining rows are passed to
# val_step.
n_train = 83

for epo in range(epos):
    print("\nStart of epoch %d" % (epo,))

    # One full-batch autoencoder update. `data1` is the standardised feature
    # matrix loaded earlier in this file; its 23094 columns match the model
    # input width.
    loss, log = train_step(data1[0:n_train, :], tag[0:n_train])
    log = np.array(log)

    # Fit a fresh gradient-boosting classifier on the features returned by
    # train_step and score it on a held-out 20% split of the training rows.
    X_train, X_test, y_train, y_test = train_test_split(
        log, tag[0:n_train], test_size=0.2, random_state=20)
    gbm = GradientBoostingClassifier(
        learning_rate=0.1,
        n_estimators=300,
        max_depth=3,
        min_samples_leaf=5,
        min_samples_split=5,
        max_features='sqrt',
        subsample=0.8,
        random_state=10,
    )
    gbm.fit(X_train, y_train)
    y_pred = gbm.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    p = precision_score(y_test, y_pred, average='weighted')
    r = recall_score(y_test, y_pred, average='weighted')
    print("accuracy: %.2f" % (float(accuracy),))
    # Apply the format with `%`; passing the value as a second print argument
    # would emit the literal "%.2f" followed by the unrounded number.
    print("Precision：%.2f" % (p,))
    print("Recall：%.2f" % (r,))
    print("F1：%.2f" % (f1,))

    # Forward pass on the held-out rows in inference mode.
    val_step(data1[n_train:, :], tag[n_train:])
    #modelsvm2 = svm.SVC(kernel='linear',probability=True,random_state=20)

Start of epoch 36
accuracy: 0.82
Precision： 0.72
Recall： 0.76
F1： 0.73