In [None]:
import os
from sklearn.decomposition import PCA
import random
from numpy.random import seed

seed(1)
import numpy as np
from sklearn.model_selection import KFold
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, Activation, BatchNormalization, Flatten
from keras.callbacks import EarlyStopping, LearningRateScheduler
from sklearn.preprocessing import label_binarize
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score

from tensorflow.keras.callbacks import EarlyStopping

import csv
from sklearn.utils import shuffle

import pandas as pd

In [None]:
# Model hyper-parameters shared by the cells below.
droprate = 0.3      # dropout rate applied after each hidden block in DNN()
event_num = 2       # number of classes (width of the softmax output layer)
vector_size = 1447  # DNN input width; must equal the PCA output width from readdata()

In [None]:
def DNN():
    """Build, compile and summarise the feed-forward classifier.

    Architecture: an input of width ``vector_size``; two hidden blocks of 512
    and 256 units, each ``Dense(relu) -> BatchNormalization -> Dropout(droprate)``;
    a ``Dense(event_num)`` layer followed by a separate softmax activation.
    The model is compiled with the Adam optimizer, categorical cross-entropy
    loss and the accuracy metric, and its summary is printed.

    Returns:
        The compiled Keras ``Model``.
    """
    print("________DNN_________")
    inputs = Input(shape=(vector_size,), name='Inputlayer')

    # Hidden stack: the layers are created in the same order for each width.
    hidden = inputs
    for units in (512, 256):
        hidden = Dense(units, activation='relu')(hidden)
        hidden = BatchNormalization()(hidden)
        hidden = Dropout(droprate)(hidden)

    # Linear projection to the class scores, then softmax as its own layer.
    logits = Dense(event_num)(hidden)
    probabilities = Activation('softmax')(logits)

    network = Model(inputs, probabilities)
    network.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    network.summary()

    return network

In [None]:
def readdata(dataset_path='/dataset_stitch_yeast_900.csv', n_components=None):
    """Load a dataset CSV, clean it, reduce it with PCA and shuffle it.

    The CSV is read without a header row. Every column except the last is
    treated as a feature and the last column as the label.

    Other dataset files this notebook has been pointed at:
        /dataset_piazza.csv, /dataset_reznik.csv,
        /dataset_stitch_ecoli_150.csv, /dataset_stitch_ecoli_400.csv,
        /dataset_stitch_ecoli_700.csv, /dataset_stitch_ecoli_900.csv,
        /dataset_stitch_yeast_150.csv, /dataset_stitch_yeast_400.csv,
        /dataset_stitch_yeast_700.csv, /dataset_stitch_yeast_900.csv

    Args:
        dataset_path: Path of the CSV file to load. Defaults to the
            stitch_yeast_900 dataset.
        n_components: Number of PCA components to keep. ``None`` (default)
            uses the module-level ``vector_size`` so that the feature width
            always matches the input layer built by ``DNN()``.

    Returns:
        ``(all_matrix, all_labels)``: a list of per-sample PCA feature vectors
        and a list of the corresponding labels, shuffled together with a fixed
        ``random_state`` of 42.
    """
    if n_components is None:
        n_components = vector_size

    df = pd.read_csv(dataset_path, header=None)

    # Values with magnitude above 2**32 are treated as missing and, like any
    # other NaN, replaced by the mean of their column.
    df[df > 2**32] = np.nan
    df[df < -2**32] = np.nan
    df = df.fillna(df.mean())
    data = df.to_numpy(dtype=np.float64)
    del df  # release the DataFrame before PCA allocates its own buffers

    print("performing PCA")
    all_matrix = data[:, :-1]
    all_labels = data[:, -1]

    print(all_matrix.shape)
    print(all_labels.shape)

    # NOTE(review): PCA is fitted on every row, i.e. before any train/test
    # split, so held-out folds influence the projection. Confirm that this is
    # acceptable for the reported metrics.
    pca = PCA(n_components=n_components)
    print("fitting PCA and transforming origin matrix with shape:\t", all_matrix.shape)
    all_matrix = pca.fit_transform(all_matrix)
    print("New matrix shape", all_matrix.shape)

    all_matrix, all_labels = shuffle(all_matrix, all_labels, random_state=42)

    return list(all_matrix), list(all_labels)


In [None]:
def evaluate(pred_type, pred_score, y_test, event_num):
    """Compute micro- and macro-averaged ROC AUC for one set of predictions.

    Args:
        pred_type: Predicted class ids. Not used by this function; kept so
            existing call sites keep working.
        pred_score: Array of per-class scores, one column per class.
        y_test: True class ids.
        event_num: Number of classes.

    Returns:
        ``(result_auc_micro, result_auc_macro)``.
    """
    # label_binarize returns a single column for a two-class problem, so one
    # extra (never occurring) class is requested and its column dropped again.
    # Slicing by event_num instead of the fixed [0, 1] keeps the indicator
    # matrix aligned with pred_score for any number of classes.
    y_one_hot = label_binarize(y_test, classes=np.arange(event_num + 1))
    y_one_hot = y_one_hot[:, :event_num]

    result_auc_micro = roc_auc_score(y_one_hot, pred_score, average='micro')
    result_auc_macro = roc_auc_score(y_one_hot, pred_score, average='macro')
    return result_auc_micro, result_auc_macro

In [None]:
def get_index(label_matrix, event_num, seed, CV):
    """Assign each sample a fold number, splitting every class separately.

    For each class id in ``range(event_num)`` the samples carrying that label
    are divided into ``CV`` shuffled folds with ``KFold``; the fold in which a
    sample is held out becomes its fold number. Samples whose label is outside
    ``range(event_num)`` keep the initial value 0.

    Args:
        label_matrix: Array of class ids, one per sample.
        event_num: Number of classes.
        seed: ``random_state`` passed to ``KFold``.
        CV: Number of folds.

    Returns:
        A float array of length ``len(label_matrix)`` holding fold numbers.
    """
    fold_of_sample = np.zeros(len(label_matrix))
    for class_id in range(event_num):
        members = np.where(label_matrix == class_id)[0]
        splitter = KFold(n_splits=CV, shuffle=True, random_state=seed)
        # The train part of each split is irrelevant here; only the held-out
        # positions decide which fold a sample belongs to.
        for fold_id, (_, held_out) in enumerate(splitter.split(range(len(members)))):
            fold_of_sample[members[held_out]] = fold_id
    return fold_of_sample

In [None]:
def cross_validation(feature_matrix, label_matrix, clf_type, event_num, seed, CV):
    """Run class-wise-split ``CV``-fold cross-validation and print ROC AUC.

    For every fold a fresh model is built with ``DNN()``, trained on the other
    folds, and used to score the held-out fold. The scores and true label of
    each held-out sample are written to ``<fold>.txt`` in the current working
    directory (one line per sample: the class scores, then the label, separated
    by spaces). Per-fold and overall micro/macro AUC are printed.

    Args:
        feature_matrix: Sequence of feature vectors, one per sample.
        label_matrix: Sequence of class ids, one per sample.
        clf_type: Classifier to use. Only ``'DNN'`` is supported.
        event_num: Number of classes.
        seed: Random state forwarded to ``get_index`` for the fold assignment.
        CV: Number of folds.

    Raises:
        ValueError: If ``clf_type`` is not ``'DNN'``. Previously an error line
            was printed and the metrics were then computed on all-zero scores.
    """
    if clf_type != 'DNN':
        raise ValueError("unsupported clf_type %r (only 'DNN' is implemented)" % (clf_type,))

    label_matrix = np.array(label_matrix)
    feature_matrix = np.array(feature_matrix)
    index_all_class = get_index(label_matrix, event_num, seed, CV)

    y_true = np.array([])
    y_pred = np.array([])
    y_score = np.zeros((0, event_num), dtype=float)

    # One-hot width comes from event_num rather than from the largest label in
    # the fold, so the targets always match the model's output layer.
    class_ids = np.arange(event_num)

    print("_____cross_validation_____")

    for k in range(CV):
        train_index = np.where(index_all_class != k)
        test_index = np.where(index_all_class == k)

        x_train = feature_matrix[train_index]
        x_test = feature_matrix[test_index]
        y_train = label_matrix[train_index]
        y_test = label_matrix[test_index]

        y_train_one_hot = (class_ids == y_train[:, None]).astype(dtype='float32')
        y_test_one_hot = (class_ids == y_test[:, None]).astype(dtype='float32')

        dnn = DNN()
        early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=0, mode='auto')
        # NOTE(review): the held-out fold is also the early-stopping validation
        # set, so the stopping epoch is chosen using the test data. Confirm
        # this is intended before comparing the scores with other methods.
        dnn.fit(x_train, y_train_one_hot, batch_size=64, epochs=100,
                validation_data=(x_test, y_test_one_hot),
                callbacks=[early_stopping])

        # Cast to float64, the dtype the accumulated y_score array uses.
        pred_score = dnn.predict(x_test).astype(float)
        pred_type = np.argmax(pred_score, axis=1)
        y_true = np.hstack((y_true, y_test))
        y_pred = np.hstack((y_pred, pred_type))
        y_score = np.vstack((y_score, pred_score))

        # The context manager closes the file even if a write fails.
        with open(str(k) + '.txt', 'w') as wfp:
            for scores, label in zip(pred_score, y_test):
                wfp.write(' '.join(str(value) for value in scores) + ' ' + str(label) + '\n')

        #########evaluate auc###########
        result_micro, result_macro = evaluate(pred_type, pred_score, y_test, event_num)
        print("idx, auc_micro, auc_macro: ", k, result_micro, result_macro)

    result_all_micro, result_all_macro = evaluate(y_pred, y_score, y_true, event_num)
    print("auc_micro_all, auc_macro_all: ", result_all_micro, result_all_macro)

In [None]:
# Fold-assignment random state and number of cross-validation folds.
# NOTE: this rebinds the name `seed` that the import cell took from numpy.random.
seed, CV = 0, 10

# Load, clean, PCA-reduce and shuffle the dataset.
all_matrix, all_labels = readdata()

performing PCA
(47150, 3475)
(47150,)
fitting PCA and transforming origin matrix with shape:	 (47150, 3475)
New matrix shape (47150, 1447)


In [None]:
# CV-fold cross-validation of the DNN, run at notebook level so that the
# pooled predictions stay available for the F1 / accuracy report at the end.
y_true = np.array([])
y_pred = np.array([])
y_score = np.zeros((0, event_num), dtype=float)
label_matrix = np.array(all_labels)
feature_matrix = np.array(all_matrix)
index_all_class = get_index(label_matrix, event_num, seed, CV)

matrix = []  # unused in the visible cells; kept in case a later cell reads it
print("_____cross_validation_____")

# One-hot width comes from event_num rather than from the largest label in the
# fold, so the targets always match the model's output layer.
class_ids = np.arange(event_num)

for k in range(CV):
    train_index = np.where(index_all_class != k)
    test_index = np.where(index_all_class == k)

    x_train = feature_matrix[train_index]
    x_test = feature_matrix[test_index]
    y_train = label_matrix[train_index]
    y_test = label_matrix[test_index]

    y_train_one_hot = (class_ids == y_train[:, None]).astype(dtype='float32')
    y_test_one_hot = (class_ids == y_test[:, None]).astype(dtype='float32')

    dnn = DNN()
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=0, mode='auto')
    # NOTE(review): the held-out fold is also the early-stopping validation
    # set, so the stopping epoch is chosen using the test data. Confirm this
    # is intended before comparing the scores with other methods.
    dnn.fit(x_train, y_train_one_hot, batch_size=64, epochs=100, validation_data=(x_test, y_test_one_hot),
            callbacks=[early_stopping])

    # Cast to float64, the dtype the accumulated y_score array uses.
    pred = dnn.predict(x_test).astype(float)
    pred_score = pred
    pred_type = np.argmax(pred_score, axis=1)
    y_true = np.hstack((y_true, y_test))
    y_pred = np.hstack((y_pred, pred_type))
    # np.row_stack is a deprecated alias; np.vstack is the supported name.
    y_score = np.vstack((y_score, pred_score))

    # Per-fold dump: "<score class 0> <score class 1> <true label>" per sample.
    # The context manager closes the file even if a write fails.
    with open(str(k) + '.txt', 'w') as wfp:
        for i in range(len(y_test)):
            res = str(pred_score[i][0]) + ' ' + str(pred_score[i][1]) + ' ' + str(y_test[i]) + '\n'
            wfp.write(res)

    #########evaluate auc###########
    result_micro, result_macro = evaluate(pred_type, pred_score, y_test, event_num)
    print("idx, auc_micro, auc_macro: ", k, result_micro, result_macro)

# Metrics over the predictions pooled from all folds.
result_all_micro, result_all_macro = evaluate(y_pred, y_score, y_true, event_num)
print("auc_macro_all: ", result_all_macro)
print("F1 Macro", f1_score(y_true, y_pred, average='macro'))
print("Accuracy:", accuracy_score(y_true, y_pred))

_____cross_validation_____
________DNN_________
Model: "model_20"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Inputlayer (InputLayer)     [(None, 1447)]            0         
                                                                 
 dense_60 (Dense)            (None, 512)               741376    
                                                                 
 batch_normalization_40 (Bat  (None, 512)              2048      
 chNormalization)                                                
                                                                 
 dropout_40 (Dropout)        (None, 512)               0         
                                                                 
 dense_61 (Dense)            (None, 256)               131328    
                                                                 
 batch_normalization_41 (Bat  (None, 256)              1024      
 chNormali