In [1]:
import os
import torch
import argparse
import numpy as np
import pandas as pd
from util import * 
from model import *
from pandas import DataFrame
from sklearn import metrics

# Expose only GPU index 1 to this process, then run on CUDA when it is usable
# and fall back to the CPU otherwise.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def _build_parser():
    """Create the notebook's argument parser with its default hyper-parameters and paths."""
    # (flag, value type, default) for every option, in registration order.
    option_table = (
        ('--epoches', int, 30),
        ('--batch_size', int, 128),
        ('--max_length', int, 200),
        ('--learning_rate', float, 1e-4),
        ('--model_path', str, "../3-new-12w-0"),
        ('--ind_filename', str, "../dataset/enhancer_3-mer_DNABERT_ind.txt"),
        ('--tra_filename', str, "../dataset/enhancer_3-mer_DNABERT_tra.txt"),
    )
    cli = argparse.ArgumentParser()
    for flag, value_type, default_value in option_table:
        cli.add_argument(flag, type=value_type, default=default_value, help='')
    return cli


parser = _build_parser()

# Parse an explicit empty argument list: inside a notebook, parsing the
# kernel's own command line would raise an error.
args = parser.parse_args([])

In [3]:
def embedding(model, dataloader, mer):
    """Run ``model`` over ``dataloader`` without gradients and collect its outputs.

    Args:
        model: Object exposing ``eval()`` and callable as
            ``model(input_ids, attention_mask, kmer=mer)``, returning one
            tensor per batch.
        dataloader: Iterable of dicts providing ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` entries.
        mer: Value forwarded to the model as the ``kmer`` keyword argument.

    Returns:
        ``(embeddings, labels)`` as NumPy arrays, each concatenated along
        axis 0 in iteration order. Both are always ``numpy.ndarray`` objects,
        including when the dataloader yields a single batch; two empty arrays
        are returned when it yields no batches at all.
    """
    model.eval()  # inference mode is loop-invariant, so set it once up front

    embedding_chunks, label_chunks = [], []
    with torch.no_grad():  # feature extraction only, no gradients needed
        for batch_data in dataloader:
            input_ids = batch_data["input_ids"].to(device)
            attention_mask = batch_data["attention_mask"].to(device)
            labels = batch_data["labels"]

            batch_embeddings = model(input_ids, attention_mask, kmer=mer)
            embedding_chunks.append(batch_embeddings.to("cpu").numpy())

            # Normalise labels to ndarray per batch so the return type does
            # not depend on how many batches were seen.
            if torch.is_tensor(labels):
                labels = labels.detach().to("cpu").numpy()
            label_chunks.append(np.asarray(labels))

    if not embedding_chunks:
        return np.empty((0,)), np.empty((0,))

    # Concatenate once at the end instead of re-copying the growing array on
    # every batch.
    return np.concatenate(embedding_chunks, axis=0), np.concatenate(label_chunks, axis=0)


def train_classifier(model, dataloader, optimizer):
    """Train ``model`` for one pass over ``dataloader`` and report training metrics.

    Args:
        model: Module called as ``model(batch_features, batch_labels)``. Element
            0 of its output is used as the loss and element 1 as the prediction
            scores.
        dataloader: Iterable of dicts providing ``"features"`` and ``"labels"``
            tensors.
        optimizer: Optimizer updating ``model``'s parameters.

    Returns:
        ``(tra_loss, acc, mcc, sn, sp, roc_auc)`` where ``tra_loss`` is the
        mean per-batch loss as a Python float, ``acc, mcc, sn, sp`` are the
        values returned by ``evaluation_criterion`` and ``roc_auc`` is the area
        under the ROC curve of the collected scores.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.train()  # training mode is loop-invariant, so set it once

    batch_losses = []
    label_chunks, score_chunks = [], []
    for batch_data in dataloader:
        batch_features = batch_data["features"].to(device)
        batch_labels = batch_data["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(batch_features, batch_labels)

        # Back-propagate the loss.
        loss = outputs[0]
        loss.backward()
        # Clip the gradient norm to 2.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 2.0)
        # Apply the parameter update.
        optimizer.step()

        # Keep per-batch results; they are combined once after the loop
        # instead of re-concatenating the growing arrays on every batch.
        batch_losses.append(loss.item())
        label_chunks.append(batch_labels.to("cpu").numpy())
        score_chunks.append(outputs[1].detach().to("cpu").numpy())

    if not batch_losses:
        raise ValueError("train_classifier: dataloader produced no batches")

    tra_loss = sum(batch_losses) / len(batch_losses)
    real_labels = np.concatenate(label_chunks, axis=0)
    pre_labels = np.concatenate(score_chunks, axis=0)

    acc, mcc, sn, sp = evaluation_criterion(pre_labels, real_labels)

    fpr, tpr, threshold = metrics.roc_curve(real_labels, pre_labels)
    roc_auc = metrics.auc(fpr, tpr)

    return tra_loss, acc, mcc, sn, sp, roc_auc


def test_classifier(model, dataloader, optimizer):
    """Evaluate ``model`` on ``dataloader`` without updating its parameters.

    Args:
        model: Module called as ``model(batch_features, batch_labels)``. Element
            0 of its output is used as the loss and element 1 as the prediction
            scores.
        dataloader: Iterable of dicts providing ``"features"`` and ``"labels"``
            tensors.
        optimizer: Unused; kept so the signature mirrors ``train_classifier``
            and existing callers keep working.

    Returns:
        ``(test_loss, acc, mcc, sn, sp, roc_auc, real_labels, pre_labels)``
        where ``test_loss`` is the mean per-batch loss as a Python float,
        ``acc, mcc, sn, sp`` are the values returned by
        ``evaluation_criterion``, ``roc_auc`` is the area under the ROC curve,
        and ``real_labels`` / ``pre_labels`` are the labels and scores of all
        batches concatenated along axis 0 in iteration order.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.eval()  # evaluation mode is loop-invariant, so set it once

    batch_losses = []
    label_chunks, score_chunks = [], []
    with torch.no_grad():
        for batch_data in dataloader:
            batch_features = batch_data["features"].to(device)
            batch_labels = batch_data["labels"].to(device)

            outputs = model(batch_features, batch_labels)

            # Keep per-batch results; they are combined once after the loop
            # instead of re-concatenating the growing arrays on every batch.
            batch_losses.append(outputs[0].item())
            label_chunks.append(batch_labels.to("cpu").numpy())
            score_chunks.append(outputs[1].detach().to("cpu").numpy())

    if not batch_losses:
        raise ValueError("test_classifier: dataloader produced no batches")

    test_loss = sum(batch_losses) / len(batch_losses)
    real_labels = np.concatenate(label_chunks, axis=0)
    pre_labels = np.concatenate(score_chunks, axis=0)

    acc, mcc, sn, sp = evaluation_criterion(pre_labels, real_labels)

    fpr, tpr, threshold = metrics.roc_curve(real_labels, pre_labels)
    roc_auc = metrics.auc(fpr, tpr)

    return test_loss, acc, mcc, sn, sp, roc_auc, real_labels, pre_labels



In [4]:
# Ensemble over k-mer sizes [3, 4, 5, 6]: one classifier is trained per k-mer
# embedding and the final-epoch predictions are averaged across k-mers.
# Original author note: "add L2 regularization, average".
# NOTE(review): no weight_decay is passed to Adam below, so any L2 term would
# have to live inside the model's own loss -- confirm.
mers = [3, 4, 5, 6]
test_real_labels_list, test_pre_labels_list = [], []

filename = "./result/record.txt"
# open(..., "w") does not create missing directories, so make sure it exists.
os.makedirs(os.path.dirname(filename), exist_ok=True)

# The context manager closes the log file even if a run fails part-way.
with open(filename, "w") as file_log:
    content = "state\ttrain-loss\tacc\tmcc\tsn\tsp\tauc\tind-loss\tacc\tmcc\tsn\tsp\tauc\n"
    file_log.write(content)
    file_log.flush()

    for mer in mers:
        # Point the shared args at the checkpoint and datasets of this k-mer size.
        args.model_path = "../../DNA-BERT-{}-new-12w-0".format(mer)
        args.ind_filename = "../dataset/enhancer_{}-mer_DNABERT_ind.txt".format(mer)
        args.tra_filename = "../dataset/enhancer_{}-mer_DNABERT_tra.txt".format(mer)

        tra_emb_dataloader = getData(args, split=False, validation=False, shuffle=True)
        ind_emb_dataloader = getData(args, split=False, validation=True, shuffle=False)

        emb_model = C_Bert_average_embedding.from_pretrained(args.model_path).to(device)

        # Pre-compute embeddings once; the classifier is trained on these features.
        tra_emb_features, tra_labels = embedding(emb_model, tra_emb_dataloader, mer)
        ind_emb_features, ind_labels = embedding(emb_model, ind_emb_dataloader, mer)

        train_dataset = NewDataset_classifier(tra_emb_features, tra_labels)
        tra_dataloader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)

        ind_dataset = NewDataset_classifier(ind_emb_features, ind_labels)
        ind_dataloader = DataLoader(ind_dataset, batch_size=args.batch_size, shuffle=False)

        classifier_model = Enhancer_classifier().to(device)
        epoches = args.epoches
        learning_rate = args.learning_rate
        optimizer = optim.Adam(classifier_model.parameters(), lr=learning_rate, betas=(0.9, 0.999), eps=1e-08,)
        # Exponential learning-rate decay.
        # NOTE(review): scheduler.step() is never called, so the learning rate
        # stays constant. Left unchanged because stepping it would alter the
        # recorded results -- confirm the intent.
        scheduler = optim.lr_scheduler.ExponentialLR(optimizer, 0.98)

        for epoch in range(epoches):
            tra_loss, tra_acc, tra_mcc, tra_sn, tra_sp, tra_auc = train_classifier(classifier_model, tra_dataloader, optimizer)
            ind_loss, ind_acc, ind_mcc, ind_sn, ind_sp, ind_auc, real_labels, pre_labels = test_classifier(classifier_model, ind_dataloader, optimizer)

            content = "epoch: {};\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f};\n".format(
                epoch,
                tra_loss, tra_acc, tra_mcc, tra_sn, tra_sp, tra_auc,
                ind_loss, ind_acc, ind_mcc, ind_sn, ind_sp, ind_auc,
            )
            print(content)

            # Record the final epoch of each k-mer run. This is derived from
            # `epoches` rather than hard-coded to 29, so a different --epoches
            # value still contributes to the ensemble.
            if epoch == epoches - 1:
                test_real_labels_list.append(real_labels)
                test_pre_labels_list.append(pre_labels)
                file_log.write(content)
                file_log.flush()

    print()
    # Average labels and scores element-wise across the per-k-mer runs.
    test_real_labels = np.mean(test_real_labels_list, axis=0)
    test_pre_labels = np.mean(test_pre_labels_list, axis=0)

    print(test_real_labels.shape)

    acc, mcc, sn, sp = evaluation_criterion(test_pre_labels, test_real_labels)
    fpr, tpr, threshold = metrics.roc_curve(test_real_labels, test_pre_labels)
    roc_auc = metrics.auc(fpr, tpr)

    print(acc, mcc, sn, sp, roc_auc)
    content = "integrate; acc: {:.5f}, mcc: {:.5f}, sn: {:.5f}, sp: {:.5f}, auc: {:.5f};".format(acc, mcc, sn, sp, roc_auc)
    file_log.write(content)
    file_log.flush()

Some weights of the model checkpoint at ../3-new-12w-0 were not used when initializing C_Bert_average_embedding: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing C_Bert_average_embedding from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing C_Bert_average_embedding from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


epoch: 0; train loss:0.65494, acc: 0.70788, mcc: 0.42572, sn: 0.60040, sp: 0.81536, auc: 0.78083; test loss: 0.63966, acc: 0.70000, mcc: 0.44791, sn: 0.47500, sp: 0.92500, auc: 0.82515;

epoch: 1; train loss:0.60098, acc: 0.73989, mcc: 0.52124, sn: 0.54447, sp: 0.93531, auc: 0.84174; test loss: 0.61466, acc: 0.74000, mcc: 0.50487, sn: 0.58500, sp: 0.89500, auc: 0.82915;

epoch: 2; train loss:0.56235, acc: 0.76415, mcc: 0.54818, sn: 0.63073, sp: 0.89757, auc: 0.85447; test loss: 0.59682, acc: 0.73500, mcc: 0.48072, sn: 0.63000, sp: 0.84000, auc: 0.83332;

epoch: 3; train loss:0.53498, acc: 0.77022, mcc: 0.55650, sn: 0.65094, sp: 0.88949, auc: 0.86037; test loss: 0.58962, acc: 0.73500, mcc: 0.48072, sn: 0.63000, sp: 0.84000, auc: 0.83375;

epoch: 4; train loss:0.51538, acc: 0.77460, mcc: 0.56321, sn: 0.66375, sp: 0.88544, auc: 0.86053; test loss: 0.58575, acc: 0.75500, mcc: 0.51946, sn: 0.66000, sp: 0.85000, auc: 0.83518;

epoch: 5; train loss:0.49899, acc: 0.77392, mcc: 0.55854, sn: 0.6

Some weights of the model checkpoint at ../4-new-12w-0 were not used when initializing C_Bert_average_embedding: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing C_Bert_average_embedding from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing C_Bert_average_embedding from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


epoch: 0; train loss:0.66493, acc: 0.61725, mcc: 0.29647, sn: 0.31132, sp: 0.92318, auc: 0.71093; test loss: 0.64973, acc: 0.65750, mcc: 0.37570, sn: 0.38500, sp: 0.93000, auc: 0.81625;

epoch: 1; train loss:0.60000, acc: 0.74090, mcc: 0.52168, sn: 0.54919, sp: 0.93261, auc: 0.84301; test loss: 0.62190, acc: 0.72500, mcc: 0.47498, sn: 0.56500, sp: 0.88500, auc: 0.82220;

epoch: 2; train loss:0.55712, acc: 0.76853, mcc: 0.55652, sn: 0.63747, sp: 0.89960, auc: 0.85610; test loss: 0.61134, acc: 0.74000, mcc: 0.49710, sn: 0.61000, sp: 0.87000, auc: 0.82458;

epoch: 3; train loss:0.52119, acc: 0.77392, mcc: 0.56384, sn: 0.65566, sp: 0.89218, auc: 0.86067; test loss: 0.60724, acc: 0.74000, mcc: 0.49445, sn: 0.62000, sp: 0.86000, auc: 0.82485;

epoch: 4; train loss:0.50650, acc: 0.77864, mcc: 0.56993, sn: 0.67385, sp: 0.88342, auc: 0.86340; test loss: 0.60256, acc: 0.75500, mcc: 0.51946, sn: 0.66000, sp: 0.85000, auc: 0.82537;

epoch: 5; train loss:0.48663, acc: 0.77796, mcc: 0.56617, sn: 0.6

Some weights of the model checkpoint at ../5-new-12w-0 were not used when initializing C_Bert_average_embedding: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing C_Bert_average_embedding from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing C_Bert_average_embedding from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


epoch: 0; train loss:0.65486, acc: 0.64623, mcc: 0.35885, sn: 0.35647, sp: 0.93598, auc: 0.75754; test loss: 0.64541, acc: 0.67750, mcc: 0.40594, sn: 0.43500, sp: 0.92000, auc: 0.81378;

epoch: 1; train loss:0.60069, acc: 0.72069, mcc: 0.48903, sn: 0.50539, sp: 0.93598, auc: 0.83404; test loss: 0.61750, acc: 0.72250, mcc: 0.46021, sn: 0.59500, sp: 0.85000, auc: 0.81765;

epoch: 2; train loss:0.55686, acc: 0.76044, mcc: 0.53956, sn: 0.63005, sp: 0.89084, auc: 0.85030; test loss: 0.61684, acc: 0.71000, mcc: 0.44176, sn: 0.55500, sp: 0.86500, auc: 0.81867;

epoch: 3; train loss:0.52907, acc: 0.76482, mcc: 0.54692, sn: 0.64016, sp: 0.88949, auc: 0.85691; test loss: 0.60325, acc: 0.73000, mcc: 0.47267, sn: 0.61500, sp: 0.84500, auc: 0.81988;

epoch: 4; train loss:0.50744, acc: 0.76819, mcc: 0.54865, sn: 0.66307, sp: 0.87332, auc: 0.85818; test loss: 0.60385, acc: 0.73500, mcc: 0.48180, sn: 0.62500, sp: 0.84500, auc: 0.82010;

epoch: 5; train loss:0.49361, acc: 0.77358, mcc: 0.56070, sn: 0.6

Some weights of the model checkpoint at ../6-new-12w-0 were not used when initializing C_Bert_average_embedding: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing C_Bert_average_embedding from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing C_Bert_average_embedding from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


epoch: 0; train loss:0.66309, acc: 0.62197, mcc: 0.32037, sn: 0.29784, sp: 0.94609, auc: 0.74477; test loss: 0.65485, acc: 0.66000, mcc: 0.38316, sn: 0.38500, sp: 0.93500, auc: 0.81255;

epoch: 1; train loss:0.62076, acc: 0.71260, mcc: 0.47491, sn: 0.48989, sp: 0.93531, auc: 0.83545; test loss: 0.63170, acc: 0.71000, mcc: 0.44836, sn: 0.53500, sp: 0.88500, auc: 0.81945;

epoch: 2; train loss:0.58704, acc: 0.74191, mcc: 0.51207, sn: 0.57817, sp: 0.90566, auc: 0.84257; test loss: 0.61742, acc: 0.72500, mcc: 0.46355, sn: 0.60500, sp: 0.84500, auc: 0.82000;

epoch: 3; train loss:0.56027, acc: 0.76213, mcc: 0.53609, sn: 0.65768, sp: 0.86658, auc: 0.84889; test loss: 0.61011, acc: 0.73000, mcc: 0.47267, sn: 0.61500, sp: 0.84500, auc: 0.82102;

epoch: 4; train loss:0.53993, acc: 0.75371, mcc: 0.52982, sn: 0.60984, sp: 0.89757, auc: 0.84901; test loss: 0.59760, acc: 0.74500, mcc: 0.49724, sn: 0.66000, sp: 0.83000, auc: 0.82075;

epoch: 5; train loss:0.51921, acc: 0.76853, mcc: 0.54429, sn: 0.6