In [1]:
import os
import copy
import torch
import argparse
import numpy as np
import pandas as pd
from util import * 
from model import *
from pandas import DataFrame
from transformers import get_linear_schedule_with_warmup

# Restrict this process to a single GPU. setdefault keeps any CUDA_VISIBLE_DEVICES
# value the caller already exported (e.g. on a single-GPU host where only index 0
# exists) instead of overwriting it; when nothing is set it behaves as before.
# The variable only takes effect if it is set before CUDA is first initialised.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "1")

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run configuration. The resulting `args` namespace is handed to getData() and
# its model/dataset paths are overwritten per k-mer further down the notebook.
parser = argparse.ArgumentParser()
parser.add_argument('--epoches',              type=int,  default=30,  help='')
parser.add_argument('--batch_size',           type=int,  default=16,  help='')
parser.add_argument('--max_length',           type=int,  default=200, help='')
parser.add_argument('--learning_rate',        type=float, default=1e-4, help="")
# Directory name is "DNA-BERT" (the previous default had the hyphen misplaced as
# "DNAB-ERT", which does not match the path used anywhere else in the notebook).
parser.add_argument('--model_path',           type=str,  default="../DNA-BERT/3-new-12w-0", help='')
parser.add_argument('--ind_filename',  type=str,  default="../dataset/enhancer_3-mer_DNABERT_ind.txt", help='')
parser.add_argument('--tra_filename',  type=str,  default="../dataset/enhancer_3-mer_DNABERT_tra.txt", help='')

# Parse an explicit empty argv so the defaults above are used and the Jupyter
# kernel's own command-line arguments are not fed to the parser.
args = parser.parse_args(args=[])

In [3]:
# L2-normalization and average
# Run every fine-tuned k-mer model over the training and independent sets, print
# its metrics, and keep each model's outputs so they can be averaged afterwards.
mers = [3, 4, 5, 6]

ind_pre_label_list, ind_real_label_list = [], []
tra_pre_label_list, tra_real_label_list = [], []

for mer in mers:

    # Point the shared args at this k-mer's pretrained weights and datasets.
    # NOTE(review): the recorded output of this cell shows "../DNA-BERT/..." (one
    # directory level up, not two) — confirm which location is current.
    args.model_path = "../../DNA-BERT/{}-new-12w-0".format(mer)
    args.ind_filename = "../dataset/enhancer_{}-mer_DNABERT_ind.txt".format(mer)
    args.tra_filename = "../dataset/enhancer_{}-mer_DNABERT_tra.txt".format(mer)
    # shuffle=False presumably keeps sample order fixed so the per-model outputs
    # line up element-wise for the later averaging — verify in getData.
    ind_dataloader = getData(args, split=False, validation=True, shuffle=False)
    tra_dataloader = getData(args, split=False, validation=False, shuffle=False)

    # ensemble for average_L2
    model = C_Bert_2FC_average.from_pretrained(args.model_path, num_labels=1).to(device)
    # map_location lets a checkpoint saved on a GPU load when `device` is the CPU
    # (or a different GPU index) instead of failing during deserialisation.
    model_dict = torch.load(
        "fine-trained_model/C_Bert_2FC_average_{}-mer.pt".format(mer),
        map_location=device,
    )
    model.load_state_dict(model_dict)
    # Inference only: switch off dropout/batch-norm updates so predictions are
    # deterministic. Harmless if prediction() already does this itself.
    model.eval()

    # prediction in training dataset
    tra_pre_labels, tra_real_labels = prediction(model, tra_dataloader, kmer=mer)
    tra_pre_label_list.append(tra_pre_labels)
    tra_real_label_list.append(tra_real_labels)

    acc, mcc, sn, sp = evaluation_criterion(tra_pre_labels, tra_real_labels)
    print("tra--mer: {}; acc:{:.4f}, sn:{:.4f}, sp:{:.4f}, mcc:{:.4f}".format(mer, acc, sn, sp, mcc))

    # prediction in independent dataset
    ind_pre_labels, ind_real_labels = prediction(model, ind_dataloader, kmer=mer)
    ind_pre_label_list.append(ind_pre_labels)
    ind_real_label_list.append(ind_real_labels)

    acc, mcc, sn, sp = evaluation_criterion(ind_pre_labels, ind_real_labels)
    print("ind--mer: {}; acc:{:.4f}, sn:{:.4f}, sp:{:.4f}, mcc:{:.4f}".format(mer, acc, sn, sp, mcc))

Some weights of the model checkpoint at ../DNA-BERT/3-new-12w-0 were not used when initializing C_Bert_2FC_average: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing C_Bert_2FC_average from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing C_Bert_2FC_average from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of C_Bert_2FC_average were not initialized from the model checkpoint at ../DNA-BERT/3-new-12w-0 and are newly initialized: ['classifier1.

tra--mer: 3; acc:0.9535, sn:0.9474, sp:0.9596, mcc:0.9071
ind--mer: 3; acc:0.8075, sn:0.7900, sp:0.8250, mcc:0.6154


Some weights of the model checkpoint at ../DNA-BERT/4-new-12w-0 were not used when initializing C_Bert_2FC_average: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing C_Bert_2FC_average from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing C_Bert_2FC_average from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of C_Bert_2FC_average were not initialized from the model checkpoint at ../DNA-BERT/4-new-12w-0 and are newly initialized: ['classifier1.

tra--mer: 4; acc:0.9151, sn:0.8875, sp:0.9427, mcc:0.8315
ind--mer: 4; acc:0.7975, sn:0.7550, sp:0.8400, mcc:0.5972


Some weights of the model checkpoint at ../DNA-BERT/5-new-12w-0 were not used when initializing C_Bert_2FC_average: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing C_Bert_2FC_average from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing C_Bert_2FC_average from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of C_Bert_2FC_average were not initialized from the model checkpoint at ../DNA-BERT/5-new-12w-0 and are newly initialized: ['classifier1.

tra--mer: 5; acc:0.9394, sn:0.9353, sp:0.9434, mcc:0.8787
ind--mer: 5; acc:0.8075, sn:0.7900, sp:0.8250, mcc:0.6154


Some weights of the model checkpoint at ../DNA-BERT/6-new-12w-0 were not used when initializing C_Bert_2FC_average: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing C_Bert_2FC_average from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing C_Bert_2FC_average from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of C_Bert_2FC_average were not initialized from the model checkpoint at ../DNA-BERT/6-new-12w-0 and are newly initialized: ['classifier1.

tra--mer: 6; acc:0.9104, sn:0.8996, sp:0.9212, mcc:0.8209
ind--mer: 6; acc:0.7950, sn:0.7650, sp:0.8250, mcc:0.5911


In [4]:
def _average_over_models(pre_label_list, real_label_list):
    """Average per-model outputs element-wise across the ensemble members.

    Args:
        pre_label_list: one entry per model, each holding that model's outputs
            as returned by prediction().
        real_label_list: one entry per model, each holding the matching
            ground-truth labels as returned by prediction().

    Returns:
        (pre_average, real_average): the means over the model axis (axis 0).

    Raises:
        ValueError: if no model outputs were collected, or if the ground-truth
            labels are not identical for every model.
    """
    if len(real_label_list) == 0:
        raise ValueError("no model outputs to average")
    real_stack = np.array(real_label_list)
    # Averaging the labels is only meaningful when every model saw the same
    # samples in the same order; otherwise the mean would silently yield
    # fractional "labels" and meaningless metrics.
    if not (real_stack == real_stack[0]).all():
        raise ValueError("real labels differ between models; cannot ensemble by position")
    return np.array(pre_label_list).mean(axis=0), real_stack.mean(axis=0)


# Ensemble on the training set: average the models' outputs, then score them.
tra_pre_label_average, tra_real_label_average = _average_over_models(tra_pre_label_list, tra_real_label_list)
tra_acc, tra_mcc, tra_sn, tra_sp = evaluation_criterion(tra_pre_label_average, tra_real_label_average)
print("ensemble learning; tra--acc:{:.4f}, sn:{:.4f}, sp:{:.4f}, mcc:{:.4f}".format(tra_acc, tra_sn, tra_sp, tra_mcc))

# Same procedure on the independent set.
ind_pre_label_average, ind_real_label_average = _average_over_models(ind_pre_label_list, ind_real_label_list)
ind_acc, ind_mcc, ind_sn, ind_sp = evaluation_criterion(ind_pre_label_average, ind_real_label_average)
print("ensemble learning; ind--acc:{:.4f}, sn:{:.4f}, sp:{:.4f}, mcc:{:.4f}".format(ind_acc, ind_sn, ind_sp, ind_mcc))

ensemble learning; tra--acc:0.9471, sn:0.9373, sp:0.9569, mcc:0.8944
ensemble learning; ind--acc:0.8300, sn:0.8000, sp:0.8600, mcc:0.6612
