In [1]:
import torch
from transformers import BertForSequenceClassification, DNATokenizer, BertConfig



In [2]:
from run_finetune import MODEL_CLASSES, ALL_MODELS, processors

In [3]:
import argparse
parser = argparse.ArgumentParser()

# Required parameters
# Each option below carries a default, so the parser can be used without any
# command-line input. Several help strings list the choices exported by
# run_finetune (MODEL_CLASSES, ALL_MODELS, processors).
parser.add_argument(
    "--data_dir",
    type=str,
    default='/home/015861469/DNABERT-genome-classification/examples/sample_data/ft/6-2',
    help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
)
parser.add_argument(
    "--model_type",
    type=str,
    default='dna',
    help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
)
parser.add_argument(
    "--n_process",
    type=int,
    default=2,
    help="number of processes used for data process",
)
parser.add_argument(
    "--should_continue",
    action="store_true",
    help="Whether to continue from latest checkpoint in output_dir",
)
parser.add_argument(
    "--model_name_or_path",
    type=str,
    default='/home/015861469/DNABERT-genome-classification/examples/finetuned-model',
    help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
)
parser.add_argument(
    "--task_name",
    type=str,
    default='dna-genome-classification',
    help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
)
parser.add_argument(
    "--output_dir",
    type=str,
    default='/home/015861469/dnabert-finetune2/output',
    help="The output directory where the model predictions and checkpoints will be written.",
)


# Other parameters
# Optional settings; every one of them has a default, so the notebook runs
# without any command-line input.

# --- Paths and names -------------------------------------------------------
parser.add_argument(
    "--visualize_data_dir",
    default=None,
    type=str,
    help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
)
parser.add_argument(
    "--result_dir",
    default=None,
    type=str,
    help="The directory where the dna690 and mouse will save results.",
)
parser.add_argument(
    "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name",
)
parser.add_argument(
    "--tokenizer_name",
    default="",
    type=str,
    help="Pretrained tokenizer name or path if not the same as model_name",
)
parser.add_argument(
    "--cache_dir",
    default="",
    type=str,
    help="Where do you want to store the pre-trained models downloaded from s3",
)
parser.add_argument(
    "--predict_dir",
    default=None,
    type=str,
    help="The output directory of predicted result. (when do_predict)",
)
parser.add_argument(
    "--max_seq_length",
    default=150,
    type=int,
    help="The maximum total input sequence length after tokenization. Sequences longer "
    "than this will be truncated, sequences shorter will be padded.",
)

# --- Mode switches (all default to False) ----------------------------------
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
parser.add_argument("--do_predict", action="store_true", help="Whether to do prediction on the given dataset.")
parser.add_argument("--do_visualize", action="store_true", help="Whether to calculate attention score.")
parser.add_argument("--visualize_train", action="store_true", help="Whether to visualize train.tsv or dev.tsv.")
parser.add_argument("--do_ensemble_pred", action="store_true", help="Whether to do ensemble prediction with kmer 3456.")
parser.add_argument(
    "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step.",
)
parser.add_argument(
    "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.",
)

# --- Batch sizes and stopping ----------------------------------------------
parser.add_argument(
    "--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.",
)
parser.add_argument(
    "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.",
)
parser.add_argument(
    "--per_gpu_pred_batch_size", default=8, type=int, help="Batch size per GPU/CPU for prediction.",
)
# The help text is built with implicit string concatenation; the previous
# backslash continuation embedded the next line's indentation in the string.
parser.add_argument(
    "--early_stop",
    default=0,
    type=int,
    help="set this to a positive integer if you want to perform early stop. The model will stop "
    "if the AUC keeps decreasing early_stop times",
)
# NOTE(review): the previous help text was a copy of the one for
# --gradient_accumulation_steps. The exact semantics of this option live in
# run_finetune and are not visible here -- confirm the wording below.
parser.add_argument(
    "--predict_scan_size",
    type=int,
    default=1,
    help="Scan size used when running prediction.",
)
parser.add_argument(
    "--gradient_accumulation_steps",
    type=int,
    default=1,
    help="Number of updates steps to accumulate before performing a backward/update pass.",
)

# --- Optimizer and regularisation ------------------------------------------
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
parser.add_argument("--beta1", default=0.9, type=float, help="Beta1 for Adam optimizer.")
parser.add_argument("--beta2", default=0.999, type=float, help="Beta2 for Adam optimizer.")
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument("--attention_probs_dropout_prob", default=0.1, type=float, help="Dropout rate of attention.")
parser.add_argument("--hidden_dropout_prob", default=0.1, type=float, help="Dropout rate of intermediate layer.")
# NOTE(review): the previous help text duplicated --hidden_dropout_prob;
# reworded from the option name -- confirm against run_finetune.
parser.add_argument("--rnn_dropout", default=0.0, type=float, help="Dropout rate used by the RNN.")
parser.add_argument("--rnn", default="lstm", type=str, help="What kind of RNN to use")
parser.add_argument("--num_rnn_layer", default=2, type=int, help="Number of rnn layers in dnalong model.")
parser.add_argument("--rnn_hidden", default=768, type=int, help="Number of hidden unit in a rnn layer.")

# --- Schedule --------------------------------------------------------------
parser.add_argument(
    "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.",
)
parser.add_argument(
    "--max_steps",
    default=-1,
    type=int,
    help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
)
parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
parser.add_argument("--warmup_percent", default=0, type=float, help="Linear warmup over warmup_percent*total_steps.")

# --- Logging, checkpoints and caches ---------------------------------------
parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
parser.add_argument(
    "--save_total_limit",
    type=int,
    default=None,
    help="Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default",
)
parser.add_argument(
    "--eval_all_checkpoints",
    action="store_true",
    help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
)
parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
parser.add_argument(
    "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory",
)
parser.add_argument(
    "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets",
)
parser.add_argument(
    "--visualize_models", type=int, default=None, help="The model used to do visualization. If None, use 3456.",
)
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")


# --- Mixed precision, distributed training and remote debugging ------------
parser.add_argument(
    "--fp16",
    action="store_true",
    help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
)
# A separating space was missing between the two concatenated sentences.
parser.add_argument(
    "--fp16_opt_level",
    type=str,
    default="O1",
    help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
    "See details at https://nvidia.github.io/apex/amp.html",
)
parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")


# parse_known_args (rather than parse_args) returns unrecognised command-line
# tokens in a second list instead of exiting with an error; that list is
# discarded here, so `args` holds the defaults declared above plus any
# recognised overrides.
args, _ = parser.parse_known_args()

In [4]:
# Use the first CUDA device when one is present; otherwise fall back to the CPU
# so the following cells still run instead of failing when the model is moved.
device = torch.device("cuda", 0) if torch.cuda.is_available() else torch.device("cpu")


def device_mem(target=None):
    """Print the used and free memory of a CUDA device, in GiB.

    Args:
        target: Device to query (a ``torch.device`` or anything accepted by the
            ``torch.device`` constructor). Defaults to the notebook-level
            ``device``.

    Returns:
        None. Two lines are printed for a CUDA device; a single explanatory
        line is printed for any other device type.

    Note:
        The "allocated" figure is computed as total minus free from
        ``torch.cuda.mem_get_info``, i.e. it is device-wide usage rather than
        only the tensors created by this notebook.
    """
    if target is None:
        target = device
    elif not isinstance(target, torch.device):
        target = torch.device(target)

    # mem_get_info only makes sense for CUDA devices; report that instead of raising.
    if target.type != "cuda":
        print(f'memory statistics unavailable: {target} is not a CUDA device')
        return

    bytes_per_gib = 1024 ** 3  # 1073741824
    free_memory, total_memory = torch.cuda.mem_get_info(target)

    print(f'mem allocated: {(total_memory - free_memory)/bytes_per_gib} GB')
    print(f'mem free: {free_memory/bytes_per_gib} GB')

In [5]:



# Load the classifier and its tokenizer from the local 'checkpoint-176000'
# directory. output_hidden_states=True is forwarded to from_pretrained so the
# model is configured to expose its hidden states as well.
model = BertForSequenceClassification.from_pretrained(
    'checkpoint-176000',
    output_hidden_states=True,
)
tokenizer = DNATokenizer.from_pretrained(
    'checkpoint-176000',
    do_lower_case=args.do_lower_case,
)

# Memory snapshot taken before the model is moved to the device.
device_mem()

<class 'transformers.tokenization_dna.DNATokenizer'>
mem allocated: 0.8946533203125 GB
mem free: 11.01788330078125 GB


In [6]:
from run_finetune import load_and_cache_examples

In [7]:
# Build the dataset for the configured task.
# NOTE(review): evaluate=True presumably selects the evaluation split -- confirm in run_finetune.
test_dataset = load_and_cache_examples(
    args,
    args.task_name,
    tokenizer,
    evaluate=True,
)

In [8]:
from run_finetune import SequentialSampler, DataLoader

In [9]:


# Walk the dataset with a sequential sampler, one example per batch.
pred_sampler = SequentialSampler(test_dataset)
pred_dataloader = DataLoader(
    test_dataset,
    batch_size=1,
    sampler=pred_sampler,
)

In [10]:
#test_input = tokenizer.encode(["AAAAAA", "CGGGCC", "CGGGCC", "CGGGCC"], return_tensors='pt')

In [11]:
# Memory snapshot before the move.
device_mem()

# Put the model in evaluation mode and place it on `device`;
# eval() returns the module itself, so the two calls can be chained.
model = model.eval().to(device)

# Memory snapshot after the move.
device_mem()

mem allocated: 0.8946533203125 GB
mem free: 11.01788330078125 GB
mem allocated: 1.2813720703125 GB
mem free: 10.63116455078125 GB


In [None]:
from tqdm import tqdm
import numpy as np

# One [ground_truth, predictions] pair per batch; both entries are NumPy arrays.
preds = []

# NOTE(review): never updated by the loop; kept only because a later cell may read it.
num_batches = 0

# One no_grad context around the whole loop: no autograd graph is recorded for
# any forward pass, so the outputs can be converted to NumPy without detach().
with torch.no_grad():
    for batch in tqdm(pred_dataloader):
        # Only the two tensors the model consumes are copied to the device.
        # The previous version moved every tensor of the batch to the device
        # and then copied the whole batch back to the CPU after each step,
        # although that copy was never used.
        output = model(
            input_ids=batch[0].to(device),
            attention_mask=batch[1].to(device),
        )

        # batch[3] is used as the ground-truth label; .cpu() is a no-op when
        # the tensor already lives on the host.
        ground_truth = batch[3].cpu().numpy()
        # output[0] is stored as the prediction; the remaining outputs
        # (e.g. hidden states) are deliberately not kept.
        predictions = output[0].cpu().numpy()

        preds.append([ground_truth, predictions])



 30%|███       | 20194/66666 [03:03<06:58, 110.96it/s]

In [18]:


# np.argmax(preds[0][0])
# Number of entries collected in `preds` so far.
len(preds)


7932

In [None]:
# Persist the collected `preds` list to disk.
# NOTE(review): the file name mentions the last hidden layer, but the loop that
# fills `preds` appends only [ground_truth, predictions] -- confirm the name is
# still the intended one.
torch.save(preds, "allsampleswithlasthiddenlayer.pt")

In [20]:
# Inspect the first collected entry.
# NOTE(review): the recorded output below shows a third element (a CUDA tensor),
# while the current loop appends only two items per entry -- the output looks
# stale and should be regenerated.
preds[0]

[array([10]),
 array([[-1.3957361 , -1.4652294 , -0.5089347 , -1.3103321 , -2.1526408 ,
         -2.4359052 , -0.59401613, -1.7563827 ,  0.18181531, -1.847075  ,
          0.10288739, -1.6903058 , -1.1824719 ,  0.83473694, -0.845769  ,
         -1.716885  , -0.39154914,  1.0610424 ,  0.41459253, -2.7450264 ,
         -1.6299285 ,  0.36574784, -1.9848994 , -1.8938447 ,  9.199543  ,
          0.62960726]], dtype=float32),
 tensor([[[-0.6737,  1.8087, -1.0561,  ..., -1.5085, -0.6884,  0.1325],
          [-2.0218,  0.1378,  0.0759,  ..., -1.6328,  0.1259, -1.3913],
          [-1.5478, -0.1952,  0.5750,  ..., -0.4590,  1.3949,  0.4827],
          ...,
          [ 0.8589,  0.9341, -1.8443,  ..., -0.3697,  0.1014,  0.4760],
          [ 0.8588,  0.9342, -1.8443,  ..., -0.3696,  0.1015,  0.4761],
          [ 0.8589,  0.9341, -1.8443,  ..., -0.3697,  0.1015,  0.4761]]],
        device='cuda:0')]

In [None]:
'''
with torch.no_grad():
    outputs = model(test_dataset)
'''

In [43]:
# outputs[1][0].shape

torch.Size([1, 6, 768])

In [39]:
# len(outputs[1])

13

In [44]:
# outputs[]

(tensor([[-4.9895, -1.3488, -6.2393, -6.4248, -4.7668, -5.7695,  9.8707, -1.6136,
           0.6870, -1.3251, -3.2916, -4.1163, -3.1736,  1.0745, -3.7624, -3.9735,
          -1.6632, -4.4030, -4.4965, -4.8065, -3.5700, -3.2713, -3.9811, -4.3459,
          -2.2340, -2.1067]]),
 (tensor([[[-0.1898, -2.4788, -1.2954,  ..., -1.0075,  0.4486, -0.3974],
           [-0.4983, -0.4832,  0.4338,  ..., -0.1776, -1.0141,  1.0393],
           [-1.2430, -0.0303, -1.8768,  ...,  0.1568, -1.0915, -0.1246],
           [-0.0042, -0.9437, -2.1916,  ...,  0.2222, -1.6541, -0.1496],
           [-0.9131, -0.3854, -0.8939,  ...,  0.2014, -0.7817, -0.8353],
           [-0.3086, -0.1756, -0.7787,  ...,  0.5481,  0.8922, -1.9683]]]),
  tensor([[[-0.1679, -2.2226,  0.2550,  ..., -0.6580, -0.8625, -0.1905],
           [-1.0089, -1.0270,  0.1991,  ..., -0.8095, -0.0135,  1.5513],
           [-0.7402, -1.4995, -0.4611,  ..., -0.9995,  0.2757,  1.4518],
           [-0.1376, -1.9133, -0.5152,  ..., -1.0038, -0.0444, 

In [29]:
# outputs.hidden_states

NameError: name 'outputs' is not defined

In [5]:
# lastlayer = model.base_model.encoder.layer[-1]

In [33]:
# lastlayer = model.base_model

In [34]:
# torch.save(lastlayer.state_dict(), "last_layer.pt")

In [13]:
# model.base_model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(4101, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )