In [1]:
import torch
from transformers import BertForSequenceClassification, DNATokenizer, BertConfig



In [2]:
# import from original DNABERT prediction module

from run_finetune import MODEL_CLASSES, ALL_MODELS, processors
from run_finetune import load_and_cache_examples
from run_finetune import SequentialSampler, DataLoader, RandomSampler

In [None]:
path_to_test_csv = '/pathToTestCSV'
path_to_model = '/pathToFinetunedModel'

# modified multi-species classification task
task_name = 'dna-genome-classification'

In [3]:
#Initialize multispecies DNABERT model with the same parameters as original DNABERT

import argparse
parser = argparse.ArgumentParser()

# Required parameters
parser.add_argument(
    "--data_dir",
    default=path_to_test_csv,
    type=str,
    help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
)
parser.add_argument(
    "--model_type",
    default='dna',
    type=str,
    help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
)
parser.add_argument(
    "--n_process",
    default=2,
    type=int,
    help="number of processes used for data process",
)
parser.add_argument(
    "--should_continue", action="store_true", help="Whether to continue from latest checkpoint in output_dir"
)
parser.add_argument(
    "--model_name_or_path",
    default=path_to_model,
    type=str,
    help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
)
parser.add_argument(
    "--task_name",
    default=task_name,
    type=str,
    help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
)
parser.add_argument(
    "--output_dir",
    default="output",
    type=str,
    help="The output directory where the model predictions and checkpoints will be written.",
)


# Other parameters
parser.add_argument(
    "--visualize_data_dir",
    default=None,
    type=str,
    help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
)
parser.add_argument(
    "--result_dir",
    default=None,
    type=str,
    help="The directory where the dna690 and mouse will save results.",
)
parser.add_argument(
    "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name",
)
parser.add_argument(
    "--tokenizer_name",
    default="",
    type=str,
    help="Pretrained tokenizer name or path if not the same as model_name",
)
parser.add_argument(
    "--cache_dir",
    default="",
    type=str,
    help="Where do you want to store the pre-trained models downloaded from s3",
)
parser.add_argument(
    "--predict_dir",
    default=None,
    type=str,
    help="The output directory of predicted result. (when do_predict)",
)
parser.add_argument(
    "--max_seq_length",
    default=150,
    type=int,
    help="The maximum total input sequence length after tokenization. Sequences longer "
    "than this will be truncated, sequences shorter will be padded.",
)
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
parser.add_argument("--do_predict", action="store_true", help="Whether to do prediction on the given dataset.")
parser.add_argument("--do_visualize", action="store_true", help="Whether to calculate attention score.")
parser.add_argument("--visualize_train", action="store_true", help="Whether to visualize train.tsv or dev.tsv.")
parser.add_argument("--do_ensemble_pred", action="store_true", help="Whether to do ensemble prediction with kmer 3456.")
parser.add_argument(
    "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step.",
)
parser.add_argument(
    "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.",
)

parser.add_argument(
    "--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.",
)
parser.add_argument(
    "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.",
)
parser.add_argument(
    "--per_gpu_pred_batch_size", default=8, type=int, help="Batch size per GPU/CPU for prediction.",
)
parser.add_argument(
    "--early_stop", default=0, type=int, help="set this to a positive integet if you want to perfrom early stop. The model will stop \
                                                if the auc keep decreasing early_stop times",
)
parser.add_argument(
    "--predict_scan_size",
    type=int,
    default=1,
    help="Number of updates steps to accumulate before performing a backward/update pass.",
)
parser.add_argument(
    "--gradient_accumulation_steps",
    type=int,
    default=1,
    help="Number of updates steps to accumulate before performing a backward/update pass.",
)
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
parser.add_argument("--beta1", default=0.9, type=float, help="Beta1 for Adam optimizer.")
parser.add_argument("--beta2", default=0.999, type=float, help="Beta2 for Adam optimizer.")
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument("--attention_probs_dropout_prob", default=0.1, type=float, help="Dropout rate of attention.")
parser.add_argument("--hidden_dropout_prob", default=0.1, type=float, help="Dropout rate of intermidiete layer.")
parser.add_argument("--rnn_dropout", default=0.0, type=float, help="Dropout rate of intermidiete layer.")
parser.add_argument("--rnn", default="lstm", type=str, help="What kind of RNN to use")
parser.add_argument("--num_rnn_layer", default=2, type=int, help="Number of rnn layers in dnalong model.")
parser.add_argument("--rnn_hidden", default=768, type=int, help="Number of hidden unit in a rnn layer.")
parser.add_argument(
    "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.",
)
parser.add_argument(
    "--max_steps",
    default=-1,
    type=int,
    help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
)
parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
parser.add_argument("--warmup_percent", default=0, type=float, help="Linear warmup over warmup_percent*total_steps.")

parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
parser.add_argument(
    "--save_total_limit",
    type=int,
    default=None,
    help="Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default",
)
parser.add_argument(
    "--eval_all_checkpoints",
    action="store_true",
    help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
)
parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
parser.add_argument(
    "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory",
)
parser.add_argument(
    "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets",
)
parser.add_argument(
    "--visualize_models", type=int, default=None, help="The model used to do visualization. If None, use 3456.",
)
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")


parser.add_argument(
    "--fp16",
    action="store_true",
    help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
)
parser.add_argument(
    "--fp16_opt_level",
    type=str,
    default="O1",
    help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
    "See details at https://nvidia.github.io/apex/amp.html",
)
parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")


args, _ = parser.parse_known_args()

In [4]:
device = torch.device("cuda", 0)

# Code for checking memory usage
def device_mem():
    free_memory, total_memory = torch.cuda.mem_get_info(device)
    
    print(f'mem allocated: {(total_memory - free_memory)/(1073741824)} GB')
    print(f'mem free: {free_memory/(1073741824)} GB')

In [5]:
# Initialize DNABERT model to output hidden states
model = BertForSequenceClassification.from_pretrained('checkpoint-176000', output_hidden_states=True)
tokenizer = DNATokenizer.from_pretrained('checkpoint-176000', do_lower_case=args.do_lower_case)

device_mem()

<class 'transformers.tokenization_dna.DNATokenizer'>
mem allocated: 6.6700439453125 GB
mem free: 5.24249267578125 GB


In [7]:
# load test dataset
test_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=True)

In [9]:
# set manual seed for reproducibility
torch.manual_seed(0)

# initialize sampler to sample randomly without replacement
pred_sampler = RandomSampler(test_dataset, replacement=False)
pred_dataloader = DataLoader(test_dataset, sampler=pred_sampler, batch_size=1)

In [10]:
#test_input = tokenizer.encode(["AAAAAA", "CGGGCC", "CGGGCC", "CGGGCC"], return_tensors='pt')

In [11]:
device_mem()

model.eval()
model = model.to(device)


device_mem()

mem allocated: 6.6700439453125 GB
mem free: 5.24249267578125 GB
mem allocated: 7.0567626953125 GB
mem free: 4.85577392578125 GB


In [None]:
# generate outputs and the final layers of the hidden states
# store them in preds array

from tqdm import tqdm
import numpy as np
preds = []

num_batches = 0

for batch in tqdm(pred_dataloader):
    num_batches += 1
    if num_batches == 10000:
        break
    # device_mem()
    batch = tuple(t.to(device) for t in batch)
    with torch.no_grad():
        output = model(input_ids=batch[0], attention_mask=batch[1])
        ground_truth = batch[3].detach().cpu().numpy()
        predictions = output[0].detach().cpu().numpy()
        # hidden_state = output[1][-1].detach().cpu().numpy()
        hidden_state = output[1][-1]
        '''
        for hidden_tensor in output[1]:
            
            
            
            if hidden_state is None:
                hidden_state = hidden_tensor.detach().cpu().numpy()
            else:
                hidden_state = np.vstack((hidden_state, hidden_tensor.detach().cpu().numpy()))
                
        '''

        preds.append([ground_truth, predictions, hidden_state])
        #preds.append([ground_truth, predictions])
    batch = tuple(t.to('cpu') for t in batch)



  4%|▍         | 2947/66666 [00:27<09:44, 108.92it/s]

In [42]:
len(preds[0][2][0])

150

In [46]:
# every element in the array contains: true label, probabilities for each species, and final layer of hidden state

preds[0]

[array([6]),
 array([[-5.7757115 ,  0.50725484, -5.3489494 , -5.795606  , -3.8426554 ,
         -5.2826967 ,  8.869636  , -1.574056  ,  2.6732917 , -1.971483  ,
         -3.802625  , -4.2452264 , -2.2954338 , -1.6202209 , -4.0202074 ,
         -4.16784   , -3.2110126 , -4.9556484 , -4.660567  , -2.9408958 ,
         -2.1119242 , -3.3046687 , -2.8836193 , -4.597231  , -1.5807074 ,
         -1.5889636 ]], dtype=float32),
 tensor([[[ 1.4950,  0.1428, -1.0611,  ...,  0.4944, -1.7593, -0.1664],
          [ 0.7785,  0.4851, -0.7086,  ..., -0.6719, -0.7252,  0.5714],
          [ 0.5982,  0.4157, -0.1231,  ..., -1.6310,  0.4971, -0.0163],
          ...,
          [ 2.0788, -0.0275, -1.8271,  ...,  1.4939, -0.5857, -0.4547],
          [ 2.0787, -0.0275, -1.8271,  ...,  1.4940, -0.5856, -0.4547],
          [ 2.0787, -0.0276, -1.8271,  ...,  1.4939, -0.5856, -0.4547]]],
        device='cuda:0')]

In [13]:
# save array in pt file

torch.save(preds, "outputs.pt")