In [None]:
# k-mer length; every path below is derived from it.
KMER = 6
# Fine-tuned model directory for this k-mer length (trailing slash kept).
MODEL_PATH = "./ft/{}/".format(KMER)
# Directory holding the input data for this k-mer length.
DATA_PATH = "sample_data/ft/{}".format(KMER)
# Directory used as `predict_dir` in the run configuration.
PREDICTION_PATH = "./result/{}".format(KMER)

class AttrDict(dict):
    """Dictionary whose items can also be read, set and deleted as attributes.

    Reading a key that is not present through attribute access yields ``None``
    instead of raising, so optional settings can be probed directly
    (``args.some_flag``).

    Double-underscore names that are not stored keys raise ``AttributeError``
    instead. Returning ``None`` for protocol hooks such as ``__getstate__`` or
    ``__deepcopy__`` makes ``hasattr`` report them as present and, on Python
    versions where ``object`` does not define ``__getstate__``, breaks
    ``pickle`` / ``copy.deepcopy`` with "'NoneType' object is not callable".

    Deleting a missing attribute raises ``KeyError`` (from ``dict.__delitem__``).
    """

    def __getattr__(self, name):
        # Only reached when regular attribute lookup has already failed.
        try:
            return self[name]
        except KeyError:
            if name.startswith("__") and name.endswith("__"):
                # Let protocol probing see a genuinely missing attribute.
                raise AttributeError(name) from None
            return None

    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

# Run configuration, accessed as attributes throughout the script below.
# As configured here only prediction is enabled (do_predict=True); training,
# evaluation, visualization and ensemble prediction are switched off.
# Key order is kept stable because the whole mapping is logged later.
args = AttrDict(
    # Model, task and data locations
    model_type="dna",  # one of MODEL_CLASSES
    tokenizer_name="dna" + str(KMER),
    model_name_or_path=MODEL_PATH,
    task_name="dnaprom",
    do_predict=True,
    do_train=False,
    data_dir=DATA_PATH,
    max_seq_length=75,
    per_gpu_pred_batch_size=128,
    output_dir=MODEL_PATH,
    predict_dir=PREDICTION_PATH,
    cache_dir="",
    n_process=2,
    fp16=False,
    seed=42,
    do_lower_case=False,
    overwrite_output_dir=False,
    overwrite_cache=False,
    config_name="",
    visualize_data_dir=None,
    result_dir=None,
    # Run-mode switches
    do_eval=False,
    do_visualize=False,
    visualize_train=False,
    do_ensemble_pred=False,
    # Batch sizes and optimisation hyper-parameters
    per_gpu_train_batch_size=8,
    per_gpu_eval_batch_size=8,
    early_stop=0,
    predict_scan_size=1,
    gradient_accumulation_steps=1,
    learning_rate=5e-5,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    beta1=0.9,
    beta2=0.999,
    max_grad_norm=1.0,
    # Dropout and RNN settings copied onto the model config
    attention_probs_dropout_prob=0.1,
    hidden_dropout_prob=0.1,
    rnn_dropout=0.0,
    rnn="lstm",
    num_rnn_layer=2,
    rnn_hidden=768,
    # Schedule, logging and checkpointing
    num_train_epochs=3.0,
    max_steps=-1,
    warmup_steps=0,
    warmup_percent=0,
    logging_steps=500,
    save_steps=500,
    save_total_limit=None,
    eval_all_checkpoints=False,
    # Hardware, distributed and debugging
    no_cuda=False,
    fp16_opt_level="O1",
    local_rank=-1,
    server_ip="",
    server_port="",
    evaluate_during_training=False,
    should_continue=False,
    visualize_models=None,
)


In [None]:
# When continuing a previous run, load the model from the last entry that
# _sorted_checkpoints() returns; having none to continue from is an error.
if args.should_continue:
    sorted_checkpoints = _sorted_checkpoints(args)
    if len(sorted_checkpoints) == 0:
        raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
    args.model_name_or_path = sorted_checkpoints[-1]

# Refuse to train into an existing, non-empty output directory unless
# overwriting was explicitly requested.
output_dir_in_use = os.path.exists(args.output_dir) and os.listdir(args.output_dir)
if output_dir_in_use and args.do_train and not args.overwrite_output_dir:
    raise ValueError(
        "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
            args.output_dir
        )
    )

# Optional remote debugging: only active when both a server IP and a port are set.
if args.server_ip and args.server_port:
    # See https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
    import ptvsd

    print("Waiting for debugger attach")
    debugger_address = (args.server_ip, args.server_port)
    ptvsd.enable_attach(address=debugger_address, redirect_output=True)
    # Blocks until a debugger client has attached.
    ptvsd.wait_for_attach()

# Setup CUDA, GPU & distributed training.
# Sets `device`, `args.device` and `args.n_gpu`.
if args.local_rank == -1 or args.no_cuda:
    # Single-process mode (or CUDA explicitly disabled).
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    # Report zero GPUs when CUDA is disabled: counting visible devices while
    # running on the CPU would mislead any code that keys off n_gpu.
    # NOTE(review): n_gpu is presumably used by train/evaluate/predict for
    # batch sizing / DataParallel - confirm against those helpers.
    args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
    torch.cuda.set_device(args.local_rank)
    device = torch.device("cuda", args.local_rank)
    torch.distributed.init_process_group(backend="nccl")
    # One GPU per process in distributed mode.
    args.n_gpu = 1
args.device = device

# Configure logging: INFO on the main process (local_rank -1 or 0),
# warnings only on every other rank.
log_level = logging.INFO if args.local_rank in (-1, 0) else logging.WARNING
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=log_level,
)
# Emitted at WARNING level so that it is shown on every rank.
logger.warning(
    "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
    args.local_rank,
    device,
    args.n_gpu,
    bool(args.local_rank != -1),
    args.fp16,
)

# Seed the run from the configuration.
set_seed(args)

# Resolve the task: its name is matched case-insensitively against the
# `processors` registry, which also determines the label set.
args.task_name = args.task_name.lower()
if args.task_name not in processors:
    raise ValueError("Task not found: %s" % args.task_name)
processor_class = processors[args.task_name]
processor = processor_class()
args.output_mode = output_modes[args.task_name]
label_list = processor.get_labels()
num_labels = len(label_list)

# Load pretrained model and tokenizer.
# Sets `config_class`, `model_class`, `tokenizer_class` always, and
# `config`, `tokenizer`, `model` unless a visualization / ensemble run was
# requested (those modes load their own per-k-mer models further down).
if args.local_rank not in [-1, 0]:
    torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

args.model_type = args.model_type.lower()
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

single_model_run = not args.do_visualize and not args.do_ensemble_pred

if single_model_run:
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

    # Override dropout and RNN settings on the loaded config with the run arguments.
    config.hidden_dropout_prob = args.hidden_dropout_prob
    config.attention_probs_dropout_prob = args.attention_probs_dropout_prob
    if args.model_type in ["dnalong", "dnalongcat"]:
        # The long-sequence variants require the length to be a multiple of 512.
        assert args.max_seq_length % 512 == 0
    # Number of 512-token segments (0 when max_seq_length < 512).
    config.split = int(args.max_seq_length/512)
    config.rnn = args.rnn
    config.num_rnn_layer = args.num_rnn_layer
    config.rnn_dropout = args.rnn_dropout
    config.rnn_hidden = args.rnn_hidden

    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    logger.info('finish loading model')

# Rank 0 releases the ranks waiting at the barrier above. This has to happen
# on every path: the other ranks enter that barrier unconditionally, so
# skipping it in visualization / ensemble mode would leave them blocked forever.
if args.local_rank == 0:
    torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

if single_model_run:
    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

# Training: build the training set, run the training loop and report the
# final step count together with the average loss.
if args.do_train:
    train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
    training_outcome = train(args, train_dataset, model, tokenizer)
    global_step, tr_loss = training_outcome
    logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

# Persist the trained model with the default file names so that it can be
# restored through from_pretrained(). Runs only after training, only on the
# main process, and is skipped for the "dna690" task.
if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0) and args.task_name != "dna690":
    # Make sure the target directory exists.
    if not os.path.exists(args.output_dir) and args.local_rank in (-1, 0):
        os.makedirs(args.output_dir)

    logger.info("Saving model checkpoint to %s", args.output_dir)
    # A model wrapped for distributed/parallel training exposes the real
    # model as `.module`; save that one when present.
    model_to_save = getattr(model, "module", model)
    model_to_save.save_pretrained(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)

    # Keep the training arguments next to the weights.
    training_args_file = os.path.join(args.output_dir, "training_args.bin")
    torch.save(args, training_args_file)

    # Reload the fine-tuned model and vocabulary from disk.
    model = model_class.from_pretrained(args.output_dir)
    tokenizer = tokenizer_class.from_pretrained(args.output_dir)
    model.to(args.device)

# Evaluation (main process only): score either the output directory itself or
# every checkpoint found beneath it, collecting all metrics into `results`
# with the checkpoint step appended to each metric name.
results = {}
if args.do_eval and args.local_rank in (-1, 0):
    tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
    if args.eval_all_checkpoints:
        # Every directory below output_dir that contains a weights file.
        weight_files = sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
        checkpoints = [os.path.dirname(weight_file) for weight_file in weight_files]
        logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
    else:
        checkpoints = [args.output_dir]
    logger.info("Evaluate the following checkpoints: %s", checkpoints)
    for checkpoint in checkpoints:
        # The step suffix is only meaningful when several checkpoints are compared.
        if len(checkpoints) > 1:
            global_step = checkpoint.split("-")[-1]
        else:
            global_step = ""
        if "checkpoint" in checkpoint:
            prefix = checkpoint.split("/")[-1]
        else:
            prefix = ""

        model = model_class.from_pretrained(checkpoint)
        model.to(args.device)
        result = evaluate(args, model, tokenizer, prefix=prefix)
        result = {k + "_{}".format(global_step): v for k, v in result.items()}
        results.update(result)

# Prediction (main process only): reload tokenizer and model from the output
# directory and run predict() on them.
predictions = {}
if args.do_predict and args.local_rank in (-1, 0):
    checkpoint = args.output_dir
    tokenizer = tokenizer_class.from_pretrained(checkpoint, do_lower_case=args.do_lower_case)
    logger.info("Predict using the following checkpoint: %s", checkpoint)
    prefix = ""
    model = model_class.from_pretrained(checkpoint)
    model.to(args.device)
    prediction = predict(args, model, tokenizer, prefix=prefix)

# Visualization (main process only): run visualize() for each requested k-mer
# model, sum the attention scores and probabilities, then save the summed
# attention scores and the averaged probabilities under predict_dir.
if args.do_visualize and args.local_rank in (-1, 0):
    if args.visualize_models:
        visualization_models = [args.visualize_models]
    else:
        visualization_models = [3, 4, 5, 6]

    scores = None
    all_probs = None

    for kmer in visualization_models:
        # Each k-mer model lives in its own sub-directory of the ".../690" folder.
        # (Descending into a checkpoint sub-directory is currently disabled.)
        output_dir = args.output_dir.replace("/690", "/690/" + str(kmer))
        checkpoint = output_dir

        tokenizer = tokenizer_class.from_pretrained(
            "dna{}".format(kmer),
            do_lower_case=args.do_lower_case,
            cache_dir=args.cache_dir if args.cache_dir else None,
        )
        logger.info("Calculate attention score using the following checkpoint: %s", checkpoint)
        if "checkpoint" in checkpoint:
            prefix = checkpoint.split("/")[-1]
        else:
            prefix = ""
        config = config_class.from_pretrained(
            output_dir,
            num_labels=num_labels,
            finetuning_task=args.task_name,
            cache_dir=args.cache_dir if args.cache_dir else None,
        )
        # Attention outputs are switched on for this model.
        config.output_attentions = True
        model = model_class.from_pretrained(
            checkpoint,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
            cache_dir=args.cache_dir if args.cache_dir else None,
        )
        model.to(args.device)
        attention_scores, probs = visualize(args, model, tokenizer, prefix=prefix, kmer=kmer)
        if scores is None:
            # First model: start the running sums from copies.
            all_probs = deepcopy(probs)
            scores = deepcopy(attention_scores)
        else:
            all_probs += probs
            scores += attention_scores

    model_count = float(len(visualization_models))
    all_probs = all_probs/model_count
    np.save(os.path.join(args.predict_dir, "atten.npy"), scores)
    np.save(os.path.join(args.predict_dir, "pred_results.npy"), all_probs)

# Ensemble prediction (main process only).
# Evaluates one fine-tuned model per k-mer length, averages their class
# probabilities for the ensemble metrics, and saves the per-model
# probabilities concatenated with the labels (rows shuffled) to
# `args.result_dir` as input for a stacked ensemble.
# Side effect: `args.data_dir` is left pointing at the last k-mer sub-directory.
if args.do_ensemble_pred and args.local_rank in [-1, 0]:

    ensemble_kmers = [3, 4, 5, 6]

    # Validate the target file before any model is loaded; its name selects
    # which split is evaluated.
    result_file = os.path.basename(args.result_dir) if args.result_dir else ""
    if result_file not in ("test.npy", "train.npy"):
        raise ValueError("file name in result_dir should be either test.npy or train.npy")

    # Remember the base directory so each k-mer sub-directory is derived from
    # it directly instead of by string-replacing the previous suffix, which
    # could also rewrite unrelated path components.
    base_data_dir = args.data_dir

    for kmer in ensemble_kmers:
        output_dir = os.path.join(args.output_dir, str(kmer))
        tokenizer = tokenizer_class.from_pretrained(
            "dna"+str(kmer),
            do_lower_case=args.do_lower_case,
            cache_dir=args.cache_dir if args.cache_dir else None,
        )
        checkpoint = output_dir
        logger.info("Calculate attention score using the following checkpoint: %s", checkpoint)
        prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
        config = config_class.from_pretrained(
            output_dir,
            num_labels=num_labels,
            finetuning_task=args.task_name,
            cache_dir=args.cache_dir if args.cache_dir else None,
        )
        config.output_attentions = True
        # Load the weights of *this* k-mer's model. Loading
        # args.model_name_or_path here would reuse one set of weights for
        # every ensemble member while config and tokenizer change per k-mer.
        model = model_class.from_pretrained(
            checkpoint,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
            cache_dir=args.cache_dir if args.cache_dir else None,
        )
        model.to(args.device)
        args.data_dir = os.path.join(base_data_dir, str(kmer))

        if result_file == "test.npy":
            results, eval_task, _, out_label_ids, probs = evaluate(args, model, tokenizer, prefix=prefix)
        else:  # "train.npy"
            results, eval_task, _, out_label_ids, probs = evaluate(args, model, tokenizer, prefix=prefix, evaluate=False)

        if kmer == ensemble_kmers[0]:
            # First model: start the running sum / concatenation from copies.
            all_probs = deepcopy(probs)
            cat_probs = deepcopy(probs)
        else:
            all_probs += probs
            cat_probs = np.concatenate((cat_probs, probs), axis=1)
        print(cat_probs[0])

    # Average over the ensemble members and pick the most probable class.
    all_probs = all_probs / float(len(ensemble_kmers))
    all_preds = np.argmax(all_probs, axis=1)

    # Save labels and per-model probabilities for the stacked ensemble.
    labels = np.array(out_label_ids)
    labels = labels.reshape(labels.shape[0],1)
    data = np.concatenate((cat_probs, labels), axis=1)
    # Shuffle rows in place with NumPy. `random.shuffle` swaps rows through
    # array views and therefore duplicates / loses rows of a 2-D array.
    # NOTE(review): this draws from NumPy's global RNG instead of `random`'s;
    # set_seed presumably seeds both - confirm.
    np.random.shuffle(data)
    root_path = os.path.dirname(args.result_dir)
    if root_path:
        # A bare file name has no parent directory to create.
        os.makedirs(root_path, exist_ok=True)
    np.save(args.result_dir, data)
    ensemble_results = compute_metrics(eval_task, all_preds, out_label_ids, all_probs[:,1])
    logger.info("***** Ensemble results {} *****".format(prefix))
    for key in sorted(ensemble_results.keys()):
        logger.info("  %s = %s", key, str(ensemble_results[key]))






<class 'transformers.tokenization_dna.DNATokenizer'>


  state_dict = torch.load(resolved_archive_file, map_location="cpu")


<class 'transformers.tokenization_dna.DNATokenizer'>
finish loading examples
number of processes for converting feature: 1
1 processor started !


Predicting: 100%|██████████| 8/8 [00:01<00:00,  5.78it/s]
