In [None]:
import os
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from copy import deepcopy
import gc
import csv   
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from transformers import  get_linear_schedule_with_warmup
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning import LightningModule, Trainer, seed_everything
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss, matthews_corrcoef, roc_auc_score, f1_score
from scipy.special import softmax
from Bio import SeqIO

from preprocess.utils import Sequence, DNASequence, KmerSequence, compute_all_metrics
from model.bertnup import Dnabert1Dataset, BertNup
from model.trainer import k_fold_cv

In [None]:
from torch import cuda

# Select the compute device once: 'cuda' when torch reports CUDA as available,
# 'cpu' otherwise. The chosen name is echoed in the cell output.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

In [None]:
# Export TOKENIZERS_PARALLELISM="false" for this process
# (presumably to avoid the tokenizers parallelism/fork warning - TODO confirm).
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

import logging
import transformers

# Keep cell output readable: transformers reports errors only, and the
# "pytorch_lightning" logger is raised to ERROR level as well.
transformers.utils.logging.set_verbosity_error()
lightning_logger = logging.getLogger("pytorch_lightning")
lightning_logger.setLevel(logging.ERROR)

In [None]:
import random


def set_seed(seed=None):
    """Seed every RNG used by the notebook and enable deterministic cuDNN mode.

    Args:
        seed: Integer seed applied to Python's ``random`` module, NumPy's
            global RNG, and torch (CPU generator and all CUDA devices).
            ``0`` is a valid seed and is treated like any other integer.
            When ``None`` (or omitted), nothing is seeded and cuDNN's
            deterministic flag is switched off instead.

    Returns:
        None. The function only mutates global RNG / backend state.
    """
    if seed is None:
        # Explicit ``None`` check rather than truthiness: a truthiness test
        # would make ``set_seed(0)`` silently skip seeding.
        torch.backends.cudnn.deterministic = False
        return

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every visible GPU.
    torch.cuda.manual_seed_all(seed)
    # Request deterministic cuDNN algorithms and disable the benchmark
    # autotuner (see the PyTorch reproducibility notes for both flags).
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(0)

### Hyperparameters

In [None]:
# Epoch count handed to k_fold_cv as `epochs`.
EPOCHS: int = 10
# Learning rate handed to k_fold_cv as `learning_rate`.
LEARNING_RATE: float = 2e-5

In [None]:
# Loader settings forwarded to k_fold_cv as `train_params`
# (presumably DataLoader keyword arguments - confirm in model.trainer).
train_params = dict(batch_size=32, shuffle=True, num_workers=0)

# Loader settings forwarded to k_fold_cv as `test_params`: larger batches,
# no shuffling.
test_params = dict(batch_size=128, shuffle=False, num_workers=0)

### Training

In [None]:
# Dataset identifiers, listed in the order the training loop visits them.
# NOTE(review): the DM_/HS_/Y_ prefixes and 5U/LC/PM/WG suffixes presumably
# encode organism and genomic region - confirm against the data directory.
all_data = [
    'Hsapiens', 'Celegans', 'Dmelanogaster',
    'DM_5U', 'DM_LC', 'DM_PM',
    'HS_5U', 'HS_LC', 'HS_PM',
    'Y_PM', 'Y_WG',
]

In [None]:
# Run k_fold_cv once for every (k-mer size, dataset) pair. Each k uses its own
# pretrained checkpoint name and its own results directory.
for k in (3, 4, 5, 6):
    pretrained_model_name = f'armheb/DNA_bert_{k}'
    for data_name in all_data:
        print(f'DNABERT-1-{k} for {data_name}')
        k_fold_cv(
            data_name=data_name,
            result_dir=f'Results/DNABERT-1-{k}',
            pretrained_model_name=pretrained_model_name,
            kmer=k,
            train_params=train_params,
            test_params=test_params,
            device=device,
            learning_rate=LEARNING_RATE,
            epochs=EPOCHS,
        )