<a href="https://colab.research.google.com/github/chrom3DEpi/Enhancer_pred/blob/master/human_enhancers_cohn_transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Train Transformer Classifier With Pytorch

This notebook demonstrates how to use `genomic_benchmarks` to fine-tune a pretrained transformer classifier on one of its benchmark datasets, [human_enhancers_cohn](https://github.com/ML-Bioinfo-CEITEC/genomic_benchmarks/tree/main/docs/human_enhancers_cohn).

In [None]:
pip install genomic-benchmarks

Collecting genomic-benchmarks
  Downloading genomic_benchmarks-0.0.5.tar.gz (20 kB)
Collecting biopython>=1.79
  Downloading biopython-1.79-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (2.3 MB)
[K     |████████████████████████████████| 2.3 MB 7.2 MB/s eta 0:00:01
Collecting googledrivedownloader>=0.4
  Downloading googledrivedownloader-0.4-py2.py3-none-any.whl (3.9 kB)
Collecting yarl
  Downloading yarl-1.7.2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (308 kB)
[K     |████████████████████████████████| 308 kB 37.9 MB/s eta 0:00:01
Collecting multidict>=4.0
  Downloading multidict-5.2.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (187 kB)
[K     |████████████████████████████████| 187 kB 38.3 MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: genomic-benchmarks
  Building wheel for genomic-benchmarks (setup.py) ... [?25ldone
[?25h  Created wheel for genomic-ben

In [None]:
!pip install --upgrade torchtext

Collecting torchtext
  Using cached torchtext-0.11.0-cp38-cp38-manylinux1_x86_64.whl (8.0 MB)
Installing collected packages: torchtext
Successfully installed torchtext-0.11.0


In [None]:
import torch
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer

from genomic_benchmarks.dataset_getters.pytorch_datasets import HumanEnhancersCohn
from genomic_benchmarks.models.torch import CNN
from genomic_benchmarks.dataset_getters.utils import coll_factory, LetterTokenizer, build_vocab
from genomic_benchmarks.data_check import info

# Choose the dataset

Download the dataset and create the PyTorch dataset objects for the train and test splits.

In [None]:
from genomic_benchmarks.dataset_getters.pytorch_datasets import HumanEnhancersCohn
from genomic_benchmarks.loc2seq import download_dataset

# Fetch version 0 of the benchmark before building the split objects.
# NOTE(review): force_download=True is passed to all three calls below;
# presumably each one forces a fresh download — confirm this is intended.
download_dataset('human_enhancers_cohn', version=0, force_download=True)

train_dset = HumanEnhancersCohn('train', version=0, force_download=True)
test_dset = HumanEnhancersCohn('test', version=0, force_download=True)

# Each sample is a (sequence, label) pair; show the label sum of each split.
sum(label for _text, label in train_dset), sum(label for _text, label in test_dset)

# Print out information about the dataset

In [None]:
# Summarize version 0 of the dataset: classes, sequence lengths and split sizes.
info("human_enhancers_cohn", 0)

Dataset `human_enhancers_cohn` has 2 classes: negative, positive.

All lenghts of genomic intervals equals 500.

Totally 27791 sequences have been found, 20843 for training and 6948 for testing.


Unnamed: 0,train,test
negative,10422,3474
positive,10421,3474


In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using {device} device')

# Conveniently, all sequences in this dataset have the same length.
# Unused sketch of a collate-based loader, kept for reference only:
# collate = coll_factory(vocabulary, tokenizer, device, pad_to_length = None)
# train_loader = DataLoader(train_dset, batch_size=32, shuffle=True, collate_fn=collate)

Using cuda device


# Model
We load the pretrained `armheb/DNA_bert_6` checkpoint with a sequence-classification head, together with its matching tokenizer.
From the dataset info, we know that all inputs are 500 characters long and that there are 2 classes.

In [None]:
!pip install transformers

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Tokenizer and model come from the same pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained("armheb/DNA_bert_6")

# NOTE(review): the checkpoint config lists BertForMaskedLM as its architecture,
# so the classification head is presumably freshly initialized — check the
# loading warnings to confirm.
model = AutoModelForSequenceClassification.from_pretrained("armheb/DNA_bert_6")
model = model.to(device)

loading configuration file https://huggingface.co/armheb/DNA_bert_6/resolve/main/config.json from cache at /home/xstefan3/.cache/huggingface/transformers/2697389de18c4fe8c3497cea35aaf65130fdd59c3ab64cb6b1c2e0632fefdaf0.3a7e1ca237211e6405270f85616f49989aeee994db35f6593a40c7b5081a50d0
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_rnn_layer": 1,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "rnn": "lstm",
  "rnn_dropout": 0.0,
  "rnn_hidden": 768,
  "split": 10,
  "transformers_version": "4.10.2",
  

## Training

In [None]:
# Peek at the first training sample; element 0 of a sample is its sequence.
train_dset[0][0]

'ATGCCCAGCAACTTCTGATTGGGTCATTGAACAGATATTTATTTATTGATCTCCTACAGTGTTCCAGACACTATTTTGGGTGCAGTGAACATTCTTAGAACTTAGATTCTAATAGAGGTGACTAATCACAAACAATGAGATGGGTAGTATCAGTTAGTGATAAGTGGTACGAAGAAGAAACTTGTACAGGGTGCTGACCTAGATACAGTAGTCAGAGAACACCCCTCTAAAGAGGAGGTATGTAAGTTGGGACCTAAATAGAATGGCAGAGCAAGGGCTGGCCAATGTCTGGGGAAGAGCATTCTAAGAAAAGAGAAACTGCAAGTGCAAAGGCTTTAAGATAGGAGCATGCTTGGTGCATTCAAGGACCAGAAAAGAGGCCTGTCTGTGTTCAACTCAGTGAGCAAGATGTGTTGGTGGAAAGGAGTCCTGAAGGCCAGAACCTGGAGTGTTGAGCAGTGAATGTGCTTTGTCAGGTTCAGTTTAACTCTGTACGCATT'

In [None]:
import itertools

# Length of each non-overlapping chunk cut from a sequence.
offset = 6

# Tokenizing the empty string yields only the two special-token ids that wrap
# every encoded sequence.
bos, eos = tokenizer("").input_ids


def encode_sequence(text, chunk_len=offset):
    """Encode one DNA string as a flat list of token ids.

    The string is cut into consecutive, non-overlapping chunks of ``chunk_len``
    characters, the chunks are tokenized without special tokens, and the
    resulting ids are flattened and wrapped in ``bos`` ... ``eos``.

    NOTE(review): when ``len(text)`` is not a multiple of ``chunk_len`` the last
    chunk is shorter (a 500-character sequence ends with a 2-character chunk).
    How the tokenizer maps such a chunk is not visible here — verify.

    Args:
        text: the sequence to encode.
        chunk_len: number of characters per chunk.

    Returns:
        A list of token ids starting with ``bos`` and ending with ``eos``.
    """
    chunks = [text[i:i + chunk_len] for i in range(0, len(text), chunk_len)]
    chunk_ids = tokenizer(chunks, add_special_tokens=False).input_ids
    return list(itertools.chain([bos], *chunk_ids, [eos]))


# (label, token ids) pairs for both splits, built by the same helper so the
# train and test encodings cannot drift apart.
encoded_dset = [(label, encode_sequence(text)) for text, label in train_dset]
encoded_dset_test = [(label, encode_sequence(text)) for text, label in test_dset]

In [None]:
def _as_model_input(label, ids):
    """Pack one (label, token ids) pair into the dict format passed to the model."""
    return {
        "input_ids": torch.tensor(ids),
        # No padding is added, so every position is attended to.
        "attention_mask": torch.tensor([1] * len(ids)),
        "labels": torch.tensor(label),
    }


encoded_samples = [_as_model_input(label, ids) for label, ids in encoded_dset]
encoded_samples_test = [_as_model_input(label, ids) for label, ids in encoded_dset_test]

In [None]:
import random

# Shuffle with a fixed seed. The last 200 shuffled samples are later used as the
# validation split, so an unseeded shuffle would change that split (and the
# reported validation metrics) on every fresh run.
random.Random(42).shuffle(encoded_samples)

In [None]:
# Sanity check: sum of the labels over all training samples.
sum(sample["labels"].item() for sample in encoded_samples)

10422

In [None]:
# Evaluation metrics reported at every evaluation step.
def metrics(pred) -> dict:
    """Compute accuracy for one evaluation pass.

    Args:
        pred: object exposing ``predictions`` (per-class scores, reduced with
            ``argmax`` over the last axis) and ``label_ids`` (reference labels).

    Returns:
        ``{"acc": fraction of samples whose argmax equals the label}``.
    """
    predicted = pred.predictions.argmax(-1).tolist()
    expected = pred.label_ids.tolist()
    hits = 0
    for guess, truth in zip(predicted, expected):
        if guess == truth:
            hits += 1
    return {"acc": hits / len(pred.label_ids)}

In [None]:
# I use comet.ml for logging, but these env vars need to be set before the jupyter lab start, it does not work here
!export COMET_API_KEY=XXX
!export COMET_PROJECT_NAME=ceitec

from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

# for a complete list, see https://huggingface.co/transformers/main_classes/trainer.html?highlight=launch#trainingarguments
args = TrainingArguments(output_dir="output_checkpoints",
                         learning_rate=2e-5,
                         weight_decay=0.01,
                         num_train_epochs=15,
                         per_device_train_batch_size=32,
                         do_train=True,
                         do_eval=True,
                         logging_steps=10,
                         warmup_steps=5000,
                         eval_steps=100,
                         evaluation_strategy="steps",
                         logging_strategy="steps",
                         logging_first_step=True,
                         load_best_model_at_end=True,
#                          metric_for_best_model="acc"
                        )
# Trainer contains a lot of useful defaults and also supports multi-gpu training
trainer = Trainer(model=model, args=args, compute_metrics=metrics, callbacks=[EarlyStoppingCallback(30)],
                  train_dataset=encoded_samples[:-200], eval_dataset=encoded_samples[-200:])

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Run fine-tuning with the `trainer` object built from the TrainingArguments,
# metrics function and early-stopping callback.
trainer.train()

***** Running training *****
  Num examples = 20643
  Num Epochs = 15
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 9690


Step,Training Loss,Validation Loss,Acc
100,0.7009,0.705104,0.505
200,0.6899,0.68965,0.55
300,0.6664,0.667353,0.59
400,0.6437,0.645954,0.675
500,0.6263,0.62328,0.69
600,0.6372,0.601904,0.665
700,0.5901,0.584692,0.675
800,0.602,0.568804,0.685
900,0.5729,0.568178,0.715
1000,0.5816,0.560669,0.7


***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
Saving model checkpoint to output_checkpoints/checkpoint-100
Configuration saved in output_checkpoints/checkpoint-100/config.json
Model weights saved in output_checkpoints/checkpoint-100/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
Saving model checkpoint to output_checkpoints/checkpoint-200
Configuration saved in output_checkpoints/checkpoint-200/config.json
Model weights saved in output_checkpoints/checkpoint-200/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
Saving model checkpoint to output_checkpoints/checkpoint-300
Configuration saved in output_checkpoints/checkpoint-300/config.json
Model weights saved in output_checkpoints/checkpoint-300/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
Saving model checkpoint to output_checkpoints/checkpoint-400
Configuration saved in output_checkpoints/checkp

TrainOutput(global_step=7400, training_loss=0.49472031897789726, metrics={'train_runtime': 4966.8161, 'train_samples_per_second': 62.343, 'train_steps_per_second': 1.951, 'total_flos': 1.045114420215348e+16, 'train_loss': 0.49472031897789726, 'epoch': 11.46})

## Testing

The cells below record the results of several experiments. The comment at the top of each cell describes how its configuration differs from the cells below it.

In [None]:
from tqdm import tqdm

# Batch the already-encoded test samples. The order must stay fixed
# (shuffle=False) so that predictions line up with the reference labels.
# All encoded samples are expected to have the same length, so the default
# collate function can stack them without padding.
test_loader = DataLoader(encoded_samples_test, batch_size=32, shuffle=False)

# Move the model to the CPU once and switch to eval mode so that dropout is
# disabled during inference.
inference_model = trainer.model.to("cpu").eval()

predictions = []
with torch.no_grad():  # no gradients are needed for prediction
    for sample in tqdm(test_loader):
        outputs = inference_model(**sample)
        preds = outputs.logits.argmax(-1).tolist()
        predictions.extend(preds)

218it [01:39,  2.19it/s]                             


In [None]:
# Experiment: early-stopping by loss, patience=50 eval_steps=50 (higher), weight_decay=0.01
from sklearn.metrics import f1_score
from genomic_benchmarks.data_check.info import labels_in_order

# Reference labels for the benchmark; presumably the test-split labels in the
# same order as `test_dset` — TODO confirm against genomic_benchmarks.
labels = labels_in_order(dset_name='human_enhancers_cohn')
# F1 score of the collected predictions against the reference labels.
f1_score(labels, predictions)

0.7227722772277227

In [None]:
# Experiment: early-stopping by loss, patience=50 eval_steps=500, weight_decay=0.1
from sklearn.metrics import f1_score
from genomic_benchmarks.data_check.info import labels_in_order

# Reference labels for the benchmark; presumably the test-split labels in the
# same order as `test_dset` — TODO confirm against genomic_benchmarks.
labels = labels_in_order(dset_name='human_enhancers_cohn')
# F1 score of the collected predictions against the reference labels.
f1_score(labels, predictions)

0.6938711694809255

In [None]:
# Experiment: early-stopping by loss, patience=50 eval_steps=500, weight_decay=0.1
from sklearn.metrics import accuracy_score
from genomic_benchmarks.data_check.info import labels_in_order

# Reference labels for the benchmark; presumably the test-split labels in the
# same order as `test_dset` — TODO confirm against genomic_benchmarks.
labels = labels_in_order(dset_name='human_enhancers_cohn')
# Accuracy (rather than F1) of the collected predictions against the reference labels.
accuracy_score(labels, predictions)

0.7179044329303397

In [None]:
# Experiment: early-stopping patience 100, weight_decay=0.02
from sklearn.metrics import f1_score
from genomic_benchmarks.data_check.info import labels_in_order

# Reference labels for the benchmark; presumably the test-split labels in the
# same order as `test_dset` — TODO confirm against genomic_benchmarks.
labels = labels_in_order(dset_name='human_enhancers_cohn')
# F1 score of the collected predictions against the reference labels.
f1_score(labels, predictions)

0.7146999130182661

In [None]:
# Experiment: early-stopping patience 100
from sklearn.metrics import f1_score
from genomic_benchmarks.data_check.info import labels_in_order

# Reference labels for the benchmark; presumably the test-split labels in the
# same order as `test_dset` — TODO confirm against genomic_benchmarks.
labels = labels_in_order(dset_name='human_enhancers_cohn')
# F1 score of the collected predictions against the reference labels.
f1_score(labels, predictions)

0.7269980774512496

In [None]:
# Experiment: early-stopping by the loss, default lr (5e-5), no weight_decay
from sklearn.metrics import f1_score
from genomic_benchmarks.data_check.info import labels_in_order

# Reference labels for the benchmark; presumably the test-split labels in the
# same order as `test_dset` — TODO confirm against genomic_benchmarks.
labels = labels_in_order(dset_name='human_enhancers_cohn')
# F1 score of the collected predictions against the reference labels.
f1_score(labels, predictions)

0.7180299539170506

In [None]:
# NOTE(review): this cell has no comment describing its experiment configuration.
from sklearn.metrics import f1_score
from genomic_benchmarks.data_check.info import labels_in_order

# Reference labels for the benchmark; presumably the test-split labels in the
# same order as `test_dset` — TODO confirm against genomic_benchmarks.
labels = labels_in_order(dset_name='human_enhancers_cohn')
# F1 score of the collected predictions against the reference labels.
f1_score(labels, predictions)

0.6804256691389874

In [None]:
# Previous output - example solution

from sklearn.metrics import f1_score
from genomic_benchmarks.data_check.info import labels_in_order

# Reference labels for the benchmark; presumably the test-split labels in the
# same order as `test_dset` — TODO confirm against genomic_benchmarks.
labels = labels_in_order(dset_name='human_enhancers_cohn')
# F1 score of the collected predictions against the reference labels.
f1_score(labels, predictions)

0.40943812595484635