In [1]:
import sys
import os
import random
import time
import re
import logging
from pathlib import Path

import numpy as np
import pandas as pd
import datasets
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoConfig, EvalPrediction

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)

In [3]:
DATA_DIR = Path(os.getcwd()).parent / "data"

In [4]:
pretrained_model = "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"

In [5]:
train_dataset = datasets.load_from_disk(DATA_DIR / "train/jp-engineer-articles-dataset")
test_dataset = datasets.load_from_disk(DATA_DIR / "test/jp-engineer-articles-dataset")



In [6]:
training_args = TrainingArguments(
    output_dir=DATA_DIR / "results",          # output directory
    num_train_epochs=2,              # total number of training epochs
    learning_rate=2e-05,
    per_device_train_batch_size=1,   # batch size per device during training
    per_device_eval_batch_size=1,    # batch size for evaluation
    warmup_ratio=0.1,                # number of warmup steps for learning rate scheduler
    weight_decay=0.06,               # strength of weight decay
)

In [7]:
model = AutoModelForSequenceClassification.from_pretrained(pretrained_model, num_labels=3)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
config = AutoConfig.from_pretrained(pretrained_model, num_labels=3)



In [8]:
def preprocess_function(examples):
    # Tokenize the texts
    return tokenizer(
        examples["premise"],
        examples["hypothesis"],
        padding="max_length",
        max_length=2048,
        truncation=True,
    )

In [9]:
with training_args.main_process_first(desc="train dataset map pre-processing"):
    train_dataset = train_dataset.map(
        preprocess_function,
        batched=True,
        load_from_cache_file=True,
        desc="Running tokenizer on train dataset",
    )
# Log a few random samples from the training set:
for index in random.sample(range(len(train_dataset)), 3):
    logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
    
with training_args.main_process_first(desc="prediction dataset map pre-processing"):
    predict_dataset = test_dataset.map(
        preprocess_function,
        batched=True,
        load_from_cache_file=True,
        desc="Running tokenizer on prediction dataset",
    )



In [10]:
metric = evaluate.load("xnli")

In [11]:
def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis=1)
    return metric.compute(predictions=preds, references=p.label_ids)

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=None,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=None,
)

In [13]:
checkpoint = None
train_result = trainer.train(resume_from_checkpoint=checkpoint)
metrics = train_result.metrics
max_train_samples = len(train_dataset)
metrics["train_samples"] = min(max_train_samples, len(train_dataset))

trainer.save_model()  # Saves the tokenizer too for easy upload

trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

The following columns in the training set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: premise, hypothesis, __index_level_0__. If premise, hypothesis, __index_level_0__ are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2433
  Num Epochs = 2
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 2434
  Number of trainable parameters = 278811651
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: Caught OutOfMemoryError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/workspace/notion-auto-archive/venv/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py", line 64, in _worker
    output = module(*input, **kwargs)
  File "/home/workspace/notion-auto-archive/venv/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/workspace/notion-auto-archive/venv/lib/python3.9/site-packages/transformers/models/deberta_v2/modeling_deberta_v2.py", line 1343, in forward
    outputs = self.deberta(
  File "/home/workspace/notion-auto-archive/venv/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/workspace/notion-auto-archive/venv/lib/python3.9/site-packages/transformers/models/deberta_v2/modeling_deberta_v2.py", line 1109, in forward
    encoder_outputs = self.encoder(
  File "/home/workspace/notion-auto-archive/venv/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/workspace/notion-auto-archive/venv/lib/python3.9/site-packages/transformers/models/deberta_v2/modeling_deberta_v2.py", line 546, in forward
    output_states = layer_module(
  File "/home/workspace/notion-auto-archive/venv/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/workspace/notion-auto-archive/venv/lib/python3.9/site-packages/transformers/models/deberta_v2/modeling_deberta_v2.py", line 386, in forward
    attention_output = self.attention(
  File "/home/workspace/notion-auto-archive/venv/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/workspace/notion-auto-archive/venv/lib/python3.9/site-packages/transformers/models/deberta_v2/modeling_deberta_v2.py", line 317, in forward
    self_output = self.self(
  File "/home/workspace/notion-auto-archive/venv/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/workspace/notion-auto-archive/venv/lib/python3.9/site-packages/transformers/models/deberta_v2/modeling_deberta_v2.py", line 766, in forward
    attention_probs = self.dropout(attention_probs)
  File "/home/workspace/notion-auto-archive/venv/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/workspace/notion-auto-archive/venv/lib/python3.9/site-packages/transformers/models/deberta_v2/modeling_deberta_v2.py", line 258, in forward
    return XDropout.apply(x, self.get_context())
  File "/home/workspace/notion-auto-archive/venv/lib/python3.9/site-packages/transformers/models/deberta_v2/modeling_deberta_v2.py", line 205, in forward
    return input.masked_fill(mask, 0) * ctx.scale
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 192.00 MiB (GPU 0; 10.76 GiB total capacity; 9.56 GiB already allocated; 73.38 MiB free; 9.71 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF


In [None]:
logger.info("*** Predict ***")
label_list = ["entailment", "neutral", "contradiction"]
predictions, labels, metrics = trainer.predict(predict_dataset, metric_key_prefix="predict")

max_predict_samples = len(predict_dataset)
metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset))

trainer.log_metrics("predict", metrics)
trainer.save_metrics("predict", metrics)

predictions = np.argmax(predictions, axis=1)
output_predict_file = os.path.join(training_args.output_dir, "predictions.txt")
if trainer.is_world_process_zero():
    with open(output_predict_file, "w") as writer:
        writer.write("index\tprediction\n")
        for index, item in enumerate(predictions):
            item = label_list[item]
            writer.write(f"{index}\t{item}\n")