In [1]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

In [2]:
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")

In [3]:
import datasets

In [4]:
ds = datasets.load_dataset('csv', data_files={'train': '../test_data/rdd/train.txt',
                                     'validation': '../test_data/rdd/dev.txt'}, 
                  delimiter='\t', quoting=3)

Using custom data configuration default-5d1762bef8cb5959
Reusing dataset csv (/home/jds/.cache/huggingface/datasets/csv/default-5d1762bef8cb5959/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0)


In [5]:
ds

DatasetDict({
    train: Dataset({
        features: ['en', 'en.1'],
        num_rows: 6290045
    })
    validation: Dataset({
        features: ['en', 'en.1'],
        num_rows: 1572503
    })
})

In [6]:
from transformers import AutoTokenizer

In [7]:
tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50")

In [8]:
tokenizer.src_lang = "en-XX"
tokenizer.tgt_lang = "en-XX"

In [9]:
tokenizer("Hello, this one sentence!")

{'input_ids': [3, 35378, 4, 903, 1632, 149357, 38, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [10]:
max_input_length = 128
max_target_length = max_input_length
source_lang = "en"
target_lang = "en.1"

def preprocess(examples):
    inputs = examples[source_lang]
    targets = examples[target_lang]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding='max_length')
    
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True, padding='max_length')

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [11]:
preprocess(ds['train'][:2])

{'input_ids': [[3, 1071, 63736, 16, 142, 420, 6488, 756, 2367, 4927, 14461, 34, 808, 4458, 141, 3493, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [3, 1143, 2408, 5, 3164, 3129, 12921, 57849, 5256, 111, 1238, 93322, 18, 51, 188, 3430, 6, 4, 621, 70541, 297, 23, 70, 661, 1375, 58, 51659, 4061, 53, 1369, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [12]:
token_ds = ds.map(preprocess, batched=True)

Loading cached processed dataset at /home/jds/.cache/huggingface/datasets/csv/default-5d1762bef8cb5959/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0/cache-24e51acdb00d559b.arrow
Loading cached processed dataset at /home/jds/.cache/huggingface/datasets/csv/default-5d1762bef8cb5959/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0/cache-fbb60dd8d977fc0e.arrow


In [13]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

In [14]:
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=2,              # total # of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=8,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
)

In [15]:
trainer = Seq2SeqTrainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=token_ds['train'],         # training dataset
    eval_dataset=token_ds['validation']            # evaluation dataset
)

In [16]:
trainer.train()



Step,Training Loss


RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/jds/.pyenv/versions/3.8.6/envs/ocr/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
    output = module(*input, **kwargs)
  File "/home/jds/.pyenv/versions/3.8.6/envs/ocr/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/jds/.pyenv/versions/3.8.6/envs/ocr/lib/python3.8/site-packages/transformers/models/mbart/modeling_mbart.py", line 1289, in forward
    outputs = self.model(
  File "/home/jds/.pyenv/versions/3.8.6/envs/ocr/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/jds/.pyenv/versions/3.8.6/envs/ocr/lib/python3.8/site-packages/transformers/models/mbart/modeling_mbart.py", line 1159, in forward
    encoder_outputs = self.encoder(
  File "/home/jds/.pyenv/versions/3.8.6/envs/ocr/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/jds/.pyenv/versions/3.8.6/envs/ocr/lib/python3.8/site-packages/transformers/models/mbart/modeling_mbart.py", line 799, in forward
    layer_outputs = encoder_layer(
  File "/home/jds/.pyenv/versions/3.8.6/envs/ocr/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/jds/.pyenv/versions/3.8.6/envs/ocr/lib/python3.8/site-packages/transformers/models/mbart/modeling_mbart.py", line 324, in forward
    hidden_states = self.activation_fn(self.fc1(hidden_states))
  File "/home/jds/.pyenv/versions/3.8.6/envs/ocr/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/jds/.pyenv/versions/3.8.6/envs/ocr/lib/python3.8/site-packages/torch/nn/modules/linear.py", line 93, in forward
    return F.linear(input, self.weight, self.bias)
  File "/home/jds/.pyenv/versions/3.8.6/envs/ocr/lib/python3.8/site-packages/torch/nn/functional.py", line 1692, in linear
    output = input.matmul(weight.t())
RuntimeError: CUDA out of memory. Tried to allocate 16.00 MiB (GPU 0; 10.92 GiB total capacity; 10.04 GiB already allocated; 7.44 MiB free; 10.21 GiB reserved in total by PyTorch)


In [20]:
import torch

In [21]:
torch.cuda.empty_cache()