In [1]:
from datasets import load_dataset

dataset_train = load_dataset('IlyaGusev/gazeta', revision="v1.0", split= 'train[:10%]')
dataset_test = load_dataset('IlyaGusev/gazeta', revision="v1.0", split= 'test[:10%]')

No config specified, defaulting to: gazeta/default
Found cached dataset gazeta (/home/medic/.cache/huggingface/datasets/IlyaGusev___gazeta/default/1.0.0/ef9349c3c0f3112ca4036520d76c4bc1b8a79d30bc29643c6cae5a094d44e457)
No config specified, defaulting to: gazeta/default
Found cached dataset gazeta (/home/medic/.cache/huggingface/datasets/IlyaGusev___gazeta/default/1.0.0/ef9349c3c0f3112ca4036520d76c4bc1b8a79d30bc29643c6cae5a094d44e457)


In [2]:
dataset_train

Dataset({
    features: ['text', 'summary', 'title', 'date', 'url'],
    num_rows: 5240
})

In [3]:
model_name = "IlyaGusev/rut5_base_sum_gazeta"

In [4]:
def len_tok(text):
    return len(text.split())

In [5]:
max_len_sum, max_len_tl = max(map(len_tok, dataset_train['summary'])), max(map(len_tok, dataset_train['title']))
max_len_sum, max_len_tl

(75, 18)

In [6]:
max_len_sum, max_len_tl = 60, 15

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    tokenized_input = tokenizer(batch['summary'], padding='max_length', truncation=True, max_length=max_len_sum)
    tokenized_label = tokenizer(batch['title'], padding='max_length', truncation=True, max_length=max_len_tl)

    tokenized_input['labels'] = tokenized_label['input_ids']

    return tokenized_input

dataset_train = dataset_train.map(tokenize, batched=True, batch_size=8)
dataset_test = dataset_test.map(tokenize, batched=True, batch_size=8)

dataset_train.set_format('numpy', columns=['input_ids', 'attention_mask', 'labels'])
dataset_test.set_format('numpy', columns=['input_ids', 'attention_mask', 'labels'])

Loading cached processed dataset at /home/medic/.cache/huggingface/datasets/IlyaGusev___gazeta/default/1.0.0/ef9349c3c0f3112ca4036520d76c4bc1b8a79d30bc29643c6cae5a094d44e457/cache-2cf81f28446bb4c6.arrow


  0%|          | 0/73 [00:00<?, ?ba/s]

In [8]:
dataset_train.save_to_disk('gazeta/train')
dataset_test.save_to_disk('gazeta/test')

In [9]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments


model = T5ForConditionalGeneration.from_pretrained(model_name)

In [46]:
output_dir = 'gazeta/output'

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=50,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_accumulation_steps=1, # Number of eval steps to keep in GPU (the higher, the mor vRAM used)
    prediction_loss_only=True, # If I need co compute only loss and not other metrics, setting this to true will use less RAM
    learning_rate=0.00001,
    evaluation_strategy='steps', # Run evaluation every eval_steps
    save_steps=1000, # How often to save a checkpoint
    save_total_limit=1, # Number of maximum checkpoints to save
    remove_unused_columns=True, # Removes useless columns from the dataset
    run_name='run_gazeta', # Wandb run name
    logging_steps=500, # How often to log loss to wandb
    eval_steps=500, # How often to run evaluation on the val_set
    logging_first_step=False, # Whether to log also the very first training step to wandb
    load_best_model_at_end=True, # Whether to load the best model found at each evaluation.
    metric_for_best_model="loss", # Use loss to evaluate best model.
    greater_is_better=False # Best model is the one with the lowest loss, not highest.
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [47]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: text, title, url, summary, date. If text, title, url, summary, date are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 5240
  Num Epochs = 50
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 32750
  Number of trainable parameters = 244309248


Step,Training Loss,Validation Loss
500,2.2945,2.800131
1000,2.2992,2.785827
1500,2.2141,2.76532
2000,2.1945,2.747967
2500,2.1596,2.754606
3000,2.1455,2.760347
3500,2.1197,2.737623
4000,2.1479,2.732628
4500,2.1004,2.725555
5000,2.1187,2.70714


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: text, title, url, summary, date. If text, title, url, summary, date are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 577
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: text, title, url, summary, date. If text, title, url, summary, date are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 577
  Batch size = 8
Saving model checkpoint to gazeta/output/checkpoint-1000
Configuration saved in gazeta/output/checkpoint-1000/config.json
Model weights saved in gazeta/output/checkpoint-1000/pytorch_model.bin
Deleting older checkpoint [gazeta/output/checkpoint-5000]

***** Running Evaluation *****
  Num examples = 577
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: text, title, url, summary, date. If text, title, url, summary, date are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 577
  Batch size = 8
Saving model checkpoint to gazeta/output/checkpoint-9000
Configuration saved in gazeta/output/checkpoint-9000/config.json
Model weights saved in gazeta/output/checkpoint-9000/pytorch_model.bin
Deleting older checkpoint [gazeta/output/checkpoint-8000] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: text, title, url, summary, date. If text, title, url, summary, date are not expected by `T5ForConditionalGeneration.forward`,  you ca

***** Running Evaluation *****
  Num examples = 577
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: text, title, url, summary, date. If text, title, url, summary, date are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 577
  Batch size = 8
Saving model checkpoint to gazeta/output/checkpoint-17000
Configuration saved in gazeta/output/checkpoint-17000/config.json
Model weights saved in gazeta/output/checkpoint-17000/pytorch_model.bin
Deleting older checkpoint [gazeta/output/checkpoint-16000] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: text, title, url, summary, date. If text, title, url, summary, date are not expected by `T5ForConditionalGeneration.forward`,  yo

***** Running Evaluation *****
  Num examples = 577
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: text, title, url, summary, date. If text, title, url, summary, date are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 577
  Batch size = 8
Saving model checkpoint to gazeta/output/checkpoint-25000
Configuration saved in gazeta/output/checkpoint-25000/config.json
Model weights saved in gazeta/output/checkpoint-25000/pytorch_model.bin
Deleting older checkpoint [gazeta/output/checkpoint-24000] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: text, title, url, summary, date. If text, title, url, summary, date are not expected by `T5ForConditionalGeneration.forward`,  yo

***** Running Evaluation *****
  Num examples = 577
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from gazeta/output/checkpoint-5000 (score: 2.7071404457092285).
Deleting older checkpoint [gazeta/output/checkpoint-32000] due to args.save_total_limit


TrainOutput(global_step=32750, training_loss=1.860656222976801, metrics={'train_runtime': 7110.3511, 'train_samples_per_second': 36.848, 'train_steps_per_second': 4.606, 'total_flos': 2.08700430336e+16, 'train_loss': 1.860656222976801, 'epoch': 50.0})

In [12]:
trainer.save_model(output_dir + '/model')

Saving model checkpoint to gazeta/output/model
Configuration saved in gazeta/output/model/config.json
Model weights saved in gazeta/output/model/pytorch_model.bin


In [70]:
INX = 457
print("SUMMARY: | {}".format(dataset_test['summary'][INX]))
print("TITLE: | {}".format(dataset_test['title'][INX]))

SUMMARY: | Торговое перемирие между США и Китаем вполне возможно. Китай согласился на уступки в сельхозсекторе, а США ослабят тарифную политику. Такое заключение можно сделать по итогам диалога сторон в Вашингтоне в пятницу. Президент Дональд Трамп назвал переговоры «значительной первой фазой» будущей торговой сделки, которую он планирует подписать с председателем Си Цзиньпином в декабре. Впрочем, эта «первая фаза» не затрагивает других острых споров между державами.
TITLE: | «Первая фаза»: США и Китай закончат войну?


In [49]:
device = "cuda"

In [71]:
import torch

input_text = dataset_test['summary'][INX]

with torch.no_grad():
    tokenized_text = tokenizer(input_text, truncation=True, padding=True, return_tensors='pt')

    source_ids = tokenized_text['input_ids'].to(device, dtype = torch.long)
    source_mask = tokenized_text['attention_mask'].to(device, dtype = torch.long)

    generated_ids = model.generate(
        input_ids = source_ids,
        attention_mask = source_mask, 
        max_length=512,
        num_beams=7,
        temperature = 1.3,
        repetition_penalty=1, 
        length_penalty=1, 
        early_stopping=True,
        no_repeat_ngram_size=1
    )

    pred = tokenizer.decode(generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

print("\noutput:\n" + pred)


output:
Китай согласился на перемирие
