### заметки

### imports and random-seed fixing

In [20]:
import os
import random

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, Dataset, load_metric
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer

In [2]:
def set_seed(seed: int = 42):
    """Seed every random number generator this notebook relies on.

    The same ``seed`` is handed, in order, to Python's ``random`` module,
    NumPy's global generator, torch's default generator and torch's
    generators for all CUDA devices. Afterwards the cuDNN ``benchmark`` flag
    is switched off and the ``deterministic`` flag is switched on.

    Args:
        seed: Integer seed applied to each generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # The two cuDNN flags are independent of each other and of the seeds above.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


set_seed(seed=42)

In [10]:
# One checkpoint name feeds both the tokenizer and the model so the two always match.
model_checkpoint = "DeepPavlov/rubert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# The load warning recorded below this cell reports `qa_outputs.weight` and
# `qa_outputs.bias` as newly initialized, i.e. the question-answering head is untrained.
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

# NOTE(review): `load_metric` is, as far as I know, deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package — confirm against the
# pinned `datasets` version before upgrading.
metric = load_metric("squad")

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Downloading builder script: 4.50kB [00:00, 8.75MB/s]                   
Downloading extra modules: 3.30kB [00:00, 7.96MB/s]                   


### dataset preparation

In [11]:
# Load the SberQuAD splits from the Hub. The recorded output shows train,
# validation and test splits with the columns id, title, context, question, answers.
raw_datasets = load_dataset("kuznetsoffandrey/sberquad")
# Bare expression: displays the split summary as the cell output.
raw_datasets

Downloading readme: 5.16kB [00:00, 14.8MB/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 45328
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 5036
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 23936
    })
})

In [27]:
def filter_long_answers(example, min_chars: int = 20):
    """Tell whether an example's first answer is long enough to keep.

    Args:
        example: Mapping with an ``"answers"`` entry whose ``"text"`` value is
            a list of answer strings.
        min_chars: Exclusive lower bound on the character length of the first
            answer. The default of 20 reproduces the previously hard-coded
            threshold; pass another value (e.g. through ``fn_kwargs`` of
            ``Dataset.filter``) to reuse the predicate with a different cut-off.

    Returns:
        ``True`` when at least one answer text exists and the first one is
        strictly longer than ``min_chars`` characters, ``False`` otherwise
        (including when the answer list is empty).
    """
    answer_texts = example["answers"]["text"]
    # The emptiness check must come first: indexing [0] on an empty list would raise.
    return bool(answer_texts) and len(answer_texts[0]) > min_chars


# Apply the answer-length predicate to the two splits used for training and
# evaluation; the test split is carried over untouched.
filtered_train, filtered_validation = (
    raw_datasets[split_name].filter(filter_long_answers)
    for split_name in ("train", "validation")
)

# Rebind `raw_datasets` to a shallow plain-dict copy in which the two filtered
# splits replace the originals (key order is preserved).
raw_datasets = {
    **raw_datasets,
    "train": filtered_train,
    "validation": filtered_validation,
}

Filter: 100%|██████████| 45328/45328 [00:00<00:00, 87153.01 examples/s]
Filter: 100%|██████████| 5036/5036 [00:00<00:00, 75198.53 examples/s]


In [13]:
# def unpack_text_answer(answer: str) -> str:
#     return eval(str(answer))['text'][0]


# train_df['answers'] = train_df['answers'].apply(unpack_text_answer)
# valid_df['answers'] = valid_df['answers'].apply(unpack_text_answer)

In [14]:
# train_df.head()

In [28]:
# Passed as `stride=` to the tokenizer in both preprocessing functions, together
# with `return_overflowing_tokens=True` (overlap between the windows a long context is split into).
stride = 128
# Passed as `max_length=` to the tokenizer; with `padding="max_length"` every
# produced feature is padded to this many tokens.
max_length = 512

In [29]:
def preprocess_training_examples(examples):
    """Tokenize a batch of QA examples and attach answer-span token labels.

    Each (question, context) pair is tokenized with truncation applied only to
    the context; overflowing context is returned as additional features, so
    one example may yield several features. For every feature the character
    span of the first answer is converted into start/end token indices.
    Features whose context window does not fully contain the answer are
    labelled ``(0, 0)`` (presumably the position of the [CLS] token — confirm).

    Args:
        examples: Batch mapping with ``"question"``, ``"context"`` and
            ``"answers"`` columns; each answer entry provides ``"text"`` and
            ``"answer_start"`` lists, of which only the first item is used.

    Returns:
        The tokenizer output without ``offset_mapping`` and
        ``overflow_to_sample_mapping``, extended with ``start_positions`` and
        ``end_positions`` (one integer per feature).
    """
    encoded = tokenizer(
        [question.strip() for question in examples["question"]],
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offsets_per_feature = encoded.pop("offset_mapping")
    feature_to_example = encoded.pop("overflow_to_sample_mapping")
    all_answers = examples["answers"]
    start_positions, end_positions = [], []

    for feature_idx, offsets in enumerate(offsets_per_feature):
        gold = all_answers[feature_to_example[feature_idx]]
        answer_begin = gold["answer_start"][0]
        answer_finish = gold["answer_start"][0] + len(gold["text"][0])
        seq_ids = encoded.sequence_ids(feature_idx)

        # Context tokens are the ones with sequence id 1; find the first ...
        first_ctx = 0
        while seq_ids[first_ctx] != 1:
            first_ctx += 1
        # ... and the last token of that contiguous run.
        last_ctx = first_ctx
        while seq_ids[last_ctx + 1] == 1:
            last_ctx += 1

        answer_in_window = (
            offsets[first_ctx][0] <= answer_begin
            and offsets[last_ctx][1] >= answer_finish
        )
        if not answer_in_window:
            # The answer is cut off by this window: label the feature (0, 0).
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Start label: the token just before the first context token that
        # begins after the answer's first character.
        first_past_start = next(
            (
                tok
                for tok in range(first_ctx, last_ctx + 1)
                if offsets[tok][0] > answer_begin
            ),
            last_ctx + 1,
        )
        start_positions.append(first_past_start - 1)

        # End label: scanning backwards, the token just after the first one
        # that ends before the answer's last character.
        last_before_end = next(
            (
                tok
                for tok in range(last_ctx, first_ctx - 1, -1)
                if offsets[tok][1] < answer_finish
            ),
            first_ctx - 1,
        )
        end_positions.append(last_before_end + 1)

    encoded["start_positions"] = start_positions
    encoded["end_positions"] = end_positions
    return encoded


# Build the training features in batches. All original columns are removed
# because one example can expand into several features, so the output rows no
# longer line up one-to-one with the input rows.
train_dataset = raw_datasets["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

Map: 100%|██████████| 22363/22363 [00:05<00:00, 3761.94 examples/s]


In [30]:
def preprocess_validation_examples(examples):
    """Tokenize a batch of QA examples for evaluation-time answer decoding.

    Tokenization uses the same settings as the training preprocessing, but no
    start/end labels are produced. Instead each feature keeps:

    * ``example_id`` – the ``id`` of the example the feature was cut from;
    * ``offset_mapping`` – character offsets per token, with the entries of
      every token that does not belong to the context (sequence id other
      than 1) replaced by ``None``.

    Args:
        examples: Batch mapping with ``"id"``, ``"question"`` and
            ``"context"`` columns.

    Returns:
        The tokenizer output without ``overflow_to_sample_mapping``, with the
        masked ``offset_mapping`` and the added ``example_id`` column.
    """
    encoded = tokenizer(
        [question.strip() for question in examples["question"]],
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    feature_to_example = encoded.pop("overflow_to_sample_mapping")
    # Same list object the encoding holds, so item assignment below updates it in place.
    all_offsets = encoded["offset_mapping"]
    source_ids = []

    for feature_idx in range(len(encoded["input_ids"])):
        source_ids.append(examples["id"][feature_to_example[feature_idx]])

        seq_ids = encoded.sequence_ids(feature_idx)
        # Keep offsets for context tokens only; everything else becomes None.
        all_offsets[feature_idx] = [
            span if seq_ids[position] == 1 else None
            for position, span in enumerate(all_offsets[feature_idx])
        ]

    encoded["example_id"] = source_ids
    return encoded


# Build the validation features in batches, dropping the original columns (one
# example can expand into several features). The resulting rows carry
# `example_id` and the masked `offset_mapping`, but no start/end labels.
validation_dataset = raw_datasets["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)

Map: 100%|██████████| 2620/2620 [00:00<00:00, 3049.71 examples/s]


### model training

In [31]:
def compute_metrics(p, features=None, examples=None, n_best=20, max_answer_length=None):
    """Decode span predictions to text and score them with the SQuAD metric.

    The SQuAD metric compares answer *strings* keyed by example id, so the raw
    start/end logits cannot be passed to it directly. For every example this
    function looks at all features cut from it, combines the ``n_best``
    highest start and end logits of each feature, discards spans that fall
    outside the context or are reversed, and keeps the highest-scoring span's
    text as the prediction (empty string when no valid span exists).

    Args:
        p: Evaluation output whose ``predictions`` attribute holds the model
            outputs. Assumes ``p.predictions[0]`` / ``p.predictions[1]`` are
            the start / end logits with one row per feature — TODO confirm
            against the Trainer version in use.
        features: Tokenized evaluation features providing ``example_id`` and
            ``offset_mapping`` per row. Defaults to the notebook-level
            ``validation_dataset``.
        examples: Raw examples providing ``id``, ``context`` and ``answers``.
            Defaults to ``raw_datasets["validation"]``.
        n_best: How many top start and top end positions to combine per feature.
        max_answer_length: Optional cap on the span length in tokens;
            ``None`` applies no cap.

    Returns:
        The dictionary returned by ``metric.compute`` for the decoded answers.

    NOTE(review): the recorded training log shows "No log" for validation
    loss, presumably because the validation features carry no start/end
    labels; verify that the Trainer actually invokes this hook in that setup.
    """
    if features is None:
        features = validation_dataset
    if examples is None:
        examples = raw_datasets["validation"]

    start_logits, end_logits = p.predictions[0], p.predictions[1]

    # Group (row index, offsets) of every feature under its source example.
    features_per_example = {}
    for feature_idx, feature in enumerate(features):
        features_per_example.setdefault(feature["example_id"], []).append(
            (feature_idx, feature["offset_mapping"])
        )

    predictions = []
    references = []
    for example in examples:
        context = example["context"]
        best_score = None
        best_text = ""

        for feature_idx, offsets in features_per_example.get(example["id"], []):
            feature_start = start_logits[feature_idx]
            feature_end = end_logits[feature_idx]
            # Indices of the n_best largest logits, best first.
            start_candidates = np.argsort(feature_start)[-1 : -n_best - 1 : -1].tolist()
            end_candidates = np.argsort(feature_end)[-1 : -n_best - 1 : -1].tolist()

            for start_idx in start_candidates:
                for end_idx in end_candidates:
                    # None offsets mark tokens that are not part of the context.
                    if offsets[start_idx] is None or offsets[end_idx] is None:
                        continue
                    if end_idx < start_idx:
                        continue
                    if (
                        max_answer_length is not None
                        and end_idx - start_idx + 1 > max_answer_length
                    ):
                        continue
                    score = feature_start[start_idx] + feature_end[end_idx]
                    if best_score is None or score > best_score:
                        best_score = score
                        best_text = context[offsets[start_idx][0] : offsets[end_idx][1]]

        # Ids are stringified on both sides so predictions and references
        # always pair up, whatever type the dataset stores them as.
        predictions.append({"id": str(example["id"]), "prediction_text": best_text})
        references.append({"id": str(example["id"]), "answers": example["answers"]})

    return metric.compute(predictions=predictions, references=references)


# NOTE(review): this is a token-classification collator, while the model in this
# notebook is a question-answering head, and the object is not passed to the
# Trainer constructed in the next cell — it looks unused here. Confirm, then
# drop it or swap in a collator suited to already-padded QA features.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [32]:
# Optimisation hyperparameters, collected in one place for easy tuning.
LR = 5e-5        # learning rate
BS = 32          # per-device batch size, used for both training and evaluation
N_EPOCHS = 10    # number of training epochs
WD = 0.05        # weight decay
WARMUP = 0.1     # warmup ratio
GRAD_ACC = 2     # gradient accumulation steps (BS * GRAD_ACC = 64 samples per update per device)


training_args = TrainingArguments(
    output_dir="./results",
    # Evaluate and log once per epoch; no checkpoints are written during the
    # run (the model is saved explicitly in a later cell).
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="no",
    load_best_model_at_end=False,
    learning_rate=LR,
    per_device_train_batch_size=BS,
    per_device_eval_batch_size=BS,
    num_train_epochs=N_EPOCHS,
    weight_decay=WD,
    gradient_accumulation_steps=GRAD_ACC,
    warmup_ratio=WARMUP,
    # Mixed precision needs a CUDA device; enabling it unconditionally makes
    # this cell fail on CPU-only hosts, so it is switched on only when a GPU
    # is present. Behaviour on GPU machines is unchanged.
    fp16=torch.cuda.is_available(),
)

# NOTE(review): `validation_dataset` is built without start/end labels, which
# presumably explains the "No log" validation loss in the recorded training
# output — confirm before relying on the per-epoch evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Launch fine-tuning with the arguments configured above.
# NOTE(review): the recorded log stops at epoch 9 while N_EPOCHS is 10, and the
# cell has no execution count, so this run appears to have been interrupted — confirm.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0961,No log
2,0.1826,No log
3,0.1543,No log
4,0.1062,No log
5,0.084,No log
6,0.0602,No log
7,0.043,No log
8,0.028,No log
9,0.0174,No log


In [25]:
# Destination directory for the fine-tuned model and its tokenizer.
# The path was a hard-coded absolute home directory; it can now be overridden
# through the QA_MODEL_SAVE_DIR environment variable. The original location
# stays the default, so behaviour is unchanged when the variable is unset.
# NOTE(review): the recorded output of this cell lists a different directory
# (".../train_loss-0-0605"), so that output predates the current path.
save_model_path: str = os.environ.get(
    "QA_MODEL_SAVE_DIR",
    '/home/golubev.dmitriy25/models/MY_TRAIN-train_qa_model/answers_with_len_more_20-train_loss-0_',
)

# Save both artefacts side by side so the directory can be reloaded with
# `from_pretrained` for model and tokenizer alike.
model.save_pretrained(save_model_path)
tokenizer.save_pretrained(save_model_path)

('/home/golubev.dmitriy25/models/MY_TRAIN-train_qa_model/train_loss-0-0605/tokenizer_config.json',
 '/home/golubev.dmitriy25/models/MY_TRAIN-train_qa_model/train_loss-0-0605/special_tokens_map.json',
 '/home/golubev.dmitriy25/models/MY_TRAIN-train_qa_model/train_loss-0-0605/vocab.txt',
 '/home/golubev.dmitriy25/models/MY_TRAIN-train_qa_model/train_loss-0-0605/added_tokens.json',
 '/home/golubev.dmitriy25/models/MY_TRAIN-train_qa_model/train_loss-0-0605/tokenizer.json')