# Dependency Installation

### Run this cell only if you are using this notebook in Google Colab

In [None]:
!pip install transformers=="4.35.0" datasets=="2.14.6" accelerate=="0.24.1"

## Fine-tuning and Evaluating a Russian Question Answering Model with Sberquad

Importing dependencies

In [2]:
# This code includes software developed by the following open-source projects:
# - numpy (License: BSD-3-Clause license, Authors: NumPy Developers)
# - pandas (License: BSD-3-Clause License, Authors: Pandas Development Team)
# - datasets (License: Apache License 2.0, Authors: Hugging Face Inc.)
# - transformers (License: Apache License 2.0, Authors: Hugging Face Inc.)
# - accelerate (License: Apache License 2.0, Authors: Hugging Face Inc.)
# - scikit-learn (License: BSD License, Authors: scikit-learn Developers)
# - Jupyter Notebook (License: Modified BSD License, Authors: Project Jupyter)
# For the full license information, please see the `licenses` directory.

import numpy as np
import pandas as pd
from IPython.display import display
from datasets import load_dataset, load_metric
from transformers import (
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    pipeline,
    set_seed,
)

Define the random seed for reproducibility

In [3]:
# Seed shared by the notebook; it is passed to TrainingArguments(seed=...) in the training setup.
RANDOM_STATE = 42

Loading dataset

In [None]:
# Number of leading rows taken from each split; kept small so the notebook trains quickly.
N_TRAIN_EXAMPLES = 2500
N_TEST_EXAMPLES = 500

sberquad = load_dataset("sberquad")

# The subsets are the first N rows in dataset order (no shuffling). The
# evaluation code looks gold answers up in sberquad["validation"] by position,
# so the validation subset must keep this order.
train_data = sberquad["train"].select(range(N_TRAIN_EXAMPLES))
test_data = sberquad["validation"].select(range(N_TEST_EXAMPLES))

Preprocessing and Tokenization

In [None]:
# Tokenizer for the same checkpoint that is fine-tuned later; prepare_train_features reads it as a global.
tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")

def prepare_train_features(examples):
    """Tokenize question/context pairs and label each window with the answer span.

    Each (question, context) pair is encoded with the module-level ``tokenizer``.
    Only the context is truncated; a context longer than 512 tokens is split into
    several overlapping windows (stride 128), so one example can yield more than
    one output row.

    Args:
        examples: A batch with ``"question"``, ``"context"`` and ``"answers"``
            columns. Each ``answers`` entry holds ``"answer_start"`` and
            ``"text"`` lists; only the first answer of each example is used.

    Returns:
        The tokenizer output (still containing ``"offset_mapping"``) extended
        with ``"start_positions"`` and ``"end_positions"``: token indices of the
        answer inside the window, or the ``[CLS]`` index for both when the
        example has no answer or the answer is not fully inside the window.
    """
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=512,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Maps every produced window back to the index of its source example in this batch.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(tokenized_examples["offset_mapping"]):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized_examples.sequence_ids(i)

        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]

        # An example without any annotated answer cannot be indexed with [0];
        # label it with the [CLS] position like any other unanswerable window.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
            continue

        # Character span of the first answer inside the original context.
        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        # First token that belongs to the context (sequence id 1).
        token_start_index = 0
        while sequence_ids[token_start_index] != 1:
            token_start_index += 1

        # Last token that belongs to the context (skips padding and the final separator).
        token_end_index = len(input_ids) - 1
        while sequence_ids[token_end_index] != 1:
            token_end_index -= 1

        if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
            # The answer is not fully contained in this window.
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Walk forward past the last token starting at or before the answer
            # start, then step back one to land on it.
            while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                token_start_index += 1
            tokenized_examples["start_positions"].append(token_start_index - 1)
            # Walk backward past the first token ending at or after the answer
            # end, then step forward one to land on it.
            while offsets[token_end_index][1] >= end_char:
                token_end_index -= 1
            tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

# Turn both raw splits into model-ready features with the same settings. The
# original columns are dropped because one example may expand into several rows.
train_data, test_data = (
    split.map(
        prepare_train_features,
        batched=True,
        remove_columns=split.column_names,
    )
    for split in (train_data, test_data)
)

Model Setup and Training Configuration

In [6]:
# Seed the RNGs *before* building the model: the checkpoint ships without a QA
# head, so `qa_outputs` is randomly initialised here. The `seed` given to
# TrainingArguments only takes effect once the Trainer is constructed, which is
# too late to make that initialisation reproducible.
set_seed(RANDOM_STATE)
model = AutoModelForQuestionAnswering.from_pretrained("DeepPavlov/rubert-base-cased")

training_args = TrainingArguments(
    per_device_train_batch_size=16,
    learning_rate=2e-5,
    num_train_epochs=2,
    weight_decay=0.01,
    output_dir="./results",
    logging_dir="./logs",
    # Log, evaluate and checkpoint on the same 100-step cadence so that
    # load_best_model_at_end has an evaluated checkpoint to restore.
    logging_steps=100,
    save_steps=100,
    eval_steps=100,
    do_train=True,
    do_eval=True,
    use_cpu=False,
    load_best_model_at_end=True,
    save_strategy="steps",
    evaluation_strategy="steps",
    seed=RANDOM_STATE,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
)

Model Training

In [7]:
# Run fine-tuning with the configuration above; the returned TrainOutput is shown as the cell result.
trainer.train()

Step,Training Loss,Validation Loss
100,3.7992,2.463964
200,2.3662,2.144193
300,2.0425,2.092083


TrainOutput(global_step=314, training_loss=2.707084752951458, metrics={'train_runtime': 590.3141, 'train_samples_per_second': 8.47, 'train_steps_per_second': 0.532, 'total_flos': 1306483783680000.0, 'train_loss': 2.707084752951458, 'epoch': 2.0})

## Evaluation

Model Evaluation and Metrics Comparison

In [8]:
# SQuAD metric; the results table below reads its exact_match and f1 values.
# NOTE(review): `load_metric` is reported as deprecated in newer `datasets` releases in favour of the
# separate `evaluate` package — confirm before bumping the pinned version.
metric = load_metric("squad")

def compute_metrics(eval_predictions):
    """Decode predicted answer spans and score them with the SQuAD metric.

    Args:
        eval_predictions: A ``(predictions, labels)`` pair. ``predictions`` is
            unpacked as ``(start_logits, end_logits)`` and reduced with argmax
            over axis 1; ``labels`` is not used.

    Returns:
        The result of ``metric.compute`` for the decoded predictions against
        the gold answers taken from ``sberquad["validation"]``.

    Relies on the module-level ``tokenizer``, ``test_data``, ``sberquad`` and
    ``metric``.
    """
    predictions, _ = eval_predictions
    start_logits, end_logits = predictions

    start_preds = np.argmax(start_logits, axis=1)
    end_preds = np.argmax(end_logits, axis=1)

    # Read the column once instead of materialising a full dataset row per prediction.
    all_input_ids = test_data["input_ids"]

    # NOTE(review): rows of `test_data` are tokenized windows, not examples.
    # prepare_train_features uses return_overflowing_tokens=True, so a long
    # context produces extra rows and row `idx` then no longer corresponds to
    # sberquad["validation"][idx]. Aligning them properly requires keeping an
    # example id per window during preprocessing.
    formatted_predictions = [
        {
            # The metric's id field is a string; make that explicit.
            "id": str(idx),
            # An end before the start yields an empty slice, i.e. an empty answer.
            "prediction_text": tokenizer.decode(
                all_input_ids[idx][start:end + 1], skip_special_tokens=True
            ),
        }
        for idx, (start, end) in enumerate(zip(start_preds, end_preds))
    ]

    # Gold answers for the same positions, fetched with a single slice.
    gold_answers = sberquad["validation"][: len(test_data)]["answers"]
    references = [
        {"id": str(idx), "answers": answers}
        for idx, answers in enumerate(gold_answers)
    ]

    return metric.compute(predictions=formatted_predictions, references=references)

# Baseline: the same checkpoint without fine-tuning, scored on the same features.
trainer_base = Trainer(
    model=AutoModelForQuestionAnswering.from_pretrained("DeepPavlov/rubert-base-cased"),
    compute_metrics=compute_metrics,
    eval_dataset=test_data,
)
base_metrics = trainer_base.evaluate()

# The training Trainer was created without a metric function; attach it now so
# that evaluate() reports exact match and F1 in addition to the loss.
trainer.compute_metrics = compute_metrics
fine_tuned_metrics = trainer.evaluate()

# Display label -> key in the dictionaries returned by evaluate().
score_keys = {
    "Loss": "eval_loss",
    "Exact Match": "eval_exact_match",
    "F1 Score": "eval_f1",
}

results_df = pd.DataFrame({
    "Metric": list(score_keys),
    "Base Model": [base_metrics[key] for key in score_keys.values()],
    "Fine-tuned Model": [fine_tuned_metrics[key] for key in score_keys.values()],
})

display(results_df)

  metric = load_metric("squad")


Downloading builder script:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Unnamed: 0,Metric,Base Model,Fine-tuned Model
0,Loss,6.207763,2.092083
1,Exact Match,0.0,43.0
2,F1 Score,2.814299,62.004937


Model testing

In [9]:
# Move the fine-tuned model to the CPU and wrap it in a question-answering pipeline.
question_answering_pipeline = pipeline(
    "question-answering",
    model=trainer.model.to('cpu'),
    tokenizer=tokenizer,
)

# One short passage and four questions whose answers are contained in it.
context = "Sberbank — крупнейший банк в России и странах СНГ. Основной акционер — правительство Российской Федерации."
questions = [
    "Кто является основным акционером Sberbank?",
    "Какой банк в России и странах СНГ является крупнейшим?",
    "Где Sberbank является крупнейшим банком?",
    "Основным акционером чего является правительство Российской Федерации?"
]

# Print each question followed by the answer span extracted by the model.
for question in questions:
    answer = question_answering_pipeline(context=context, question=question)
    print("\nВопрос:", question)
    print("Ответ:", answer['answer'])


Вопрос: Кто является основным акционером Sberbank?
Ответ: правительство Российской Федерации

Вопрос: Какой банк в России и странах СНГ является крупнейшим?
Ответ: Sberbank

Вопрос: Где Sberbank является крупнейшим банком?
Ответ: в России и странах СНГ

Вопрос: Основным акционером чего является правительство Российской Федерации?
Ответ: Sberbank
