# Лабораторная работа №7

In [1]:
!pip install -r requirements.txt > None

In [2]:
import torch
import evaluate
import numpy as np
import tensorflow as tf

from tensorflow import keras
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering, AutoModelForSequenceClassification, TrainingArguments, TFAutoModelForQuestionAnswering, Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
RANDOM_STATE = 2023
TRAIN_SET = 500

## Выбираем модель

In [4]:
# https://huggingface.co/Den4ikAI/rubert_large_squad_2

selected_model = "Den4ikAI/rubert_large_squad_2"
# обучена на https://huggingface.co/ai-forever/ruBert-base

In [5]:
qa_pipeline = pipeline(
    "question-answering",
    model="Den4ikAI/rubert_large_squad_2",
    tokenizer="Den4ikAI/rubert_large_squad_2"
)
predictions = qa_pipeline({
    'context': "Пушкин родился 6 июля 1799 года",
    'question': "Когда родился Пушкин?"
})
print(predictions)

{'score': 0.9182615280151367, 'start': 15, 'end': 31, 'answer': '6 июля 1799 года'}


## Загружаем датасеты

In [6]:
sber_dataset = load_dataset("sberquad")
squad_dataset = load_dataset("squad")

In [7]:
print(squad_dataset)
print(squad_dataset["train"][0])

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})
{'id': '5733be284776f41900661182', 'title': 'University_of_Notre_Dame', 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome

## Проверим модель на данных squad

In [8]:
test_set = squad_dataset['train'][0]

predictions = qa_pipeline({
    'context': test_set['context'],
    'question': test_set['question']
})
print(predictions)
print(test_set)

{'score': 0.9514355659484863, 'start': 512, 'end': 541, 'answer': 'to Saint Bernadette Soubirous'}
{'id': '5733be284776f41900661182', 'title': 'University_of_Notre_Dame', 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'answers': {'text': ['Saint Ber

## А теперь на данных для дообучения sbersquad

In [9]:
test_set = sber_dataset['train'][20]

predictions = qa_pipeline({
    'context': test_set['context'],
    'question': test_set['question']
})
print(predictions)
print(test_set)

{'score': 0.6491039395332336, 'start': 0, 'end': 35, 'answer': 'Город Байконур и космодром Байконур'}
{'id': 3546, 'title': 'SberChallenge', 'context': 'Город Байконур и космодром Байконур вместе образуют комплекс Байконур , арендованный Россией у Казахстана на период до 2050 года. Эксплуатация космодрома стоит около 9 млрд рублей в год (стоимость аренды комплекса Байконур составляет 115 млн долларов — около 7,4 млрд рублей в год; ещё около 1,5 млрд рублей в год Россия тратит на поддержание объектов космодрома), что составляет 4,2 % от общего бюджета Роскосмоса на 2012 год. Кроме того, из федерального бюджета России в бюджет города Байконура ежегодно осуществляется безвозмездное поступление в размере 1,16 млрд рублей (по состоянию на 2012 год). В общей сложности космодром и город обходятся бюджету России в 10,16 млрд рублей в год.', 'question': 'Что образует комплекс Байконур ?', 'answers': {'text': ['Город Байконур и космодром Байконур'], 'answer_start': [0]}}


## Обработаем датасет от Сбера

In [10]:
tokenizer = AutoTokenizer.from_pretrained(selected_model)

In [11]:
def preprocess_validation_examples(tokenizer, examples):
    max_length = 384  # The maximum length of a feature (question and context)
    doc_stride = (
        64  # The authorized overlap between two part of the context when splitting
    )

    examples["question"] = [q.lstrip() for q in examples["question"]]
    examples["context"] = [c.lstrip() for c in examples["context"]]
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")

    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        sequence_ids = tokenized_examples.sequence_ids(i)

        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]

        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1

            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1

            if not (
                offsets[token_start_index][0] <= start_char
                and offsets[token_end_index][1] >= end_char
            ):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                while (
                    token_start_index < len(offsets)
                    and offsets[token_start_index][0] <= start_char
                ):
                    token_start_index += 1
                tokenized_examples["start_positions"].append(
                    token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [12]:
from functools import partial

partial_tokenize_function = partial(preprocess_validation_examples, tokenizer)

sber_dataset['train'] = sber_dataset['train'].select(range(TRAIN_SET))
sber_dataset['validation'] = sber_dataset['validation'].select(
    range(TRAIN_SET))

tokenized_datasets = sber_dataset.map(
    partial_tokenize_function,
    batched=True,
    remove_columns=sber_dataset["train"].column_names,
    num_proc=3,
)

tokenized_datasets

Map (num_proc=3): 100%|██████████| 500/500 [00:23<00:00, 21.05 examples/s]
  table = cls._concat_blocks(blocks, axis=0)
Map (num_proc=3): 100%|██████████| 500/500 [00:18<00:00, 26.46 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 500
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 502
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 24041
    })
})

In [13]:
train_set = tokenized_datasets["train"].with_format("numpy")[:]
validation_set = tokenized_datasets["validation"].with_format("numpy")[:]

In [14]:
model = TFAutoModelForQuestionAnswering.from_pretrained(

    selected_model, from_pt=True)

optimizer = keras.optimizers.Adam(learning_rate=5e-5)

keras.mixed_precision.set_global_policy("mixed_float16")
model.compile(optimizer=optimizer)

All PyTorch model weights were used when initializing TFBertForQuestionAnswering.

All the weights of TFBertForQuestionAnswering were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForQuestionAnswering for predictions without further training.


The dtype policy mixed_float16 may run slowly because this machine does not have a GPU. Only Nvidia GPUs with compute capability of at least 7.0 run quickly with mixed_float16.


In [15]:
model.fit(train_set, validation_data=validation_set, epochs=1)

In [None]:
context = """Keras is an API designed for human beings, not machines. Keras follows best
practices for reducing cognitive load: it offers consistent & simple APIs, it minimizes
the number of user actions required for common use cases, and it provides clear &
actionable error messages. It also has extensive documentation and developer guides. """
question = "What is Keras?"

inputs = tokenizer([context], [question], return_tensors="np")
outputs = model(inputs)
start_position = tf.argmax(outputs.start_logits, axis=1)
end_position = tf.argmax(outputs.end_logits, axis=1)
print(int(start_position), int(end_position[0]))

In [None]:
answer = inputs["input_ids"][0, int(start_position): int(end_position) + 1]
print(answer)
print(tokenizer.decode(answer))

#### Используя набор данных Sberquad дообучить выбранную модель, оценить качество до и после дообучения