### заметки

1. определиться с архитектурой модели. попробовать модернберт от дипвк и руберт от диппавлова
2. провести обработку датасета, добавить свои записи


время обучения модели на полном датасете sberquad при приведённых гиперпараметрах (LR = 5e-5 ; BS = 32 ; N_EPOCHS = 10 ; WD = 0.05 ; WARMUP = 0.1 ; GRAD_ACC = 2) на Tesla A100 занимает порядка 50 минут. было бы неплохо подключить логирование в облако

### imports ; freezes ; model init 

In [20]:
import random

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, load_metric
from transformers import (AutoTokenizer, AutoModelForQuestionAnswering, Trainer, 
                          DataCollatorForTokenClassification, TrainingArguments) 

In [2]:
def set_seed(seed: int = 42) -> bool:
    """Seed the Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        seed: Value handed to every seeding call.

    Returns:
        True when all calls succeeded; False when any of them raised
        (the error is printed, not re-raised).
    """
    try:
        seeders = (
            random.seed,
            np.random.seed,
            torch.manual_seed,
            torch.cuda.manual_seed_all,
        )
        for apply_seed in seeders:
            apply_seed(seed)

        # Trade cuDNN autotuning for run-to-run reproducibility.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    except Exception as err:
        print(f'Error while setting seeds. Error :: {err}')
        return False

    return True


set_seed(42)

In [10]:
# Hub id of the pretrained encoder that gets fine-tuned for extractive QA.
model_checkpoint: str = "DeepPavlov/rubert-base-cased"
tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# The QA head (qa_outputs.*) is not part of the checkpoint and is newly
# initialised, as the warning in this cell's output reports.
model: AutoModelForQuestionAnswering = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package — confirm against the installed version.
metric = load_metric("squad")

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Downloading builder script: 4.50kB [00:00, 8.75MB/s]                   
Downloading extra modules: 3.30kB [00:00, 7.96MB/s]                   


### dataset preparation

In [11]:
# Load SberQuAD from the Hub; the cell output shows train / validation / test splits.
raw_datasets = load_dataset("kuznetsoffandrey/sberquad")
raw_datasets

Downloading readme: 5.16kB [00:00, 14.8MB/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 45328
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 5036
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 23936
    })
})

In [None]:
# keep only the records whose answer is longer than 20 characters

# def filter_long_answers(example) -> bool:
#     return bool(example["answers"]["text"]) and len(example["answers"]["text"][0]) > 20


# filtered_train = raw_datasets["train"].filter(filter_long_answers)
# filtered_validation = raw_datasets["validation"].filter(filter_long_answers)

# raw_datasets = raw_datasets.copy()
# raw_datasets["train"] = filtered_train
# raw_datasets["validation"] = filtered_validation

In [28]:
# Passed as `stride=` to the tokenizer calls in both preprocessing functions.
stride: int = 128
# Passed as `max_length=` to the same tokenizer calls (question + context window size).
max_length: int = 512

In [29]:
def preprocess_training_examples(examples):
    """Tokenize a batch of question/context pairs and label every window.

    Long contexts are split into overlapping windows (`max_length` / `stride`
    are read from module scope). For each window the token indices of the
    first annotated answer are stored as `start_positions` / `end_positions`.
    Windows that do not fully contain the answer, and records with no
    annotated answer at all, are labelled (0, 0).

    Args:
        examples: Batch with "question", "context" and "answers" columns;
            each answers entry holds parallel "text" / "answer_start" lists.

    Returns:
        The tokenizer output with "offset_mapping" and
        "overflow_to_sample_mapping" removed and the two label lists added.
    """
    questions: list[str] = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers: list[dict] = examples["answers"]
    start_positions: list[int] = []
    end_positions: list[int] = []

    for i, offset in enumerate(offset_mapping):
        # One source record can produce several windows; map back to it.
        answer: dict = answers[sample_map[i]]

        # A record without an annotated answer used to raise IndexError below;
        # label it like a window that does not contain the answer.
        if len(answer["text"]) == 0 or len(answer["answer_start"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char: int = answer["answer_start"][0]
        end_char: int = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Locate the first and last token that belong to the context (sequence id 1).
        idx: int = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start: int = idx

        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end: int = idx - 1

        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            # The answer is not fully inside this window.
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Last context token whose start offset is <= the answer start.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # First context token (from the right) whose end offset is >= the answer end.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


# Tokenize the train split in batches. The original columns are dropped because
# one record may expand into several windows, so row counts no longer match.
train_dataset = raw_datasets["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

Map: 100%|██████████| 22363/22363 [00:05<00:00, 3761.94 examples/s]


In [30]:
def preprocess_validation_examples(examples):
    """Tokenize a batch for evaluation, keeping what is needed to decode answers.

    Each produced window gets an "example_id" pointing at the record it was
    cut from, and its "offset_mapping" entries are replaced by None for every
    token that is not part of the context (sequence id other than 1).

    Args:
        examples: Batch with "id", "question" and "context" columns.

    Returns:
        The tokenizer output without "overflow_to_sample_mapping", with the
        masked "offset_mapping" and the added "example_id" list.
    """
    stripped_questions = [text.strip() for text in examples["question"]]
    inputs = tokenizer(
        stripped_questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    window_to_sample = inputs.pop("overflow_to_sample_mapping")
    n_windows = len(inputs["input_ids"])

    masked_offsets = []
    for window_idx in range(n_windows):
        seq_ids = inputs.sequence_ids(window_idx)
        window_offsets = inputs["offset_mapping"][window_idx]
        # Keep character offsets only for context tokens.
        masked_offsets.append(
            [pair if seq_ids[pos] == 1 else None for pos, pair in enumerate(window_offsets)]
        )

    inputs["offset_mapping"] = masked_offsets
    inputs["example_id"] = [
        examples["id"][window_to_sample[window_idx]] for window_idx in range(n_windows)
    ]
    return inputs


# Tokenize the validation split in batches; the original columns are dropped and
# each window keeps its source record id in "example_id".
validation_dataset = raw_datasets["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)

Map: 100%|██████████| 2620/2620 [00:00<00:00, 3049.71 examples/s]


### model train

In [31]:
def compute_metrics(p, n_best: int = 20, max_answer_length: int = 30):
    """Turn window-level logits into text answers and score them with the SQuAD metric.

    The SQuAD metric expects `{"id", "prediction_text"}` predictions and
    `{"id", "answers"}` references, not raw logits, so the logits are decoded
    first: for every validation record the highest-scoring valid span across
    all of its windows is selected and cut out of the original context.

    Reads `validation_dataset`, `raw_datasets` and `metric` from module scope.

    NOTE(review): `validation_dataset` carries no start/end labels, and the
    Trainer appears to call `compute_metrics` only when labels are present —
    confirm; if so, call this manually on `trainer.predict(validation_dataset)`.

    Args:
        p: Object whose `predictions` holds (start_logits, end_logits), one
            row per window of `validation_dataset`, in dataset order.
        n_best: How many top start / end candidates to combine per window.
        max_answer_length: Longest accepted answer, in tokens.

    Returns:
        The dict produced by `metric.compute`.

    Raises:
        ValueError: If the number of logit rows differs from the number of windows.
    """
    from collections import defaultdict

    start_logits, end_logits = p.predictions[0], p.predictions[1]
    features = validation_dataset
    examples = raw_datasets["validation"]

    if len(start_logits) != len(features):
        raise ValueError(
            f"Got {len(start_logits)} prediction rows for {len(features)} validation windows"
        )

    feature_example_ids = features["example_id"]
    feature_offsets = features["offset_mapping"]

    # Group window indices by the record they were cut from.
    windows_per_example = defaultdict(list)
    for feature_idx, example_id in enumerate(feature_example_ids):
        windows_per_example[example_id].append(feature_idx)

    predictions = []
    references = []
    for example in examples:
        example_id = example["id"]
        context = example["context"]
        best_score = None
        best_text = ""

        for feature_idx in windows_per_example[example_id]:
            window_start = start_logits[feature_idx]
            window_end = end_logits[feature_idx]
            offsets = feature_offsets[feature_idx]

            start_candidates = np.argsort(window_start)[-1 : -n_best - 1 : -1].tolist()
            end_candidates = np.argsort(window_end)[-1 : -n_best - 1 : -1].tolist()

            for start_idx in start_candidates:
                for end_idx in end_candidates:
                    # Offsets are None for tokens outside the context.
                    if offsets[start_idx] is None or offsets[end_idx] is None:
                        continue
                    if end_idx < start_idx or end_idx - start_idx + 1 > max_answer_length:
                        continue
                    score = float(window_start[start_idx]) + float(window_end[end_idx])
                    if best_score is None or score > best_score:
                        best_score = score
                        best_text = context[offsets[start_idx][0] : offsets[end_idx][1]]

        # The metric declares ids as strings; cast so integer ids are accepted too.
        predictions.append({"id": str(example_id), "prediction_text": best_text})
        references.append({"id": str(example_id), "answers": example["answers"]})

    return metric.compute(predictions=predictions, references=references)


# NOTE(review): this is a token-classification collator, and it is not passed
# to the Trainer constructed later in this notebook — confirm whether it is needed.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [32]:
# Hyperparameters of the fine-tuning run.
LR: float = 5e-5        # learning rate
BS: int = 32            # per-device batch size, used for both train and eval
N_EPOCHS: int = 10      # number of training epochs
WD: float = 0.05        # weight decay
WARMUP: float = 0.1     # passed as warmup_ratio
GRAD_ACC: int = 2       # gradient accumulation steps


# Evaluate and log once per epoch; no checkpoints are written during training
# (save_strategy="no"), so the model is only persisted by the explicit save cell.
training_args = TrainingArguments(
    output_dir="./results",                   
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="no",
    
    load_best_model_at_end=False,
    learning_rate=LR,
    per_device_train_batch_size=BS,
    per_device_eval_batch_size=BS,
    num_train_epochs=N_EPOCHS,
    weight_decay=WD,
    gradient_accumulation_steps=GRAD_ACC,
    warmup_ratio=WARMUP,
    fp16=True,                     
)

# Wire model, data and arguments together. No `data_collator` is passed here.
# NOTE(review): `validation_dataset` has no start/end position labels, which
# matches the "No log" validation loss in the training output — confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# Run fine-tuning; the notes at the top of the notebook report roughly 50 minutes on an A100.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0961,No log
2,0.1826,No log
3,0.1543,No log
4,0.1062,No log
5,0.084,No log
6,0.0602,No log
7,0.043,No log
8,0.028,No log
9,0.0174,No log


In [None]:
# NOTE(review): '-' looks like a placeholder — set a real output directory before running.
save_model_path: str = '-'

# Store model weights and tokenizer files side by side so both can be reloaded from one path.
model.save_pretrained(save_model_path)
tokenizer.save_pretrained(save_model_path)

### model evaluation

In [None]:
# NOTE(review): '-' looks like a placeholder — point this at the directory of the saved model.
model_path: str = '-'

# Reload tokenizer and model from disk; this rebinds the `tokenizer` / `model`
# names used during training.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForQuestionAnswering.from_pretrained(model_path)
# Use the GPU when one is available, otherwise fall back to CPU.
device: str = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

In [None]:
# Hand-written sample passage and question for a quick manual check of the model.
context = """20 ноября 2025 года Институт татарской энциклопедии и регионоведения им. М. Хасанова Академии наук Республики Татарстан (ИТЭР АН РТ) проводит Всероссийскую научно-практическую конференцию «Казань тысячелетняя», приуроченную к 1020-летию основания города (2025), сообщается на сайте научного учреждения [1]1.

Цель — обмен научным опытом и новейшими подходами в изучении вопросов по тысячелетней истории Казани и других старинных городов Российской Федерации (РФ), а также методикой создания отраслевых энциклопедий о городских агломерациях и мегаполисах.

Тематические направления:
Казань — город межнационального и межконфессионального согласия.
Казань и другие старинные города России в исторической ретроспективе: архитектура, культура, экономика и повседневность.
Городские агломерации в современной России: проблемы и перспективы развития.
Энциклопедии о городах России и Татарстана: научно-методологические и практические подходы.
Города и их историографы: прошлое и настоящее.
Заявки на участие принимаются до 30 сентября 2025 года, статьи — до 20 октября 2025 года."""

question = "где сообщается новость?"


def get_qa_answer(question: str, context: str) -> dict:
    """Extract the answer span for `question` from `context` with the loaded model.

    The best span is chosen jointly: only tokens that belong to the context
    are considered and the end token may not precede the start token. Taking
    independent argmaxes over all tokens could select question or special
    tokens (whose offsets do not index into `context`) or an end before the
    start, which yields an empty or unrelated answer.

    Reads `tokenizer`, `model` and `device` from module scope. Input beyond
    512 tokens is truncated.

    Args:
        question: Question text.
        context: Passage in which the answer is searched.

    Returns:
        Dict with 'model_answer' (substring of `context`) and 'answer_indices'
        ([start_char, end_char] of that substring). When the encoded input
        has no context tokens the answer is '' with indices [0, 0].
    """
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
        return_offsets_mapping=True,
    )

    # Sequence id 1 marks context tokens; question and special tokens get 0 / None.
    sequence_ids = inputs.sequence_ids(0)
    offset_mapping = inputs.pop("offset_mapping")[0].tolist()
    inputs = inputs.to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    start_logits = outputs.start_logits[0].float()
    end_logits = outputs.end_logits[0].float()

    in_context = torch.tensor(
        [seq_id == 1 for seq_id in sequence_ids],
        dtype=torch.bool,
        device=start_logits.device,
    )
    if not bool(in_context.any()):
        return {
            'model_answer': '',
            'answer_indices': [0, 0]
        }

    neg_inf = float('-inf')
    start_scores = start_logits.masked_fill(~in_context, neg_inf)
    end_scores = end_logits.masked_fill(~in_context, neg_inf)

    # span_scores[s, e] = start score of token s + end score of token e.
    span_scores = start_scores.unsqueeze(1) + end_scores.unsqueeze(0)
    positions = torch.arange(span_scores.size(0), device=span_scores.device)
    start_not_after_end = positions.unsqueeze(1) <= positions.unsqueeze(0)
    span_scores = span_scores.masked_fill(~start_not_after_end, neg_inf)

    best_flat_index = int(torch.argmax(span_scores).item())
    answer_start_token, answer_end_token = divmod(best_flat_index, span_scores.size(1))

    start_char: int = offset_mapping[answer_start_token][0]
    end_char: int = offset_mapping[answer_end_token][1]

    model_answer: str = context[start_char:end_char]
    return {
        'model_answer': model_answer,
        'answer_indices': [start_char, end_char]
    }

# Smoke test on the sample passage; the returned dict is shown as the cell output.
get_qa_answer(question=question, context=context)

In [None]:
# Copy the raw validation split into a DataFrame and add empty columns that the
# evaluation loop fills with the model's answer text and its start character index.
validation_df = pd.DataFrame(raw_datasets['validation'])
validation_df['new_answer'] = ''
validation_df['new_start_ind'] = ''

validation_df.head()

In [None]:
# Destination of the predictions table.
# NOTE(review): the path in this cell was an empty string, which `to_csv`
# cannot open; saving is skipped until a real path is set here.
predictions_csv_path: str = ''

total_correct: int = 0
total_incorrect: int = 0


for ind in range(len(validation_df)):
    question: str = validation_df.loc[ind, 'question']
    context: str = validation_df.loc[ind, 'context']

    # Use the first reference answer; tolerate records without any.
    reference_texts = validation_df.loc[ind, 'answers']['text']
    correct_answer: str = reference_texts[0] if len(reference_texts) > 0 else ''

    model_answer: dict = get_qa_answer(question=question, context=context)

    validation_df.loc[ind, 'new_answer'] = model_answer['model_answer']
    validation_df.loc[ind, 'new_start_ind'] = model_answer['answer_indices'][0]

    expected: str = correct_answer.lower()
    predicted: str = model_answer['model_answer'].lower()

    # Lenient match: one string contains the other (case-insensitive).
    # An empty string is a substring of everything, so an empty prediction
    # must only match an empty reference instead of always counting as correct.
    if expected and predicted:
        is_match: bool = expected in predicted or predicted in expected
    else:
        is_match = expected == predicted

    if is_match:
        total_correct += 1

    else:
        total_incorrect += 1

    # Periodic checkpoint so partial results survive an interrupted run.
    if ind%50 == 0:
        print(f'runned {ind} / {len(validation_df)} records')
        if predictions_csv_path:
            validation_df.to_csv(predictions_csv_path, index=False)


if predictions_csv_path:
    validation_df.to_csv(predictions_csv_path, index=False)

In [None]:
# Share of validation records counted as correct by the lenient substring match.
# Raises ZeroDivisionError if the evaluation loop processed no records.
print(total_correct / (total_correct + total_incorrect))