In [48]:
!pip install -r requirements.txt

Collecting huggingface_hub==0.18.0 (from -r requirements.txt (line 5))
  Using cached huggingface_hub-0.18.0-py3-none-any.whl.metadata (13 kB)
Collecting ipywidgets==8.1.1 (from -r requirements.txt (line 6))
  Downloading ipywidgets-8.1.1-py3-none-any.whl.metadata (2.4 kB)
Collecting widgetsnbextension~=4.0.9 (from ipywidgets==8.1.1->-r requirements.txt (line 6))
  Downloading widgetsnbextension-4.0.9-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.9 (from ipywidgets==8.1.1->-r requirements.txt (line 6))
  Downloading jupyterlab_widgets-3.0.9-py3-none-any.whl.metadata (4.1 kB)
INFO: pip is looking at multiple versions of tokenizers to determine which version is compatible with other requirements. This could take a while.
Collecting tokenizers<0.15,>=0.14 (from transformers==4.35.0->-r requirements.txt (line 2))
  Using cached tokenizers-0.14.0-cp39-none-win_amd64.whl.metadata (6.8 kB)
Collecting evaluate==0.4.1 (from -r requirements.txt (line 4))
  Using cached ev

ERROR: Cannot install -r requirements.txt (line 1), -r requirements.txt (line 2), -r requirements.txt (line 4), huggingface_hub==0.18.0, tokenizers==0.14.1 and transformers because these package versions have conflicting dependencies.
ERROR: ResolutionImpossible: for help visit https://pip.pypa.io/en/latest/topics/dependency-resolution/#dealing-with-dependency-conflicts


In [45]:
# https://huggingface.co/abletobetable/distilbert-ru-qa
selected_model = "AlexKay/xlm-roberta-large-qa-multilingual-finedtuned-ru"

In [49]:
import torch
import evaluate

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

In [23]:
sber_dataset = load_dataset("sberquad")
squad_dataset = load_dataset("squad")

In [24]:
squad_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [25]:
print(squad_dataset["validation"][5])

{'id': '56be8e613aeaaa14008c90d1', 'title': 'Super_Bowl_50', 'context': 'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.', 'question': 'What was the theme of Super Bowl 50?', 'answers': {'text': ['"golden anniversary"', 'gold-themed', '"golden anniversary'], 'answer_star

In [26]:
tokenizer = AutoTokenizer.from_pretrained(selected_model)
model = AutoModelForQuestionAnswering.from_pretrained(selected_model)

In [27]:
tokenizer.is_fast

True

In [28]:
context = squad_dataset["train"][5]["context"]
question = squad_dataset["train"][5]["question"]

inputs = tokenizer(question, context)
tokenizer.decode(inputs["input_ids"])

"<s> When did the Scholastic Magazine of Notre dame begin publishing?</s></s> As at most other universities, Notre Dame's students run a number of news media outlets. The nine student-run outlets include three newspapers, both a radio and television station, and several magazines and journals. Begun as a one-page journal in September 1876, the Scholastic magazine is issued twice monthly and claims to be the oldest continuous collegiate publication in the United States. The other magazine, The Juggler, is released twice a year and focuses on student literature and artwork. The Dome yearbook is published annually. The newspapers have varying publication interests, with The Observer published daily and mainly reporting university and other news, and staffed by students from both Notre Dame and Saint Mary's College. Unlike Scholastic and The Dome, The Observer is an independent publication and does not have a faculty advisor or any editorial oversight from the University. In 1987, when som

In [29]:
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=100,
        truncation="only_second",
        stride=50,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [30]:
eval_set = squad_dataset["validation"].select(range(100))
trained_checkpoint = "distilbert-base-cased-distilled-squad"

tokenizer = AutoTokenizer.from_pretrained(trained_checkpoint)
eval_set = eval_set.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=squad_dataset["validation"].column_names,
)

Downloading (…)okenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<00:00, 9.64kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 473/473 [00:00<00:00, 158kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 213k/213k [00:00<00:00, 649kB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 436k/436k [00:00<00:00, 1.20MB/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 1428.59 examples/s]


In [34]:
tokenizer = AutoTokenizer.from_pretrained(selected_model)

In [35]:
eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping"])
eval_set_for_model.set_format("torch")

device = torch.device(
    "cuda") if torch.cuda.is_available() else torch.device("cpu")
batch = {k: eval_set_for_model[k].to(device)
         for k in eval_set_for_model.column_names}
trained_model = AutoModelForQuestionAnswering.from_pretrained(selected_model).to(
    device
)

with torch.no_grad():
    outputs = trained_model(**batch)

In [38]:
start_logits = outputs.start_logits.cpu().numpy()
end_logits = outputs.end_logits.cpu().numpy()

start_logits, end_logits

(array([[-7.450444 , -7.2323074, -7.8797503, ..., -7.69388  , -7.426134 ,
         -7.668083 ],
        [-7.2158074, -6.714641 , -7.325329 , ..., -6.4738503, -5.218007 ,
         -6.730356 ],
        [-7.2993956, -6.6172085, -7.563138 , ..., -6.7138453, -6.971804 ,
         -7.43128  ],
        ...,
        [-7.176881 , -5.935554 , -7.494331 , ..., -7.122738 , -7.130123 ,
         -7.0842752],
        [-7.3994904, -6.1245375, -6.344665 , ..., -7.264229 , -7.142623 ,
         -7.512552 ],
        [-7.30225  , -6.3859806, -6.085725 , ..., -9.943708 , -9.944086 ,
         -7.181176 ]], dtype=float32),
 array([[-5.7932386, -8.008629 , -6.6649036, ..., -5.9831257, -7.311994 ,
         -5.797443 ],
        [-5.440403 , -7.3044457, -5.5259466, ..., -4.038795 , -6.3687253,
         -6.00023  ],
        [-5.451812 , -7.659976 , -6.146906 , ..., -6.221962 , -7.5362883,
         -5.7656736],
        ...,
        [-4.9833417, -8.051199 , -5.7997475, ..., -4.9630194, -4.974517 ,
         -4.9098654

In [39]:
import collections

example_to_features = collections.defaultdict(list)
for idx, feature in enumerate(eval_set):
    example_to_features[feature["example_id"]].append(idx)


example_to_features

defaultdict(list,
            {'56be4db0acb8001400a502ec': [0, 1, 2],
             '56be4db0acb8001400a502ed': [3, 4, 5],
             '56be4db0acb8001400a502ee': [6, 7, 8],
             '56be4db0acb8001400a502ef': [9, 10, 11],
             '56be4db0acb8001400a502f0': [12, 13, 14, 15],
             '56be8e613aeaaa14008c90d1': [16, 17, 18],
             '56be8e613aeaaa14008c90d2': [19, 20, 21],
             '56be8e613aeaaa14008c90d3': [22, 23, 24],
             '56bea9923aeaaa14008c91b9': [25, 26, 27],
             '56bea9923aeaaa14008c91ba': [28, 29, 30],
             '56bea9923aeaaa14008c91bb': [31, 32, 33],
             '56beace93aeaaa14008c91df': [34, 35, 36],
             '56beace93aeaaa14008c91e0': [37, 38, 39],
             '56beace93aeaaa14008c91e1': [40, 41, 42],
             '56beace93aeaaa14008c91e2': [43, 44, 45, 46],
             '56beace93aeaaa14008c91e3': [47, 48, 49],
             '56bf10f43aeaaa14008c94fd': [50, 51, 52, 53],
             '56bf10f43aeaaa14008c94fe': [54,

#### Используя набор данных Sberquad дообучить выбранную модель, оценить качество до и после дообучения