In [1]:
import numpy as np
import pandas as pd

In [2]:
import gdown

In [3]:
import json
from pathlib import Path

def read_squad(path):
    path = Path(path)
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)

    return contexts, questions, answers


In [46]:
# загрузка данных
file_id = '19AeSoUQkL9ao12N7SM3jcsDdhLm_kuFx' # доступ для чтения
gdown.download(id=file_id)
#https://drive.google.com/file/d/19AeSoUQkL9ao12N7SM3jcsDdhLm_kuFx/view?usp=sharing

Downloading...
From: https://drive.google.com/uc?id=19AeSoUQkL9ao12N7SM3jcsDdhLm_kuFx
To: /content/dev-v1.1.json
100%|██████████| 137k/137k [00:00<00:00, 2.64MB/s]


'dev-v1.1.json'

In [5]:
# загрузка данных
file_id = '1lgen59NWzS08wMgvjYnctGerWaJ_YiQX' # доступ для чтения
gdown.download(id=file_id)
#https://drive.google.com/file/d/1lgen59NWzS08wMgvjYnctGerWaJ_YiQX/view?usp=sharing

Downloading...
From: https://drive.google.com/uc?id=1lgen59NWzS08wMgvjYnctGerWaJ_YiQX
To: /content/train_truncated_data.json
100%|██████████| 29.2k/29.2k [00:00<00:00, 38.8MB/s]


'train_truncated_data.json'

In [47]:
train_contexts, train_questions, train_answers = read_squad('train_truncated_data.json')
val_contexts, val_questions, val_answers = read_squad('dev-v1.1.json')

In [48]:
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)


        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        elif context[start_idx-1:end_idx-1] == gold_text:
            answer['answer_start'] = start_idx - 1
            answer['answer_end'] = end_idx - 1
        elif context[start_idx-2:end_idx-2] == gold_text:
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 2


In [49]:
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [50]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)


In [51]:
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))


        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [52]:

import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [53]:

from transformers import DistilBertForQuestionAnswering
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [61]:
from torch.utils.data import DataLoader
from transformers import AdamW

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]
        loss.backward()
        optim.step()

model.eval()

DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
      

In [54]:
from sklearn.metrics import accuracy_score
from tqdm import tqdm

val_loader = DataLoader(val_dataset, batch_size=16)

# Списки для отслеживания метрик валидации
val_loss = []
val_accuracy = []

# Валидация модели
for batch in tqdm(val_loader, desc="Validation"):
    # Подготовка входных данных и меток
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_positions = batch['start_positions'].to(device)
    end_positions = batch['end_positions'].to(device)

    with torch.no_grad():  # Выключение вычисления градиентов
        # Прямой проход
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs.loss
        start_preds = torch.argmax(outputs.start_logits, dim=1)
        end_preds = torch.argmax(outputs.end_logits, dim=1)

        # Обновление метрик
        val_loss.append(loss.item())
        # Оценка точности: простая проверка на совпадение позиций начала и конца
        accuracy = ((start_preds == start_positions) & (end_preds == end_positions)).float().mean()
        val_accuracy.append(accuracy.item())

# Вычисление среднего значения потерь и точности
mean_val_loss = sum(val_loss) / len(val_loss)
mean_val_accuracy = sum(val_accuracy) / len(val_accuracy)

print(f'Validation Loss: {mean_val_loss:.3f}')
print(f'Validation Accuracy: {mean_val_accuracy:.3f}')

Validation: 100%|██████████| 45/45 [05:57<00:00,  7.94s/it]

Validation Loss: 5.792
Validation Accuracy: 0.000





In [58]:
def answer_question(question, context):
    # Токенизация вопроса и контекста
    inputs = tokenizer.encode_plus(question, context, add_special_tokens=True, return_tensors="pt")

    # Получение индексов токенов для входных данных модели
    input_ids = inputs["input_ids"].tolist()[0]

    # Передача токенизированного вопроса и контекста в модель
    with torch.no_grad():
        outputs = model(**inputs)

    # Выделение индексов начала и конца ответа
    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits
    answer_start = torch.argmax(answer_start_scores)
    answer_end = torch.argmax(answer_end_scores) + 1

    # Преобразование индексов обратно в текст ответа
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

    return answer

In [60]:
# Пример использования
question = "How often is Notre Dame's the Juggler published?"
context = "As at most other universities, Notre Dame's students run a number of news media outlets. The nine student-run outlets include three newspapers, both a radio and television station, and several magazines and journals. Begun as a one-page journal in September 1876, the Scholastic magazine is issued twice monthly and claims to be the oldest continuous collegiate publication in the United States. The other magazine, The Juggler, is released twice a year and focuses on student literature and artwork. The Dome yearbook is published annually. The newspapers have varying publication interests, with The Observer published daily and mainly reporting university and other news, and staffed by students from both Notre Dame and Saint Mary's College. Unlike Scholastic and The Dome, The Observer is an independent publication and does not have a faculty advisor or any editorial oversight from the University. In 1987, when some students believed that The Observer began to show a conservative bias, a liberal newspaper, Common Sense was published. Likewise, in 2003, when other students believed that the paper showed a liberal bias, the conservative paper Irish Rover went into production. Neither paper is published as often as The Observer; however, all three are distributed to all students. Finally, in Spring 2008 an undergraduate journal for political science research, Beyond Politics, made its debut."

answer = answer_question(question, context)
print("Answer:", answer)

Answer: , notre dame ' s students run a number of news media outlets. the nine student - run outlets include three newspapers, both a radio and television station, and several magazines and journals. begun as a one - page journal in september 1876, the scholastic magazine is issued twice monthly and claims to be the oldest continuous collegiate publication in the united states. the other magazine, the juggler, is released twice a year and focuses on student literature and artwork. the dome yearbook is published annually. the newspapers have varying publication interests, with the observer published daily and mainly reporting university and other news, and staffed by students from both notre dame and saint mary ' s college. unlike scholastic and the dome, the observer is an independent publication and does not have a faculty advisor or any editorial oversight from the university. in 1987,
