In [1]:
import json
import pandas as pd

import torch
import numpy as np
from datasets import load_dataset
from transformers import BertTokenizer, BertModel
from scipy.spatial.distance import cosine
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

In [2]:
# Prefer the first CUDA GPU when one is visible to torch; otherwise run on CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)

In [10]:
# Load SberQuAD and build the retrieval corpus: 100 distinct context paragraphs
# drawn from the training split.
dataset = load_dataset("sberquad")
df_train = pd.DataFrame(dataset['train'])
# A fixed random_state keeps the sampled corpus identical across
# "Restart Kernel -> Run All", so results can be reproduced and compared.
subsample = df_train.drop_duplicates('context').sample(100, random_state=42)
paragraphs = subsample['context'].values

In [11]:
def vectors_from_texts(text, tokenizer, model, max_length=512):
    """Embed ``text`` as one fixed-size vector using a BERT-style encoder.

    The text is wrapped in ``[CLS]`` / ``[SEP]``, tokenized, and passed through
    ``model``. The embedding is the mean over all tokens of the second-to-last
    hidden layer.

    Args:
        text: Raw input string.
        tokenizer: Object exposing ``tokenize`` and ``convert_tokens_to_ids``.
        model: Callable accepting ``(input_ids, attention_mask=...)`` whose
            output at index 2 is the sequence of per-layer hidden states
            (the caller in this notebook loads the model with
            ``output_hidden_states=True`` for that purpose).
        max_length: Maximum number of tokens (special tokens included) fed to
            the model. Longer inputs are truncated, keeping the final
            separator token.

    Returns:
        A 1-D ``numpy`` array with the sentence embedding.
    """
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)

    # Truncate over-long inputs: keep the head of the sequence and re-attach the
    # trailing separator so the model still sees a well-formed input.
    if len(tokenized_text) > max_length:
        tokenized_text = tokenized_text[:max_length - 1] + tokenized_text[-1:]

    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Inputs must live on the same device as the model; fall back to the
    # notebook-level `device` when the model does not expose one.
    try:
        target_device = model.device
    except AttributeError:
        target_device = device

    tokens_tensor = torch.tensor([indexed_tokens]).to(target_device)
    # Single unpadded sequence: every position is a real token.
    attention_mask = torch.ones_like(tokens_tensor)

    with torch.no_grad():
        outputs = model(tokens_tensor, attention_mask=attention_mask)
        hidden_states = outputs[2]  # per-layer hidden states

    # Second-to-last layer, first (only) batch element: (tokens, hidden_dim).
    token_vecs = hidden_states[-2][0]

    sentence_embedding = torch.mean(token_vecs, dim=0)
    return sentence_embedding.cpu().numpy()

In [12]:
def retriv(question):
    """Pick the paragraph most likely to contain the answer to ``question``.

    Every entry of the notebook-level ``paragraphs`` array and the question are
    embedded with ``vectors_from_texts``; the paragraph whose embedding has the
    smallest cosine distance to the question embedding is returned.
    """
    model_name = 'DeepPavlov/rubert-base-cased'
    # NOTE(review): do_lower_case=True is combined with a "cased" checkpoint —
    # confirm this is intentional.
    bert_tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)
    bert = BertModel.from_pretrained(model_name, output_hidden_states=True)
    bert.to(device)
    bert.eval()

    paragraph_embeddings = [
        vectors_from_texts(text, bert_tokenizer, bert) for text in paragraphs
    ]
    question_embedding = vectors_from_texts(question, bert_tokenizer, bert)

    distances = [
        cosine(embedding, question_embedding) for embedding in paragraph_embeddings
    ]

    # Index of the first paragraph with the smallest distance to the question.
    best_idx = min(range(len(distances)), key=distances.__getitem__)
    return paragraphs[best_idx]

In [13]:
def reader(question, paragraph):
    """Extract the answer to ``question`` from ``paragraph``.

    The question/paragraph pair is encoded and scored by an extractive
    question-answering model. The tokens between the highest-scoring start and
    end positions are stitched back into text.

    Args:
        question: Question string.
        paragraph: Context string expected to contain the answer.

    Returns:
        The answer text. It starts with a space when the first selected token
        begins a word, and is an empty string when the predicted end position
        precedes the predicted start position.
    """
    model_name = "AlexKay/xlm-roberta-large-qa-multilingual-finedtuned-ru"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Rely on the local Hugging Face cache instead of forcing a fresh download
    # of the checkpoint on every call.
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)

    encoding = tokenizer.encode_plus(text=question, text_pair=paragraph)

    inputs = encoding['input_ids']
    tokens = tokenizer.convert_ids_to_tokens(inputs)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        output = model(input_ids=torch.tensor([inputs]))

    # output[0] / output[1] are the start / end scores over token positions.
    start_index = int(torch.argmax(output[0]))
    end_index = int(torch.argmax(output[1]))

    # A leading "▁" marks the beginning of a new word: turn it into a space.
    # Tokens without the marker continue the previous word.
    pieces = []
    for token in tokens[start_index:end_index + 1]:
        if token.startswith("▁"):
            pieces.append(' ' + token[1:])
        else:
            pieces.append(token)

    return ''.join(pieces)

In [14]:
def task5(question):
    """Answer ``question``: retrieve the best paragraph, then extract the answer from it."""
    return reader(question, retriv(question))

In [15]:
# Demo: draw one random question from the sampled subset, show it, then show
# the answer produced by the full retrieve-and-read pipeline.
question = subsample.sample()['question'].iloc[0]
print(question)
print(task5(question))

Согласно чему эмиссия совзнаков была прекращена?


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


config.json:   0%|          | 0.00/781 [00:00<?, ?B/s]

 декрету СНК СССР
