In [1]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.11.0
  Downloading huggingface_hub-0.14.0-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.2/224.2 kB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 kB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m53.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m104.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, transformers
Successfully installed tokenizers-0.13.3 transformers-4.28.1


In [3]:
from tqdm.auto import tqdm  # for showing progress bar

def add_end_idx(answers, contexts):
    """Attach an ``answer_end`` character index to every SQuAD-style answer.

    Args:
        answers: iterable of dicts with keys ``'text'`` and ``'answer_start'``.
            Each value may be a single-element list (SQuAD layout, e.g.
            ``{'text': ['foo'], 'answer_start': [3]}``) or an already-unwrapped
            scalar, so the function can safely be applied to its own output.
        contexts: iterable of context strings, aligned with ``answers``.

    Returns:
        A new list with one dict per answer holding scalar ``'text'``,
        ``'answer_start'`` and ``'answer_end'`` entries. The input dicts are
        copied, not modified. If the text is not found at ``answer_start``,
        the span is shifted left by 1 or 2 characters (first match wins). If
        no shift matches either, the nominal span
        ``[answer_start, answer_start + len(text))`` is kept, so every record
        carries an ``'answer_end'`` key.
    """
    new_answers = []
    # loop through each answer-context pair (tqdm renders a progress bar)
    for answer, context in tqdm(zip(answers, contexts)):
        # shallow copy: the caller's dicts must stay untouched so re-running is safe
        answer = dict(answer)
        gold_text = answer['text']
        start_idx = answer['answer_start']
        # unwrap the SQuAD single-element lists; scalars pass through unchanged
        if isinstance(gold_text, (list, tuple)):
            gold_text = gold_text[0]
        if isinstance(start_idx, (list, tuple)):
            start_idx = start_idx[0]
        # ideally the answer ends exactly len(text) characters after its start
        end_idx = start_idx + len(gold_text)

        answer['text'] = gold_text
        answer['answer_start'] = start_idx
        answer['answer_end'] = end_idx

        # ...however, sometimes the recorded start is off by a character or two
        if context[start_idx:end_idx] != gold_text:
            for n in (1, 2):
                shifted_start = start_idx - n
                # a negative start would wrap around to the end of the string
                if shifted_start >= 0 and context[shifted_start:end_idx - n] == gold_text:
                    answer['answer_start'] = shifted_start
                    answer['answer_end'] = end_idx - n
                    break
        new_answers.append(answer)
    return new_answers

def prep_data(dataset):
    """Bundle questions, contexts and end-indexed answers from ``dataset``.

    ``dataset`` is indexed with the keys ``'question'``, ``'context'`` and
    ``'answers'``; the answers are passed through ``add_end_idx`` together
    with the contexts. Returns a dict with the keys ``'question'``,
    ``'context'`` and ``'answers'``.
    """
    questions, contexts = dataset['question'], dataset['context']
    return dict(
        question=questions,
        context=contexts,
        answers=add_end_idx(dataset['answers'], contexts),
    )

In [4]:
import pandas as pd
df = pd.read_excel("data4.xlsx", index_col=None)

In [5]:
import numpy as np
from sklearn.model_selection import train_test_split
df_tr, df_eval = train_test_split(df, test_size=0.2, random_state=42)

In [6]:
df_eval

Unnamed: 0,context,question,answer_text,answer_start
80,Request Body\n\nThe request accepts the follow...,canarySettings,canary deployment settings of this stage.,435
4,The stage of order processing (if it has the P...,What does RESERVATION_EXPIRED mean?,The customer has not completed a reserved orde...,579
40,Name Type Description length number Packing le...,What does width mean,Packing width in cm.,195
69,OfferProcessingNoteType Type of the reason why...,NO_PARAMETERS_IN_SHOP_TITLE,"product is produced in different versions, and...",2206
10,"""""""The order processing stage (if it has the P...",RESERVATION_EXPIRED,buyer did not complete the reserved order with...,568
45,OfferProcessingStatusType Product publication ...,NEED_CONTENT,for a product without a SKU on the marketSku M...,227
70,OfferProcessingNoteType Type of the reason why...,NO_SIZE_MEASURE,item requires a size chart.,2420
66,OfferProcessingNoteType Type of the reason why...,NEED_VENDOR,manufacturer of the goods is incorrectly speci...,1949
47,OfferProcessingStatusType Product publication ...,EJECTED,"product has not been moderated, as the Market ...",632
11,"""""""The order processing stage (if it has the P...",SHOP_FAILED,store cannot fulfill the order,646


In [7]:
answer_texts_tr = df_tr['answer_text'].tolist()
answer_starts_tr = df_tr['answer_start'].tolist()

answer_texts_eval = df_eval['answer_text'].tolist()
answer_starts_eval = df_eval['answer_start'].tolist()

In [8]:
# Wrap every answer text / start offset in a single-element list: this is the layout
# prepare_train_features reads (answers["text"][0], answers["answer_start"][0]).
answers_tr = [
    {'text': [text], 'answer_start': [start]}
    for text, start in zip(answer_texts_tr, answer_starts_tr)
]

answers_eval = [
    {'text': [text], 'answer_start': [start]}
    for text, start in zip(answer_texts_eval, answer_starts_eval)
]

In [9]:
import uuid

# One random UUID string per example is used as its `id`; every example of a split
# shares the same constant `title`.
id_array_tr = [str(uuid.uuid4()) for _ in answer_starts_tr]
titles_tr = ["My_dataset_train"] * len(answer_starts_tr)

id_array_eval = [str(uuid.uuid4()) for _ in answer_starts_eval]
titles_eval = ["My_dataset_eval"] * len(answer_starts_eval)


In [10]:
# Column-oriented dicts (one list per field) for the training and evaluation splits.
data_tr = {
    'context': df_tr['context'].tolist(),
    'question': df_tr['question'].tolist(),
    'answers': answers_tr,
    'id': id_array_tr,
    'title': titles_tr,
}

data_eval = {
    'context': df_eval['context'].tolist(),
    'question': df_eval['question'].tolist(),
    'answers': answers_eval,
    'id': id_array_eval,
    'title': titles_eval,
}

In [11]:
print(data_tr['question'][3], '\n', data_tr['answers'][3])

created_at 
 {'text': ['Time of resource creation specified in RFC3339 (Timestamps) format.'], 'answer_start': [510]}


In [12]:
data_eval['answers'][3]

{'text': ['product is produced in different versions, and it is not clear from the specified name which one it is. '],
 'answer_start': [2206]}

In [13]:
#dataset = prep_data(data1)

In [14]:
from datasets.dataset_dict import DatasetDict
from datasets import Dataset

dd_tr = Dataset.from_dict(data_tr)
dd_eval = Dataset.from_dict(data_eval)

In [15]:
data_eval['question']

['canarySettings',
 'What does RESERVATION_EXPIRED mean?',
 'What does width mean',
 'NO_PARAMETERS_IN_SHOP_TITLE',
 'RESERVATION_EXPIRED',
 'NEED_CONTENT',
 'NO_SIZE_MEASURE',
 'NEED_VENDOR',
 'EJECTED',
 'SHOP_FAILED',
 'lastUpdatedDate',
 'width',
 'documentationVersion',
 'status',
 'PICKUP',
 'What does STARTED mean?',
 'BadRequestException',
 'What does UNPAID status mean?',
 'deploymentId',
 'What does NEED_INFO mean?',
 'variables',
 'cacheClusterStatus',
 'USER_CHANGED_MIND']

In [16]:
#model_checkpoint = "AndrewChar/model-QA-5-epoch-RU"
#batch_size = 16

In [17]:
# max_length = 3284 # The maximum length of a feature (question and context)
# doc_stride = 256 # The authorized overlap between two part of the context when splitting it is needed.

In [18]:
data_tr.keys()

dict_keys(['context', 'question', 'answers', 'id', 'title'])

In [19]:
def prepare_train_features(examples):
    """Tokenize question/context pairs and label each feature with answer token positions.

    Relies on the notebook-level names ``tokenizer``, ``pad_on_right``, ``max_length``
    and ``doc_stride``, which must be defined before this function is called.

    Args:
        examples: mapping with the keys ``"question"`` and ``"context"`` (lists of
            strings) and ``"answers"`` (one entry per example, each exposing
            ``"text"`` and ``"answer_start"`` sequences of which element 0 is used).
            NOTE: ``examples["question"]`` is overwritten with left-stripped strings.

    Returns:
        The tokenizer output with ``"overflow_to_sample_mapping"`` and
        ``"offset_mapping"`` removed and two lists added, ``"start_positions"`` and
        ``"end_positions"``: one token index per feature. Features without an answer,
        or whose span does not fully contain the answer, get the CLS token index for both.
    """
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lot of space). So we remove that
    # left whitespace.
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possibly giving several features when a context is long, each of those features having a
    # context that overlaps a bit with the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text (only the first answer is used).
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text: skip forward to the first context token
            # (sequence id 1 when the context is the second sequence, 0 otherwise).
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # End token index of the current span in the text: walk back from the end to the last context token.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                # Each loop overshoots by one token, hence the -1 / +1 corrections when appending.
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

##model_checkpoint = "distilbert-base-cased-distilled-squad"

In [20]:
model_checkpoint = "distilbert-base-cased-distilled-squad"

max_length = 512 # The maximum length of a feature (question and context)
doc_stride = 256

In [21]:
from transformers import AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [22]:
pad_on_right = tokenizer.padding_side == "right"

In [23]:
# NOTE(review): the direct call tokenizes the whole split once, and `.map(...)` then tokenizes it again.
# The direct call also overwrites data_tr['question'] in place (prepare_train_features reassigns
# examples["question"]). features_tr is not referenced again in the visible cells — confirm it is
# needed before keeping the extra pass.
features_tr = prepare_train_features(data_tr)
# Batched map; the original columns are removed so only the tokenizer outputs and labels remain.
tokenized_datasets_tr = dd_tr.map(prepare_train_features, batched=True, remove_columns=dd_tr.column_names)

# Same two steps for the evaluation split (same caveat about the redundant direct call).
features_eval = prepare_train_features(data_eval)
tokenized_datasets_eval = dd_eval.map(prepare_train_features, batched=True, remove_columns=dd_eval.column_names)

Map:   0%|          | 0/91 [00:00<?, ? examples/s]

Map:   0%|          | 0/23 [00:00<?, ? examples/s]

In [24]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

Downloading pytorch_model.bin:   0%|          | 0.00/261M [00:00<?, ?B/s]

[REDACTED: a Hugging Face access token was stored in this cell output — revoke the token and clear the output before sharing]

In [25]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [26]:
from transformers import default_data_collator

data_collator = default_data_collator

In [27]:
batch_size = 16
lr = 1e-6
epochs = 15

In [46]:
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-lr{lr}-epochs{epochs}",
    evaluation_strategy = "epoch",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01,
    push_to_hub=True,
)

In [47]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets_tr,
    eval_dataset=tokenized_datasets_eval,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

/content/distilbert-base-cased-distilled-squad-finetuned-lr1e-06-epochs15 is already a clone of https://huggingface.co/gallyamovi/distilbert-base-cased-distilled-squad-finetuned-lr1e-06-epochs15. Make sure you pull the latest changes with `repo.git_pull()`.


In [48]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,1.097086
2,No log,1.082725
3,No log,1.081452
4,No log,1.072237
5,No log,1.070802
6,No log,1.069283
7,No log,1.070804
8,No log,1.071453
9,No log,1.072058
10,No log,1.076543


TrainOutput(global_step=120, training_loss=0.3421732584635417, metrics={'train_runtime': 89.3757, 'train_samples_per_second': 19.636, 'train_steps_per_second': 1.343, 'total_flos': 229296190556160.0, 'train_loss': 0.3421732584635417, 'epoch': 15.0})

In [49]:
trainer.save_model(model_checkpoint + "_gallyamovi")

Upload file pytorch_model.bin:   0%|          | 1.00/249M [00:00<?, ?B/s]

Upload file runs/Apr24_04-32-48_ec96c80191a1/1682310774.935971/events.out.tfevents.1682310774.ec96c80191a1.260…

Upload file runs/Apr24_04-39-05_ec96c80191a1/1682311150.9725761/events.out.tfevents.1682311150.ec96c80191a1.26…

Upload file runs/Apr24_04-39-05_ec96c80191a1/events.out.tfevents.1682311150.ec96c80191a1.260.6:   0%|         …

Upload file training_args.bin:   0%|          | 1.00/3.62k [00:00<?, ?B/s]

Upload file runs/Apr24_04-32-48_ec96c80191a1/events.out.tfevents.1682310774.ec96c80191a1.260.2:   0%|         …

To https://huggingface.co/gallyamovi/distilbert-base-cased-distilled-squad-finetuned-lr1e-06-epochs15
   163254a..4488c12  main -> main

   163254a..4488c12  main -> main

To https://huggingface.co/gallyamovi/distilbert-base-cased-distilled-squad-finetuned-lr1e-06-epochs15
   4488c12..c671155  main -> main

   4488c12..c671155  main -> main



In [50]:
trainer.push_to_hub()

In [50]:
from transformers import AutoModelForQuestionAnswering

# model_0: the base checkpoint, loaded by the same id that model_checkpoint is set to.
model_0 = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")
# NOTE(review): this repo id ends in "-finetuned-squad", whereas the TrainingArguments cell that was
# run for training names its output "...-finetuned-lr{lr}-epochs{epochs}" — confirm which fine-tuned
# repository is meant to be compared here.
model_1 = AutoModelForQuestionAnswering.from_pretrained("gallyamovi/distilbert-base-cased-distilled-squad-finetuned-squad")

Downloading (…)lve/main/config.json:   0%|          | 0.00/597 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/261M [00:00<?, ?B/s]

In [30]:
#model2 = pipeline(model="gallyamovi/distilbert-base-cased-distilled-squad-finetuned-squad")

In [31]:
# context = r"""
# Возможные значения:

# AREA — область.
# CITY — крупный город.
# CONTINENT — континент.
# COUNTRY — страна.
# DISCTRICT — район города.
# MONORAIL_STATION — станция монорельса.
# OVERSEAS_TERRITORY — отдельная территория какого-либо государства, расположенная в другой части света (например, Ангилья, Гренландия, Бермудские острова и т. д.).
# REGION — регион.
# REPUBLIC — субъект федерации.
# REPUBLIC_AREA — район субъекта федерации.
# SECONDARY_DISTRICT — район города второго уровня (например, для ВАО Москвы районами второго уровня являются Измайлово, Новокосино, Перово и т. д.).
# """
# question = "SECONDARY_DISTRICT"

In [32]:
#%time model2(context=context, question=question)

In [33]:
# question_answerer = pipeline("question-answering", model="gallyamovi/distilbert-base-cased-distilled-squad-finetuned-squad")

# context = r"""
# ... Extractive Question Answering is the task of extracting an answer from a text given a question. An example     of a
# ... question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
# ... a model on a SQuAD task, you may leverage the examples/pytorch/question-answering/run_squad.py script.
# ... """
# result = question_answerer(question="What is a good example of a question answering dataset?",     context=context)

In [51]:
context = r"""
OfferProcessingNoteType Type of the reason why the product did not pass moderation: ASSORTMENT — the product is produced in different versions. Each of them must be described as a separate product (the offer-mapping-entry input parameter of the /campaigns/{campaignId}/offer-mapping-entries/updates POST request or a line in the catalog if you upload products through the store's personal account). CANCELLED - the product has been withdrawn from moderation on your initiative. CONFLICTING_INFORMATION (previously erroneously CONFLICTING) - You have provided conflicting product information. The parameters to be corrected are specified in the payload parameter. DEPARTMENT_FROZEN — the rules for placing products in this category are being processed, so the product cannot be moderated yet. INCORRECT_INFORMATION - The product information you provided is inconsistent with the manufacturer's description. The parameters to be corrected are specified in the payload parameter. LEGAL_CONFLICT - the product did not pass moderation due to legal reasons. For example, it is not officially sold in Russia or you do not have permission to sell it. NEED_CLASSIFICATION_INFORMATION - The information you provided about the product is not enough to classify it. Please check that you have correctly entered the name, category, manufacturer and country of production of the product, as well as the URL of images or pages with descriptions that can be used to identify the product. NEED_INFORMATION - the product has not been sold in Russia before and is not yet available on the Market. You can create a card for it. For more information, see Working with an item card in Market Help for merchants. NEED_PICTURES - product images are needed to identify the product. Send the URL of the product images in the POST request /campaigns/{campaignId}/offer-mapping-entries/updates or upload the updated catalog via the store's personal account. NEED_VENDOR - the manufacturer of the goods is incorrectly specified. 
NO_CATEGORY, NO_KNOWLEDGE — products from the specified category are not yet placed on the Market. If the category appears, the product will be sent for moderation again. NO_PARAMETERS_IN_SHOP_TITLE - the product is produced in different versions, and it is not clear from the specified name which one it is. The parameters to be added to the product name are specified in the payload parameter. NO_SIZE_MEASURE - This item requires a size chart. Send it to support or your manager. Size grid requirements are specified in the payload parameter. UNKNOWN - the product has not been moderated for another reason. Contact support or your manager.
"""

In [82]:
data_eval['question']

['height',
 'What does width mean',
 'What does RESERVATION_EXPIRED mean?',
 'SUSPENDED',
 'PROCESSING',
 'ASSORTMENT',
 'NEED_VENDOR',
 'How to remove an item that cannot be removed',
 'DELISTED',
 'tags',
 'USER_NOT_PAID',
 'What does SUSPENDED mean?',
 'STARTED',
 'ACTIVE',
 'documentationVersion',
 'zone_id',
 'variables',
 'What does STARTED mean?',
 'description',
 'documentationVersion',
 'CANCELED',
 'weight',
 'What does PICKUP status mean?',
 'NEED_CLASSIFICATION_INFORMATION',
 'length']

In [70]:
i = 68
context = data_tr['context'][i]
question = data_tr['question'][i]
ans = data_tr['answers'][i]
question, ans

('height', {'text': ['Packing height in cm.'], 'answer_start': [388]})

In [71]:
context

'Name Type Description length number Packing length in cm. Can be specified up to thousandths, decimal separator is a dot. Example: 65.55. Required if weightDimensions is specified. width number Packing width in cm. It can be specified up to thousandths, the separator between integer and fractional parts is a dot. Example: 50.7. Required if weightDimensions is specified. height number Packing height in cm. It can be specified up to thousandths, the separator between integer and fractional parts is a dot. Example: 20.0. Required if weightDimensions is specified. weight number Weight of goods in kg including packaging (gross). You can specify up to thousandths, the separator of the integer and fractional parts is a dot. Example: 1.001. Required if weightDimensions is specified.'

In [72]:
print('QUESTION: ', question)
print('TRUE ANSWER: ', ans['text'])

QUESTION:  height
TRUE ANSWER:  ['Packing height in cm.']


In [73]:
from transformers import pipeline
# NOTE(review): the repo id says "lr1e-07", while the hyperparameter cell sets lr = 1e-6 and the
# recorded push went to "...-finetuned-lr1e-06-epochs15" — confirm this is the intended model.
question_answerer = pipeline("question-answering", model='gallyamovi/distilbert-base-cased-distilled-squad-finetuned-lr1e-07-epochs15')

# Run extractive QA on the question/context selected above and show only the answer string.
result = question_answerer(question=question, context=context)
result['answer']


'Packing height in cm. It'

In [69]:
from transformers import pipeline
question_answerer = pipeline("question-answering", model='distilbert-base-cased-distilled-squad')


result = question_answerer(question=question, context=context)
result['answer']


'Send it to support or your manager'

In [48]:
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-squad",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

In [49]:
from transformers import default_data_collator

data_collator = default_data_collator

In [None]:
from huggingface_hub import notebook_login
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# `tokenized_datasets` is never defined in this notebook; the tokenized splits are named
# tokenized_datasets_tr / tokenized_datasets_eval. The evaluation split is passed as well because
# `args` requests evaluation_strategy="epoch", which needs an eval_dataset.
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets_tr,
    eval_dataset=tokenized_datasets_eval,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

/content/model-QA-5-epoch-RU-finetuned-squad is already a clone of https://huggingface.co/gallyamovi/model-QA-5-epoch-RU-finetuned-squad. Make sure you pull the latest changes with `repo.git_pull()`.


In [None]:
trainer.save_model(model_checkpoint + "_gallyamovi")

To https://huggingface.co/gallyamovi/model-QA-5-epoch-RU-finetuned-squad
   b637db7..bc49216  main -> main

   b637db7..bc49216  main -> main



In [None]:
trainer.push_to_hub()

In [None]:
from transformers import AutoModelForQuestionAnswering

model = AutoModelForQuestionAnswering.from_pretrained("gallyamovi/model-QA-5-epoch-RU-finetuned-squad")

Downloading (…)lve/main/config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/539M [00:00<?, ?B/s]

In [None]:
context = "Путин президент России. Барак Обама был президентом США. Иван студент МФТИ."
question = "Путин"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

context, question


('Путин президент России. Барак Обама был президентом США. Иван студент МФТИ.',
 'Путин')

In [None]:
inputs = tokenizer(question, context, return_tensors="pt")

In [None]:
import torch

with torch.no_grad():
    outputs = model(**inputs)

In [None]:
import tensorflow as tf

# The logits are PyTorch tensors (inputs were built with return_tensors="pt" and the forward pass
# ran under torch.no_grad()), so take the argmax with the tensor's own method instead of
# round-tripping through TensorFlow.
answer_start_index = int(outputs.start_logits.argmax(dim=-1)[0])
answer_end_index = int(outputs.end_logits.argmax(dim=-1)[0])

# Decode the token ids between the predicted start and end (inclusive) back into text.
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)

'Путин президент России'

In [None]:
context = r"""
Возможные значения:

AREA — область.
CITY — крупный город.
CONTINENT — континент.
COUNTRY — страна.
DISCTRICT — район города.
MONORAIL_STATION — станция монорельса.
OVERSEAS_TERRITORY — отдельная территория какого-либо государства, расположенная в другой части света (например, Ангилья, Гренландия, Бермудские острова и т. д.).
REGION — регион.
REPUBLIC — субъект федерации.
REPUBLIC_AREA — район субъекта федерации.
SECONDARY_DISTRICT — район города второго уровня (например, для ВАО Москвы районами второго уровня являются Измайлово, Новокосино, Перово и т. д.).
"""
question = "SECONDARY_DISTRICT"

# Encode the question/context pair as PyTorch tensors and run a forward pass without gradients.
inputs = tokenizer(question, context, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# The logits are PyTorch tensors, so use the tensor's own argmax rather than TensorFlow's.
answer_start_index = int(outputs.start_logits.argmax(dim=-1)[0])
answer_end_index = int(outputs.end_logits.argmax(dim=-1)[0])

# Token ids between the predicted start and end (inclusive).
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]

print(question + '\n\n')

tokenizer.decode(predict_answer_tokens)

SECONDARY_DISTRICT




'район города второго уровня'

In [None]:
model2 = pipeline(model='gallyamovi/model-QA-5-epoch-RU-finetuned-squad')

Downloading (…)okenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.92M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
%time model2(context=context, question=question)

CPU times: user 549 ms, sys: 3.18 ms, total: 552 ms
Wall time: 637 ms


{'score': 0.5618088841438293,
 'start': 439,
 'end': 466,
 'answer': 'район города второго уровня'}

In [None]:
model3 = pipeline(model='AndrewChar/model-QA-5-epoch-RU')

Some layers from the model checkpoint at AndrewChar/model-QA-5-epoch-RU were not used when initializing TFDistilBertForQuestionAnswering: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForQuestionAnswering were not initialized from the model checkpoint at AndrewChar/model-QA-5-epoch-RU and are newly initialized: ['dropout_99']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
%time model3(context=context, question=question)

CPU times: user 1.01 s, sys: 13 ms, total: 1.02 s
Wall time: 1.03 s


{'score': 0.5618084073066711,
 'start': 439,
 'end': 466,
 'answer': 'район города второго уровня'}

In [None]:
context = r"""

Name
Type
Description
id
integer
Идентификатор региона.
name*
string
Название региона.
type*
RegionType
Тип региона.

Возможные значения:

AREA — область.
CITY — крупный город.
CONTINENT — континент.
COUNTRY — страна.
DISCTRICT — район города.
MONORAIL_STATION — станция монорельса.
OVERSEAS_TERRITORY — отдельная территория какого-либо государства, расположенная в другой части света (например, Ангилья, Гренландия, Бермудские острова и т. д.).
REGION — регион.
REPUBLIC — субъект федерации.
REPUBLIC_AREA — район субъекта федерации.
SECONDARY_DISTRICT — район города второго уровня (например, для ВАО Москвы районами второго уровня являются Измайлово, Новокосино, Перово и т. д.).
SETTLEMENT — поселение.
SUBURB — пригород.
SUBWAY_STATION — станция метро.
TOWN — город.
UNKNOWN — неизвестный регион.
parent
RegionDTO
Информация о родительском регионе.

Указываются родительские регионы до уровня страны.
children
RegionDTO[]
Дочерние регионы.
RegionType

Тип региона.

Возможные значения:

CITY_DISTRICT — район города.
CITY — крупный город.
CONTINENT — континент.
COUNTRY_DISTRICT — область.
COUNTRY — страна.
REGION — регион.
REPUBLIC_AREA — район субъекта федерации.
REPUBLIC — субъект федерации.
SUBWAY_STATION — станция метро.
VILLAGE — город.
OTHER — неизвестный регион.

"""

In [None]:
question = "район субъекта федерации"

In [None]:
%time model2(context=context, question=question)

CPU times: user 3.81 s, sys: 0 ns, total: 3.81 s
Wall time: 3.82 s


{'score': 0.27811652421951294,
 'start': 3034,
 'end': 3042,
 'answer': 'REPUBLIC'}

In [None]:
%time model3(context=context, question=question)

CPU times: user 8.75 s, sys: 93.1 ms, total: 8.84 s
Wall time: 7.24 s


{'score': 0.27811765670776367,
 'start': 3034,
 'end': 3042,
 'answer': 'REPUBLIC'}

In [None]:
import requests

from bs4 import BeautifulSoup
from bs4.element import Tag

blocks = ["p", "h1", "h2", "h3", "h4", "h5", "blockquote"]
from transformers import pipeline

def _extract_blocks(parent_tag) -> list:
    """Collect, in document order, the descendants of ``parent_tag`` named in ``blocks``.

    A child whose ``name`` is listed in the module-level ``blocks`` is collected as is
    and not searched further. Any other child that is a ``Tag`` with non-empty
    ``contents`` is searched recursively. Everything else is skipped.
    """
    found = []
    for node in parent_tag:
        if node.name in blocks:
            found.append(node)
        elif isinstance(node, Tag) and len(node.contents) > 0:
            # extending with an empty result is a no-op, so no emptiness check is needed
            found.extend(_extract_blocks(node))
    return found

def to_plaintext(html_text: str) -> str:
    """Reduce an HTML document to the text of its block-level elements.

    Args:
        html_text: HTML markup, parsed with the ``lxml`` parser.

    Returns:
        The stripped text of every element found by ``_extract_blocks`` under
        ``<body>``, joined with newlines. Returns an empty string when the parsed
        document has no ``<body>`` (for example, for empty input).
    """
    soup = BeautifulSoup(html_text, features="lxml")
    body = soup.body
    # Without a <body>, soup.body is None and iterating over it would raise TypeError.
    if body is None:
        return ""
    extracted_blocks = _extract_blocks(body)
    return "\n".join(block.get_text().strip() for block in extracted_blocks)

def get_text(url, timeout=30):
    """Download ``url`` and return the plain text of its block-level elements.

    Args:
        url: address of the page to fetch.
        timeout: seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely.

    Returns:
        The page converted by ``to_plaintext``.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so an
            error page is never silently used as the QA context.
        requests.RequestException: on connection problems or timeouts.
    """
    page = requests.get(url, timeout=timeout)
    page.raise_for_status()
    return to_plaintext(page.text)

url = 'https://yandex.ru/dev/market/partner-api/doc/ru/reference/regions/searchRegionChildren'
text = get_text(url)

print(text[:100]) 

Информация о дочерних регионах
Возвращает информацию о регионах, являющихся дочерними по отношению к


In [None]:
context = text


In [None]:
%time model2(context=context, question=question)

CPU times: user 3.85 s, sys: 0 ns, total: 3.85 s
Wall time: 3.84 s


{'score': 0.6036526560783386,
 'start': 2170,
 'end': 2214,
 'answer': 'отдельная территория какого-либо государства'}

In [None]:
%time model3(context=context, question=question)

CPU times: user 8.72 s, sys: 72.9 ms, total: 8.8 s
Wall time: 9.83 s


{'score': 0.6036524176597595,
 'start': 2170,
 'end': 2214,
 'answer': 'отдельная территория какого-либо государства'}

In [173]:
from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering
import tensorflow as tf
from transformers import AutoTokenizer, BertTokenizer
import torch

text = r"""
gRPC code	gRPC status	HTTP code	Error description
1	CANCELLED	499	The operation was aborted on the client side.
2	UNKNOWN	500	Unknown error.
3	INVALID_ARGUMENT	400	Incorrect request parameters specified. Details are provided in the details field.
4	DEADLINE_EXCEEDED	504	The server response timed out.
5	NOT_FOUND	404	The requested resource not found.
6	ALREADY_EXISTS	409	Attempt to create a resource that already exists.
7	PERMISSION_DENIED	403	The user has no permissions required to perform the operation.
8	RESOURCE_EXHAUSTED	429	The request limit exceeded.
9	FAILED_PRECONDITION	400	The operation was canceled as its preconditions were not met. Examples: an attempt to delete a non-empty folder or to run the rmdir command for an object that is not a folder.
10	ABORTED	409	The operation failed due to a concurrent computing conflict, such as an invalid sequence of commands or an aborted transaction.
11	OUT_OF_RANGE	400	Out of range. For example, searching or reading outside of the file.
12	NOT_IMPLEMENTED	501	The operation is not supported by the service.
13	INTERNAL	500	Internal server error. This error means that the operation cannot be performed due to a server-side technical problem. For example, due to insufficient computing resources.
14	UNAVAILABLE	503	The service is currently unavailable. Try again in a few seconds.
15	DATA_LOSS	500	Permanent data loss or damage.
16	UNAUTHENTICATED	401	The operation requires authentication.
"""
question = "OUT_OF_RANGE"

# Encode the question/table pair as PyTorch tensors and run the base model without gradients.
# `model0` is not defined anywhere in this notebook; the base checkpoint is loaded as `model_0`.
inputs = tokenizer(question, text, return_tensors="pt")
with torch.no_grad():
    outputs = model_0(**inputs)

# The logits are PyTorch tensors, so use the tensor's own argmax rather than TensorFlow's.
answer_start_index = int(outputs.start_logits.argmax(dim=-1)[0])
answer_end_index = int(outputs.end_logits.argmax(dim=-1)[0])

# Token ids between the predicted start and end (inclusive).
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]

print(question + '\n\n')

tokenizer.decode(predict_answer_tokens)

OUT_OF_RANGE




'[CLS]'

In [150]:
outputs

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[ -6.4811,  -6.0346,  -8.2896,  -8.3321,  -5.4741, -10.1977,  -5.6380,
          -8.4348,  -1.1405,   0.7115,  -4.5685,  -5.2944,  -3.2320,  -1.0997,
          -8.0167,  -6.6972,  -8.4444,  -7.2350,  -5.6081,  -7.1601,  -4.3694,
         -10.2164,  -4.4498,  -7.8633,  -9.6116, -10.3542, -10.1072,  -6.0033,
          -7.3285,  -7.6127,  -6.5942, -11.3136,  -9.6610,  -7.7261,  -7.9657,
          -4.4977, -10.4202,  -4.9833,  -8.3430,  -9.7261, -10.1485, -10.2609,
          -5.7992,  -8.8550,  -6.2583,  -8.3606, -10.9817,  -7.5700,  -5.6310,
          -8.6464,  -8.8597,  -5.2637, -10.8880,  -5.9670,  -8.9884, -10.2281,
         -11.0610, -10.8865,  -7.7926,  -8.3321]]), end_logits=tensor([[ -4.9181, -10.1890,  -8.7844,  -8.6567,  -9.3095,  -7.6696,  -8.7831,
          -5.5539,  -6.9771,  -1.9832,  -2.8869,   0.9508,  -7.0594,  -6.3127,
          -1.7459,  -0.4897,  -9.0770,  -9.1606,  -1.1910,  -3.7562,  -7.5693,
         -10.23