In [1]:
!pip install transformers
!pip install nmslib

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable
Collecting nmslib
  Downloading nmslib-2.1.1.tar.gz (188 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.7/188.7 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting pybind11<2.6.2 (from nmslib)
  Using cached pybind11-2.6.1-py2.py3-none-any.whl (188 kB)
Building wheels for collected packages: nmslib
  Building wheel for nmslib (setup.py) ... [?25ldone
[?25h  Created wheel for nmslib: filename=nmslib-2.1.1-cp310-cp310-linux_x86_64.whl size=13550778 sha256=34ca73a8a96f5b

#### Пример: семантическая близость

In [9]:
from transformers import AutoTokenizer, AutoModel
import torch


#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

In [10]:
sentences = [
    'Позовите оператора поддержки, бот не помог',
    'Нужна помощь человека для решения вопроса',
    'Мне нужна новая карта',
    'Хотел бы выпустить ещё одну карточку',
    'So, the robot was useless I need a human expert',
]

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-xlm-r-multilingual-v1')
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-xlm-r-multilingual-v1')

In [11]:
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

with torch.no_grad():
    model_output = model(**encoded_input)

sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

In [12]:
sentence_embeddings

tensor([[ 0.0319,  0.3350, -0.0372,  ..., -0.0135, -0.1594,  0.2159],
        [ 0.0091,  0.2025,  0.1061,  ...,  0.1398, -0.0556,  0.0987],
        [-0.0028, -0.0662,  0.1808,  ..., -0.3124,  0.1364, -0.1686],
        [ 0.0397,  0.2135, -0.0526,  ...,  0.0632, -0.2187, -0.0405],
        [ 0.1122,  0.4121,  0.1259,  ...,  0.0062, -0.2281, -0.0306]])

In [13]:
import nmslib

index = nmslib.init(method='hnsw', space='cosinesimil')
index.addDataPointBatch(sentence_embeddings[: -1], ids=list(range(len(sentence_embeddings[: -1]))))
index.createIndex({'post': 2}, print_progress=True)


0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
***************************************************

0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
****************************************************************
*

In [14]:
ids, distances = index.knnQuery(sentence_embeddings[-1], k=10)

In [15]:
for i, d in zip(ids, distances):
    print(sentences[i], '\t', d)

Позовите оператора поддержки, бот не помог 	 0.53279954
Нужна помощь человека для решения вопроса 	 0.6821636
Мне нужна новая карта 	 0.7499061
Хотел бы выпустить ещё одну карточку 	 0.9512555


#### Пример: генерация текста

In [16]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('sberbank-ai/rugpt3large_based_on_gpt2')
model = GPT2LMHeadModel.from_pretrained('sberbank-ai/rugpt3large_based_on_gpt2')

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.14G [00:00<?, ?B/s]

In [7]:
text = 'Хочу записатся к врачу'
input_ids = tokenizer.encode(text, return_tensors='pt')

tokens = model.generate(
    input_ids,
    max_length=64,
    repetition_penalty=5.0,
    do_sample=False,
    top_k=5,
    top_p=0.95,
    temperature=1.0,
    num_beams=5,
    no_repeat_ngram_size=4,
)
print([tokenizer.decode(t) for t in tokens])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['Хочу записатся к врачу, но не знаю с чего начать.  Может кто подскажет? \n Добрый день!  У меня такая проблема: у моего мужа после операции по удалению аппендицита обнаружили опухоль в правом подреберье (похоже на рак).  Врач прописал курс антибиотиков']


In [13]:
!unzip archive.zip

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Archive:  archive.zip
  inflating: dataset_test.tsv        
  inflating: dataset_train.tsv       
  inflating: labels_description.txt  
