### Ноутбук создан на основе семинара Гусева Ильи на кафедре компьютерной лингвистики МФТИ.

In [None]:
!wget -q https://www.dropbox.com/s/43l702z5a5i2w8j/gazeta_train.txt
!wget -q https://www.dropbox.com/s/k2egt3sug0hb185/gazeta_val.txt
!wget -q https://www.dropbox.com/s/3gki5n5djs9w0v6/gazeta_test.txt

In [0]:
!pip install --upgrade razdel allennlp torch fasttext OpenNMT-py networkx pymorphy2 nltk rouge==0.3.1 summa
!pip install transformers youtokentome catalyst
!pip install flair

In [None]:
from transformers import BertTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModelWithLMHead
from nltk.translate.bleu_score import corpus_bleu
from rouge import Rouge
import torch
import random
from scipy.spatial import distance
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence as pack
from torch.nn.utils.rnn import pad_packed_sequence as unpack

import pandas as pd

In [0]:
# Function for reading text files

def read_gazeta_records(file_name, shuffle=True, sort_by_date=False):
    assert shuffle != sort_by_date
    records = []
    with open(file_name, "r") as r:
        for line in r:
            records.append(eval(line)) # Simple hack
    records = pd.DataFrame(records)
    if sort_by_date:
        records = records.sort("date")
    if shuffle:
        records = records.sample(frac=1)
    return records

In [0]:
# Read text files: train, validation, test

train_records = read_gazeta_records("gazeta_train.txt")
val_records = read_gazeta_records("gazeta_val.txt")
test_records = read_gazeta_records("gazeta_test.txt")

In [0]:
# Function to calculate scores: Bleu & Rouge

def calc_scores(references, predictions, metric="all"):
    print("Count:", len(predictions))
    print("Ref:", references[-1])
    print("Hyp:", predictions[-1])

    if metric in ("bleu", "all"):
        print("BLEU: ", corpus_bleu([[r] for r in references], predictions))
    if metric in ("rouge", "all"):
        rouge = Rouge()
        scores = rouge.get_scores(predictions, references, avg=True)
        print("ROUGE: ", scores)

### TextRank method

In [0]:
# Load FastText sentence embeddings trained on wiki ru, it is a mean over word embeddings

from flair.embeddings import WordEmbeddings, BytePairEmbeddings, StackedEmbeddings
from flair.data import Sentence
from flair.embeddings import DocumentPoolEmbeddings

pooled_embeddings = DocumentPoolEmbeddings([WordEmbeddings('ru')], pooling='mean')

In [0]:
# Load DeepPavlov Bert embeddings and tokenizer

tokenizer_bert = BertTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
model_bert = AutoModelWithLMHead.from_pretrained("DeepPavlov/rubert-base-cased")

In [17]:
from itertools import combinations
import networkx as nx
import numpy as np
import torch
import razdel
import pymorphy2


 
def unique_words_similarity(words1, words2):
    '''
    Function to calculate sentences similarity based on words intersection
    ''' 
    words1 = set(words1)
    words2 = set(words2)
    if not len(words1) or not len(words2):
        return 0.0
    return len(words1.intersection(words2))/(np.log10(len(words1)) + np.log10(len(words2)))

def similarity_FastTextFlair(words1, words2):
    '''
    Function to calculate sentences similarity based on cosine distance between FastText embeddings (Flair)
    ''' 
    sentence1 = Sentence(' '.join(words1))
    sentence2 = Sentence(' '.join(words2))

    # embed with pooled embeddings
    with torch.no_grad():
        pooled_embeddings.embed(sentence1)
        pooled_embeddings.embed(sentence2)

    vec1 = sentence1.embedding.cpu()
    vec2 = sentence2.embedding.cpu()
    return distance.cosine(vec1, vec2) # np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

def similarity_FastText(words1, words2):
    '''
    Function to calculate sentences similarity based on cosine distance between FastText embeddings (ft_native_300_ru_wiki_lenta_lower_case)
    ''' 
    sentence1 = ' '.join(words1)
    sentence2 = ' '.join(words2)

    # embed with pooled embeddings
    vec1 = model_ft.get_sentence_vector(sentence1)
    vec2 = model_ft.get_sentence_vector(sentence2)

    # To CPU
    # vec1 = vec1.cpu()
    # vec2 = vec2.cpu()
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

def similarity_DeepPavlovBert(words1,  words2):
    '''
    Function to calculate sentences similarity based on cosine distance between DeepPavlov embeddings
    ''' 
    input_ids1, input_ids2 = torch.tensor([tokenizer_bert.encode(words1)]), torch.tensor([tokenizer_bert.encode(words2)])
    with torch.no_grad():
        output_tuple1, output_tuple2 = model_bert(input_ids1), model_bert(input_ids2)
        last_hidden_states1, last_hidden_states2 = output_tuple1[0], output_tuple2[0]


    cos = torch.nn.CosineSimilarity(1)
    # print(last_hidden_states2.mean(1).shape)
    dist_s = cos(last_hidden_states2.mean(1), last_hidden_states1.mean(1))
 
    # dist_s = scipy.spatial.distance.cdist([emb1], emb2, "cosine")[0]
    return dist_s.item()
    

def gen_text_rank_summary(text, calc_similarity=unique_words_similarity, summary_part=0.1, lower=True, morph=None):
    '''
    Summary generation using TextRank
    '''
    # Split text in sentences
    sentences = [sentence.text for sentence in razdel.sentenize(text)]
    n_sentences = len(sentences)

    # Tokenize sentences
    sentences_words = [[token.text.lower() if lower else token.text for token in razdel.tokenize(sentence)] for sentence in sentences]

    # Lemmatize words if necessary 
    if morph is not None:
        sentences_words = [[morph.parse(word)[0].normal_form for word in words] for words in sentences_words]

    # For each sentences pair calculate similarity
    pairs = combinations(range(n_sentences), 2)
    scores = [(i, j, calc_similarity(sentences_words[i], sentences_words[j])) for i, j in pairs]
    

    # Build a graph with edges that are equal similarity value between sentences
    g = nx.Graph()
    g.add_weighted_edges_from(scores)

    # Calculate PageRank
    pr = nx.pagerank(g)
    result = [(i, pr[i], s) for i, s in enumerate(sentences) if i in pr]
    result.sort(key=lambda x: x[1], reverse=True)

    # Choose top sentences
    n_summary_sentences = max(int(n_sentences * summary_part), 1)
    result = result[:n_summary_sentences]

    # Restore their original order
    result.sort(key=lambda x: x[0])

    # Restore summary text
    predicted_summary = " ".join([sentence for i, proba, sentence in result])
    predicted_summary = predicted_summary.lower() if lower else predicted_summary
    return predicted_summary

def calc_text_rank_score(records, calc_similarity=unique_words_similarity, summary_part=0.1, lower=True, nrows=1000, morph=None):
    '''
    Function to calculate TextRank score
    '''
    references = []
    predictions = []

    for text, summary in records[['text', 'summary']].values[:nrows]:
        summary = summary if not lower else summary.lower()
        references.append(summary)

        predicted_summary = gen_text_rank_summary(text, calc_similarity, summary_part, lower, morph=morph)
        text = text if not lower else text.lower()
        predictions.append(predicted_summary)

    calc_scores(references, predictions)

# calc_text_rank_score(test_records, calc_similarity=unique_words_similarity)
calc_text_rank_score(test_records.iloc[:1], calc_similarity=similarity_DeepPavlovBert)

Count: 1
Ref: евросоюз тянет с санкциями против турции, поэтому кипр заблокировал введение ограничительных мер против нескольких чиновников из крыма, сообщает reuters со ссылкой на три дипломатических источника. никосия отрицает, что связывает эти два вопроса, и говорит, что ей просто нужно время для анализа предложенных ограничительных мер против россиян.
Hyp: это тем более удивительно, поскольку кипр пользовался беспрецедентной поддержкой и солидарностью со своими партнерами по ес в течение последних нескольких месяцев — в споре с турцией», — сказал один из дипломатов ес. в июне 2019 года тогдашний председатель еврокомиссии жан-клод юнкер заявил, что действия турецкой стороны в иэз кипра неприемлемы, поэтому анкара будет наказана жесткими санкциями.
BLEU:  0.351127959006569
ROUGE:  {'rouge-1': {'f': 0.19565216897211735, 'p': 0.17647058823529413, 'r': 0.21951219512195122}, 'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0}, 'rouge-l': {'f': 0.12745817871463636, 'p': 0.11764705882352941, 'r': 0

### Extractive RNN 

In [7]:
# Load DeepPavlov FastText embeddings
!wget http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_lower_case/ft_native_300_ru_wiki_lenta_lower_case.bin

--2020-05-25 08:25:10--  http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_lower_case/ft_native_300_ru_wiki_lenta_lower_case.bin
Resolving files.deeppavlov.ai (files.deeppavlov.ai)... 93.175.29.74
Connecting to files.deeppavlov.ai (files.deeppavlov.ai)|93.175.29.74|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6216266514 (5.8G) [application/octet-stream]
Saving to: ‘ft_native_300_ru_wiki_lenta_lower_case.bin’


2020-05-25 09:39:56 (1.32 MB/s) - ‘ft_native_300_ru_wiki_lenta_lower_case.bin’ saved [6216266514/6216266514]



In [8]:
# Load bin file
model_ft = fasttext.load_model('ft_native_300_ru_wiki_lenta_lower_case.bin')




In [0]:
import copy
import razdel
import random

def build_oracle_summary_greedy(text, gold_summary, calc_score, lower=True, max_sentences=30):
    '''
    Жадное построение oracle summary
    '''
    gold_summary = gold_summary.lower() if lower else gold_summary
    # Split text in sentences
    sentences = [sentence.text.lower() if lower else sentence.text for sentence in razdel.sentenize(text)][:max_sentences]
    n_sentences = len(sentences)
    oracle_summary_sentences = set()
    score = -1.0
    summaries = []
    for _ in range(min(n_sentences, 2)):
        for i in range(n_sentences):
            if i in oracle_summary_sentences:
                continue
            current_summary_sentences = copy.copy(oracle_summary_sentences)
            # Add some sentences to the existing summary
            current_summary_sentences.add(i)
            current_summary = " ".join([sentences[index] for index in sorted(list(current_summary_sentences))])
            # Count metrics
            current_score = calc_score(current_summary, gold_summary)
            summaries.append((current_score, current_summary_sentences))
        # If metrics have improved with addition of a sentence, then try to add more
        # Else break
        best_summary_score, best_summary_sentences = max(summaries)
        if best_summary_score <= score:
            break
        oracle_summary_sentences = best_summary_sentences
        score = best_summary_score
    oracle_summary = " ".join([sentences[index] for index in sorted(list(oracle_summary_sentences))])
    return oracle_summary, oracle_summary_sentences

def calc_single_score(pred_summary, gold_summary, rouge):
    return rouge.get_scores([pred_summary], [gold_summary], avg=True)['rouge-2']['f']

In [10]:
from tqdm import tqdm_notebook as tqdm

# Calculate score of oracle summarization

def calc_oracle_score(records, nrows=1000, lower=True):
    references = []
    predictions = []
    rouge = Rouge()
  
    for text, summary in tqdm(records[['text', 'summary']].values[:nrows]):
        summary = summary if not lower else summary.lower()
        references.append(summary)
        predicted_summary, _ = build_oracle_summary_greedy(text, summary, calc_score=lambda x, y: calc_single_score(x, y, rouge))
        predictions.append(predicted_summary)

    calc_scores(references, predictions)

calc_oracle_score(test_records)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


Count: 1000
Ref: глава роскосмоса дмитрий рогозин раскритиковал шутку ивана урганта об упрощении требований к кандидатам в космонавты. гендиректор госкорпорации заявил об утрате ведущим чувства юмора после восьми лет непрерывной работы на шоу «вечерний ургант».
Hyp: генеральный директор роскосмоса дмитрий рогозин прокомментировал шутку телеведущего ивана урганта об упрощении подготовки космонавтов. ранее глава госкорпорации заявил о переговорах с центром подготовки космонавтов об отказе от некоторых требований к кандидатам.
BLEU:  0.43106535769975873
ROUGE:  {'rouge-1': {'f': 0.3522902373664272, 'p': 0.4307700039374759, 'r': 0.31660276824452355}, 'rouge-2': {'f': 0.2032850093782878, 'p': 0.25546554153070994, 'r': 0.18111381667522672}, 'rouge-l': {'f': 0.2997182027550175, 'p': 0.39783483642945483, 'r': 0.2913698482908416}}


In [11]:
from rouge import Rouge

def add_oracle_summary_to_records(records, max_sentences=30, lower=True, nrows=1000):
    rouge = Rouge()
    sentences_ = []
    oracle_sentences_ = []
    oracle_summary_ = []
    records = records.iloc[:nrows].copy()

    for text, summary in tqdm(records[['text', 'summary']].values):
        summary = summary.lower() if lower else summary
        sentences = [sentence.text.lower() if lower else sentence.text for sentence in razdel.sentenize(text)][:max_sentences]
        oracle_summary, sentences_indicies = build_oracle_summary_greedy(text, summary, calc_score=lambda x, y: calc_single_score(x, y, rouge),
                                                                         lower=lower, max_sentences=max_sentences)
        sentences_ += [sentences]
        oracle_sentences_ += [list(sentences_indicies)]
        oracle_summary_ += [oracle_summary]
    records['sentences'] = sentences_
    records['oracle_sentences'] = oracle_sentences_
    records['oracle_summary'] = oracle_summary_
    return records

ext_train_records = add_oracle_summary_to_records(train_records, nrows=4096)
ext_val_records = add_oracle_summary_to_records(val_records, nrows=256)
ext_test_records = add_oracle_summary_to_records(test_records, nrows=256)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


HBox(children=(FloatProgress(value=0.0, max=4096.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=256.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=256.0), HTML(value='')))




In [0]:
import random
import math
import torch
import numpy as np
from rouge import Rouge


from torch.utils import data

# Dataset generating 

class ExtDataset(data.Dataset):
    def __init__(self, records, lower=True, max_sentences=30, max_sentence_length=50, device=torch.device('cpu')):
        self.records = records
        self.num_samples = records.shape[0]
        self.lower = lower
        self.rouge = Rouge()
        self.max_sentences = max_sentences
        self.max_sentence_length = max_sentence_length
        self.device = device
        
    def __len__(self):
        return self.records.shape[0]

    def __getitem__(self, idx):
        cur_record = self.records.iloc[idx]
        inputs = list()
        # 1: DeepPavlov
        for sent in cur_record['sentences']:
            emb_sent = torch.tensor(model_e.get_sentence_vector(sent))
            inputs.append(emb_sent.cpu())
        # 2: Flair
        # for sent in cur_record['sentences']:
        #     sentence = Sentence(sent)
        #     with torch.no_grad():
        #         pooled_embeddings.embed(sentence)

            # inputs.append(sentence.embedding.cpu())
        # inputs = list(map(lambda x: x[:self.max_sentence_length], self.bpe_processor.encode(cur_record['sentences'], output_type=yttm.OutputType.ID)))
        outputs = [int(i in cur_record['oracle_sentences']) for i in range(len(cur_record['sentences']))]
        return {'inputs': inputs, 'outputs': outputs}

In [0]:
train_dataset = ExtDataset(ext_train_records)

In [0]:
# Function to generate batches
def collate_fn(records):
    max_length = max(len(sentence) for record in records for sentence in record['inputs'])
    max_sentences = max(len(record['outputs']) for record in records)

    new_inputs = torch.zeros((len(records), max_sentences, max_length))
    new_outputs = torch.zeros((len(records), max_sentences))
    for i, record in enumerate(records):
        for j, sentence in enumerate(record['inputs']):
            new_inputs[i, j, :len(sentence)] += np.array(sentence)
        new_outputs[i, :len(record['outputs'])] += np.array(record['outputs'])
    return {'features': new_inputs.type(torch.LongTensor), 'targets': new_outputs}

In [31]:
class SentenceTaggerRNN(nn.Module):
    def __init__(self,
                 token_embedding_dim=300,
                 sentence_encoder_hidden_size=300,
                 hidden_size=300,
                 bidirectional=True,
                 sentence_encoder_n_layers=2,
                 sentence_encoder_dropout=0.2,
                 sentence_encoder_bidirectional=True,
                 n_layers=1,
                 dropout=0.2):
        super(SentenceTaggerRNN, self).__init__()

        num_directions = 2 if bidirectional else 1
        assert hidden_size % num_directions == 0
        hidden_size = hidden_size // num_directions

        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.bidirectional = bidirectional

        
        self.rnn_layer = nn.LSTM(sentence_encoder_hidden_size, hidden_size, n_layers, dropout=dropout,
                           bidirectional=bidirectional, batch_first=True)
        self.dropout_layer = nn.Dropout(dropout)
        self.content_linear_layer = nn.Linear(hidden_size * 2, 1)
        self.document_linear_layer = nn.Linear(hidden_size * 2, hidden_size * 2)
        self.salience_linear_layer = nn.Linear(hidden_size * 2, hidden_size * 2)
        self.tanh_layer = nn.Tanh()

    def forward(self, inputs, hidden=None):
        batch_size = inputs.size(0)
        sentences_count = inputs.size(1)
        tokens_count = inputs.size(2)
        inputs = inputs.reshape(-1, tokens_count)
        embedded_sentences = inputs
        embedded_sentences = embedded_sentences.reshape(batch_size, sentences_count, -1)
        outputs, _ = self.rnn_layer(embedded_sentences.float(), hidden)
        outputs = self.dropout_layer(outputs)
        document_embedding = self.tanh_layer(self.document_linear_layer(torch.mean(outputs, 1)))
        content = self.content_linear_layer(outputs).squeeze(2)
        salience = torch.bmm(outputs, self.salience_linear_layer(document_embedding).unsqueeze(2)).squeeze(2)
        m = torch.nn.Softmax()
        return m(content + salience)

model = SentenceTaggerRNN()


dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1, but got dropout=0.2 and num_layers=1



### Train

In [32]:
import catalyst
from catalyst.dl.runner import SupervisedRunner

device = torch.device('cuda')

loaders = {
    'train': data.DataLoader(ExtDataset(ext_train_records), batch_size=128, collate_fn=collate_fn),
    'valid': data.DataLoader(ExtDataset(ext_val_records), batch_size=128, collate_fn=collate_fn),
    'test': data.DataLoader(ExtDataset(ext_test_records), batch_size=128, collate_fn=collate_fn),
}

lr = 1e-3
num_epochs = 7

optimizer  = torch.optim.Adam(model.parameters(), lr=lr)
criterion = nn.BCEWithLogitsLoss()
runner = SupervisedRunner()
runner.train(
    model=model,
    optimizer=optimizer,
    loaders=loaders,
    logdir='./logs',
    num_epochs=num_epochs,
    criterion=criterion,
    verbose=True
)






1/7 * Epoch (train):   0% 0/32 [00:00<?, ?it/s][A[A[A[A



1/7 * Epoch (train):   0% 0/32 [00:02<?, ?it/s, loss=0.708][A[A[A[A



1/7 * Epoch (train):   3% 1/32 [00:02<01:25,  2.76s/it, loss=0.708][A[A[A[A


Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.







1/7 * Epoch (train):   3% 1/32 [00:05<01:25,  2.76s/it, loss=0.708][A[A[A[A



1/7 * Epoch (train):   6% 2/32 [00:05<01:20,  2.70s/it, loss=0.708][A[A[A[A



1/7 * Epoch (train):   6% 2/32 [00:07<01:20,  2.70s/it, loss=0.708][A[A[A[A



1/7 * Epoch (train):   9% 3/32 [00:07<01:16,  2.64s/it, loss=0.708][A[A[A[A



1/7 * Epoch (train):   9% 3/32 [00:10<01:16,  2.64s/it, loss=0.708][A[A[A[A



1/7 * Epoch (train):  12% 4/32 [00:10<01:12,  2.60s/it, loss=0.708][A[A[A[A



1/7 * Epoch (train):  12% 4/32 [00:12<01:12,  2.60s/it, loss=0.708][A[A[A[A



1/7 * Epoch (train):  16% 5/32 [00:12<01:09,  2.58s/it, loss=0.708][A[A[A[A



1/7 * Epoch (train):  16% 5/32 [00:15<01:09,  2.58s/it, loss=0.708][A[A[A[A



1/7 * Epoch (train):  19% 6/32 [00:15<01:06,  2.54s/it, loss=0.708][A[A[A[A



1/7 * Epoch (train):  19% 6/32 [00:17<01:06,  2.54s/it, loss=0.707][A[A[A[A



1/7 * Epoch (train):  22% 7/32 [00:17<01:03,  2.56s/it, loss=0.707][A[A[A[A





In [33]:
device = torch.device("cuda")

references = []
predictions = []
model.eval()
for i, item in tqdm(enumerate(data.DataLoader(ExtDataset(ext_test_records), batch_size=1, collate_fn=collate_fn)), total=ext_test_records.shape[0]):
    logits = model(item["features"].to(device))[0] # Прямой проход
    record = ext_test_records.iloc[i]
    predicted_summary = []
    for i, logit in enumerate(logits):
        if logit > 0.0:
            predicted_summary.append(record['sentences'][i])
    if not predicted_summary:
        predicted_summary.append(record['sentences'][torch.max(logits, dim=0)[1].item()])
    predicted_summary = " ".join(predicted_summary)
    references.append(record['summary'].lower())
    predictions.append(predicted_summary)

calc_scores(references, predictions)


This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`



HBox(children=(FloatProgress(value=0.0, max=256.0), HTML(value='')))


Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.




Count: 256
Ref: очередной мост будет построен в россии. на этот раз через реку лена в районе якутска. смета составляет почти 83 млрд рублей. окупаемость моста вызывает сомнения из-за низкого трафика. это, скорее, социальный проект. и даже политический, учитывая положительные итоги строительства моста в крым.
Hyp: президент россии владимир путин одобрил проект строительства моста через лену в якутске. в понедельник, 18 ноября, об этом заявил глава якутии айсен николаев , заявив, что одобрение от федеральных властей получено. ранее это стало известно по данным источников «коммерсанта», которые сообщили, что 9 ноября путин «фактически утвердил» проект строительства автодорожного моста через лену.
BLEU:  0.4441327309711121
ROUGE:  {'rouge-1': {'f': 0.2645632970896027, 'p': 0.25479653857679296, 'r': 0.2943782601001662}, 'rouge-2': {'f': 0.11298230206411987, 'p': 0.1069842755732323, 'r': 0.12953470673811784}, 'rouge-l': {'f': 0.22724253897875155, 'p': 0.23116526946756685, 'r': 0.26725202386

In [0]:
### Sources
- https://arxiv.org/pdf/1611.04230.pdf


- https://towardsdatascience.com/understanding-how-convolutional-neural-network-cnn-perform-text-classification-with-word-d2ee64b9dd0b Пример того, как можно применять CNN в текстовых задачах
- https://arxiv.org/pdf/1808.08745.pdf Очень крутой метод генерации summary без Transformers
- https://towardsdatascience.com/super-easy-way-to-get-sentence-embedding-using-fasttext-in-python-a70f34ac5b7c – простой метод генерации sentence embedding
- https://towardsdatascience.com/fse-2b1ffa791cf9 – Необычный метод генерации sentence embedding
- https://github.com/UKPLab/sentence-transformers – BERT предобученный для sentence embedding

