In [62]:
import os
import time
import random
import numpy as np
import pandas as pd
from typing import Callable, Iterable
from dataclasses import dataclass, field
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
import gc
import torch
import datasets
from torch.utils.data import Dataset, DataLoader, random_split
from transformers.optimization import get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel

In [5]:
import telebot

# 1. Параметры

In [22]:
@dataclass
class Params:
    """Static configuration for the notebook: data/model locations and the bot token.

    None of the attributes below carries a type annotation, so ``@dataclass``
    does not turn them into dataclass fields; they are plain class attributes
    that are evaluated once, when the class body runs.
    """

    # Working directory that holds the dataset and the fine-tuned checkpoint.
    root_dir = "/content/drive/My Drive/MIPT/NLPGen/HW1"
    # CSV file with the candidate replies.
    joey_dataset_path = os.path.join(root_dir, "joey.csv")

    # Model id handed to ``AutoModel`` / ``AutoTokenizer``.
    model_name = "distilbert-base-uncased"
    # Directory of the fine-tuned encoder checkpoint.
    sbert_softmax_model_path = os.path.join(root_dir, "sbert_softmax_lr_2e-6")

    # Telegram bot token. It is a secret, so it is read from the environment
    # instead of being stored in the notebook; set TELEGRAM_BOT_TOKEN before
    # running. Any token that was ever committed in plain text should be
    # revoked and re-issued.
    token = os.environ.get("TELEGRAM_BOT_TOKEN", "")

params = Params()

# 2. Модель

In [8]:
def mean_pool(token_embeds: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Average token embeddings over dim 1, counting only unmasked positions.

    Args:
        token_embeds: token-level embeddings; dim 1 is the one that is reduced
            (typically ``(batch, seq_len, hidden)``).
        attention_mask: mask with one dimension fewer than ``token_embeds``
            (typically ``(batch, seq_len)``); non-zero marks a real token.

    Returns:
        ``token_embeds`` with dim 1 removed, holding the masked mean. Rows whose
        mask is all zeros yield zeros, because the divisor is clamped to 1e-9
        instead of being allowed to reach 0.
    """
    # Broadcast the mask over the embedding dimension so it can weight each value.
    in_mask = attention_mask.unsqueeze(-1).expand(token_embeds.size()).float()
    pool = torch.sum(token_embeds * in_mask, 1) / torch.clamp(in_mask.sum(1), min=1e-9)
    return pool


def encode(input_texts: list[str], tokenizer: AutoTokenizer, model: AutoModel, device: str = "cpu"
) -> torch.Tensor:
    """Embed a batch of texts with the encoder and mean-pool the token vectors.

    Args:
        input_texts: texts to embed.
        tokenizer: tokenizer matching ``model``.
        model: encoder whose output exposes ``last_hidden_state``; it is
            switched to eval mode by this call.
        device: device the tokenized inputs are moved to; the model is
            expected to already live there.

    Returns:
        One pooled embedding per input text, on ``device``. The result is
        computed under ``torch.no_grad()`` and therefore carries no autograd
        graph.
    """
    model.eval()
    # Every text is padded/truncated to exactly 128 tokens.
    tokenized_texts = tokenizer(input_texts, max_length=128,
                                padding='max_length', truncation=True, return_tensors="pt")
    # Move each tensor to the device once and reuse it for the model and the pooling.
    input_ids = tokenized_texts["input_ids"].to(device)
    attention_mask = tokenized_texts["attention_mask"].to(device)

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        token_embeds = model(input_ids, attention_mask).last_hidden_state
        pooled_embeds = mean_pool(token_embeds, attention_mask)
    return pooled_embeds

In [9]:
class Sbert(torch.nn.Module):
    """Siamese sentence encoder with a 3-output linear head on top.

    Both sentences of a pair go through the same encoder, are mean-pooled,
    and the head receives the concatenation ``[u, v, |u - v|]``.
    """

    def __init__(self, max_length: int = 128):
        """Load the base encoder and tokenizer named by ``params.model_name``.

        Args:
            max_length: stored on the instance; not read anywhere in this class.
        """
        super().__init__()
        self.max_length = max_length
        self.bert_model = AutoModel.from_pretrained(params.model_name)
        self.bert_tokenizer = AutoTokenizer.from_pretrained(params.model_name)
        # Head input is [u, v, |u - v|], hence three times the encoder width.
        self.linear = torch.nn.Linear(self.bert_model.config.hidden_size * 3, 3)

    def forward(self, data: datasets.arrow_dataset.Dataset) -> torch.Tensor:
        """Compute the 3 head outputs for each premise/hypothesis pair in ``data``.

        Args:
            data: batch indexable by the keys ``premise_input_ids``,
                ``premise_attention_mask``, ``hypothesis_input_ids`` and
                ``hypothesis_attention_mask``, each holding a tensor.

        Returns:
            Output of the linear head; the last dimension has size 3.
        """
        # Use the device this module's parameters live on rather than a
        # module-level global, so the model works wherever it was moved.
        module_device = self.linear.weight.device

        premise_input_ids = data["premise_input_ids"].to(module_device)
        premise_attention_mask = data["premise_attention_mask"].to(module_device)
        hypothesis_input_ids = data["hypothesis_input_ids"].to(module_device)
        hypothesis_attention_mask = data["hypothesis_attention_mask"].to(module_device)

        # The same encoder (shared weights) processes both sides of the pair.
        out_premise = self.bert_model(premise_input_ids, premise_attention_mask)
        out_hypothesis = self.bert_model(hypothesis_input_ids, hypothesis_attention_mask)
        premise_embeds = out_premise.last_hidden_state
        hypothesis_embeds = out_hypothesis.last_hidden_state

        pooled_premise_embeds = mean_pool(premise_embeds, premise_attention_mask)
        pooled_hypotheses_embeds = mean_pool(hypothesis_embeds, hypothesis_attention_mask)

        embeds = torch.cat([pooled_premise_embeds, pooled_hypotheses_embeds,
                            torch.abs(pooled_premise_embeds - pooled_hypotheses_embeds)],
                           dim=-1)
        return self.linear(embeds)

# 3. Подготовка

In [10]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [11]:
sbert_softmax_model = Sbert().to(device)
# `from_pretrained` is a classmethod: it builds and RETURNS a new model and
# leaves the object it is called on untouched. The result has to be assigned,
# otherwise the bot keeps using the base weights loaded in `Sbert.__init__`.
sbert_softmax_model.bert_model = AutoModel.from_pretrained(
    params.sbert_softmax_model_path
).to(device)
sbert_softmax_model.bert_model

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Li

In [12]:
# Load the table of candidate replies from the CSV configured in `params`.
joey_df = pd.read_csv(filepath_or_buffer=params.joey_dataset_path)

In [63]:
def get_bi_answer(question, context, answers, tokenizer, model, device="cuda"):
    """Pick the candidate whose embedding is closest to the question+context embedding.

    Args:
        question: the incoming user message.
        context: conversation context appended to the question after " [SEP] ".
        answers: non-empty iterable of candidate reply strings.
        tokenizer: tokenizer passed through to ``encode``.
        model: encoder passed through to ``encode``.
        device: device passed through to ``encode``.

    Returns:
        Tuple ``(best_answer, similarity, updated_context)``: the chosen
        candidate, its cosine similarity to the question+context embedding,
        and the string ``best_answer + " [SEP] " + question``.

    Raises:
        ValueError: if ``answers`` is empty.
    """
    # Materialise the candidates so any iterable works, and fail with a clear
    # message instead of an opaque error from the similarity computation.
    answers = list(answers)
    if not answers:
        raise ValueError("answers must contain at least one candidate")

    # Join the question and the context into a single query text.
    combined_text = question + " [SEP] " + context

    # Query first, then every candidate, so that a single `encode` call
    # embeds all of them in one batch.
    texts_to_encode = [combined_text] + answers
    embeddings = encode(texts_to_encode, tokenizer, model, device).cpu().detach().numpy()

    # Cosine similarity between the query embedding (row 0) and each
    # candidate embedding (remaining rows).
    question_context_embedding = embeddings[0].reshape(1, -1)
    answers_embeddings = embeddings[1:]
    similarities = cosine_similarity(question_context_embedding, answers_embeddings).flatten()

    # Index of the candidate with the highest similarity.
    best_answer_index = int(similarities.argmax())

    # Return the winner, its score, and the accumulated context
    # (the best answer followed by the question).
    best_answer = answers[best_answer_index]
    updated_context = best_answer + " [SEP] " + question
    return best_answer, similarities[best_answer_index], updated_context

In [64]:
def get_joey_reply(question, context, joey_df, tokenizer, model, device, num_tries=5,
                   sample_size=100):
    """Choose a reply by scoring several random samples of candidate answers.

    Each try draws ``sample_size`` candidates from ``joey_df["positive_answer"]``
    and asks ``get_bi_answer`` for the best one; the highest-scoring answer
    over all tries is returned.

    Args:
        question: the incoming user message.
        context: conversation context forwarded to ``get_bi_answer``.
        joey_df: DataFrame with a ``positive_answer`` column of candidate replies.
        tokenizer: tokenizer forwarded to ``get_bi_answer``.
        model: encoder forwarded to ``get_bi_answer``.
        device: device forwarded to ``get_bi_answer``.
        num_tries: number of independent random samples to score.
        sample_size: candidates per sample; capped at the number of available
            (non-missing) answers.

    Returns:
        The best answer found, or ``None`` when there is nothing to sample
        from or ``num_tries`` is not positive.
    """
    # Missing values cannot be tokenized, so leave them out of the pool.
    candidates = joey_df["positive_answer"].dropna()
    # Sampling without replacement cannot draw more rows than exist.
    n_sample = min(sample_size, len(candidates))
    if n_sample < 1:
        return None

    best_global_answer = None
    best_global_score = -float('inf')

    for _ in range(num_tries):
        # The global NumPy RNG is deliberately NOT reseeded here: seeding with
        # int(time.time()) on every iteration gives each try within the same
        # second the same seed, and therefore the identical sample.
        sample_answers = candidates.sample(n=n_sample, replace=False).tolist()
        best_answer, best_score, _ = get_bi_answer(question, context, sample_answers, tokenizer, model, device)
        if best_score > best_global_score:
            best_global_score = best_score
            best_global_answer = best_answer

    return best_global_answer

# 4. Запускаем бота

In [67]:
# Telegram bot client, authenticated with the token from the notebook parameters.
bot = telebot.TeleBot(token=params.token)

# Conversation context read and updated by the message handlers; starts out empty.
context = ""

@bot.message_handler(commands=["start"])
def start(m, res=False):
    """Handle /start: reset the shared context to the greeting and send it.

    Args:
        m: incoming Telegram message; only ``m.chat.id`` is read.
        res: unused; kept so the handler signature stays unchanged.
    """
    global context
    # Assign to the global itself. A misspelled name here would create a local
    # and leave `context` empty, so the bot would try to send an empty message.
    context = "Hi! I'm Joe!"
    bot.send_message(m.chat.id, context)


@bot.message_handler(content_types=["text"])
def process_message(message):
    """Answer a text message with the reply chosen by ``get_joey_reply``.

    The global ``context`` is then replaced by either the user's text or the
    reply, picked at random, and the reply is sent back to the same chat.
    """
    # NOTE(review): `context` is one module-level value, so every chat served
    # by this process reads and overwrites the same context.
    global context

    incoming_text = message.text
    tokenizer = sbert_softmax_model.bert_tokenizer
    encoder = sbert_softmax_model.bert_model

    reply = get_joey_reply(incoming_text, context, joey_df, tokenizer, encoder, device)

    # Carry exactly one of the two utterances over to the next turn.
    context = random.choice([incoming_text, reply])
    bot.send_message(message.chat.id, reply)

In [None]:
# Print the bot's public link so it can be opened from the cell output.
print("https://t.me/JoeyTheRetrieverBot")
# Start receiving updates from Telegram via polling.
# NOTE(review): this call presumably blocks until polling stops (the cell has
# no execution count) — confirm the intended `none_stop` behaviour on errors.
bot.polling(none_stop=False)

https://t.me/JoeyTheRetrieverBot
