In [2]:
from telegram import Update
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters, CallbackContext

In [3]:
from navec import Navec

In [4]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

In [2]:
import requests
import pprint
import json

In [7]:
import pymorphy3

In [8]:
import re

In [3]:
from lxml import html

In [10]:
import numpy as np
from numpy.linalg import norm

In [11]:
path = './navec_hudlit_v1_12B_500K_300d_100q.tar'
navec = Navec.load(path)

In [12]:
def cosine_similarity(a, b):
    A = navec[a]
    B = navec[b]
    cosine = np.dot(A,B)/(norm(A)*norm(B))
    return cosine

In [13]:
morph = pymorphy3.MorphAnalyzer()

In [14]:

tokenizer = GPT2Tokenizer.from_pretrained('edivet92/edivet_telebot')
model_med = GPT2LMHeadModel.from_pretrained('edivet92/edivet_telebot').to(DEVICE)

In [15]:
model_med.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(2048, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)

In [16]:
def preprocess_text(text):
    text = re.sub('@[\w]*', '', text)
    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'[^а-яА-ЯЁё]', ' ', text)
    text = [morph.parse(w)[0].normal_form for w in text.split()]
    return text

In [24]:
def decide_make_forecast_or_news_or_avito(message):
    to_forecast = False
    to_news = False
    to_avito = False
    for word in preprocess_text(message):    
        try:
            sim_weather = cosine_similarity(word, 'погода')
            if sim_weather > 0.8:
                to_forecast = True
            sim_news = cosine_similarity(word, 'новость')
            if sim_news > 0.8:
                to_news = True
            sim_avito = cosine_similarity(word, 'купить')
            if sim_avito > 0.6:
                to_avito = True
        except:
            pass
    return to_forecast, to_news, to_avito

In [25]:
def get_weather_forecast():
    url = 'http://api.openweathermap.org/data/2.5/forecast?id=498817&appid=a78d06f736cffb8c8ff6d5610eba42a1'
    r = requests.get(url)
    if r.status_code == 200:
        forecast = r.json()
        temp = forecast['list'][0]['main']['temp']
        feel_like = forecast['list'][0]['main']['feels_like']
        humidity = forecast['list'][0]['main']['humidity']
        sky = forecast['list'][0]['weather'][0]['description']
        wind = forecast['list'][0]['wind']['speed']
        day_time = forecast['list'][0]['dt_txt']
        return f'На {day_time} в Питере {(temp - 273):.1f} градусов, ощущается как {(feel_like - 273):.1f}, небо {sky}. Скорость ветра {wind} м/с, влажность {humidity}%'
    else:
        return 'Что-то пошло не так'

In [26]:
def get_lenta_news():
    url = 'https://lenta.ru'
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 YaBrowser/23.1.1.1138 Yowser/2.5 Safari/537.36',
    }
    response = requests.get(url, headers=header)
    dom = html.fromstring(response.text)
    news_list = dom.xpath("//a[contains(@class, 'card-mini')]")
    news_dict = {}
    for news in news_list[:3]:
        news_title = news.xpath("./div/h3[contains(@class, 'card-mini__title')]/text()")[0]
        if news.xpath('./@href')[0][0:4] == 'http':
            news_link = news.xpath('./@href')[0]
        else:
            news_link = url + news.xpath('./@href')[0]
        news_date = news.xpath('./@href')[0][6:16]
        news_time = news.xpath("./div/div/time/text()")
        news_dict[news_title] = {
            'link': news_link,
            'date': news_date,
            'time': news_time,
        }
    return str(news_dict)

In [48]:
def get_avito_goods(text):
    head_link = 'https://www.avito.ru'
    url = f'https://www.avito.ru/sankt-peterburg?q={text}'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 YaBrowser/23.1.2.931 Yowser/2.5 Safari/537.36'
        }
    response = requests.get(url, headers=headers)
    dom = html.fromstring(response.text)
    items_list = dom.xpath("//div[@class='iva-item-content-rejJg']")
    items = items_list[:3]
    answer = ''
    for item in items:
        answer += str(head_link + item.xpath('./div/a/@href')[0])
        answer += str(item.xpath("./div[@class='iva-item-body-KLUuy']/div/p/text()")[0][:100] + '\n')
    if len(answer) == 0:
        answer = 'Авито блокирует парсинг с этого IP. Хотя из google colab прекрасно работает'
    return answer

In [49]:
def get_answer(text):
    input_ids = tokenizer.encode(text, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        out = model_med.generate(input_ids, 
                            do_sample=True,
                            num_beams=3,
                            temperature=1.5,
                            top_p=1.7,
                            max_length=100,
                            )

    generated_text = list(map(tokenizer.decode, out))[0]
    return generated_text.replace('$', '#').split('#')[1]

In [50]:
def startCommand(update, context):
    context.bot.send_message(chat_id=update.message.chat_id, text='Привет')

def textMessage(update, context):
    txt = update.message.text
    to_forecast, to_news, to_avito = decide_make_forecast_or_news_or_avito(txt)
    if to_forecast:
        reply = get_weather_forecast()
    elif to_news:
        reply = get_lenta_news()
    elif to_avito:
        reply = get_avito_goods(txt)
    else:
        reply = get_answer(txt)
    #update.message.reply_text(reply)
    return context.bot.send_message(chat_id=update.message.chat_id, text=reply)

In [51]:
updater = Updater("6269420671:AAEpiMPQ5b674hwCC7OL0t18a_MbYDUb1w4", use_context=True)
dispatcher = updater.dispatcher
       
start_command_handler = CommandHandler('start', startCommand)
text_message_handler = MessageHandler(Filters.text, textMessage)
dispatcher.add_handler(start_command_handler)
dispatcher.add_handler(text_message_handler)
updater.start_polling(clean=True)
updater.idle()

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
