# Курсовой проект «Введение в обработку естественного языка» 

## <font color='red'>2. Чат-бот. Реализация.</font>

In [23]:
# !pip install pymorphy2
# !pip install stop_words

# !pip install annoy
# !pip install telegram
# !pip uninstall python-telegram-bot telegram
# !pip install python-telegram-bot --upgrade
# !pip install google-cloud-dialogflow

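# NOTE: this notebook uses the python-telegram-bot v13 API (Updater(..., use_context=True), Filters).
# Version 20+ removed/renamed these, so keep the pinned install and do not run the --upgrade line.
# !pip install python-telegram-bot==13.8
# !pip install python-telegram-bot --upgrade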
# !pip install transformers 
# !pip install transformers sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 4.1 MB/s 
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.1.96


In [1]:
import logging
import os
import pickle
import re
import string
from functools import lru_cache

import annoy
import numpy as np
import pandas as pd
from gensim.models import FastText
from pymorphy2 import MorphAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from stop_words import get_stop_words
from telegram import Update
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters, CallbackContext

In [2]:
from google.api_core.exceptions import InvalidArgument

In [4]:
# Colab-only: mount Google Drive at /content/drive; the model directory configured
# in PATH_MODEL below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Shared text-preprocessing resources used by parse_morpher / preprocess_txt.
# pymorphy2 analyzer used to lemmatise tokens.
morpher = MorphAnalyzer()
# Russian stop words, filtered out after lemmatisation.
sw = set(get_stop_words("ru"))
# ASCII punctuation characters (string.punctuation) removed from the input text.
exclude = set(string.punctuation)

In [6]:
# Directory that holds the trained artefacts (pickles, FastText model, Annoy indexes).
# Override it with the CHATBOT_MODEL_DIR environment variable (e.g. "models/" for a
# local run); the default is the Google Drive folder used in Colab.
PATH_MODEL = os.environ.get(
    "CHATBOT_MODEL_DIR",
    "/content/drive/MyDrive/Colab Notebooks/nlp/models/",
)
# File names are appended directly (f'{PATH_MODEL}idfs.pkl'), so the directory
# must end with a separator.
if not PATH_MODEL.endswith("/"):
    PATH_MODEL += "/"

# Embedding size (dimension of the Annoy indexes and of the vectors built by embed_txt).
SIZE_EMB = 200

##### 2.1. Загрузка моделей

In [7]:
def _load_pickle(file_name):
    """Unpickle ``file_name`` from the PATH_MODEL directory.

    NOTE: unpickling can execute arbitrary code, so PATH_MODEL must only
    contain artefacts produced by a trusted training run.
    """
    with open(f'{PATH_MODEL}{file_name}', 'rb') as f:
        return pickle.load(f)


# Load every artefact the bot handlers need. A failure is logged with its
# traceback instead of being silently swallowed; the notebook keeps running
# (best effort), but the handlers will not work until the artefacts load.
try:
    # IDF weights and the fallback weight used by embed_txt for the answers index.
    idfs = _load_pickle('idfs.pkl')
    midf = _load_pickle('midf.pkl')

    # Same pair for the shop (product) index.
    idfs_prod = _load_pickle('idfs_prod.pkl')
    midf_prod = _load_pickle('midf_prod.pkl')

    modelFT = FastText.load(f'{PATH_MODEL}modelFT')

    # Annoy index over answers and the mapping from index ids to reply texts.
    ft_index = annoy.AnnoyIndex(SIZE_EMB, 'angular')
    ft_index.load(f'{PATH_MODEL}index_ft')
    index_map = _load_pickle('index_map_ft.pkl')

    # Fitted vectorizer + classifier that route a message to the shop or answers branch.
    vectorizer = _load_pickle('vectorizer.pkl')
    lr = _load_pickle('lr.pkl')

    # Kept for backward compatibility: the original cell loaded midf_prod.pkl a
    # second time under this name; it is not referenced elsewhere in the visible notebook.
    midf_p = midf_prod

    # Annoy index over products and the mapping from index ids to (title, image).
    ft_index_shop = annoy.AnnoyIndex(SIZE_EMB, 'angular')
    ft_index_shop.load(f'{PATH_MODEL}ft_index_shop')
    index_map_shop = _load_pickle('index_map_shop.pkl')

except Exception:
    logging.exception(
        "Failed to load model artefacts from %r; the bot handlers will not work",
        PATH_MODEL,
    )

### Предобработка текста

In [8]:
@lru_cache(maxsize=128, typed=False)
def parse_morpher(text):
    """Return the normal form of ``text`` taken from the first pymorphy2 parse.

    Results are memoised (up to 128 distinct inputs) to avoid re-analysing
    repeated tokens.
    """
    first_parse = morpher.parse(text)[0]
    return first_parse.normal_form

In [9]:
def preprocess_txt(line):
    """Turn a raw text line into a list of lemmatised, stop-word-free tokens.

    Steps: remove HTML-like tags, remove ASCII punctuation, split on
    whitespace, lower-case and lemmatise each token, then drop stop words
    and empty strings.

    Args:
        line: raw input string.

    Returns:
        List of lemma strings (possibly empty).
    """
    # Tags have to go first: "<" and ">" are part of `exclude`, so stripping
    # punctuation beforehand leaves the tag pattern nothing to match and the tag
    # names leak into the tokens. A space is substituted so that words separated
    # only by a tag (e.g. "a<br>b") are not glued together.
    # NOTE(review): keep the training-side preprocessing in sync with this order.
    without_tags = re.sub(r'<[^>]*>', ' ', line)
    without_punct = "".join(ch for ch in without_tags.strip() if ch not in exclude)
    lemmas = (parse_morpher(token.lower()) for token in without_punct.split())
    return [lemma for lemma in lemmas if lemma != "" and lemma not in sw]

In [10]:
def embed_txt(txt, idfs, model, midf):
    """Build a weighted mean of the word vectors of the tokens in ``txt``.

    Args:
        txt: iterable of tokens.
        idfs: mapping token -> weight.
        model: word-vector container supporting ``token in model`` and ``model[token]``.
        midf: weight used for tokens missing from ``idfs``.

    Returns:
        Vector of length SIZE_EMB; all zeros when no token is known to ``model``.
    """
    weighted_sum = np.zeros(SIZE_EMB)
    total_weight = 0
    for token in txt:
        if token not in model:
            continue
        weight = idfs.get(token, midf)
        weighted_sum += model[token] * weight
        total_weight += weight

    # Normalise only when something was accumulated, to avoid dividing by zero.
    if total_weight > 0:
        return weighted_sum / total_weight
    return weighted_sum

##### 2.2. Чат-бот

In [11]:
import os

In [12]:
import logging

from telegram import Update
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters, CallbackContext

In [14]:
# !pip install python-telegram-bot --upgrade

In [13]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import MBartTokenizer, MBartForConditionalGeneration

In [None]:
# Alternative Russian summarisation model (mBART).
# NOTE(review): `model_mbart_ru_sum_gazeta` is not referenced by any handler in the
# visible notebook, and `model_name` / `tokenizer` are reassigned by the mT5 cell
# that follows, so this cell only costs a model download on a full re-run.
model_name = "IlyaGusev/mbart_ru_sum_gazeta"
tokenizer = MBartTokenizer.from_pretrained(model_name)
model_mbart_ru_sum_gazeta = MBartForConditionalGeneration.from_pretrained(model_name)

In [14]:
# mT5 summarisation model; textMessage passes this model and this `tokenizer`
# to model_mT5_multilingual_XLSum_summary for "Summarisation:" requests.
model_name = "csebuetnlp/mT5_multilingual_XLSum"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_mT5_multilingual_XLSum = AutoModelForSeq2SeqLM.from_pretrained(model_name)

  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"


In [15]:
# Telegram Bot API token. It is read from the TELEGRAM_BOT_TOKEN environment variable
# so the secret never has to be written into the notebook; the placeholder " " is
# kept as the fallback when the variable is not set.
updater = Updater(os.environ.get("TELEGRAM_BOT_TOKEN", " "), use_context=True)

def echo(update: Update, context: CallbackContext):
    """Reply with the incoming message text prefixed by 'Ваше сообщение! '.

    Not registered with the dispatcher in the visible notebook.
    """
    update.message.reply_text('Ваше сообщение! ' + update.message.text)


def startCommand(update: Update, context: CallbackContext) -> None:
    """Handle the /start command by greeting the user."""
    incoming = update.message
    incoming.reply_text('Hi!')

def model_mT5_multilingual_XLSum_summary(input_text, model, tokenizer):
    """Summarise ``input_text`` with a seq2seq ``model`` and its ``tokenizer``.

    Args:
        input_text: text to summarise.
        model: object exposing ``generate(input_ids=..., ...)``.
        tokenizer: callable tokenizer that also exposes ``decode``.

    Returns:
        The decoded summary string.
    """
    # Collapse every whitespace run (spaces, tabs, newlines) into one space.
    # A raw string is required: '\s' in a plain literal is an invalid escape sequence.
    normalized_text = re.sub(r'\s+', ' ', input_text.strip())

    # Input is truncated/padded to 512 tokens.
    input_ids = tokenizer(
        [normalized_text],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512
    )["input_ids"]

    # Beam search with 4 beams, at most 84 output tokens, no repeated bigrams;
    # [0] takes the sequence generated for the single input.
    output_ids = model.generate(
        input_ids=input_ids,
        max_length=84,
        no_repeat_ngram_size=2,
        num_beams=4
    )[0]

    return tokenizer.decode(
        output_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )

def textMessage(update: Update, context: CallbackContext) -> None:
    """Answer a plain-text message.

    Routing:
      * text whose first space-separated word is 'Summarisation:' is summarised
        with the mT5 model;
      * otherwise the text is preprocessed and classified with ``vectorizer`` + ``lr``:
        class 1 looks up to 3 nearest items in the shop index, any other class
        looks up the single nearest entry in the answers index.
    A fallback text is sent when no neighbour is close enough.
    """
    summarise_prefix = 'Summarisation:'
    max_distance = 0.3  # largest Annoy distance still accepted as a match
    res_text = 'Не понимаю запрос. Сформулируйте запрос иначе.'

    # update.message is None for edited messages and channel posts, which
    # python-telegram-bot v13 MessageHandler also dispatches by default.
    message = update.message
    if message is None or not message.text:
        return
    input_text = message.text

    if input_text.split(' ', 1)[0] == summarise_prefix:
        # Drop only the leading command word: str.replace() without a count would
        # also delete every later "Summarisation:" inside the text to summarise.
        body = input_text[len(summarise_prefix):]
        if not body.strip():
            # Nothing to summarise.
            message.reply_text(res_text)
            return
        summary = model_mT5_multilingual_XLSum_summary(
            body, model_mT5_multilingual_XLSum, tokenizer
        )
        message.reply_text('Суммаризация: ' + summary)
        return

    input_txt = preprocess_txt(input_text)
    vect = vectorizer.transform([" ".join(input_txt)])
    prediction = lr.predict(vect)

    if prediction[0] == 1:
        # Shop branch: reply with every sufficiently close product.
        vect_ft = embed_txt(input_txt, idfs_prod, modelFT, midf_prod)
        neighbours, distances = ft_index_shop.get_nns_by_vector(
            vect_ft, 3, include_distances=True
        )
        found = False
        for item, distance in zip(neighbours, distances):
            if distance <= max_distance:
                title, image = index_map_shop[item]
                logging.debug("product match: %s %s", title, image)
                message.reply_text("title: {} image: {}".format(title, image))
                found = True
        if not found:
            message.reply_text(res_text)
    else:
        # Answers branch: reply with the single nearest entry if it is close enough.
        vect_ft = embed_txt(input_txt, idfs, modelFT, midf)
        neighbours, distances = ft_index.get_nns_by_vector(
            vect_ft, 1, include_distances=True
        )
        # `distances` is empty when the index returns no neighbour.
        if distances and distances[0] <= max_distance:
            message.reply_text(index_map[neighbours[0]])
        else:
            message.reply_text(res_text)

In [16]:
# Wire the handlers into the bot and start it.
dispatcher = updater.dispatcher

# /start -> greeting; any text message that is not a command -> textMessage.
dispatcher.add_handler(CommandHandler('start', startCommand))
dispatcher.add_handler(MessageHandler(Filters.text & ~Filters.command, textMessage))

# Start polling for updates, then keep the cell running: per the
# python-telegram-bot docs, idle() blocks until a stop signal is received.
updater.start_polling()
updater.idle()