### **БИБЛИОТЕКИ**

In [1]:
# загрузим библиотеки

# !pip install python-telegram-bot==13.8 --quiet
# !pip install python-telegram-bot --upgrade --quiet
# !pip install pymorphy2[fast] annoy stop_words transformers sentencepiece sentence_transformers faiss --quiet
# !apt install libomp-dev

from telegram import Update
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters, CallbackContext
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
import re
import string
import nltk
nltk.download('stopwords')
from pymorphy2 import MorphAnalyzer
from stop_words import get_stop_words
import torch
import torch.nn as nn
import faiss
# !cat '/content/drive/MyDrive/Colab Notebooks/NLP/CHAT_BOT/tokenization_small100.py'
import sys
sys.path.append('/content/drive/MyDrive/Colab Notebooks/NLP/CHAT_BOT')
import tokenization_small100
from tokenization_small100 import SMALL100Tokenizer
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, TFAutoModel, AutoModelForCausalLM, AutoModel, M2M100ForConditionalGeneration, MarianTokenizer, MarianMTModel, AutoModelForSequenceClassification
from datetime import datetime
import json

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### **ДАННЫЕ, МОДЕЛИ, ПАРАМЕТРЫ**

In [2]:
# Artefacts stored on Google Drive: the film table, the film embedding matrix
# and the saved weights of the intent classifier.
PATH_BERT_CLASS = '/content/drive/MyDrive/Colab Notebooks/NLP/CHAT_BOT/model_bert_clean.pt'
PATH_EMB_FILMS = '/content/drive/MyDrive/Colab Notebooks/NLP/CHAT_BOT/df_films_emb.npy'
PATH_DF_FILMS = '/content/drive/MyDrive/Colab Notebooks/NLP/CHAT_BOT/asian_dramas_dataset_FINAL.csv'

# File holding the Telegram bot token (kept outside the notebook).
PATH_TOKEN = '/content/token.txt'

In [None]:
# Telegram bot token, read from a file so it is never hard-coded in the notebook.
# Surrounding whitespace is stripped: text files usually end with a newline,
# and a token containing whitespace is not a valid bot token.
with open(PATH_TOKEN, 'r') as f:
    TOKEN = f.read().strip()

# Text-processing tools: the lemmatizer, the Russian stop words (union of the
# `stop_words` and NLTK lists) and the punctuation characters to remove.
exclude = set(string.punctuation)
sw = set(get_stop_words('ru')).union(nltk.corpus.stopwords.words('russian'))
morpher = MorphAnalyzer()

# In-memory logs: classified user requests (text + predicted label) and the
# dialogue turns passed to the GPT model.
history_for_gpt = []
history = {'text': [], 'label': []}

# Film table (read from CSV, minus its 'Unnamed: 0' column) and the film
# embedding matrix loaded from disk.
embeddings_for_films = np.load(PATH_EMB_FILMS)
df_films = pd.read_csv(PATH_DF_FILMS).drop(columns='Unnamed: 0')

# Trained BERT model for intent classification (3 labels). The architecture and
# tokenizer come from the 'sberbank-ai/ruBert-large' checkpoint; the trained
# weights are then loaded from PATH_BERT_CLASS and mapped onto the CPU.
model_name_for_classification = 'sberbank-ai/ruBert-large'
tokenizer_for_classification = AutoTokenizer.from_pretrained(model_name_for_classification)
model_for_classification = AutoModelForSequenceClassification.from_pretrained(model_name_for_classification, num_labels=3)
device_cpu = torch.device('cpu')
model_for_classification.load_state_dict(torch.load(PATH_BERT_CLASS, map_location=device_cpu))

# GPT model that keeps up the conversation with the user (intent 1)
model_name_for_textgeneration = 'sberbank-ai/rugpt3small_based_on_gpt2'
tokenizer_for_textgeneration = AutoTokenizer.from_pretrained(model_name_for_textgeneration)
model_for_textgeneration = AutoModelForCausalLM.from_pretrained(model_name_for_textgeneration)

# Model that builds embeddings for film-search requests (intent 2)
encoder_for_films = SentenceTransformer('sberbank-ai/ruBert-base')

# Model for sentiment analysis of film comments
model_name_for_santiment = 'Tatyana/rubert-base-cased-sentiment-new'
tokenizer_for_santiment = AutoTokenizer.from_pretrained(model_name_for_santiment)
model_for_santiment = AutoModelForSequenceClassification.from_pretrained(model_name_for_santiment)

# Translation model (intent 3); the tokenizer's target language is set to
# Korean ('ko') once here, so every later call translates into Korean.
model_name_for_translation = 'alirezamsh/small100'
model_for_translation = M2M100ForConditionalGeneration.from_pretrained(model_name_for_translation)
tokenizer_for_translation = SMALL100Tokenizer.from_pretrained(model_name_for_translation)
tokenizer_for_translation.tgt_lang = 'ko'

### **МЕТОДЫ, ФУНКЦИИ**

In [22]:
# text preprocessing
def preprocess_txt(line):
    """Normalise a message: drop punctuation, lemmatize, remove stop words.

    Characters listed in ``exclude`` are deleted from the stripped line. The
    remaining whitespace-separated tokens are lower-cased and replaced by the
    normal form of their first ``morpher`` parse. Tokens that are empty or
    present in ``sw`` are discarded.

    Args:
        line: raw text.

    Returns:
        The surviving lemmas joined by single spaces (possibly an empty string).
    """
    without_punct = ''.join(filter(lambda ch: ch not in exclude, line.strip()))
    lemmas = []
    for token in without_punct.split():
        lemma = morpher.parse(token.lower())[0].normal_form
        if lemma != '' and lemma not in sw:
            lemmas.append(lemma)
    return ' '.join(lemmas)


# intent classifier
def classifier_intents(text):
    """Predict the intent label of a user message.

    Args:
        text: message text to classify (the caller passes preprocessed text).

    Returns:
        Tuple ``(prediction, prediction_logits)``: the index of the largest
        logit as a NumPy integer, and the logits tensor produced by
        ``model_for_classification``.
    """
    # Truncate to the same 512-token cap that `predict` applies, so a very long
    # Telegram message cannot overflow the encoder's input.
    inputs = tokenizer_for_classification(
        text, truncation=True, max_length=512, return_tensors='pt')
    # Inference only: do not build an autograd graph for every message.
    with torch.no_grad():
        outputs = model_for_classification(**inputs)
    prediction_logits = outputs.logits
    prediction = np.argmax(prediction_logits.detach().numpy())
    return prediction, prediction_logits


# handler for the '/start' command
def start(update: Update, context: CallbackContext):
    """Greet the user and explain what the bot can do.

    The greeting is sent with ``parse_mode='Markdown'`` so the bot name is
    rendered in bold.

    Args:
        update: incoming Telegram update.
        context: callback context (unused).
    """
    # `update.message` is None for edited messages; `effective_message` covers
    # those too. Nothing to reply to if the update carries no message.
    message = update.effective_message
    if message is None:
        return

    # Adjacent literals instead of backslash continuations inside one literal:
    # the continuations embedded each source line's indentation (runs of
    # spaces) into the text shown to the user.
    greeting = (
        'Привет! '
        '\nЯ *AsiaLove бот*. Я умею болтать, рекомендовать сериалы и переводить '
        'слова и фразы на корейский язык 😊 '
        '\nЕсли я что-то не понял, не злись, попробуй переформулировать '
        'предложение ^-^.  Погнали!\n⁉️ Кстати, чтобы я смог сделать перевод '
        'корректно, поставь слово/фразу в знаки '
        'равно --> Переведи мне слово =Апельсин=.'
    )
    message.reply_text(greeting, parse_mode='Markdown')


# text generator (intent 1)
def respond_to_dialog(texts, max_history=9, max_new_tokens=40):
    """Generate the bot's next reply for a dialogue.

    The dialogue is rendered as alternating ``Вопрос:`` / ``Ответ:`` lines and
    the GPT model continues the prompt until it emits a newline.

    Args:
        texts: dialogue turns, oldest first, alternating user message / bot
            reply and starting with a user message.
        max_history: maximum number of most recent turns put into the prompt
            (positive integer). Older turns are dropped so the prompt cannot
            grow without limit.
        max_new_tokens: upper bound on the number of generated tokens.

    Returns:
        The generated reply with surrounding whitespace removed.
    """
    # Keep only the latest turns, always dropping an even number of leading
    # ones so the first kept turn is still a user question and the
    # question/answer labelling below stays aligned.
    first = max(len(texts) - max_history, 0)
    if first % 2 == 1:
        first += 1

    prefix = '\nВопрос:'
    for i, t in enumerate(texts[first:]):
        prefix += t
        prefix += '\nВопрос:' if i % 2 == 1 else '\nОтвет:'

    tokens = dict(tokenizer_for_textgeneration(prefix, return_tensors='pt'))

    # If the model config exposes its context size (GPT-2 style `n_positions`),
    # keep only the rightmost prompt tokens that still leave room for the reply.
    context_size = getattr(
        getattr(model_for_textgeneration, 'config', None), 'n_positions', None)
    if context_size:
        keep = max(context_size - max_new_tokens, 1)
        tokens = {k: v[:, -keep:] for k, v in tokens.items()}

    # Generation stops at the first newline, i.e. at the end of the answer line.
    end_token_id = tokenizer_for_textgeneration.encode('\n')[0]
    prompt_len = tokens['input_ids'].shape[1]
    output = model_for_textgeneration.generate(
        **tokens,
        eos_token_id=end_token_id,
        do_sample=True,
        # Was `size + 40`: that is the formula for a total length, but
        # `max_new_tokens` already excludes the prompt.
        max_new_tokens=max_new_tokens,
        repetition_penalty=3.2,
        temperature=0.5,
        num_beams=5,
        length_penalty=0.05,
        pad_token_id=tokenizer_for_textgeneration.eos_token_id,
        min_length=10)

    # Cut the prompt off by token position; slicing the decoded string by
    # len(prefix) breaks whenever decoding does not reproduce the prompt
    # character for character.
    reply_ids = output[0][prompt_len:]
    result = tokenizer_for_textgeneration.decode(reply_ids, skip_special_tokens=True)
    return result.strip()


# film recommendations (intent 2)
def get_films_recom(user_text):
    """Return the positions of the films closest to the user's request.

    The request is embedded with ``encoder_for_films`` and compared with
    ``embeddings_for_films`` through an exact L2 FAISS index.

    Args:
        user_text: text of the request.

    Returns:
        Up to 10 row positions (plain ``int``) into the film embedding matrix,
        ordered from the nearest film to the farthest.
    """
    k_nears = 10
    # FAISS works on C-contiguous float32 arrays; this is a no-op when the
    # stored matrix already has that layout.
    film_vectors = np.ascontiguousarray(embeddings_for_films, dtype='float32')
    query = np.asarray(
        encoder_for_films.encode(user_text), dtype='float32').reshape(1, -1)

    index = faiss.IndexFlatL2(film_vectors.shape[1])
    index.add(film_vectors)
    # Results come back ordered by increasing L2 distance, i.e. best match
    # first. The previous re-sort with reverse=True put the *farthest* of the
    # ten neighbours first, so the caller's `[:3]` showed the worst matches.
    _, neighbour_ids = index.search(query, k_nears)
    # -1 marks "no neighbour" when the index holds fewer than k_nears vectors.
    return [int(i) for i in neighbour_ids[0] if i != -1]


# scrape the film/series rating from its source page
def get_film_rating(url, timeout=10):
    """Fetch a film page and return the rating shown on it.

    Args:
        url: address of the film page.
        timeout: seconds to wait for the site before giving up, so a slow or
            dead server cannot hang the bot.

    Returns:
        Text of the ``<span>`` inside the first ``div.unit-rating`` element,
        or ``'-'`` when the page cannot be fetched or has no such element.
    """
    try:
        response = requests.get(url, timeout=timeout)
        rating = bs(response.text, 'html.parser').find_all('div', class_='unit-rating')
        return rating[0].span.text
    # Best effort: any fetch/parsing failure yields a placeholder. `Exception`
    # (not a bare except) lets KeyboardInterrupt/SystemExit through.
    except Exception:
        return '-'


# predict the sentiment label of a film comment
def predict(text):
    """Classify the sentiment of a text.

    The text is tokenized (truncated to 512 tokens), passed through
    ``model_for_santiment`` without gradient tracking, and the class with the
    highest softmax probability is returned.

    Args:
        text: comment text.

    Returns:
        Index of the most probable class, as a NumPy integer.
    """
    encoded = tokenizer_for_santiment(
        text, max_length=512, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        logits = model_for_santiment(**encoded).logits
    probabilities = torch.softmax(logits, dim=1)
    return probabilities.argmax(dim=1).numpy()[0]


# count film comments by sentiment
def count_comments_santiment(url, timeout=10):
    """Fetch a film page and count its comments per sentiment class.

    Every ``div`` with class ``ct-text clearfix`` is treated as one comment and
    classified with ``predict``; label 0 is counted as neutral, 1 as positive
    and 2 as negative.

    Args:
        url: address of the film page.
        timeout: seconds to wait for the site before giving up, so a slow or
            dead server cannot hang the bot.

    Returns:
        Markdown-formatted three-line summary with the counts, or the same
        summary with ``-`` in place of every count when fetching, parsing or
        classification fails.
    """
    # label -> number of comments (0 neutral, 1 positive, 2 negative)
    counts = {0: 0, 1: 0, 2: 0}

    try:
        response = requests.get(url, timeout=timeout)
        comments = bs(response.text, 'html.parser').find_all('div', class_='ct-text clearfix')
        for item in comments:
            label = int(predict(item.text.strip()))
            if label in counts:
                counts[label] += 1
    # Best effort: any failure yields the placeholder summary. `Exception`
    # (not a bare except) lets KeyboardInterrupt/SystemExit through.
    except Exception:
        # The first line previously lacked the colon the other two lines have.
        return '*Нейтральные*: -' + '\n*Положительные*: -' + '\n*Негативные*: -'

    return ('*Нейтральные*: ' + str(counts[0])
            + '\n*Положительные*: ' + str(counts[1])
            + '\n*Негативные*: ' + str(counts[2]))


# translation into Korean (intent 3)
def translate_to_korean(text):
    """Translate the phrase the user wrapped in equals signs into Korean.

    Args:
        text: user message; the phrase to translate is expected between two
            ``=`` characters, e.g. ``Переведи мне слово =Апельсин=``.

    Returns:
        The translation of the first non-empty ``=...=`` phrase, or a hint on
        how to mark the phrase when the message contains none.
    """
    phrases = [p.strip() for p in re.findall(r'=(.*?)=', text)]
    phrases = [p for p in phrases if p]
    if not phrases:
        # An empty list used to go straight into the tokenizer and crash the
        # handler, leaving the user without any answer.
        return ('Чтобы я смог сделать перевод, поставь слово/фразу в знаки '
                'равно --> Переведи мне слово =Апельсин=.')

    # Only the first phrase is translated (only the first result was ever
    # returned); passing several unpadded phrases of different length cannot
    # be packed into one tensor.
    tokens = tokenizer_for_translation(phrases[:1], return_tensors='pt')
    outputs = model_for_translation.generate(**tokens)
    translation = tokenizer_for_translation.batch_decode(outputs, skip_special_tokens=True)
    return translation[0]


# handle user text messages: route each one to one of the three intents
def bot_answers(update: Update, context: CallbackContext):
    """Classify a text message and answer it according to the predicted intent.

    Intent 0 continues the conversation with the GPT model, intent 1 sends
    three film recommendations (a card plus a comment-sentiment summary for
    each), intent 2 translates the ``=...=`` phrase into Korean. Every request
    is also recorded in the in-memory logs, which are written to Google Drive
    and emptied once they exceed 1000 entries.

    Args:
        update: incoming Telegram update.
        context: callback context; its ``chat_data`` keeps the per-chat
            dialogue history.
    """
    # `update.message` is None for edited messages; `effective_message` covers
    # those too. Skip updates that carry no text.
    message = update.effective_message
    if message is None or not message.text:
        return

    user_text = message.text
    user_text_clean = preprocess_txt(user_text)
    label_pred, _ = classifier_intents(user_text_clean)
    # Plain int: compares the same as the NumPy integer and is serialised to
    # JSON as a number instead of going through `default=str`.
    label_pred = int(label_pred)

    # conversation mode
    if label_pred == 0:
        # The dialogue context is kept per chat, so one user's messages are
        # never fed into another user's conversation.
        chat_data = context.chat_data if context.chat_data is not None else {}
        dialog = chat_data.setdefault('history_for_gpt', [])
        # The question is stored only together with its answer: if generation
        # fails, the history keeps its strict question/answer alternation.
        result = respond_to_dialog(dialog + [user_text])
        dialog.extend([user_text, result])
        # Keep the last 10 exchanges (an even number of turns, so the history
        # still starts with a user question).
        del dialog[:-20]
        # Telegram rejects empty messages, so an empty generation is replaced.
        message.reply_text(result or '...')

        # logging of the dialogue turns across all chats
        history_for_gpt.extend([user_text, result])
        if len(history_for_gpt) > 1000:
            with open('/content/drive/MyDrive/Colab Notebooks/NLP/CHAT_BOT/history_for_gpt_'+datetime.now().strftime("%d%m%Y_%H%M")+'.txt', 'w', encoding='utf-8') as f:
                f.write(('\n').join(history_for_gpt))
            # Start a new chunk; otherwise the whole log is rewritten on every
            # following message and grows without limit.
            history_for_gpt.clear()

    # film recommendation mode
    elif label_pred == 1:
        for idx in get_films_recom(user_text_clean)[:3]:
            film = df_films.iloc[idx]
            page_link = film['page_link']
            # f-strings convert every field to text; `'...' + value` raised
            # TypeError for columns that pandas parsed as numbers.
            card = '\n'.join([
                f"*Название*: {film['ru_name']}",
                f"*Оригинальное название*: {film['orig_name']}",
                f"*Количество серий*: {film['number_ep']}",
                f"*Жанр*: {film['genre']}",
                f"*Страна*: {film['country']}",
                f"*Год*: {film['year']}",
                f"*Рейтинг*: {get_film_rating(page_link)}",
                f"*Ссылка*: {page_link}",
            ])
            message.reply_text(card, parse_mode='Markdown')
            message.reply_text('Читаю комментарии...\n')
            message.reply_text(count_comments_santiment(page_link), parse_mode='Markdown')

    # translation into Korean
    elif label_pred == 2:
        message.reply_text(translate_to_korean(user_text))

    # logging of classified requests
    history['text'].append(user_text)
    history['label'].append(label_pred)
    if len(history['label']) > 1000:
        with open('/content/drive/MyDrive/Colab Notebooks/NLP/CHAT_BOT/history_'+datetime.now().strftime("%d%m%Y_%H%M")+'.txt', 'w', encoding='utf-8') as f:
            f.write(json.dumps(history, default=str))
        # Start a new chunk (same reason as for the dialogue log above).
        history['text'].clear()
        history['label'].clear()

### **ЗАПУСК БОТА**

In [23]:
# Build the updater from the bot token and take its dispatcher.
updater = Updater(token=TOKEN, use_context=True)
dispatcher = updater.dispatcher

# '/start' command -> greeting
start_handler = CommandHandler('start', start)
dispatcher.add_handler(start_handler)

# any text message that is not a command -> intent router
text_handler = MessageHandler(Filters.text & ~Filters.command, bot_answers)
dispatcher.add_handler(text_handler)

In [24]:
# Start listening for incoming messages (polling Telegram for updates).
# NOTE(review): idle() presumably keeps this cell running until the process
# receives a stop signal — confirm against the python-telegram-bot docs.
updater.start_polling()
updater.idle()