In [3]:
import logging
import telebot
from config import TOKEN, LOG_FILE

import re
import os
import json
import nltk
import scipy
import pymystem3
import numpy as np
import pandas as pd

from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import RegexpTokenizer

from nltk.tag import pos_tag
from nltk.stem.snowball import SnowballStemmer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from gensim.models import KeyedVectors
from gensim.models.word2vec import Word2Vec

from keras.models import load_model
import keras.backend as K

import tensorflow as tf

from itertools import chain
from tqdm import tqdm

Using TensorFlow backend.


In [4]:
class ENGINE_4(object):
    def __init__(self, kbase_path, w2v_model):
        self.knowledge_base = json.load(open(kbase_path))
        self.lemmatizer = pymystem3.Mystem()
        self.w2v_model = w2v_model
        
        # contains correct output for each class
        self.answers = np.array([t['answer'] for t in self.knowledge_base])
        self.tfidfmodel = self.prepare_vectorizer()
        self.idf = self.tfidfmodel.idf_
        self.idf_dict = {key:value for key, value in zip(self.tfidfmodel.get_feature_names(), self.idf)} 
        
        self.vectorized_kbase, self.class_indexes = self.vectorize_knowledge_base()
        
    def prepare_vectorizer(self):
        """
        Fits TF-IDF vectorizer using all available text from self.knowledge_base
        
        Returns TF-IDF vectorizer object
        """
        # your code goes here
        vectorizer = None
        all_texts = []
        
        for kn in self.knowledge_base:
            all_texts.append(kn['answer'])
            for par in kn['paraphrased_questions']:
                all_texts.append(par)
            all_texts.append(kn['question'])
        
        # all_texts = [' '.join(self.tokenize_and_lemmatize(text)) for text in tqdm(texts)]
        
        vectorizer = TfidfVectorizer(max_df = 0.9, min_df = 2, tokenizer=self.tokenize_and_lemmatize).fit(all_texts)

        return vectorizer
    
    def vectorize(self, data):
        """
        Turns a list of N strings into their vector representation using self.w2v_model.
        In the simplest case, averages the word vectors of all words in a sentence.
        Returns a a matrix of shape [N, 300]
        """
        vectorized = []
        # your code goes here
        
        for text in data:
            zero_vector = np.zeros(self.w2v_model.vector_size)
            word_vectors = []
            
            for token in self.tokenize_and_lemmatize(text):
                try:
                    word_vectors.append(self.w2v_model[token]*self.idf_dict[token])
                except Exception as e:
                    pass
            
            if len(word_vectors):
                vectorized.append(np.mean(word_vectors, axis=0))
            else:
                vectorized.append(zero_vector)
        
        return np.array(vectorized)
        
    def vectorize_knowledge_base(self):
        """
        Vectorizes all questions AND paraphrased questions using the vectorize function.
        Builds a list containing class id (it's index in self.knowledge_base) for each vectorized question.
        
        Example: you vectorized 1 question and 5 paraphrases for that question
        Then you should append the ID of the question to class_labels list 5+1=6 times.
        """
        vectors = []
        class_labels = []
        
        # your code goes here
        for num, kn in enumerate(self.knowledge_base):
            step_kn = [kn['question']] + kn['paraphrased_questions']
            vectors.append(self.vectorize(step_kn))
            for i in range(len(step_kn)):
                class_labels.append(num)
            
        return np.vstack(vectors), np.array(class_labels)
    
    def compute_class_scores(self, similarities):
        """
        Accepts an array of similarities of shape (self.class_indexes, )
        Computes scores for classes.
        Returns a dictionary of size (n_classes) that looks like
        {
            0: 0.3,
            1: 0.1,
            2: 0.0,
            class_n_id: class_n_score
            ...
        }
        """
        
        class_scores = dict(zip(range(len(self.answers)), [0]*len(self.answers)))
        
        for n,i in enumerate(self.class_indexes):
            class_scores[i] += similarities[n]
        
        # your code goes here
        return class_scores
        
    def tokenize_and_lemmatize(self, text):
        analysis = self.lemmatizer.analyze(text.strip())
        tokens = []
        for an in analysis:
            if 'analysis' in an:
                try:
                    tokens.append(an['analysis'][0]['lex'])
                except IndexError:
                    tokens.append(an['text'])
        return tokens
    
    def get_top(self, query, top_k=3):
        if isinstance(query, str):
            query = [query]
            
        vectorized_query = self.vectorize(query)
        css = cosine_similarity(vectorized_query, self.vectorized_kbase)[0]
        scores = self.compute_class_scores(css)
        
        sorted_scores = sorted(scores.items(), key= lambda x: x[1])[::-1][:top_k]
        top_classes = np.array([c[0] for c in sorted_scores])
        top_answers = list(self.answers[top_classes])
        return top_answers

In [6]:
class Backend(object):
    """
    class contets few machine learning text handle methods
    """
    
    def __init__(self, model_path, w2v_path, engine_w2v_path, json_path):
        """
        initialize our class with model file, w2v file and json faq files 
        """
        self.model = load_model(model_path)
        self.w2v = Word2Vec.load(w2v_path)
        self.w2v_engine = KeyedVectors.load_word2vec_format(engine_w2v_path)
        self.faq = json_path
        self.tokenizer = RegexpTokenizer('[a-zA-Z0-9@]+')
        self.stemmer = LancasterStemmer()
        self.graph = tf.get_default_graph()
        self.engine = ENGINE_4(self.json_path, self.w2v_engine)
        
    def preproc_tweet(self, text):
        """
        function preprocesing tweet to good format
        """
        max_tweet_length = 20
        vector_size = 512
        
        matrix = np.zeros((1, max_tweet_length, vector_size), dtype=K.floatx())

        for t, token in enumerate(self.tokenizer.tokenize(text.lower())):
            if t >= max_tweet_length:
                break
            try:    
                matrix[0, t, :] = self.w2v[self.stemmer.stem(token)]
            except:
                pass
                
        return matrix
    
    def predict_sentiment(self, matrix):
        """
        fucntion returns sentiment
        argeuments:
            matrix - numpy array, matrix of w2v words ebeddings
            model - keras deep learning model
        """
        graph = self.graph
        global graph
        with graph.as_default():
            sentiment = self.model.predict(matrix)[0][1]
        
        return sentiment

  global graph


In [7]:
# prepr = bk.predict_sentiment(bk.preproc_tweet('What a lonely day! And it\'s mine'))

In [8]:
bot = telebot.TeleBot(TOKEN)

In [10]:
# load some backend for bot
bk = Backend('../lecture1/weights-improvement-09-0.81.hdf5', '../lecture1/test.wv','word_vectors.w2v','./new_faq.json')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# Logging to file
fh = logging.FileHandler(LOG_FILE)
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
# Logging to console
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)

logger.addHandler(fh)
logger.addHandler(ch)

@bot.message_handler(commands=['start', 'help'])
def greetings(message):
    try:
        logger.info('Started chat # {}'.format(message.chat.id))
        bot.reply_to(message, 'Приветики, {} {}! Меня зовут {} (если коротко, то Сэнди), я к твоим услугам.\n'
                    'Вот что я могу:\n'
                    '/start - Начинаю чат и пишу тебе все, что умею (каждый раз, чтобы не забыл)\n'
                    '/help - Печатаю помощь (да, чтобы не забыл ;))\n'
                    '/sentiment [message] - предсказываю с помощью хрустального шара положительную (1) или отрицательную (0) окраску твита (только по-аглийски)\n'
                    '/answer [message] - Отвечаю по моему справочнику FAQ (надеюсь, что это не что-то неприличное) на твое сообщение'.format(message.chat.first_name,
                                                                        message.chat.last_name, bot.get_me().username))
    except:
        bot.send_message(message.chat.id, 'Извини, я ошиблась. С кем не бывает?!')
        
@bot.message_handler(commands=['sentiment'])
def predict_sentiment(message):
    try:
        text = ' '.join(message.text.split(' ')[1:]) # some idiot bicycle to delete command from text
        result = bk.predict_sentiment(bk.preproc_tweet(text))
        bot.send_message(message.chat.id, result)
        logger.info('Predicted sentiment {} for tweet {}'.format(result, text))
    except:
        bot.send_message(message.chat.id, 'Извини, я ошиблась. С кем не бывает?!')
        
@bot.message_handler(commands=['answer'])
def answer(message):
    answer = bk.engine.get_top(message, top_k=1)
    bot.send_message(message.chat.id, answer[0])
    
@bot.message_handler(content_types=['text'])
def small_talk(message):
    bot.send_message(message.chat.id, "Я бы с тобой поговорила, да вот сказать нечего :(\n Введи-ка лучше команду.")

ValueError: Dimension (-1) must be in the range [0, 2), where 2 is the number of dimensions in the input. for 'metrics/acc/ArgMax' (op: 'ArgMax') with input shapes: [?,?], [].

In [6]:
if __name__ == '__main__':
    bot.polling(none_stop=True)

2017-12-03 13:25:35,663 - __main__ - INFO - Started chat # 74141678
2017-12-03 13:27:08,034 - __main__ - INFO - Predicted sentiment 0.8989554047584534 for tweet We will rise again
2017-12-03 13:27:24,596 - __main__ - INFO - Predicted sentiment 0.009651520289480686 for tweet I am so sick
