In [7]:
import json
from sklearn.model_selection import train_test_split

In [26]:
from copy import deepcopy

In [27]:
bjs = json.load(open("./faqs/alfa_faq.02.json"))

traindata = []
testdata = []
valdata = []


for b in bjs:
    trainq, testq = train_test_split(b['paraphrased_questions'], test_size=0.5, random_state=34)
    valq, testq = train_test_split(testq, test_size=0.5, random_state=34)

    b.pop('paraphrase_author')
    b1 = deepcopy(b)
    b2 = deepcopy(b)
    b3 = deepcopy(b)
    
    b1['paraphrased_questions'] = trainq
    traindata.append(b1)
    b2['paraphrased_questions'] = testq
    testdata.append(b2)
    b3['paraphrased_questions'] = valq
    valdata.append(b3)
    
json.dump(traindata, open("faq_train.json","w"))
json.dump(testdata, open("faq_test.json","w"))
json.dump(valdata, open("faq_val.json","w"))

In [34]:
123

123

# Retrieval-based systems

Retrieval-based bots answer user queries by retrieving the most relevant answer from a pre-defined knowledge base.

In [24]:
import json
import nltk
import scipy
import pymystem3
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors
from itertools import chain

In [3]:
knowledge_base = json.load(open("../faq_train.json"))

First, let's have a look at our data.

In [None]:
knowledge_base[:4]

### Utility functions to evaluate your IR engines

In [4]:
def calc_score_for_engine(engine, val_json_path):
    data = []
    for q in json.load(open(val_json_path)):
        queries = q['paraphrased_questions']
        answer = q['answer']
        results = [engine.get_top(query, top_k=3) for query in queries]
        for r in results:
            data.append([answer] + r)
            
    ra1 = calc_recall(data, 1)
    ra3 = calc_recall(data, 3)
    print("recall @1: {}\nrecall @3: {}".format(ra1, ra3))
    return ra1, ra3
            
def calc_recall(data, k, bootstrap=0, subsample_rate=None):
    """
    :param data: 2d matrix
    data[i, 0] - true answer, data[i, 1:] - predicted answers, sorted by decreasing score.
    """
    count = np.zeros(1 + bootstrap)
    count_hit = np.zeros(1 + bootstrap)
    for fields in data:
        query = fields[0]

        if subsample_rate is None:
            increment = np.random.poisson(lam=1, size=bootstrap)
        else:
            increment = np.random.binomial(1, subsample_rate, bootstrap)
        increment = np.hstack([[1], increment])

        if query in fields[1:k+1]:
            count_hit += increment
        count += increment

    recall = count_hit / count

    return recall[0]

### Exercise 1.1

In this exercise we build a basic IR system using a TF-IDF representation.

In [25]:
class ENGINE_1(object):
    def __init__(self, kbase_path):
        self.knowledge_base = json.load(open(kbase_path))
        self.lemmatizer = pymorphy2.MorphAnalyzer()
        
        # contains correct output for each class
        self.answers = np.array([t['answer'] for t in self.knowledge_base])
        
        self.vectorizer = self.prepare_vectorizer()
        self.vectorized_kbase, self.class_indexes = self.vectorize_knowledge_base()
        
    
    def prepare_vectorizer(self):
        """
        Fits TF-IDF vectorizer using all available text from self.knowledge_base
        Returns TF-IDF vectorizer object
        """
        
        vectorizer = TfidfVectorizer(ngram_range=(1,2), tokenizer=self.tokenize_and_lemmatize)
        all_texts = []
        for kb in self.knowledge_base:
            all_texts.append(kb['answer'])
            all_texts.append(kb['question'])
            all_texts += kb['paraphrased_questions']
        vectorizer.fit(all_texts)
        return vectorizer
    
    def vectorize(self, data):
        """
        Turns a list of N strings into their vector representation using self.vectorizer.
        Returns a a matrix of shape [N, n_features]
        """
        
        return self.vectorizer.transform(data)
        
    def vectorize_knowledge_base(self):
        """
        Vectorizes all questions using the vectorize function.
        Builds a list containing class number for each question.        
        """
        questions = [t['question'] for t in self.knowledge_base]
        return self.vectorize(questions), list(range(len(self.knowledge_base)))
    
    def compute_class_scores(self, similarities):
        """
        Accepts an array of similarities of shape (self.class_indexes, )
        Computes scores for classes.
        Returns a dictionary of size (n_classes) that looks like
        {
            0: 0.3,
            1: 0.1,
            2: 0.0,
            class_n_id: class_n_score
            ...
        }
        """
        
        class_scores = dict(zip(range(len(self.answers)), similarities))
        return class_scores
        
    def tokenize_and_lemmatize(self, text):
        tokens = nltk.word_tokenize(text)

        for tk in tokens:

                try:
                    lemma = self.lemmatizer.parse(tk)[0].normal_form
                    tokens.append(lemma)
                except IndexError:
                    tokens.append(tk)
        return tokens
    
    def get_top(self, query, top_k=3):
        if isinstance(query, str):
            query = [query]
            
        vectorized_query = self.vectorize(query)
        css = cosine_similarity(vectorized_query, self.vectorized_kbase)[0]
        scores = self.compute_class_scores(css)
        
        sorted_scores = sorted(scores.items(), key= lambda x: x[1])[::-1][:top_k]
        top_classes = np.array([c[0] for c in sorted_scores])
        top_answers = list(self.answers[top_classes])
        return top_answers

In [28]:
import pymorphy2

In [None]:
engine1 = ENGINE_1("../faq_train.json")

In [7]:
t1 = engine1.prepare_vectorizer()
assert isinstance(t1, TfidfVectorizer)
assert len(t1.get_feature_names()) > 1500
print("prepare vectorizer OK")

prepare vectorizer OK


In [8]:
t2 = engine1.vectorize(['Как получить ЗП-карту?',
   'А можно зарплатную карту вне очереди получить - очень надо?',
   'Как быть с зарплатой?'])

assert isinstance(t2, scipy.sparse.csr_matrix)
assert t2.shape[0] == 3 and t2.shape[1] == len(engine1.vectorizer.get_feature_names())
print("vectorize OK")

vectorize OK


In [9]:
t3 = engine1.compute_class_scores(np.arange(0,1,0.034))
assert isinstance(t3, dict)
assert list(t3.items()) == list(zip(range(30), np.arange(0,1,0.034)))
print("compute_class_scores OK")

compute_class_scores OK


In [10]:
r1, r3 = calc_score_for_engine(engine1, "./faq_val.json")
assert r1 >0.5
assert r3 >0.75
print("scores OK")

recall @1: 0.578125
recall @3: 0.8072916666666666
scores OK


### Exercise 1.2

In this exercise we improve our IR system using word vectors.

In [7]:
w2v = KeyedVectors.load_word2vec_format("../word_vectors.w2v")

In [8]:
def bow_encoder(wmodel, tokenizer, text, vsize=300):
    """
    This function encodes text into a vector.
    
    First, it tokenizes input text using the provided tokenizer function.
    Then it uses the provided word2vec model to get the vectors corresponding to text's tokens.
    Finally, it computes an average of all token's vectors and returns it.
    
    If the function failed to find and encode any words, it should at least return a vector of zeros.
    """
    tokens = tokenizer(text)
    
    zero_vector = np.zeros(vsize)
    word_vectors = []
    
    for token in tokens:
        if token in wmodel:
            word_vectors.append(wmodel[token])
            
    if len(word_vectors):
        sent_vector = np.mean(word_vectors, axis=0)
    else:
        sent_vector = zero_vector
    # your code goes here
    return sent_vector

In [9]:
class ENGINE_2(object):
    def __init__(self, kbase_path, w2v_model):
        self.knowledge_base = json.load(open(kbase_path))
        self.lemmatizer = pymystem3.Mystem()
        self.w2v_model = w2v_model
        
        # contains correct output for each class
        self.answers = np.array([t['answer'] for t in self.knowledge_base])
        
        self.vectorized_kbase, self.class_indexes = self.vectorize_knowledge_base()
    
    def vectorize(self, data):
        """
        Turns a list of N strings into their vector representation using self.w2v_model.
        In the simplest case, averages the word vectors of all words in a sentence.
        Returns a a matrix of shape [N, 300] (300 = word vector dimensionality)
        """
        vectorized = []
        for d in data:
            vectorized.append(bow_encoder(self.w2v_model, self.tokenize_and_lemmatize, d))
        
        return np.array(vectorized)
        
    def vectorize_knowledge_base(self):
        """
        Vectorizes all questions using the vectorize function.
        Builds a list containing class number for each question.        
        """
        questions = [t['question'] for t in self.knowledge_base]
        return self.vectorize(questions), list(range(len(self.knowledge_base)))
    
    def compute_class_scores(self, similarities):
        """
        Accepts an array of similarities of shape (self.class_indexes, )
        Computes scores for classes.
        Returns a dictionary of size (n_classes) that looks like
        {
            0: 0.3,
            1: 0.1,
            2: 0.0,
            class_n_id: class_n_score
            ...
        }
        """
        
        class_scores = dict(zip(range(len(self.answers)), similarities))
        return class_scores
        
    def tokenize_and_lemmatize(self, text):
        analysis = self.lemmatizer.analyze(text.strip())
        tokens = []
        for an in analysis:
            if 'analysis' in an:
                try:
                    tokens.append(an['analysis'][0]['lex'])
                except IndexError:
                    tokens.append(an['text'])
        return tokens
    
    def get_top(self, query, top_k=3):
        if isinstance(query, str):
            query = [query]
            
        vectorized_query = self.vectorize(query)
        css = cosine_similarity(vectorized_query, self.vectorized_kbase)[0]
        scores = self.compute_class_scores(css)
        
        sorted_scores = sorted(scores.items(), key= lambda x: x[1])[::-1][:top_k]
        top_classes = np.array([c[0] for c in sorted_scores])
        top_answers = list(self.answers[top_classes])
        return top_answers

In [14]:
engine2 = ENGINE_2("./faq_train.json", w2v)

In [15]:
t1 = engine2.vectorize(['Как получить ЗП-карту?',
   'А можно зарплатную карту вне очереди получить - очень надо?',
   'Как быть с зарплатой?'])

assert isinstance(t1, np.ndarray)
assert t1.shape[0] == 3 and t1.shape[1] == 300
print("vectorize OK")

vectorize OK


In [16]:
r1, r3 = calc_score_for_engine(engine2, "./faq_val.json")
assert r1 >0.65
assert r3 >0.8
print("scores OK")

recall @1: 0.6666666666666666
recall @3: 0.8541666666666666
scores OK


### Exercise 1.3

In this exercise we use available paraphrases to further improve the quiality of our IR system.

In [10]:
class ENGINE_3(object):
    def __init__(self, kbase_path, w2v_model):
        self.knowledge_base = json.load(open(kbase_path))
        self.lemmatizer = pymystem3.Mystem()
        self.w2v_model = w2v_model
        
        # contains correct output for each class
        self.answers = np.array([t['answer'] for t in self.knowledge_base])
        
        self.vectorized_kbase, self.class_indexes = self.vectorize_knowledge_base()
    
    def vectorize(self, data):
        """
        Turns a list of N strings into their vector representation using self.w2v_model.
        In the simplest case, averages the word vectors of all words in a sentence.
        Returns a a matrix of shape [N, 300]
        """
        vectorized = []
        for d in data:
            vectorized.append(bow_encoder(self.w2v_model, self.tokenize_and_lemmatize, d))
        
        return np.array(vectorized)
        
    def vectorize_knowledge_base(self):
        """
        Vectorizes all questions using the vectorize function.
        Builds a list containing class number for each question.        
        """
        vectors = []
        class_labels = []
        
        for i, t in enumerate(self.knowledge_base):
            vc = np.vstack([self.vectorize([t['question']]),
                            self.vectorize(t['paraphrased_questions'])])
            
            vectors.append(vc)
            class_labels.append(i)
            class_labels += [i]*len(t['paraphrased_questions'])
        
        
        return np.vstack(vectors), class_labels
    
    def compute_class_scores(self, similarities):
        """
        Accepts an array of similarities of shape (self.class_indexes, )
        Computes scores for classes.
        Returns a dictionary of size (n_classes) that looks like
        {
            0: 0.3,
            1: 0.1,
            2: 0.0,
            class_n_id: class_n_score
            ...
        }
        """
        
        class_scores = dict(zip(range(len(self.answers)), [0]*len(self.answers)))
        
        for ci, sc in zip(self.class_indexes, similarities):
            class_scores[ci] += sc
        return class_scores
        
    def tokenize_and_lemmatize(self, text):
        analysis = self.lemmatizer.analyze(text.strip())
        tokens = []
        for an in analysis:
            if 'analysis' in an:
                try:
                    tokens.append(an['analysis'][0]['lex'])
                except IndexError:
                    tokens.append(an['text'])
        return tokens
    
    def get_top(self, query, top_k=3):
        if isinstance(query, str):
            query = [query]
            
        vectorized_query = self.vectorize(query)
        css = cosine_similarity(vectorized_query, self.vectorized_kbase)[0]
        scores = self.compute_class_scores(css)
        
        sorted_scores = sorted(scores.items(), key= lambda x: x[1])[::-1][:top_k]
        top_classes = np.array([c[0] for c in sorted_scores])
        top_answers = list(self.answers[top_classes])
        return top_answers

In [31]:
engine3 = ENGINE_3("../faq_train.json", w2v)

In [14]:
t1 = engine3.vectorize_knowledge_base()

assert isinstance(t1[0], np.ndarray)
assert t1[0].shape[0] == len([t['answer'] for t in engine3.knowledge_base]) +len(list(chain.from_iterable(
    [t['paraphrased_questions'] for t in engine3.knowledge_base])))

print("vectorize knowledge base OK")

vectorize knowledge base OK


In [15]:
r1, r3 = calc_score_for_engine(engine3, "../faq_val.json")
assert r1 >0.70
assert r3 >0.85
print("scores OK")

recall @1: 0.7395833333333334
recall @3: 0.8958333333333334
scores OK


In [None]:
import logging

from telegram.ext import Updater, CommandHandler, MessageHandler, Filters

from config import TOKEN, LOG_FILE


# Enable logging
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# Logging to file
fh = logging.FileHandler(LOG_FILE)
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
# Logging to console
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)

logger.addHandler(fh)
logger.addHandler(ch)


class Bot:
    def __init__(self):

        self.updater = Updater(TOKEN)
        self.dsp = self.updater.dispatcher

        # register handler functions which define how the bot reacts to events
        self.dsp.add_handler(CommandHandler("start", get_help))
        self.dsp.add_handler(CommandHandler("help", get_help))
        self.dsp.add_handler(CommandHandler("sentiment", get_sentiment))
        self.dsp.add_handler(CommandHandler("answer", get_answer))
        self.dsp.add_handler(MessageHandler(Filters.text, echo))
        self.dsp.add_error_handler(error)

        logger.info('Im alive!')

    def power_on(self):
        # start the Bot
        self.updater.start_polling()
        self.updater.idle()

# define command handlers. These usually take the two arguments: bot and
# update. Error handlers also receive the raised TelegramError object in error.


def echo(bot, update):
    logger.info('echo recieved message: {}'.format(update.message.text))
    bot.sendMessage(update.message.chat_id, text=update.message.text)


def error(bot, update, error):
    # all uncaught telegram-related exceptions will be rerouted here
    logger.error('Update "%s" caused error "%s"' % (update, error))


def get_help(bot, update):
    logger.info('get_help recieved message: {}'.format(update.message.text))
    help_msg = ('Greetings, {} {}! Name is {}, at your service.\n'
                'I currently support the following commands:\n'
                '/start - begins our chat and prints this message\n'
                '/help - prints this message\n'
                '/sentiment [message] - predicts the sentiment of the message').format(
        update.message.from_user.first_name, update.message.from_user.last_name, bot.name)
    bot.sendMessage(update.message.chat_id, text=help_msg)


def get_sentiment(bot, update):
    logger.info('get_sentiment recieved message: {}'.format(update.message.text))
    try:
        # get message text without the command '/sentiment'
        usr_msg = update.message.text.split(' ', maxsplit=1)[1]
        msg_sentiment = 0.5
        '''
        Now determine the sentiment of usr_msg.
        This should return a real number in [0,1].
        Your code goes here.
        '''
        bot.sendMessage(update.message.chat_id, text=msg_sentiment)
    except IndexError:
        bot.sendMessage(update.message.chat_id, text='Write your message after the command')
    except Exception as e:
        logger.error(e)
        
def get_answer(bot, update):
    logger.info('get_answer recieved message: {}'.format(update.message.text))
    try:
        # get message text without the command 
        usr_msg = update.message.text.split(' ', maxsplit=1)[1]
        answer = engine3.get_top(usr_msg)[0]
        '''
        Now determine the sentiment of usr_msg.
        This should return a real number in [0,1].
        Your code goes here.
        '''
        bot.sendMessage(update.message.chat_id, text=answer)
    except IndexError:
        bot.sendMessage(update.message.chat_id, text='Write your message after the command')
    except Exception as e:
        logger.error(e)

my_bot = Bot()
my_bot.power_on()


2017-12-02 14:01:23,671 - __main__ - INFO - Im alive!
2017-12-02 14:01:23,671 - __main__ - INFO - Im alive!
2017-12-02 14:01:23,671 - __main__ - INFO - Im alive!
2017-12-02 14:01:23,671 - __main__ - INFO - Im alive!
2017-12-02 14:01:23,671 - __main__ - INFO - Im alive!
2017-12-02 14:01:29,250 - __main__ - INFO - get_answer recieved message: /answer this
2017-12-02 14:01:29,250 - __main__ - INFO - get_answer recieved message: /answer this
2017-12-02 14:01:29,250 - __main__ - INFO - get_answer recieved message: /answer this
2017-12-02 14:01:29,250 - __main__ - INFO - get_answer recieved message: /answer this
2017-12-02 14:01:29,250 - __main__ - INFO - get_answer recieved message: /answer this
2017-12-02 14:01:55,840 - __main__ - INFO - get_answer recieved message: /answer где мне получить зарплату
2017-12-02 14:01:55,840 - __main__ - INFO - get_answer recieved message: /answer где мне получить зарплату
2017-12-02 14:01:55,840 - __main__ - INFO - get_answer recieved message: /answer где м