# Retrieval-based systems

Retrieval-based bots answer user queries by retrieving the most relevant answer from a pre-defined knowledge base.

In [1]:
import json
import scipy
import pymystem3
import numpy as np

from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors
from itertools import chain

Using TensorFlow backend.


In [2]:
# Load the FAQ knowledge base; the context manager closes the file handle
# as soon as parsing is done instead of leaving it open until garbage collection.
with open("./faq_train.json") as kb_file:
    knowledge_base = json.load(kb_file)

First, let's have a look at our data.

In [3]:
# Show only the first two records to inspect their structure without dumping the whole list.
knowledge_base[:2]

[{'answer': 'ИН работнику должен сообщить его непосредственный руководитель, который видит номер в базе «Заказ ИТ-услуг» (первая строчка в начале страницы)',
  'paraphrased_questions': ['Как узнать свой идентификационный код?',
   'Где я могу посмотреть опознавательный номер недавно прибывшего сотрудника?',
   'Как мне узнать свой номер?',
   'У кого я могу узнать свои идентификационный номер id ?',
   'Где можно ознакомиться с идентификатором нового поступившего сотрудника?',
   'Где указан идентификационный номер сотрудника?',
   'В какой системе можно найти работника?',
   'Как определить нового работника?',
   'В какой системе находится новый работник?',
   'Как узнать идентификатор нового работника?',
   'Как узнать id новичка?',
   'Как узнать идентификационный номер нового сотрудника?',
   'Как узнать идентификационный номер сотрудника?',
   'Где ознакомиться с ИН вновь поступившего работника?',
   'Где узнать свой код?'],
  'question': 'Где можно узнать идентификационный номер 

### Utility functions to evaluate your IR engines

In [4]:
def calc_score_for_engine(engine, val_json_path):
    """
    Evaluates a retrieval engine on a validation file and prints recall@1 and recall@3.

    :param engine: object exposing get_top(query, top_k) that returns the predicted
    answers, best match first.
    :param val_json_path: path to a JSON file holding a list of records, each with
    'paraphrased_questions' and 'answer' keys.
    :return: tuple (recall@1, recall@3)
    """
    # Close the validation file deterministically instead of leaking the handle.
    with open(val_json_path) as val_file:
        validation_set = json.load(val_file)

    data = []
    for record in validation_set:
        answer = record['answer']
        for query in record['paraphrased_questions']:
            # Row layout consumed by calc_recall: [true answer, top-1, top-2, top-3].
            # list() keeps the concatenation valid even if get_top returns a tuple/array.
            data.append([answer] + list(engine.get_top(query, top_k=3)))

    ra1 = calc_recall(data, 1)
    ra3 = calc_recall(data, 3)
    print("recall @1: {}\nrecall @3: {}".format(ra1, ra3))
    return ra1, ra3
            
def calc_recall(data, k, bootstrap=0, subsample_rate=None):
    """
    Computes recall@k: the share of rows whose true answer is among the first k predictions.

    :param data: iterable of rows; row[0] is the true answer, row[1:] are the predicted
    answers, sorted by decreasing score.
    :param k: number of leading predictions inspected in each row.
    :param bootstrap: number of randomly re-weighted counters kept next to the plain one.
    :param subsample_rate: if None the random weights are drawn from Poisson(1),
    otherwise from Binomial(1, subsample_rate).
    :return: recall of the plain (unweighted) counter; the re-weighted counters are
    computed but not returned.
    """
    n_slots = 1 + bootstrap
    total = np.zeros(n_slots)
    hits = np.zeros(n_slots)

    for row in data:
        expected = row[0]

        # Slot 0 always weighs 1 (plain count); the other slots get random weights.
        if subsample_rate is not None:
            resampled = np.random.binomial(1, subsample_rate, bootstrap)
        else:
            resampled = np.random.poisson(lam=1, size=bootstrap)
        weights = np.hstack([[1], resampled])

        total += weights
        if expected in row[1:k + 1]:
            hits += weights

    return (hits / total)[0]

### Exercise 1.1

In this exercise we build a basic IR system using a TF-IDF representation.

In [11]:
class ENGINE_1(object):
    """
    TF-IDF retrieval engine.

    Every knowledge-base record is one class: its 'question' is indexed as a TF-IDF
    vector and its 'answer' is what get_top returns for that class.
    """

    def __init__(self, kbase_path):
        # Close the knowledge-base file deterministically instead of leaking the handle.
        with open(kbase_path) as kbase_file:
            self.knowledge_base = json.load(kbase_file)
        self.lemmatizer = pymystem3.Mystem()

        # contains correct output for each class
        self.answers = np.array([t['answer'] for t in self.knowledge_base])

        self.vectorizer = self.prepare_vectorizer()
        self.vectorized_kbase, self.class_indexes = self.vectorize_knowledge_base()

    def prepare_vectorizer(self):
        """
        Fits TF-IDF vectorizer using all available text from self.knowledge_base
        (answers, paraphrased questions and questions), lemmatized first.

        Returns TF-IDF vectorizer object
        """
        texts = []
        for entry in self.knowledge_base:
            texts.append(entry['answer'])
            texts.extend(entry['paraphrased_questions'])
            texts.append(entry['question'])

        all_texts = [' '.join(self.tokenize_and_lemmatize(text)) for text in tqdm(texts)]

        return TfidfVectorizer().fit(all_texts)

    def vectorize(self, data):
        """
        Turns a list of N strings into their vector representation using self.vectorizer.

        Returns a matrix of shape [N, n_features]
        """
        # The vectorizer vocabulary consists of lemmas (see prepare_vectorizer), so the
        # input has to be lemmatized the same way; raw inflected forms would miss it.
        lemmatized = [' '.join(self.tokenize_and_lemmatize(text)) for text in data]

        # The original computed this matrix but ended with a bare `return`,
        # handing None to every caller.
        return self.vectorizer.transform(lemmatized)

    def vectorize_knowledge_base(self):
        """
        Vectorizes all questions using the vectorize function.
        Builds a list containing class id (its index in self.knowledge_base) for each question.

        Returns vectorized questions and a list of class ids
        """
        questions = [t['question'] for t in self.knowledge_base]
        return self.vectorize(questions), list(range(len(self.knowledge_base)))

    def compute_class_scores(self, similarities):
        """
        Accepts an array of similarities, one per indexed question.
        Computes scores for classes: position i in the array is the score of class i.

        Returns a dictionary of size (n_classes) that looks like
        {
            0: 0.3,
            1: 0.1,
            2: 0.0,
            class_n_id: class_n_score
            ...
        }
        """
        return dict(enumerate(similarities))

    def tokenize_and_lemmatize(self, text):
        """
        Returns the list of lemmas for the word tokens of `text`.

        Items of the analyzer output without an 'analysis' key are skipped; a token
        whose analysis list is empty is kept in its original form.
        """
        tokens = []
        for item in self.lemmatizer.analyze(text.strip()):
            if 'analysis' not in item:
                continue
            try:
                tokens.append(item['analysis'][0]['lex'])
            except IndexError:
                tokens.append(item['text'])
        return tokens

    def get_top(self, query, top_k=3):
        """
        Returns the answers of the top_k classes most similar to `query`, best first.

        `query` may be a single string or a list of strings; only the first query
        of a list is scored.
        """
        if isinstance(query, str):
            query = [query]

        vectorized_query = self.vectorize(query)
        css = cosine_similarity(vectorized_query, self.vectorized_kbase)[0]
        scores = self.compute_class_scores(css)

        sorted_scores = sorted(scores.items(), key=lambda x: x[1])[::-1][:top_k]
        # dtype=int keeps the index array valid even when no classes are selected.
        top_classes = np.array([c[0] for c in sorted_scores], dtype=int)
        return list(self.answers[top_classes])

In [12]:
# Build the TF-IDF engine from the training FAQ file: the constructor fits the
# vectorizer (with a tqdm progress bar) and indexes the knowledge-base questions.
engine1 = ENGINE_1("./faq_train.json")


  0%|                                                                                          | 0/446 [00:00<?, ?it/s]
  0%|▏                                                                                 | 1/446 [00:01<08:55,  1.20s/it]
  0%|▎                                                                                 | 2/446 [00:02<08:42,  1.18s/it]
  1%|▌                                                                                 | 3/446 [00:03<08:33,  1.16s/it]
  1%|▋                                                                                 | 4/446 [00:04<08:26,  1.15s/it]
  1%|▉                                                                                 | 5/446 [00:05<08:21,  1.14s/it]
  1%|█                                                                                 | 6/446 [00:06<08:17,  1.13s/it]
  2%|█▎                                                                                | 7/446 [00:07<08:15,  1.13s/it]
  2%|█▍                                

In [None]:
# Re-fit the vectorizer and sanity-check its type and vocabulary size.
t1 = engine1.prepare_vectorizer()
assert isinstance(t1, TfidfVectorizer)
# vocabulary_ has one entry per feature and exists in every scikit-learn release,
# whereas get_feature_names() was removed in scikit-learn 1.2.
assert len(t1.vocabulary_) > 1500
print("prepare vectorizer OK")

In [None]:
# Vectorize three sample queries and check the type and shape of the result.
t2 = engine1.vectorize(['Как получить ЗП-карту?',
   'А можно зарплатную карту вне очереди получить - очень надо?',
   'Как быть с зарплатой?'])

assert isinstance(t2, scipy.sparse.csr_matrix)
# vocabulary_ has one entry per feature and exists in every scikit-learn release,
# whereas get_feature_names() was removed in scikit-learn 1.2.
assert t2.shape[0] == 3 and t2.shape[1] == len(engine1.vectorizer.vocabulary_)
print("vectorize OK")

In [None]:
# Feed 30 synthetic similarities and check that class i receives the i-th value.
t3 = engine1.compute_class_scores(np.arange(0,1,0.034))
assert isinstance(t3, dict)
assert list(t3.items()) == list(zip(range(30), np.arange(0,1,0.034)))
print("compute_class_scores OK")

In [None]:
# Evaluate ENGINE_1 on the validation file; this exercise requires
# recall@1 above 0.5 and recall@3 above 0.75.
r1, r3 = calc_score_for_engine(engine1, "./faq_val.json")
assert r1 >0.5
assert r3 >0.75
print("scores OK")

### Exercise 1.2

In this exercise we improve our IR system using word vectors.

In [None]:
# Load the pre-trained word vectors (word2vec format) shared by ENGINE_2..ENGINE_4.
w2v = KeyedVectors.load_word2vec_format("word_vectors.w2v")

In [None]:
class ENGINE_2(object):
    """
    Word-vector retrieval engine.

    Every knowledge-base record is one class: its 'question' is indexed as the average
    of the word vectors of its lemmas and its 'answer' is what get_top returns.
    """

    def __init__(self, kbase_path, w2v_model):
        # Close the knowledge-base file deterministically instead of leaking the handle.
        with open(kbase_path) as kbase_file:
            self.knowledge_base = json.load(kbase_file)
        self.lemmatizer = pymystem3.Mystem()
        self.w2v_model = w2v_model

        # contains correct output for each class
        self.answers = np.array([t['answer'] for t in self.knowledge_base])

        self.vectorized_kbase, self.class_indexes = self.vectorize_knowledge_base()

    def vectorize(self, data):
        """
        Turns a list of N strings into their vector representation using self.w2v_model
        by averaging the word vectors of all lemmas found in the model.

        A string with no known lemma is represented by the zero vector.

        Returns a matrix of shape [N, self.w2v_model.vector_size]
        """
        dim = self.w2v_model.vector_size
        vectorized = []
        for text in data:
            word_vectors = [self.w2v_model[token]
                            for token in self.tokenize_and_lemmatize(text)
                            if token in self.w2v_model]
            if word_vectors:
                vectorized.append(np.mean(word_vectors, axis=0))
            else:
                vectorized.append(np.zeros(dim))
        # reshape keeps the result two-dimensional even for an empty input list.
        return np.array(vectorized).reshape(len(vectorized), dim)

    def vectorize_knowledge_base(self):
        """
        Vectorizes all questions using the vectorize function.
        Builds a list containing class id (its index in self.knowledge_base) for each question.

        Returns vectorized questions and a list of class ids
        """
        questions = [t['question'] for t in self.knowledge_base]
        return self.vectorize(questions), list(range(len(self.knowledge_base)))

    def compute_class_scores(self, similarities):
        """
        Accepts an array of similarities, one per entry of self.class_indexes.
        Computes scores for classes: each class gets the similarity of its question.

        Returns a dictionary of size (n_classes) that looks like
        {
            0: 0.3,
            1: 0.1,
            2: 0.0,
            class_n_id: class_n_score
            ...
        }
        """
        # The original returned a name that was never assigned (NameError).
        return dict(zip(self.class_indexes, similarities))

    def tokenize_and_lemmatize(self, text):
        """
        Returns the list of lemmas for the word tokens of `text`.

        Items of the analyzer output without an 'analysis' key are skipped; a token
        whose analysis list is empty is kept in its original form.
        """
        tokens = []
        for item in self.lemmatizer.analyze(text.strip()):
            if 'analysis' not in item:
                continue
            try:
                tokens.append(item['analysis'][0]['lex'])
            except IndexError:
                tokens.append(item['text'])
        return tokens

    def get_top(self, query, top_k=3):
        """
        Returns the answers of the top_k classes most similar to `query`, best first.

        `query` may be a single string or a list of strings; only the first query
        of a list is scored.
        """
        if isinstance(query, str):
            query = [query]

        vectorized_query = self.vectorize(query)
        css = cosine_similarity(vectorized_query, self.vectorized_kbase)[0]
        scores = self.compute_class_scores(css)

        sorted_scores = sorted(scores.items(), key=lambda x: x[1])[::-1][:top_k]
        # dtype=int keeps the index array valid even when no classes are selected.
        top_classes = np.array([c[0] for c in sorted_scores], dtype=int)
        return list(self.answers[top_classes])

In [None]:
# Build the word-vector engine from the training FAQ file and the loaded w2v model.
engine2 = ENGINE_2("./faq_train.json", w2v)

In [None]:
# Vectorize three sample queries; the exercise expects a dense [3, 300] numpy array.
t1 = engine2.vectorize(['Как получить ЗП-карту?',
   'А можно зарплатную карту вне очереди получить - очень надо?',
   'Как быть с зарплатой?'])

assert isinstance(t1, np.ndarray)
assert t1.shape[0] == 3 and t1.shape[1] == 300
print("vectorize OK")

In [None]:
# Evaluate ENGINE_2 on the validation file; this exercise requires
# recall@1 above 0.65 and recall@3 above 0.8.
r1, r3 = calc_score_for_engine(engine2, "./faq_val.json")
assert r1 >0.65
assert r3 >0.8
print("scores OK")

### Exercise 1.3

In this exercise we use available paraphrases to further improve the quality of our IR system.

In [None]:
class ENGINE_3(object):
    """
    Word-vector retrieval engine that also indexes paraphrased questions.

    Every knowledge-base record is one class; its 'question' and all of its
    'paraphrased_questions' are indexed as averaged word vectors, and a class is
    scored by its best-matching indexed text.
    """

    def __init__(self, kbase_path, w2v_model):
        # Close the knowledge-base file deterministically instead of leaking the handle.
        with open(kbase_path) as kbase_file:
            self.knowledge_base = json.load(kbase_file)
        self.lemmatizer = pymystem3.Mystem()
        self.w2v_model = w2v_model

        # contains correct output for each class
        self.answers = np.array([t['answer'] for t in self.knowledge_base])

        self.vectorized_kbase, self.class_indexes = self.vectorize_knowledge_base()

    def vectorize(self, data):
        """
        Turns a list of N strings into their vector representation using self.w2v_model
        by averaging the word vectors of all lemmas found in the model.

        A string with no known lemma is represented by the zero vector.

        Returns a matrix of shape [N, self.w2v_model.vector_size]
        """
        dim = self.w2v_model.vector_size
        vectorized = []
        for text in data:
            word_vectors = [self.w2v_model[token]
                            for token in self.tokenize_and_lemmatize(text)
                            if token in self.w2v_model]
            if word_vectors:
                vectorized.append(np.mean(word_vectors, axis=0))
            else:
                vectorized.append(np.zeros(dim))
        # reshape keeps the result two-dimensional even for an empty input list.
        return np.array(vectorized).reshape(len(vectorized), dim)

    def vectorize_knowledge_base(self):
        """
        Vectorizes all questions AND paraphrased questions using the vectorize function.
        Builds a list containing class id (its index in self.knowledge_base) for each
        vectorized question.

        Example: for 1 question with 5 paraphrases the class id is appended 5+1=6 times.

        Returns a matrix with one row per indexed text and the matching list of class ids
        """
        vectors = []
        class_labels = []

        for class_id, entry in enumerate(self.knowledge_base):
            texts = [entry['question']] + list(entry['paraphrased_questions'])
            vectors.append(self.vectorize(texts))
            class_labels.extend([class_id] * len(texts))

        if not vectors:
            # np.vstack rejects an empty sequence; return an empty matrix instead.
            return np.zeros((0, self.w2v_model.vector_size)), class_labels
        return np.vstack(vectors), class_labels

    def compute_class_scores(self, similarities):
        """
        Accepts an array of similarities, one per entry of self.class_indexes.
        Computes scores for classes: a class gets the highest similarity among its
        indexed texts, and never less than the initial score of 0.

        Returns a dictionary of size (n_classes) that looks like
        {
            0: 0.3,
            1: 0.1,
            2: 0.0,
            class_n_id: class_n_score
            ...
        }
        """
        class_scores = dict(zip(range(len(self.answers)), [0]*len(self.answers)))

        for class_id, similarity in zip(self.class_indexes, similarities):
            if similarity > class_scores[class_id]:
                class_scores[class_id] = similarity
        return class_scores

    def tokenize_and_lemmatize(self, text):
        """
        Returns the list of lemmas for the word tokens of `text`.

        Items of the analyzer output without an 'analysis' key are skipped; a token
        whose analysis list is empty is kept in its original form.
        """
        tokens = []
        for item in self.lemmatizer.analyze(text.strip()):
            if 'analysis' not in item:
                continue
            try:
                tokens.append(item['analysis'][0]['lex'])
            except IndexError:
                tokens.append(item['text'])
        return tokens

    def get_top(self, query, top_k=3):
        """
        Returns the answers of the top_k classes most similar to `query`, best first.

        `query` may be a single string or a list of strings; only the first query
        of a list is scored.
        """
        if isinstance(query, str):
            query = [query]

        vectorized_query = self.vectorize(query)
        css = cosine_similarity(vectorized_query, self.vectorized_kbase)[0]
        scores = self.compute_class_scores(css)

        sorted_scores = sorted(scores.items(), key=lambda x: x[1])[::-1][:top_k]
        # dtype=int keeps the index array valid even when no classes are selected.
        top_classes = np.array([c[0] for c in sorted_scores], dtype=int)
        return list(self.answers[top_classes])

In [None]:
# Build the paraphrase-aware word-vector engine from the training FAQ file.
engine3 = ENGINE_3("./faq_train.json", w2v)

In [None]:
# The index must hold one row per record (its question) plus one row per paraphrase.
t1 = engine3.vectorize_knowledge_base()

assert isinstance(t1[0], np.ndarray)
assert t1[0].shape[0] == sum(
    1 + len(t['paraphrased_questions']) for t in engine3.knowledge_base)

print("vectorize knowledge base OK")

In [None]:
# Evaluate ENGINE_3 on the validation file; this exercise requires
# recall@1 above 0.70 and recall@3 above 0.85.
r1, r3 = calc_score_for_engine(engine3, "./faq_val.json")
assert r1 >0.70
assert r3 >0.85
print("scores OK")

### Exercise 1.4

In this exercise we use all our knowledge to push the performance of our IR system to the limit.

Some suggestions:
* Stopwords removal
* Word-vector re-weighting (e.g. by idf score)
* Combining both TF-IDF and word2vec representation


In [None]:
class ENGINE_4(object):
    """
    Retrieval engine for the open-ended exercise.

    Every knowledge-base record is one class; its 'question' and all of its
    'paraphrased_questions' are indexed, and a class is scored by its best-matching
    indexed text. Sentence encoding is delegated to `bow_encoder`.
    """

    def __init__(self, kbase_path, w2v_model):
        # Close the knowledge-base file deterministically instead of leaking the handle.
        with open(kbase_path) as kbase_file:
            self.knowledge_base = json.load(kbase_file)
        self.lemmatizer = pymystem3.Mystem()
        self.w2v_model = w2v_model

        # contains correct output for each class
        self.answers = np.array([t['answer'] for t in self.knowledge_base])

        self.vectorized_kbase, self.class_indexes = self.vectorize_knowledge_base()

    def vectorize(self, data):
        """
        Turns a list of N strings into their vector representation by calling
        bow_encoder(self.w2v_model, self.tokenize_and_lemmatize, text) for each string.

        Returns the encoder outputs stacked into one numpy array (one row per string).
        """
        # NOTE(review): bow_encoder is neither defined nor imported in the visible part
        # of this notebook; it must be brought into scope before this class can be
        # used - TODO confirm where it lives.
        vectorized = []
        for d in data:
            vectorized.append(bow_encoder(self.w2v_model, self.tokenize_and_lemmatize, d))

        return np.array(vectorized)

    def vectorize_knowledge_base(self):
        """
        Vectorizes all questions AND paraphrased questions using the vectorize function.
        Builds a list containing class id (its index in self.knowledge_base) for each
        vectorized question.

        Example: for 1 question with 5 paraphrases the class id is appended 5+1=6 times.

        Returns a matrix with one row per indexed text and the matching list of class ids
        """
        vectors = []
        class_labels = []

        for class_id, entry in enumerate(self.knowledge_base):
            texts = [entry['question']] + list(entry['paraphrased_questions'])
            vectors.append(self.vectorize(texts))
            class_labels.extend([class_id] * len(texts))

        if not vectors:
            # np.vstack rejects an empty sequence; return an empty matrix instead.
            return np.zeros((0, self.w2v_model.vector_size)), class_labels
        return np.vstack(vectors), class_labels

    def compute_class_scores(self, similarities):
        """
        Accepts an array of similarities, one per entry of self.class_indexes.
        Computes scores for classes: a class gets the highest similarity among its
        indexed texts, and never less than the initial score of 0.

        Returns a dictionary of size (n_classes) that looks like
        {
            0: 0.3,
            1: 0.1,
            2: 0.0,
            class_n_id: class_n_score
            ...
        }
        """
        class_scores = dict(zip(range(len(self.answers)), [0]*len(self.answers)))

        for class_id, similarity in zip(self.class_indexes, similarities):
            if similarity > class_scores[class_id]:
                class_scores[class_id] = similarity
        return class_scores

    def tokenize_and_lemmatize(self, text):
        """
        Returns the list of lemmas for the word tokens of `text`.

        Items of the analyzer output without an 'analysis' key are skipped; a token
        whose analysis list is empty is kept in its original form.
        """
        tokens = []
        for item in self.lemmatizer.analyze(text.strip()):
            if 'analysis' not in item:
                continue
            try:
                tokens.append(item['analysis'][0]['lex'])
            except IndexError:
                tokens.append(item['text'])
        return tokens

    def get_top(self, query, top_k=3):
        """
        Returns the answers of the top_k classes most similar to `query`, best first.

        `query` may be a single string or a list of strings; only the first query
        of a list is scored.
        """
        if isinstance(query, str):
            query = [query]

        vectorized_query = self.vectorize(query)
        css = cosine_similarity(vectorized_query, self.vectorized_kbase)[0]
        scores = self.compute_class_scores(css)

        sorted_scores = sorted(scores.items(), key=lambda x: x[1])[::-1][:top_k]
        # dtype=int keeps the index array valid even when no classes are selected.
        top_classes = np.array([c[0] for c in sorted_scores], dtype=int)
        return list(self.answers[top_classes])

### Exercise 1.5

Finally, integrate your best model with your telegram frontend.

### Make sure config.py contains your valid bot token

In [None]:
import logging

from telegram.ext import Updater, CommandHandler, MessageHandler, Filters

from config import TOKEN, LOG_FILE


# Enable logging
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# logging.getLogger returns the same logger object on every call, so re-running this
# cell would stack another pair of handlers and repeat every log line.
# Attach the handlers only once.
if not logger.handlers:
    # Logging to file
    fh = logging.FileHandler(LOG_FILE)
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    # Logging to console
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    ch.setFormatter(formatter)

    logger.addHandler(fh)
    logger.addHandler(ch)


class Bot:
    """Telegram front-end: registers the handlers on an Updater and runs polling."""

    def __init__(self):
        self.updater = Updater(TOKEN)
        self.dsp = self.updater.dispatcher

        # register handler functions which define how the bot reacts to events
        command_callbacks = (
            ("start", get_help),
            ("help", get_help),
            ("sentiment", get_sentiment),
            ("answer", get_answer),
        )
        for command, callback in command_callbacks:
            self.dsp.add_handler(CommandHandler(command, callback))
        # plain text messages are echoed back
        self.dsp.add_handler(MessageHandler(Filters.text, echo))
        self.dsp.add_error_handler(error)

        logger.info('Im alive!')

    def power_on(self):
        """Starts polling for updates and then hands control to updater.idle()."""
        self.updater.start_polling()
        self.updater.idle()

# define command handlers. These usually take the two arguments: bot and
# update. Error handlers also receive the raised TelegramError object in error.


def echo(bot, update):
    """Logs the incoming text message and sends the same text back to its chat."""
    message = update.message
    logger.info('echo recieved message: {}'.format(message.text))
    bot.sendMessage(message.chat_id, text=message.text)


def error(bot, update, error):
    """Logs the update together with the error that was raised while handling it."""
    # all uncaught telegram-related exceptions will be rerouted here
    details = 'Update "%s" caused error "%s"' % (update, error)
    logger.error(details)


def get_help(bot, update):
    """Replies with a greeting and the list of commands the bot supports."""
    logger.info('get_help recieved message: {}'.format(update.message.text))
    user = update.message.from_user
    # A Telegram user may have no last name; joining only the non-empty parts
    # avoids greeting them as "<first name> None".
    full_name = ' '.join(part for part in (user.first_name, user.last_name) if part)
    # /answer is registered as a command handler, so it is listed here as well.
    help_msg = ('Greetings, {}! Name is {}, at your service.\n'
                'I currently support the following commands:\n'
                '/start - begins our chat and prints this message\n'
                '/help - prints this message\n'
                '/sentiment [message] - predicts the sentiment of the message\n'
                '/answer [question] - finds the most relevant answer to the question').format(
        full_name, bot.name)
    bot.sendMessage(update.message.chat_id, text=help_msg)


def get_sentiment(bot, update):
    """
    Handles '/sentiment [message]': replies with the sentiment score of the message,
    or with a usage hint when no message follows the command.
    """
    logger.info('get_sentiment recieved message: {}'.format(update.message.text))
    # Split on any whitespace so that a newline, several spaces, or a trailing space
    # after the command are handled as well.
    parts = update.message.text.split(maxsplit=1)
    if len(parts) < 2:
        bot.sendMessage(update.message.chat_id, text='Write your message after the command')
        return
    try:
        # message text without the command '/sentiment'
        usr_msg = parts[1]
        # Placeholder score: determine the sentiment of usr_msg here.
        # This should yield a real number in [0,1].
        msg_sentiment = 0.5
        # Message text has to be a string, not a float.
        bot.sendMessage(update.message.chat_id, text=str(msg_sentiment))
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error(e) dropped.
        logger.exception(e)
        
def get_answer(bot, update):
    """
    Handles '/answer [question]': replies with the best-matching knowledge-base answer,
    or with a usage hint when no question follows the command.
    """
    logger.info('get_answer recieved message: {}'.format(update.message.text))
    # Split on any whitespace so that a newline, several spaces, or a trailing space
    # after the command are handled as well.
    parts = update.message.text.split(maxsplit=1)
    if len(parts) < 2:
        bot.sendMessage(update.message.chat_id, text='Write your message after the command')
        return
    try:
        # message text without the command
        usr_msg = parts[1]
        # The original never assigned `answer`, so every request ended in a NameError
        # that was only logged and the user got no reply.
        # engine1 is the retrieval engine built earlier in this notebook; point this
        # at whichever engine scores best on the validation set.
        answer = engine1.get_top(usr_msg, top_k=1)[0]
        bot.sendMessage(update.message.chat_id, text=str(answer))
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error(e) dropped.
        logger.exception(e)

# Create the bot (registers all handlers) and start it; power_on() starts polling
# and then calls updater.idle().
my_bot = Bot()
my_bot.power_on()
