In [1]:
# %%capture
# !pip install pyserini==0.10.0

import os, joblib
import numpy as np
# Java installation used by this notebook (presumably required by
# pyserini's JVM bridge -- confirm for your environment).
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"


# Single root for all CAST artefacts. Set the CAST_ROOT environment variable
# to run the notebook on another machine; the default keeps the original paths.
cast_root = os.environ.get("CAST_ROOT", "/mnt/workdrive/Study/CAST")

# Index directory handed to the searcher.
cast_index_loc = f'{cast_root}/indices/index-cast2019'
# TREC CAsT 2020 automatic evaluation topics (JSON).
cast_data_loc = f'{cast_root}/treccastweb/2020/2020_automatic_evaluation_topics_v1.0.json'
# Pickled idf table loaded with joblib.
tfidf_loc = f'{cast_root}/idf_counter.pkl'
# tfidf_loc = f'{cast_root}/msmarco/msmarco_tfidf.pkl'

In [2]:
from pyserini.search import SimpleSearcher

# Open the search index located at `cast_index_loc`; this object is later
# passed to PQE as its retrieval backend.
searcher = SimpleSearcher(cast_index_loc)

In [3]:
# Load the idf table from disk. Despite the name, this object is passed to PQE
# as `idf` and is only queried through `.get(token, default)`.
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code;
# only load files from a trusted source.
vectorizer = joblib.load(tfidf_loc)

In [14]:
# Passage Query Expansion Module.
# Given a query, we do the following:
# 1. First, we classify the utterance as implicit or explicit.
# 2. For explicit utterances, we expand the query.
#
# For explicit queries, we retrieve the top-k documents and extract the expansion keywords from them.

import gc, joblib
import numpy as np
import pandas as pd
from collections import Counter, defaultdict

from tf_idf_vectorizer import text_processor as idf_processor
from utils import preprocess_utterance as preprocess_utterance


class PQE(object):
    """
    Passage Query Expansion.

    Retrieves the top documents for an utterance, scores their tokens with
    TF-IDF and appends the best-scoring tokens to the utterance.
    """

    def __init__(self, ir_engine, idf, top_k_documents, top_k_tokens):
        """
        :param ir_engine: PySerini Backend Engine over CAST documents. Must
            provide ``search(query, k=...)`` returning hits with a ``raw``
            attribute.
        :param idf: Token Inverse document frequency. Must provide
            ``get(token, default)``.
        :param top_k_documents: int number of documents retrieved per utterance.
        :param top_k_tokens: int maximum number of expansion tokens.
        """
        self.backend_engine = ir_engine
        self.idf = idf
        self.top_k_documents = top_k_documents
        self.top_k_tokens = top_k_tokens

    def classify_utterance(self, utterance):
        """
        Decide whether an utterance should be expanded.

        Currently a placeholder: every utterance is expanded.

        :param utterance: str
        :return: True if utterance to be expanded else False
        """
        return True

    def _rank_tokens(self, documents):
        """
        Rank tokens by their best per-document TF-IDF score.

        :param documents: iterable of str document texts
        :return: list of at most ``top_k_tokens`` tokens, highest score first.
            Ties keep first-seen order, so the result is deterministic.
        """
        # -1.0 default so that a token with score 0 (unknown idf) is still recorded.
        scores = defaultdict(lambda: -1.0)

        for doc in documents:
            term_freq = Counter(idf_processor(doc).split())

            for tok, count in term_freq.items():
                score = count * self.idf.get(tok, 0)
                # Keep the maximum score a token reaches in any single document.
                if score > scores[tok]:
                    scores[tok] = score

        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        return [tok for tok, _ in ranked[:self.top_k_tokens]]

    def get_topk_token(self, documents):
        """
        :param documents: iterable of str document texts
        :return: set of the ``top_k_tokens`` highest TF-IDF tokens
        """
        return set(self._rank_tokens(documents))

    def expand_query(self, utterance):
        """
        :param utterance: str
        :return query: str expanded query; the utterance itself when it is not
            expanded, nothing is retrieved, or no expansion token is found.
        """
        if not self.classify_utterance(utterance):
            return utterance

        # 1. Get top-k documents
        results = self.backend_engine.search(utterance, k=self.top_k_documents)

        if len(results) < self.top_k_documents:
            print(f'Number of results {len(results)} are less than top-k {self.top_k_documents}.')
            print(f'Query: {utterance}')

        if len(results) == 0:
            return utterance

        # Hits without stored raw text cannot be tokenized, so skip them.
        documents = [res.raw for res in results if res.raw]

        # 2. Get TF-IDF query expansion
        # Use the ranked list rather than a set: set iteration order over
        # strings changes between interpreter runs, which made the expanded
        # query non-reproducible.
        extension_tokens = self._rank_tokens(documents)

        if not extension_tokens:
            # Avoid returning the utterance with a dangling trailing space.
            return utterance

        return utterance + " " + " ".join(extension_tokens)


In [25]:
# Build the query expander: retrieve 100 documents per utterance and keep
# the 5 best-scoring expansion tokens.
pqe = PQE(
    ir_engine=searcher,
    idf=vectorizer,
    top_k_documents=100,
    top_k_tokens=5,
)

In [29]:
# Example: expand a sample utterance; the cell output is the expanded query string.
pqe.expand_query('I would like to learn about planet Jupiter')

'I would like to learn about planet Jupiter migration jupiter kepler39b planets ganymede'