In [4]:
import gc
import time
import os, joblib
os.chdir('/content/drive/MyDrive/finalproj')
import _pickle as cpickle
import numpy as np

from utils import read_topics_as_utterances
from pyserini.search import SimpleSearcher
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from hqe import HQE
from pqe import PQE
from subprocess import Popen, PIPE
from ranker import rerank

# --- Index / resource locations (relative to the project directory) ---
cast_index_loc = './index-cast2019'
cast_data_loc = (
    './treccastweb/2020/2020_automatic_evaluation_topics_v1.0.json'
)
tfidf_loc = './idf_lemmatized_counter.pkl'

# --- TREC CAsT 2019 topic files ---
training_topics = (
    './treccastweb/2019/data/training/train_topics_v1.0.json'
)
evaluation_topics = (
    './treccastweb/2019/data/evaluation/evaluation_topics_v1'
    '.0.json'
)

# Java runtime location exported for the JVM-backed search library.
# NOTE(review): this runs after `pyserini` has already been imported in the
# same cell -- confirm the JVM is only located lazily, otherwise move this
# assignment above the import.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"


In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): the first cell calls os.chdir('/content/drive/MyDrive/finalproj'),
# which can only succeed once this mount exists -- on a fresh runtime this cell
# has to run before that one; consider moving it to the top of the notebook.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Download the NLTK 'stopwords' resource.
# NOTE(review): presumably required by the project-local expansion modules
# (hqe / pqe) -- confirm which module actually reads it.
import nltk
nltk.download('stopwords')
!pip install pyserini

In [5]:
# Default settings for a single pipeline run.
#   hqe    -> handed as a whole to HQE(...)
#   pqe    -> top_k_documents / top_k_tokens are forwarded to PQE(...)
#   ranker -> passage_max_len / query_max_len are forwarded to rerank(...)
config = dict(
    cardinality=10,
    verbose=True,
    hqe=dict(
        qt_thresh=3,
        st_thresh=3,
        q_thresh=12,
        last_k=3,
        use_orig_for_query=True,
    ),
    pqe=dict(top_k_documents=10, top_k_tokens=5),
    ranker=dict(passage_max_len=448, query_max_len=64),
)


In [40]:
class Pipeline:
    """Query expansion -> first-stage search -> passage re-ranking.

    The pipeline is assembled from ``cfg``:

    * ``cfg['cardinality']`` is stored on the instance (required).
    * ``cfg['hqe']`` (optional) adds an ``HQE`` expansion module.
    * ``cfg['pqe']`` (optional) adds a ``PQE`` expansion module.
    * ``cfg['ranker']`` (optional) adds a ``rerank`` re-ranking module;
      ``cfg['verbose']`` is forwarded to it (defaults to ``False``).

    The index location and the IDF pickle location are read from the
    notebook-level names ``cast_index_loc`` and ``tfidf_loc``.
    """

    def __init__(self, cfg):
        """Store ``cfg`` and eagerly load every component of the pipeline."""
        self.cfg = cfg
        self.cardinality = cfg['cardinality']

        self.backend_engine = None
        self.idf = None
        self.expansion_pipeline = None
        self.tokenizer = None
        self.model = None
        self.ranker_pipeline = None
        # Maps each expansion module object to the extension sets it
        # produced during the most recent `query_expansion` call.
        self.query_dict = {}
        self.prepare_pipeline(cfg)

    def prepare_pipeline(self, cfg):
        """Load the searcher, IDF table, BERT model and configured modules.

        Populates ``backend_engine``, ``idf``, ``tokenizer``, ``model``,
        ``expansion_pipeline`` (list, possibly empty) and
        ``ranker_pipeline`` (``None`` when ``'ranker'`` is not in ``cfg``).
        """
        print(f'Loading Backed PySerini Engine from {cast_index_loc}')
        self.backend_engine = SimpleSearcher(cast_index_loc)

        print(f'Loading IDF values from {tfidf_loc}')
        start = time.time()
        # The garbage collector is paused while unpickling. The previous
        # state is restored in `finally` so a failed load cannot leave GC
        # disabled for the rest of the kernel session.
        gc_was_enabled = gc.isenabled()
        gc.disable()
        try:
            # NOTE: unpickling can execute arbitrary code -- only point
            # `tfidf_loc` at a file you created or otherwise trust.
            with open(tfidf_loc, 'rb') as f:
                self.idf = cpickle.load(f)
        finally:
            if gc_was_enabled:
                gc.enable()
        print("Loading completed in {} sec.".format(time.time() - start))

        print('Loading BERT reranker and tokenizer')
        self.tokenizer = AutoTokenizer.from_pretrained(
            "amberoad/bert-multilingual-passage-reranking-msmarco")
        self.model = AutoModelForSequenceClassification.from_pretrained(
            "amberoad/bert-multilingual-passage-reranking-msmarco")

        components = []
        if 'hqe' in cfg:
            print('Adding HQE module to pipeline')
            components.append(HQE(self.backend_engine, cfg['hqe']))

        if 'pqe' in cfg:
            print('Adding PQE module to pipeline')
            pqe_cfg = cfg['pqe']
            components.append(
                PQE(
                    ir_engine=self.backend_engine,
                    idf=self.idf,
                    top_k_documents=pqe_cfg['top_k_documents'],
                    top_k_tokens=pqe_cfg['top_k_tokens']
                )
            )

        if 'ranker' in cfg:
            print('Adding Reranker module to pipeline')
            ranker_cfg = cfg['ranker']
            self.ranker_pipeline = rerank(
                ir_engine=self.backend_engine,
                passage_max_len=ranker_cfg['passage_max_len'],
                query_max_len=ranker_cfg['query_max_len'],
                tokenizer=self.tokenizer,
                model=self.model,
                # 'verbose' is optional; default to quiet.
                verbose=cfg.get('verbose', False)
            )

        self.expansion_pipeline = components
        print('Pipeline load completed.')

    def query_expansion(self, utterances):
        """Run every expansion module in order and return expanded queries.

        Each module receives the output of the previous one, and its
        extension terms are appended (space separated) to the running
        query strings.  With no expansion modules configured the input
        is returned unchanged.

        Side effects: records each module's extension sets in
        ``self.query_dict`` and appends the utterances plus that dict to
        ``post_eval.txt`` in the current working directory.
        """
        if len(self.expansion_pipeline) == 0:
            return utterances

        expanded_queries = utterances
        # To expand on the original utterances only (instead of the output
        # of the previous module), replace `expanded_queries[idx]` below
        # with `utterances[idx]`. Leave as is to chain the modules.
        for module in self.expansion_pipeline:
            extension_sets = module.expand_queries(expanded_queries)
            expanded_queries = [
                expanded_queries[idx] + " " + " ".join(extension_sets[idx])
                for idx in range(len(utterances))
            ]
            self.query_dict[module] = extension_sets
        with open('post_eval.txt', 'a') as f:
            f.write("{} {} {}".format(utterances, '\n\n\n', self.query_dict))
        return expanded_queries

    def query_execution(self, utterances, k=20):
        """Search the index once per query.

        :param utterances: iterable of query strings.
        :param k: number of hits requested per query (default 20, the
            depth that was previously hard-coded).
        :return: one list of ``docid`` values per query, in hit order.
        """
        return [
            [hit.docid for hit in self.backend_engine.search(query, k=k)]
            for query in utterances
        ]

    def passage_reranker(self, utterances, results):
        """Re-rank each query's result list with the configured ranker.

        :param utterances: query strings, aligned with ``results``.
        :param results: one list of doc ids per query.
        :return: one re-ranked list per query.  When no ranker was
            configured (``'ranker'`` missing from the config) the
            first-stage order is returned unchanged instead of failing
            on a ``None`` ranker.
        """
        if self.ranker_pipeline is None:
            return [list(result) for result in results]

        return [
            self.ranker_pipeline.rerank(query, result)
            for query, result in zip(utterances, results)
        ]

    def execute(self, utterances):
        """Expand, search and re-rank ``utterances``.

        :return: ``(results, reranked)`` -- the first-stage doc-id lists
            and their re-ranked counterparts, one list per utterance.
        """
        queries = self.query_expansion(utterances)
        results = self.query_execution(queries)
        reranked = self.passage_reranker(queries, results)

        return results, reranked

In [42]:
def _write_trec_rows(handle, topic_number, rankings):
    """Write one topic's rankings to ``handle`` as TREC run-file rows.

    Each row is ``<topic>_<turn> Q0 <docid> <rank> <score> Automatic_run``.
    Scores are synthetic: they start at 10 for rank 1 and are multiplied
    by 0.95 per rank, so they only encode the ordering.
    """
    for turn_number, docids in enumerate(rankings, start=1):
        score = 10
        for rank, docid in enumerate(docids, start=1):
            handle.write(
                "{} {} {} {} {} {}\n".format(
                    '{}_{}'.format(topic_number, turn_number),
                    'Q0', docid, rank, score, 'Automatic_run'))
            score *= 0.95


def _trec_eval_ndcg(run_file):
    """Run ``trec_eval -m ndcg`` on ``run_file`` and return its last column.

    The command is executed inside the project directory without a shell.
    The last whitespace-separated field of every output line is kept
    (what ``awk '{print $NF}'`` would print); lines are joined with
    newlines.  An empty string is returned when trec_eval prints nothing
    on stdout.
    """
    process = Popen(
        ['./trec_eval/trec_eval', '-m', 'ndcg',
         './treccastweb/2019/data/training/train_topics_mod.qrel',
         './' + run_file],
        cwd='/content/drive/MyDrive/finalproj',
        stdout=PIPE)
    stdout = process.communicate()[0].decode('ascii')
    last_fields = []
    for line in stdout.splitlines():
        fields = line.split()
        last_fields.append(fields[-1] if fields else '')
    return '\n'.join(last_fields).strip('\n')


def optim(qt_thresh=3, st_thresh=3, q_thresh=12, top_k_documents=10,
          top_k_token=5):
    """Run the pipeline on the training topics and score it with trec_eval.

    A fresh ``Pipeline`` is built from the given hyper-parameters, every
    training topic is executed, and two TREC run files are (re)written in
    the current working directory:

    * ``run_<params>.txt``           -- first-stage retrieval order
    * ``reranked_run_<params>.txt``  -- order after re-ranking

    where ``<params>`` is the five arguments joined with underscores.

    :param qt_thresh: forwarded as ``hqe['qt_thresh']``.
    :param st_thresh: forwarded as ``hqe['st_thresh']``.
    :param q_thresh: forwarded as ``hqe['q_thresh']``.
    :param top_k_documents: forwarded as ``pqe['top_k_documents']``.
    :param top_k_token: forwarded as ``pqe['top_k_tokens']``.
    :return: ``(ndcg, ndcg_)`` -- trec_eval's nDCG output for the
        re-ranked run and for the first-stage run, both as strings
        (empty when trec_eval produced no output).
    :raises OSError: if the trec_eval binary cannot be started.
    """
    config = {
        'cardinality': 10,
        'verbose': False,
        'hqe': {
            'qt_thresh': qt_thresh,
            'st_thresh': st_thresh,
            'q_thresh': q_thresh,
            'last_k': 3,
            'use_orig_for_query': True
        },
        'pqe': {
            'top_k_documents': top_k_documents,
            'top_k_tokens': top_k_token,
        },
        'ranker': {
            'passage_max_len': 448,
            'query_max_len': 64
        }
    }

    pipeline = Pipeline(config)
    # NOTE(review): assumed to yield one list of utterances per topic,
    # in topic order -- confirm against utils.read_topics_as_utterances.
    train_utterances = read_topics_as_utterances(training_topics)

    extn = "{}_{}_{}_{}_{}".format(qt_thresh, st_thresh, q_thresh,
                                   top_k_documents, top_k_token)
    run_file = 'run_' + extn + '.txt'
    reranked_file = 'reranked_run_' + extn + '.txt'

    # Opening with 'w' truncates output of an earlier run with the same
    # parameters; both files stay open for the whole loop instead of
    # being reopened once per topic.
    with open(run_file, 'w') as run_handle, \
            open(reranked_file, 'w') as reranked_handle:
        for topic_number, utter in enumerate(train_utterances, start=1):
            first_stage, reranked = pipeline.execute(utter)
            _write_trec_rows(run_handle, topic_number, first_stage)
            _write_trec_rows(reranked_handle, topic_number, reranked)

    ndcg = _trec_eval_ndcg(reranked_file)
    ndcg_ = _trec_eval_ndcg(run_file)
    return ndcg, ndcg_

In [None]:
!git clone https://github.com/usnistgov/trec_eval.git

In [None]:
!cd trec_eval && make

In [None]:
!ls 

In [38]:
!./trec_eval/trec_eval ../finalproj/treccastweb/2019/data/training/train_topics_mod.qrel ../finalproj/reranked_run_2_5_16_200_10.txt

trec_eval.get_results: Cannot read results file '../finalproj/reranked_run_2_5_16_200_10.txt'
trec_eval: Quit in file '../finalproj/reranked_run_2_5_16_200_10.txt'


In [None]:
# Download the NLTK 'wordnet' resource before running the pipeline.
# NOTE(review): presumably needed by the project-local expansion modules --
# confirm which module reads it.
nltk.download('wordnet')
# Baseline run with the default hyper-parameters; the cell output is the
# (re-ranked nDCG, first-stage nDCG) pair returned by optim(), as strings.
optim()

In [28]:
from hyperopt import tpe, hp, fmin, STATUS_OK, Trials, space_eval

In [None]:
def opt_func(params):
    """Hyperopt objective: negative nDCG of the re-ranked run.

    Runs ``optim`` with the sampled hyper-parameters and appends one
    tab-separated line to ``optimization.txt``:
    the five parameters, then the re-ranked and the first-stage nDCG.

    :param params: dict with keys ``qt_thresh``, ``st_thresh``,
        ``q_thresh``, ``top_k_documents`` and ``top_k_token``.
    :return: ``{"loss": -ndcg, "status": STATUS_OK}``; when the re-ranked
        nDCG cannot be parsed as a number (e.g. trec_eval printed nothing
        because it could not read the run file) the trial is reported as
        ``STATUS_FAIL`` instead of aborting the whole search.
    """
    # Local import: the notebook's hyperopt import cell does not bring
    # STATUS_FAIL into scope.
    from hyperopt import STATUS_FAIL

    reranked_ndcg, first_stage_ndcg = optim(
        qt_thresh=params['qt_thresh'],
        st_thresh=params['st_thresh'],
        q_thresh=params['q_thresh'],
        top_k_documents=params['top_k_documents'],
        top_k_token=params['top_k_token'])

    # Log before parsing so failed evaluations are still recorded.
    with open("optimization.txt", 'a') as f:
        f.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
            params['qt_thresh'], params['st_thresh'], params['q_thresh'],
            params['top_k_documents'], params['top_k_token'],
            reranked_ndcg, first_stage_ndcg
        ))

    try:
        # fmin minimises, so the score is negated.
        loss = -float(reranked_ndcg)
    except ValueError:
        return {"status": STATUS_FAIL}
    return {"loss": loss,
            "status": STATUS_OK}
# Start every search with an empty log; opt_func appends one line per trial.
with open("optimization.txt", 'w'):
    pass

# Search space: continuous ranges for the HQE thresholds, discrete
# candidate lists for the PQE cut-offs.
space = {
    'qt_thresh': hp.uniform('qt_thresh', 2, 4),
    'st_thresh': hp.uniform('st_thresh', 2, 4),
    'q_thresh': hp.uniform('q_thresh', 11, 18),
    'top_k_documents': hp.choice('top_k_documents', list(range(3, 11))),
    'top_k_token': hp.choice('top_k_token', list(range(3, 9))),
}

# Fresh trial history for this search.
trials = Trials()

best = fmin(
    opt_func,
    space,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials,
)

print(f"Best: {best}")
print(trials.results)
print(trials.best_trial)
# For hp.choice dimensions `best` holds the index of the chosen option;
# space_eval maps it back to the actual parameter values.
print(space_eval(space, best))

[('planet', 30.055985050463484), ('earth', 20.94031031834027), ('proved', 17.97188832851041), ('core', 17.586819607258256), ('temperature', 16.665516114184342)]
[('axis', 20.51935228818746), ('solar', 19.20696133737254), ('planet', 18.03359103027809), ('sun', 16.524966970451143), ('direction', 16.22640117447195)]
[('planet', 24.044788040370786), ('season', 22.21498379093823), ('sun', 22.033289293934857), ('earth', 20.94031031834027), ('axis', 20.51935228818746)]
[('planet', 30.055985050463484), ('axis', 20.51935228818746), ('solar', 19.20696133737254), ('sun', 16.524966970451143), ('direction', 16.22640117447195)]
[('planet', 30.055985050463484), ('axis', 20.51935228818746), ('solar', 19.20696133737254), ('current', 18.533394526768397), ('crossing', 18.29884413348155)]
[('equation', 51.38490756745931), ('chemical', 29.30477417320152), ('reaction', 23.41997566460376), ('set', 19.373101132272254), ('theory', 15.241131791633203)]
[('chemical', 41.02668384248213), ('reaction', 40.984957413