# Boolean retrieval implementation and execution over TREC-DL 2020 Passage Ranking task

In [72]:
import nltk
import string

import re

import numpy as np

import time

from scipy.sparse import csr_matrix, lil_matrix
from multiprocessing import Pool

import pickle
import gc

import os

import scipy.sparse as scipy_sparse

from datetime import datetime

In [103]:
MSMARCO_PASSAGE_COLLECTION="/mnt/0060f889-4c27-409b-b0de-47f5427515e3/unicamp/ia368v_dd/pyserini/collections/msmarco-passage/collection.tsv"

STEMMED_DOCS_FILE="/mnt/0060f889-4c27-409b-b0de-47f5427515e3/unicamp/ia368v_dd/ia368v_dd_class_02/indexes/tempfile.pkl"

STEMMED_DOCS_FILE_FORMAT="{}_{}.pkl"
STEMMED_DOCS_FOLDER="/mnt/0060f889-4c27-409b-b0de-47f5427515e3/unicamp/ia368v_dd/ia368v_dd_class_02/indexes"

REVERSED_INDEX_FOLDER="/mnt/0060f889-4c27-409b-b0de-47f5427515e3/unicamp/ia368v_dd/ia368v_dd_class_02/indexes"
REVERSED_INDEX_FILE_FORMAT="reversed_index_TREC-DL_2020_{}.pkl"

REVERSED_INDEX_FILE="/mnt/0060f889-4c27-409b-b0de-47f5427515e3/unicamp/ia368v_dd/ia368v_dd_class_02/indexes/reversed_index_TREC-DL_2020_complete.pkl"

TREC_DL_2020_QUERIES="/mnt/0060f889-4c27-409b-b0de-47f5427515e3/unicamp/ia368v_dd/pyserini/collections/trec-dl_2020-passage/msmarco-test2020-queries.tsv"
TREC_DL_2020_QRELS="/mnt/0060f889-4c27-409b-b0de-47f5427515e3/unicamp/ia368v_dd/pyserini/collections/trec-dl_2020-passage/2020qrels-pass.txt"

TREC_DL_2020_RUN_FORMAT="TREC_DL_2020_boolean_run_{}.tsv"
TREC_DL_2020_RUN_FOLDER="/mnt/0060f889-4c27-409b-b0de-47f5427515e3/unicamp/ia368v_dd/ia368v_dd_class_02/runs"

TREC_EVAL_FULLPATH="/mnt/0060f889-4c27-409b-b0de-47f5427515e3/unicamp/ia368v_dd/pyserini/tools/eval/trec_eval.9.0.4/trec_eval"

NUMBER_OF_DOCS=8841822

PARTIAL_STEMMED_DOCS_COUNT=1000000

## Text preprocessing functions

In [3]:
stop_words = set(nltk.corpus.stopwords.words('english'))
punctuation = set(string.punctuation)
stemmer = nltk.stem.PorterStemmer()

In [4]:
def test_preprocess_text(which_text):

    all_tokens = nltk.word_tokenize(which_text.lower())
    cleaned_tokens = [token for token in all_tokens if token not in stop_words and token not in punctuation]
    stemmed_tokens = [stemmer.stem(token) for token in cleaned_tokens]

    return stemmed_tokens

## Read all the documents and their corresponding indexes

In [120]:
read_documents_text = []
read_documents_index = []

In [121]:
with open(MSMARCO_PASSAGE_COLLECTION, 'r', encoding="utf-8") as inputFile:
    
    for i in range(NUMBER_OF_DOCS):
        line = inputFile.readline()
        cleaned = re.sub("(â.{1}.{1})", " ", line)
        document_data = cleaned.split('\t')        

        read_documents_text.append(document_data[1])
        read_documents_index.append(document_data[0])

In [122]:
len(read_documents_text)

8841822

In [139]:
read_documents_text[715779]

'The Complete Medical Record and Electronic Charting Chapter Outline The Medical Record Important Uses of the Medical Record Medical Record Formats Source-Oriented Medical Record (SOMR) Problem-Oriented Medical Record (POMR) Combining Formats Contents of the Medical Record Administrative Information in a Medical Record Clinical Information in a Medical Record Creating and Maintaining the Medical Record Maintaining the Medical Record Documenting in the Medical Record Electronic Medical Records (EMRs) Laws That Affect the Medical Record The Health Insurance Portability and Accountability Act of\n'

In [127]:
read_documents_index[1038342]

'1038342'

## Now, preprocess all the documents in parallel

In [None]:
with Pool(processes=8) as pool:
    stemmed_documents = pool.map(test_preprocess_text, read_documents_text)

In [None]:
with open(STEMMED_DOCS_FILE, "wb") as outputFile:
    pickle.dump({"doc_ids": read_documents_index, 
                 "stemmed_docs": stemmed_documents}, outputFile, pickle.HIGHEST_PROTOCOL)

In [None]:
del read_documents_text

In [None]:
del reversed_index

In [None]:
gc.collect()

In [None]:
len(stemmed_documents)

### Split the stemmed document file in smaller ones

In [None]:
with open(STEMMED_DOCS_FILE, "rb") as inputFile:
    
    temp_file = pickle.load(inputFile)
    
    read_documents_index = temp_file['doc_ids']
    stemmed_documents = temp_file['stemmed_docs']

In [None]:
for i in range(NUMBER_OF_DOCS // PARTIAL_STEMMED_DOCS_COUNT + 1):
    with open(os.path.join(STEMMED_DOCS_FOLDER, STEMMED_DOCS_FILE_FORMAT.format("temp", i)), "wb") as outputFile:
        pickle.dump({"doc_ids": read_documents_index[i * PARTIAL_STEMMED_DOCS_COUNT:(i * PARTIAL_STEMMED_DOCS_COUNT) + PARTIAL_STEMMED_DOCS_COUNT], 
                     "stemmed_docs": stemmed_documents[i * PARTIAL_STEMMED_DOCS_COUNT:(i * PARTIAL_STEMMED_DOCS_COUNT) + PARTIAL_STEMMED_DOCS_COUNT]}, outputFile, pickle.HIGHEST_PROTOCOL)

In [None]:
del read_documents_index
del stemmed_documents
del temp_file

In [None]:
gc.collect()

## Finally, compute the reversed index using a sparse matrix to store the documents' word frequencies

In [None]:
#
# This function loads the partial file with the stemmed documents and computes the reversed index for the terms in the
# documents.
#
# After that, the reversed index computed to the part of the entire collection is saved to be processed later.
#

def process_stemmed_docs_file(file_index):
    
    reversed_index = {}
    
    with open(os.path.join(STEMMED_DOCS_FOLDER, STEMMED_DOCS_FILE_FORMAT.format("temp", file_index)), "rb") as inputFile:
        temp_file = pickle.load(inputFile)

        read_documents_index = temp_file['doc_ids']
        stemmed_documents = temp_file['stemmed_docs']                

    print("Document range: {} until {}".format(read_documents_index[0], read_documents_index[-1]))
        
    start_time = time.time()
        
    for doc_index in range(len(stemmed_documents)):
        
        if doc_index % 100000 == 0:
            print(time.time() - start_time)
            print(doc_index)
            start_time = time.time()
        
        for token in stemmed_documents[doc_index]:
            if token not in reversed_index:
                reversed_index[token] = lil_matrix((1, NUMBER_OF_DOCS), dtype=np.short)

            reversed_index[token][0, int(read_documents_index[doc_index])] += 1

    del read_documents_index
    del stemmed_documents
    del temp_file            
            
    with open(os.path.join(REVERSED_INDEX_FOLDER, REVERSED_INDEX_FILE_FORMAT.format(file_index)), 'wb') as outputFile:
        pickle.dump(reversed_index, outputFile, pickle.HIGHEST_PROTOCOL)
    
    return True

### Process in parallel all the stemmed documents file parts

In [None]:
with Pool(processes=2) as pool:
    results = pool.map(process_stemmed_docs_file, range(0, NUMBER_OF_DOCS // PARTIAL_STEMMED_DOCS_COUNT + 1))

In [None]:
gc.collect()

### Load the precomputed reversed indexes and consolidate them into a single dictionary

The reversed index file had to be created and saved in chunks due to local RAM limitations. However, it is possible to consolidate it into a single dictionary and save it later.

In [None]:
start_time = time.time()

with open(os.path.join(REVERSED_INDEX_FOLDER, REVERSED_INDEX_FILE_FORMAT.format(0)), 'rb') as inputFile:
    reversed_indexes = pickle.load(inputFile)

for i in range(1, NUMBER_OF_DOCS // PARTIAL_STEMMED_DOCS_COUNT + 1):
    with open(os.path.join(REVERSED_INDEX_FOLDER, REVERSED_INDEX_FILE_FORMAT.format(i)), 'rb') as inputFile:
        reversed_indexes_tmp = pickle.load(inputFile)
        
    for token, token_docs in reversed_indexes_tmp.items():
        if token in reversed_indexes:
            reversed_indexes[token] = reversed_indexes[token] + token_docs
        else:
            reversed_indexes[token] = token_docs
            
print(time.time() - start_time)

In [None]:
gc.collect()

In [None]:
len(reversed_indexes)

In [None]:
with open(REVERSED_INDEX_FILE, 'wb') as outputFile:
    pickle.dump(reversed_indexes, outputFile, pickle.HIGHEST_PROTOCOL)

## .. or read the preprocessed data

In [5]:
with open(REVERSED_INDEX_FILE, 'rb') as inputFile:
    reversed_indexes = pickle.load(inputFile)

In [6]:
len(reversed_indexes)

3750155

## Now run the queries

In [7]:
query_index = []
query_text = []

### Read all the queries in memory

In [8]:
with open(TREC_DL_2020_QUERIES, 'r', encoding="utf-8") as inputFile:
    for line in inputFile:
        query_data = line.split('\t')
        
        query_index.append(query_data[0])
        query_text.append(query_data[1])

In [9]:
len(query_text)

200

### Preprocess the queries the same way as the documents

In [10]:
with Pool(processes=8) as pool:
    stemmed_queries = pool.map(test_preprocess_text, query_text)

In [11]:
len(stemmed_queries)

200

### Find the query matches

In [113]:
def find_related_documents(stemmed_query, only_all_terms=False):
    
    related_documents = {}
    
    for token in stemmed_query:
        if token in reversed_indexes:
            
#             print(token)
            
            related_docs = reversed_indexes[token].nonzero()
            
            if type(reversed_indexes[token]) is scipy_sparse._csr.csr_matrix:
                related_docs_scores = np.array(reversed_indexes[token][related_docs])[0]
            else:
                related_docs_scores = np.array(reversed_indexes[token][related_docs].todense())[0]
            
            for i, doc in enumerate(related_docs[1]):
                if only_all_terms:
                    if not related_documents:
                        related_documents[doc] = 0

                    if doc in related_documents:
                        related_documents[doc] += related_docs_scores[i]
                else:
                    if doc not in related_documents:
                        related_documents[doc] = 0
                    
                    related_documents[doc] += related_docs_scores[i]
            
    return related_documents

In [114]:
process_start_time = time.time()

queries_matches = []

for i in range(200):
    start_time = time.time()

    print("{} : {}".format(i, stemmed_queries[i]))
    
    queries_matches.append(find_related_documents(stemmed_queries[i]))

    print("{}\n".format(time.time() - start_time))
    
print("Total time to process the queries: {}\n".format(time.time() - process_start_time))    

0 : ['aziz', 'hashim']
0.0016260147094726562

1 : ['rep', 'scalis']
0.007838964462280273

2 : ['kill', 'nichola', 'ii', 'russia']
0.1594095230102539

3 : ['own', 'barnhart', 'crane']
0.05151653289794922

4 : ['said', 'one', 'make', 'feel', 'inferior']
2.6888651847839355

5 : ['sing', 'monk', 'theme', 'song']
0.11971712112426758

6 : ['highest', 'career', 'passer', 'rate', 'nfl']
0.5287888050079346

7 : ['hunter', 'pattern', 'shotgun']
0.06828188896179199

8 : ['place', 'scalp', 'feel', 'sore']
0.6487905979156494

9 : ['pete', 'rose', 'ban', 'hall', 'fame']
0.08589935302734375

10 : ['thoma', 'cooley']
0.027501821517944336

11 : ['definit', 'endors']
0.31087207794189453

12 : ['hormon', 'increas', 'calcium', 'level', 'blood']
0.9822475910186768

13 : ['defin', 'geon']
0.14307117462158203

14 : ['amazon', 'rainforest', 'locat']
0.4675126075744629

15 : ['four', 'forc', 'act', 'airplan', 'equilibrium']
0.5646154880523682

16 : ['defin', 'pareto', 'chart', 'statist']
0.30861949920654297

1

0.4984745979309082

135 : ['averag', 'wed', 'dress', 'alter', 'cost']
0.8791859149932861

136 : ['project', 'definit']
0.4269862174987793

137 : ['barclay', 'fca', 'number']
0.5474715232849121

138 : ['benefit', 'polici', 'layoff']
0.27014899253845215

139 : ['hour', 'clinic']
0.4460756778717041

140 : ['symptom', 'shingl']
0.27480006217956543

141 : ['biggest', 'loser', 'challeng']
0.09341049194335938

142 : ['villag', 'burnham']
0.038063764572143555

143 : ['vitamin', 'e', 'anti', 'scar']
0.1474909782409668

144 : ['weather', 'antigua', 'novemb']
0.18822431564331055

145 : ['weather', 'novi', 'sad']
0.142167329788208

146 : ['best', 'food', 'lower', 'cholesterol']
0.985297441482544

147 : ['carvedilol', 'use']
2.0077054500579834

148 : ['caus', 'bruis', 'appear']
0.8589925765991211

149 : ['caus', 'muscl', 'tear']
0.8229949474334717

150 : ['counti', 'dexter', 'michigan']
0.27413344383239746

151 : ['counti', 'new', 'york', 'new', 'york']
1.3285675048828125

152 : ['counti', 'rio', '

In [118]:
len(queries_matches[0].keys())

213

In [138]:
stemmed_queries[102]

['natur', 'record', 'public', 'inform']

### Now save the results in the TREC format

In [95]:
TREC_RESULT_LINE_FORMAT="{}\tQ0\t{}\t{}\t{}\tboolean\n"

In [89]:
MAX_RESULTS_TO_SAVE=1000

In [109]:
def generate_trec_format(queries_matches, query_ids, output_filename, verbose=False):
    
    with open(output_filename, 'w') as outputFile:
        for i, query_result in enumerate(queries_matches):
            
            if verbose:
                print("Saving query {}\n{}:".format(i, query_text[i]))
            
            relevant_docs = np.array(list(query_result.keys()))
            relevant_docs_scores = np.array(list(query_result.values()))
            
            relevant_docs_order = np.argsort(relevant_docs_scores)[::-1]
            
            if verbose:
                print("relevant_docs.shape={}".format(relevant_docs.shape))
            
            relevant_docs_final_result = relevant_docs[relevant_docs_order]
            relevant_docs_final_score = relevant_docs_scores[relevant_docs_order]
            
            if verbose:
                print("relevant_docs_final_result: {}\n\n".format(relevant_docs_final_result))
            
            for j, each_match in enumerate(relevant_docs_final_result[:MAX_RESULTS_TO_SAVE]):
                outputFile.write(TREC_RESULT_LINE_FORMAT.format(query_ids[i], each_match, j, relevant_docs_final_score[j]))

In [133]:
run_filename = os.path.join(TREC_DL_2020_RUN_FOLDER, TREC_DL_2020_RUN_FORMAT.format(datetime.now().strftime("%Y%m%d_%H%M%S")))

In [134]:
generate_trec_format(queries_matches, query_index, run_filename)

### Now, apply the TREC metrics

In [135]:
!{TREC_EVAL_FULLPATH} -c -mrecall.1000 -mmap -mndcg_cut.10 -mrecip_rank \
    {TREC_DL_2020_QRELS} {run_filename}

map                   	all	0.0259
recip_rank            	all	0.1238
recall_1000           	all	0.2519
ndcg_cut_10           	all	0.0538
