<a href="https://colab.research.google.com/github/finardi/tutos/blob/master/BM25_topK_with_metrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q rank_bm25

In [2]:
import string
import pickle
import numpy as np
import pandas as pd
from rank_bm25 import BM25Okapi
from sklearn.utils import shuffle
from tqdm.autonotebook import tqdm
from collections import OrderedDict

  import sys


In [3]:
def pickle_file(path, data=None):
    """Read or write a pickle file, depending on whether ``data`` is given.

    Args:
        path: Location of the pickle file.
        data: Object to serialise. When ``None`` (the default) the file at
            ``path`` is read instead of written.

    Returns:
        The unpickled object in read mode; ``None`` in write mode.
    """
    # Write mode: any non-None payload is dumped with the highest protocol.
    if data is not None:
        with open(path, 'wb') as sink:
            pickle.dump(data, sink, protocol=pickle.HIGHEST_PROTOCOL)
        return None

    # Read mode. NOTE: unpickling executes arbitrary code, so only load
    # files that come from a trusted source.
    with open(path, 'rb') as source:
        return pickle.load(source)

In [4]:
# Load the Receita Federal FAQ entries (each entry is later treated as a
# string of the form "<question>? <answer>").
corpus =  pickle_file('/content/drive/MyDrive/Ranking - Tutos/receita_faq_list')

# Shuffle the corpus with a fixed seed. The qid/pid of every entry is derived
# from its position after this shuffle, so an unseeded shuffle would assign
# different ids (and produce different sample outputs) on every run.
corpus = shuffle(corpus, random_state=42)

def make_qa(corpus):
    """Split each FAQ entry into a question and an answer.

    The split point is just after the first ``'?'`` of the entry. An entry
    without any ``'?'`` yields an empty question and the whole (stripped)
    entry as the answer, because ``str.find`` returns -1 in that case.

    Args:
        corpus: Iterable of strings.

    Returns:
        A ``(questions, docs)`` tuple of two equally long lists of
        whitespace-stripped strings.
    """
    split_entries = [
        (entry[:cut].strip(), entry[cut:].strip())
        for entry, cut in ((entry, entry.find('?') + 1) for entry in corpus)
    ]
    questions = [question for question, _ in split_entries]
    docs = [answer for _, answer in split_entries]
    assert len(questions) == len(docs), f'ERrooou'

    return questions, docs

# Split every FAQ entry into its question and its answer text.
q, d = make_qa(corpus)

# One row per FAQ entry: the question is the query, the answer is the document.
df = pd.DataFrame({'query': q, 'doc': d})

# id <-> text lookups. Ids are simply the row positions in `df`; in the
# reverse maps a text that occurs more than once keeps its last id.
qid_to_query = dict(enumerate(df['query']))
query_to_qid = {text: qid for qid, text in qid_to_query.items()}

pid_to_doc = dict(enumerate(df.doc))
doc_to_pid = {text: pid for pid, text in pid_to_doc.items()}

assert len(qid_to_query) == len(pid_to_doc) == df.shape[0]

# Attach the ids as columns and put the columns in (qid, query, pid, doc) order.
df['qid'] = qid_to_query.keys()
df['pid'] = pid_to_doc.keys()
df = df[['qid', 'query', 'pid', 'doc']]
df

Unnamed: 0,qid,query,pid,doc
0,0,qual é o tratamento tributário conferido aos g...,0,os ganhos auferidos por pessoas físicas nas al...
1,1,qual é o valor de custo de aquisição que deve ...,1,deve ser informado o valor da doação constante...
2,2,podem ser deduzidos os gastos com aeronaves?,2,somente podem ser deduzidos os gastos com: 1...
3,3,como proceder quando o carnê-leão for pago a m...,3,poderá ser compensado o valor do principal pag...
4,4,que se considera bem de pequeno valor para fin...,4,considera-se bem de pequeno valor aquele decor...
...,...,...,...,...
674,674,como deve declarar o contribuinte viúvo no dec...,674,"no curso do inventário, apresenta declaração c..."
675,675,pode ser considerada como rendimento não tribu...,675,"não. lei nº 7.713, de 22 de dezembro de 1988,..."
676,676,como proceder para aplicar o percentual de red...,676,"no caso de percentual de redução diferenciado,..."
677,677,qual é o tratamento tributário da variação cam...,677,valor da variação cambial apurada no momento d...


# MS MARCO SHAPE

In [5]:
def make_triplets(qrels, passage_id_to_text):
    """Build (qid, pos_pid, neg_pid) training triplets.

    For every positive passage of every query one negative passage id is
    drawn uniformly at random (via ``np.random.choice``) from the keys of
    ``passage_id_to_text``, re-drawing until the id is not one of that
    query's positives.

    Args:
        qrels: DataFrame with a ``qid`` column and a ``pid`` column whose
            cells are sequences of positive passage ids for that query.
        passage_id_to_text: Mapping whose keys are all known passage ids.

    Returns:
        DataFrame with columns ``qid``, ``pos_pid`` and ``neg_pid``, one row
        per (query, positive passage) pair.

    Raises:
        ValueError: If a query has positives but every known passage id is
            among them, so no negative can be sampled.
    """
    # The candidate pool does not change, so build it once instead of on
    # every random draw.
    candidate_pids = list(passage_id_to_text.keys())

    # triplets: qid pos_pid neg_pid
    qid_list, pos_pid_list, neg_pid_list = [], [], []
    for qid, pos_pid in zip(qrels.qid.values, qrels.pid.values):
        # Without at least one non-positive candidate the rejection loop
        # below would never terminate.
        has_negative = any(pid not in pos_pid for pid in candidate_pids)
        for positive in pos_pid:
            if not has_negative:
                raise ValueError(
                    f'cannot sample a negative passage for qid {qid!r}: '
                    'every known passage id is a positive'
                )
            qid_list.append(qid)
            pos_pid_list.append(positive)
            neg_pid = np.random.choice(candidate_pids)
            while neg_pid in pos_pid:
                neg_pid = np.random.choice(candidate_pids)
            neg_pid_list.append(neg_pid)

    df_triplet = pd.DataFrame({'qid': qid_list, 'pos_pid': pos_pid_list, 'neg_pid': neg_pid_list})
    return df_triplet

# - - - - - 
# Build the relevance table: group the rows of `df` by qid, collect each
# remaining column's values into a tuple, turn every cell into a list, and
# keep only the `qid` and `pid` columns (so `pid` holds a list of pids).
# NOTE(review): DataFrame.applymap is deprecated in recent pandas releases in
# favour of DataFrame.map -- confirm against the installed pandas version.
qrels = df.groupby(['qid']).agg(
    lambda x: tuple(x)).applymap(list).reset_index()[['qid', 'pid']]

# One (qid, pos_pid, neg_pid) row per positive passage; negatives are random.
df_triplet = make_triplets(qrels, pid_to_doc)
df_triplet

Unnamed: 0,qid,pos_pid,neg_pid
0,0,0,138
1,1,1,286
2,2,2,658
3,3,3,622
4,4,4,58
...,...,...,...
674,674,674,76
675,675,675,494
676,676,676,566
677,677,677,551


In [6]:
# Display the qid -> list-of-relevant-pids table.
qrels

Unnamed: 0,qid,pid
0,0,[0]
1,1,[1]
2,2,[2]
3,3,[3]
4,4,[4]
...,...,...
674,674,[674]
675,675,[675]
676,676,[676]
677,677,[677]


In [7]:
# FAQ queries in MS MARCO shape: an ordered qid -> question-text mapping,
# following the row order of `df`.
FAQ_queries = OrderedDict(zip(df['qid'].values, df['query'].values))

In [8]:
# FAQ qrels in MS MARCO shape: an ordered qid -> list-of-relevant-pids mapping,
# following the row order of `qrels`.
FAQ_qrels = OrderedDict(zip(qrels['qid'].values, qrels['pid'].values))

# Metrics

In [9]:
class Metrics:
    """Running MRR / Recall / Success totals, each tracked at several depths.

    Every call to ``get_result`` adds one query's contribution to the
    per-depth sums; ``print_metrics`` divides those sums by the number of
    queries processed so far and prints the averages.
    """

    def __init__(self, mrr_depths:set, recall_depths:set, success_depths:set, total_queries=None):
        """Create zeroed accumulators for the requested cut-off depths.

        Args:
            mrr_depths: Depths at which MRR is accumulated.
            recall_depths: Depths at which recall is accumulated.
            success_depths: Depths at which success is accumulated.
            total_queries: Stored on the instance; not used by the methods
                of this class.
        """
        self.results = {}
        self.mrr_sums = dict.fromkeys(mrr_depths, 0.0)
        self.recall_sums = dict.fromkeys(recall_depths, 0.0)
        self.success_sums = dict.fromkeys(success_depths, 0.0)
        self.total_queries = total_queries

    def get_result(self, query_idx, query_key, ranking, gold_positives):
        """Record one query's ranking and add it to the running sums.

        Args:
            query_idx: Zero-based position of the query in the evaluation run.
            query_key: Key under which the ranking is stored; must be new.
            ranking: Sequence of 3-tuples whose middle element is the passage
                id, ordered best first; passage ids must be unique.
            gold_positives: Collection of relevant passage ids, without
                duplicates.

        Returns:
            None. When the ranking contains no relevant passage the sums are
            left untouched (the ranking is still stored).
        """
        assert query_key not in self.results
        assert len(self.results) <= query_idx
        assert len(set(gold_positives)) == len(gold_positives)
        assert len({pid for _, pid, _ in ranking}) == len(ranking)

        self.results[query_key] = ranking

        # Zero-based rank positions at which a relevant passage appears.
        hit_ranks = [
            rank
            for rank, (_, pid, _) in enumerate(ranking)
            if pid in gold_positives
        ]
        if not hit_ranks:
            return

        best_rank = hit_ranks[0]

        # MRR@depth: reciprocal rank of the first hit, if it is within depth.
        for depth in self.mrr_sums:
            if best_rank < depth:
                self.mrr_sums[depth] += 1.0 / (best_rank + 1.0)

        # Success@depth: 1 when the first hit is within depth.
        for depth in self.success_sums:
            if best_rank < depth:
                self.success_sums[depth] += 1.0

        # Recall@depth: fraction of the gold positives found within depth.
        for depth in self.recall_sums:
            found_within_depth = sum(1 for rank in hit_ranks if rank < depth)
            self.recall_sums[depth] += found_within_depth / len(gold_positives)

    def print_metrics(self, query_idx):
        """Print the averaged metrics after ``query_idx + 1`` queries."""
        processed = query_idx + 1.0
        separator = '- '*10

        for depth in sorted(self.mrr_sums):
            mrr_value = self.mrr_sums[depth] / processed
            print(f"MRR@{str(depth):<2} = {mrr_value:.3}")

        print(separator)
        for depth in sorted(self.recall_sums):
            recall_value = self.recall_sums[depth] / processed
            print(f"Recall@{str(depth):<2} = {recall_value:.3}")

        print(separator)
        for depth in sorted(self.success_sums):
            success_value = self.success_sums[depth] / processed
            print(f"Success@{str(depth):<2} = {success_value:.3}")

## TopK retrieval

In [10]:
def pt_stop_words(path):
    """Load a stop-word list from a text file with one word per line.

    Args:
        path: Location of the stop-word file.

    Returns:
        List with one entry per line of the file, each stripped of its line
        break and surrounding whitespace. Blank lines yield empty strings.
    """
    # Decode explicitly as UTF-8 so accented words are read the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(path, encoding='utf-8') as f:
        return [line.replace('\n', '').strip() for line in f]
# - - - - - 
# Stop-word file (presumably on a mounted Google Drive -- adjust the path
# when running elsewhere).
path = '/content/drive/MyDrive/Colab Notebooks/stopwords.txt'
# Module-level list that bm25_tokenizer consults for every token.
stop_words = pt_stop_words(path)

# ----------------------------------------------------------------------
def bm25_tokenizer(text):
    """Tokenise ``text`` for BM25.

    The text is lower-cased and split on whitespace; each piece has leading
    and trailing ASCII punctuation removed. Pieces that end up empty or that
    appear in the module-level ``stop_words`` collection are dropped.

    Args:
        text: String to tokenise.

    Returns:
        List of the remaining tokens, in their original order.
    """
    trimmed = (piece.strip(string.punctuation) for piece in text.lower().split())
    return [token for token in trimmed if token and token not in stop_words]
# - - - - - 


In [14]:
def search_all(queries=FAQ_queries, collection=None, K=20, qrels=None):
    """Retrieve the top-K passages for every query with BM25.

    A BM25 index is built over ``collection`` and every query is scored
    against it. When ``qrels`` is given, MRR / Recall / Success at depths
    1, 3, 5, 10 and 20 are accumulated and a progress report is printed for
    every 25th query.

    Args:
        queries: Mapping qid -> query text. Queries are processed in sorted
            qid order.
        collection: Sequence of passage texts; the position of a passage in
            this sequence is its pid.
        K: Maximum number of passages to retrieve per query. It is capped at
            the collection size.
        qrels: Optional mapping qid -> collection of relevant pids.

    Returns:
        Dict qid -> list of pids (NumPy integers), best BM25 score first.

    Raises:
        TypeError: If ``collection`` is not provided.
    """
    if collection is None:
        raise TypeError('search_all() requires a `collection` of passage texts')

    # Materialise once so passages can be looked up by position afterwards.
    passages = list(collection)

    tokenized_corpus = []
    for passage in tqdm(passages):
        tokenized_corpus.append(bm25_tokenizer(passage))
    bm25 = BM25Okapi(tokenized_corpus)

    metrics = Metrics(
        mrr_depths={1, 3, 5, 10, 20},
        recall_depths={1, 3, 5, 10, 20},
        success_depths={1, 3, 5, 10, 20},
        total_queries=len(queries)
        )

    topK_pids = {}

    keys = sorted(list(queries.keys()))
    for query_idx, qid in enumerate(keys):
        query = queries[qid]

        ##### BM25 search (lexical search) #####
        bm25_scores = bm25.get_scores(bm25_tokenizer(query))

        # Partition around the K-th largest score so that the last K
        # positions really hold the K best passages. (Partitioning around a
        # fixed -5 only guaranteed the best five.) K is capped so that it
        # never exceeds the number of passages.
        k = max(0, min(K, len(bm25_scores)))
        if k > 0:
            top_n = np.argpartition(bm25_scores, -k)[-k:]
        else:
            top_n = []
        bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
        bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)

        ranked_scores, ranked_pids, ranked_passages = [], [], []
        for hit in bm25_hits:
            ranked_scores.append(hit['score'])
            ranked_pids.append(hit['corpus_id'])
            # Read the text from the indexed collection itself rather than
            # from an unrelated module-level mapping.
            ranked_passages.append(passages[hit['corpus_id']])

        topK_pids[qid] = ranked_pids

        ranking = list(zip(ranked_scores, ranked_pids, ranked_passages))

        if qrels:
            metrics.get_result(query_idx, qid, ranking, qrels[qid])
            if query_idx%25 == 0:
                print(f'\n[{query_idx}]. Query: {query}')
                for i, (score, pid, passage) in enumerate(ranking, 1):
                    if pid in qrels[qid]:
                        print(f"Found at position: {i} with score {score:.3}")
                        print(passage)
                        break
                metrics.print_metrics(query_idx)

    return topK_pids
# - - - - - 
# Passage texts in DataFrame row order, so list position i holds the text of pid i.
collection = df.doc.to_list()
# Rank every FAQ question against every answer and report metrics against FAQ_qrels.
topK_pids = search_all(queries=FAQ_queries, collection=collection, K=20, qrels=FAQ_qrels)

HBox(children=(FloatProgress(value=0.0, max=679.0), HTML(value='')))



[0]. Query: qual é o tratamento tributário conferido aos ganhos obtidos nas alienações de ações fora de  bolsa de valores?
Found at position: 1 with score 25.3
os ganhos auferidos por pessoas físicas nas alienações de ações fora de bolsa são tributados como ganho  de capital. se alienadas em bolsa de valores estão sujeitas às normas de apuração de renda variável.
MRR@1  = 1.0
MRR@3  = 1.0
MRR@5  = 1.0
MRR@10 = 1.0
MRR@20 = 1.0
- - - - - - - - - - 
Recall@1  = 1.0
Recall@3  = 1.0
Recall@5  = 1.0
Recall@10 = 1.0
Recall@20 = 1.0
- - - - - - - - - - 
Success@1  = 1.0
Success@3  = 1.0
Success@5  = 1.0
Success@10 = 1.0
Success@20 = 1.0

[25]. Query: como declarar empréstimos efetuados a empresa?
Found at position: 2 with score 6.72
os empréstimos feitos devem ser informados na declaração de bens e direitos, no código 51.    os juros pagos pela pessoa jurídica tomadora do empréstimo são tributados exclusivamente na fonte.   consulte a pergunta 214         - espólio
MRR@1  = 0.5
MRR@3  = 0.5

In [16]:
def build_topk_docs(topK_pids, query_id_to_text, passage_id_to_text):
    """Resolve retrieved ids into texts.

    Args:
        topK_pids: Mapping qid -> sequence of retrieved pids.
        query_id_to_text: Mapping qid -> query text.
        passage_id_to_text: Mapping pid -> passage text.

    Returns:
        A ``(queries, topK_docs)`` tuple of ``OrderedDict``s keyed by qid in
        the iteration order of ``topK_pids``: ``queries`` maps to the query
        text and ``topK_docs`` to the list of passage texts, in the same
        order as the pids.
    """
    queries = OrderedDict()
    topK_docs = OrderedDict()

    for qid, pids in topK_pids.items():
        queries[qid] = query_id_to_text[qid]
        texts = topK_docs.setdefault(qid, [])
        texts.extend(passage_id_to_text[pid] for pid in pids)

    return queries, topK_docs

# Shapes of the structures used from here on:
#   topK_pids  -> qid: [pids]
#   topK_docs  -> qid: [text_docs]
#   queries    -> qid: text
#   qrels      -> qid: [pids]
#   collection -> list
queries, topK_docs = build_topk_docs(topK_pids, qid_to_query, pid_to_doc)

# Sanity check: every query must have both a pid list and a text list.
print(f'len(queries):    {len(queries)}')
print(f'len(topK_docs):  {len(topK_docs)}')
print(f'len(topK_pids):  {len(topK_pids)}')
print(f'len(collection): {len(collection)}')
assert len(queries) == len(topK_docs) == len(topK_pids)

len(queries):    679
len(topK_docs):  679
len(topK_pids):  679
len(collection): 679
