In [1]:
import json
import pandas as pd  

DOC_PATH = '../data/magsample/documents.json'

# The file is read line by line, each line being parsed as one JSON document.
# Only the id (cast to str), the title and the abstract are kept.
with open(DOC_PATH, 'r') as f:
    data = [
        (str(record['id']), record['title'], record['abstract'])
        for record in map(json.loads, f)
    ]

df_docs = pd.DataFrame(data=data, columns=['id', 'title', 'abstract'])
df_docs.head()

Unnamed: 0,id,title,abstract
0,2075884494,First year Wilkinson Microwave Anisotropy Prob...,We present full-sky microwave maps in five fre...
1,2100507804,The Anti-k(t) jet clustering algorithm,The kt and Cambridge/Aachen inclusive jet find...
2,2107816296,The Catchment Area of Jets,The area of a jet is a measure of its suscepti...
3,1639032689,"Genetic Algorithms in Search, Optimization, an...",From the Publisher:\r\nThis book brings togeth...
4,1999864907,Higgs look-alikes at the LHC,The discovery of a Higgs particle is possible ...


Getting the duplicated docs and generating a map from title -> doc_ids.

In [2]:
# Map every title to the list of ids of all documents sharing that title.
#
# The titles are whitespace-normalised first (each run of whitespace collapsed to a
# single space, both ends stripped). This is the same normalisation applied to the
# query strings when the index is built, and those query strings are what this map
# is looked up with later; keying on the raw titles would make a title containing
# e.g. a double space or a newline impossible to find.
normalized_titles = df_docs['title'].str.replace(r'\s+', ' ', regex=True).str.strip()
duplicated_doc_ids = df_docs['id'].groupby(normalized_titles).apply(list).to_dict()

The variable `duplicated_doc_ids` can be seen as the ground truth, since we'll use the titles as queries.

---

Ok, so we have some duplicated docs. The problem is that, when we do negative sampling, a duplicate of the positive doc can be sampled as a negative example for the same query. This is not good. 

To handle that, we will index the docs' abstracts. Then, when we retrieve the top-k docs for a title, we remove any duplicate ids from the result.

Steps:
1. Index the documents: abstract -> id; 
2. The query will be the title;
3. Return the top-k docs by the title;
4. If the doc id associated with title has duplicate, remove the duplicates from the result.
5. If the doc id associated with the title is not in the results, put it in the first position with 1.25x the score of the first result.
6. If the doc id associated with the title is in the results but not in the first position, move it to the first position with the score `top + (top - own)`, i.e. the top score plus the gap between the top score and the doc's own score.

## Indexing

In [3]:
# List the contents of the parent directory; the relative paths used in this
# notebook ('../data/...', '../indices/...') are resolved from it.
!ls ../

arch.png               [0m[01;34mfull_scripts[0m/  requirements.txt
[01;34mcustom_scripts[0m/        keep.txt       [01;34mt5_decoder_start_token_embeds[0m/
[01;34mdata[0m/                  [01;34mnotebook[0m/      [01;34mt5_pretrainer[0m/
[01;34mfull_16_1024_scripts[0m/  README.md      [01;34mvenv[0m/


In [4]:
import os 
import re

from typing import Dict

from tqdm import tqdm

from pyserini.index.lucene import LuceneIndexer
from pyserini.analysis import Analyzer, get_lucene_analyzer
from pyserini.search.lucene import LuceneSearcher

INDEX_PATH = '../indices/magsample'

# Any run of whitespace. Written as a raw string: in a plain literal '\s' is an
# invalid escape sequence, which newer Python versions warn about.
WHITESPACE_RUN = re.compile(r'\s+')

indexer = LuceneIndexer(INDEX_PATH)

# qid -> id of the document the query was built from (its ground truth)
qrels: Dict[str, str] = {}
# qid -> query text (the whitespace-normalised title)
qid_to_query: Dict[str, str] = {}

# (doc id, normalised abstract) rows, written out as the raw collection below
data = []
try:
    # One query per row: the qid is the row's index label, the query is the title,
    # and the indexed content is the abstract.
    for i, row in tqdm(df_docs.iterrows(), total=len(df_docs)):
        doc_id = row['id']

        # Collapse every whitespace run to a single space and strip both ends
        doc_title = WHITESPACE_RUN.sub(' ', row['title']).strip()
        doc_abstract = WHITESPACE_RUN.sub(' ', row['abstract']).strip()

        qrels[str(i)] = doc_id

        doc = {
            'id': doc_id,
            'contents': doc_abstract
        }

        indexer.add_doc_dict(doc)
        qid_to_query[str(i)] = doc_title

        data.append((doc_id, doc_abstract))
finally:
    # Close the indexer even when a row fails, so it is never left open
    indexer.close()

os.makedirs('../data/magsample/collection/', exist_ok=True)
df_data = pd.DataFrame(data, columns=['id', 'abstract'])
df_data.to_csv('../data/magsample/collection/raw.tsv', index=False, header=False, sep='\t')

  from .autonotebook import tqdm as notebook_tqdm
Mar 13, 2025 10:15:17 AM org.apache.lucene.store.MMapDirectory lookupProvider


2025-03-13 10:15:17,243 INFO  [main] index.SimpleIndexer (SimpleIndexer.java:141) - Using DefaultEnglishAnalyzer
2025-03-13 10:15:17,245 INFO  [main] index.SimpleIndexer (SimpleIndexer.java:142) - Stemmer: porter
2025-03-13 10:15:17,245 INFO  [main] index.SimpleIndexer (SimpleIndexer.java:143) - Keep stopwords? false
2025-03-13 10:15:17,245 INFO  [main] index.SimpleIndexer (SimpleIndexer.java:144) - Stopwords file: null


127716it [00:38, 3288.69it/s]


--- 

Now let's split the queries (one per indexed doc) into train and dev sets, and check that the sizes add up to the number of docs.

In [5]:
import random

# Seed the RNG so that the sample drawn below is reproducible
random.seed(1234)

all_qids = list(qid_to_query)

# Hold out 10% of the query ids (rounded down) as the dev set ...
n_dev_qids = int(0.1 * len(all_qids))
dev_qids = set(random.sample(all_qids, n_dev_qids))

# ... and keep every remaining query id for training
train_qids = set(all_qids) - dev_qids

In [7]:
# Sizes of the full, dev and train query-id collections, in that order
tuple(len(qids) for qids in (all_qids, dev_qids, train_qids))

(127716, 12771, 114945)

In [None]:
import os 

train_filepath = '../data/magsample/train_queries/raw.tsv'
dev_filepath = '../data/magsample/dev_queries/raw.tsv'

os.makedirs('../data/magsample/train_queries/', exist_ok=True)
os.makedirs('../data/magsample/dev_queries/', exist_ok=True)

# One "<qid>\t<query>" line per query; every query that is not a train query
# goes to the dev file.
with open(train_filepath, 'w') as w_train, open(dev_filepath, 'w') as w_dev:
    for qid, query in qid_to_query.items():
        target = w_train if qid in train_qids else w_dev
        target.write(f'{qid}\t{query}\n')

In [10]:
# qid -> list holding the single relevant doc id of that query, per split
train_qid_to_reldocs = {qid: [qrels[qid]] for qid in train_qids}
dev_qid_to_reldocs = {qid: [qrels[qid]] for qid in dev_qids}

os.makedirs('../data/magsample/train_qrels/', exist_ok=True)
os.makedirs('../data/magsample/dev_qrels/', exist_ok=True)

train_qrels_filepath = '../data/magsample/train_qrels/qid_to_reldocids.json'
dev_qrels_filepath = '../data/magsample/dev_qrels/qid_to_reldocids.json'

# Serialise each mapping as a single JSON object
for qrels_filepath, qid_to_reldocs in (
    (train_qrels_filepath, train_qid_to_reldocs),
    (dev_qrels_filepath, dev_qid_to_reldocs),
):
    with open(qrels_filepath, 'w') as f:
        json.dump(qid_to_reldocs, f)

In [12]:
import json 

qrel_path = '../data/magsample/dev_qrels/'

# Convert the dev mapping qid -> [docid, ...] into qid -> {docid: 1, ...}
# and store it next to the source file as qrel.json.
with open(f'{qrel_path}qid_to_reldocids.json') as reader, open(f'{qrel_path}qrel.json', 'w') as writer:
    dev_qid_to_reldocs = json.load(reader)

    qrel = {
        qid: dict.fromkeys(reldocs, 1)
        for qid, reldocs in dev_qid_to_reldocs.items()
    }

    json.dump(qrel, writer)

In [11]:
searcher = LuceneSearcher(INDEX_PATH)
k = 100

# Factor applied to the top hit's score when the ground truth was not retrieved at all
MISSING_GT_BOOST = 1.25

os.makedirs('../data/magsample/bm25_run/', exist_ok=True)

# For every train query, write one JSON line {qid, docids, scores} in which the
# query's ground-truth document is always first, followed by the retrieved hits.
with open('../data/magsample/bm25_run/qrel_added_qid_docids_teacher_scores.train.jsonl', 'w') as f:
    for qid in tqdm(train_qids):
        # Ground truth of *this* query. Every check below must use it: the name
        # `doc_id` is only a leftover of the indexing loop and holds the id of the
        # last indexed document, whatever the query is.
        rel_doc = qrels[qid]

        query = qid_to_query[qid]

        hits = searcher.search(query, k=k)

        if len(hits) == 0:
            print(f'"{query}" - No hits found')
            continue

        docids = []
        scores = []

        top_score = hits[0].score
        # Retrieval score of the ground truth, or None when it is not among the hits
        rel_score = next((hit.score for hit in hits if hit.docid == rel_doc), None)

        if rel_score is None:
            # Ground truth not retrieved: put it first with 1.25x the score of the first hit
            docids.append(rel_doc)
            scores.append(round(top_score * MISSING_GT_BOOST, 5))

        elif hits[0].docid != rel_doc:
            # Ground truth retrieved but not first: move it to the front with the
            # top score plus the gap between the top score and its own score
            score = top_score + (top_score - rel_score)

            # remove the ground truth from the hits so it is not emitted twice
            hits = [hit for hit in hits if hit.docid != rel_doc]

            docids.append(rel_doc)
            scores.append(round(score, 5))

        # If other documents share this title, drop them from the hits. A title
        # missing from the map is treated as having no duplicates.
        doc_ids_to_remove = set(duplicated_doc_ids.get(query, ())) - {rel_doc}
        if doc_ids_to_remove:
            hits = [hit for hit in hits if hit.docid not in doc_ids_to_remove]

        # Now we add the (remaining) hits to the results
        for hit in hits:
            docids.append(hit.docid)
            scores.append(round(hit.score, 5))

        # Sanity check: the ground truth must lead the ranking
        if docids[0] != rel_doc:
            raise Exception(f'"{query}" - Ground truth not in first position: {rel_doc} - {docids}')

        f.write(json.dumps({
            'qid': qid,
            'docids': docids,
            'scores': scores
        }) + '\n')

 13%|█▎        | 14845/114945 [01:30<10:04, 165.68it/s]

"Yicesä2.2" - No hits found


100%|██████████| 114945/114945 [12:00<00:00, 159.63it/s]
