# Oracle Baseline for TOMT retrieval

We use reciprocal rank fusion (RRF) over the rankings retrieved for each topic's positive queries as an oracle baseline.

### Step 1: Import Dependencies

In [1]:
import pyterrier as pt
import pandas as pd
from tira.third_party_integrations import ensure_pyterrier_is_loaded, get_preconfigured_chatnoir_client, get_input_directory_and_output_directory, persist_and_normalize_run
import json
from tqdm import tqdm

# Start PyTerrier through the TIRA helper before anything else touches it.
ensure_pyterrier_is_loaded()

# Resolve the directory the topics are read from and the directory the run is
# written to; the argument names the example dataset used in this notebook.
input_directory, output_directory = get_input_directory_and_output_directory(
    '/workspace/tomt-dataset-tira'
)

# ChatNoir retrieval client: up to 1000 results per query, fetched with a page
# size of 1000, no additional features requested.
chatnoir = get_preconfigured_chatnoir_client(
    config_directory=input_directory,
    features=[],
    verbose=True,
    num_results=1000,
    page_size=1000,
)

Start PyTerrier with version=5.7, helper_version=0.0.7, no_download=True


PyTerrier 0.9.2 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


I will use a small hardcoded example located in /workspace/tomt-dataset-tira.
The output directory is /tmp/
ChatNoir Client will retrieve the top-1000 with page size of 1000 from index ClueWeb22 with 25 retries.


### Step 2: Load the data

In [17]:
print('Step 2: Load the data.')

# One record per topic; every topic carries a list of positive (oracle) queries.
topics = pd.read_json(f'{input_directory}/queries.jsonl', lines=True)

query_records = []
# Highest variant index seen over all topics (an index, not a count): the
# fusion step iterates over range(num_runs + 1).
num_runs = 0

for _, topic in topics.iterrows():
    variants = topic['original_query']['positiveQueries']
    for variant_idx, variant_text in enumerate(variants):
        # Retrieval qid is '<topic qid>_<variant index>'.
        query_records.append({'qid': f"{topic['qid']!s}_{variant_idx!s}", 'query': variant_text})
        num_runs = max(num_runs, variant_idx)

queries = pd.DataFrame(query_records)
print(queries)

Step 2: Load the data.
    qid                                             query
0  20_0                         litographs art from books
1  20_1                                    litographs.com
2  20_2                    litographs t-shirts from books
3  21_0                                     Sssscomic.com
4  21_1                 stand still stay silent web comic
5  21_2  finnish swedish webcomic stand still stay silent
6  22_0                  tineye Multicolr Search by color
7  22_1                          tineye multicolor search
8  22_2                 tineye multicolr MulticolorEngine


### Step 3: Create Run

In [16]:
print('Step 3: Create Run.')

# Retrieve with ChatNoir for every positive-query variant built in Step 2.
run = chatnoir(queries)

# Preview the first rows, restricted to the columns the fusion step reads.
print(run.head()[['qid', 'docno', 'score', 'rank']])

Step 3: Create Run.


Searching with ChatNoir: 100%|███████████████████████████████████████████████████████████████████████████████████| 9/9 [00:21<00:00,  2.38s/query]

       qid                      docno      score  rank
1470  22_0  clueweb22-en0035-21-15726  3789.2517     0
1372  21_2  clueweb22-en0027-30-10516  3763.0837     0
1471  22_0  clueweb22-en0009-94-03715  3747.3723     1
1490  22_2  clueweb22-en0035-21-15726  3405.0020     0
1491  22_2  clueweb22-en0009-94-03715  3373.2278     1





### Step 4: Do Reciprocal Rank Fusion

In [37]:
print('Do reciprocal rank fusion')

from trectools import TrecRun, fusion

# Every retrieval qid was built in Step 2 as '<topic qid>_<variant index>'.
# Partition on the LAST underscore so that topic ids which themselves contain
# an underscore stay intact (split('_')[0] would truncate them and silently
# merge different topics). Columns of the result: 0 = topic qid, 2 = variant.
qid_parts = run['qid'].str.rpartition('_')
variant_ids = qid_parts[2]

# Same rows and row order as `run`, with the column names trectools expects.
variant_results = pd.DataFrame({
    'query': qid_parts[0],
    'docid': run['docno'],
    'score': run['score'],
    'rank': run['rank'],
})

all_runs = []

for run_id in range(num_runs + 1):
    # One TrecRun per variant index. Boolean filtering keeps the row order of
    # `run` and avoids re-scanning the whole run row by row for every variant.
    variant_run = variant_results[variant_ids == str(run_id)].reset_index(drop=True)
    tr = TrecRun()
    tr.run_data = variant_run
    all_runs += [tr]
    print(f'Run with id {run_id} has {len(variant_run)} documents')

fused_run = fusion.reciprocal_rank_fusion(all_runs)
fused_run = fused_run.run_data

# Rename trectools' columns to the PyTerrier names used when persisting the
# run; pop + assign moves each column to the end, keeping the column order
# q0, rank, score, system, qid, docno.
fused_run['qid'] = fused_run.pop('query')
fused_run['docno'] = fused_run.pop('docid')

fused_run

Do reciprocal rank fusion
Run with id 0 has 226 documents
Run with id 1 has 1030 documents
Run with id 2 has 241 documents


Unnamed: 0,q0,rank,score,system,qid,docno
0,Q0,1,0.047627,reciprocal_rank_fusion_k=60,20,clueweb22-en0024-09-06042
1,Q0,2,0.046871,reciprocal_rank_fusion_k=60,20,clueweb22-en0041-37-00460
2,Q0,3,0.045695,reciprocal_rank_fusion_k=60,20,clueweb22-en0023-36-06642
3,Q0,4,0.045291,reciprocal_rank_fusion_k=60,20,clueweb22-en0028-53-13178
4,Q0,5,0.043329,reciprocal_rank_fusion_k=60,20,clueweb22-en0027-37-15110
...,...,...,...,...,...,...
1245,Q0,15,0.013889,reciprocal_rank_fusion_k=60,22,clueweb22-en0034-65-11420
1246,Q0,16,0.013699,reciprocal_rank_fusion_k=60,22,clueweb22-en0046-41-13565
1247,Q0,17,0.013514,reciprocal_rank_fusion_k=60,22,clueweb22-en0015-84-05128
1248,Q0,18,0.013333,reciprocal_rank_fusion_k=60,22,clueweb22-en0007-39-17259


### Step 5: Persist Run

In [38]:
print('Step 5: Persist Run.')

# Write the fused ranking to run.txt in the output directory, tagged with the
# system name 'chatnoir-oracle-baseline'.
run_file = output_directory + '/run.txt'
persist_and_normalize_run(fused_run, 'chatnoir-oracle-baseline', output_file=run_file)

print('Done...')

Step 5: Persist Run.
Done...


In [39]:
# Sanity check: show the first three lines of the persisted run file.
!head -3 {output_directory}/run.txt

20 0 clueweb22-en0024-09-06042 1 0.04762704813108039 chatnoir-oracle-baseline
20 0 clueweb22-en0041-37-00460 2 0.046871392288155164 chatnoir-oracle-baseline
20 0 clueweb22-en0023-36-06642 3 0.04569460390355913 chatnoir-oracle-baseline
