# Hyperparameter Tuning for Sentence Level Query Reduction

In [2]:
import torch
from simpletransformers.classification import ClassificationModel
from tira.third_party_integrations import ensure_pyterrier_is_loaded
ensure_pyterrier_is_loaded()

import pyterrier as pt
import os
import sys
import pandas as pd
from tqdm import tqdm
import json
sys.path.append('chat-gpt')
from sentence_level_query_reduction import SentenceLevelQueryReduction, get_model_by_name


index = pt.IndexRef.of(os.path.abspath('pyterrier-index/index/data.properties'))

In [3]:
def report_effectiveness(model, dataset):
    queries = pd.read_json(f'tot-{dataset}/queries.xml', lines=True)
    sentence_level_query_reduction = SentenceLevelQueryReduction(model).as_transformer()

    bm25 = pt.BatchRetrieve(index, wmodel="BM25")

    retrieval_pipeline = sentence_level_query_reduction >> bm25

    ret = pt.Experiment([retrieval_pipeline], queries, pt.io.read_qrels(f'tot-{dataset}/qrels.txt'), eval_metrics=["ndcg_cut_10", 'recip_rank', 'P_10'])
    
    return {i: ret.iloc[0][i] for i in ['ndcg_cut_10', 'recip_rank', 'P_10']}

def report_pipeline(model):
    ret = {'Model': model}
    model = get_model_by_name(model)
    for dataset in ['train', 'dev']:
        for k,v in report_effectiveness(model, dataset).items():
            ret[dataset + '_' + k] = v
    return ret

df = []
for model in tqdm(['bert,/models/bert-checkpoint-31250-epoch-1']):
    df += [report_pipeline(model)]

df = pd.DataFrame(df)
df.to_json('hyperparameter-tuning-sentence-level-results.jsonl', lines=True, orient='records')

  from .autonotebook import tqdm as notebook_tqdm
  0%|          | 0/1 [00:06<?, ?it/s]


ValueError: Expected object or value

In [4]:
queries = pd.read_json(f'tot-train/queries.jsonl', lines=True).head(3)
sentence_level_query_reduction = SentenceLevelQueryReduction(get_model_by_name('bert,/models/bert-checkpoint-31250-epoch-1')).as_transformer()

In [5]:
queries = pd.concat([
    pd.read_json(f'tot-train/queries.jsonl', lines=True),
    pd.read_json(f'tot-test/queries.jsonl', lines=True),
    pd.read_json(f'tot-dev/queries.jsonl', lines=True),
])

model_name = 'bert,/models/bert-checkpoint-31250-epoch-1'

model = SentenceLevelQueryReduction(get_model_by_name(model_name))

already_covered = open(model_name.replace(',', '-'). replace('/', '-') + '-predictions.jsonl', 'r').read()
already_covered = set([json.loads(i)['qid'] for i in already_covered.split('\n') if i])

with open(model_name.replace(',', '-'). replace('/', '-') + '-predictions.jsonl', 'a+') as f:
    for _, query in tqdm(queries.iterrows()):
        qid, query = query[['qid', 'query']]
        if qid in already_covered:
            print('Skip ' + qid)
        prediction = model.reduce_query(query)
        f.write(json.dumps({'qid': str(qid), 'query': query}) + '\n')
        f.flush()

  1%|          | 1/102 [00:00<00:07, 13.22it/s]
100%|██████████| 1/1 [00:00<00:00,  1.91it/s]
  1%|          | 1/167 [00:00<00:13, 12.70it/s]
100%|██████████| 1/1 [00:00<00:00,  1.94it/s]
  1%|          | 1/134 [00:00<00:14,  9.24it/s]
100%|██████████| 1/1 [00:00<00:00,  1.51it/s]
  1%|          | 1/108 [00:00<00:08, 13.34it/s]
100%|██████████| 1/1 [00:00<00:00,  1.57it/s]
  1%|          | 1/102 [00:00<00:10,  9.43it/s]
100%|██████████| 1/1 [00:00<00:00,  1.38it/s]
  7%|▋         | 1/15 [00:00<00:01, 11.42it/s]
100%|██████████| 1/1 [00:00<00:00,  1.94it/s]
  1%|          | 1/97 [00:00<00:07, 12.85it/s]
100%|██████████| 1/1 [00:00<00:00,  1.77it/s]
  1%|          | 1/97 [00:00<00:08, 11.52it/s]
100%|██████████| 1/1 [00:00<00:00,  1.58it/s]
  5%|▌         | 1/20 [00:00<00:01, 11.28it/s]
100%|██████████| 1/1 [00:00<00:00,  1.94it/s]
  2%|▏         | 1/64 [00:00<00:06, 10.26it/s]
100%|██████████| 1/1 [00:00<00:00,  1.94it/s]
  4%|▍         | 1/24 [00:00<00:01, 14.28it/s]
100%|██████████| 1

KeyboardInterrupt: 