# Comparison between query sets

In [3]:
import numpy as np
from pyterrier_pisa import PisaIndex
from pyterrier.measures import *
from indexer.dataset import MSMarcoWebSearch, ResearchyQuestions, ClueWeb22

#_STOPWORDS = 'a and are as at be but by for if in into is it no not of on or s such t that the their then there these they this to was will with www'
_STOPWORDS = 'a abaft abafter abaftest about abouter aboutest above abover abovest accordingly aer aest afore after afterer afterest afterward afterwards again against aid ain albeit all aller allest alls allyou almost along alongside already also although always amid amidst among amongst an and andor anear anent another any anybody anyhow anyone anything anywhere apart aparter apartest appear appeared appearing appears appropriate appropriated appropriater appropriates appropriatest appropriating are ares around as ases aside asides aslant astraddle astraddler astraddlest astride astrider astridest at athwart atop atween aught aughts available availabler availablest awfully b be became because become becomes becoming becominger becomingest becomings been before beforehand beforehander beforehandest behind behinds below beneath beside besides better bettered bettering betters between betwixt beyond bist both but buts by by-and-by byandby c cannot canst cant canted cantest canting cants cer certain certainer certainest cest chez circa co come-on come-ons comeon comeons concerning concerninger concerningest consequently considering could couldst cum d dday ddays describe described describes describing despite despited despites despiting did different differenter differentest do doe does doing doings done doner dones donest dos dost doth downs downward downwarder downwardest downwards during e each eg eight either else elsewhere enough ere et etc even evened evenest evens evenser evensest ever every everybody everyone everything everywhere ex except excepted excepting excepts exes f fact facts failing failings few fewer fewest figupon figuponed figuponing figupons five followthrough for forby forbye fore forer fores forever former formerer formerest formerly formers fornenst forwhy four fourscore frae from fs further furthered furtherer furtherest furthering furthermore furthers g get gets getting go gone good got gotta gotten h had hadst hae hardly has hast hath have haves having he hence her hereafter hereafters hereby herein hereupon hers herself him himself his hither hitherer hitherest hoo hoos how how-do-you-do howbeit howdoyoudo however huh humph i idem idemer idemest ie if ifs immediate immediately immediater immediatest in inasmuch inc indeed indicate indicated indicates indicating info information insofar instead into inward inwarder inwardest inwards is it its itself j k l latter latterer latterest latterly latters layabout layabouts less lest lot lots lotted lotting m main make many mauger maugre mayest me meanwhile meanwhiles midst midsts might mights more moreover most mostly much mucher muchest must musth musths musts my myself n natheless nathless neath neaths necessarier necessariest necessary neither nethe nethermost never nevertheless nigh nigher nighest nine no no-one nobodies nobody noes none noone nor nos not nothing nothings notwithstanding nowhere nowheres o of off offest offs often oftener oftenest oh on one oneself onest ons onto or orer orest other others otherwise otherwiser otherwisest ought oughts our ours ourself ourselves out outed outest outs outside outwith over overall overaller overallest overalls overs own owned owning owns owt p particular particularer particularest particularly particulars per perhaps plaintiff please pleased pleases plenties plenty pro probably provide provided provides providing q qua que quite r rath rathe rather rathest re really regarding relate related relatively res respecting respectively s said saider saidest same samer sames samest sans sanserif sanserifs sanses saved sayid sayyid seem seemed seeminger seemingest seemings seems send sent senza serious seriouser seriousest seven several severaler severalest shall shalled shalling shalls she should shoulded shoulding shoulds since sine sines sith six so sobeit soer soest some somebody somehow someone something sometime sometimer sometimes sometimest somewhat somewhere stop stopped such summat sup supped supping sups syn syne t ten than that the thee their theirs them themselves then thence thener thenest there thereafter thereby therefore therein therer therest thereupon these they thine thing things this thises thorough thorougher thoroughest thoroughly those thou though thous thouses three thro through througher throughest throughout thru thruer thruest thus thy thyself till tilled tilling tills to together too toward towarder towardest towards two u umpteen under underneath unless unlike unliker unlikest until unto up upon uponed uponing upons upped upping ups us use used usedest username usually v various variouser variousest verier veriest versus very via vis-a-vis vis-a-viser vis-a-visest viz vs w was wast we were wert what whatever whateverer whateverest whatsoever whatsoeverer whatsoeverest wheen when whenas whence whencesoever whenever whensoever where whereafter whereas whereby wherefrom wherein whereinto whereof whereon wheresoever whereto whereupon wherever wherewith wherewithal whether which whichever whichsoever while whiles whilst whither whithersoever whoever whomever whose whoso whosoever why with withal within without would woulded woulding woulds x y ye yet yon yond yonder you your yours yourself yourselves z zillion'.split(' ')
print("Number fo stopwords loaded:", len(_STOPWORDS))

_INTERROGATIVES = ['what', 'who', 'where', 'when', 'why', 'how', 'whose', 'which']

Number fo stopwords loaded: 733


## Comparison on query length

In [None]:
TESTED_BENCHMARKS = {"msmarco-ws": "MSM-WS", "rq": "RQ"}

# load document corpus
documents_dataset = ClueWeb22(collection="cw22b", verbose=True)

TESTED_LEN_MODE = "all"

In [4]:
# init an inverted index
inverted_index = PisaIndex("./tmp/", text_field=['text'], overwrite=True)

def _calc_len(text: str, mode: str = "cleaned") -> tuple:
    if mode == "string":
        return len(text), -1
    
    tokens = inverted_index.tokenize(text)
    tokenized_interr = inverted_index.tokenize(" ".join(_INTERROGATIVES))
    tokenized_sw = inverted_index.tokenize(" ".join(_STOPWORDS))
    text_len = 0
    stopwords_removed = 0
        
    for tok in tokens:
        if mode == "cleaned":
            if tok not in tokenized_sw: 
                text_len += 1
            else:
                print("removed tok: ", tok)
                stopwords_removed += 1
        elif mode == "interrogatives":
            if tok in tokenized_interr:
                text_len += 1
        else:
            text_len += 1
    return text_len, stopwords_removed

In [None]:
def _find_text_lengths(texts_list: list, mode: str) -> list:
    """ given a list of texts, compute their lenght (tot. number of tokens) after tokenisation """
    dlens = []
    sw = 0

    for doc_text in texts_list:
        dlen, d_sw = _calc_len(doc_text, mode)
        dlens.append(dlen)
        sw += d_sw

    return dlens

def calc_qlen(benchmark_name: str, mode: str):    
    if benchmark_name == "msmarco-ws":
        queries_dataset = MSMarcoWebSearch(benchmark=benchmark_name)
    elif benchmark_name == "rq":
        queries_dataset = ResearchyQuestions(benchmark=benchmark_name)
    
    query_texts = queries_dataset.get_queries()["query"].to_list()
    print("type of query_texts: ", type(query_texts))
    print(query_texts)

    q_lens = _find_text_lengths(query_texts, mode)
    return q_lens

Actual comparison on query length

In [11]:
x_data = []

for benchmark, printable_bname in TESTED_BENCHMARKS.items():
    q_lens = calc_qlen(benchmark_name=benchmark, mode=TESTED_LEN_MODE)
    x_data.append(q_lens)

    mean_rel = np.mean(q_lens)
    std_rel = np.std(q_lens)
    median_rel = np.median(q_lens)
    print(f"Mean len for {benchmark}: {mean_rel}")
    print(f"Std len for {benchmark}: {std_rel}")
    print(f"Median for {benchmark}: {median_rel}")

type of query_texts:  <class 'list'>
Mean len for msmarco-ws: 3.828125
Std len for msmarco-ws: 2.2691475079743997
Median for msmarco-ws: 3.0
type of query_texts:  <class 'list'>
['how does climate change affect the caribbean', 'how did the panic of 1893 affect the railroad industry?', 'do the american population wish more autonomy in their work?', 'how did arms race lead to ww1', 'is neglect a form of abuse', 'is conformity a good thing, or a bad thing? why?', 'was the american revolution radical or moderate?', 'how did the roman empire split', "how does gender affect a child's development", 'what are the challenges faced by the united nations in maintaining global security?', 'why startups fail', 'why is branding so important for companies?', 'why are the issues of stereotyping and discrimination especially relevant in 21st century america?', 'how long has mental health been an issue', 'how social attitudes have changed over time regarding specific learning difficulties', 'why is covi

## Random selection of 5 queries

In [13]:
NUM_RANDOM_QUERIES = 5
TESTED_BENCHMARKS = {"msmarco-ws": "MSM-WS", "rq": "RQ"}

In [14]:
import random
def get_random_queries(benchmark_name: str, k: int = 3):    
    if benchmark_name == "msmarco-ws":
        queries_dataset = MSMarcoWebSearch(benchmark=benchmark_name)
    elif benchmark_name == "rq":
        queries_dataset = ResearchyQuestions(benchmark=benchmark_name)
    
    query_texts = queries_dataset.get_queries()["query"].to_list()
    print("q_texts:", )
    
    return random.sample(query_texts, k)

In [103]:
random.seed(38)
queries = get_random_queries(benchmark_name="msmarco-ws", k=NUM_RANDOM_QUERIES)
print(queries)
cleaned_queries = [_calc_len(q_text, "cleaned") for q_text in queries]
print(cleaned_queries)

q_texts:
['print variable sql', 'income tax challan payment', 'insta followers', 'the blizzard of', 'stillwater mine nye']
removed tok:  the
removed tok:  of
[(3, 0), (4, 0), (2, 0), (1, 2), (3, 0)]


In [93]:
random.seed(38)
queries2 = get_random_queries(benchmark_name="rq", k=NUM_RANDOM_QUERIES)
print(queries2)
cleaned_queries = [_calc_len(q_text, "cleaned") for q_text in queries2]
print(cleaned_queries)

q_texts:
['why is alibaba so successful', 'are educational games effective', 'what affects cryptocurrency price', 'how does global warming affect the environment', 'how mining pollutes the environment']
removed tok:  whi
removed tok:  is
removed tok:  so
removed tok:  are
removed tok:  what
removed tok:  how
removed tok:  doe
removed tok:  the
removed tok:  how
removed tok:  the
[(2, 3), (3, 1), (3, 1), (4, 3), (3, 2)]
