In [6]:
import functools
import multiprocessing
import os
import re
from typing import Callable, Iterator

import bs4
import pandas as pd
import pyterrier as pt

In [7]:
if not pt.started():
    pt.init(tqdm="notebook", logging="INFO")


# Utils


In [32]:

SEPS = r"-,.:;?! \n\t\[\]\(\)'\""
QUERY_MARKS = r"-#\{\}\(\)\^\+:~"


def paths_gen(docs_paths_file: str, docs_paths_prefix: str) -> Iterator[str]:
    with open(docs_paths_file, mode="r", encoding="utf-8") as paths_file:
        for line in paths_file:
            yield os.path.join(docs_paths_prefix, line.rstrip())


def take(iter_: Iterator, max_len=100) -> Iterator:
    for i, elem in enumerate(iter_):
        yield elem
        if i + 1 > max_len:
            break


def tokenise(doc: dict[str, str], tokeniser: Callable[[str], str]) -> dict[str, str]:
    doc["text"] = tokeniser(doc["text"])
    return doc


def basic_tokeniser(text: str) -> str:
    """Splits text by ',.:;?! \n\t[]()'"' and joins it by space."""
    words = re.split("[" + SEPS + "]", text)
    return " ".join(words)


def read_topics(
    topic_file: str, get_query: Callable[[bs4.element.Tag], dict[str, str]]
) -> pd.DataFrame:
    topics = []
    with open(topic_file, mode="r", encoding="utf-8") as file_handle:
        soup = bs4.BeautifulSoup(file_handle, "xml")
        for query_tag in soup.find_all("top"):
            topics.append(get_query(query_tag))

    return pd.DataFrame(topics)


def get_query_title(tag: bs4.element.Tag) -> dict[str, str]:
    qid = str(tag.num.string)
    title = str(tag.title.string)
    return {"qid": qid, "query": title}


def sanitize_query(row: pd.Series) -> str:
    query_tokens = re.split("[" + QUERY_MARKS + "]", row.query)
    return " ".join(query_tokens)


# Run-0 EN


In [49]:


en_doc_paths = list(paths_gen("../A1/documents_en.lst", "../A1/documents_en"))
text_gen = pt.index.treccollection2textgen(en_doc_paths)

In [50]:

indexer = pt.IterDictIndexer(
    "./en_index",
    overwrite=True,
    stemmer="none",
    stopwords="none",
    tokeniser="whitespace",
)


tokeniser_partial = functools.partial(tokenise, tokeniser=basic_tokeniser)
with multiprocessing.Pool() as pool:
    indexer.index(pool.imap(tokeniser_partial, text_gen), fields=["text"])

16:34:31.670 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 0% processing ../A1/documents_en/la020101.xml
16:34:31.752 [ForkJoinPool-11-worker-1] INFO org.terrier.structures.indexing.Indexer - Indexer using 1 fields
16:34:31.754 [ForkJoinPool-11-worker-1] INFO org.terrier.structures.indexing.Indexer - creating the data structures data_stream0_1
16:34:31.758 [ForkJoinPool-11-worker-1] INFO org.terrier.structures.indexing.LexiconBuilder - LexiconBuilder active - flushing every 100000 documents, or when memory threshold hit
16:34:31.992 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 0% processing ../A1/documents_en/la020102.xml
16:34:32.301 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 0% processing ../A1/documents_en/la020103.xml
16:34:32.676 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 0% processing ../A1/documents_en/la020104.xml
16:34:32.98

16:34:50.452 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 15% processing ../A1/documents_en/la020225.xml
16:34:50.750 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 15% processing ../A1/documents_en/la020226.xml
16:34:51.024 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 15% processing ../A1/documents_en/la020227.xml
16:34:51.363 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 15% processing ../A1/documents_en/la020228.xml
16:34:51.703 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 16% processing ../A1/documents_en/la020301.xml
16:34:52.041 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 16% processing ../A1/documents_en/la020302.xml
16:34:52.320 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 16% processing ../A1/documents_en/la020303.xml

16:35:09.505 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 30% processing ../A1/documents_en/la020424.xml
16:35:09.831 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 31% processing ../A1/documents_en/la020425.xml
16:35:10.150 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 31% processing ../A1/documents_en/la020426.xml
16:35:10.453 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 31% processing ../A1/documents_en/la020427.xml
16:35:10.732 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 32% processing ../A1/documents_en/la020428.xml
16:35:11.251 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 32% processing ../A1/documents_en/la020429.xml
16:35:11.547 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 32% processing ../A1/documents_en/la020430.xml

16:35:28.112 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 46% processing ../A1/documents_en/la020621.xml
16:35:28.429 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 47% processing ../A1/documents_en/la020622.xml
16:35:28.679 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 47% processing ../A1/documents_en/la020623.xml
16:35:29.174 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 47% processing ../A1/documents_en/la020624.xml
16:35:29.435 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 47% processing ../A1/documents_en/la020625.xml
16:35:29.709 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 48% processing ../A1/documents_en/la020626.xml
16:35:30.022 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 48% processing ../A1/documents_en/la020627.xml

16:35:45.702 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 62% processing ../A1/documents_en/la020818.xml
16:35:46.161 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 63% processing ../A1/documents_en/la020819.xml
16:35:46.435 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 63% processing ../A1/documents_en/la020820.xml
16:35:46.667 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 63% processing ../A1/documents_en/la020821.xml
16:35:46.968 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 63% processing ../A1/documents_en/la020822.xml
16:35:47.262 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 64% processing ../A1/documents_en/la020823.xml
16:35:47.575 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 64% processing ../A1/documents_en/la020824.xml

16:36:04.535 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 78% processing ../A1/documents_en/la021015.xml
16:36:04.785 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 78% processing ../A1/documents_en/la021016.xml
16:36:05.086 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 79% processing ../A1/documents_en/la021017.xml
16:36:05.381 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 79% processing ../A1/documents_en/la021018.xml
16:36:05.735 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 79% processing ../A1/documents_en/la021019.xml
16:36:06.032 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 80% processing ../A1/documents_en/la021020.xml
16:36:06.573 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 80% processing ../A1/documents_en/la021021.xml

16:36:23.351 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 94% processing ../A1/documents_en/la021212.xml
16:36:23.667 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 94% processing ../A1/documents_en/la021213.xml
16:36:23.984 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 95% processing ../A1/documents_en/la021214.xml
16:36:24.246 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 95% processing ../A1/documents_en/la021215.xml
16:36:24.759 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 95% processing ../A1/documents_en/la021216.xml
16:36:25.028 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 95% processing ../A1/documents_en/la021217.xml
16:36:25.330 [Thread-19] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 96% processing ../A1/documents_en/la021218.xml

In [51]:
indexref = pt.IndexRef.of("./en_index")
index = pt.IndexFactory.of(indexref)

print(index.getCollectionStatistics())

Number of documents: 88110
Number of terms: 486066
Number of postings: 24024520
Number of fields: 1
Number of tokens: 41532118
Field names: [text]
Positions:   false



In [53]:
topics = read_topics("../A1/topics-train_en.xml", get_query_title)

retrieve = pt.BatchRetrieve(indexref, {"wmmodel": "TF_IDF"}) % 1000
retrieve = pt.apply.query(sanitize_query) >> retrieve
retrieve = pt.apply.query(lambda row: basic_tokeniser(row.query)) >> retrieve

<class 'jnius.reflect.org.terrier.querying.IndexRef'>


In [35]:
res = retrieve.transform(topics)

16:01:08.330 [main] INFO org.terrier.querying.LocalManager - Starting to execute query 10.2452/401-AH - Euro Inflation
16:01:08.332 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLParser
16:01:08.333 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToControls
16:01:08.334 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToMatchingQueryTerms
16:01:08.336 [main] INFO org.terrier.querying.LocalManager - running process ApplyTermPipeline(termpipelines=Stopwords,PorterStemmer)
16:01:08.338 [main] INFO org.terrier.querying.LocalManager - running process ApplyLocalMatching
16:01:08.368 [main] INFO org.terrier.structures.FSADocumentIndex - Document index requires 344.2 KiB remaining heap is 3.8 GiB
16:01:08.391 [main] INFO org.terrier.querying.LocalManager - euro { req=null, w=1.0, stats=null, models=[org.terrier.matching.models.DPH@397fbdb] tags=[firstmatchscore]} inflation { req=null, w=1.0, stats=null, models=[org.terr

16:01:08.599 [main] INFO org.terrier.querying.LocalManager - running process ApplyTermPipeline(termpipelines=)
16:01:08.599 [main] INFO org.terrier.querying.LocalManager - running process ApplyLocalMatching
16:01:08.600 [main] INFO org.terrier.querying.LocalManager - animated { req=null, w=1.0, stats=null, models=[org.terrier.matching.models.DPH@6b2ea799] tags=[firstmatchscore]} cartoons { req=null, w=1.0, stats=null, models=[org.terrier.matching.models.DPH@411f53a0] tags=[firstmatchscore]} 
16:01:08.602 [main] INFO org.terrier.matching.PostingListManager - Query 10.2452/406-AH with 2 terms has 2 posting lists
16:01:08.603 [main] INFO org.terrier.querying.LocalManager - running process PostFilterProcess
16:01:08.604 [main] INFO org.terrier.querying.LocalManager - running process SimpleDecorateProcess
16:01:08.605 [main] INFO org.terrier.querying.LocalManager - Finished executing query 10.2452/406-AH in 10ms - 573 results retrieved
16:01:08.615 [main] INFO org.terrier.querying.LocalMana

16:01:08.755 [main] INFO org.terrier.querying.LocalManager - running process SimpleDecorateProcess
16:01:08.756 [main] INFO org.terrier.querying.LocalManager - Finished executing query 10.2452/411-AH in 12ms - 1000 results retrieved
16:01:08.772 [main] INFO org.terrier.querying.LocalManager - Starting to execute query 10.2452/412-AH - Books on Politicians
16:01:08.773 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLParser
16:01:08.774 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToControls
16:01:08.774 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToMatchingQueryTerms
16:01:08.775 [main] INFO org.terrier.querying.LocalManager - running process ApplyTermPipeline(termpipelines=)
16:01:08.776 [main] INFO org.terrier.querying.LocalManager - running process ApplyLocalMatching
16:01:08.777 [main] INFO org.terrier.querying.LocalManager - books { req=null, w=1.0, stats=null, models=[org.terrier.matching.models.DPH@

16:01:08.929 [main] INFO org.terrier.matching.PostingListManager - Query 10.2452/417-AH with 2 terms has 2 posting lists
16:01:08.930 [main] INFO org.terrier.querying.LocalManager - running process PostFilterProcess
16:01:08.932 [main] INFO org.terrier.querying.LocalManager - running process SimpleDecorateProcess
16:01:08.932 [main] INFO org.terrier.querying.LocalManager - Finished executing query 10.2452/417-AH in 9ms - 480 results retrieved
16:01:08.940 [main] INFO org.terrier.querying.LocalManager - Starting to execute query 10.2452/418-AH - BÃ¼lent Ecevit s Statements
16:01:08.941 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLParser
16:01:08.942 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToControls
16:01:08.943 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToMatchingQueryTerms
16:01:08.944 [main] INFO org.terrier.querying.LocalManager - running process ApplyTermPipeline(termpipelines=)
16:01:08.945 

16:01:09.102 [main] INFO org.terrier.querying.LocalManager - Starting to execute query 10.2452/423-AH - Alternatives to Flu Shots
16:01:09.103 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLParser
16:01:09.104 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToControls
16:01:09.105 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToMatchingQueryTerms
16:01:09.106 [main] INFO org.terrier.querying.LocalManager - running process ApplyTermPipeline(termpipelines=)
16:01:09.107 [main] INFO org.terrier.querying.LocalManager - running process ApplyLocalMatching
16:01:09.108 [main] INFO org.terrier.querying.LocalManager - alternatives { req=null, w=1.0, stats=null, models=[org.terrier.matching.models.DPH@2f4948e4] tags=[firstmatchscore]} to { req=null, w=1.0, stats=null, models=[org.terrier.matching.models.DPH@1f2586d6] tags=[firstmatchscore]} flu { req=null, w=1.0, stats=null, models=[org.terrier.matching.models.DPH@1068

In [39]:
print(res)

                  qid  docid      docno  rank      score             query_1  \
0      10.2452/401-AH  52628  000100084     0  17.638051      Euro Inflation   
1      10.2452/401-AH  45761  000046618     1  16.788333      Euro Inflation   
2      10.2452/401-AH  48181  000049062     2  14.574033      Euro Inflation   
3      10.2452/401-AH  78135  000125856     3  13.934812      Euro Inflation   
4      10.2452/401-AH  29189  000029769     4  13.762167      Euro Inflation   
...               ...    ...        ...   ...        ...                 ...   
22802  10.2452/425-AH  11294  000011581   995   3.360791  Endangered Species   
22803  10.2452/425-AH  42734  000043549   996   3.357942  Endangered Species   
22804  10.2452/425-AH  41443  000042223   997   3.355104  Endangered Species   
22805  10.2452/425-AH  51756  000099201   998   3.352277  Endangered Species   
22806  10.2452/425-AH   1355  000001370   999   3.350398  Endangered Species   

                  query_0              


# Run-0 CS

In [54]:

cs_doc_paths = list(paths_gen("../A1/documents_cs.lst", "../A1/documents_cs"))
text_gen = pt.index.treccollection2textgen(cs_doc_paths)

In [55]:

indexer = pt.IterDictIndexer(
    "./cs_index",
    overwrite=True,
    stemmer="none",
    stopwords="none",
    tokeniser="whitespace",
)


tokeniser_partial = functools.partial(tokenise, tokeniser=basic_tokeniser)
with multiprocessing.Pool() as pool:
    indexer.index(pool.imap(tokeniser_partial, text_gen))

20:18:53.046 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 0% processing ../A1/documents_cs/ln020102.xml
20:18:53.129 [ForkJoinPool-12-worker-1] INFO org.terrier.structures.indexing.Indexer - Indexer using 1 fields
20:18:53.131 [ForkJoinPool-12-worker-1] INFO org.terrier.structures.indexing.Indexer - creating the data structures data_stream0_1
20:18:53.134 [ForkJoinPool-12-worker-1] INFO org.terrier.structures.indexing.LexiconBuilder - LexiconBuilder active - flushing every 100000 documents, or when memory threshold hit
20:18:53.183 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 0% processing ../A1/documents_cs/ln020105.xml
20:18:53.327 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 0% processing ../A1/documents_cs/ln020109.xml
20:18:53.432 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 1% processing ../A1/documents_cs/ln020110.xml
20:18:53.56

20:18:58.935 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 24% processing ../A1/documents_cs/ln020624.xml
20:18:59.050 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 25% processing ../A1/documents_cs/ln020627.xml
20:18:59.163 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 25% processing ../A1/documents_cs/ln020628.xml
20:18:59.288 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 26% processing ../A1/documents_cs/ln020702.xml
20:18:59.386 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 26% processing ../A1/documents_cs/ln020708.xml
20:18:59.494 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 27% processing ../A1/documents_cs/ln020711.xml
20:18:59.599 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 27% processing ../A1/documents_cs/ln020715.xml

20:19:05.226 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 51% processing ../A1/documents_cs/ln021228.xml
20:19:05.348 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 51% processing ../A1/documents_cs/ln021231.xml
20:19:05.452 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 52% processing ../A1/documents_cs/mf020103.xml
20:19:05.864 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 52% processing ../A1/documents_cs/mf020107.xml
20:19:06.265 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 52% processing ../A1/documents_cs/mf020110.xml
20:19:06.721 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 53% processing ../A1/documents_cs/mf020114.xml
20:19:07.163 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 53% processing ../A1/documents_cs/mf020117.xml

20:19:32.032 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 77% processing ../A1/documents_cs/mf020713.xml
20:19:32.539 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 77% processing ../A1/documents_cs/mf020717.xml
20:19:33.016 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 78% processing ../A1/documents_cs/mf020720.xml
20:19:33.476 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 78% processing ../A1/documents_cs/mf020724.xml
20:19:33.934 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 79% processing ../A1/documents_cs/mf020727.xml
20:19:34.403 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 79% processing ../A1/documents_cs/mf020729.xml
20:19:34.860 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 80% processing ../A1/documents_cs/mf020801.xml

20:20:04.347 [ForkJoinPool-12-worker-1] INFO org.terrier.structures.indexing.LexiconBuilder - lexicon has 1 fields
20:20:04.349 [ForkJoinPool-12-worker-1] INFO org.terrier.structures.indexing.FSOMapFileLexiconUtilities - Optimising lexicon with 536459 entries
20:20:04.684 [ForkJoinPool-12-worker-1] INFO org.terrier.structures.indexing.Indexer - Finished building the inverted index...
20:20:04.685 [ForkJoinPool-12-worker-1] INFO org.terrier.structures.indexing.Indexer - Time elapsed for inverted file: 5
20:20:04.686 [ForkJoinPool-12-worker-1] INFO org.terrier.structures.indexing.Indexer - Collection took 71 seconds to index (81735 documents)


In [56]:
indexref = pt.IndexRef.of("./cs_index")
index = pt.IndexFactory.of(indexref)

print(index.getCollectionStatistics())

Number of documents: 81735
Number of terms: 536459
Number of postings: 13562251
Number of fields: 1
Number of tokens: 21893821
Field names: [text]
Positions:   false



In [57]:
topics = read_topics("../A1/topics-train_cs.xml", get_query_title)

retrieve = pt.BatchRetrieve(indexref, {"wmmodel": "TF_IDF"}) % 1000
retrieve = pt.apply.query(sanitize_query) >> retrieve
retrieve = pt.apply.query(lambda row: basic_tokeniser(row.query)) >> retrieve

In [58]:
res = retrieve.transform(topics)

20:20:30.230 [main] INFO org.terrier.querying.LocalManager - Starting to execute query 10.2452/401-AH - Inflace Eura
20:20:30.231 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLParser
20:20:30.232 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToControls
20:20:30.232 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToMatchingQueryTerms
20:20:30.234 [main] INFO org.terrier.querying.LocalManager - running process ApplyTermPipeline(termpipelines=Stopwords,PorterStemmer)
20:20:30.235 [main] INFO org.terrier.querying.LocalManager - running process ApplyLocalMatching
20:20:30.266 [main] INFO org.terrier.structures.FSADocumentIndex - Document index requires 319.3 KiB remaining heap is 3.7 GiB
20:20:30.285 [main] INFO org.terrier.querying.LocalManager - inflace { req=null, w=1.0, stats=null, models=[org.terrier.matching.models.DPH@367ffa75] tags=[firstmatchscore]} eura { req=null, w=1.0, stats=null, models=[org.terrier

20:20:30.460 [main] INFO org.terrier.querying.LocalManager - running process ApplyTermPipeline(termpipelines=)
20:20:30.461 [main] INFO org.terrier.querying.LocalManager - running process ApplyLocalMatching
20:20:30.462 [main] INFO org.terrier.querying.LocalManager - animovanÃ© { req=null, w=1.0, stats=null, models=[org.terrier.matching.models.DPH@752325ad] tags=[firstmatchscore]} filmy { req=null, w=1.0, stats=null, models=[org.terrier.matching.models.DPH@279fedbd] tags=[firstmatchscore]} 
20:20:30.463 [main] INFO org.terrier.matching.PostingListManager - Query 10.2452/406-AH with 2 terms has 2 posting lists
20:20:30.465 [main] INFO org.terrier.querying.LocalManager - running process PostFilterProcess
20:20:30.466 [main] INFO org.terrier.querying.LocalManager - running process SimpleDecorateProcess
20:20:30.467 [main] INFO org.terrier.querying.LocalManager - Finished executing query 10.2452/406-AH in 12ms - 641 results retrieved
20:20:30.481 [main] INFO org.terrier.querying.LocalManag

20:20:30.633 [main] INFO org.terrier.matching.PostingListManager - Query 10.2452/411-AH with 4 terms has 3 posting lists
20:20:30.641 [main] INFO org.terrier.querying.LocalManager - running process PostFilterProcess
20:20:30.644 [main] INFO org.terrier.querying.LocalManager - running process SimpleDecorateProcess
20:20:30.645 [main] INFO org.terrier.querying.LocalManager - Finished executing query 10.2452/411-AH in 21ms - 1000 results retrieved
20:20:30.664 [main] INFO org.terrier.querying.LocalManager - Starting to execute query 10.2452/412-AH - Knihy o politicÃ­ch
20:20:30.665 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLParser
20:20:30.666 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToControls
20:20:30.667 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToMatchingQueryTerms
20:20:30.668 [main] INFO org.terrier.querying.LocalManager - running process ApplyTermPipeline(termpipelines=)
20:20:30.669 [main]

20:20:30.800 [main] INFO org.terrier.querying.LocalManager - running process ApplyLocalMatching
20:20:30.801 [main] INFO org.terrier.querying.LocalManager - Ãºnosy { req=null, w=1.0, stats=null, models=[org.terrier.matching.models.DPH@37ceb1df] tags=[firstmatchscore]} letadel { req=null, w=1.0, stats=null, models=[org.terrier.matching.models.DPH@7c9d8e2] tags=[firstmatchscore]} 
20:20:30.803 [main] INFO org.terrier.matching.PostingListManager - Query 10.2452/417-AH with 2 terms has 2 posting lists
20:20:30.804 [main] INFO org.terrier.querying.LocalManager - running process PostFilterProcess
20:20:30.806 [main] INFO org.terrier.querying.LocalManager - running process SimpleDecorateProcess
20:20:30.807 [main] INFO org.terrier.querying.LocalManager - Finished executing query 10.2452/417-AH in 12ms - 310 results retrieved
20:20:30.815 [main] INFO org.terrier.querying.LocalManager - Starting to execute query 10.2452/418-AH - ProhlÃ¡Å¡enÃ­ BÃ¼lenta Ecevita
20:20:30.816 [main] INFO org.terrie

20:20:30.946 [main] INFO org.terrier.matching.PostingListManager - Query 10.2452/422-AH with 5 terms has 5 posting lists
20:20:30.954 [main] INFO org.terrier.querying.LocalManager - running process PostFilterProcess
20:20:30.957 [main] INFO org.terrier.querying.LocalManager - running process SimpleDecorateProcess
20:20:30.957 [main] INFO org.terrier.querying.LocalManager - Finished executing query 10.2452/422-AH in 19ms - 1000 results retrieved
20:20:30.974 [main] INFO org.terrier.querying.LocalManager - Starting to execute query 10.2452/423-AH - Alternativy oÄkovÃ¡nÃ­ proti chÅipce
20:20:30.975 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLParser
20:20:30.975 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToControls
20:20:30.976 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToMatchingQueryTerms
20:20:30.977 [main] INFO org.terrier.querying.LocalManager - running process ApplyTermPipeline(termpipelines=)


In [59]:
print(res)

                  qid  docid           docno  rank      score         query_1  \
0      10.2452/401-AH  12927  MF-20020103035     0  15.019438    Inflace Eura   
1      10.2452/401-AH  16968  MF-20020128180     1  14.822874    Inflace Eura   
2      10.2452/401-AH  23502  MF-20020301283     2  13.898261    Inflace Eura   
3      10.2452/401-AH  17514  MF-20020131198     3  13.545147    Inflace Eura   
4      10.2452/401-AH  47110  MF-20020713049     4  12.725229    Inflace Eura   
...               ...    ...             ...   ...        ...             ...   
20136  10.2452/425-AH  16624  MF-20020124449   513   3.565101  Ohrožené druhy   
20137  10.2452/425-AH   2313  LN-20020308074   514   3.564048  Ohrožené druhy   
20138  10.2452/425-AH  16080  MF-20020121472   515   3.550498  Ohrožené druhy   
20139  10.2452/425-AH  48121  MF-20020717391   516   3.549466  Ohrožené druhy   
20140  10.2452/425-AH  48840  MF-20020720560   517   3.536185  Ohrožené druhy   

              query_0      

In [60]:
pt.io.write_results(res, "./cs_res.res", run_name="run-0_cs")

## Documents

In [15]:


class Document:
    def __init__(self, tag: bs4.element.Tag) -> None:
        """
        Initializes `Document` instance.
        """
        self._tag = tag

    @property
    def id(self) -> str:
        """
        Returns unique document number.
        """
        return str(self._tag.DOCNO.string)

    @property
    def str_all(self) -> str:
        """
        Returns all string inside SGML tags.
        """
        string = ""
        for descendant in self._tag.descendants:
            if isinstance(descendant, bs4.NavigableString):
                string += str(descendant)

        return string


class DocumentCS(Document):
    @property
    def str_all(self) -> str:
        string = ""
        for descendant in self._tag.descendants:
            if isinstance(
                descendant, bs4.NavigableString
            ) and descendant.parent.name not in ["DOCNO", "DOCID"]:
                string += str(descendant)

        return string


class DocumentEN(Document):
    def __init__(self, tag: bs4.element.Tag) -> None:
        super().__init__(tag)
        self._tag_blacklist = set(
            [
                "DOCNO",
                "DOCID",
                "SN",
                "PD",
                "PN",
                "PG",
                "PP",
                "WD",
                "SM",
                "SL",
                "CB",
                "IN",
                "FN",
            ]
        )

    @property
    def str_all(self) -> str:
        string = ""
        for descendant in self._tag.descendants:
            if (
                isinstance(descendant, bs4.NavigableString)
                and descendant.parent.name not in self._tag_blacklist
            ):
                string += str(descendant)

        return string

## Indexing


In [16]:


docs_paths_file = "../A1/documents_en.lst"
docs_paths_prefix = "../A1/documents_en"


def get_document_iter(
    doc_path: str, create_doc: Callable[[bs4.element.Tag], Document]
) -> Iterator[Document]:
    with open(doc_path, mode="r", encoding="utf-8") as file_handle:
        soup = bs4.BeautifulSoup(file_handle, "xml")
        for doc_tag in soup.find_all("DOC"):
            yield create_doc(doc_tag)


def dict_gen(paths_gen: Iterator[str]) -> Iterator[dict[str, str]]:
    for doc_path in paths_gen:
        for doc in get_document_iter(doc_path, Document):
            yield {"docno": doc.id, "text": doc.str_all}


en_dict_gen = functools.partial(
    dict_gen,
    paths_gen=functools.partial(
        paths_gen, docs_paths_file=docs_paths_file, docs_paths_prefix=docs_paths_prefix
    )(),
)

en_doc_paths = list(paths_gen(docs_paths_file, docs_paths_prefix))

In [17]:
for i, dict_ in enumerate(en_dict_gen()):
    print(dict_)
    if i > 5:
        break

{'docno': '000000001', 'text': '\n000000001\n000000001\n\nTK69KO5\n20020101\nTuesday January 01, 2002\n2002\nLA\nHome Edition\nLV\nSouthern California Living\nE\n5\n1\n\nE-1\n5-1\nFeatures Desk\n\n\n\n\ncmagdaleno\n40\n1\n\n\n\n\n\nDressing to Excess in the WWF Ring\n\n\n\n\n\nDressing to Excess\n   in the WWF Ring\n   When the World Wrestling Federation tours, wrestler Lita, left,\nand other stars are costumed by four designers who each sew about 180\ncostumes a year. The wrestlers visited the Arrowhead Pond last month.\nE2\n\n\nPHOTO: (no caption)\nID NUMBER: 20020101go61jjke\nPHOTOGRAPHER: FRANCINE ORR / Los Angeles Times\n\n\n\n20020101go61jjke\n\n\n\n\n\n\n\n\n\n\nSocal_living\nPubDate:01-01-02;Zone:LA;Ed:1;Section:SoCal\nLiving;Page:E1;PubCharCount:243###\n\n0\near for tuesday\n\nlv-ear1\n\n\n\n'}
{'docno': '000000002', 'text': '\n000000002\n000000002\n\nTK69KP6\n20020101\nTuesday January 01, 2002\n2002\nLA\nHome Edition\nLV\nSouthern California Living\nE\n5\n2\n\nE-2\n5-2\nFeatu

In [18]:
pt.set_property("max.term.length", 80)
pt.set_property("indexer.meta.forward.keylens", 80)

indexer = pt.IterDictIndexer(
    "./index",
    meta={"docno": 20},
    threads=1,
    stemmer=pt.index.TerrierStemmer.none,
    stopwords=pt.index.TerrierStopwords.none,
    # tokeniser=pt.index.TerrierTokeniser.whitespace,
    overwrite=True,
    properties={"Lexicon.max.term.length": 80},
)


indexref = indexer.index(
    take(en_dict_gen()),
)

13:18:35.231 [ForkJoinPool-3-worker-1] INFO org.terrier.structures.indexing.Indexer - Indexer using 1 fields
13:18:35.234 [ForkJoinPool-3-worker-1] INFO org.terrier.structures.indexing.Indexer - creating the data structures data_stream0_1
13:18:35.238 [ForkJoinPool-3-worker-1] INFO org.terrier.structures.indexing.LexiconBuilder - LexiconBuilder active - flushing every 100000 documents, or when memory threshold hit
13:18:35.368 [ForkJoinPool-3-worker-1] INFO org.terrier.structures.indexing.BaseMetaIndexBuilder - ZstdMetaIndexBuilder meta achieved compression ratio 2.652174 (> 1 is better)
13:18:35.392 [ForkJoinPool-3-worker-1] INFO org.terrier.structures.indexing.LexiconBuilder - 1 lexicons to merge
13:18:35.394 [ForkJoinPool-3-worker-1] INFO org.terrier.structures.indexing.LexiconBuilder - Optimising structure lexicon
13:18:35.395 [ForkJoinPool-3-worker-1] INFO org.terrier.structures.indexing.FSOMapFileLexiconUtilities - Optimising lexicon with 10979 entries
13:18:35.494 [ForkJoinPool-

In [19]:
pt.set_property("max.term.length", 80)
pt.set_property("indexer.meta.forward.keylens", 80)
pt.set_property("index.meta.entry-length", 80)
pt.set_property("index.meta.value-lengths", 80)
indexer = pt.TRECCollectionIndexer(
    "./trec_index",
    threads=6,
    stemmer=pt.index.TerrierStemmer.none,
    stopwords=pt.index.TerrierStopwords.none,
    # tokeniser=pt.index.TerrierTokeniser.whitespace,
    overwrite=True,
    properties={
        "max.term.length": 80,
        "indexer.meta.forward.keylens": 80,
        "index.meta.entry-length": 80,
        "index.meta.value-lengths": 80,
    },
)


index_ref = indexer.index(en_doc_paths)

13:18:35.584 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 0% processing ../A1/documents_en/la020101.xml
13:18:35.595 [main] INFO org.terrier.structures.indexing.Indexer - creating the data structures data_1
13:18:35.597 [main] INFO org.terrier.structures.indexing.LexiconBuilder - LexiconBuilder active - flushing every 100000 documents, or when memory threshold hit
13:18:35.775 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 0% processing ../A1/documents_en/la020102.xml
13:18:35.913 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 0% processing ../A1/documents_en/la020103.xml
13:18:36.081 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 0% processing ../A1/documents_en/la020104.xml
13:18:36.216 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 1% processing ../A1/documents_en/la020105.xml
13:18:36.334 [main] INFO org.terrier.indexing.MultiD

13:18:44.124 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 15% processing ../A1/documents_en/la020228.xml
13:18:44.270 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 16% processing ../A1/documents_en/la020301.xml
13:18:44.422 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 16% processing ../A1/documents_en/la020302.xml
13:18:44.536 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 16% processing ../A1/documents_en/la020303.xml
13:18:44.756 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 16% processing ../A1/documents_en/la020304.xml
13:18:44.869 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 17% processing ../A1/documents_en/la020305.xml
13:18:44.978 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 17% processing ../A1/documents_en/la020306.xml
13:18:45.112 [main] INFO org.terri

13:18:52.942 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 32% processing ../A1/documents_en/la020429.xml
13:18:53.075 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 32% processing ../A1/documents_en/la020430.xml
13:18:53.187 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 32% processing ../A1/documents_en/la020501.xml
13:18:53.319 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 33% processing ../A1/documents_en/la020502.xml
13:18:53.456 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 33% processing ../A1/documents_en/la020503.xml
13:18:53.586 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 33% processing ../A1/documents_en/la020504.xml
13:18:53.694 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 33% processing ../A1/documents_en/la020505.xml
13:18:53.932 [main] INFO org.terri

13:19:01.386 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 48% processing ../A1/documents_en/la020628.xml
13:19:01.523 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 49% processing ../A1/documents_en/la020629.xml
13:19:01.636 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 49% processing ../A1/documents_en/la020630.xml
13:19:01.828 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 49% processing ../A1/documents_en/la020701.xml
13:19:01.944 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 49% processing ../A1/documents_en/la020702.xml
13:19:02.050 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 50% processing ../A1/documents_en/la020703.xml
13:19:02.185 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 50% processing ../A1/documents_en/la020704.xml
13:19:02.318 [main] INFO org.terri

13:19:09.337 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 65% processing ../A1/documents_en/la020827.xml
13:19:09.464 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 65% processing ../A1/documents_en/la020828.xml
13:19:09.617 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 65% processing ../A1/documents_en/la020829.xml
13:19:09.760 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 66% processing ../A1/documents_en/la020830.xml
13:19:09.927 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 66% processing ../A1/documents_en/la020831.xml
13:19:10.040 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 66% processing ../A1/documents_en/la020901.xml
13:19:10.255 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 66% processing ../A1/documents_en/la020902.xml
13:19:10.377 [main] INFO org.terri

13:19:18.020 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 81% processing ../A1/documents_en/la021026.xml
13:19:18.139 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 81% processing ../A1/documents_en/la021027.xml
13:19:18.398 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 82% processing ../A1/documents_en/la021028.xml
13:19:18.530 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 82% processing ../A1/documents_en/la021029.xml
13:19:18.646 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 82% processing ../A1/documents_en/la021030.xml
13:19:18.769 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 83% processing ../A1/documents_en/la021031.xml
13:19:18.892 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 83% processing ../A1/documents_en/la021101.xml
13:19:19.030 [main] INFO org.terri

13:19:26.265 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 98% processing ../A1/documents_en/la021225.xml
13:19:26.371 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 98% processing ../A1/documents_en/la021226.xml
13:19:26.486 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 98% processing ../A1/documents_en/la021227.xml
13:19:26.603 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 98% processing ../A1/documents_en/la021228.xml
13:19:26.703 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 99% processing ../A1/documents_en/la021229.xml
13:19:26.892 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 99% processing ../A1/documents_en/la021230.xml
13:19:27.003 [main] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 99% processing ../A1/documents_en/la021231.xml
13:19:27.213 [main] INFO org.terri

In [20]:
print(dir(index_ref))
print(index_ref.toString())

['__class__', '__cls_storage', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__javaclass__', '__javaconstructor__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__pyx_vtable__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_class', 'clone', 'equals', 'finalize', 'getClass', 'hashCode', 'location', 'notify', 'notifyAll', 'of', 'serialVersionUID', 'size', 'toString', 'wait', 'wait0']
./trec_index/data.properties


## Queries

In [21]:
class Query:
    def __init__(self, tag: bs4.element.Tag) -> None:
        self._tag = tag

    @property
    def title(self) -> str:
        return str(self._tag.title.string)

    @property
    def id(self) -> str:
        return str(self._tag.num.string)

In [23]:

queries_path = "../A1/topics-train_en.xml"


def get_query_iter(
    queries_path: str, create_query: Callable[[bs4.element.Tag], Query]
) -> Iterator[Query]:
    with open(queries_path, mode="r", encoding="utf-8") as file_handle:
        soup = bs4.BeautifulSoup(file_handle, "xml")
        for query_tag in soup.find_all("top"):
            yield create_query(query_tag)


queries_df = pd.DataFrame(
    [
        {"qid": query.id, "query": query.title}
        for query in get_query_iter(queries_path, Query)
    ],
)
print(queries_df)

               qid                                  query
0   10.2452/401-AH                         Euro Inflation
1   10.2452/402-AH               Renewable Energy Sources
2   10.2452/403-AH                        Acting as a Cop
3   10.2452/404-AH                   NATO Summit Security
4   10.2452/405-AH                       Childhood Asthma
5   10.2452/406-AH                      Animated Cartoons
6   10.2452/407-AH              Australian Prime Minister
7   10.2452/408-AH                          Human Cloning
8   10.2452/409-AH                       Bali Car Bombing
9   10.2452/410-AH  North Korea Nuclear Weapons Violation
10  10.2452/411-AH                     Best Picture Oscar
11  10.2452/412-AH                   Books on Politicians
12  10.2452/413-AH                 Reducing Diabetes Risk
13  10.2452/414-AH                         Beer Festivals
14  10.2452/415-AH                             Drug Abuse
15  10.2452/416-AH          Moscow Theatre Hostage Crisis
16  10.2452/41

In [45]:
import glob

all_topic_files = glob.glob("../A1/topics-*.xml")
print(all_topic_files)
queries = []
for topic_file in all_topic_files:
    for query in get_query_iter(topic_file, Query):
        queries.append({"qid": query.id, "query": query.title})

all_queries_df = pd.DataFrame(queries)
print(all_queries_df)

['../A1/topics-train_cs.xml', '../A1/topics-train_en.xml', '../A1/topics-test_cs.xml', '../A1/topics-test_en.xml', '../A1/topics-train_en_uppercase.xml']
               qid                                              query
0   10.2452/401-AH                                       Inflace Eura
1   10.2452/402-AH                                 Obnovitelné zdroje
2   10.2452/403-AH                                     Role policisty
3   10.2452/404-AH                Summit NATO a bezpečnostní opatření
4   10.2452/405-AH                                    Astma u dětství
5   10.2452/406-AH                                    Animované filmy
6   10.2452/407-AH                                 Australský premiér
7   10.2452/408-AH                                     Klonování lidí
8   10.2452/409-AH                 Automobilové bombové útoky na Bali
9   10.2452/410-AH  Porušení programu jaderných zbraní Severní Koreou
10  10.2452/411-AH                             Oskar za nejlepší film
11  10

In [36]:
# from jnius import autoclass


# def my_read_topics(
#     file_path: str,
#     doc_tag: str,
#     id_tag: str,
#     whitelist: list[str],
#     blacklist: list[str],
# ) -> pd.DataFrame:
#     assert pt.check_version("5.3")
#     trecquerysource = autoclass("org.terrier.applications.batchquerying.TRECQuery")
#     tqs = trecquerysource(
#         [file_path],
#         doc_tag,
#         id_tag,
#         whitelist,
#         blacklist,
#         # help jnius select the correct constructor
#         signature="([Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;[Ljava/lang/String;[Ljava/lang/String;)V",
#     )

#     TagSet = autoclass("org.terrier.utility.TagSet")
#     tagset = (
#         TagSet.factory()
#         .setDocTag(doc_tag)
#         .setIdTag(id_tag)
#         .setWhitelist(*whitelist)
#         .setBlacklist(*blacklist)
#         .setCaseSensitive(True)
#         .build()
#     )
#     tqs.tags = tagset

#     topics_lst = []
#     while tqs.hasNext():
#         topic = tqs.next()
#         qid = tqs.getQueryId()
#         topics_lst.append([qid, topic])
#     topics_dt = pd.DataFrame(topics_lst, columns=["qid", "query"])
#     return topics_dt

13:49:03.170 [main] ERROR org.terrier.applications.batchquerying.TRECQuery - Topic files were specified, but non could be parsed correctly to obtain any topics. Check you have the correct topic files specified, and that tags are correct.


JavaException: JVM exception occurred: Cannot read the array length because "this.queries" is null java.lang.NullPointerException

In [40]:
queries_df = pt.io.read_topics(
    "../A1/topics-train_en_uppercase.xml",
    doc_tag="top",
    id_tag="num",
    whitelist=["title"],
    blacklist=["desc", "narr"],
)

               qid                                  query
0   10.2452/401-AH                         euro inflation
1   10.2452/402-AH               renewable energy sources
2   10.2452/403-AH                        acting as a cop
3   10.2452/404-AH                   nato summit security
4   10.2452/405-AH                       childhood asthma
5   10.2452/406-AH                      animated cartoons
6   10.2452/407-AH              australian prime minister
7   10.2452/408-AH                          human cloning
8   10.2452/409-AH                       bali car bombing
9   10.2452/410-AH  north korea nuclear weapons violation
10  10.2452/411-AH                     best picture oscar
11  10.2452/412-AH                   books on politicians
12  10.2452/413-AH                 reducing diabetes risk
13  10.2452/414-AH                         beer festivals
14  10.2452/415-AH                             drug abuse
15  10.2452/416-AH          moscow theatre hostage crisis
16  10.2452/41

In [None]:

# queries_df = my_read_topics(
#     queries_path,
#     doc_tag="top",
#     id_tag="num",
#     whitelist=["title"],
#     blacklist=["desc", "narr"],
# )
# print(queries_df)

In [53]:
def strip_single_quote(query: str) -> str:
    return query.replace("'", " ")


tfidf = pt.BatchRetrieve(indexer, wmodel="TF_IDF")
res = pt.apply.query(lambda row: strip_single_quote(row.query)) >> tfidf % 1000
print(res.transform(queries_df))

12:30:12.425 [main] INFO org.terrier.querying.LocalManager - Starting to execute query 10.2452/401-AH - Euro Inflation
12:30:12.427 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLParser
12:30:12.428 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToControls
12:30:12.429 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToMatchingQueryTerms
12:30:12.430 [main] INFO org.terrier.querying.LocalManager - running process ApplyTermPipeline(termpipelines=Stopwords,PorterStemmer)
12:30:12.432 [main] INFO org.terrier.querying.LocalManager - running process ApplyLocalMatching
12:30:12.456 [main] INFO org.terrier.structures.FSADocumentIndex - Document index requires 344.2 KiB remaining heap is 3.5 GiB
12:30:12.481 [main] INFO org.terrier.querying.LocalManager - euro { req=null, w=1.0, stats=null, models=[org.terrier.matching.models.TF_IDF@4d465b11] tags=[firstmatchscore]} inflation { req=null, w=1.0, stats=null, models=[org.

12:30:12.691 [main] INFO org.terrier.querying.LocalManager - running process ApplyTermPipeline(termpipelines=)
12:30:12.692 [main] INFO org.terrier.querying.LocalManager - running process ApplyLocalMatching
12:30:12.693 [main] INFO org.terrier.querying.LocalManager - animated { req=null, w=1.0, stats=null, models=[org.terrier.matching.models.TF_IDF@56cdfb3b] tags=[firstmatchscore]} cartoons { req=null, w=1.0, stats=null, models=[org.terrier.matching.models.TF_IDF@2b91004a] tags=[firstmatchscore]} 
12:30:12.694 [main] INFO org.terrier.matching.PostingListManager - Query 10.2452/406-AH with 2 terms has 2 posting lists
12:30:12.696 [main] INFO org.terrier.querying.LocalManager - running process PostFilterProcess
12:30:12.699 [main] INFO org.terrier.querying.LocalManager - running process SimpleDecorateProcess
12:30:12.700 [main] INFO org.terrier.querying.LocalManager - Finished executing query 10.2452/406-AH in 13ms - 758 results retrieved
12:30:12.713 [main] INFO org.terrier.querying.Loc

12:30:12.866 [main] INFO org.terrier.querying.LocalManager - running process SimpleDecorateProcess
12:30:12.867 [main] INFO org.terrier.querying.LocalManager - Finished executing query 10.2452/411-AH in 18ms - 1000 results retrieved
12:30:12.885 [main] INFO org.terrier.querying.LocalManager - Starting to execute query 10.2452/412-AH - Books on Politicians
12:30:12.887 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLParser
12:30:12.888 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToControls
12:30:12.889 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToMatchingQueryTerms
12:30:12.890 [main] INFO org.terrier.querying.LocalManager - running process ApplyTermPipeline(termpipelines=)
12:30:12.891 [main] INFO org.terrier.querying.LocalManager - running process ApplyLocalMatching
12:30:12.896 [main] INFO org.terrier.querying.LocalManager - books { req=null, w=1.0, stats=null, models=[org.terrier.matching.models.TF_I

12:30:13.073 [main] INFO org.terrier.matching.PostingListManager - Query 10.2452/417-AH with 2 terms has 2 posting lists
12:30:13.075 [main] INFO org.terrier.querying.LocalManager - running process PostFilterProcess
12:30:13.078 [main] INFO org.terrier.querying.LocalManager - running process SimpleDecorateProcess
12:30:13.079 [main] INFO org.terrier.querying.LocalManager - Finished executing query 10.2452/417-AH in 13ms - 870 results retrieved
12:30:13.097 [main] INFO org.terrier.querying.LocalManager - Starting to execute query 10.2452/418-AH - BÃ¼lent Ecevit s Statements
12:30:13.099 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLParser
12:30:13.100 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToControls
12:30:13.101 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToMatchingQueryTerms
12:30:13.102 [main] INFO org.terrier.querying.LocalManager - running process ApplyTermPipeline(termpipelines=)
12:30:13.103

12:30:13.335 [main] INFO org.terrier.querying.LocalManager - Starting to execute query 10.2452/423-AH - Alternatives to Flu Shots
12:30:13.336 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLParser
12:30:13.337 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToControls
12:30:13.338 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToMatchingQueryTerms
12:30:13.339 [main] INFO org.terrier.querying.LocalManager - running process ApplyTermPipeline(termpipelines=)
12:30:13.340 [main] INFO org.terrier.querying.LocalManager - running process ApplyLocalMatching
12:30:13.341 [main] INFO org.terrier.querying.LocalManager - alternatives { req=null, w=1.0, stats=null, models=[org.terrier.matching.models.TF_IDF@43dac38f] tags=[firstmatchscore]} to { req=null, w=1.0, stats=null, models=[org.terrier.matching.models.TF_IDF@342c38f8] tags=[firstmatchscore]} flu { req=null, w=1.0, stats=null, models=[org.terrier.matching.models.TF