In [6]:
import functools
import multiprocessing
import os
import re
from typing import Callable, Iterator

import bs4
import pandas as pd
import pyterrier as pt

In [7]:
if not pt.started():
    pt.init(tqdm="notebook", logging="INFO")

In [64]:

print("Before:")
print(
    pt.properties["max.term.length"] if "max.term.length" in pt.properties else "none"
)  # none
print(pt.ApplicationSetup.appProperties.getProperty("max.term.length", "none"))  # none

pt.set_property("max.term.length", 40)

print("After setting an int:")
print(pt.properties["max.term.length"])  # 40
print(pt.ApplicationSetup.appProperties.getProperty("max.term.length", "none"))  # none
print(pt.ApplicationSetup.getProperty("max.term.length", "none"))  # none

pt.set_property("max.term.length", "40")

print("After setting a string:")
print(pt.properties["max.term.length"])  # 40
print(pt.ApplicationSetup.appProperties.getProperty("max.term.length", "none"))  # 40
print(pt.ApplicationSetup.getProperty("max.term.length", "none"))  # none

Before:
40
40
After setting an int:
40
none
none
After setting a string:
40
40
40



# Utils


In [69]:

SEPS = r"-,.:;?! \n\t\[\]\(\)'\""
QUERY_MARKS = r"-#\{\}\(\)\^\+:~"


def paths_gen(docs_paths_file: str, docs_paths_prefix: str) -> Iterator[str]:
    with open(docs_paths_file, mode="r", encoding="utf-8") as paths_file:
        for line in paths_file:
            yield os.path.join(docs_paths_prefix, line.rstrip())


def take(iter_: Iterator, max_len=100) -> Iterator:
    for i, elem in enumerate(iter_):
        yield elem
        if i + 1 > max_len:
            break


def tokenise(doc: dict[str, str], tokeniser: Callable[[str], str]) -> dict[str, str]:
    doc["text"] = tokeniser(doc["text"])
    return doc


def basic_tokeniser(text: str) -> str:
    """Splits text by ',.:;?! \n\t[]()'"' and joins it by space."""
    words = re.split("[" + SEPS + "]", text)
    return " ".join(words)


def read_topics(
    topic_file: str, get_query: Callable[[bs4.element.Tag], dict[str, str]]
) -> pd.DataFrame:
    topics = []
    with open(topic_file, mode="r", encoding="utf-8") as file_handle:
        soup = bs4.BeautifulSoup(file_handle, "xml")
        for query_tag in soup.find_all("top"):
            topics.append(get_query(query_tag))

    return pd.DataFrame(topics)


def get_query_title(tag: bs4.element.Tag) -> dict[str, str]:
    qid = str(tag.num.string)
    title = str(tag.title.string)
    return {"qid": qid, "query": title}


def sanitize_query(row: pd.Series) -> str:
    query_tokens = re.split("[" + QUERY_MARKS + "]", row.query)
    return " ".join(query_tokens)


# Run-0 EN


In [70]:


en_doc_paths = list(paths_gen("../A1/documents_en.lst", "../A1/documents_en"))
text_gen = pt.index.treccollection2textgen(en_doc_paths)

In [71]:

indexer = pt.IterDictIndexer(
    "./en_index",
    overwrite=True,
    stemmer="none",
    stopwords="none",
    tokeniser="whitespace",
)


tokeniser_partial = functools.partial(tokenise, tokeniser=basic_tokeniser)
with multiprocessing.Pool() as pool:
    indexer.index(pool.imap(tokeniser_partial, text_gen), fields=["text"])

23:41:27.121 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 0% processing ../A1/documents_en/la020101.xml
23:41:27.206 [ForkJoinPool-14-worker-1] INFO org.terrier.structures.indexing.Indexer - Indexer using 1 fields
23:41:27.208 [ForkJoinPool-14-worker-1] INFO org.terrier.structures.indexing.Indexer - creating the data structures data_stream0_1
23:41:27.211 [ForkJoinPool-14-worker-1] INFO org.terrier.structures.indexing.LexiconBuilder - LexiconBuilder active - flushing every 100000 documents, or when memory threshold hit
23:41:27.452 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 0% processing ../A1/documents_en/la020102.xml
23:41:27.750 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 0% processing ../A1/documents_en/la020103.xml
23:41:28.136 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 0% processing ../A1/documents_en/la020104.xml
23:41:28.45

23:41:46.389 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 15% processing ../A1/documents_en/la020225.xml
23:41:46.706 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 15% processing ../A1/documents_en/la020226.xml
23:41:46.988 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 15% processing ../A1/documents_en/la020227.xml
23:41:47.308 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 15% processing ../A1/documents_en/la020228.xml
23:41:47.657 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 16% processing ../A1/documents_en/la020301.xml
23:41:48.031 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 16% processing ../A1/documents_en/la020302.xml
23:41:48.317 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 16% processing ../A1/documents_en/la020303.xml

23:42:06.491 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 30% processing ../A1/documents_en/la020424.xml
23:42:06.820 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 31% processing ../A1/documents_en/la020425.xml
23:42:07.137 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 31% processing ../A1/documents_en/la020426.xml
23:42:07.443 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 31% processing ../A1/documents_en/la020427.xml
23:42:07.714 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 32% processing ../A1/documents_en/la020428.xml
23:42:08.290 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 32% processing ../A1/documents_en/la020429.xml
23:42:08.598 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 32% processing ../A1/documents_en/la020430.xml

23:42:26.306 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 46% processing ../A1/documents_en/la020621.xml
23:42:26.639 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 47% processing ../A1/documents_en/la020622.xml
23:42:26.896 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 47% processing ../A1/documents_en/la020623.xml
23:42:27.397 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 47% processing ../A1/documents_en/la020624.xml
23:42:27.672 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 47% processing ../A1/documents_en/la020625.xml
23:42:27.993 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 48% processing ../A1/documents_en/la020626.xml
23:42:28.318 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 48% processing ../A1/documents_en/la020627.xml

23:42:44.678 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 62% processing ../A1/documents_en/la020818.xml
23:42:45.191 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 63% processing ../A1/documents_en/la020819.xml
23:42:45.502 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 63% processing ../A1/documents_en/la020820.xml
23:42:45.755 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 63% processing ../A1/documents_en/la020821.xml
23:42:46.074 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 63% processing ../A1/documents_en/la020822.xml
23:42:46.377 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 64% processing ../A1/documents_en/la020823.xml
23:42:46.698 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 64% processing ../A1/documents_en/la020824.xml

23:43:04.303 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 78% processing ../A1/documents_en/la021015.xml
23:43:04.562 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 78% processing ../A1/documents_en/la021016.xml
23:43:04.872 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 79% processing ../A1/documents_en/la021017.xml
23:43:05.188 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 79% processing ../A1/documents_en/la021018.xml
23:43:05.575 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 79% processing ../A1/documents_en/la021019.xml
23:43:05.856 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 80% processing ../A1/documents_en/la021020.xml
23:43:06.427 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 80% processing ../A1/documents_en/la021021.xml

23:43:23.834 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 94% processing ../A1/documents_en/la021212.xml
23:43:24.156 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 94% processing ../A1/documents_en/la021213.xml
23:43:24.503 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 95% processing ../A1/documents_en/la021214.xml
23:43:24.767 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 95% processing ../A1/documents_en/la021215.xml
23:43:25.279 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 95% processing ../A1/documents_en/la021216.xml
23:43:25.544 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 95% processing ../A1/documents_en/la021217.xml
23:43:25.832 [Thread-26] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 96% processing ../A1/documents_en/la021218.xml

In [51]:
indexref = pt.IndexRef.of("./en_index")
index = pt.IndexFactory.of(indexref)

print(index.getCollectionStatistics())

Number of documents: 88110
Number of terms: 486066
Number of postings: 24024520
Number of fields: 1
Number of tokens: 41532118
Field names: [text]
Positions:   false



In [53]:
topics = read_topics("../A1/topics-train_en.xml", get_query_title)

retrieve = pt.BatchRetrieve(indexref, {"wmmodel": "TF_IDF"}) % 1000
retrieve = pt.apply.query(sanitize_query) >> retrieve
retrieve = pt.apply.query(lambda row: basic_tokeniser(row.query)) >> retrieve

<class 'jnius.reflect.org.terrier.querying.IndexRef'>


In [35]:
res = retrieve.transform(topics)

16:01:08.330 [main] INFO org.terrier.querying.LocalManager - Starting to execute query 10.2452/401-AH - Euro Inflation
16:01:08.332 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLParser
16:01:08.333 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToControls
16:01:08.334 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToMatchingQueryTerms
16:01:08.336 [main] INFO org.terrier.querying.LocalManager - running process ApplyTermPipeline(termpipelines=Stopwords,PorterStemmer)
16:01:08.338 [main] INFO org.terrier.querying.LocalManager - running process ApplyLocalMatching
16:01:08.368 [main] INFO org.terrier.structures.FSADocumentIndex - Document index requires 344.2 KiB remaining heap is 3.8 GiB
16:01:08.391 [main] INFO org.terrier.querying.LocalManager - euro { req=null, w=1.0, stats=null, models=[org.terrier.matching.models.DPH@397fbdb] tags=[firstmatchscore]} inflation { req=null, w=1.0, stats=null, models=[org.terr

16:01:08.599 [main] INFO org.terrier.querying.LocalManager - running process ApplyTermPipeline(termpipelines=)
16:01:08.599 [main] INFO org.terrier.querying.LocalManager - running process ApplyLocalMatching
16:01:08.600 [main] INFO org.terrier.querying.LocalManager - animated { req=null, w=1.0, stats=null, models=[org.terrier.matching.models.DPH@6b2ea799] tags=[firstmatchscore]} cartoons { req=null, w=1.0, stats=null, models=[org.terrier.matching.models.DPH@411f53a0] tags=[firstmatchscore]} 
16:01:08.602 [main] INFO org.terrier.matching.PostingListManager - Query 10.2452/406-AH with 2 terms has 2 posting lists
16:01:08.603 [main] INFO org.terrier.querying.LocalManager - running process PostFilterProcess
16:01:08.604 [main] INFO org.terrier.querying.LocalManager - running process SimpleDecorateProcess
16:01:08.605 [main] INFO org.terrier.querying.LocalManager - Finished executing query 10.2452/406-AH in 10ms - 573 results retrieved
16:01:08.615 [main] INFO org.terrier.querying.LocalMana

16:01:08.755 [main] INFO org.terrier.querying.LocalManager - running process SimpleDecorateProcess
16:01:08.756 [main] INFO org.terrier.querying.LocalManager - Finished executing query 10.2452/411-AH in 12ms - 1000 results retrieved
16:01:08.772 [main] INFO org.terrier.querying.LocalManager - Starting to execute query 10.2452/412-AH - Books on Politicians
16:01:08.773 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLParser
16:01:08.774 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToControls
16:01:08.774 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToMatchingQueryTerms
16:01:08.775 [main] INFO org.terrier.querying.LocalManager - running process ApplyTermPipeline(termpipelines=)
16:01:08.776 [main] INFO org.terrier.querying.LocalManager - running process ApplyLocalMatching
16:01:08.777 [main] INFO org.terrier.querying.LocalManager - books { req=null, w=1.0, stats=null, models=[org.terrier.matching.models.DPH@

16:01:08.929 [main] INFO org.terrier.matching.PostingListManager - Query 10.2452/417-AH with 2 terms has 2 posting lists
16:01:08.930 [main] INFO org.terrier.querying.LocalManager - running process PostFilterProcess
16:01:08.932 [main] INFO org.terrier.querying.LocalManager - running process SimpleDecorateProcess
16:01:08.932 [main] INFO org.terrier.querying.LocalManager - Finished executing query 10.2452/417-AH in 9ms - 480 results retrieved
16:01:08.940 [main] INFO org.terrier.querying.LocalManager - Starting to execute query 10.2452/418-AH - BÃ¼lent Ecevit s Statements
16:01:08.941 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLParser
16:01:08.942 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToControls
16:01:08.943 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToMatchingQueryTerms
16:01:08.944 [main] INFO org.terrier.querying.LocalManager - running process ApplyTermPipeline(termpipelines=)
16:01:08.945 

16:01:09.102 [main] INFO org.terrier.querying.LocalManager - Starting to execute query 10.2452/423-AH - Alternatives to Flu Shots
16:01:09.103 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLParser
16:01:09.104 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToControls
16:01:09.105 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToMatchingQueryTerms
16:01:09.106 [main] INFO org.terrier.querying.LocalManager - running process ApplyTermPipeline(termpipelines=)
16:01:09.107 [main] INFO org.terrier.querying.LocalManager - running process ApplyLocalMatching
16:01:09.108 [main] INFO org.terrier.querying.LocalManager - alternatives { req=null, w=1.0, stats=null, models=[org.terrier.matching.models.DPH@2f4948e4] tags=[firstmatchscore]} to { req=null, w=1.0, stats=null, models=[org.terrier.matching.models.DPH@1f2586d6] tags=[firstmatchscore]} flu { req=null, w=1.0, stats=null, models=[org.terrier.matching.models.DPH@1068

In [39]:
print(res)

                  qid  docid      docno  rank      score             query_1  \
0      10.2452/401-AH  52628  000100084     0  17.638051      Euro Inflation   
1      10.2452/401-AH  45761  000046618     1  16.788333      Euro Inflation   
2      10.2452/401-AH  48181  000049062     2  14.574033      Euro Inflation   
3      10.2452/401-AH  78135  000125856     3  13.934812      Euro Inflation   
4      10.2452/401-AH  29189  000029769     4  13.762167      Euro Inflation   
...               ...    ...        ...   ...        ...                 ...   
22802  10.2452/425-AH  11294  000011581   995   3.360791  Endangered Species   
22803  10.2452/425-AH  42734  000043549   996   3.357942  Endangered Species   
22804  10.2452/425-AH  41443  000042223   997   3.355104  Endangered Species   
22805  10.2452/425-AH  51756  000099201   998   3.352277  Endangered Species   
22806  10.2452/425-AH   1355  000001370   999   3.350398  Endangered Species   

                  query_0              


# Run-0 CS

In [54]:

cs_doc_paths = list(paths_gen("../A1/documents_cs.lst", "../A1/documents_cs"))
text_gen = pt.index.treccollection2textgen(cs_doc_paths)

In [55]:

indexer = pt.IterDictIndexer(
    "./cs_index",
    overwrite=True,
    stemmer="none",
    stopwords="none",
    tokeniser="whitespace",
)


tokeniser_partial = functools.partial(tokenise, tokeniser=basic_tokeniser)
with multiprocessing.Pool() as pool:
    indexer.index(pool.imap(tokeniser_partial, text_gen))

20:18:53.046 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 0% processing ../A1/documents_cs/ln020102.xml
20:18:53.129 [ForkJoinPool-12-worker-1] INFO org.terrier.structures.indexing.Indexer - Indexer using 1 fields
20:18:53.131 [ForkJoinPool-12-worker-1] INFO org.terrier.structures.indexing.Indexer - creating the data structures data_stream0_1
20:18:53.134 [ForkJoinPool-12-worker-1] INFO org.terrier.structures.indexing.LexiconBuilder - LexiconBuilder active - flushing every 100000 documents, or when memory threshold hit
20:18:53.183 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 0% processing ../A1/documents_cs/ln020105.xml
20:18:53.327 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 0% processing ../A1/documents_cs/ln020109.xml
20:18:53.432 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 1% processing ../A1/documents_cs/ln020110.xml
20:18:53.56

20:18:58.935 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 24% processing ../A1/documents_cs/ln020624.xml
20:18:59.050 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 25% processing ../A1/documents_cs/ln020627.xml
20:18:59.163 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 25% processing ../A1/documents_cs/ln020628.xml
20:18:59.288 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 26% processing ../A1/documents_cs/ln020702.xml
20:18:59.386 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 26% processing ../A1/documents_cs/ln020708.xml
20:18:59.494 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 27% processing ../A1/documents_cs/ln020711.xml
20:18:59.599 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 27% processing ../A1/documents_cs/ln020715.xml

20:19:05.226 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 51% processing ../A1/documents_cs/ln021228.xml
20:19:05.348 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 51% processing ../A1/documents_cs/ln021231.xml
20:19:05.452 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 52% processing ../A1/documents_cs/mf020103.xml
20:19:05.864 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 52% processing ../A1/documents_cs/mf020107.xml
20:19:06.265 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 52% processing ../A1/documents_cs/mf020110.xml
20:19:06.721 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 53% processing ../A1/documents_cs/mf020114.xml
20:19:07.163 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 53% processing ../A1/documents_cs/mf020117.xml

20:19:32.032 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 77% processing ../A1/documents_cs/mf020713.xml
20:19:32.539 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 77% processing ../A1/documents_cs/mf020717.xml
20:19:33.016 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 78% processing ../A1/documents_cs/mf020720.xml
20:19:33.476 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 78% processing ../A1/documents_cs/mf020724.xml
20:19:33.934 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 79% processing ../A1/documents_cs/mf020727.xml
20:19:34.403 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 79% processing ../A1/documents_cs/mf020729.xml
20:19:34.860 [Thread-21] INFO org.terrier.indexing.MultiDocumentFileCollection - TRECCollection 80% processing ../A1/documents_cs/mf020801.xml

20:20:04.347 [ForkJoinPool-12-worker-1] INFO org.terrier.structures.indexing.LexiconBuilder - lexicon has 1 fields
20:20:04.349 [ForkJoinPool-12-worker-1] INFO org.terrier.structures.indexing.FSOMapFileLexiconUtilities - Optimising lexicon with 536459 entries
20:20:04.684 [ForkJoinPool-12-worker-1] INFO org.terrier.structures.indexing.Indexer - Finished building the inverted index...
20:20:04.685 [ForkJoinPool-12-worker-1] INFO org.terrier.structures.indexing.Indexer - Time elapsed for inverted file: 5
20:20:04.686 [ForkJoinPool-12-worker-1] INFO org.terrier.structures.indexing.Indexer - Collection took 71 seconds to index (81735 documents)


In [56]:
indexref = pt.IndexRef.of("./cs_index")
index = pt.IndexFactory.of(indexref)

print(index.getCollectionStatistics())

Number of documents: 81735
Number of terms: 536459
Number of postings: 13562251
Number of fields: 1
Number of tokens: 21893821
Field names: [text]
Positions:   false



In [57]:
topics = read_topics("../A1/topics-train_cs.xml", get_query_title)

retrieve = pt.BatchRetrieve(indexref, {"wmmodel": "TF_IDF"}) % 1000
retrieve = pt.apply.query(sanitize_query) >> retrieve
retrieve = pt.apply.query(lambda row: basic_tokeniser(row.query)) >> retrieve

In [58]:
res = retrieve.transform(topics)

20:20:30.230 [main] INFO org.terrier.querying.LocalManager - Starting to execute query 10.2452/401-AH - Inflace Eura
20:20:30.231 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLParser
20:20:30.232 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToControls
20:20:30.232 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToMatchingQueryTerms
20:20:30.234 [main] INFO org.terrier.querying.LocalManager - running process ApplyTermPipeline(termpipelines=Stopwords,PorterStemmer)
20:20:30.235 [main] INFO org.terrier.querying.LocalManager - running process ApplyLocalMatching
20:20:30.266 [main] INFO org.terrier.structures.FSADocumentIndex - Document index requires 319.3 KiB remaining heap is 3.7 GiB
20:20:30.285 [main] INFO org.terrier.querying.LocalManager - inflace { req=null, w=1.0, stats=null, models=[org.terrier.matching.models.DPH@367ffa75] tags=[firstmatchscore]} eura { req=null, w=1.0, stats=null, models=[org.terrier

20:20:30.460 [main] INFO org.terrier.querying.LocalManager - running process ApplyTermPipeline(termpipelines=)
20:20:30.461 [main] INFO org.terrier.querying.LocalManager - running process ApplyLocalMatching
20:20:30.462 [main] INFO org.terrier.querying.LocalManager - animovanÃ© { req=null, w=1.0, stats=null, models=[org.terrier.matching.models.DPH@752325ad] tags=[firstmatchscore]} filmy { req=null, w=1.0, stats=null, models=[org.terrier.matching.models.DPH@279fedbd] tags=[firstmatchscore]} 
20:20:30.463 [main] INFO org.terrier.matching.PostingListManager - Query 10.2452/406-AH with 2 terms has 2 posting lists
20:20:30.465 [main] INFO org.terrier.querying.LocalManager - running process PostFilterProcess
20:20:30.466 [main] INFO org.terrier.querying.LocalManager - running process SimpleDecorateProcess
20:20:30.467 [main] INFO org.terrier.querying.LocalManager - Finished executing query 10.2452/406-AH in 12ms - 641 results retrieved
20:20:30.481 [main] INFO org.terrier.querying.LocalManag

20:20:30.633 [main] INFO org.terrier.matching.PostingListManager - Query 10.2452/411-AH with 4 terms has 3 posting lists
20:20:30.641 [main] INFO org.terrier.querying.LocalManager - running process PostFilterProcess
20:20:30.644 [main] INFO org.terrier.querying.LocalManager - running process SimpleDecorateProcess
20:20:30.645 [main] INFO org.terrier.querying.LocalManager - Finished executing query 10.2452/411-AH in 21ms - 1000 results retrieved
20:20:30.664 [main] INFO org.terrier.querying.LocalManager - Starting to execute query 10.2452/412-AH - Knihy o politicÃ­ch
20:20:30.665 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLParser
20:20:30.666 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToControls
20:20:30.667 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToMatchingQueryTerms
20:20:30.668 [main] INFO org.terrier.querying.LocalManager - running process ApplyTermPipeline(termpipelines=)
20:20:30.669 [main]

20:20:30.800 [main] INFO org.terrier.querying.LocalManager - running process ApplyLocalMatching
20:20:30.801 [main] INFO org.terrier.querying.LocalManager - Ãºnosy { req=null, w=1.0, stats=null, models=[org.terrier.matching.models.DPH@37ceb1df] tags=[firstmatchscore]} letadel { req=null, w=1.0, stats=null, models=[org.terrier.matching.models.DPH@7c9d8e2] tags=[firstmatchscore]} 
20:20:30.803 [main] INFO org.terrier.matching.PostingListManager - Query 10.2452/417-AH with 2 terms has 2 posting lists
20:20:30.804 [main] INFO org.terrier.querying.LocalManager - running process PostFilterProcess
20:20:30.806 [main] INFO org.terrier.querying.LocalManager - running process SimpleDecorateProcess
20:20:30.807 [main] INFO org.terrier.querying.LocalManager - Finished executing query 10.2452/417-AH in 12ms - 310 results retrieved
20:20:30.815 [main] INFO org.terrier.querying.LocalManager - Starting to execute query 10.2452/418-AH - ProhlÃ¡Å¡enÃ­ BÃ¼lenta Ecevita
20:20:30.816 [main] INFO org.terrie

20:20:30.946 [main] INFO org.terrier.matching.PostingListManager - Query 10.2452/422-AH with 5 terms has 5 posting lists
20:20:30.954 [main] INFO org.terrier.querying.LocalManager - running process PostFilterProcess
20:20:30.957 [main] INFO org.terrier.querying.LocalManager - running process SimpleDecorateProcess
20:20:30.957 [main] INFO org.terrier.querying.LocalManager - Finished executing query 10.2452/422-AH in 19ms - 1000 results retrieved
20:20:30.974 [main] INFO org.terrier.querying.LocalManager - Starting to execute query 10.2452/423-AH - Alternativy oÄkovÃ¡nÃ­ proti chÅipce
20:20:30.975 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLParser
20:20:30.975 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToControls
20:20:30.976 [main] INFO org.terrier.querying.LocalManager - running process TerrierQLToMatchingQueryTerms
20:20:30.977 [main] INFO org.terrier.querying.LocalManager - running process ApplyTermPipeline(termpipelines=)


In [59]:
print(res)

                  qid  docid           docno  rank      score         query_1  \
0      10.2452/401-AH  12927  MF-20020103035     0  15.019438    Inflace Eura   
1      10.2452/401-AH  16968  MF-20020128180     1  14.822874    Inflace Eura   
2      10.2452/401-AH  23502  MF-20020301283     2  13.898261    Inflace Eura   
3      10.2452/401-AH  17514  MF-20020131198     3  13.545147    Inflace Eura   
4      10.2452/401-AH  47110  MF-20020713049     4  12.725229    Inflace Eura   
...               ...    ...             ...   ...        ...             ...   
20136  10.2452/425-AH  16624  MF-20020124449   513   3.565101  Ohrožené druhy   
20137  10.2452/425-AH   2313  LN-20020308074   514   3.564048  Ohrožené druhy   
20138  10.2452/425-AH  16080  MF-20020121472   515   3.550498  Ohrožené druhy   
20139  10.2452/425-AH  48121  MF-20020717391   516   3.549466  Ohrožené druhy   
20140  10.2452/425-AH  48840  MF-20020720560   517   3.536185  Ohrožené druhy   

              query_0      

In [60]:
pt.io.write_results(res, "./cs_res.res", run_name="run-0_cs")

## Queries

In [130]:
class Query:
    def __init__(self, tag: bs4.element.Tag) -> None:
        self._tag = tag

    @property
    def title(self) -> str:
        return str(self._tag.title.string)

    @property
    def id(self) -> str:
        return str(self._tag.num.string)

In [131]:

queries_path = "../A1/topics-train_en.xml"


def get_query_iter(
    queries_path: str, create_query: Callable[[bs4.element.Tag], Query]
) -> Iterator[Query]:
    with open(queries_path, mode="r", encoding="utf-8") as file_handle:
        soup = bs4.BeautifulSoup(file_handle, "xml")
        for query_tag in soup.find_all("top"):
            yield create_query(query_tag)


queries_df = pd.DataFrame(
    [
        {"qid": query.id, "query": query.title}
        for query in get_query_iter(queries_path, Query)
    ],
)
print(queries_df)

               qid                                  query
0   10.2452/401-AH                         Euro Inflation
1   10.2452/402-AH               Renewable Energy Sources
2   10.2452/403-AH                        Acting as a Cop
3   10.2452/404-AH                   NATO Summit Security
4   10.2452/405-AH                       Childhood Asthma
5   10.2452/406-AH                      Animated Cartoons
6   10.2452/407-AH              Australian Prime Minister
7   10.2452/408-AH                          Human Cloning
8   10.2452/409-AH                       Bali Car Bombing
9   10.2452/410-AH  North Korea Nuclear Weapons Violation
10  10.2452/411-AH                     Best Picture Oscar
11  10.2452/412-AH                   Books on Politicians
12  10.2452/413-AH                 Reducing Diabetes Risk
13  10.2452/414-AH                         Beer Festivals
14  10.2452/415-AH                             Drug Abuse
15  10.2452/416-AH          Moscow Theatre Hostage Crisis
16  10.2452/41

In [134]:
new_queries = [query.lower() for query in list(queries_df["query"])]
queries_df["new_query"] = new_queries
print(queries_df)

               qid                                  query  \
0   10.2452/401-AH                         Euro Inflation   
1   10.2452/402-AH               Renewable Energy Sources   
2   10.2452/403-AH                        Acting as a Cop   
3   10.2452/404-AH                   NATO Summit Security   
4   10.2452/405-AH                       Childhood Asthma   
5   10.2452/406-AH                      Animated Cartoons   
6   10.2452/407-AH              Australian Prime Minister   
7   10.2452/408-AH                          Human Cloning   
8   10.2452/409-AH                       Bali Car Bombing   
9   10.2452/410-AH  North Korea Nuclear Weapons Violation   
10  10.2452/411-AH                     Best Picture Oscar   
11  10.2452/412-AH                   Books on Politicians   
12  10.2452/413-AH                 Reducing Diabetes Risk   
13  10.2452/414-AH                         Beer Festivals   
14  10.2452/415-AH                             Drug Abuse   
15  10.2452/416-AH      

In [45]:
import glob

all_topic_files = glob.glob("../A1/topics-*.xml")
print(all_topic_files)
queries = []
for topic_file in all_topic_files:
    for query in get_query_iter(topic_file, Query):
        queries.append({"qid": query.id, "query": query.title})

all_queries_df = pd.DataFrame(queries)
print(all_queries_df)

['../A1/topics-train_cs.xml', '../A1/topics-train_en.xml', '../A1/topics-test_cs.xml', '../A1/topics-test_en.xml', '../A1/topics-train_en_uppercase.xml']
               qid                                              query
0   10.2452/401-AH                                       Inflace Eura
1   10.2452/402-AH                                 Obnovitelné zdroje
2   10.2452/403-AH                                     Role policisty
3   10.2452/404-AH                Summit NATO a bezpečnostní opatření
4   10.2452/405-AH                                    Astma u dětství
5   10.2452/406-AH                                    Animované filmy
6   10.2452/407-AH                                 Australský premiér
7   10.2452/408-AH                                     Klonování lidí
8   10.2452/409-AH                 Automobilové bombové útoky na Bali
9   10.2452/410-AH  Porušení programu jaderných zbraní Severní Koreou
10  10.2452/411-AH                             Oskar za nejlepší film
11  10

In [40]:
queries_df = pt.io.read_topics(
    "../A1/topics-train_en_uppercase.xml",
    doc_tag="top",
    id_tag="num",
    whitelist=["title"],
    blacklist=["desc", "narr"],
)

               qid                                  query
0   10.2452/401-AH                         euro inflation
1   10.2452/402-AH               renewable energy sources
2   10.2452/403-AH                        acting as a cop
3   10.2452/404-AH                   nato summit security
4   10.2452/405-AH                       childhood asthma
5   10.2452/406-AH                      animated cartoons
6   10.2452/407-AH              australian prime minister
7   10.2452/408-AH                          human cloning
8   10.2452/409-AH                       bali car bombing
9   10.2452/410-AH  north korea nuclear weapons violation
10  10.2452/411-AH                     best picture oscar
11  10.2452/412-AH                   books on politicians
12  10.2452/413-AH                 reducing diabetes risk
13  10.2452/414-AH                         beer festivals
14  10.2452/415-AH                             drug abuse
15  10.2452/416-AH          moscow theatre hostage crisis
16  10.2452/41

In [None]:


def strip_single_quote(query: str) -> str:
    return query.replace("'", " ")


tfidf = pt.BatchRetrieve(indexer, wmodel="TF_IDF")
res = pt.apply.query(lambda row: strip_single_quote(row.query)) >> tfidf % 1000
print(res.transform(queries_df))

In [72]:
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")
english = set(stopwords.words("english"))
print(english)

{'me', 'at', 'yourselves', 'was', "it's", 'herself', 'after', 'your', 'over', "doesn't", 'mightn', 'a', 'he', 'himself', 'there', 'each', 'hadn', "isn't", 'how', 'most', 'too', 'did', 'theirs', "should've", 'should', 'will', 'her', 'do', 'about', 'doing', "mightn't", 'being', 'very', "you'll", 'she', 'isn', 'through', 'him', 'we', 'weren', 'itself', 'between', 'under', 'yourself', 've', 'hasn', 'or', "wasn't", 'while', 'what', 'those', 'for', 'won', 'ourselves', 'is', 'both', 'll', 'if', 'because', 'as', 'out', 'further', 'has', "shouldn't", 'our', 'd', 'and', 'against', 'above', 'once', "couldn't", 'haven', 'can', 'an', 'then', "aren't", 'o', "won't", "weren't", 'again', 'ain', 'wouldn', 'when', 'had', 'y', 'didn', 'with', 'up', 'own', 'myself', 'below', 'now', 'to', "you'd", 'am', 'from', "didn't", 'my', "don't", 'of', 'which', 'doesn', 'having', 'needn', 'have', 'you', 'whom', 'shan', 'where', 'than', 'were', 'any', 'ours', 'few', 'so', 'be', "you've", 'are', 'it', 't', 'down', 'mus

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dburian/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [73]:
czech = set(stopwords.words("czech"))
print(czech)

OSError: No such file or directory: '/home/dburian/nltk_data/corpora/stopwords/czech'

In [78]:
# pt.ApplicationSetup.getProperty("terrier.home", "none")
pt.set_property("terrier.home", os.path.abspath("."))
pt.set_property("stopwords.filename", "cs_stopwords.txt")
docs = [{"docno": "0001", "text": "ahoj jak se máš já se mám dobře"}]
indexer = pt.IterDictIndexer(
    "./test",
    tokeniser="utf",
    stopwords="english",
    stemmer="none",
    overwrite=True,
)

index_ref = indexer.index(docs)
index = pt.IndexFactory.of(index_ref)

for kv in index.getLexicon():
    print(kv.getKey())

12:28:13.935 [ForkJoinPool-18-worker-1] INFO org.terrier.structures.indexing.Indexer - Indexer using 1 fields
12:28:13.937 [ForkJoinPool-18-worker-1] INFO org.terrier.structures.indexing.Indexer - creating the data structures data_stream0_1
12:28:13.938 [ForkJoinPool-18-worker-1] INFO org.terrier.structures.indexing.LexiconBuilder - LexiconBuilder active - flushing every 100000 documents, or when memory threshold hit
12:28:13.940 [ForkJoinPool-18-worker-1] INFO org.terrier.structures.indexing.BaseMetaIndexBuilder - ZstdMetaIndexBuilder meta achieved compression ratio 3.05 (> 1 is better)
12:28:13.942 [ForkJoinPool-18-worker-1] INFO org.terrier.structures.indexing.LexiconBuilder - 1 lexicons to merge
12:28:13.942 [ForkJoinPool-18-worker-1] INFO org.terrier.structures.indexing.LexiconBuilder - Optimising structure lexicon
12:28:13.944 [ForkJoinPool-18-worker-1] INFO org.terrier.structures.indexing.FSOMapFileLexiconUtilities - Optimising lexicon with 3 entries
12:28:13.946 [ForkJoinPool-1

In [81]:

pt.set_property("terrier.home", os.path.abspath("."))
pt.ApplicationSetup.appProperties.remove("stopwords.filename")
print(pt.ApplicationSetup.getProperty("stopwords.filename", "none"))
docs = [{"docno": "0001", "text": "a an the car bottle after of"}]
indexer = pt.IterDictIndexer(
    "./test",
    tokeniser="english",
    stopwords="english",
    stemmer="none",
    overwrite=True,
)

index_ref = indexer.index(docs)
index = pt.IndexFactory.of(index_ref)

for kv in index.getLexicon():
    print(kv.getKey())

none
12:49:35.186 [ForkJoinPool-20-worker-1] INFO org.terrier.structures.indexing.Indexer - Indexer using 1 fields
12:49:35.188 [ForkJoinPool-20-worker-1] INFO org.terrier.structures.indexing.Indexer - creating the data structures data_stream0_1
12:49:35.190 [ForkJoinPool-20-worker-1] INFO org.terrier.structures.indexing.LexiconBuilder - LexiconBuilder active - flushing every 100000 documents, or when memory threshold hit
12:49:35.192 [ForkJoinPool-20-worker-1] INFO org.terrier.structures.indexing.BaseMetaIndexBuilder - ZstdMetaIndexBuilder meta achieved compression ratio 3.05 (> 1 is better)
12:49:35.194 [ForkJoinPool-20-worker-1] INFO org.terrier.structures.indexing.LexiconBuilder - 1 lexicons to merge
12:49:35.196 [ForkJoinPool-20-worker-1] INFO org.terrier.structures.indexing.LexiconBuilder - Optimising structure lexicon
12:49:35.197 [ForkJoinPool-20-worker-1] INFO org.terrier.structures.indexing.FSOMapFileLexiconUtilities - Optimising lexicon with 2 entries
12:49:35.273 [ForkJoinP


## UDPipe


In [83]:

import email.mime.multipart
import email.mime.nonmultipart
import email.policy
import json
import os
import sys
import urllib.error
import urllib.request

UDPipeService = "https://lindat.mff.cuni.cz/services/udpipe/api"


def perform_request(server, method, params={}):
    if not params:
        request_headers, request_data = {}, None
    else:
        message = email.mime.multipart.MIMEMultipart(
            "form-data", policy=email.policy.HTTP
        )

        for name, value in params.items():
            payload = email.mime.nonmultipart.MIMENonMultipart("text", "plain")
            payload.add_header(
                "Content-Disposition", 'form-data; name="{}"'.format(name)
            )
            payload.add_header("Content-Transfer-Encoding", "8bit")
            payload.set_payload(value, charset="utf-8")
            message.attach(payload)

        request_data = message.as_bytes().split(b"\r\n\r\n", maxsplit=1)[1]
        request_headers = {"Content-Type": message["Content-Type"]}

    try:
        with urllib.request.urlopen(
            urllib.request.Request(
                url="{}/{}".format(server, method),
                headers=request_headers,
                data=request_data,
            )
        ) as request:
            return json.loads(request.read())
    except urllib.error.HTTPError as e:
        print(
            "An exception was raised during UDPipe 'process' REST request.\n"
            "The service returned the following error:\n"
            "  {}".format(e.fp.read().decode("utf-8")),
            file=sys.stderr,
        )
        raise
    except json.JSONDecodeError as e:
        print(
            "Cannot parse the JSON response of UDPipe 'process' REST request.\n"
            "  {}".format(e.msg),
            file=sys.stderr,
        )
        raise


def process(args: dict[str, str], text_data):
    data = {
        "input": args["input"],
        "output": args["output"],
        "data": text_data,
    }
    for option in ["model", "tokenizer", "parser", "tagger"]:
        value = args.get(option, None)
        if value is not None:
            data[option] = value

    response = perform_request(UDPipeService, "process", data)
    if "model" not in response or "result" not in response:
        raise ValueError("Cannot parse the UDPipe 'process' REST request response.")

    print(
        "UDPipe generated an output using the model '{}'.".format(response["model"]),
        file=sys.stderr,
    )
    print(
        "Please respect the model licence (CC BY-NC-SA unless stated otherwise).",
        file=sys.stderr,
    )

    return response["result"]

In [127]:
args = {
    "input": "horizontal",
    "output": "conllu",
    "model": "czech",
    "tokenizer": "",
    "tagger": "",
}
text_data = """Ahoj jak se máš já se mám dobře

babička koupila rohlíky za 20 Kč"""

return_data = process(args, text_data)

print(return_data)

# generator = UDPipe 2, https://lindat.mff.cuni.cz/services/udpipe
# udpipe_model = czech-pdt-ud-2.10-220711
# udpipe_model_licence = CC BY-NC-SA
# newdoc
# newpar
# sent_id = 1
1	Ahoj	ahoj	PART	TT-------------	_	_	_	_	_
2	jak	jak	ADV	Db-------------	PronType=Int,Rel	_	_	_	_
3	se	se	PRON	P7-X4----------	Case=Acc|PronType=Prs|Reflex=Yes|Variant=Short	_	_	_	_
4	máš	mít	VERB	VB-S---2P-AA---	Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin|Voice=Act	_	_	_	_
5	já	já	PRON	PP-S1--1-------	Case=Nom|Number=Sing|Person=1|PronType=Prs	_	_	_	_
6	se	se	PRON	P7-X4----------	Case=Acc|PronType=Prs|Reflex=Yes|Variant=Short	_	_	_	_
7	mám	mít	VERB	VB-S---1P-AA---	Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin|Voice=Act	_	_	_	_
8	dobře	dobře	ADV	Dg-------1A----	Degree=Pos|Polarity=Pos	_	_	_	_

# newpar
# sent_id = 2
1	babička	babička	NOUN	NNFS1-----A----	Case=Nom|Gender=Fem|Number=Sing|Polarity=Pos	_	_	_	_
2	koupila	koupit	VERB	VpQW---XR-AA---	Aspect=Perf|Gender=Fem,N

UDPipe generated an output using the model 'czech-pdt-ud-2.10-220711'.
Please respect the model licence (CC BY-NC-SA unless stated otherwise).


In [126]:

from conllu import parse

data = parse(return_data)

for sentence in data:
    for token in sentence:
        print(token["lemma"])

    print()
    print("new sentence")

ahoj
jak
se
mít
já
se
mít
dobře

new sentence
babička
koupit
rohlík
za
20
Kč

new sentence
