In [1]:
pwd()

'/data1/home/mrim/gonzagab/Kodicare'

In [11]:
import numpy as np
import pandas as pd
import pyterrier as pt
if not pt.started():
    pt.init(mem=8000)
pt.set_tqdm('tqdm')
import xml.etree.ElementTree as ET
import json
import random
import math
import ir_datasets
import os

# Robust Collection (TREC45 (Ad hoc retrieval))

In [5]:
robust_path = "/data1/home/mrim/galuscap/data/robust/disk45"
robust_index_path = "/data1/home/mrim/galuscap/data/indexes/robust"

In [None]:
if os.path.exists(os.path.join(robust_index_path, "data.properties")):
    robust_indexref = pt.IndexRef.of(os.path.join(robust_index_path, "data.properties"))
else:    
    files = pt.io.find_files(robust_path)
    # no-one indexes the congressional record in directory /CR/
    # indeed, recent copies from NIST dont contain it
    # we also remove some of the other unneeded files
    bad = ['/CR/', '/AUX/', 'READCHG', 'READMEFB', 'READFRCG', 'READMEFR', 'READMEFT', 'READMELA']
    for b in bad:
        files = list(filter(lambda f: b not in f, files))
    robust_indexer = pt.TRECCollectionIndexer(robust_index_path, verbose=True)
    robust_indexref = indexer.index(files)
    # processing the files took 7 minutes; the total indexing process took 7m40

robust_index = pt.IndexFactory.of(robust_indexref)
print(robust_index.getCollectionStatistics().toString())

In [16]:
BM25 = pt.BatchRetrieve(index, wmodel="BM25")
DPH  = pt.BatchRetrieve(index, wmodel="DPH")
PL2  = pt.BatchRetrieve(index, wmodel="PL2")
DLM  = pt.BatchRetrieve(index, wmodel="DirichletLM")

In [17]:
pt.Experiment(
    [BM25, DPH, PL2, DLM],
    pt.get_dataset("trec-robust-2004").get_topics(),
    pt.get_dataset("trec-robust-2004").get_qrels(),
    eval_metrics=["map", "P_10", "P_20", "ndcg_cut_20"],
    names=["BM25", "DPH", "PL2", "Dirichlet QL"]
)

Downloading trec-robust-2004 topics to /home/mrim/gonzagab/.pyterrier/corpora/trec-robust-2004/04.testset.gz






04.testset.gz:   0%|                                            | 0.00/33.5k [00:00<?, ?iB/s][A[A[A[A



04.testset.gz: 100%|████████████████████████████████████| 33.5k/33.5k [00:00<00:00, 260kiB/s][A[A[A[A
  0%|                                                            | 0/2306 [31:21<?, ?files/s]


Downloading trec-robust-2004 qrels to /home/mrim/gonzagab/.pyterrier/corpora/trec-robust-2004/qrels.robust2004.txt


qrels.robust2004.txt: 100%|████████████████████████████| 6.24M/6.24M [00:01<00:00, 5.62MiB/s]
  0%|                                                            | 0/2309 [34:41<?, ?files/s]
  0%|                                                            | 0/2307 [35:53<?, ?files/s]


Unnamed: 0,name,map,P_10,P_20,ndcg_cut_20
0,BM25,0.241763,0.426104,0.349398,0.408061
1,DPH,0.251307,0.44739,0.361446,0.422524
2,PL2,0.229383,0.420884,0.343775,0.402179
3,Dirichlet QL,0.236826,0.407631,0.337952,0.39687


# CAR Collection 

List of collections can be found here: https://pyterrier.readthedocs.io/en/latest/datasets.html

Description of CAR 1.5 (https://trec-car.cs.unh.edu/datareleases/v1.5-release.html) -- we might need v1.7 which is not supported by Terrier? 

It actually points to this: https://ir-datasets.com/car.html

In [None]:
car_dataset = "car/v1.5"
car_index_path = "/data1/home/mrim/galuscap/data/indexes/car"

dataset = ir_datasets.load(car_dataset)
for doc in dataset.docs_iter():
    doc # namedtuple<doc_id, text>
    


In [6]:
dataset = pt.datasets.get_dataset('car/v1.5')
indexer = pt.index.IterDictIndexer('./car-index')
indexref = indexer.index(dataset.get_corpus_iter(), fields=('text', 'title', 'headings'))
index = pt.IndexFactory.of(indexref)

DPH_br = pt.BatchRetrieve(index, wmodel="DPH") % 100
BM25_br = pt.BatchRetrieve(index, wmodel="BM25") % 100
# this runs an experiment to obtain results on the TREC COVID queries and qrels
pt.Experiment(
    [DPH_br, BM25_br],
    dataset.get_topics('title'),
    dataset.get_qrels(),
    eval_metrics=["P.5", "P.10", "ndcg_cut.10", "map"])

#indexer = pt.TRECCollectionIndexer(car_index_path)
# this downloads the file msmarco-docs.trec.gz
#indexref = indexer.index(car_dataset.get_corpus())
#index = pt.IndexFactory.of(indexref)

#DPH_br = pt.BatchRetrieve(index, wmodel="DPH") % 100
#BM25_br = pt.BatchRetrieve(index, wmodel="BM25") % 100
# this runs an experiment to obtain results on the TREC 2019 Deep Learning track queries and qrels
#pt.Experiment(
#    [DPH_br, BM25_br],
#    dataset.get_topics("test"),
#    dataset.get_qrels("test"),
#    eval_metrics=["recip_rank", "ndcg_cut_10", "map"])

KeyError: 'car/v1.5'

# trec-wt10g

In [7]:
files_wt10g = pt.io.find_files("/home/mrim/galuscap/data/wt10g")

# build the index
indexer_wt10g = pt.TRECCollectionIndexer("/home/mrim/galuscap/data/indexes/wt10g_index", verbose=True, blocks=False)
indexref_wt10g = indexer_wt10g.index(files_wt10g)

# load the index, print the statistics
index_wt10g = pt.IndexFactory.of(indexref_wt10g)
print(index_wt10g.getCollectionStatistics().toString())


  0%|                                                            | 0/5159 [00:00<?, ?files/s][A
  0%|                                                    | 2/5159 [00:00<31:34,  2.72files/s][A
  0%|                                                    | 3/5159 [00:01<54:04,  1.59files/s][A
  0%|                                                    | 4/5159 [00:02<51:55,  1.65files/s][A
  0%|                                                    | 5/5159 [00:02<43:12,  1.99files/s][A
  0%|                                                    | 6/5159 [00:02<36:27,  2.36files/s][A
  0%|                                                    | 7/5159 [00:03<34:59,  2.45files/s][A
  0%|                                                    | 8/5159 [00:03<32:57,  2.60files/s][A
  0%|                                                    | 9/5159 [00:03<32:10,  2.67files/s][A
  0%|                                                   | 10/5159 [00:04<30:27,  2.82files/s][A
  0%|                        

14:39:19.896 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/wt10g/qrels.adhoc.451-550.txt. Perhaps trec.collection.class is wrongly set, TrecDocTags are incorrect, or decompression failed


  0%|                                                            | 0/5159 [28:20<?, ?files/s]

14:39:19.916 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/wt10g/topics.adhoc.451-550.txt. Perhaps trec.collection.class is wrongly set, TrecDocTags are incorrect, or decompression failed




5160files [27:01,  3.18files/s]                                                              [A


Number of documents: 1692096
Number of terms: 3135780
Number of postings: 279636926
Number of fields: 0
Number of tokens: 673375016
Field names: []
Positions:   false



In [14]:
# These are the queries from TREC10 ad-hoc collection -- another options are trec 9 and trec9, trec10-adhoc, trec10-hp 

dataset_wt10g = pt.get_dataset("trec-wt10g")
bm25 = pt.BatchRetrieve(index_wt10g, wmodel="BM25")
dph = pt.BatchRetrieve(index_wt10g, wmodel="DPH")
pt.Experiment(
    [bm25, dph],
    dataset_wt10g.get_topics("trec10-adhoc"),
    dataset_wt10g.get_qrels("trec10-adhoc"),
    eval_metrics=["map"]
)

Downloading trec-wt10g qrels to /home/mrim/gonzagab/.pyterrier/corpora/trec-wt10g/qrels.trec10.main_web.gz


qrels.trec10.main_web.gz: 100%|███████████████████████████| 277k/277k [00:00<00:00, 550kiB/s]


Unnamed: 0,name,map
0,BR(BM25),0.191039
1,BR(DPH),0.214665


# TREC12 (Ad hoc retrieval)

More on this: https://github.com/diazf/trec-data (Tipster collection)

In [19]:
files_disk12 = pt.io.find_files("/home/mrim/galuscap/data/disk12")

# build the index
indexer_disk12 = pt.TRECCollectionIndexer("/home/mrim/galuscap/data/indexes/disk12", verbose=True, blocks=False)
indexref_disk12 = indexer_disk12.index(files_disk12)

# load the index, print the statistics
index_disk12 = pt.IndexFactory.of(indexref_disk12)
print(index_disk12.getCollectionStatistics().toString())

# There were errors here !!! Is this dataset complet?

 19%|█████████▋                                        | 462/2394 [02:56<25:50,  1.25files/s]



 23%|███████████▍                                      | 547/2394 [03:53<18:00,  1.71files/s]

15:56:47.910 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/disk12/disk1/dtd/ap.dtd. Perhaps trec.collection.class is wrongly set, TrecDocTags are incorrect, or decompression failed
15:56:47.943 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/disk12/disk1/dtd/doe.dtd. Perhaps trec.collection.class is wrongly set, TrecDocTags are incorrect, or decompression failed
15:56:47.977 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/disk12/disk1/dtd/fr.dtd. Perhaps trec.collection.class is wrongly set, TrecDocTags are incorrect, or decompression failed
15:56:47.999 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/disk12/disk1/dtd/wsj.dtd. Perhaps trec.collection.class is wrongly set, TrecDocTa

 31%|███████████████▋                                  | 749/2394 [04:54<05:33,  4.93files/s]

15:57:48.519 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/disk12/disk1/readme.d1. Perhaps trec.collection.class is wrongly set, TrecDocTags are incorrect, or decompression failed
15:57:48.576 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/disk12/disk1/readme.d2. Perhaps trec.collection.class is wrongly set, TrecDocTags are incorrect, or decompression failed
15:57:48.594 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/disk12/disk1/readme.doc. Perhaps trec.collection.class is wrongly set, TrecDocTags are incorrect, or decompression failed
15:57:48.648 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/disk12/disk1/readme.tag. Perhaps trec.collection.class is wrongly set, TrecDocTags a

 43%|█████████████████████                            | 1030/2394 [05:11<01:23, 16.42files/s]

15:58:05.184 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/disk12/disk1/ziff/zf_001.z. Perhaps trec.collection.class is wrongly set, TrecDocTags are incorrect, or decompression failed
15:58:05.264 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/disk12/disk1/ziff/zf_002.z. Perhaps trec.collection.class is wrongly set, TrecDocTags are incorrect, or decompression failed
15:58:05.304 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/disk12/disk1/ziff/zf_003.z. Perhaps trec.collection.class is wrongly set, TrecDocTags are incorrect, or decompression failed
15:58:05.340 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/disk12/disk1/ziff/zf_004.z. Perhaps trec.collection.class is wrongly set,

 67%|████████████████████████████████▊                | 1604/2394 [06:46<02:31,  5.21files/s]

15:59:40.852 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/disk12/disk2/dtd/ap.dtd. Perhaps trec.collection.class is wrongly set, TrecDocTags are incorrect, or decompression failed
15:59:40.858 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/disk12/disk2/dtd/fr.dtd. Perhaps trec.collection.class is wrongly set, TrecDocTags are incorrect, or decompression failed
15:59:40.888 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/disk12/disk2/dtd/wsj.dtd. Perhaps trec.collection.class is wrongly set, TrecDocTags are incorrect, or decompression failed
15:59:40.897 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/disk12/disk2/dtd/zf.dtd. Perhaps trec.collection.class is wrongly set, TrecDocTag

 74%|████████████████████████████████████             | 1761/2394 [07:25<02:17,  4.61files/s]

16:00:19.847 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/disk12/disk2/readme.d1. Perhaps trec.collection.class is wrongly set, TrecDocTags are incorrect, or decompression failed
16:00:19.906 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/disk12/disk2/readme.d2. Perhaps trec.collection.class is wrongly set, TrecDocTags are incorrect, or decompression failed
16:00:19.931 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/disk12/disk2/readme.doc. Perhaps trec.collection.class is wrongly set, TrecDocTags are incorrect, or decompression failed
16:00:19.943 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/disk12/disk2/readme.tag. Perhaps trec.collection.class is wrongly set, TrecDocTags a

 92%|█████████████████████████████████████████████▎   | 2213/2394 [07:43<00:07, 24.81files/s]

16:00:37.562 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/disk12/disk2/ziff/zf_251.z. Perhaps trec.collection.class is wrongly set, TrecDocTags are incorrect, or decompression failed
16:00:37.618 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/disk12/disk2/ziff/zf_252.z. Perhaps trec.collection.class is wrongly set, TrecDocTags are incorrect, or decompression failed
16:00:37.658 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/disk12/disk2/ziff/zf_253.z. Perhaps trec.collection.class is wrongly set, TrecDocTags are incorrect, or decompression failed
16:00:37.697 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/disk12/disk2/ziff/zf_254.z. Perhaps trec.collection.class is wrongly set,

2395files [07:53,  5.06files/s]                                                              

16:00:47.272 [main] WARN org.terrier.structures.indexing.Indexer - Indexed 3 empty documents





16:01:02.807 [main] ERROR org.terrier.structures.indexing.Indexer - Could not finish MetaIndexBuilder: 
java.io.IOException: Key DOE1-96-0001 is not unique: 336728,310766
For MetaIndex, to suppress, set metaindex.compressed.reverse.allow.duplicates=true
	at org.terrier.structures.collections.FSOrderedMapFile$MultiFSOMapWriter.mergeTwo(FSOrderedMapFile.java:1374)
	at org.terrier.structures.collections.FSOrderedMapFile$MultiFSOMapWriter.close(FSOrderedMapFile.java:1308)
	at org.terrier.structures.indexing.BaseMetaIndexBuilder.close(BaseMetaIndexBuilder.java:321)
	at org.terrier.structures.indexing.classical.BasicIndexer.createDirectIndex(BasicIndexer.java:346)
	at org.terrier.structures.indexing.Indexer.index(Indexer.java:369)
Number of documents: 436517
Number of terms: 415734
Number of postings: 53075626
Number of fields: 0
Number of tokens: 104889119
Field names: []
Positions:   false



# Clueweb 09 and Clueweb12 (WebTrack)

# Gov2

In [None]:
files_gov2 = pt.io.find_files("/home/mrim/galuscap/data/gov2/gov2-corpus")

# build the index
indexer_gov2 = pt.TRECCollectionIndexer("/home/mrim/galuscap/data/indexes/gov2", verbose=True, blocks=True)
indexref_gov2 = indexer_gov2.index(files_gov2)

# load the index, print the statistics
index_gov2 = pt.IndexFactory.of(indexref_gov2)
print(index_gov2.getCollectionStatistics().toString())


  0%|                                                           | 0/27205 [00:00<?, ?files/s][A




  0%|                                                 | 3/27205 [00:01<3:43:38,  2.03files/s][A
  0%|                                                 | 4/27205 [00:02<5:50:04,  1.29files/s][A
  0%|                                                 | 5/27205 [00:04<7:26:57,  1.01files/s][A
  0%|                                                 | 6/27205 [00:05<8:37:12,  1.14s/files][A
  0%|                                                 | 7/27205 [00:06<8:48:03,  1.16s/files][A
  0%|                                                 | 8/27205 [00:08<9:03:33,  1.20s/files][A
  0%|                                                 | 9/27205 [00:09<9:00:34,  1.19s/files][A
  0%|                                                | 10/27205 [00:10<9:07:06,  1.21s/files][A
  0%|                                                | 11/27205 [00:11<9:12:46,  1.22s/files][A
  0%|                                                | 12/27205 [00:13<9:13:53,  1.22s/files][A
  0%|                        

14:44:20.326 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/gov2/gov2-corpus/GX003/.nfs0000002e1dbafe0d000000ac. Perhaps trec.collection.class is wrongly set, TrecDocTags are incorrect, or decompression failed



  1%|▌                                             | 303/27205 [08:44<13:01:35,  1.74s/files][A
  1%|▌                                             | 304/27205 [08:46<13:20:32,  1.79s/files][A
  1%|▌                                             | 305/27205 [08:49<14:03:58,  1.88s/files][A
  1%|▌                                             | 306/27205 [08:51<14:36:27,  1.95s/files][A
  1%|▌                                             | 307/27205 [08:53<15:10:58,  2.03s/files][A
  1%|▌                                             | 308/27205 [08:55<15:29:22,  2.07s/files][A
  1%|▌                                             | 309/27205 [08:57<15:32:45,  2.08s/files][A
  1%|▌                                             | 310/27205 [09:00<16:23:19,  2.19s/files][A
  1%|▌                                             | 311/27205 [09:02<16:01:57,  2.15s/files][A
  1%|▌                                             | 312/27205 [09:04<16:00:46,  2.14s/files][A
  1%|▌                       

In [None]:
DPH_br = pt.BatchRetrieve(index, wmodel="DPH") % 100
BM25_br = pt.BatchRetrieve(index, wmodel="BM25") % 100
# this runs an experiment to obtain results on the TREC COVID queries and qrels
pt.Experiment(
    [DPH_br, BM25_br],
    dataset.get_topics('title'),
    dataset.get_qrels(),
    eval_metrics=["P.5", "P.10", "ndcg_cut.10", "map"])

# AQUAINT

In [23]:
files_AQUAINT = pt.io.find_files("/home/mrim/galuscap/data/AQUAINT")

# build the index
indexer_AQUAINT = pt.TRECCollectionIndexer("/home/mrim/galuscap/data/indexes/AQUAINT", verbose=True, blocks=False)
indexref_AQUAINT = indexer_AQUAINT.index(files_AQUAINT)

# load the index, print the statistics
index_AQUAINT = pt.IndexFactory.of(indexref_AQUAINT)
print(index_AQUAINT.getCollectionStatistics().toString())


  0%|                                                            | 0/3350 [00:00<?, ?files/s][A

16:09:36.265 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/AQUAINT/disk1/AQUAINT.DTD. Perhaps trec.collection.class is wrongly set, TrecDocTags are incorrect, or decompression failed
16:09:36.283 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/AQUAINT/disk1/FILE.TBL. Perhaps trec.collection.class is wrongly set, TrecDocTags are incorrect, or decompression failed
16:09:36.306 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/AQUAINT/disk1/INDEX.HTML. Perhaps trec.collection.class is wrongly set, TrecDocTags are incorrect, or decompression failed



  0%|                                                    | 6/3350 [00:00<05:34, 10.01files/s][A
  0%|                                                    | 8/3350 [00:01<12:16,  4.54files/s][A
  0%|▏                                                   | 9/3350 [00:02<14:55,  3.73files/s][A
  0%|▏                                                  | 10/3350 [00:02<17:16,  3.22files/s][A
  0%|▏                                                  | 11/3350 [00:02<16:21,  3.40files/s][A
  0%|▏                                                  | 12/3350 [00:02<14:36,  3.81files/s][A
  0%|▏                                                  | 13/3350 [00:03<16:58,  3.28files/s][A
  0%|▏                                                  | 14/3350 [00:03<18:56,  2.94files/s][A
  0%|▏                                                  | 15/3350 [00:04<21:26,  2.59files/s][A
  0%|▏                                                  | 16/3350 [00:04<22:42,  2.45files/s][A
  1%|▎                       

16:18:23.258 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/AQUAINT/disk2/AQUAINT.DTD. Perhaps trec.collection.class is wrongly set, TrecDocTags are incorrect, or decompression failed
16:18:23.317 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/AQUAINT/disk2/FILE.TBL. Perhaps trec.collection.class is wrongly set, TrecDocTags are incorrect, or decompression failed
16:18:23.325 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /home/mrim/galuscap/data/AQUAINT/disk2/INDEX.HTML. Perhaps trec.collection.class is wrongly set, TrecDocTags are incorrect, or decompression failed



 48%|███████████████████████▋                         | 1618/3350 [08:47<03:18,  8.72files/s][A
 48%|███████████████████████▋                         | 1619/3350 [08:47<03:30,  8.22files/s][A
 48%|███████████████████████▋                         | 1620/3350 [08:47<03:48,  7.55files/s][A
 48%|███████████████████████▋                         | 1621/3350 [08:47<03:56,  7.33files/s][A
 48%|███████████████████████▋                         | 1622/3350 [08:47<04:15,  6.76files/s][A
 48%|███████████████████████▋                         | 1623/3350 [08:47<04:04,  7.07files/s][A
 49%|███████████████████████▊                         | 1625/3350 [08:48<03:59,  7.20files/s][A
 49%|███████████████████████▊                         | 1626/3350 [08:48<03:55,  7.33files/s][A
 49%|███████████████████████▊                         | 1627/3350 [08:48<03:57,  7.24files/s][A
 49%|███████████████████████▊                         | 1628/3350 [08:48<04:08,  6.94files/s][A
 49%|███████████████████████▊

Number of documents: 1033461
Number of terms: 647280
Number of postings: 179114512
Number of fields: 0
Number of tokens: 282858247
Field names: []
Positions:   false



In [26]:
dataset_AQUAINT = pt.get_dataset("irds:aquaint/trec-robust-2005")
bm25 = pt.BatchRetrieve(index_AQUAINT, wmodel="BM25")
dph = pt.BatchRetrieve(index_AQUAINT, wmodel="DPH")
pt.Experiment(
    [bm25, dph],
    dataset_AQUAINT.get_topics('title'), # Might also use  'description', 'narrative'
    dataset_AQUAINT.get_qrels(),
    eval_metrics=["map"]
)

INFO:root:If you have a local copy of https://trec.nist.gov/data/robust/05/05.50.topics.txt, you can symlink it here to avoid downloading it again: /home/mrim/gonzagab/.ir_datasets/downloads/c2e722e6bdfd00f088c6f6517db564ce
[INFO] If you have a local copy of https://trec.nist.gov/data/robust/05/05.50.topics.txt, you can symlink it here to avoid downloading it again: /home/mrim/gonzagab/.ir_datasets/downloads/c2e722e6bdfd00f088c6f6517db564ce
  2%|▊                                                | 68/4016 [18:02<1:32:08,  1.40s/files]INFO:root:[starting] https://trec.nist.gov/data/robust/05/05.50.topics.txt
[INFO] [starting] https://trec.nist.gov/data/robust/05/05.50.topics.txt                      
  2%|▊                                                | 68/4016 [18:03<1:32:08,  1.40s/files]
https://trec.nist.gov/data/robust/05/05.50.topics.txt: 0.0%| 0.00/25.1k [00:00<?, ?B/s][AINFO:root:[finished] https://trec.nist.gov/data/robust/05/05.50.topics.txt: [00:00] [25.1kB] [263kB/s]
      

Unnamed: 0,name,map
0,BR(BM25),0.180697
1,BR(DPH),0.199057


# Twitter 

In [17]:
files_tweets = pt.io.find_files("/home/mrim/galuscap/data/Twitter2013/Tweets2013-corpus/data")

# build the index
indexer_tweets = pt.TRECCollectionIndexer("/home/mrim/galuscap/data/indexes/tweets", verbose=True, blocks=False)
indexref_tweets = indexer_tweets.index(files_tweets)

# load the index, print the statistics
index_tweets = pt.IndexFactory.of(indexref_tweets)
print(index_tweets.getCollectionStatistics().toString())

  0%|                                                            | 0/1416 [00:00<?, ?files/s]

14:33:36.971 [main] WARN org.terrier.compression.bit.BitOutputStream - Could not open new BitOutputStream because it alleged file could not be found.
java.io.FileNotFoundException: /home/mrim/galuscap/data/indexes/tweets/data_1.direct.bf (Permission non accordÃ©e)
	at java.base/java.io.FileOutputStream.open0(Native Method)
	at java.base/java.io.FileOutputStream.open(FileOutputStream.java:298)
	at java.base/java.io.FileOutputStream.<init>(FileOutputStream.java:237)
	at java.base/java.io.FileOutputStream.<init>(FileOutputStream.java:126)
	at org.terrier.utility.io.LocalFileSystem.writeFileStream(LocalFileSystem.java:130)
	at org.terrier.utility.Files.writeFile(Files.java:370)
	at org.terrier.utility.Files.writeFileStream(Files.java:706)
	at org.terrier.compression.bit.BitOutputStream.<init>(BitOutputStream.java:113)
	at org.terrier.structures.bit.DirectInvertedOutputStream.<init>(DirectInvertedOutputStream.java:73)
	at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInsta

JavaException: JVM exception occurred: Could not instantiate MetaIndexBuilder org.terrier.structures.indexing.ZstdMetaIndexBuilder java.lang.IllegalArgumentException

In [None]:
import ir_datasets
dataset = ir_datasets.load("tweets2013-ia")
for doc in dataset.docs_iter():

In [None]:
# TREC DL

In [20]:
files_TRECDL = pt.io.find_files("/home/mrim/data/collection/msmarco_v2/passage/msmarco_v2_passage_trec_normalized")
print (files_TRECDL)
print(pt)

# build the index
indexer_TRECDL = pt.TRECCollectionIndexer("/home/mrim/galuscap/data/indexes/DL_index", verbose=True, blocks=False)
#indexref_TRECDL = indexer_TRECDL.index(files_TRECDL)

# load the index, print the statistics
#index_TRECDL = pt.IndexFactory.of(indexref_TRECDL)

['/home/mrim/data/collection/msmarco_v2/passage/msmarco_v2_passage_trec_normalized/msmarco_passage_00.gz.txt', '/home/mrim/data/collection/msmarco_v2/passage/msmarco_v2_passage_trec_normalized/msmarco_passage_01.gz.txt', '/home/mrim/data/collection/msmarco_v2/passage/msmarco_v2_passage_trec_normalized/msmarco_passage_02.gz.txt', '/home/mrim/data/collection/msmarco_v2/passage/msmarco_v2_passage_trec_normalized/msmarco_passage_03.gz.txt', '/home/mrim/data/collection/msmarco_v2/passage/msmarco_v2_passage_trec_normalized/msmarco_passage_04.gz.txt', '/home/mrim/data/collection/msmarco_v2/passage/msmarco_v2_passage_trec_normalized/msmarco_passage_05.gz.txt', '/home/mrim/data/collection/msmarco_v2/passage/msmarco_v2_passage_trec_normalized/msmarco_passage_06.gz.txt', '/home/mrim/data/collection/msmarco_v2/passage/msmarco_v2_passage_trec_normalized/msmarco_passage_07.gz.txt', '/home/mrim/data/collection/msmarco_v2/passage/msmarco_v2_passage_trec_normalized/msmarco_passage_08.gz.txt', '/home/mr

TypeError: 'NoneType' object is not callable