In [1]:
import os
import pandas as pd
import numpy as np
import glob

In [2]:
PYSERINI_FOLDER="/media/eduseiti/bigdata01/unicamp/ia368v_dd/pyserini"

In [3]:
CLUEWEB22_FOLDER="/media/eduseiti/bigdata01/unicamp/ia368v_dd/trabalho_final/clueweb22-pt_10M_sample_fixed/"

In [4]:
CLUEWEB22_1M_PT_BR_DATASET="clueweb22-pt_colbertx_cleaned_massiveweb_1M_sample.tsv"

BM25_OUTPUT_FOLDER="/media/eduseiti/bigdata01/unicamp/ia368v_dd/unicamp_IR/retrievals/bm25_massiveweb"
BM25_INDEX="/media/eduseiti/bigdata01/unicamp/ia368v_dd/unicamp_IR/retrievals/bm25_massiveweb/index"
BM25_RUNS="/media/eduseiti/bigdata01/unicamp/ia368v_dd/unicamp_IR/retrievals/bm25_massiveweb/runs"

In [5]:
QUERIES_FOLDER="/media/eduseiti/bigdata01/unicamp/ia368v_dd/unicamp_IR/tests"
QUERIES_PART_01="queries_validation.tsv"
QUERIES_PART_02="validation_queries_mirelle.tsv"
SAMPLED_QUERIES_PASSAGES="07_retrievers_docs_000_009_sampled_50_queries.tsv"

In [6]:
os.chdir(PYSERINI_FOLDER)

In [7]:
os.environ["ANSERINI_CLASSPATH"]="/media/eduseiti/bigdata01/unicamp/ia368v_dd/anserini/target"

### Converting the dataset to pyserini's format

In [28]:
dataset_df = pd.read_csv(os.path.join(CLUEWEB22_FOLDER, CLUEWEB22_1M_PT_BR_DATASET), 
                         sep="\t", header=None, names=['passage_id', 'passage', 'n_tokens', 'massiveweb_filters'])

In [29]:
dataset_df

Unnamed: 0,passage_id,passage,n_tokens,massiveweb_filters
0,5,"Se você precisar de ajuda, visite o website na...",361,0
1,6,Qual a parada final de Sintra para Cascais ôni...,382,0
2,7,"O Casino Estoril fica localizado no Estoril, a...",255,0
3,8,Sintra para Lisboa Sintra para Óbidos Leiria P...,283,0
4,11,Iniciar a digitalização Iniciar a digitalizaçã...,293,0
...,...,...,...,...
999995,9999960,"Pararam totalmente suas vidas, reagiram parand...",337,0
999996,9999967,Recicla Eletro - Coleta Gratuita de Lixo Eletr...,386,0
999997,9999981,É importante falar de liberdade ao falar de Ro...,261,0
999998,9999989,Os artigos 2º e artigos 84 a 86 da Lei nº 13.1...,375,0


In [30]:
output_filename = "{}_two_colunms.tsv".format(os.path.splitext(CLUEWEB22_1M_PT_BR_DATASET)[0])

In [31]:
two_columns_file = os.path.join(CLUEWEB22_FOLDER, output_filename)

In [32]:
dataset_df[dataset_df.columns[:2]].to_csv(two_columns_file, sep='\t', header=None, index=False)

In [33]:
!python3 tools/scripts/msmarco/convert_collection_to_jsonl.py \
    --collection-path {two_columns_file} \
    --output-folder {BM25_OUTPUT_FOLDER}

Converting collection...
Converted 0 docs, writing into file 1
Converted 100,000 docs, writing into file 1
Converted 200,000 docs, writing into file 1
Converted 300,000 docs, writing into file 1
Converted 400,000 docs, writing into file 1
Converted 500,000 docs, writing into file 1
Converted 600,000 docs, writing into file 1
Converted 700,000 docs, writing into file 1
Converted 800,000 docs, writing into file 1
Converted 900,000 docs, writing into file 1
Done!


## Indexação do dataset

In [34]:
!python3 -m pyserini.index.lucene \
    --collection JsonCollection \
    --input {BM25_OUTPUT_FOLDER} \
    --index {BM25_INDEX} \
    --language pt \
    --generator DefaultLuceneDocumentGenerator \
    --threads 9 \
    --storePositions --storeDocvectors --storeRaw

2024-01-02 20:57:48,472 INFO  [main] index.IndexCollection (IndexCollection.java:393) - Setting log level to INFO
2024-01-02 20:57:48,473 INFO  [main] index.IndexCollection (IndexCollection.java:396) - Starting indexer...
2024-01-02 20:57:48,473 INFO  [main] index.IndexCollection (IndexCollection.java:398) - DocumentCollection path: /media/eduseiti/bigdata01/unicamp/ia368v_dd/unicamp_IR/retrievals/bm25_massiveweb
2024-01-02 20:57:48,473 INFO  [main] index.IndexCollection (IndexCollection.java:399) - CollectionClass: JsonCollection
2024-01-02 20:57:48,474 INFO  [main] index.IndexCollection (IndexCollection.java:400) - Generator: DefaultLuceneDocumentGenerator
2024-01-02 20:57:48,474 INFO  [main] index.IndexCollection (IndexCollection.java:401) - Threads: 9
2024-01-02 20:57:48,474 INFO  [main] index.IndexCollection (IndexCollection.java:402) - Language: pt
2024-01-02 20:57:48,474 INFO  [main] index.IndexCollection (IndexCollection.java:403) - Stemmer: porter
2024-01-02 20:57:48,474 INFO 

### Perform search on the 50 sampled queries

In [35]:
from pyserini.search import LuceneSearcher, SimpleSearcher
from pyserini.analysis import get_lucene_analyzer

In [36]:
from datetime import datetime

#### First, consolidate all the queries

In [37]:
sampled_df = pd.read_csv(os.path.join(QUERIES_FOLDER, "sampled_queries_no_header.tsv"), sep='\t', names=['query_id', 'query'])

#### Perform the search

In [39]:
analyzer = get_lucene_analyzer("pt")
searcher = LuceneSearcher(BM25_INDEX)
searcher.set_analyzer(analyzer)

In [40]:
sampled_df.iloc[0]['query']

'Qual a maior característica da fauna brasileira?'

In [41]:
hits = searcher.search(sampled_df.iloc[1]['query'], k=1000)

In [42]:
hits

[<io.anserini.search.SimpleSearcher$Result at 0x7fb4b65c9cb0 jclass=io/anserini/search/SimpleSearcher$Result jself=<LocalRef obj=0x563e1833be30 at 0x7fb610951450>>,
 <io.anserini.search.SimpleSearcher$Result at 0x7fb6251dad40 jclass=io/anserini/search/SimpleSearcher$Result jself=<LocalRef obj=0x563e1833be38 at 0x7fb4b6502b30>>,
 <io.anserini.search.SimpleSearcher$Result at 0x7fb4b65ca610 jclass=io/anserini/search/SimpleSearcher$Result jself=<LocalRef obj=0x563e1833be40 at 0x7fb4b65025b0>>,
 <io.anserini.search.SimpleSearcher$Result at 0x7fb4b65cb380 jclass=io/anserini/search/SimpleSearcher$Result jself=<LocalRef obj=0x563e1833be48 at 0x7fb4b6502c10>>,
 <io.anserini.search.SimpleSearcher$Result at 0x7fb4b65cb330 jclass=io/anserini/search/SimpleSearcher$Result jself=<LocalRef obj=0x563e1833be50 at 0x7fb4b6501c30>>,
 <io.anserini.search.SimpleSearcher$Result at 0x7fb4b65cb290 jclass=io/anserini/search/SimpleSearcher$Result jself=<LocalRef obj=0x563e1833be60 at 0x7fb4ceccc970>>,
 <io.anser

In [43]:
for hit in hits[:5]:
    print("passage_id: {}, score: {}, passage: {}".format(hit.docid, hit.score, hit.raw))

passage_id: 9973433, score: 16.687700271606445, passage: {
  "id" : "9973433",
  "contents" : "\"Quais são os países mais miscigenados do mundo, além do Brasil? - Quora Quais são os países mais miscigenados do mundo, além do Brasil? 2 respostas Alcides Araujo Jr , estudou Engenharia eletrônica e de telecomunicações em UTFPR Curitiba (2011) Respondido Há 1 ano O nível de miscigenação do Brasil não chega nem perto daquele das 3 Guianas (Guiana, Suriname e Guinana Francesa) ou dos países do Caribe. Abaixo, um vídeo ilustrativo sobre a Guiana e o Suriname: Outro mais geral, do mesmo canal: Mais sobre os povos do Caribe: Em países como Suriname e a Guiana, consegue-se ter mistura de: Nativos americanos (índios) Africanos Indianos Indonésios Europeus Entre outros. Aqui o vídeo de um brasileiro que foi para lá de moto: Alguns out Continuar a ler Perguntas relacionadas Mais respostas abaixo Como são os cemitérios dos países mais populosos do mundo? Quais outros países além do Brasil tem uma cu

#### Perform bulk search

In [44]:
topics_path = os.path.join(QUERIES_FOLDER, "sampled_queries_no_header.tsv")

execution_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
pyserini_runfile = os.path.join(BM25_RUNS, "run_{}.txt".format(execution_timestamp))

In [45]:
!python3 -m pyserini.search.lucene \
    --language "pt" \
    --index {BM25_INDEX} \
    --topics {topics_path} \
    --output {pyserini_runfile} \
    --output-format trec \
    --hits 1000 \
    --bm25 --k1 0.82 --b 0.68 \
    --threads 8

Setting BM25 parameters: k1=0.82, b=0.68
Running /media/eduseiti/bigdata01/unicamp/ia368v_dd/unicamp_IR/tests/sampled_queries_no_header.tsv topics, saving to /media/eduseiti/bigdata01/unicamp/ia368v_dd/unicamp_IR/retrievals/bm25_massiveweb/runs/run_20240102_210158.txt...
100%|███████████████████████████████████████████| 50/50 [00:03<00:00, 16.49it/s]


### Convert document ID to original clueweb22 ID

In [46]:
run_file = pd.read_csv(pyserini_runfile, sep=' ', header=None, names=['query_id', 'Q0', 'colbertx_id', 'position', 'score', 'comment'])

In [47]:
run_file

Unnamed: 0,query_id,Q0,colbertx_id,position,score,comment
0,1,Q0,264425,1,11.670300,Anserini
1,1,Q0,3718313,2,10.147600,Anserini
2,1,Q0,4178525,3,9.934500,Anserini
3,1,Q0,3299432,4,9.916500,Anserini
4,1,Q0,2605802,5,9.785500,Anserini
...,...,...,...,...,...,...
49995,199,Q0,1363114,996,4.064500,Anserini
49996,199,Q0,5680245,997,4.064499,Anserini
49997,199,Q0,6674135,998,4.064498,Anserini
49998,199,Q0,6887827,999,4.064497,Anserini


In [48]:
id_conversion_df = pd.read_csv(os.path.join(CLUEWEB22_FOLDER, "clueweb22-pt_colbertx_to_clueweb_id.tsv"), sep="\t", names=['colbertx_id', 'clueweb_id'])

In [49]:
run_file_converted_id = run_file.merge(id_conversion_df).sort_values(['query_id', 'position'])

In [50]:
converted_run_filename = os.path.join(BM25_RUNS, "{}_converted.txt".format(os.path.splitext(os.path.basename(pyserini_runfile))[0]))

In [51]:
converted_run_filename

'/media/eduseiti/bigdata01/unicamp/ia368v_dd/unicamp_IR/retrievals/bm25_massiveweb/runs/run_20240102_210158_converted.txt'

In [52]:
run_file_converted_id[['query_id', 'Q0', 'clueweb_id', 'position', 'score', 'comment']].to_csv(converted_run_filename, sep=" ", header=None, index=False)

### Evaluate the result

In [53]:
qrels_filename = os.path.join(QUERIES_FOLDER, "evaluation_08_retrievers_000_009_sampled_50_queries_gpt-4-1106-preview_20231227_qrel.txt")

In [54]:
result = !/media/eduseiti/bigdata01/unicamp/ia368v_dd/pyserini/tools/eval/trec_eval.9.0.4/trec_eval -c -mndcg_cut.10 \
    {qrels_filename} {converted_run_filename}

In [55]:
result

['ndcg_cut_10           \tall\t0.1657']

### Check GPT-4 execution log to verify how many passages have been annotated

In [56]:
import pickle

In [57]:
with open(os.path.join(QUERIES_FOLDER, "execution_with_reverse_rev_2.1_sampled_50_queries_gpt-4-1106-preview_20231227.pkl"), "rb") as input_file:
    logs = pickle.load(input_file)

In [58]:
logs.keys()

dict_keys(['args', 'evaluation_cache', 'reverse_mapping'])

In [59]:
run_file_converted_id = run_file_converted_id.merge(sampled_df)

In [60]:
top_10_df = run_file_converted_id.groupby('query_id').head(10)

In [61]:
already_analized = 0

for i, row in top_10_df.iterrows():
    if (row['query'] + "_" + row['clueweb_id']) in logs['evaluation_cache']:
        print("Already analized...")

        already_analized += 1

Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...


In [62]:
already_analized

128

In [63]:
top_10_df.shape

(500, 8)

In [64]:
top_10_df

Unnamed: 0,query_id,Q0,colbertx_id,position,score,comment,clueweb_id,query
0,1,Q0,264425,1,11.6703,Anserini,clueweb22-pt0000-05-02747_0,Qual a maior característica da fauna brasileira?
1,1,Q0,3718313,2,10.1476,Anserini,clueweb22-pt0000-78-09747_0,Qual a maior característica da fauna brasileira?
2,1,Q0,4178525,3,9.9345,Anserini,clueweb22-pt0000-87-14633_2,Qual a maior característica da fauna brasileira?
3,1,Q0,3299432,4,9.9165,Anserini,clueweb22-pt0000-69-05954_7,Qual a maior característica da fauna brasileira?
4,1,Q0,2605802,5,9.7855,Anserini,clueweb22-pt0000-55-02859_0,Qual a maior característica da fauna brasileira?
...,...,...,...,...,...,...,...,...
49005,199,Q0,2320343,6,11.0092,Anserini,clueweb22-pt0000-48-17441_2,Quais foram os principais artistas do Impressi...
49006,199,Q0,9715417,7,10.6608,Anserini,clueweb22-pt0002-04-01418_0,Quais foram os principais artistas do Impressi...
49007,199,Q0,5978727,8,10.0990,Anserini,clueweb22-pt0001-24-17228_0,Quais foram os principais artistas do Impressi...
49008,199,Q0,410165,9,10.0778,Anserini,clueweb22-pt0000-08-07683_2,Quais foram os principais artistas do Impressi...


### Add passage text to run GPT-4 evaluation

In [65]:
dataset_df

Unnamed: 0,passage_id,passage,n_tokens,massiveweb_filters
0,5,"Se você precisar de ajuda, visite o website na...",361,0
1,6,Qual a parada final de Sintra para Cascais ôni...,382,0
2,7,"O Casino Estoril fica localizado no Estoril, a...",255,0
3,8,Sintra para Lisboa Sintra para Óbidos Leiria P...,283,0
4,11,Iniciar a digitalização Iniciar a digitalizaçã...,293,0
...,...,...,...,...
999995,9999960,"Pararam totalmente suas vidas, reagiram parand...",337,0
999996,9999967,Recicla Eletro - Coleta Gratuita de Lixo Eletr...,386,0
999997,9999981,É importante falar de liberdade ao falar de Ro...,261,0
999998,9999989,Os artigos 2º e artigos 84 a 86 da Lei nº 13.1...,375,0


In [66]:
dataset_with_clueweb_id_df = dataset_df.merge(id_conversion_df, left_on="passage_id", right_on="colbertx_id")

In [67]:
dataset_with_clueweb_id_df

Unnamed: 0,passage_id,passage,n_tokens,massiveweb_filters,colbertx_id,clueweb_id
0,5,"Se você precisar de ajuda, visite o website na...",361,0,5,clueweb22-pt0000-00-00003_1
1,6,Qual a parada final de Sintra para Cascais ôni...,382,0,6,clueweb22-pt0000-00-00003_2
2,7,"O Casino Estoril fica localizado no Estoril, a...",255,0,7,clueweb22-pt0000-00-00003_3
3,8,Sintra para Lisboa Sintra para Óbidos Leiria P...,283,0,8,clueweb22-pt0000-00-00003_5
4,11,Iniciar a digitalização Iniciar a digitalizaçã...,293,0,11,clueweb22-pt0000-00-00008_0
...,...,...,...,...,...,...
999995,9999960,"Pararam totalmente suas vidas, reagiram parand...",337,0,9999960,clueweb22-pt0002-45-01752_2
999996,9999967,Recicla Eletro - Coleta Gratuita de Lixo Eletr...,386,0,9999967,clueweb22-pt0002-45-01757_0
999997,9999981,É importante falar de liberdade ao falar de Ro...,261,0,9999981,clueweb22-pt0002-45-01760_4
999998,9999989,Os artigos 2º e artigos 84 a 86 da Lei nº 13.1...,375,0,9999989,clueweb22-pt0002-45-01762_5


In [68]:
dataset_with_clueweb_id_df[['clueweb_id', 'passage', 'n_tokens', 'colbertx_id']].to_csv(os.path.join(CLUEWEB22_FOLDER, "clueweb22-pt_cleaned_massiveweb_1M_sample.tsv"), 
                                                                                        sep="\t", header=None, index=False)

In [69]:
dataset_with_clueweb_id_df[['clueweb_id', 'passage']].to_csv(os.path.join(CLUEWEB22_FOLDER, "clueweb22-pt_cleaned_massiveweb_1M_sample_two_columns.tsv"), 
                                                             sep="\t", header=None, index=False)

In [70]:
top_10_df = top_10_df.merge(dataset_with_clueweb_id_df[['clueweb_id', 'passage']]).sort_values(['query_id', 'position'])

In [71]:
top_10_df

Unnamed: 0,query_id,Q0,colbertx_id,position,score,comment,clueweb_id,query,passage
0,1,Q0,264425,1,11.6703,Anserini,clueweb22-pt0000-05-02747_0,Qual a maior característica da fauna brasileira?,BIOMAS E FITOFISIONOMIAS BRASILEIROS: CARACTER...
1,1,Q0,3718313,2,10.1476,Anserini,clueweb22-pt0000-78-09747_0,Qual a maior característica da fauna brasileira?,Onça-pintada - Escola Kids Qual matéria está p...
2,1,Q0,4178525,3,9.9345,Anserini,clueweb22-pt0000-87-14633_2,Qual a maior característica da fauna brasileira?,- Fauna e Flora A fauna da Amazônia é represen...
3,1,Q0,3299432,4,9.9165,Anserini,clueweb22-pt0000-69-05954_7,Qual a maior característica da fauna brasileira?,"Aprenda sobre a Bacia do Tocantins-Araguaia, a..."
4,1,Q0,2605802,5,9.7855,Anserini,clueweb22-pt0000-55-02859_0,Qual a maior característica da fauna brasileira?,Fauna e Flora: : entenda o que são e quais as ...
...,...,...,...,...,...,...,...,...,...
495,199,Q0,2320343,6,11.0092,Anserini,clueweb22-pt0000-48-17441_2,Quais foram os principais artistas do Impressi...,O IMPRESSIONISMO foi um movimento artístico qu...
496,199,Q0,9715417,7,10.6608,Anserini,clueweb22-pt0002-04-01418_0,Quais foram os principais artistas do Impressi...,O Impressionismo de Monet | Cultura - Cultura ...
497,199,Q0,5978727,8,10.0990,Anserini,clueweb22-pt0001-24-17228_0,Quais foram os principais artistas do Impressi...,A cultura da gare - 1534 Palavras | Trabalhosf...
498,199,Q0,410165,9,10.0778,Anserini,clueweb22-pt0000-08-07683_2,Quais foram os principais artistas do Impressi...,Severini e o Cubismo Os traços do futurismo ga...


In [72]:
top_10_df.rename(columns={'clueweb_id': 'passage_id'}).to_csv(os.path.join(QUERIES_FOLDER, "bm25_massiveweb_1M_docs_000_009_sampled_50_queries.tsv"), sep="\t", index=False)

### Check how many passages in the 1000 retrieved per query had already been analyzed by GPT-4

In [73]:
already_analized = 0

for i, row in run_file_converted_id.iterrows():
    if (row['query'] + "_" + row['clueweb_id']) in logs['evaluation_cache']:
        print("Already analized...")

        already_analized += 1

Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...
Already analized...


In [74]:
already_analized

260

In [None]:
'_'.join('Onde está localizada a Praça XV de Novembro?_clueweb22-pt0000-44-08794_4'.split('_')[1:])

In [None]:
all_analized_passages = ['_'.join(which_key.split('_')[1:]) for which_key in list(logs['evaluation_cache'].keys())]

In [None]:
len(all_analized_passages)

In [None]:
analized_passages = np.unique(all_analized_passages)

In [None]:
analized_passages.shape

In [None]:
pd.DataFrame(analized_passages, columns=["clueweb_id"]).merge(pd.DataFrame(dataset_with_clueweb_id_df['clueweb_id'].drop_duplicates()))