<a href="https://colab.research.google.com/github/finardi/tutos/blob/master/RAG_paper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive; the dataset path used later lives under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Install Libs

In [None]:
%%shell
# Python dependencies for this notebook; the first three install quietly (-q).
pip install faiss-cpu -q
pip install pyserini -q
pip install ftfy -q
pip install tiktoken

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m72.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m142.1/142.1 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m71.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m70.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.7/188.7 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.4/6.4 MB[0m [31m95.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.7/224.7 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m81.4 MB/s[



In [None]:
%%shell
# Install Maven quietly, then clone pyserini together with its submodules.
apt-get install maven -qq
git clone --recurse-submodules https://github.com/castorini/pyserini.git
cd pyserini
# Unpack and build trec_eval 9.0.4, then step back up to the pyserini checkout root.
cd tools/eval && tar xvfz trec_eval.9.0.4.tar.gz && cd trec_eval.9.0.4 && make && cd ../../..
# Build ndeval and return to the checkout root again.
cd tools/eval/ndeval && make && cd ../../..

Extracting templates from packages: 100%
Selecting previously unselected package libapache-pom-java.
(Reading database ... 121654 files and directories currently installed.)
Preparing to unpack .../00-libapache-pom-java_18-1_all.deb ...
Unpacking libapache-pom-java (18-1) ...
Selecting previously unselected package libatinject-jsr330-api-java.
Preparing to unpack .../01-libatinject-jsr330-api-java_1.0+ds1-5_all.deb ...
Unpacking libatinject-jsr330-api-java (1.0+ds1-5) ...
Selecting previously unselected package libgeronimo-interceptor-3.0-spec-java.
Preparing to unpack .../02-libgeronimo-interceptor-3.0-spec-java_1.0.1-4fakesync_all.deb ...
Unpacking libgeronimo-interceptor-3.0-spec-java (1.0.1-4fakesync) ...
Selecting previously unselected package libcdi-api-java.
Preparing to unpack .../03-libcdi-api-java_1.2-3_all.deb ...
Unpacking libcdi-api-java (1.2-3) ...
Selecting previously unselected package libcommons-cli-java.
Preparing to unpack .../04-libcommons-cli-java_1.4-2_all.deb ...



## Import Libs

In [None]:
import os
import re
import ftfy
import math
import json
import torch
import shutil
import numpy as np
import pandas as pd
import collections
import tiktoken

In [None]:
from tqdm import tqdm
from time import time
from IPython.display import display
from collections import defaultdict, Counter
from pyserini.search.lucene import LuceneSearcher

In [None]:
import locale
# Make locale.getpreferredencoding() always report UTF-8 for the rest of the session.
# NOTE(review): presumably a workaround for Colab `!` shell commands that require a
# UTF-8 locale — confirm it is still needed.
locale.getpreferredencoding = lambda: "UTF-8"

## Dataset

In [None]:
# Google Drive folder that holds the topics / chunks / qrels TSV files
path = "/content/drive/MyDrive/22h/RAG/data/"

In [None]:
# clear data
!rm -rf "topics.tsv" "chunk.tsv" "qrels.tsv"

# select data to run
data_selector = "all"

# data selector
if data_selector == "train":
    print(f"Downloading '{data_selector}' data")
    shutil.copyfile(os.path.join(path, "topics_train.tsv"), "/content/topics.tsv")
    shutil.copyfile(os.path.join(path, "corpus_chunk_train.tsv"), "/content/chunk.tsv")
    shutil.copyfile(os.path.join(path, "qrels_test.tsv"), "/content/qrels.tsv") # qrels_train == qrels_test
elif data_selector == "test":
    print(f"Downloading '{data_selector}' data")
    shutil.copyfile(os.path.join(path, "topics_test.tsv"), "/content/topics.tsv")
    shutil.copyfile(os.path.join(path, "corpus_chunk_test.tsv"), "/content/chunk.tsv")
    shutil.copyfile(os.path.join(path, "qrels_test.tsv"), "/content/qrels.tsv")
elif data_selector == "all":
    print(f"Downloading '{data_selector}' data")
    shutil.copyfile(os.path.join(path, "topics.tsv"), "/content/topics.tsv")
    shutil.copyfile(os.path.join(path, "corpus_chunk.tsv"), "/content/chunk.tsv")
    shutil.copyfile(os.path.join(path, "qrels.tsv"), "/content/qrels.tsv")
else:
    print("Select valid data")

Downloading 'all' data


In [None]:
# load topics: one "<id>\t<question>" record per line
input_tsv = '/content/topics.tsv'
topics = {}
with open(input_tsv, 'r', encoding='utf-8') as f_in:
    for line in tqdm(f_in, desc=f'Reading file on {input_tsv}'):
        line = line.strip()
        if not line:
            continue  # a blank line (e.g. at end of file) carries no record
        # Split on the first tab only, so a tab inside the text cannot break the unpacking.
        topic_id, text = line.split('\t', 1)
        # str.split() with no argument collapses every run of whitespace (tabs and
        # newlines included) before the text is passed through ftfy.fix_text.
        topics[int(topic_id)] = ftfy.fix_text(" ".join(text.split()))

Reading file on /content/topics.tsv: 140it [00:00, 8055.24it/s]


In [None]:
list(topics.items())[:3]

[(0,
  'Qual é o segredo que o Sr. e a Sra. Dursley temiam que alguém descobrisse?'),
 (1,
  'Qual era a cor da capa que o Sr. Dursley viu um homem mais velho do que ele usando, que o fez pensar que poderia ser uma promoção boba?'),
 (2,
  'Qual foi a reação do Sr. Dursley ao ser abraçado por um estranho que usava uma capa roxa e o chamou de trouxa?')]

In [None]:
# load chunks: one "<id>\t<chunk text>" record per line
input_tsv = '/content/chunk.tsv'
corpus = {}
with open(input_tsv, 'r', encoding='utf-8') as f_in:
    for line in tqdm(f_in, desc=f'Reading file on {input_tsv}'):
        line = line.strip()
        if not line:
            continue  # a blank line (e.g. at end of file) carries no record
        # Split on the first tab only, so a tab inside the text cannot break the unpacking.
        chunk_id, text = line.split('\t', 1)
        # str.split() with no argument collapses every run of whitespace (tabs and
        # newlines included) before the text is passed through ftfy.fix_text.
        corpus[int(chunk_id)] = ftfy.fix_text(" ".join(text.split()))

Reading file on /content/chunk.tsv: 140it [00:00, 438.77it/s]


In [None]:
list(corpus.items())[:3]

[(0,
  '- CAPÍTULO UM - O menino que sobreviveu O Sr. e a Sra. Dursley, da rua dos Alfeneiros, no 4, se orgulhavam de dizer que eram perfeitamente normais, muito bem, obrigado. Eram as últimas pessoas no mundo que se esperaria que se metessem em alguma coisa estranha ou misteriosa, porque simplesmente não compactuavam com esse tipo de bobagem. O Sr. Dursley era diretor de uma firma chamada Grunnings, que fazia perfurações. Era um homem alto e corpulento quase sem pescoço, embora tivesse enormes bigodes. A Sra. Dursley era magra e loura e tinha um pescoço quase duas vezes mais comprido que o normal, o que era muito útil porque ela passava grande parte do tempo espichando-o por cima da cerca do jardim para espiar os vizinhos. Os Dursley tinham um filhinho chamado Dudley, o Duda, e em sua opinião não havia garoto melhor em nenhum lugar do mundo. Os Dursley tinham tudo que queriam, mas tinham também um segredo, e seu maior receio era que alguém o descobrisse. Achavam que não iriam aguentar

In [None]:
# load qrels FAQ like: headerless TSV, columns named after reading
qrels = pd.read_csv(
    "/content/qrels.tsv",
    sep="\t",
    header=None,
).set_axis(["query_id", "0", "doc_id", "rel"], axis=1)
qrels

Unnamed: 0,query_id,0,doc_id,rel
0,0,0,0,1
1,1,0,1,1
2,2,0,2,1
3,3,0,3,1
4,4,0,4,1
...,...,...,...,...
135,135,0,135,1
136,136,0,136,1
137,137,0,137,1
138,138,0,138,1


In [None]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Return the number of tokens `string` occupies under a tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Name of the tiktoken encoding to use, e.g. "cl100k_base".

    Returns:
        Length of the token sequence produced by encoding `string`.
    """
    # Honor the requested encoding. Previously this argument was ignored and the
    # encoding of "text-embedding-ada-002" was always used.
    encoding = tiktoken.get_encoding(encoding_name)
    return len(encoding.encode(string))

In [None]:
# Token-count statistics of the corpus chunks, counted with "cl100k_base"
doc_lengths = pd.DataFrame(corpus.items(), columns = ["id", "text"])
doc_lengths["n_tokens"] = [
    num_tokens_from_string(chunk_text, "cl100k_base")
    for chunk_text in corpus.values()
]
doc_lengths["n_tokens"].describe()

count     140.000000
mean      992.235714
std        73.317837
min       131.000000
25%       998.000000
50%       999.000000
75%       999.000000
max      1001.000000
Name: n_tokens, dtype: float64

## Build Folders

In [None]:
# Recreate empty "runs" and "evals" folders, discarding anything from a previous execution.
!rm -rf runs
!rm -rf evals
!mkdir runs
!mkdir evals

In [None]:
# Parsed trec_eval tables, keyed by experiment name; filled in by the experiment cells below.
trec_results = {}

## BM25

* https://github.com/castorini/anserini/blob/master/src/main/resources/regression/msmarco-passage.yaml
* https://github.com/castorini/pyserini/blob/f75adca8c410e64b3ff1375e181a0ea3af1ddb28/docs/usage-index.md

In [None]:
!rm -rf /content/collections
!mkdir /content/collections

# Save corpus as jsonl file
output_json = f'/content/collections/corpus.jsonl'

with open(output_json,'w') as f_out:
    for doc_id, doc_text in tqdm(corpus.items(), desc=f'Writing file on {output_json}'):
        output_dict = {'id': doc_id, 'contents': doc_text}
        f_out.write(json.dumps(output_dict, ensure_ascii=False) + '\n')

Writing file on /content/collections/corpus.jsonl: 100%|██████████| 140/140 [00:00<00:00, 14854.61it/s]


In [None]:
# Remove any index left by a previous execution.
!rm -rf indexes

# Create JsonCollection from jsonl file: index every file under /content/collections
# into indexes/lucene-index-trec-hp with --language pt, keeping positions,
# document vectors and the raw documents in the index.
!python -m pyserini.index.lucene \
--collection JsonCollection \
--input /content/collections \
--index indexes/lucene-index-trec-hp \
--generator DefaultLuceneDocumentGenerator \
--threads 9 \
--language pt \
--storePositions \
--storeDocvectors \
--storeRaw

2024-01-09 03:58:20,904 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:204) - Setting log level to INFO
2024-01-09 03:58:20,907 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:208) - AbstractIndexer settings:
2024-01-09 03:58:20,907 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:209) -  + DocumentCollection path: /content/collections
2024-01-09 03:58:20,908 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:210) -  + CollectionClass: JsonCollection
2024-01-09 03:58:20,908 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:211) -  + Index path: indexes/lucene-index-trec-hp
2024-01-09 03:58:20,908 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:212) -  + Threads: 9
2024-01-09 03:58:20,909 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:213) -  + Optimize (merge segments)? false
2024-01-09 03:58:20,945 INFO  [main] index.IndexCollection (IndexCollection.java:237) - Using language-specific analyzer
2024-01-09 03:58:20,946 INFO  [ma

In [None]:
# Use the pyserini package to rank (Lucene BM25)
experiment = "bm25.lucene"
run_file = f"runs/run.hp.{experiment}.txt"
eval_file = f"evals/eval.hp.{experiment}.txt"

# Remove the outputs of a previous execution of this experiment.
!rm -rf $run_file
!rm -rf $eval_file

# Search every topic in topics.tsv (up to 1000 hits each) and write a TREC-format run.
!python -m pyserini.search.lucene \
  --index "indexes/lucene-index-trec-hp" \
  --topics "topics.tsv" \
  --output $run_file \
  --output-format trec \
  --language pt \
  --hits 1000 \
  --bm25 --k1 0.82 --b 0.68

# Evaluate the run against qrels.tsv; trec_eval's stdout and stderr both go to eval_file.
!/content/pyserini/tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank -m recall.3,5,7,9 qrels.tsv $run_file > $eval_file 2>&1
# Parse the tab-separated report and store a copy under this experiment's name.
trec_aux = pd.read_csv(eval_file, sep = "\t", names = ["metric", "all", "value"])
trec_aux["experiment"] = experiment
trec_results[experiment] = trec_aux.copy()

Setting BM25 parameters: k1=0.82, b=0.68
Running topics.tsv topics, saving to runs/run.hp.bm25.lucene.txt...
100% 140/140 [00:02<00:00, 69.08it/s]


In [None]:
trec_results[experiment]

Unnamed: 0,metric,all,value,experiment
0,recip_rank,all,0.8794,bm25.lucene
1,recall_3,all,0.9143,bm25.lucene
2,recall_5,all,0.9714,bm25.lucene
3,recall_7,all,0.9857,bm25.lucene
4,recall_9,all,0.9857,bm25.lucene


## ADA002

In [None]:
from google.colab import userdata
from openai import OpenAI

In [None]:
# OpenAI client; the API key is read from the Colab user-data entry 'openai-leonardo'.
client = OpenAI(api_key=userdata.get('openai-leonardo'))

In [None]:
def get_embedding(texts, model="text-embedding-ada-002"):
    """Embed `texts` with a single call to the embeddings endpoint.

    Uses the module-level `client` and returns the `.embedding` of every item
    in the response data, in response order.
    """
    response = client.embeddings.create(input = texts, model=model)
    vectors = []
    for item in response.data:
        vectors.append(item.embedding)
    return vectors

In [None]:
def ada002_search(corpus, topics):
    """Score every corpus chunk against every question with `get_embedding` vectors.

    Args:
        corpus: Mapping of chunk id -> chunk text.
        topics: Mapping of id -> question text; it is looked up with the chunk ids,
            so each chunk is paired with the question that has the same id.

    Returns:
        A tuple `(sorted_similarity, df)`. `sorted_similarity` maps each id to a
        list of `(doc_id, score)` pairs sorted by descending score; `df` is the
        working DataFrame holding the texts and their embeddings.
    """
    df = pd.DataFrame(list(corpus.items()), columns=['id', 'conteudo'])

    df["pergunta"] = df["id"].map(topics)
    df["pergunta_emb"] = get_embedding(df["pergunta"].values.tolist())
    df["conteudo_emb"] = get_embedding(df["conteudo"].values.tolist())

    question_vectors = np.vstack(df["pergunta_emb"])
    chunk_vectors = np.vstack(df["conteudo_emb"])
    # Dot product of each question with each chunk: one row per question, one column per chunk.
    score_rows = np.dot(question_vectors, chunk_vectors.T)

    ids = df["id"].values.tolist()
    similarity = {}
    for query_id, row in zip(ids, score_rows):
        # The same id list labels both the rows (questions) and the columns (chunks).
        similarity.setdefault(query_id, []).extend(zip(ids, row))

    sorted_similarity = {
        query_id: sorted(hits, key=lambda pair: pair[1], reverse=True)
        for query_id, hits in similarity.items()
    }

    return sorted_similarity, df

In [None]:
# Embed all questions and chunks and rank every chunk for every question.
sorted_similarity, df_ada002 = ada002_search(corpus, topics)

# Save run file
experiment = "ada002"
run_file = f"runs/run.hp.{experiment}.txt"
eval_file = f"evals/eval.hp.{experiment}.txt"

# Remove the outputs of a previous execution of this experiment.
!rm -rf $run_file
!rm -rf $eval_file

# One tab-separated line per (query, document): query_id, Q0, doc_id, rank, score, tag.
with open(run_file,'w') as f_out:
    for query_id, hits in tqdm(sorted_similarity.items(), desc=f'Writing file on {run_file}'):
        rank = 1  # hits are already sorted by descending score, so rank counts up from 1
        for doc_id, score in hits:
            f_out.write(f'{query_id}\tQ0\t{doc_id}\t{rank}\t{score}\t{"ADA002"}\n')
            rank+=1

# Evaluate the run against qrels.tsv; trec_eval's stdout and stderr both go to eval_file.
!/content/pyserini/tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank -m recall.3,5,7,9 qrels.tsv $run_file > $eval_file 2>&1
# Parse the tab-separated report and store a copy under this experiment's name.
trec_aux = pd.read_csv(eval_file, sep = "\t", names = ["metric", "all", "value"])
trec_aux["experiment"] = experiment
trec_results[experiment] = trec_aux.copy()

Writing file on runs/run.hp.ada002.txt: 100%|██████████| 140/140 [00:00<00:00, 4481.13it/s]


In [None]:
df_ada002.to_csv("df_ada002.csv", sep = "\t", index = False)

In [None]:
trec_results[experiment]

Unnamed: 0,metric,all,value,experiment
0,recip_rank,all,0.5839,ada002
1,recall_3,all,0.6214,ada002
2,recall_5,all,0.7286,ada002
3,recall_7,all,0.8,ada002
4,recall_9,all,0.8714,ada002


## Hybrid Search (BM25 + ADA002)

* https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf

In [None]:
# save run file
# Fuse the BM25 and ADA002 runs with pyserini.fusion, once per --rrf.k value.
rrf_k_list = [0, 60]
for rrf_k in rrf_k_list:
    experiment = f"hybrid_bm25_ada002_k_{rrf_k}"
    run_file = f'runs/run.hp.{experiment}.txt'
    eval_file = f'evals/eval.hp.{experiment}.txt'

    # Input runs written by the BM25 and ADA002 experiment cells.
    run_file1_path = 'runs/run.hp.bm25.lucene.txt'
    run_file2_path = 'runs/run.hp.ada002.txt'

    !python -m pyserini.fusion \
      --runs $run_file1_path $run_file2_path \
      --output $run_file \
      --rrf.k $rrf_k \
      --k 1000

    # Evaluate the fused run; trec_eval's stdout and stderr both go to eval_file.
    !/content/pyserini/tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank -m recall.3,5,7,9 qrels.tsv $run_file > $eval_file 2>&1
    # Parse the tab-separated report and store a copy under this experiment's name.
    trec_aux = pd.read_csv(eval_file, sep = "\t", names = ["metric", "all", "value"])
    trec_aux["experiment"] = experiment
    trec_results[experiment] = trec_aux.copy()

In [None]:
# Show the stored metric table of each BM25 + ADA002 fusion experiment.
rrf_k_list = [0, 60]
for rrf_k in rrf_k_list:
    experiment = f"hybrid_bm25_ada002_k_{rrf_k}"
    display(trec_results[experiment])

Unnamed: 0,metric,all,value,experiment
0,recip_rank,all,0.8177,hybrid_bm25_ada002_k_0
1,recall_3,all,0.9429,hybrid_bm25_ada002_k_0
2,recall_5,all,0.9786,hybrid_bm25_ada002_k_0
3,recall_7,all,0.9857,hybrid_bm25_ada002_k_0
4,recall_9,all,0.9929,hybrid_bm25_ada002_k_0


Unnamed: 0,metric,all,value,experiment
0,recip_rank,all,0.7702,hybrid_bm25_ada002_k_60
1,recall_3,all,0.8714,hybrid_bm25_ada002_k_60
2,recall_5,all,0.9143,hybrid_bm25_ada002_k_60
3,recall_7,all,0.9286,hybrid_bm25_ada002_k_60
4,recall_9,all,0.95,hybrid_bm25_ada002_k_60


## ADA002 CUSTOM

In [None]:
def embedding_multiplied_by_matrix(embedding, matrix):
    """Project one embedding through `matrix` and return a NumPy array.

    The embedding is turned into a float32 tensor and contracted with `matrix`
    over the embedding's only axis (einsum "b, bc -> c").
    """
    as_tensor = torch.tensor(embedding).float()
    projected = torch.einsum("b, bc -> c", as_tensor, matrix)
    return projected.detach().numpy()


def apply_matrix_to_embeddings_dataframe(matrix, df):
    """Add matrix-projected embeddings and their cosine similarity to `df` in place.

    For "pergunta_embedding" and "chunk_embedding" a "<column>_custom" column is
    written with every vector passed through `embedding_multiplied_by_matrix`;
    "cosine_similarity_custom" then holds the row-wise cosine similarity of the
    two projected vectors. Nothing is returned.
    """
    def project(vector):
        return embedding_multiplied_by_matrix(vector, matrix)

    def row_cosine(row):
        question = row["pergunta_embedding_custom"]
        chunk = row["chunk_embedding_custom"]
        return np.dot(question, chunk) / (np.linalg.norm(question) * np.linalg.norm(chunk))

    for source_column in ("pergunta_embedding", "chunk_embedding"):
        df[f"{source_column}_custom"] = df[source_column].apply(project)

    df["cosine_similarity_custom"] = df.apply(row_cosine, axis=1)

In [None]:
df_custom = pd.read_csv(f"{path}/tozip/df_final_to_customize_embds_HP_2negs.csv")

In [None]:
df_custom.head()

Unnamed: 0,pergunta,contexto,resposta,chunk,label,dataset,pergunta_embedding,chunk_embedding,cosine_similarity
0,Qual é o segredo que o Sr. e a Sra. Dursley te...,"Os Dursley tinham tudo que queriam, mas tinham...",a existência dos Potter,- CAPÍTULO UM - O menino que sobreviveu O Sr. ...,1,train,"[0.022689364850521088, 0.0038326631765812635, ...","[0.014419169165194035, 0.011491919867694378, -...",0.873794
1,Qual era a cor da capa que o Sr. Dursley viu u...,O Sr. Dursley não tolerava gente que andava co...,uma capa verde-esmeralda,ante o caminho para a cidade ele não pensou em...,1,train,"[0.005216388031840324, 0.0008732206770218909, ...","[-0.002306348877027631, -0.0014758666511625051...",0.873999
2,Qual foi a reação do Sr. Dursley ao ser abraça...,Levou alguns segundos até o Sr. Dursley perceb...,O Sr. Dursley ficou pregado no chão. Fora abra...,em alguém parado ali à porta. - Desculpe - mur...,1,train,"[-0.009695776738226414, -0.0014956077793613076...","[0.004920588340610266, 0.004709234461188316, 0...",0.884103
3,Qual é o nome completo do personagem que apare...,Ninguém jamais vislumbrara nada parecido com e...,O nome dele era Alvo Dumbledore.,"orrecida. Afinal, normalmente fingiam que ela ...",1,train,"[0.004215216264128685, 0.00600033113732934, 0....","[0.008049095049500465, -0.0045163994655013084,...",0.812161
4,Qual objeto Dumbledore utilizou para apagar os...,Encontrou o que procurava no bolso interior da...,Ele acionou o 'apagueiro' doze vezes.,"pando a capa, procurando alguma coisa. Mas par...",1,train,"[0.001769782043993473, 0.012089592404663563, 0...","[0.007251789793372154, 0.003910213243216276, 0...",0.879149


In [None]:
# The CSV stores every embedding as the text of a list of floats. Parse it with
# json.loads rather than eval, so no arbitrary expression from the file is executed.
# Values that are no longer strings are passed through, which keeps the cell re-runnable.
for embedding_column in ("pergunta_embedding", "chunk_embedding"):
    df_custom[embedding_column] = df_custom[embedding_column].apply(
        lambda value: np.array(json.loads(value)) if isinstance(value, str) else np.asarray(value)
    )
df_custom = df_custom.assign(cosine_similarity = df_custom.cosine_similarity.apply(float))

In [None]:
#best_matrix = torch.load(f"{path}best_matrix.zip", map_location="cpu")
# Load the projection matrix from the data folder onto the CPU.
best_matrix = torch.load(f"{path}best_matrix", map_location="cpu")
# Adds the "*_custom" embedding columns and "cosine_similarity_custom" to df_custom in place.
apply_matrix_to_embeddings_dataframe(best_matrix, df_custom)
df_custom.head(2)

Unnamed: 0,pergunta,contexto,resposta,chunk,label,dataset,pergunta_embedding,chunk_embedding,cosine_similarity,pergunta_embedding_custom,chunk_embedding_custom,cosine_similarity_custom,id
0,Qual é o segredo que o Sr. e a Sra. Dursley te...,"Os Dursley tinham tudo que queriam, mas tinham...",a existência dos Potter,- CAPÍTULO UM - O menino que sobreviveu O Sr. ...,1,train,"[0.022689364850521088, 0.0038326631765812635, ...","[0.014419169165194035, 0.011491919867694378, -...",0.873794,"[0.17453432, -0.60518175, -0.7326583, 0.358211...","[0.024289679, -0.73309463, -0.37434945, 0.1869...",0.398477,0
1,Qual era a cor da capa que o Sr. Dursley viu u...,O Sr. Dursley não tolerava gente que andava co...,uma capa verde-esmeralda,ante o caminho para a cidade ele não pensou em...,1,train,"[0.005216388031840324, 0.0008732206770218909, ...","[-0.002306348877027631, -0.0014758666511625051...",0.873999,"[-0.62348986, -0.43539473, -0.32246643, -0.106...","[-0.28205645, -0.14895901, 0.066049635, 0.5515...",0.479307,1


In [None]:
# One row per corpus chunk, paired with the question that has the same id.
df_id = pd.DataFrame(list(corpus.items()), columns=['id', 'conteudo']).assign(
    pergunta=lambda frame: frame["id"].map(topics)
)
df_id.head(2)

Unnamed: 0,id,conteudo,pergunta
0,0,- CAPÍTULO UM - O menino que sobreviveu O Sr. ...,Qual é o segredo que o Sr. e a Sra. Dursley te...
1,1,ante o caminho para a cidade ele não pensou em...,Qual era a cor da capa que o Sr. Dursley viu u...


In [None]:
# question text -> id
map_dict = dict(zip(df_id["pergunta"], df_id["id"]))

# Tag each row with the id of its question; -1 marks questions absent from map_dict.
df_custom['id'] = df_custom.pergunta.apply(lambda question: map_dict.get(question, -1))

# Keep only rows whose question was matched and whose label is 1.
df_ada002_custom = df_custom[(df_custom.label == 1) & (df_custom.id != -1)]
print(len(df_ada002_custom))
df_ada002_custom.head(2)

140


Unnamed: 0,pergunta,contexto,resposta,chunk,label,dataset,pergunta_embedding,chunk_embedding,cosine_similarity,pergunta_embedding_custom,chunk_embedding_custom,cosine_similarity_custom,id
0,Qual é o segredo que o Sr. e a Sra. Dursley te...,"Os Dursley tinham tudo que queriam, mas tinham...",a existência dos Potter,- CAPÍTULO UM - O menino que sobreviveu O Sr. ...,1,train,"[0.022689364850521088, 0.0038326631765812635, ...","[0.014419169165194035, 0.011491919867694378, -...",0.873794,"[0.17453432, -0.60518175, -0.7326583, 0.358211...","[0.024289679, -0.73309463, -0.37434945, 0.1869...",0.398477,0
1,Qual era a cor da capa que o Sr. Dursley viu u...,O Sr. Dursley não tolerava gente que andava co...,uma capa verde-esmeralda,ante o caminho para a cidade ele não pensou em...,1,train,"[0.005216388031840324, 0.0008732206770218909, ...","[-0.002306348877027631, -0.0014758666511625051...",0.873999,"[-0.62348986, -0.43539473, -0.32246643, -0.106...","[-0.28205645, -0.14895901, 0.066049635, 0.5515...",0.479307,1


In [None]:
df_ada002_custom["pergunta_embedding_custom"].iloc[0].shape

(2048,)

In [None]:
def ada002_custom_search(df_ada002_custom):
    """Rank every chunk for every question using the "*_custom" embeddings.

    Args:
        df_ada002_custom: DataFrame with the columns "id",
            "pergunta_embedding_custom" and "chunk_embedding_custom".

    Returns:
        Dict mapping each id to a list of `(doc_id, score)` pairs sorted by
        descending dot-product score.
    """
    question_vectors = np.vstack(df_ada002_custom["pergunta_embedding_custom"])
    chunk_vectors = np.vstack(df_ada002_custom["chunk_embedding_custom"])
    # One row per question, one column per chunk.
    score_rows = np.dot(question_vectors, chunk_vectors.T)

    ids = df_ada002_custom["id"].values.tolist()
    collected = {}
    for query_id, row in zip(ids, score_rows):
        # The same id list labels both the rows (questions) and the columns (chunks).
        collected.setdefault(query_id, []).extend(zip(ids, row))

    return {
        query_id: sorted(hits, key=lambda pair: pair[1], reverse=True)
        for query_id, hits in collected.items()
    }

In [None]:
# Rank every chunk for every question with the matrix-projected embeddings.
sorted_similarity_custom = ada002_custom_search(df_ada002_custom)

# Save run file
experiment = "custom_ada002"
run_file = f"runs/run.hp.{experiment}.txt"
eval_file = f"evals/eval.hp.{experiment}.txt"

# Remove the outputs of a previous execution of this experiment.
!rm -rf $run_file
!rm -rf $eval_file

# One tab-separated line per (query, document): query_id, Q0, doc_id, rank, score, tag.
with open(run_file,'w') as f_out:
    for query_id, hits in tqdm(sorted_similarity_custom.items(), desc=f'Writing file on {run_file}'):
        rank = 1  # hits are already sorted by descending score, so rank counts up from 1
        for doc_id, score in hits:
            f_out.write(f'{query_id}\tQ0\t{doc_id}\t{rank}\t{score}\t{"CUSTOM-ADA002"}\n')
            rank+=1

# Evaluate the run against qrels.tsv; trec_eval's stdout and stderr both go to eval_file.
!/content/pyserini/tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank -m recall.3,5,7,9 qrels.tsv $run_file > $eval_file 2>&1
# Parse the tab-separated report and store a copy under this experiment's name.
trec_aux = pd.read_csv(eval_file, sep = "\t", names = ["metric", "all", "value"])
trec_aux["experiment"] = experiment
trec_results[experiment] = trec_aux.copy()

Writing file on runs/run.hp.custom_ada002.txt: 100%|██████████| 140/140 [00:00<00:00, 4031.41it/s]


In [None]:
trec_results[experiment]

Unnamed: 0,metric,all,value,experiment
0,recip_rank,all,0.6518,custom_ada002
1,recall_3,all,0.75,custom_ada002
2,recall_5,all,0.8357,custom_ada002
3,recall_7,all,0.9,custom_ada002
4,recall_9,all,0.9143,custom_ada002


## Hybrid Search (BM25 + ADA002-CUSTOM)

In [None]:
# save run file
# Fuse the BM25 and custom-ADA002 runs with pyserini.fusion, once per --rrf.k value.
rrf_k_list = [0, 60]
for rrf_k in rrf_k_list:
    experiment = f"hybrid_bm25_custom_ada002_k_{rrf_k}"
    run_file = f'runs/run.hp.{experiment}.txt'
    eval_file = f'evals/eval.hp.{experiment}.txt'

    # Input runs written by the BM25 and custom-ADA002 experiment cells.
    run_file1_path = 'runs/run.hp.bm25.lucene.txt'
    run_file2_path = 'runs/run.hp.custom_ada002.txt'

    !python -m pyserini.fusion \
      --runs $run_file1_path $run_file2_path \
      --output $run_file \
      --rrf.k $rrf_k \
      --k 1000

    # Evaluate the fused run; trec_eval's stdout and stderr both go to eval_file.
    !/content/pyserini/tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank -m recall.3,5,7,9 qrels.tsv $run_file > $eval_file 2>&1
    # Parse the tab-separated report and store a copy under this experiment's name.
    trec_aux = pd.read_csv(eval_file, sep = "\t", names = ["metric", "all", "value"])
    trec_aux["experiment"] = experiment
    trec_results[experiment] = trec_aux.copy()

In [None]:
# Show the stored metric table of each BM25 + custom-ADA002 fusion experiment.
rrf_k_list = [0, 60]
for rrf_k in rrf_k_list:
    experiment = f"hybrid_bm25_custom_ada002_k_{rrf_k}"
    display(trec_results[experiment])

Unnamed: 0,metric,all,value,experiment
0,recip_rank,all,0.8821,hybrid_bm25_custom_ada002_k_0
1,recall_3,all,0.9786,hybrid_bm25_custom_ada002_k_0
2,recall_5,all,0.9929,hybrid_bm25_custom_ada002_k_0
3,recall_7,all,0.9929,hybrid_bm25_custom_ada002_k_0
4,recall_9,all,0.9929,hybrid_bm25_custom_ada002_k_0


Unnamed: 0,metric,all,value,experiment
0,recip_rank,all,0.8436,hybrid_bm25_custom_ada002_k_60
1,recall_3,all,0.9143,hybrid_bm25_custom_ada002_k_60
2,recall_5,all,0.9429,hybrid_bm25_custom_ada002_k_60
3,recall_7,all,0.9714,hybrid_bm25_custom_ada002_k_60
4,recall_9,all,0.9857,hybrid_bm25_custom_ada002_k_60


## Reranker

Referências:
* https://github.com/zetaalphavector/InPars/blob/master/inpars/rerank.py
* https://huggingface.co/unicamp-dl/mt5-base-en-pt-msmarco-v2
* https://github.com/castorini/pygaggle/blob/master/pygaggle/rerank/transformer.py
* https://github.com/castorini/pygaggle/blob/08339dd31f58ef40fbaa109726402e164eeba125/pygaggle/run/robust04_reranker_pipeline_gpu.py

In [None]:
import csv
import argparse
import torch
from math import ceil, exp
from typing import List
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForSeq2SeqLM,
    T5ForConditionalGeneration,
    T5Tokenizer,
    MT5ForConditionalGeneration
)

In [None]:
# Checkpoint name -> [non-relevant token, relevant token] used to read the model's verdict.
# Checkpoints are grouped by token pair and listed in their original order; every
# entry receives its own fresh list.
prediction_tokens = {
    checkpoint: [token_false, token_true]
    for (token_false, token_true), checkpoints in (
        (('▁false', '▁true'), (
            'castorini/monot5-base-msmarco',
            'castorini/monot5-base-msmarco-10k',
            'castorini/monot5-large-msmarco',
            'castorini/monot5-large-msmarco-10k',
            'castorini/monot5-base-med-msmarco',
            'castorini/monot5-3b-med-msmarco',
            'castorini/monot5-3b-msmarco-10k',
        )),
        (('▁no', '▁yes'), (
            'unicamp-dl/mt5-base-en-msmarco',
        )),
        (('▁não', '▁sim'), (
            'unicamp-dl/ptt5-base-pt-msmarco-10k-v2',
            'unicamp-dl/ptt5-base-pt-msmarco-100k-v2',
            'unicamp-dl/ptt5-base-en-pt-msmarco-100k-v2',
        )),
        (('▁no', '▁yes'), (
            'unicamp-dl/mt5-base-en-pt-msmarco-v2',
            'unicamp-dl/mt5-base-mmarco-v2',
            'unicamp-dl/mt5-base-en-pt-msmarco-v1',
            'unicamp-dl/mt5-base-mmarco-v1',
        )),
        (('▁não', '▁sim'), (
            'unicamp-dl/ptt5-base-pt-msmarco-10k-v1',
            'unicamp-dl/ptt5-base-pt-msmarco-100k-v1',
            'unicamp-dl/ptt5-base-en-pt-msmarco-10k-v1',
        )),
        (('▁', '▁true'), (
            'unicamp-dl/mt5-3B-mmarco-en-pt',
            'unicamp-dl/mt5-13b-mmarco-100k',
        )),
    )
    for checkpoint in checkpoints
}

In [None]:
def chunks(lst, n):
    """Yield consecutive slices of `lst` holding `n` items each; the last may be shorter."""
    yield from (lst[start : start + n] for start in range(0, len(lst), n))

In [None]:
class Reranker:
    """Base class that stores the inference settings shared by rerankers.

    Args:
        silent: Stored as `self.silent`.
        batch_size: Stored as `self.batch_size`.
        fp16: Stored as `self.fp16`.
        torchscript: Stored as `self.torchscript`.
        device: Device to run on, as anything `torch.device` accepts. When None,
            CUDA is selected if available, otherwise the CPU.
    """

    def __init__(self, silent=False, batch_size=8, fp16=False, torchscript=False, device=None):
        self.silent = silent
        self.batch_size = batch_size
        self.fp16 = fp16
        self.torchscript = torchscript
        # Honor an explicit device; previously the argument was accepted but ignored.
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)

    @classmethod
    def from_pretrained(cls, model_name_or_path, **kwargs):
        """Print the checkpoint's configured architectures and build a MonoT5Reranker.

        The configuration is loaded only for the printout; a `MonoT5Reranker` is
        returned regardless of its contents. `kwargs` are forwarded to the
        `MonoT5Reranker` constructor.
        """
        config = AutoConfig.from_pretrained(model_name_or_path)
        print(f"architecture: {config.architectures}")
        print(f"architecture: MonoT5Reranker")
        return MonoT5Reranker(model_name_or_path, **kwargs)

In [None]:
class MonoT5Reranker(Reranker):
    """Reranker that scores (query, document) pairs with a seq2seq checkpoint.

    Each pair is rendered with `prompt_template`, a single token is generated,
    and the score is the softmax weight of the "relevant" token against the
    "non-relevant" token.
    """
    # Label written into the "ranker" column of reranked runs.
    name: str = 'MonoT5'
    # Prompt rendered for every (query, document) pair.
    prompt_template: str = "Query: {query} Document: {text} Relevant:"

    def __init__(
        self,
        model_name_or_path='unicamp-dl/mt5-base-en-pt-msmarco-v2',
        token_false=None,
        token_true=None,
        torch_compile=False,
        **kwargs
    ):
        """Load the model and tokenizer and resolve the two prediction token ids.

        Args:
            model_name_or_path: Checkpoint to load.
            token_false: Vocabulary token meaning "non-relevant". When either
                token is missing, both are taken from `prediction_tokens`.
            token_true: Vocabulary token meaning "relevant". The default is None;
                it used to be `True`, which raised a KeyError in the vocabulary
                lookup as soon as `token_false` alone was supplied.
            torch_compile: Wrap the model with `torch.compile` when true.
            **kwargs: Forwarded to `Reranker.__init__`.
        """
        super().__init__(**kwargs)
        if not self.device:
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            print(self.device)
        model_args = {}
        if self.fp16:
            model_args["torch_dtype"] = torch.float16
        # NOTE(review): the model and tokenizer classes are fixed to MT5 / T5 whatever
        # the checkpoint is — confirm this suits the non-mT5 entries of prediction_tokens.
        self.model = MT5ForConditionalGeneration.from_pretrained(model_name_or_path, **model_args)
        self.torch_compile = torch_compile
        if torch_compile:
            self.model = torch.compile(self.model)
        self.model.to(self.device)
        self.tokenizer = T5Tokenizer.from_pretrained(model_name_or_path)
        self.token_false_id, self.token_true_id = self.get_prediction_tokens(
            model_name_or_path, self.tokenizer, token_false, token_true,
        )

    def get_prediction_tokens(self, model_name_or_path, tokenizer, token_false=None, token_true=None):
        """Return `(token_false_id, token_true_id)` looked up in `tokenizer`'s vocabulary.

        Explicit tokens are used only when both are given. Otherwise the pair
        registered for `model_name_or_path` in `prediction_tokens` is used, and
        an unregistered checkpoint falls back to the pair of
        'castorini/monot5-base-msmarco'.

        Raises:
            KeyError: If a token is absent from the tokenizer's vocabulary.
        """
        if not (token_false and token_true):
            if model_name_or_path not in prediction_tokens:
                # Best-effort fallback for checkpoints without a registered token pair.
                model_name_or_path = 'castorini/monot5-base-msmarco'
            token_false, token_true = prediction_tokens[model_name_or_path]
        vocab = tokenizer.get_vocab()
        return vocab[token_false], vocab[token_true]

    @torch.inference_mode()
    def rescore(self, pairs: List[List[str]]):
        """Score every (query, text) pair and return one float per pair, in input order.

        Pairs are processed in batches of `self.batch_size`. For each prompt one
        token is generated; the logits of the non-relevant and relevant tokens
        are normalized against each other and the relevant share is the score.
        """
        scores = []
        for batch in tqdm(
            chunks(pairs, self.batch_size),
            disable=self.silent,
            desc="Rescoring",
            total=ceil(len(pairs) / self.batch_size),
        ):
            prompts = [
                self.prompt_template.format(query=query, text=text)
                for (query, text) in batch
            ]
            tokens = self.tokenizer(
                prompts,
                padding=True,
                truncation=True,
                return_tensors="pt",
                max_length=self.tokenizer.model_max_length,
                pad_to_multiple_of=(8 if self.torch_compile else None),
            ).to(self.device)
            output = self.model.generate(
                **tokens,
                max_new_tokens=1,
                return_dict_in_generate=True,
                output_scores=True,
            )
            # Scores of the single generated step, one row per prompt.
            batch_scores = output.scores[0]
            # Keep only the two verdict tokens: column 0 = non-relevant, column 1 = relevant.
            batch_scores = batch_scores[:, [self.token_false_id, self.token_true_id]]
            batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
            scores += batch_scores[:, 1].exp().tolist()
        return scores

In [None]:
class TRECRun:
    """A TREC-format run file held in a DataFrame, with in-place reranking."""

    def __init__(self, run_file, sep=r"\s+"):
        """Read `run_file` into `self.df`, keeping every column as a string.

        Args:
            run_file: Path of the run file; it has no header row.
            sep: Column separator passed to `pandas.read_csv`.
        """
        self.run_file = run_file
        self.df = pd.read_csv(
            run_file,
            sep=sep,
            quoting=csv.QUOTE_NONE,
            keep_default_na=False,
            names=("qid", "_1", "docid", "rank", "score", "ranker"),
            dtype=str,
        )

    def rerank(self, ranker, queries, corpus, top_k=1000):
        """Rescore the first `top_k` rows of every query with `ranker` and re-rank in place.

        Args:
            ranker: Object exposing `rescore(pairs)` and a `name` attribute.
            queries: Mapping of integer query id -> query text.
            corpus: Mapping of integer document id -> document text.
            top_k: Number of leading rows per query to rescore.

        `self.df` is modified in place. Every call subtracts 10000 from the stored
        scores again, so the method is meant to be called once per loaded run.
        """
        # Converts run to float32 and subtracts a large number to ensure the BM25 scores
        # are lower than those provided by the neural ranker.
        self.df["score"] = (
            self.df["score"]
            .astype("float32")
            .apply(lambda x: x-10000)
        )

        # Reranks only the top-k documents for each query
        subset = (
            self.df[["qid", "docid"]]
            .groupby("qid")
            .head(top_k)
            .apply(lambda x: [queries[int(x["qid"])], corpus[int(x["docid"])]], axis=1)
        )
        scores = ranker.rescore(subset.values.tolist())

        # `subset` kept the original row labels, so the new scores land on the right rows.
        self.df.loc[subset.index, "score"] = scores

        self.df["ranker"] = ranker.name
        # Order by query id, then by descending score. One multi-key sort replaces
        # groupby("qid").apply(sort_values), which relied on groupby.apply operating
        # on the grouping column (deprecated in pandas) to keep "qid" in the frame.
        self.df = (
            self.df
            .sort_values(["qid", "score"], ascending=[True, False])
            .reset_index(drop=True)
        )

        self.df["rank"] = self.df.groupby("qid").cumcount() + 1

    def save(self, path):
        """Write the run to `path` as a tab-separated file without header or index."""
        self.df.to_csv(path, index=False, sep="\t", header=False, float_format='%.15f')

In [None]:
def _str2bool(value):
    """Convert a command-line token such as ``"False"`` or ``"1"`` to a bool.

    ``type=bool`` cannot be used for this: ``bool("False")`` is ``True``
    because any non-empty string is truthy.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ("true", "t", "yes", "y", "1"):
        return True
    if token in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def parse_args(args):
    """Parse the reranking options from ``args`` (a list of CLI-style strings).

    Returns an ``argparse.Namespace`` with ``model``, ``input_run``,
    ``output_run``, ``fp16``, ``torch_compile``, ``batch_size`` and ``top_k``.
    ``--output_run`` is mandatory; argparse exits if it is missing.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--model", default='castorini/monot5-small-msmarco-100k', type=str, required=False, help="Reranker model."
        )
    parser.add_argument(
        "--input_run", default=None, type=str, help="Initial run to be reranked."
        )
    parser.add_argument(
        "--output_run", default=None, type=str, required=True, help="Path to save the reranked run."
        )
    # Boolean flags go through _str2bool so that "--fp16 False" really disables them.
    parser.add_argument(
        "--fp16", default=True, type=_str2bool, help="Whether to use FP16 weights during inference."
        )
    parser.add_argument(
        "--torch_compile", default=True, type=_str2bool, help="Whether to compile the model with `torch.compile`."
        )
    parser.add_argument(
        "--batch_size", default=16, type=int, help="Batch size for inference."
        )
    parser.add_argument(
        "--top_k", default=1_000, type=int, help="Top-k documents to be reranked for each query."
        )
    return parser.parse_args(args)

In [None]:
# Name of the reranking experiment and the run / eval file paths derived from it.
# `experiment` is also the key under which the metrics are stored in `trec_results`.
experiment = "mt5-base-en-pt-msmarco-v2-reranker"
run_file = f"runs/run.hp.{experiment}.txt"
eval_file = f"evals/eval.hp.{experiment}.txt"

def dev_parser():
    """Return the parsed options for the mT5 reranking run over the BM25 results.

    The output path comes from the notebook-level ``run_file`` variable.
    """
    cli_args = [
        '--model', 'unicamp-dl/mt5-base-en-pt-msmarco-v2',
        '--input_run', 'runs/run.hp.bm25.lucene.txt',
        '--output_run', run_file,
        '--batch_size', '16',
        '--top_k', '50',
    ]
    return parse_args(cli_args)

# Resolve the experiment options and keep the ones used below in plain variables.
args = dev_parser()
print(args)
input_run = args.input_run
top_k = args.top_k
output_run = args.output_run

# `Reranker` is defined in an earlier cell of this notebook.
model = Reranker.from_pretrained(
    model_name_or_path=args.model,
    batch_size=args.batch_size,
    fp16=args.fp16,
    torch_compile=args.torch_compile
)

# Remove outputs left over from a previous execution of this cell.
!rm -rf $run_file
!rm -rf $eval_file

# Rerank the first `top_k` hits of every query in the input run and save the result.
# `topics` and `corpus` must already exist from earlier cells.
run = TRECRun(input_run)
run.rerank(model, topics, corpus, top_k=top_k)
run.save(output_run)

# Score the reranked run against qrels.tsv (reciprocal rank and recall at cutoffs 3, 5, 7, 9);
# stdout and stderr both go to `eval_file`.
# NOTE(review): `-M 10` presumably caps the evaluation at 10 documents per query - confirm against the trec_eval docs.
!/content/pyserini/tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank -m recall.3,5,7,9 qrels.tsv $run_file > $eval_file 2>&1
# Read the tab-separated trec_eval report and register it under the experiment name.
# `trec_results` must already exist from an earlier cell.
trec_aux = pd.read_csv(eval_file, sep = "\t", names = ["metric", "all", "value"])
trec_aux["experiment"] = experiment
trec_results[experiment] = trec_aux.copy()

Namespace(model='unicamp-dl/mt5-base-en-pt-msmarco-v2', input_run='runs/run.hp.bm25.lucene.txt', output_run='runs/run.hp.mt5-base-en-pt-msmarco-v2-reranker.txt', fp16=True, torch_compile=True, batch_size=16, top_k=50)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/676 [00:00<?, ?B/s]

architecture: ['MT5ForConditionalGeneration']
architecture: MonoT5Reranker


pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Rescoring: 100%|██████████| 438/438 [03:28<00:00,  2.10it/s]


In [None]:
# Display the metrics table stored for the current `experiment`.
trec_results[experiment]

Unnamed: 0,metric,all,value,experiment
0,recip_rank,all,0.9196,mt5-base-en-pt-msmarco-v2-reranker
1,recall_3,all,0.9714,mt5-base-en-pt-msmarco-v2-reranker
2,recall_5,all,0.9857,mt5-base-en-pt-msmarco-v2-reranker
3,recall_7,all,0.9929,mt5-base-en-pt-msmarco-v2-reranker
4,recall_9,all,1.0,mt5-base-en-pt-msmarco-v2-reranker


## SPLADE (TO DO)

In [None]:
# Load the SPLADE run; the writer cell below treats it as {query_id: {doc_id: score}}.
with open("run_splade.json", 'r') as file:
    run_splade = json.load(file)

In [None]:
# Write the SPLADE run in TREC format: for each query, documents are listed
# from the highest to the lowest score and ranked starting at 1.
output = '/content/run.hp.splade.ensemble.distil.pt.txt'

with open(output, 'w') as f_out:
    for query_id in tqdm(run_splade, desc=f'Writing file on {output}'):
        ranked_hits = sorted(run_splade[query_id].items(), key=lambda pair: pair[1], reverse=True)
        for rank, (doc_id, score) in enumerate(ranked_hits, start=1):
            f_out.write(f'{int(query_id)}\tQ0\t{int(doc_id)}\t{rank}\t{score}\tSPLADE\n')

Writing file on /content/run.hp.splade.ensemble.distil.pt.txt: 100%|██████████| 140/140 [00:00<00:00, 4597.00it/s]


In [None]:
# Preview the first 10 lines of the SPLADE run (path is relative to the current directory).
!head -n 10 run.hp.splade.ensemble.distil.pt.txt

0	Q0	0	1	23.49915885925293	SPLADE
0	Q0	8	2	18.343889236450195	SPLADE
0	Q0	2	3	16.050294876098633	SPLADE
0	Q0	3	4	15.079992294311523	SPLADE
0	Q0	10	5	14.750470161437988	SPLADE
0	Q0	4	6	14.708215713500977	SPLADE
0	Q0	1	7	14.512762069702148	SPLADE
0	Q0	23	8	14.292257308959961	SPLADE
0	Q0	12	9	12.748653411865234	SPLADE
0	Q0	120	10	12.246784210205078	SPLADE


In [None]:
# Evaluate the SPLADE run with trec_eval (MAP, reciprocal rank, precision and recall at 1, 2, 3, 5, 10).
# NOTE(review): the report is written to the current directory rather than `evals/`, and it is
# not added to `trec_results`, so SPLADE does not appear in the compiled results below.
!/content/pyserini/tools/eval/trec_eval.9.0.4/trec_eval -c -m map -m recip_rank -m P.1,2,3,5,10 -m recall.1,2,3,5,10 /content/qrels.tsv /content/run.hp.splade.ensemble.distil.pt.txt > eval.hp.splade.ensemble.distil.pt.txt 2>&1

## Compile results

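Remember — the following experiment names are equivalent, so the weighted-ensemble duplicates on the right-hand side are dropped from the compiled results: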
* bm25.lucene == 1.0_bm25_0.0_ada002
* ada002 == 0.0_bm25_1.0_ada002
* bm25.lucene == 1.0_bm25_0.0_ada002_custom
* ada002_custom == 0.0_bm25_1.0_ada002_custom

In [None]:
# Stack every per-experiment metrics table with a single concat. Concatenating
# inside a loop copies the growing frame on each step, and the old loop variable
# overwrote the notebook-level `experiment`.
results = pd.concat(list(trec_results.values()), axis=0)

# Weighted-ensemble runs that duplicate the plain BM25 / ada002 experiments.
redundant_experiments = [
    "0.0_bm25_1.0_ada002",
    "1.0_bm25_0.0_ada002",
    "0.0_bm25_1.0_ada002_custom",
    "1.0_bm25_0.0_ada002_custom",
]
results = results[~results["experiment"].isin(redundant_experiments)].copy()

In [None]:
metric  = 'all' #@param ['all', 'recip_rank', 'recall_3', 'recall_5', 'recall_7','recall_9'] {allow-input: true}

# Metric names are compared after stripping surrounding whitespace.
stripped_names = results["metric"].str.strip()

if metric != "all":
    # A single metric was selected: show its table, best experiment first.
    display(results[stripped_names == metric].sort_values(by = "value", ascending = False))
else:
    # One table per metric found in the results, each sorted best-first.
    for metric in results["metric"].unique():
        display(results[stripped_names == metric.strip()].sort_values(by = "value", ascending = False))
        print("\n")

Unnamed: 0,metric,all,value,experiment
0,recip_rank,all,0.9196,mt5-base-en-pt-msmarco-v2-reranker
0,recip_rank,all,0.8821,hybrid_bm25_custom_ada002_k_0
0,recip_rank,all,0.8794,bm25.lucene
0,recip_rank,all,0.8436,hybrid_bm25_custom_ada002_k_60
0,recip_rank,all,0.8177,hybrid_bm25_ada002_k_0
0,recip_rank,all,0.7702,hybrid_bm25_ada002_k_60
0,recip_rank,all,0.6518,custom_ada002
0,recip_rank,all,0.5839,ada002






Unnamed: 0,metric,all,value,experiment
1,recall_3,all,0.9786,hybrid_bm25_custom_ada002_k_0
1,recall_3,all,0.9714,mt5-base-en-pt-msmarco-v2-reranker
1,recall_3,all,0.9429,hybrid_bm25_ada002_k_0
1,recall_3,all,0.9143,bm25.lucene
1,recall_3,all,0.9143,hybrid_bm25_custom_ada002_k_60
1,recall_3,all,0.8714,hybrid_bm25_ada002_k_60
1,recall_3,all,0.75,custom_ada002
1,recall_3,all,0.6214,ada002






Unnamed: 0,metric,all,value,experiment
2,recall_5,all,0.9929,hybrid_bm25_custom_ada002_k_0
2,recall_5,all,0.9857,mt5-base-en-pt-msmarco-v2-reranker
2,recall_5,all,0.9786,hybrid_bm25_ada002_k_0
2,recall_5,all,0.9714,bm25.lucene
2,recall_5,all,0.9429,hybrid_bm25_custom_ada002_k_60
2,recall_5,all,0.9143,hybrid_bm25_ada002_k_60
2,recall_5,all,0.8357,custom_ada002
2,recall_5,all,0.7286,ada002






Unnamed: 0,metric,all,value,experiment
3,recall_7,all,0.9929,hybrid_bm25_custom_ada002_k_0
3,recall_7,all,0.9929,mt5-base-en-pt-msmarco-v2-reranker
3,recall_7,all,0.9857,bm25.lucene
3,recall_7,all,0.9857,hybrid_bm25_ada002_k_0
3,recall_7,all,0.9714,hybrid_bm25_custom_ada002_k_60
3,recall_7,all,0.9286,hybrid_bm25_ada002_k_60
3,recall_7,all,0.9,custom_ada002
3,recall_7,all,0.8,ada002






Unnamed: 0,metric,all,value,experiment
4,recall_9,all,1.0,mt5-base-en-pt-msmarco-v2-reranker
4,recall_9,all,0.9929,hybrid_bm25_ada002_k_0
4,recall_9,all,0.9929,hybrid_bm25_custom_ada002_k_0
4,recall_9,all,0.9857,bm25.lucene
4,recall_9,all,0.9857,hybrid_bm25_custom_ada002_k_60
4,recall_9,all,0.95,hybrid_bm25_ada002_k_60
4,recall_9,all,0.9143,custom_ada002
4,recall_9,all,0.8714,ada002






In [None]:
# Archive the `runs` and `evals` folders.
!zip -r runs_v4.zip runs
!zip -r evals_v4.zip evals

# Copy both archives next to the data folder.
# `path` comes from an earlier cell - presumably the Drive data directory ending in "data/"; verify.
# Note that str.replace removes every "data/" occurrence in `path`, not only a trailing one.
main_dir = path.replace("data/","")
shutil.copyfile("runs_v4.zip", f"{main_dir}runs_v4.zip")
shutil.copyfile("evals_v4.zip", f"{main_dir}evals_v4.zip")
print(f"\nSaving 'runs' and 'evals' to {main_dir}")

  adding: runs/ (stored 0%)
  adding: runs/run.hp.hybrid_bm25_ada002_k_0.txt (deflated 76%)
  adding: runs/run.hp.hybrid_bm25_custom_ada002_k_0.txt (deflated 76%)
  adding: runs/run.hp.hybrid_bm25_custom_ada002_k_60.txt (deflated 74%)
  adding: runs/run.hp.ada002.txt (deflated 66%)
  adding: runs/run.hp.mt5-base-en-pt-msmarco-v2-reranker.txt (deflated 73%)
  adding: runs/run.hp.hybrid_bm25_ada002_k_60.txt (deflated 74%)
  adding: runs/run.hp.bm25.lucene.txt (deflated 76%)
  adding: runs/run.hp.custom_ada002.txt (deflated 70%)
  adding: evals/ (stored 0%)
  adding: evals/eval.hp.ada002.txt (deflated 60%)
  adding: evals/eval.hp.hybrid_bm25_custom_ada002_k_0.txt (deflated 66%)
  adding: evals/eval.hp.hybrid_bm25_custom_ada002_k_60.txt (deflated 61%)
  adding: evals/eval.hp.hybrid_bm25_ada002_k_60.txt (deflated 60%)
  adding: evals/eval.hp.custom_ada002.txt (deflated 61%)
  adding: evals/eval.hp.bm25.lucene.txt (deflated 64%)
  adding: evals/eval.hp.mt5-base-en-pt-msmarco-v2-reranker.txt 

## Context to LLM

In [None]:
# Column layout shared by every run file loaded in this cell.
RUN_COLUMNS = ["query_id", "q0", "doc_id", "rank", "score", "system"]


def _load_run(run_path, sep="\t"):
    """Read a headerless run file into a DataFrame labelled with ``RUN_COLUMNS``."""
    return pd.read_csv(run_path, sep=sep, header=None, names=RUN_COLUMNS)


file_run_reranker = _load_run("runs/run.hp.mt5-base-en-pt-msmarco-v2-reranker.txt")
file_run_hybrid = _load_run("runs/run.hp.hybrid_bm25_custom_ada002_k_0.txt")
# The BM25 run is the only one read with a space separator instead of a tab.
file_run_bm25 = _load_run("runs/run.hp.bm25.lucene.txt", sep=" ")
file_run_ada_custom = _load_run("runs/run.hp.custom_ada002.txt")
file_run_ada = _load_run("runs/run.hp.ada002.txt")

In [None]:
def _write_json_with_fallback(payload, file_name, directories):
    """Dump ``payload`` as JSON into the first directory of ``directories`` that accepts the write.

    Returns the path that was written. If every directory fails with an
    ``OSError`` the last error is re-raised instead of being swallowed.
    """
    last_error = None
    for directory in directories:
        file_path = f'{directory}/{file_name}'
        try:
            with open(file_path, 'w', encoding='utf-8') as json_file:
                json.dump(payload, json_file, ensure_ascii=False, indent=1)
            return file_path
        except OSError as error:
            # Typically the directory does not exist (e.g. Drive is not mounted).
            last_error = error
    if last_error is None:
        raise ValueError("no output directory was provided")
    raise last_error


def make_json_query_bestchunks(file_run, filename, json_mode="text", top_ns=(3, 5, 7, 9), output_dirs=None):
    """Export, for several cutoffs, the best-ranked documents of every query as JSON.

    One file named ``{filename}_{n}_contexts_to_llm.json`` is written per cutoff ``n``.
    The queries come from the notebook-level ``topics`` mapping and the document
    texts from the notebook-level ``corpus`` mapping.

    Args:
        file_run: DataFrame with at least the columns ``query_id``, ``doc_id`` and ``rank``.
        filename: prefix of the generated file names.
        json_mode: ``"text"`` maps query text -> list of document texts; any other
            value maps ``int(query_id)`` -> list of ``int(doc_id)``.
        top_ns: cutoffs to export (defaults to the original 3, 5, 7 and 9).
        output_dirs: directories tried in order for each file; defaults to the
            Google Drive folder followed by ``/content/data_retriever``.

    Raises:
        OSError: if a file cannot be written to any of the output directories.
    """
    if output_dirs is None:
        output_dirs = (
            '/content/drive/MyDrive/Artigos/RAG-stuffs/data/data_retriever',
            '/content/data_retriever',
        )

    # The rank-ordered document ids do not depend on the cutoff, so they are
    # computed once per query instead of once per (cutoff, query) pair.
    ranked_doc_ids = {}
    for query_id in topics.keys():
        hits = file_run[file_run["query_id"] == query_id].sort_values(by="rank")
        ranked_doc_ids[query_id] = list(hits["doc_id"].values)

    for ix in top_ns:
        query_context = {}
        for query_id, doc_ids in ranked_doc_ids.items():
            if json_mode == "text":
                query_context[topics.get(query_id)] = [corpus.get(doc) for doc in doc_ids[:ix]]
            else:
                query_context[int(query_id)] = [int(doc_id) for doc_id in doc_ids[:ix]]

        _write_json_with_fallback(query_context, f'{filename}_{ix}_contexts_to_llm.json', output_dirs)

In [None]:
# Export the contexts of every retrieval system, not only ada002: the cells below
# zip and load files such as `reranker_3_contexts_to_llm.json`, which no other
# cell generates on a fresh "Restart & Run All".
runs_to_export = {
    "ada": file_run_ada,
    "ada_custom": file_run_ada_custom,
    "bm25": file_run_bm25,
    "hybrid": file_run_hybrid,
    "reranker": file_run_reranker,
}
for run_name, run_frame in runs_to_export.items():
    make_json_query_bestchunks(file_run=run_frame, filename=run_name)

In [None]:
# Archive the exported context files (the `data_retriever` folder in the current directory).
!zip -r data_retriever.zip data_retriever

# Copy the archive next to the data folder.
# `path` comes from an earlier cell - presumably the Drive data directory ending in "data/"; verify.
main_dir = path.replace("data/","")
shutil.copyfile("data_retriever.zip", f"{main_dir}data_retriever.zip")
print(f"\nSaving 'data_retriever' to {main_dir}")

  adding: data_retriever/ (stored 0%)
  adding: data_retriever/ada_5_contexts_to_llm.json (deflated 73%)
  adding: data_retriever/reranker_9_contexts_to_llm.json (deflated 69%)
  adding: data_retriever/ada_7_contexts_to_llm.json (deflated 71%)
  adding: data_retriever/hybrid_3_contexts_to_llm.json (deflated 67%)
  adding: data_retriever/bm25_3_contexts_to_llm.json (deflated 72%)
  adding: data_retriever/hybrid_7_contexts_to_llm.json (deflated 67%)
  adding: data_retriever/hybrid_5_contexts_to_llm.json (deflated 67%)
  adding: data_retriever/reranker_5_contexts_to_llm.json (deflated 74%)
  adding: data_retriever/ada_3_contexts_to_llm.json (deflated 75%)
  adding: data_retriever/ada_custom_5_contexts_to_llm.json (deflated 74%)
  adding: data_retriever/reranker_7_contexts_to_llm.json (deflated 71%)
  adding: data_retriever/ada_custom_3_contexts_to_llm.json (deflated 74%)
  adding: data_retriever/ada_9_contexts_to_llm.json (deflated 69%)
  adding: data_retriever/hybrid_9_contexts_to_llm.js

In [None]:
exp_json = "reranker_3_contexts_to_llm.json"

# Folders that may hold the exported contexts, in order of preference:
# the Google Drive copy first, then the local Colab copy.
candidate_folders = [
    "/content/drive/MyDrive/Artigos/RAG-stuffs/data/data_retriever/",
    "/content/data_retriever/",
]

# Try each folder in turn. Only I/O and decoding failures trigger the fallback
# (the former bare `except` also hid KeyboardInterrupt and programming errors).
load_error = None
for path_folder in candidate_folders:
    try:
        with open(path_folder + exp_json, 'r', encoding='utf-8') as json_file:
            loaded_data = json.load(json_file)
        break
    except (OSError, ValueError) as error:
        load_error = error
else:
    # No folder worked: surface the last failure instead of continuing silently.
    raise load_error

In [None]:
# Prompt header: instructions followed by the question.
qa_prompt = """Com muita calma, responda a seguinte pergunta dentro do `contexto` que será fornecido abaixo.
Utilize somente informações do contexto para responder.

Pergunta: ```{pergunta}```

"""

# Template wrapped around each retrieved chunk; `ix` is its 1-based position.
get_context = """
## inicio do contexto {ix}.
{contexto}
## fim do contexto {ix}.

###
"""

# NOTE(review): "deve ser ter" looks like a typo for "deve ter"; it is left untouched
# because changing it would alter the prompt sent to the LLM.
final_format = """\nO output deve ser ter somente a resposta."""


# Build one prompt per query. The previous version wrapped this in an extra
# `for chunk in chunks` loop that rebuilt the same prompt len(chunks) times and
# kept only the last query's prompt; every prompt is now kept in `prompts`.
# `query`, `chunks`, `chunks_retrieved` and `prompt` still hold the values of
# the last query afterwards, as the cells below expect.
prompts = {}
for query, chunks in loaded_data.items():
    chunks_retrieved = ''.join(
        get_context.format(contexto=chunk, ix=ix)
        for ix, chunk in enumerate(chunks, 1)
    )
    prompt = qa_prompt.format(pergunta=query) + chunks_retrieved + final_format
    prompts[query] = prompt

In [None]:
# Number of contexts attached to the last query processed in the previous cell.
len(chunks)

3

In [None]:
# Show the prompt built for the last query processed above.
print(prompt)

Com muita calma, responda a seguinte pergunta dentro do `contexto` que será fornecido abaixo.
Utilize somente informações do contexto para responder.

Pergunta: ```Qual é a razão específica pela qual Harry Potter está sorrindo ao se despedir de Hermione e imaginar suas férias com o Duda?```


## inicio do contexto 1.
 - Vejo vocês durante as férias, então. - Espero que você tenha... hã... umas boas férias - disse Hermione, olhando hesitante para tio Válter, espantada que alguém pudesse ser tão desagradável. - Ah, claro que sim - respondeu Harry, e eles ficaram surpresos com o sorriso que se espalhava pelo seu rosto. - Eles não sabem que não podemos fazer bruxarias em casa. Vou me divertir à beça com o Duda este verão...
## fim do contexto 1.

###

## inicio do contexto 2.
 a ele não tinham mudado nem um pingo. Isto não o preocupou. Parecia-lhe que sua vida voltaria ao normal no próximo ano, ou tão normal quanto ela poderia ser em Hogwarts. Foi a melhor noite da vida de Harry, melhor do

In [None]:
# Re-evaluate the BM25 run with trec_eval (reciprocal rank and recall at cutoffs 3, 5, 7, 9).
# NOTE(review): the report goes to the current directory, not to `evals/` as in the reranker
# cell, and it is not loaded into `trec_results` - confirm this is intended.
experiment = f"bm25.lucene"
run_file = f"runs/run.hp.{experiment}.txt"
eval_file = f"eval.hp.{experiment}.txt"
!/content/pyserini/tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank -m recall.3,5,7,9 qrels.tsv $run_file > $eval_file 2>&1

In [None]:
# Re-evaluate the custom ada002 run with trec_eval (reciprocal rank and recall at cutoffs 3, 5, 7, 9).
# NOTE(review): the report goes to the current directory, not to `evals/` as in the reranker
# cell, and it is not loaded into `trec_results` - confirm this is intended.
experiment = f"custom_ada002"
run_file = f"runs/run.hp.{experiment}.txt"
eval_file = f"eval.hp.{experiment}.txt"
!/content/pyserini/tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank -m recall.3,5,7,9 qrels.tsv $run_file > $eval_file 2>&1

In [None]:
# Re-evaluate the hybrid BM25 + custom ada002 run with trec_eval (reciprocal rank and recall at cutoffs 3, 5, 7, 9).
# NOTE(review): the `runs` archive listed earlier only contains the `_k_0` and `_k_60` variants of
# this run, so `runs/run.hp.hybrid_bm25_custom_ada002.txt` may not exist - verify the experiment name.
# NOTE(review): the report goes to the current directory, not to `evals/` as in the reranker cell.
experiment = f"hybrid_bm25_custom_ada002"
run_file = f"runs/run.hp.{experiment}.txt"
eval_file = f"eval.hp.{experiment}.txt"
!/content/pyserini/tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank -m recall.3,5,7,9 qrels.tsv $run_file > $eval_file 2>&1