# Bibliotecas e pacotes

In [None]:
# Install a headless JDK 21; the JAVA_HOME set in the following cell points at this install.
!apt install openjdk-21-jdk-headless

In [None]:
import os

# Expose the JDK 21 location (installed by the apt cell above) through JAVA_HOME.
# Presumably needed by Pyserini's Lucene bindings, which run on a JVM - TODO confirm.
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-21-openjdk-amd64'

In [None]:
# Third-party packages used by this notebook: pyserini (Lucene indexing/search),
# faiss-cpu, the Groq client, BeautifulSoup (HTML clean-up) and LangChain with
# its Groq integration (agent + chat model).
!pip install pyserini faiss-cpu
!pip install -q groq
!pip install -q beautifulsoup4
!pip install langchain
!pip install langchain-groq

In [None]:
from google.colab import userdata, drive
from groq import Groq, RateLimitError
from tqdm import tqdm
from bs4 import BeautifulSoup
from pyserini.search.lucene import LuceneSearcher
from collections import Counter
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from langchain.tools import tool
from langchain.agents import AgentExecutor, create_react_agent
from langchain_core.prompts import PromptTemplate

import json
import threading
import time
import json
import spacy
import argparse
import collections
import numpy as np
import re
import string
import sys
import unicodedata
import pandas as pd

import warnings
warnings.simplefilter('ignore')

# Atributos e hiper-parâmetros

In [None]:
# Chat model requested from Groq and its sampling temperature (see get_agent_executor).
LLM_MODEL_NAME = "llama3-70b-8192"
LLM_TEMPERATURE = 0

# Sentence-window chunking: a window of DOCUMENT_WINDOW_SIZE sentences is emitted
# every DOCUMENT_WINDOW_STRIDE sentences (see sliding_window_split).
# NOTE(review): the stride (3) is larger than the window (2), so with these values
# the sentence between two consecutive windows is never indexed - confirm this is
# intended and that the two values are not swapped.
DOCUMENT_WINDOW_STRIDE = 3
DOCUMENT_WINDOW_SIZE = 2

# Number of passages the retriever returns per query.
RETRIEVER_TOP_K = 5

# Number of answerable questions collected for the evaluation run.
N_QUESTIONS = 50

In [None]:
# Read the Groq API key from the Colab `userdata` store instead of hard-coding it.
GROQ_API_KEY = userdata.get('GROQ_API_KEY')

In [None]:
# Prompt adapted from a combination of the following prompts:
#   - https://github.com/run-llama/llama_index/blob/a87b63fce3cc3d24dc71ae170a8d431440025565/llama_index/agent/react/prompts.py
#   - https://smith.langchain.com/hub/hwchase17/react-chat (which I came across through Fábio Grassiotto's code)
#
# The text below is runtime input for the LLM. {tools}, {tool_names}, {input} and
# {agent_scratchpad} are template placeholders; the string is turned into a
# PromptTemplate in get_agent_executor.

REACT_CHAT_SYSTEM_HEADER = """\

You are designed to help with a variety of tasks, from answering questions to providing summaries to other types of analyses.
You are a Large Language Model trained by Meta AI.
As a language model, you are able to generate human-like text based on the input you receive, allowing yourself to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.
You are constantly learning and improving, and your capabilities are constantly evolving. You are able to process and understand large amounts of text, and can use this knowledge to provide accurate and informative responses to a wide range of questions.
Additionally, you are able to generate your own text based on the input it receives, allowing yourself to engage in discussions and provide explanations and descriptions on a wide range of topics.
Overall, you are a powerful tool that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics. Whether the user needs help with a specific question or just want to have a conversation about a particular topic, you are here to assist.

TOOLS:
------
You have access to a wide variety of tools.
You are responsible for using the tools in any sequence you deem appropriate to complete the task at hand.
This may require breaking the task into subtasks and using different tools to complete each subtask.
You have access to the following tools:
{tools}

To use a tool, please use the following format:
```
Thought: Do I need to use a tool? Yes
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
```

When you have a response to say to the Human, or if you do not need to use a tool, you MUST use the format:
```
Thought: Do I need to use a tool? No
Final Answer: [your response here]
```

Your final answer must be short, no more than 10 words, and use numerals instead of words for numbers.
If you don't know any plausible answer, answer "Not enough information provided in the documents."

Begin!
New input: {input}
{agent_scratchpad}
"""

# Dataset Não Inviabilize

In [None]:
# load dataset transcriptions
# load dataset questions

# Indexação e pré-processamento

In [None]:
# preprocess (make chunks, ...) and index transcriptions

In [None]:
def remove_html_tags(text):
    """Return the plain text of ``text`` with HTML markup removed.

    The input is parsed with BeautifulSoup's ``html.parser`` backend and only
    the text content of the resulting tree is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

def extract_data(num_questions, dataset, context_articles):
    """Collect answerable questions together with the texts that support them.

    Args:
        num_questions: Number of questions to collect; iteration over
            ``dataset`` stops as soon as this many have been gathered.
        dataset: Iterable of question records. Each record is read through
            the keys ``question``, ``answer``, ``context``, ``title`` and
            ``question_links``.
        context_articles: Mapping from lower-cased article title to the
            article's HTML text.

    Returns:
        A tuple ``(questions_found, documents, all_titles)`` where
        ``questions_found`` is a list of ``{"Question", "Answer"}`` dicts,
        ``documents`` is a list of ``{"title", "content"}`` dicts holding
        HTML-free text, and ``all_titles`` lists every lower-cased title
        visited (record titles and link targets, duplicates included).
    """
    questions_found = []
    documents = []
    all_titles = []
    # Mirror of ``all_titles`` used only for membership checks, so the
    # "already seen" test stays O(1) instead of scanning the growing list.
    seen_titles = set()

    for item in tqdm(dataset):
        question = item['question']
        answer = item['answer']
        answer_type = answer['type']

        if answer_type in ('binary', 'value'):
            final_answer = answer['answer_value']
        elif answer_type == 'span':
            # Only the first span is used as the reference answer.
            final_answer = answer['answer_spans'][0]['text']
        elif answer_type == 'none':
            final_answer = 'none'
        else:
            final_answer = 'An error perhaps, bad type'

        # Unanswerable questions are skipped entirely.
        if final_answer == 'none':
            continue

        item_title = item['title'].lower()

        # The question's own "main" passages go into the corpus first.
        for context in item['context']:
            if context['passage'] == "main":
                documents.append({
                    "title": item_title,
                    "content": remove_html_tags(context['text']),
                })

        all_titles.append(item_title)
        seen_titles.add(item_title)

        # Then add each linked article that is available and not yet seen.
        for link in item["question_links"]:
            link_key = link.lower()
            if link_key in context_articles and link_key not in seen_titles:
                documents.append({
                    # Linked articles keep the link text as written (not lower-cased).
                    "title": link,
                    "content": remove_html_tags(context_articles[link_key]),
                })
            # Recorded even when the article is missing, matching the title bookkeeping above.
            all_titles.append(link_key)
            seen_titles.add(link_key)

        questions_found.append({"Question": question, "Answer": final_answer})

        if len(questions_found) == num_questions:
            break

    return questions_found, documents, all_titles

In [None]:
# Sample N_QUESTIONS answerable questions and gather their supporting documents.
# NOTE(review): `test_questions` and `context_articles` are not defined in the
# visible cells - presumably they are loaded in the dataset cell above; confirm.
questions_to_ask, documents, all_titles = extract_data(N_QUESTIONS, test_questions, context_articles)

In [None]:
# Blank English spaCy pipeline with only the "sentencizer" component added;
# sliding_window_split uses it to obtain sentence boundaries (`doc.sents`).
nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")

In [None]:
def sliding_window_split(documents, stride, window_size, max_chars=10000):
    """Split each document into windows of consecutive sentences.

    Sentences come from the module-level ``nlp`` pipeline. For every document
    a window of ``window_size`` sentences is taken every ``stride`` sentences;
    iteration stops after the first window that reaches the last sentence.
    When ``stride`` is larger than ``window_size`` the sentences between two
    consecutive windows are not part of any segment.

    Args:
        documents: Iterable of dicts with ``title`` and ``content`` keys.
        stride: Number of sentences between the starts of consecutive windows.
        window_size: Number of sentences per window.
        max_chars: Only the first ``max_chars`` characters of each document
            are processed. Defaults to 10000; ``None`` disables truncation.

    Returns:
        A list of dicts with keys ``title`` (the document title), ``segment``
        (the window's sentences joined by spaces) and ``contents`` (the title,
        ``". "`` and the segment concatenated).
    """
    treated_documents = []

    for document in tqdm(documents):
        title = document['title']
        # Slicing with ``None`` keeps the whole text, so no special case is needed.
        parsed = nlp(document['content'][:max_chars])
        sentences = [sent.text.strip() for sent in parsed.sents]

        for start in range(0, len(sentences), stride):
            segment = ' '.join(sentences[start:start + window_size]).strip()
            treated_documents.append({
                "title": title,
                "contents": title + ". " + segment,
                "segment": segment,
            })
            # This window already reached the final sentence: stop here.
            if start + window_size >= len(sentences):
                break

    return treated_documents

In [None]:
def add_id_and_filter_empty(documents):
    """Drop documents with an empty ``segment`` and tag the rest with an ``id``.

    The ``id`` is the document's position in the *input* sequence, so ids stay
    unique but are not contiguous once empty segments are removed. Each kept
    document is shallow-copied; the input dicts are not modified.
    """
    return [
        {**entry, 'id': position}
        for position, entry in enumerate(documents)
        if entry['segment'] != ""
    ]

In [None]:
# Chunk every document into sentence windows, then drop empty segments and assign ids.
treated_documents = add_id_and_filter_empty(sliding_window_split(documents, stride=DOCUMENT_WINDOW_STRIDE, window_size=DOCUMENT_WINDOW_SIZE))

In [None]:
!mkdir iirc_index_content

In [None]:
# Serialize the chunks as JSON Lines: one JSON object per line.
with open("iirc_index_content/contents.jsonl", 'w') as jsonl_file:
    jsonl_file.writelines(json.dumps(entry) + "\n" for entry in treated_documents)

In [None]:
# Build the Lucene index `iirc_index` from the JSONL files in `iirc_index_content`.
# -storeRaw is passed so the original JSON is kept in the index; PyseriniRetriever
# reads it back through the 'raw' field.
!python3 -m pyserini.index.lucene -collection JsonCollection -generator DefaultLuceneDocumentGenerator -threads 1 -input iirc_index_content -index iirc_index -storeRaw

# Retriever com BM25 + MonoPTT5

In [None]:
# use BM25 to retrieve something like 1000 documents
# rerank with MonoPTT5 and get only top K (K could be 3 or 5)

In [None]:
class PyseriniRetriever:
    """Callable retriever backed by the Lucene index stored in ``./iirc_index``."""

    def __init__(self, top_k):
        """Open the index and remember how many hits each query should return."""
        self._searcher = LuceneSearcher('./iirc_index')
        self._top_k = top_k

    def __call__(self, query):
        """Search the index for ``query`` and return the hits' stored documents.

        Each hit's ``raw`` field is decoded from JSON, so the result is a list
        of dicts in the order returned by the searcher.
        """
        documents = []
        for hit in self._searcher.search(query, k=self._top_k):
            raw_json = hit.lucene_document.get('raw')
            documents.append(json.loads(raw_json))
        return documents

In [None]:
# Module-level retriever instance; the agent tool get_topk_with_bm25 calls it.
retriever = PyseriniRetriever(RETRIEVER_TOP_K)

# Abordagens

## Naive RAG

## RAG baseado em ReAct

In [None]:
@tool
def get_topk_with_bm25(question: str) -> list[str]:
    """Returns a sequence of five document passages with texts to help solve a question."""
    # The docstring above is runtime text: LangChain's @tool decorator uses it as
    # the tool description shown to the agent, so it is kept verbatim.
    # NOTE(review): it says "five", while the real count is whatever top_k the
    # module-level `retriever` was built with (RETRIEVER_TOP_K).
    passages = []
    for rank, hit in enumerate(retriever(question), start=1):
        # Passages are numbered from 1 and carry the indexed "contents" field.
        passages.append(f"Document passage {rank}: {hit['contents']}")
    return passages

In [None]:
def get_agent_executor(verbose=False):
    """Build the ReAct agent executor used to answer questions.

    The agent combines the ``REACT_CHAT_SYSTEM_HEADER`` prompt, a ``ChatGroq``
    model configured from ``LLM_MODEL_NAME`` / ``LLM_TEMPERATURE`` /
    ``GROQ_API_KEY``, and the ``get_topk_with_bm25`` retrieval tool.

    Args:
        verbose: Forwarded to ``AgentExecutor``.

    Returns:
        An ``AgentExecutor`` created with ``handle_parsing_errors=True``.
    """
    prompt = PromptTemplate.from_template(REACT_CHAT_SYSTEM_HEADER)
    llm = ChatGroq(
        temperature=LLM_TEMPERATURE,
        model_name=LLM_MODEL_NAME,
        api_key=GROQ_API_KEY,
    )

    # Single tool list shared by the agent (which renders it into the prompt)
    # and the executor (which runs the tools), so the two cannot drift apart.
    tools = [get_topk_with_bm25]
    agent = create_react_agent(llm, tools, prompt)

    return AgentExecutor(
        agent=agent,
        tools=tools,
        verbose=verbose,
        handle_parsing_errors=True,
    )

In [None]:
# Executor used for the evaluation below; verbose=True is forwarded to AgentExecutor.
agent_executor = get_agent_executor(verbose=True)

# Avaliação

## F1 Score e Exact Match

In [None]:
def normalize_answer(s):
    """Canonicalize an answer string before comparing it with another one.

    Steps, in order: strip accents (any character that is still non-ASCII
    after NFKD decomposition is dropped), lower-case, delete ASCII
    punctuation, replace the standalone words "a", "an" and "the" with a
    space, and collapse all whitespace runs into single spaces.
    """
    # Accent removal: decompose, then keep only the ASCII code points.
    ascii_only = unicodedata.normalize('NFKD', s).encode('ASCII', 'ignore').decode("utf-8")
    lowered = ascii_only.lower()

    # Delete every character listed in string.punctuation.
    no_punctuation = lowered.translate(str.maketrans('', '', string.punctuation))

    # Articles are matched as whole words only.
    no_articles = re.sub(r'\b(a|an|the)\b', ' ', no_punctuation, flags=re.UNICODE)

    return ' '.join(no_articles.split())


def get_tokens(s):
    """Return the whitespace-separated tokens of the normalized form of ``s``.

    A falsy ``s`` (for example ``None`` or an empty string) yields an empty list.
    """
    return normalize_answer(s).split() if s else []


def compute_exact(a_gold, a_pred):
    """Return 1 when both answers are identical after normalization, else 0."""
    gold_norm = normalize_answer(a_gold)
    pred_norm = normalize_answer(a_pred)
    return 1 if gold_norm == pred_norm else 0


def compute_f1(a_gold, a_pred):
    """Return the token-level F1 score between a gold and a predicted answer.

    Both answers are tokenized with ``get_tokens``. When either side has no
    tokens the score is 1 if both are empty and 0 otherwise. Otherwise the
    score is the harmonic mean of precision and recall over the multiset of
    shared tokens (0 when nothing is shared).
    """
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)

    # No-answer case: the two sides agree only when both are empty.
    if not gold_toks or not pred_toks:
        return int(gold_toks == pred_toks)

    # Multiset intersection counts each shared token at most as often as it
    # appears on both sides.
    overlap = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(overlap.values())
    if num_same == 0:
        return 0

    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    return (2 * precision * recall) / (precision + recall)

In [None]:
def evaluate_agent_executor(executor, questions=None):
    """Run the agent on every question and score its answers.

    Args:
        executor: Object whose ``invoke({"input": ...})`` returns a mapping
            with an ``output`` entry (the agent's final answer).
        questions: Iterable of dicts with ``Question`` and ``Answer`` keys.
            Defaults to the module-level ``questions_to_ask``.

    Returns:
        A ``pandas.DataFrame`` with one row per question and the columns
        ``question``, ``answer``, ``LLM answer``, ``F1`` and ``Exact Match``.
    """
    if questions is None:
        questions = questions_to_ask

    columns = ['question', 'answer', 'LLM answer', 'F1', 'Exact Match']
    rows = []

    for item in tqdm(questions):
        question = item.get('Question')
        answer = item.get('Answer')

        agent_answer = executor.invoke({"input": "Question: " + question})
        llm_answer = agent_answer['output']

        # Gold answer first, prediction second, matching the metric signatures.
        f1_score = compute_f1(answer, llm_answer)
        e_match_score = compute_exact(answer, llm_answer)

        rows.append([question, answer, llm_answer, f1_score, e_match_score])

    # Build the frame once instead of concatenating inside the loop, which
    # copies every previous row on each iteration.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Run the full evaluation; one agent invocation is made per question in questions_to_ask.
df = evaluate_agent_executor(agent_executor)

In [None]:
# Preview the first rows of the per-question results.
df.head()

In [None]:
# Report the mean and standard deviation of both metrics over all evaluated
# questions, formatted with two decimals.
print("      Metrics     ")
print("------------------")
print(f"F1 score:\n\tAvg: {df['F1'].mean():.2f}.\n\tStd: {df['F1'].std():.2f}.")
print(f"Exact Match score:\n\tAvg: {df['Exact Match'].mean():.2f}.\n\tStd: {df['Exact Match'].std():.2f}.")

## RAGAs