In [3]:
!python -m spacy download pt_core_news_sm

Collecting pt-core-news-sm==3.7.0
  Using cached https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.7.0/pt_core_news_sm-3.7.0-py3-none-any.whl (13.0 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_sm')


In [1]:
import json
import os
import logging
import sys
import threading
import time

# NOTE: This is ONLY necessary in jupyter notebook.
# Details: Jupyter runs an event-loop behind the scenes.
#          This results in nested event-loops when we start an event-loop to make async queries.
#          This is normally not allowed, we use nest_asyncio to allow it for convenience.
import nest_asyncio


from llama_index.core import Document, QueryBundle
from llama_index.core.schema import TextNode
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.llms.groq import Groq

from langchain_groq import ChatGroq
from langchain.tools import tool
from langchain.agents import AgentExecutor, create_react_agent
from langchain_core.prompts import PromptTemplate

from groq import Groq, RateLimitError

import spacy

from glob import glob
from dataclasses import dataclass
from typing import List, Dict
from thefuzz import process

In [2]:
# Allow nested event loops (Jupyter already runs one).
nest_asyncio.apply()

# SECURITY: never commit API keys. A literal Groq key used to be hard-coded on this
# line; that key must be considered leaked and should be revoked in the Groq console.
# The key is now taken from the environment, falling back to an interactive prompt.
if not os.environ.get("GROQ_API_KEY"):
    from getpass import getpass

    os.environ["GROQ_API_KEY"] = getpass("GROQ_API_KEY: ")

# basicConfig sets the root level to INFO; the root handlers are then replaced by a
# single plain stdout handler (no formatter), so records print as the bare message.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().handlers = []
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [3]:
# Chunking: passed to sliding_window_split() as window_size / stride (in sentences).
SENTENCE_WINDOW_SIZE = 5
SENTENCE_WINDOW_STRIDE = 2

# Retrieval: BM25 fetches up to BM25_TOP_K candidates per query; the reranker keeps TOP_K.
TOP_K = 4
BM25_TOP_K = 1000

# NOTE(review): when this model is loaded, the notebook output warns that the
# `classification_head.*` weights "were not initialized from the model checkpoint ...
# and are newly initialized". That suggests the reranker scores come from an untrained
# head -- confirm this checkpoint is usable with SentenceTransformerRerank.
RERANKER_MODEL = "unicamp-dl/monoptt5-base"

# LLM settings consumed by GroqClient. LLM_CONTEXT_SIZE is sent as `max_tokens`.
LLM_MODEL = "llama3-70b-8192"
LLM_CONTEXT_SIZE = 8192
LLM_TEMPERATURE = 0
LLM_TOP_P = 1

In [4]:
def get_transcription_documents(base_transcriptions_path):
    """
    Load episode transcriptions from JSON files into `Document` objects.

    Args:
        base_transcriptions_path (str): glob pattern matching the JSON files. Each file
            must contain a list of objects with the keys 'transcription', 'title',
            'publishing_date', 'quadro' and 'hashtag'.

    Returns:
        list: one `Document` per transcription, with the transcription as text and the
            remaining keys as metadata. Files are visited in sorted path order; an
            unmatched pattern yields an empty list.

    Raises:
        KeyError: if a transcription entry lacks one of the expected keys.
    """
    transcription_documents = []

    # glob() returns paths in arbitrary filesystem order; sorting keeps the corpus
    # (and everything derived from it) reproducible across machines and runs.
    for transcriptions_path in sorted(glob(base_transcriptions_path)):
        # Explicit UTF-8 (the JSON default) so accented Portuguese text does not depend
        # on the platform's locale encoding.
        with open(transcriptions_path, encoding='utf-8') as f:
            transcriptions = json.load(f)

        transcription_documents.extend(
            Document(
                text=transcription['transcription'],
                metadata={
                    'title': transcription['title'],
                    'publishing_date': transcription['publishing_date'],
                    'quadro': transcription['quadro'],
                    'hashtag': transcription['hashtag'],
                },
            )
            for transcription in transcriptions
        )

    return transcription_documents

In [5]:
def sliding_window_split(documents, stride, window_size):
    """
    Split documents into overlapping windows of consecutive sentences.

    Each document is split into paragraphs on blank lines; each paragraph is split into
    sentences with spaCy's rule-based sentencizer; windows of up to `window_size`
    sentences are then taken every `stride` sentences. Windows never cross paragraphs.

    Args:
        documents: iterable of objects exposing `.text`, `.metadata` and `.id_`.
        stride (int): number of sentences between the starts of consecutive windows.
        window_size (int): maximum number of sentences per window.

    Returns:
        list: one `Document` per window. Its metadata is a copy of the parent's metadata
            plus 'parent_document_id' (the parent's `id_`).

    Raises:
        ValueError: if `stride` or `window_size` is smaller than 1.
    """
    if stride < 1 or window_size < 1:
        raise ValueError("stride and window_size must be positive integers")

    sentencizer = spacy.blank('pt')
    sentencizer.add_pipe('sentencizer')

    window_documents = []

    for document in documents:
        paragraphs = [
            paragraph.replace('\n', ' ').strip()
            for paragraph in document.text.split('\n\n')
            if paragraph.strip()
        ]
        for paragraph in paragraphs:
            sentences = [sent.text.strip() for sent in sentencizer(paragraph).sents]
            for start in range(0, len(sentences), stride):
                end = start + window_size
                window_text = ' '.join(sentences[start:end]).strip()
                window_metadata = document.metadata.copy()
                window_metadata['parent_document_id'] = document.id_
                window_documents.append(Document(text=window_text, metadata=window_metadata))
                # Once a window reaches the last sentence, every later window would be a
                # strict suffix of it (a shorter duplicate), so stop here.
                if end >= len(sentences):
                    break

    return window_documents

In [6]:
def get_transcriptions_nodes(transcription_documents):
    """
    Build one list of sentence-window nodes per episode title.

    The documents are split with `sliding_window_split` (using SENTENCE_WINDOW_STRIDE and
    SENTENCE_WINDOW_SIZE) and each window becomes a `TextNode`.

    Args:
        transcription_documents: documents as returned by `get_transcription_documents`;
            their metadata must contain 'title'.

    Returns:
        dict: maps each episode title to the list of its window nodes, in window order.
    """
    transcription_window_documents = sliding_window_split(
        transcription_documents, SENTENCE_WINDOW_STRIDE, SENTENCE_WINDOW_SIZE
    )

    transcription_nodes = dict()
    for document in transcription_window_documents:
        # The node id field is `id_`. The previous `id=` keyword is not a TextNode field,
        # so the window's id was not carried over to the node.
        new_node = TextNode(id_=document.id_, text=document.text, metadata=document.metadata)
        transcription_nodes.setdefault(document.metadata['title'], []).append(new_node)

    return transcription_nodes

In [7]:
# Load every transcription JSON matching the pattern below (relative to the notebook's
# working directory).
transcription_documents = get_transcription_documents("../transcriptions-headless/*.json")

# Uncomment to inspect the first loaded document:
# transcription_documents[0]

In [8]:
# Split the documents into sentence-window nodes, grouped by episode title.
transcription_nodes = get_transcriptions_nodes(transcription_documents)

# Uncomment to inspect the nodes of a single episode:
# transcription_nodes['milton']

In [9]:
class GroqClient:
    '''
    Interface for using the Groq API

    Implements a rate limit control for multi-threading use: when a call hits a rate
    limit, one thread holds a class-wide lock while it sleeps, and every other call
    waits on that lock before sending its next request.
    '''

    # Parameter documentation: https://console.groq.com/docs/text-chat
    _context_size = LLM_CONTEXT_SIZE
    _temperature = LLM_TEMPERATURE
    _top_p = LLM_TOP_P
    _stop = None
    _stream = False

    # Mutex lock shared by every instance; held only while backing off.
    _rate_lock = threading.Lock()

    def __init__(self, llm_model, api_key):
        '''
        GroqClient constructor.

        Args:
            llm_model (str): name of the Groq model to query.
            api_key (str): Groq API key used by this client.
        '''
        self._client = Groq(api_key=api_key)
        self._llm_model = llm_model


    def __call__(self, prompt: str) -> str:
        '''
        Generates the model response, retrying for as long as the API reports a rate limit.

        Args:
            prompt (str): prompt to send to the model (sent as a single system message).

        Returns:
            str: model response.
        '''

        while True:
            # Gate: block while another thread is backing off. `with` guarantees the lock
            # is released again even if this thread is interrupted.
            with GroqClient._rate_lock:
                pass

            try:
                chat_completion = self._client.chat.completions.create(
                    messages=[
                        {
                            "role": "system",
                            "content": prompt,
                        }
                    ],
                    model=self._llm_model,
                    temperature=self._temperature,
                    max_tokens=self._context_size,
                    top_p=self._top_p,
                    stop=self._stop,
                    stream=self._stream,
                )
            except RateLimitError as exception:
                # Keep the last rate-limit error around for inspection.
                GroqClient.error = exception
                # A non-blocking acquire replaces the racy `locked()` check followed by
                # `acquire()`. Only the thread that wins the lock sleeps; the others loop
                # back and wait at the gate. `finally` releases the lock even when the
                # sleep is interrupted (e.g. kernel interrupt), which previously left the
                # lock held and made every later call hang.
                if GroqClient._rate_lock.acquire(blocking=False):
                    try:
                        time.sleep(1.75)
                    finally:
                        GroqClient._rate_lock.release()
                continue

            return chat_completion.choices[0].message.content

In [10]:
@dataclass
class RAGResponse:
    """Result of one RAG query: the generated answer and the passages it was given."""

    # Text produced by the generator.
    answer: str
    # Context passages retrieved for the query.
    contexts: List[str]

In [11]:
class MultiIndexRetriever:
    """
    Two-stage retriever over several named indexes.

    Every index gets its own BM25 retriever; a single reranker, shared by all indexes,
    trims the BM25 candidates down to the final passages.
    """

    def __init__(self, indexes_nodes: Dict[str, List[TextNode]], top_k, bm25_top_k, reranker_model) -> None:
        """
        Args:
            indexes_nodes: maps an index name to the nodes that belong to it.
            top_k: number of passages the reranker keeps.
            bm25_top_k: number of candidates requested from BM25.
            reranker_model: model name handed to `SentenceTransformerRerank`.
        """
        self.indexes: Dict[str, List[TextNode]] = indexes_nodes

        self.bm25_retrievers: Dict[str, BM25Retriever] = {
            name: BM25Retriever.from_defaults(nodes=index_nodes, similarity_top_k=bm25_top_k)
            for name, index_nodes in indexes_nodes.items()
        }

        self.reranker = SentenceTransformerRerank(top_n=top_k, model=reranker_model)

    def retrieve(self, index_name: str, query: str) -> List[str]:
        """
        Return the reranked passage texts for `query` from the index `index_name`.

        Raises:
            ValueError: if `index_name` is not a known index.
        """
        if index_name not in self.indexes:
            raise ValueError(f"Index {index_name} not found")

        candidates = self.bm25_retrievers[index_name].retrieve(query)
        reranked = self.reranker.postprocess_nodes(
            candidates,
            query_bundle=QueryBundle(query),
        )

        return [node.get_text() for node in reranked]

In [12]:
class IndexDetector:
    """
    Maps a user question to the name of the index (episode title) it refers to.

    The LLM is asked for the episode title; its answer is used directly when it is a
    known index name and is fuzzy-matched against the known names otherwise.
    """

    def __init__(self, llm_model: str, index_names: List[str], GROQ_API_KEY: str = None):
        """
        Args:
            llm_model: name of the Groq model used to guess the title.
            index_names: the valid index names.
            GROQ_API_KEY: Groq API key. When omitted, the GROQ_API_KEY environment
                variable is read at construction time (not at class-definition time,
                which failed on import when the variable was unset and froze its value).

        Raises:
            KeyError: if no key is given and GROQ_API_KEY is not set.
        """
        if GROQ_API_KEY is None:
            GROQ_API_KEY = os.environ['GROQ_API_KEY']

        self._base_prompt = \
            "Given the following Portuguese query, regarding an episode of a podcast, please tell me the title of the episode. " \
            "The query starts now: '{query}'." \
            "You MUST answer with only the title of the episode."

        self.llm = GroqClient(llm_model=llm_model, api_key=GROQ_API_KEY)
        self.llm_model_name = llm_model
        self.index_names = index_names

    def detect_index(self, query: str) -> str:
        """
        Return the index name that `query` is about.

        Args:
            query: the user question (expected to mention the episode).

        Returns:
            str: always one of `self.index_names`.
        """
        prompt = self._base_prompt.format(query=query)
        llm_guess = self.llm(prompt).strip().lower()

        # Exact hit: nothing to correct.
        if llm_guess in self.index_names:
            return llm_guess

        # Otherwise snap the guess to the closest known name. The original condition was
        # inverted: unknown guesses were returned untouched (so retrieval later failed
        # with "Index ... not found") while exact hits were needlessly fuzzy-matched.
        index_name, _ = process.extractOne(llm_guess, self.index_names)
        return index_name

In [13]:
class RAGGenerator:
    """Generates a Portuguese answer to a question from a list of context passages."""

    def __init__(self, llm_model, GROQ_API_KEY=None):
        """
        Args:
            llm_model (str): name of the Groq model used to generate answers.
            GROQ_API_KEY (str): Groq API key. When omitted, the GROQ_API_KEY environment
                variable is read at construction time (not at class-definition time,
                which failed on import when the variable was unset and froze its value).

        Raises:
            KeyError: if no key is given and GROQ_API_KEY is not set.
        """
        if GROQ_API_KEY is None:
            GROQ_API_KEY = os.environ['GROQ_API_KEY']

        self._base_prompt = \
            "Consider the following context passages of a podcast episode and answer the given question." \
            "You MUST answer the question only in Portuguese." \
            "\n\n" \
            "{context_passages}" \
            "\n\n" \
            "If there is not enough information in the context passages, answer \"Não há informação suficiente no episódio.\"." \
            "\n\n" \
            "Question: {query}"

        self.llm = GroqClient(llm_model=llm_model, api_key=GROQ_API_KEY)
        self.llm_model_name = llm_model

    def generate_answer(self, query: str, contexts: List[str]) -> str:
        """
        Ask the LLM to answer `query` using `contexts`.

        Args:
            query: the user question.
            contexts: context passages; they are numbered from 1 in the prompt.

        Returns:
            str: the raw LLM answer.
        """
        context_passages = "\n\n".join(
            f"Context {position}: {context}" for position, context in enumerate(contexts, 1)
        )
        prompt = self._base_prompt.format(query=query, context_passages=context_passages)
        return self.llm(prompt)

In [14]:
class RAGPipeline:
    """
    End-to-end RAG: detect the index, retrieve passages from it, generate the answer.
    """

    def __init__(
        self,
        retriever: MultiIndexRetriever,
        index_detector: IndexDetector,
        generator: RAGGenerator,
    ) -> None:
        """Store the three pipeline stages."""
        self.index_detector: IndexDetector = index_detector
        self.retriever: MultiIndexRetriever = retriever
        self.generator: RAGGenerator = generator

    def __call__(self, query: str) -> RAGResponse:
        """
        Answer `query`.

        Returns:
            RAGResponse: the generated answer together with the passages used.
        """
        passages = self.retriever.retrieve(self.index_detector.detect_index(query), query)
        return RAGResponse(self.generator.generate_answer(query, passages), passages)

In [15]:
# Prompt for a Groq API key without echoing it or storing it in the notebook.
from getpass import getpass
API_KEY_1 = getpass()  # key used by the IndexDetector below (original note: "pers")

 ········


In [16]:
# Smoke test: detect the index for one sample question and display the result.
index_detector = IndexDetector(llm_model=LLM_MODEL, index_names=list(transcription_nodes.keys()), GROQ_API_KEY=API_KEY_1)
index_name = index_detector.detect_index('No episódio \'mário\', quem foi diagnosticado com câncer de bexiga na história?')
index_name

HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


'mário'

In [17]:
# Smoke test for the generator alone: hand-written contexts, no retrieval involved.
example_contexts = [
    "Eu sou Bia Suzuki, tenho 30 anos, moro em São José do Rio Preto, interior de São Paulo... eu recebi o diagnóstico de câncer de intestino aos 25 anos...",
    "O câncer colorretal é a terceira neoplasia mais frequente e a segunda de maior mortalidade no mundo...",
    "Eu achei que não é impossível viver bem com bolsinha e eu sou uma prova disso."
]
example_question = "No episódio 'bia', quem é a Bia e o que aconteceu com ela?"

# No key is passed here, so RAGGenerator uses its default (the GROQ_API_KEY environment variable).
rag_generator = RAGGenerator(llm_model=LLM_MODEL)
rag_generator.generate_answer(example_question, example_contexts)

HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


'A Bia é a Bia Suzuki, uma mulher de 30 anos que mora em São José do Rio Preto, interior de São Paulo, e que recebeu o diagnóstico de câncer de intestino aos 25 anos.'

In [18]:
API_KEY_2 = getpass()  # key used by the RAGGenerator of the pipeline below (original note: "UNICAMP")

 ········


In [19]:
# Assemble the full pipeline. The index detector and the generator are given
# different API keys (API_KEY_1 and API_KEY_2).
multi_index_retriever = MultiIndexRetriever(
    indexes_nodes=transcription_nodes,
    top_k=TOP_K,
    bm25_top_k=BM25_TOP_K,
    reranker_model=RERANKER_MODEL
)
index_detector = IndexDetector(llm_model=LLM_MODEL, index_names=list(transcription_nodes.keys()), GROQ_API_KEY=API_KEY_1)
rag_generator = RAGGenerator(llm_model=LLM_MODEL, GROQ_API_KEY=API_KEY_2)

rag_pipeline = RAGPipeline(
    retriever=multi_index_retriever,
    index_detector=index_detector,
    generator=rag_generator,
)

PyTorch version 2.3.1+cu121 available.


Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at unicamp-dl/monoptt5-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


### Avaliando RAG apenas

In [20]:
from langchain_groq import ChatGroq
from langchain_community.embeddings import HuggingFaceEmbeddings
from datasets import Dataset 
from ragas.metrics import faithfulness, answer_relevancy, context_relevancy, answer_similarity, answer_correctness
from ragas import evaluate
import pandas as pd
from tqdm.notebook import tqdm

In [21]:
API_KEY_3 = getpass()  # key used by the ChatGroq evaluator LLM below (original note: "mob")

 ········


In [22]:
# Models handed to the ragas evaluation: a sentence-transformers embedding model and a
# Groq chat model. NOTE(review): the model name and temperature below duplicate
# LLM_MODEL / LLM_TEMPERATURE as literals -- confirm whether the evaluator is meant to
# follow those constants or stay fixed.
langchain_embeddings_model = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-cos-v1")
langchain_groq_completion = ChatGroq(
    temperature=0,
    model_name="llama3-70b-8192",
    api_key=API_KEY_3
)

Use pytorch device_name: cuda
Load pretrained SentenceTransformer: sentence-transformers/multi-qa-mpnet-base-cos-v1


  warn_deprecated(


In [25]:
def evaluate_ragas(llm_model, embeddings, questions=None, answers=None, gts=None, contexts=None):
    """
    Score question/answer/context samples with ragas.

    The metrics computed are answer_relevancy, faithfulness and context_relevancy.

    Args:
        llm_model: LLM passed to `ragas.evaluate`.
        embeddings: embedding model passed to `ragas.evaluate`.
        questions: list of questions.
        answers: list of generated answers; when None, the ground truths are used as answers.
        gts: list of ground-truth answers.
        contexts: list with the context passages of each question.

    Returns:
        The evaluation result converted with `.to_pandas()`.
    """
    # Without generated answers, the ground truths stand in for them.
    effective_answers = answers if answers is not None else gts

    dataset = Dataset.from_dict({
        'question': questions,
        'answer': effective_answers,
        'ground_truth': gts,
        'contexts': contexts,
    })

    result = evaluate(
        dataset,
        metrics=[answer_relevancy, faithfulness, context_relevancy],
        llm=llm_model,
        embeddings=embeddings,
    )
    return result.to_pandas()

In [None]:
# Retry policy for calls that depend on the (rate-limited) Groq API.
MAX_ATTEMPTS = 10
RETRY_WAIT_SECONDS = 5


def call_with_retries(func, *args):
    """
    Call `func(*args)`, retrying on failure.

    Only `Exception` is caught, so KeyboardInterrupt still stops the cell (the previous
    bare `except:` swallowed it and retried forever). Every failure is logged.

    Returns:
        Whatever `func` returns.

    Raises:
        Exception: the last error, once MAX_ATTEMPTS attempts have failed.
    """
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            return func(*args)
        except Exception as error:
            logging.warning(
                "%s failed (attempt %d/%d): %r",
                getattr(func, "__name__", type(func).__name__), attempt, MAX_ATTEMPTS, error,
            )
            if attempt == MAX_ATTEMPTS:
                raise
            time.sleep(RETRY_WAIT_SECONDS)


# One single-row score frame per evaluated question; concatenated once at the end
# instead of growing a DataFrame inside the loop.
score_frames = []
try:
    for quadro in glob("../perguntas/*perguntas.json"):
        with open(quadro, "r", encoding="utf-8") as jsonFile:
            eps = json.load(jsonFile)

        for ep in tqdm(eps[::-1]):
            for pergunta, gt in zip(ep["Perguntas"], ep["Respostas"]):
                try:
                    rag_answer = call_with_retries(rag_pipeline, pergunta)
                    df_score = call_with_retries(
                        evaluate_ragas,
                        langchain_groq_completion,
                        langchain_embeddings_model,
                        [pergunta],
                        [rag_answer.answer],
                        [gt],
                        [rag_answer.contexts],
                    )
                except Exception:
                    # Best effort: one permanently failing question must not abort a
                    # long evaluation run. The failure itself was logged above.
                    logging.error("Skipping question after %d failed attempts: %s", MAX_ATTEMPTS, pergunta)
                    continue

                # Label each row with its own episode's "quadro" (previously taken from
                # whichever episode happened to be last in the file).
                df_score["quadro"] = ep["quadro"]
                score_frames.append(df_score)
finally:
    # Build `df` even when the run is interrupted, so partial results are kept.
    df = pd.concat(score_frames) if score_frames else pd.DataFrame()

  0%|          | 0/12 [00:00<?, ?it/s]

HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

Retrying request to /openai/v1/chat/completions in 0.872442 seconds
Retrying request to /openai/v1/chat/completions in 0.976006 seconds
Retrying request to /openai/v1/chat/completions in 0.875989 seconds
HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
Retrying request to /openai/v1/chat/completions in 23.000000 seconds
HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
Retrying request to /openai/v1/chat/completions in 29.000000 seconds
HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
Retrying request to /openai/v1/chat/completions in 24.000000 seconds
HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
Retrying request to /openai/v1/chat/completions in 29.000000 seconds
HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
Retrying request to 

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

Retrying request to /openai/v1/chat/completions in 0.880380 seconds
HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
Retrying request to /openai/v1/chat/completions in 8.000000 seconds
Retrying request to /openai/v1/chat/completions in 8.000000 seconds
HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
Retrying request to /openai/v1/chat/completions in 3.000000 seconds
HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
Retrying request to /openai/v1/chat/completions in 8.000000 seconds
HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
Retrying request to /openai/v1/chat/completions in 1.000000 seconds
HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Re

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

In [32]:
# Display the "quadro" of `ep`, the variable left over from the evaluation loop.
ep["quadro"]

'alarme'

In [33]:
# Only add the label when the results have none yet: assigning unconditionally would
# overwrite the "quadro" of every row (from every question file) with the value of
# the last episode the loop reached.
if "quadro" not in df.columns:
    df["quadro"] = ep["quadro"]
df

Unnamed: 0,question,answer,ground_truth,contexts,answer_relevancy,faithfulness,context_relevancy,quadro
0,"No episódio 'alexandra', quem é a protagonista...",A protagonista da história contada é Alexandra.,Alexandra Mendes Leite,"[E nos artesanatos, foi a reciclagem. Trabalha...",0.374252,1.0,0.2,alarme
1,"No episódio 'alexandra', o que happeniu com a ...",Não há informação suficiente no episódio.,Um grande incêndio que destruiu grande parte d...,[Então era péssimo. — Com 12 anos a Alexandra ...,0.0,0.0,0.190476,alarme
2,"No episódio 'alexandra', como a comunidade Chi...",Não há informação suficiente no episódio.,Através do apoio do Fundo Brasil e da capacita...,[Vocês lembram que eu falei que a Alexandra te...,0.0,0.0,0.047619,alarme
0,"No episódio 'alessandra', quem é Alessandra Fé...",Alessandra é a mãe que compartilhou sua experi...,Alessandra Félix é uma pedagoga que criou o co...,[Eu não queria aquilo. Mãe nenhuma tem manual ...,0.465796,0.25,0.235294,alarme
1,"No episódio 'alessandra', o que é o coletivo V...",Não há informação suficiente no episódio.,O coletivo Vozes de Mães e Familiares do Siste...,[Aline Andrade: Meu nome é Aline Andrade e eu ...,0.0,0.0,0.352941,alarme
2,"No episódio 'alessandra', por que o Fundo Bras...",Não há informação suficiente no episódio.,O Fundo Brasil de Direitos Humanos apoia proje...,[A família toda da Alessandra é de uma cidade ...,0.0,0.0,0.25,alarme


In [19]:
# Prompt adapted from a combination of the following prompts:
#   - https://github.com/run-llama/llama_index/blob/a87b63fce3cc3d24dc71ae170a8d431440025565/llama_index/agent/react/prompts.py
#   - https://smith.langchain.com/hub/hwchase17/react-chat
#
# Template placeholders: {tools}, {tool_names}, {input} and {agent_scratchpad}.
# ReActRAGPipeline turns this string into a PromptTemplate.

REACT_CHAT_SYSTEM_HEADER = """\
You are designed to help answering questions regarding the content of the Não Inviabilize podcast episodes.
You are a Large Language Model trained by Meta AI. As a language model, you are able to generate human-like text based on the input you receive, allowing yourself to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.
You are constantly learning and improving, and your capabilities are constantly evolving. You are able to process and understand large amounts of text, and can use this knowledge to provide accurate and informative responses to a wide range of questions.
Additionally, you are able to generate your own text based on the input it receives, allowing yourself to engage in discussions and provide explanations and descriptions on a wide range of topics.
Overall, you are a powerful tool that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics. 
When the user needs help with a specific question about something that was discussed on an specific episode of the Não Inviabilize podcast, you are here to assist.

TOOLS:
------
You have access to a variety of tools that can help you get information to answer questions.
You are responsible for using the tools in any sequence you deem appropriate to complete the task at hand.
This may require breaking the task into subtasks and using different tools to complete each subtask.
You have access to the following tools:
{tools}

To use a tool, please use the following format:
```
Thought: Do I need to use a tool? Yes
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
```

When you have a response to say to the Human, or if you do not need to use a tool, you MUST use the format:
```
Thought: Do I need to use a tool? No
Final Answer: [your response here]
```
But be careful, you MUST answer the question in Portuguese only!

If there is not enough information in the context passages, answer "Não há informação suficiente no episódio."

Begin!
User question: {input}

{agent_scratchpad}
"""

class ReActRAGPipeline:
    """
    RAG pipeline driven by a LangChain ReAct agent.

    The agent has two tools: one that resolves the episode title a question refers to
    and one that retrieves relevant passages of an episode. Every passage retrieved
    while answering is recorded and returned together with the answer.
    """

    def __init__(
        self,
        retriever: MultiIndexRetriever, 
        index_detector: IndexDetector,
        verbose: bool = False,
    ) -> None:
        """
        Args:
            retriever: retriever used by the passage-search tool.
            index_detector: detector used by the episode-title tool.
            verbose: forwarded to the `AgentExecutor`.

        Raises:
            KeyError: if the GROQ_API_KEY environment variable is not set.
        """
        self.index_detector: IndexDetector = index_detector
        self.retriever: MultiIndexRetriever = retriever

        # Passages retrieved during the current `generate_answer` call. The lock
        # serialises `generate_answer`, so concurrent calls cannot mix their passages.
        self._search_history = []
        self._search_history_lock = threading.Lock()

        base_prompt = PromptTemplate.from_template(REACT_CHAT_SYSTEM_HEADER)
        llm = ChatGroq(
            temperature=LLM_TEMPERATURE,
            model_name=LLM_MODEL,
            api_key=os.environ['GROQ_API_KEY']
        )

        tools = self._build_tools()
        agent = create_react_agent(llm, tools, base_prompt)
        self.agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=verbose, handle_parsing_errors=True)

    def _build_tools(self) -> list:
        """
        Create the agent tools as closures bound to this instance.

        `@tool` cannot decorate the methods directly: it wraps the plain function at
        class-definition time, so the resulting tool is never bound to an instance and
        `self` is not supplied when the agent calls it. Both tools also take a single
        string, because a ReAct agent passes its "Action Input" as one string.
        """

        @tool
        def search_episode_title(question: str) -> str:
            """
            Returns the title of the episode that is being asked about in the given question.
            In order to perform this search, you must provide the question in Portuguese.
            """
            return self._search_episode_title(question)

        @tool
        def search_relevant_passages(query: str) -> list[str]:
            """
            Returns a sequence of relevant contexts passages for the given question about a specific episode in the podcast.
            The input MUST be a single line in the format: episode title | question in Portuguese
            """
            title_part, separator, question_part = query.partition('|')
            if separator:
                episode_title = title_part.strip().strip('\'"')
                question = question_part.strip()
            else:
                episode_title, question = '', query.strip()

            # Title missing or unknown: let the index detector resolve it.
            if episode_title not in self.retriever.indexes:
                episode_title = self._search_episode_title(question)

            try:
                return self._search_relevant_passages(episode_title, question)
            except ValueError:
                # Unknown index: return no passages instead of aborting the agent run.
                return []

        return [search_episode_title, search_relevant_passages]

    def _search_relevant_passages(self, episode_title: str, question: str) -> list[str]:
        """
        Retrieve the passages of `episode_title` relevant to `question` and record them
        in the search history.

        Raises:
            ValueError: if `episode_title` is not a known index.
        """
        contexts = self.retriever.retrieve(episode_title, question)
        self._search_history += contexts
        return contexts

    def _search_episode_title(self, question: str) -> str:
        """Return the episode title (index name) that `question` is about."""
        return self.index_detector.detect_index(question)

    def generate_answer(self, question: str) -> RAGResponse:
        """
        Answer `question` with the agent.

        Args:
            question: the user question, in Portuguese.

        Returns:
            RAGResponse: the agent's final answer and every passage retrieved for it.
        """
        # `with` releases the lock even if the agent raises; a bare acquire()/release()
        # pair left it held forever after the first failure.
        with self._search_history_lock:
            self._search_history = []
            try:
                # Use the caller's question (a sample question was hard-coded here before).
                output = self.agent_executor.invoke({"input": question})
                contexts = self._search_history[:]
            finally:
                self._search_history = []

        return RAGResponse(output['output'], contexts)

In [None]:
# Smoke test: build the ReAct pipeline from the retriever and index detector created
# earlier and ask one sample question.
react_rag_pipeline = ReActRAGPipeline(
    retriever=multi_index_retriever, 
    index_detector=index_detector,
)
react_rag_pipeline.generate_answer("No episódio 'bia', quem é a Bia e o que aconteceu com ela?")

In [None]:
class GroqClient:
    '''
    Interface for using the Groq API

    Implements a rate limit control for multi-threading use: when a call hits a rate
    limit, one thread holds a class-wide lock while it sleeps, and every other call
    waits on that lock before sending its next request.

    NOTE(review): this cell redefines the GroqClient declared earlier in the notebook.
    The constructor therefore also accepts `llm_model`, so the existing
    `GroqClient(llm_model=..., api_key=...)` calls keep working after this cell runs.
    '''

    # Parameter documentation: https://console.groq.com/docs/text-chat
    # Default model. This used to read `LLM_MODEL_NAME`, which is not defined anywhere
    # in the notebook (the constant is called LLM_MODEL), so the cell raised NameError.
    _model_name = LLM_MODEL
    _context_size = LLM_CONTEXT_SIZE
    _temperature = LLM_TEMPERATURE
    _top_p = LLM_TOP_P
    _stop = None
    _stream = False

    # Mutex lock shared by every instance; held only while backing off.
    _rate_lock = threading.Lock()

    def __init__(self, api_key, llm_model=None):
        '''
        GroqClient constructor.

        Args:
            api_key (str): Groq API key used by this client.
            llm_model (str): optional model name; defaults to LLM_MODEL.
        '''
        self._client = Groq(api_key=api_key)
        if llm_model is not None:
            self._model_name = llm_model


    def __call__(self, prompt: str) -> str:
        '''
        Generates the model response, retrying for as long as the API reports a rate limit.

        Args:
            prompt (str): prompt to send to the model (sent as a single system message).

        Returns:
            str: model response.
        '''

        while True:
            # Gate: block while another thread is backing off.
            with GroqClient._rate_lock:
                pass

            try:
                chat_completion = self._client.chat.completions.create(
                    messages=[
                        {
                            "role": "system",
                            "content": prompt,
                        }
                    ],
                    model=self._model_name,
                    temperature=self._temperature,
                    max_tokens=self._context_size,
                    top_p=self._top_p,
                    stop=self._stop,
                    stream=self._stream,
                )
            except RateLimitError as exception:
                # Keep the last rate-limit error around for inspection.
                GroqClient.error = exception
                # Non-blocking acquire instead of the racy `locked()` + `acquire()`;
                # `finally` releases the lock even if the sleep is interrupted, which
                # would otherwise leave it held and make every later call hang.
                if GroqClient._rate_lock.acquire(blocking=False):
                    try:
                        time.sleep(1.75)
                    finally:
                        GroqClient._rate_lock.release()
                continue

            return chat_completion.choices[0].message.content

In [None]:
class ChatCroqWrapper(ChatGroq):
    """
    `ChatGroq` that retries generation while the API reports a rate limit.

    It shares `GroqClient._rate_lock` with `GroqClient`, so a back-off started by either
    class pauses the synchronous calls of both.

    The previous version never called the model: both loops were guarded by
    `while result is not None` with `result` starting as None, so they returned None.
    The unused `_request_lock` attribute and the pass-through `__init__` were dropped.
    """

    def _generate(self, *args, **kwargs):
        """Synchronous generation; retries until no rate-limit error is raised."""
        while True:
            # Gate: block while another thread is backing off.
            with GroqClient._rate_lock:
                pass

            try:
                return super()._generate(*args, **kwargs)
            except RateLimitError as exception:
                GroqClient.error = exception
                # Only the thread that wins the lock sleeps; `finally` guarantees the
                # lock is released even if the sleep is interrupted.
                if GroqClient._rate_lock.acquire(blocking=False):
                    try:
                        time.sleep(2)
                    finally:
                        GroqClient._rate_lock.release()

    async def _agenerate(self, *args, **kwargs):
        """Asynchronous generation; retries until no rate-limit error is raised."""
        import asyncio

        while True:
            # Poll rather than block: a blocking acquire would stall the event loop.
            while GroqClient._rate_lock.locked():
                await asyncio.sleep(0.1)

            try:
                # The coroutine must be awaited; it was previously returned un-awaited.
                return await super()._agenerate(*args, **kwargs)
            except RateLimitError as exception:
                GroqClient.error = exception
                # Non-blocking back-off (time.sleep would freeze the event loop). The
                # thread lock is deliberately not held across an await.
                await asyncio.sleep(2)