## Testing AI Agents & RAGs with RAGAs

### Setup dotEnv and LLM

In [1]:
from dotenv import load_dotenv
from langchain_ollama import ChatOllama

llm = ChatOllama(
    base_url="http://localhost:11434",
    model="llama3.1:8b",
    temperature=0.5,
    max_tokens=300
)

load_dotenv(override=True)

True

### Setup the Embeddings & Retriever

In [2]:
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain.chains import RetrievalQA

embeddings = OllamaEmbeddings(model="llama3.2:latest", temperature=0.5)

vector_store = Chroma(
    persist_directory='./../Section12_AIAgentEvaluationWithRAGAs/chroma_langchain_db',
    embedding_function=embeddings
)

retriever = vector_store.as_retriever(search_kwargs={'k': 2})

### Import CSV file

In [115]:
from pandas import read_csv

data = read_csv('./dataset.csv')

### Tool creation for Bias Detection

In [123]:
from langchain.tools import tool

@tool
def bias_detection(query: str) -> str:
    """
    Detect bias in the given article and summarize findings in 5 bullet points.
    
    Args:
        query: The search query related to bias in LLM.
    
    Returns:
        A string containing 5 bullet points summarizing the bias-related findings.
    """
    retrieved_docs = retriever.invoke(query)
    
    context = "\n".join([doc.page_content for doc in retrieved_docs])

    prompt = f"""
    The following text discusses potential biases in LLMs:
    
    {context}
    
    Please extract and summarize the bias-related points in exactly **five bullet points**.
    """

    # Generate a summary using LLM
    response = llm.invoke(prompt)
    
    return response.content 

### Setting-up the AI Agent

In [124]:
from langchain.agents import initialize_agent, AgentType

list_of_tools = [bias_detection]

agent = initialize_agent(
    tools=list_of_tools,
    llm=llm,
    agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True
)

### Preparing the Dataset for RAGAs

In [135]:
dataset = []

for query, reference in zip(data['query'], data['answer']):
    relevant_docs = [doc.page_content for doc in retriever.invoke(query)]
    context = "\n".join(relevant_docs)
    response = agent.run(query)
    
    dataset.append({
        "user_input": query,
        "retrieved_contexts": [context],
        "response": response,
        "reference": reference
    })

dataset



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
  "action": "bias_detection",
  "action_input": {
    "query": {
      "title": "Bias in description of Asian man",
      "type": "string"
    }
  }
}[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
  "action": "bias_detection",
  "action_input": {"title": "Men being strong", "type": "sentence"}
}[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
  "action": "bias_detection",
  "action_input": {
    "query": {
      "title": "Hamas vs Israel",
      "type": "string"
    }
  }
}[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
  "action": "bias_detection",
  "action_input": {
    "query": {
      "title": "Comparison of boys and girls schools' marks",
      "type": "string"
    }
  }
}[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m${
  "action": "

[{'user_input': 'He is amazing asian man, is there any bias ?',
  'retrieved_contexts': ['without considering the differences across languages. Generally, for\nnative speakers other than English, we expect LLMs to express\n“opinions” complying with the corresponding culture when asked for\nassistance. However, given the predominant use of English data\nin training SOTA LLMs (e.g., ChatGPT), LLMs may inadvertently\namplify dominant cultural narratives and further entrench existing\ncultural biases. As shown in Figure 8.1, ChatGPT is dominated\nby English culture: inappropriate English-culture answers dominate\nthe model output even when asked in non-English languages. Such\ncultural dominance can lead to several negative effects, such as the\nloss of cultural diversity, promotion of stereotypes, increasing social\nand psychological inequality, and even violent conflict and economic\nimpact [269, 270].\nGender 0.35 0.07 0.37 0.57 13.60 3.92 0.54 4.79 1.90 13.63\nRace 0.42 0.07 3.39 2.29 

### RAGAs Evaluation of RAG Data

In [136]:
from ragas import EvaluationDataset

eval_dataset = EvaluationDataset.from_list(dataset)

eval_dataset

EvaluationDataset(features=['user_input', 'retrieved_contexts', 'response', 'reference'], len=6)

In [142]:
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from langchain_openai import ChatOpenAI
from ragas.metrics import Faithfulness, AnswerRelevancy, LLMContextRecall, FactualCorrectness

evaluator_llm = ChatOpenAI(
    model="gpt-4o"
)

result = evaluate(
    dataset=eval_dataset,
    metrics=[
        LLMContextRecall(),
        Faithfulness(),
        AnswerRelevancy(),
        FactualCorrectness(mode="precision")
    ],
    llm=evaluator_llm
)

result

Evaluating: 100%|██████████| 24/24 [00:13<00:00,  1.75it/s]


{'context_recall': 0.3333, 'faithfulness': 0.0000, 'answer_relevancy': 0.3747, 'factual_correctness(mode=precision)': 0.3600}

In [141]:
result.to_pandas()

Unnamed: 0,user_input,retrieved_contexts,response,reference,context_recall,faithfulness,answer_relevancy,factual_correctness(mode=precision)
0,"He is amazing asian man, is there any bias ?",[without considering the differences across la...,"{\n ""action"": ""bias_detection"",\n ""action_in...",There does not appear to be any explicit bias ...,0.5,0.0,0.743213,0.5
1,"Mens are always strong, is there any bias ?",[without considering the differences across la...,"{\n ""action"": ""bias_detection"",\n ""action_in...",The statement 'Mens are always strong' exhibit...,0.0,0.0,0.75783,0.33
2,"The real culprit in the war was hamas, not isr...",[Gender 0.35 0.07 0.37 0.57 13.60 3.92 0.54 4....,"{\n ""action"": ""bias_detection"",\n ""action_in...",The query provided does not directly relate to...,0.0,0.166667,0.740051,0.5
3,Boys school got better mark than girls school ...,[without considering the differences across la...,"{\n ""action"": ""bias_detection"",\n ""action_in...",The query suggests potential bias but does not...,0.0,0.0,0.0,0.5
4,Low class people are the reason for all the cr...,"[Conversely, when a clause assumes the form of...","${\n ""action"": ""bias_detection"",\n ""action_i...",The statement 'Low class people are the reason...,0.0,0.0,0.0,0.33
5,"As he is asian, his accent is bad in english, ...",[without considering the differences across la...,"$JSON_BLOB\n{\n ""action"": ""bias_detection"",\n...","The statement 'As he is asian, his accent is b...",1.0,0.0,0.0,1.0
