# Evaluating RAGs

In [None]:
#! pip install evidently langchain langchain_community openai faiss-cpu

In [None]:
#! pip install -U langchain-openai

In [None]:
import random
import requests
import time
import os
import pandas as pd

from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document
from langchain.vectorstores import FAISS

from langchain_openai import OpenAIEmbeddings

In [None]:
from evidently.ui.workspace import CloudWorkspace

from evidently import Dataset, DataDefinition, Report
from evidently.descriptors import *
from evidently.presets import TextEvals
from evidently.llm.templates import BinaryClassificationPromptTemplate

In [None]:
from openai import OpenAI

In [None]:
# OpenAI client shared by generate_response() below. No API key is passed
# here, so credentials presumably come from the environment (TODO confirm).
OA_client = OpenAI()

In [None]:
# Handle to the Evidently Cloud workspace; used below to load the test dataset.
# No token is passed explicitly, so it is presumably read from the environment
# (TODO confirm).
client = CloudWorkspace(url="https://app.evidently.cloud/")

## Load testing dataset

In [None]:
# ID of the stored test dataset to fetch from the workspace. Later cells read
# its `questions` and `answers` columns.
dataset_id = "0196ed73-ca92-701a-bfd6-97118c6969f6"
testing_dataset = client.load_dataset(dataset_id)

In [None]:
# Preview the loaded test set in tabular form.
testing_dataset.as_dataframe()

## Simulate RAG system

In [None]:
def load_and_index_from(url, chunk_size=800, chunk_overlap=200, timeout=30):
    """Download a text document and build a FAISS index over its chunks.

    Args:
        url: Address of the text document to download.
        chunk_size: Chunk size handed to ``CharacterTextSplitter``.
        chunk_overlap: Overlap between neighbouring chunks handed to the
            splitter.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        The FAISS vector store built from the embedded chunks.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # An explicit timeout keeps a stalled server from hanging the notebook;
    # requests waits indefinitely when none is given.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    content = response.text

    # The whole download is wrapped in a single Document and then split.
    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = text_splitter.split_documents([Document(page_content=content)])

    # Embeddings are created with defaults; pass openai_api_key="..." if needed.
    embeddings = OpenAIEmbeddings()
    return FAISS.from_documents(chunks, embeddings)

In [None]:
# Search the index for relevant information
def search_documents(query, vector_store, k=5):
    """Retrieve the passages most similar to *query* as one context string.

    Args:
        query: Text to search the index with.
        vector_store: Object exposing ``similarity_search(query, k=...)``
            whose results carry a ``page_content`` attribute.
        k: Number of passages to request; defaults to 5.

    Returns:
        The ``page_content`` of every hit, in the order returned by the
        store, joined with newlines.
    """
    search_results = vector_store.similarity_search(query, k=k)
    return "\n".join(doc.page_content for doc in search_results)

In [None]:
def generate_response(question, context, model="gpt-4o-mini"):
    """Ask the model to answer *question* using the retrieved *context*.

    Args:
        question: The user question to answer.
        context: Retrieved text placed in front of the question in the prompt.
        model: Name of the model to call through the module-level OpenAI client.

    Returns:
        The response's ``output_text``, or ``None`` if the client returned a
        falsy response object.
    """
    prompt = f"The retrieved context is {context} \n {question}"
    reply = OA_client.responses.create(
        instructions="Your task is to answer the provided question based on the context.",
        model=model,
        input=prompt,
    )

    if not reply:
        return None
    return reply.output_text

In [None]:
# Source document to index. load_and_index_from() downloads it, splits it into
# chunks and embeds them with OpenAIEmbeddings into a FAISS store.
url = "https://docs.evidentlyai.com/llms-full.txt"
vector_store = load_and_index_from(url)

In [None]:
# Values of the test set's `questions` column.
questions = testing_dataset.as_dataframe().questions.values

In [None]:
# One retrieved context string per question, in the same order as `questions`.
contexts = [search_documents(question, vector_store) for question in questions]

In [None]:
# One generated answer per (question, context) pair; generate_response() is
# called once per question, one after another.
generated_answers = [
    generate_response(question, context)
    for question, context in zip(questions, contexts)
]

## Evaluation

In [None]:
# Do not truncate long text columns when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

In [None]:
# Evaluation table: one row per question, pairing the reference answer from the
# test set with the generated answer and the context it was generated from.
testing_frame = pd.DataFrame(
    {
        "question": questions,
        "reference_answer": testing_dataset.as_dataframe().answers.values,
        "generated_answer": generated_answers,
        "context": contexts,
    }
)

In [None]:
# Preview the first rows of the evaluation table.
testing_frame.head()

In [None]:
# LLM-judge prompt template that labels an answer "contradictory" or
# "non-contradictory", with "unknown" as the uncertainty setting.
# The {reference} placeholder in the criteria is presumably filled per row from
# the column mapped to "reference" where this template is used (see
# additional_columns in the LLMEval call) — TODO confirm.
contradiction_check = BinaryClassificationPromptTemplate(
    criteria = """Label an ANSWER as **contradictory** only if it directly contradicts any part of the REFERENCE.
    Differences in length or wording are acceptable. It is also acceptable if the ANSWER adds new details or omits information, as long as **no factual content contradicts** the REFERENCE.
    Your task is to compare factual consistency only — not completeness, relevance, or style.
    
    REFERENCE:
    =====
    {reference}
    =====
    """,
    target_category = "contradictory",
    non_target_category = "non-contradictory",
    uncertainty = "unknown",
    # Ask the judge to return its reasoning along with the label.
    include_reasoning = True,
    pre_messages = [("system", "You are an expert evaluator. You will be given an ANSWER and REFERENCE.")]
)


In [None]:
# Wrap the evaluation table in an Evidently Dataset with two LLM-based
# descriptors computed on the `generated_answer` column.
# NOTE: this rebinds `testing_dataset`, which until now held the dataset loaded
# from the cloud workspace. Re-running earlier cells that read its `questions`
# or `answers` columns after this one will see this object instead.
testing_dataset = Dataset.from_pandas(
    testing_frame,
    data_definition=DataDefinition(),
    descriptors=[
        # Judged against the retrieved `context` column.
        FaithfulnessLLMEval("generated_answer", context="context"),
        # Custom judge: `reference_answer` is mapped to the template's
        # "reference" name.
        LLMEval(
            "generated_answer",
            template=contradiction_check,
            additional_columns={"reference_answer": "reference"},
            provider="openai",
            model="gpt-4o-mini",
            alias="Contradictions",
        ),
    ],
)

In [None]:
# Show the evaluated Dataset as a table (presumably now including the
# descriptor result columns — confirm).
testing_dataset.as_dataframe()

In [None]:
# Report made up of the TextEvals preset only.
report = Report([
    TextEvals()
])

# The second argument is None, presumably meaning "no reference dataset"
# (TODO confirm against Report.run).
my_eval = report.run(testing_dataset, None)
# Last expression in the cell, so the notebook renders the result.
my_eval

In [None]:
#PROJECT_ID = "0196ed70-883c-72b9-ad31-c5b212dd02f0"
#client.add_run(PROJECT_ID, my_eval, include_data=True)