In [21]:
import os

# Working directory for the notebook: the relative paths used further down
# ('testing/...', 'store/...') are resolved against it.
# The location can be overridden through the REAL_PROJECT_DIR environment
# variable so the notebook is not tied to a single machine; the original
# path stays as the default.
new_directory = os.environ.get(
    "REAL_PROJECT_DIR", "E:/subject/compulsory_elective_2/real_project/"
)
os.chdir(new_directory)

In [25]:
#configuration
import os
from dotenv import load_dotenv
# Read key/value pairs from a .env file into the process environment.
load_dotenv()

# Assigning os.getenv(...) straight back into os.environ raises an opaque
# "TypeError: str expected, not NoneType" when a key is absent, so check
# explicitly and report which keys are missing (blank values count as missing).
missing_keys = [
    key for key in ('OPENAI_API_KEY', 'LANGCHAIN_API_KEY') if not os.getenv(key)
]
if missing_keys:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(missing_keys)
        + ". Define them in the .env file or in the shell environment."
    )

# LangSmith tracing settings read by LangChain from the environment.
os.environ['LANGCHAIN_TRACING_V2'] = "true"
os.environ['LANGCHAIN_ENDPOINT'] = "https://api.smith.langchain.com"

In [22]:
import json

# Load the question/answer records used as the evaluation set.
# The encoding is pinned to UTF-8 so that parsing does not depend on the
# platform's default locale encoding.
with open('testing/cooking/cooking.json', 'r', encoding='utf-8') as dataset_file:
    dataset = json.load(dataset_file)

# Show the first record to check its structure.
dataset[0]

{'input': {'input_question': "What is the purpose of the 'Making the Most of Every Bite' cookbook?"},
 'output': 'It is designed to provide high-protein, high-calorie recipes for patients experiencing weight loss due to illness.'}

In [13]:
# Question text of the first record (nested under 'input' -> 'input_question').
dataset[0]['input']['input_question']

"What is the purpose of the 'Making the Most of Every Bite' cookbook?"

In [14]:
# Reshape the raw records into the inputs/outputs lists uploaded to LangSmith.
def format_dataset(dataset):
    """Split raw records into parallel lists of input and output dicts.

    Args:
        dataset: iterable of records shaped like
            {'input': {'input_question': ...}, 'output': ...}.

    Returns:
        A tuple (inputs, outputs): inputs is a list of {"question": ...}
        dicts and outputs a list of {"answer": ...} dicts, in record order.
    """
    # Build both dicts per record in one pass so the iterable is consumed once.
    pairs = [
        ({"question": record['input']['input_question']}, {"answer": record['output']})
        for record in dataset
    ]
    inputs = [question for question, _ in pairs]
    outputs = [answer for _, answer in pairs]
    return inputs, outputs

In [15]:
# Split the loaded records into parallel lists of question and answer dicts.
inputs, outputs = format_dataset(dataset)

In [16]:
# Preview the first five formatted inputs.
inputs[:5]

[{'question': "What is the purpose of the 'Making the Most of Every Bite' cookbook?"},
 {'question': 'Who endorsed the contents of the cookbook?'},
 {'question': 'What is a common ingredient used to enrich milk in the recipes?'},
 {'question': 'What type of soups does the cookbook emphasize for patients with difficulty swallowing?'},
 {'question': 'What is the recommendation for consuming fluids during nausea?'}]

### Create dataset in LangSmith

In [17]:
from langsmith import Client

client = Client()

dataset_name = 'Cooking Testing Dataset'

# The LangSmith dataset gets its own name instead of rebinding `dataset`,
# which still holds the records loaded from the JSON file; reusing the name
# would break any re-run of the cells that index into those records.
# Creation is guarded so that re-running this cell neither fails on an
# already existing dataset nor uploads the examples a second time.
if client.has_dataset(dataset_name=dataset_name):
    langsmith_dataset = client.read_dataset(dataset_name=dataset_name)
else:
    langsmith_dataset = client.create_dataset(dataset_name=dataset_name)
    client.create_examples(
        inputs=inputs,
        outputs=outputs,
        dataset_id=langsmith_dataset.id
    )

### Create chatbot for testing

In [24]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_history_aware_retriever, create_retrieval_chain

In [26]:
#setup retriever
# Embedding model passed to FAISS when loading the saved index.
embeddings = OpenAIEmbeddings()
# Load the vector store saved under 'store/cooking' (relative to the working directory).
# NOTE(review): allow_dangerous_deserialization=True presumably opts in to
# pickle-based loading -- confirm against the LangChain docs and only load
# index folders that were produced by a trusted source.
vectorsdb = FAISS.load_local(
    'store/cooking', embeddings=embeddings,
    allow_dangerous_deserialization=True
)
# Retriever view over the vector store, using the library's default search settings.
retriever = vectorsdb.as_retriever()

In [32]:
#set up Q&A chain
# Chat model shared by the question-rewriting step and the answering step.
llm = ChatOpenAI(model='gpt-4o')

# Instruction for the rewriting step: turn a follow-up question into a
# standalone one, without answering it.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

# The rewriting prompt has to contain the conversation it is asked to use:
# without the "chat_history" placeholder any history passed to the chain is
# silently dropped and the model only ever sees the latest question.
# Per the create_history_aware_retriever documentation, calls that supply no
# chat_history send the input straight to the retriever, so single-turn
# calls are not affected by the extra placeholder.
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

# Instruction for the answering step; {context} receives the retrieved documents.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)


#create Q&A chain
document_chain = create_stuff_documents_chain(llm, qa_prompt)

#create history aware chain
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)

#create retrieval chain
retrieval_chain = create_retrieval_chain(history_aware_retriever, document_chain)


In [33]:
# Smoke test: ask the chain the first question of the dataset and show its answer.
sample_question = "What is the purpose of the 'Making the Most of Every Bite' cookbook?"
response = retrieval_chain.invoke({"input": sample_question})
response['answer']

"The purpose of the 'Making the Most of Every Bite' cookbook is to provide high protein, high calorie recipes specifically designed for patients and their carers."

In [92]:
def predict_rag_answer(example: dict):
    """Run the retrieval chain on one example and keep only the answer.

    Target function for evaluations that grade the answer text alone.

    Args:
        example: mapping with a 'question' key holding the question text.

    Returns:
        dict with a single 'answer' key taken from the chain response.
    """
    chain_input = {"input": example['question']}
    chain_output = retrieval_chain.invoke(chain_input)
    return {"answer": chain_output["answer"]}

def predict_rag_answer_with_context(example: dict):
    """Run the retrieval chain on one example and keep answer and context.

    Target function for evaluations that also need the retrieved documents
    (document relevance and hallucination grading).

    Args:
        example: mapping with a 'question' key holding the question text.

    Returns:
        dict with the 'answer' and 'context' entries of the chain response.
    """
    chain_input = {"input": example['question']}
    chain_output = retrieval_chain.invoke(chain_input)
    return {field: chain_output[field] for field in ("answer", "context")}

def accuracy(results):
    """Return the mean of the collected evaluator scores.

    Args:
        results: iterable of numeric scores (for example the 0/1 grades
            appended by the evaluators).

    Returns:
        The arithmetic mean of the scores, or 0.0 when there are none
        (instead of raising ZeroDivisionError, e.g. when the accuracy cell
        is run before any evaluation has produced scores).
    """
    # Materialise once so generators and other one-shot iterables work too.
    scores = list(results)
    if not scores:
        return 0.0
    return sum(scores) / len(scores)

### Response vs reference answer

In [None]:
from langchain import hub
from langchain_openai import ChatOpenAI

# Grading prompt for "answer vs reference", pulled from the LangChain Hub.
grade_prompt_answer_accuracy = hub.pull("langchain-ai/rag-answer-vs-reference")
# `prompt` is a second name bound to the same prompt object.
prompt = grade_prompt_answer_accuracy

# Scores appended by answer_evaluator, one per graded run.
res_answer_evaluator = list()

def answer_evaluator(run, example) -> dict:
    """
    Grade a RAG answer against the reference answer with an LLM grader.

    Args:
        run: object whose `outputs` mapping holds the generated 'answer'.
        example: object whose `inputs` holds the 'question' and whose
            `outputs` holds the reference 'answer'.

    Returns:
        {"key": "answer_v_reference_score", "score": <grader "Score" value>}.
        The score is also appended to the module-level res_answer_evaluator.
    """
    # Question, ground-truth answer and chain answer, keyed as the grading prompt expects
    grader_inputs = {
        "question": example.inputs['question'],
        "correct_answer": example.outputs['answer'],
        "student_answer": run.outputs["answer"],
    }

    # Grading prompt piped into a temperature-0 grader model
    grader = grade_prompt_answer_accuracy | ChatOpenAI(model="gpt-4o", temperature=0)

    # The grader result is indexed by "Score" to obtain the grade
    grade = grader.invoke(grader_inputs)["Score"]
    res_answer_evaluator.append(grade)

    return {"key": "answer_v_reference_score", "score": grade}

In [81]:
from langsmith.evaluation import evaluate

# Start from an empty score list: the evaluator appends to it on every run,
# so without the reset a re-run of this cell would mix scores from earlier
# experiments into the accuracy computed afterwards.
res_answer_evaluator.clear()

# Run the chain over the LangSmith dataset and grade each answer against
# its reference answer.
experiment_results = evaluate(
    predict_rag_answer,
    data=dataset_name,
    evaluators=[answer_evaluator],
    experiment_prefix="cooking",
    metadata={"version": "LCEL context, gpt-4o"},
)

View the evaluation results for experiment: 'cooking-bdbd7969' at:
https://smith.langchain.com/o/e0a07099-8389-43f1-bdb8-a03361a3989c/datasets/f243d5d8-a90c-471f-adc0-f929a3170fea/compare?selectedSessions=a4a7cb60-929e-4c50-a34a-b66addd80d7d




10it [00:06,  1.63it/s]


In [83]:
# Mean of the answer-vs-reference scores collected by answer_evaluator.
print(accuracy(res_answer_evaluator))

0.9


### Response vs input

In [84]:
# Grading prompt for answer helpfulness, pulled from the LangChain Hub.
grade_prompt_answer_helpfulness = hub.pull("langchain-ai/rag-answer-helpfulness")
# `prompt` is a second name bound to the same prompt object.
prompt = grade_prompt_answer_helpfulness

# Scores appended by answer_helpfulness_evaluator, one per graded run.
res_answer_helpfulness_evaluator = list()

def answer_helpfulness_evaluator(run, example) -> dict:
    """
    Grade how helpful a RAG answer is for the question, using an LLM grader.

    Args:
        run: object whose `outputs` mapping holds the generated 'answer'.
        example: object whose `inputs` mapping holds the 'question'.

    Returns:
        {"key": "answer_helpfulness_score", "score": <grader "Score" value>}.
        The score is also appended to the module-level
        res_answer_helpfulness_evaluator.
    """
    # Question and chain answer, keyed as the grading prompt expects
    grader_inputs = {
        "question": example.inputs["question"],
        "student_answer": run.outputs["answer"],
    }

    # Grading prompt piped into a temperature-0 grader model
    grader = grade_prompt_answer_helpfulness | ChatOpenAI(model="gpt-4o", temperature=0)

    # The grader result is indexed by "Score" to obtain the grade
    grade = grader.invoke(grader_inputs)["Score"]
    res_answer_helpfulness_evaluator.append(grade)

    return {"key": "answer_helpfulness_score", "score": grade}

In [85]:
# Start from an empty score list: the evaluator appends to it on every run,
# so without the reset a re-run of this cell would mix scores from earlier
# experiments into the accuracy computed afterwards.
res_answer_helpfulness_evaluator.clear()

# Run the chain over the LangSmith dataset and grade the helpfulness of
# each answer with respect to its question.
experiment_results = evaluate(
    predict_rag_answer,
    data=dataset_name,
    evaluators=[answer_helpfulness_evaluator],
    experiment_prefix="rag-answer-helpfulness",
    metadata={"version": "LCEL context, gpt-4o"},
)

View the evaluation results for experiment: 'rag-answer-helpfulness-65d2655e' at:
https://smith.langchain.com/o/e0a07099-8389-43f1-bdb8-a03361a3989c/datasets/f243d5d8-a90c-471f-adc0-f929a3170fea/compare?selectedSessions=e2f527f2-9594-4e49-9eda-534a72cb9d50




10it [00:07,  1.32it/s]


In [86]:
# Mean of the helpfulness scores collected by answer_helpfulness_evaluator.
print(accuracy(res_answer_helpfulness_evaluator))

1.0


### Response vs retrieved docs

In [93]:
# Grading prompt for hallucination checks, pulled from the LangChain Hub.
grade_prompt_hallucinations = hub.pull("langchain-ai/rag-answer-hallucination")
# `prompt` is a second name bound to the same prompt object.
prompt = grade_prompt_hallucinations

# Scores appended by answer_hallucination_evaluator, one per graded run.
res_answer_hallucination_evaluator = list()

def answer_hallucination_evaluator(run, example) -> dict:
    """
    Grade a RAG answer against the documents it was generated from.

    Args:
        run: object whose `outputs` mapping holds the generated 'answer'
            and the retrieved 'context'.
        example: dataset example; part of the evaluator signature but not
            read here, because the grading prompt only takes the documents
            and the answer.

    Returns:
        {"key": "answer_hallucination", "score": <grader "Score" value>}.
        The score is also appended to the module-level
        res_answer_hallucination_evaluator.
    """

    # Retrieved documents the answer was generated from
    contexts = run.outputs["context"]

    # RAG answer
    prediction = run.outputs["answer"]

    # LLM grader
    llm = ChatOpenAI(model="gpt-4o", temperature=0)

    # Structured prompt
    answer_grader = grade_prompt_hallucinations | llm

    # Get score
    score = answer_grader.invoke({"documents": contexts,
                                  "student_answer": prediction})
    score = score["Score"]
    res_answer_hallucination_evaluator.append(score)

    return {"key": "answer_hallucination", "score": score}

In [94]:
# Start from an empty score list: the evaluator appends to it on every run,
# so without the reset a re-run of this cell would mix scores from earlier
# experiments into the accuracy computed afterwards.
res_answer_hallucination_evaluator.clear()

# Run the chain (keeping the retrieved context) over the LangSmith dataset
# and grade each answer against the documents it was generated from.
experiment_results = evaluate(
    predict_rag_answer_with_context,
    data=dataset_name,
    evaluators=[answer_hallucination_evaluator],
    experiment_prefix="rag-answer-hallucination",
    metadata={"version": "LCEL context, gpt-4o"},
)

View the evaluation results for experiment: 'rag-answer-hallucination-ff4b81bc' at:
https://smith.langchain.com/o/e0a07099-8389-43f1-bdb8-a03361a3989c/datasets/f243d5d8-a90c-471f-adc0-f929a3170fea/compare?selectedSessions=c5525fc6-8351-438b-8df8-5ddfc14b15e6




10it [00:15,  1.51s/it]


In [95]:
# Mean of the hallucination scores collected by answer_hallucination_evaluator.
print(accuracy(res_answer_hallucination_evaluator))

0.8


### Retrieved docs vs input

In [101]:
# Grading prompt for document relevance, pulled from the LangChain Hub.
grade_prompt_doc_relevance = hub.pull("langchain-ai/rag-document-relevance")

# Scores appended by docs_relevance_evaluator, one per graded run.
res_docs_relevance_evaluator = list()

def docs_relevance_evaluator(run, example) -> dict:
    """
    Grade the retrieved documents against the question with an LLM grader.

    Args:
        run: object whose `outputs` mapping holds the retrieved 'context'.
        example: object whose `inputs` mapping holds the 'question'.

    Returns:
        {"key": "document_relevance", "score": <grader "Score" value>}.
        The score is also appended to the module-level
        res_docs_relevance_evaluator.
    """
    # Question and retrieved documents, keyed as the grading prompt expects
    grader_inputs = {
        "question": example.inputs["question"],
        "documents": run.outputs["context"],
    }

    # Grading prompt piped into a temperature-0 grader model
    grader = grade_prompt_doc_relevance | ChatOpenAI(model="gpt-4o", temperature=0)

    # The grader result is indexed by "Score" to obtain the grade
    grade = grader.invoke(grader_inputs)["Score"]
    res_docs_relevance_evaluator.append(grade)

    return {"key": "document_relevance", "score": grade}

In [102]:
# Start from an empty score list: the evaluator appends to it on every run,
# so without the reset a re-run of this cell would mix scores from earlier
# experiments into the accuracy computed afterwards.
res_docs_relevance_evaluator.clear()

# Run the chain (keeping the retrieved context) over the LangSmith dataset
# and grade the retrieved documents against each question.
experiment_results = evaluate(
    predict_rag_answer_with_context,
    data=dataset_name,
    evaluators=[docs_relevance_evaluator],
    experiment_prefix="rag-doc-relevance",
    metadata={"version": "LCEL context, gpt-4o"},
)

View the evaluation results for experiment: 'rag-doc-relevance-acac31fd' at:
https://smith.langchain.com/o/e0a07099-8389-43f1-bdb8-a03361a3989c/datasets/f243d5d8-a90c-471f-adc0-f929a3170fea/compare?selectedSessions=95cf8b0b-0b26-4397-8eee-294c67eda8cb




10it [00:14,  1.43s/it]


In [103]:
# Mean of the document-relevance scores collected by docs_relevance_evaluator.
accuracy(res_docs_relevance_evaluator)

1.0