In [7]:
import os

# Project root that all relative paths in this notebook (testing/..., store/...)
# are resolved against. Set the PROJECT_ROOT environment variable to run the
# notebook on another machine; the original location is kept as the default.
new_directory = os.environ.get(
    "PROJECT_ROOT", "E:/subject/compulsory_elective_2/real_project/"
)
os.chdir(new_directory)

In [8]:
#configuration
import os
from dotenv import load_dotenv
load_dotenv()

# load_dotenv() has already copied the .env entries into os.environ, so the
# keys only need to be validated here. Assigning os.getenv(...) back into
# os.environ would raise an opaque TypeError when a key is missing, because
# os.environ only accepts str values.
_missing_keys = [
    key for key in ('OPENAI_API_KEY', 'LANGCHAIN_API_KEY') if not os.getenv(key)
]
if _missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Define them in the environment or in the .env file."
    )

# Enable LangSmith tracing and point the client at the hosted endpoint
os.environ['LANGCHAIN_TRACING_V2'] = "true"
os.environ['LANGCHAIN_ENDPOINT'] = "https://api.smith.langchain.com"

In [9]:
import json

# Read the question/answer pairs used for evaluation. The encoding is given
# explicitly so the file is decoded as UTF-8 regardless of the platform's
# locale default (e.g. cp1252 on Windows).
with open('testing/credit/credit.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)

# Show the first record to confirm the expected structure
dataset[0]

{'input': {'input_question': 'Which machine learning algorithm was found to have the highest accuracy in predicting credit scores in the study?'},
 'output': 'Logistic Regression, with an accuracy of 94%.'}

In [4]:
dataset[0]['input']['input_question']

'Which machine learning algorithm was found to have the highest accuracy in predicting credit scores in the study?'

In [10]:
# Format the dataset before creating the dataset in LangSmith
def format_dataset(dataset):
    """Split raw records into parallel lists of inputs and outputs.

    Each record is read as ``record['input']['input_question']`` and
    ``record['output']``.

    Returns:
        A tuple ``(inputs, outputs)`` where ``inputs`` is a list of
        ``{"question": ...}`` dicts and ``outputs`` is a list of
        ``{"answer": ...}`` dicts, in the same order as ``dataset``.
    """
    # Single pass over the records so one-shot iterables are handled as well
    pairs = [
        ({"question": record['input']['input_question']}, {"answer": record['output']})
        for record in dataset
    ]
    inputs = [question for question, _ in pairs]
    outputs = [answer for _, answer in pairs]
    return inputs, outputs

In [11]:
inputs, outputs = format_dataset(dataset)

In [12]:
inputs[:5]

[{'question': 'Which machine learning algorithm was found to have the highest accuracy in predicting credit scores in the study?'},
 {'question': 'What is the purpose of converting tabular data into images for credit scoring?'},
 {'question': 'What feature selection method proposed in the study outperformed PCA for credit scoring?'},
 {'question': 'What is the primary advantage of the Deep Genetic Cascade Ensemble of Classifiers (DGCEC) in credit scoring?'},
 {'question': 'What are the two major issues with machine learning models in credit scoring as identified in the survey?'}]

### Create dataset in LangSmith

In [13]:
from langsmith import Client

client = Client()

dataset_name = 'Credit Testing Dataset'

# Make the cell safe to re-run: creating a dataset whose name already exists
# fails, and re-uploading the examples would duplicate them. Reuse the
# existing dataset when there is one and only upload examples on first creation.
# NOTE(review): `dataset` rebinds the name that previously held the raw JSON
# records, so re-running format_dataset(dataset) after this cell will fail.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    dataset = client.create_dataset(dataset_name=dataset_name)
    client.create_examples(
        inputs=inputs,
        outputs=outputs,
        dataset_id=dataset.id
    )

### Create chatbot for testing

In [14]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_history_aware_retriever, create_retrieval_chain

In [15]:
# Set up the retriever: load the FAISS index persisted under 'store/credit'
# with OpenAI embeddings and expose it through the retriever interface.
embeddings = OpenAIEmbeddings()
# NOTE(review): allow_dangerous_deserialization=True opts in to loading a
# pickled index from disk; only use it on index files you created yourself,
# never on files from an untrusted source.
vectorsdb = FAISS.load_local(
    'store/credit', embeddings=embeddings,
    allow_dangerous_deserialization=True
)
retriever = vectorsdb.as_retriever()

In [50]:
# Set up the Q&A chain: the chat model shared by the question rewriter and the answerer
llm = ChatOpenAI(model='gpt-4o-mini')

# Instruction for rewriting a follow-up question into a standalone one
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

# The chat history must actually be part of the prompt, otherwise the
# "history aware" retriever rewrites the question without ever seeing the
# history. The placeholder is optional so calls without "chat_history"
# (as in the evaluation below) behave exactly as before.
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history", optional=True),
        ("human", "{input}"),
    ]
)

# Instruction for answering from the retrieved documents ({context})
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# Same optional history placeholder so the answering step can see earlier turns
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history", optional=True),
        ("human", "{input}"),
    ]
)


# Create the Q&A chain that stuffs the retrieved documents into qa_prompt
document_chain = create_stuff_documents_chain(llm, qa_prompt)

# Create the history-aware retriever (question rewriting + retrieval)
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)

# Create the retrieval chain: retrieve documents, then answer from them
retrieval_chain = create_retrieval_chain(history_aware_retriever, document_chain)


In [None]:
response = retrieval_chain.invoke(
    {"input": "Which machine learning algorithm was found to have the highest accuracy in predicting credit scores in the study?"}
)
response['answer']

In [52]:
def predict_rag_answer(example: dict):
    """Run the retrieval chain on ``example['question']`` and return only the answer.

    Intended as the target function for answer-level evaluation; the result
    is a dict with the single key ``"answer"``.
    """
    question = example['question']
    chain_output = retrieval_chain.invoke({"input": question})
    return dict(answer=chain_output["answer"])

def predict_rag_answer_with_context(example: dict):
    """Run the retrieval chain on ``example['question']`` and return answer plus context.

    Intended as the target function for evaluating the retrieved documents
    and hallucinations; the result is a dict with the keys ``"answer"`` and
    ``"context"``, both taken from the chain's response.
    """
    question = example['question']
    chain_output = retrieval_chain.invoke({"input": question})
    return dict(answer=chain_output["answer"], context=chain_output["context"])

def accuracy(results):
    """Return the mean of the collected evaluator scores.

    Args:
        results: A sized collection of numeric scores.

    Returns:
        ``sum(results) / len(results)``, or ``0.0`` when ``results`` is empty
        (for example when no evaluator run completed) instead of raising
        ``ZeroDivisionError``.
    """
    if len(results) == 0:
        return 0.0
    return sum(results) / len(results)

### Response vs reference answer

In [None]:
import time
from langchain import hub
from langchain_openai import ChatOpenAI

# Grade prompt
grade_prompt_answer_accuracy = prompt = hub.pull("langchain-ai/rag-answer-vs-reference")

res_answer_evaluator = []

def answer_evaluator(run, example) -> dict:
    """
    Grade the RAG answer against the reference answer with an LLM grader.

    Reads the question and reference answer from ``example`` and the
    generated answer from ``run.outputs``, passes them through the
    answer-vs-reference grading prompt, appends the score to
    ``res_answer_evaluator`` and returns it under the key
    ``"answer_v_reference_score"``.
    """
    # Values for the grading prompt, keyed by its template variables
    grader_inputs = {
        "question": example.inputs['question'],
        "correct_answer": example.outputs['answer'],
        "student_answer": run.outputs["answer"],
    }

    # Grading prompt piped into the grader model (temperature=0)
    grader_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    grading_chain = grade_prompt_answer_accuracy | grader_llm

    # The grader's response is indexed by "Score"
    grade = grading_chain.invoke(grader_inputs)["Score"]

    # Record the score for the aggregate accuracy, then pause for 6 seconds
    # (presumably to stay under an API rate limit — confirm)
    res_answer_evaluator.append(grade)
    time.sleep(6)

    return {"key": "answer_v_reference_score", "score": grade}

In [54]:
from langsmith.evaluation import evaluate

experiment_results = evaluate(
    predict_rag_answer,
    data=dataset_name,
    evaluators=[answer_evaluator],
    experiment_prefix="credit",
    metadata={"version": "LCEL context, gpt-4o-mini"},
)

View the evaluation results for experiment: 'credit-11b59611' at:
https://smith.langchain.com/o/e0a07099-8389-43f1-bdb8-a03361a3989c/datasets/571c21de-ea79-4616-a339-2c2211fabbad/compare?selectedSessions=4e36c71f-0bd2-4b0e-82e8-67e6f5b1b666




10it [00:07,  1.26it/s]


In [55]:
print(accuracy(res_answer_evaluator))

0.4


### Response vs input

In [25]:
# Grade prompt
grade_prompt_answer_helpfulness = prompt = hub.pull("langchain-ai/rag-answer-helpfulness")

res_answer_helpfulness_evaluator = []

def answer_helpfulness_evaluator(run, example) -> dict:
    """
    Grade how helpful the RAG answer is for the input question.

    Reads the question from ``example.inputs`` and the generated answer from
    ``run.outputs``, passes them through the helpfulness grading prompt,
    appends the score to ``res_answer_helpfulness_evaluator`` and returns it
    under the key ``"answer_helpfulness_score"``.
    """
    # Values for the grading prompt: the question and the chain's answer
    grader_inputs = {
        "question": example.inputs["question"],
        "student_answer": run.outputs["answer"],
    }

    # Grading prompt piped into the grader model (temperature=0)
    grader_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    grading_chain = grade_prompt_answer_helpfulness | grader_llm

    # The grader's response is indexed by "Score"
    grade = grading_chain.invoke(grader_inputs)["Score"]

    # Record the score for the aggregate computed later
    res_answer_helpfulness_evaluator.append(grade)

    return {"key": "answer_helpfulness_score", "score": grade}

In [26]:
experiment_results = evaluate(
    predict_rag_answer,
    data=dataset_name,
    evaluators=[answer_helpfulness_evaluator],
    experiment_prefix="rag-answer-helpfulness",
    metadata={"version": "LCEL context, gpt-4o-mini"},
)

View the evaluation results for experiment: 'rag-answer-helpfulness-efeadf2d' at:
https://smith.langchain.com/o/e0a07099-8389-43f1-bdb8-a03361a3989c/datasets/571c21de-ea79-4616-a339-2c2211fabbad/compare?selectedSessions=73ee0ddf-323f-4186-8665-943b3e64fde5




10it [00:08,  1.22it/s]


In [27]:
print(accuracy(res_answer_helpfulness_evaluator))

0.8


### Response vs retrieved docs

In [28]:
# Prompt
grade_prompt_hallucinations = prompt = hub.pull("langchain-ai/rag-answer-hallucination")

res_answer_hallucination_evaluator = []

def answer_hallucination_evaluator(run, example) -> dict:
    """
    Grade whether the RAG answer is supported by the retrieved documents.

    Reads the retrieved context and the generated answer from
    ``run.outputs``, passes them through the hallucination grading prompt,
    appends the score to ``res_answer_hallucination_evaluator`` and returns
    it under the key ``"answer_hallucination"``.

    ``example`` is accepted to keep the same ``(run, example)`` signature as
    the other evaluators; this grader only passes the documents and the
    answer to its prompt, so the input question is not read.
    """

    # Retrieved documents produced by the chain
    contexts = run.outputs["context"]

    # RAG answer
    prediction = run.outputs["answer"]

    # LLM grader
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

    # Grading prompt piped into the grader model
    answer_grader = grade_prompt_hallucinations | llm

    # Get score; the grader's response is indexed by "Score"
    score = answer_grader.invoke({"documents": contexts,
                                  "student_answer": prediction})
    score = score["Score"]
    res_answer_hallucination_evaluator.append(score)

    return {"key": "answer_hallucination", "score": score}

In [29]:
experiment_results = evaluate(
    predict_rag_answer_with_context,
    data=dataset_name,
    evaluators=[answer_hallucination_evaluator],
    experiment_prefix="rag-answer-hallucination",
    metadata={"version": "LCEL context, gpt-4o-mini"},
)

View the evaluation results for experiment: 'rag-answer-hallucination-b58d408f' at:
https://smith.langchain.com/o/e0a07099-8389-43f1-bdb8-a03361a3989c/datasets/571c21de-ea79-4616-a339-2c2211fabbad/compare?selectedSessions=cd812306-1687-4d9f-849b-26d94a6484e3




10it [00:08,  1.15it/s]


In [30]:
print(accuracy(res_answer_hallucination_evaluator))

0.9


### Retrieved docs vs input

In [31]:
# Grade prompt
grade_prompt_doc_relevance = hub.pull("langchain-ai/rag-document-relevance")

res_docs_relevance_evaluator = []

def docs_relevance_evaluator(run, example) -> dict:
    """
    Grade how relevant the retrieved documents are to the input question.

    Reads the question from ``example.inputs`` and the retrieved context from
    ``run.outputs``, passes them through the document-relevance grading
    prompt, appends the score to ``res_docs_relevance_evaluator`` and returns
    it under the key ``"document_relevance"``.
    """
    # Values for the grading prompt: the question and the retrieved documents
    grader_inputs = {
        "question": example.inputs["question"],
        "documents": run.outputs["context"],
    }

    # Grading prompt piped into the grader model (temperature=0)
    grader_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    grading_chain = grade_prompt_doc_relevance | grader_llm

    # The grader's response is indexed by "Score"
    grade = grading_chain.invoke(grader_inputs)["Score"]

    # Record the score for the aggregate computed later
    res_docs_relevance_evaluator.append(grade)

    return {"key": "document_relevance", "score": grade}

In [32]:
experiment_results = evaluate(
    predict_rag_answer_with_context,
    data=dataset_name,
    evaluators=[docs_relevance_evaluator],
    experiment_prefix="rag-doc-relevance",
    metadata={"version": "LCEL context, gpt-4o-mini"},
)

View the evaluation results for experiment: 'rag-doc-relevance-80a19db5' at:
https://smith.langchain.com/o/e0a07099-8389-43f1-bdb8-a03361a3989c/datasets/571c21de-ea79-4616-a339-2c2211fabbad/compare?selectedSessions=e3aa80de-1e7c-4ae0-bc4f-85b07af12b70




10it [00:09,  1.10it/s]


In [33]:
accuracy(res_docs_relevance_evaluator)

1.0