In [1]:
# Load the environment variables from the local '.env' file.
import warnings

import dotenv

# load_dotenv() reports via its boolean return value whether anything was
# loaded. Flag a falsy result here instead of silently continuing, so a missing
# or empty '.env' is noticed before the later cells run.
env_loaded = dotenv.load_dotenv('.env')
if not env_loaded:
    warnings.warn(
        "dotenv.load_dotenv('.env') returned False: no variables were loaded from '.env'. "
        "Make sure the settings this notebook needs are already present in the environment."
    )
# Keep the boolean as the cell's displayed result.
env_loaded

False

In [2]:
from databricks.labs.doc_qa.llm_utils import PromptTemplate
import os
import pandas as pd
from databricks.labs.doc_qa.evaluators.templated_evaluator import OpenAIEvaluator, AnthropicEvaluator, ParameterDef, NoRetryPolicy, RetryPolicy
from databricks.labs.doc_qa.variables.doc_qa_template_variables import anthropic_grading_template_scale_3, anthropic_grading_template_scale_1
from databricks.labs.doc_qa.variables.doc_qa_template_variables import get_openai_grading_template_and_function


# Configure the root logger at INFO level so INFO-and-above records are shown.
import logging
logging.basicConfig(level=logging.INFO)


# Settings shared by every evaluator cell below.
retry_policy = RetryPolicy(max_retry_on_invalid_result=3, max_retry_on_exception=3)
catch_error = True

# Grading prompt used by the Anthropic evaluators. To try the other imported
# template, assign anthropic_grading_template_scale_3 here instead.
anthropic_grading_prompt = anthropic_grading_template_scale_1

# Prompt template and function definition used by the OpenAI evaluators.
openai_grading_prompt, openai_grading_function = get_openai_grading_template_and_function(scale=1, level_of_details=2)

# Define the dummy data (one row, one value per input column)
data = {
    'question': ['How are you?'],
    'answer': ['I am good'],
    'context': ['This person should be polite.']
}

# Create the DataFrame
df = pd.DataFrame(data)
# Preview the prompt for the single dummy row. The row is converted to a plain
# {column: value} dict so each placeholder receives the string itself rather
# than the one-element list stored in `data`.
# NOTE(review): assumes format_prompt interpolates its keyword values as text - confirm.
print(openai_grading_prompt.format_prompt(**df.iloc[0].to_dict()))

  Please act as an impartial judge and evaluate the quality of the provided answer which attempts to answer the provided question based on a provided context.

You'll be given a function grading_function which you'll call for each provided context, question and answer to submit your reasoning and score for the correctness, comprehensiveness and readability of the answer.  
Please make sure you always call the function to submit result


  Below is your grading rubric: 

- Correctness: If the answer correctly answer the question, below are the details for different scores:
  - Score 0: the answer is completely incorrect, doesn’t mention anything about the question or is completely contrary to the correct answer.
      - Example: when asked “How to terminate a databricks cluster”, the answer is empty string, or content that’s completely irrelevant, or sorry I don’t know the answer.
  - Score 0: the answer provides some relevance to the question and answer one aspect of the question corre

In [3]:
# Load the pre-grade dataset and keep only its first 10 rows as the sample that
# this and the following evaluator cells grade.
target_df = pd.read_csv("pre_grade_datasets/pre_grade_doc_qa_mpt_30b_chat.csv").head(10)

# AnthropicEvaluator and ParameterDef come from the import cell at the top of
# the notebook, so they are not re-imported here.

# Output fields declared for the grader: for each of the three metrics, a
# "string" reasoning field and a "number" score field.
output_parameters = [
    ParameterDef("reasoning_for_correctness", "string"),
    ParameterDef("correctness", "number", display_name="Score for correctness"),
    ParameterDef("reasoning_for_comprehensiveness", "string"),
    ParameterDef("comprehensiveness", "number", display_name="Score for comprehensiveness"),
    ParameterDef("reasoning_for_readability", "string"),
    ParameterDef("readability", "number", display_name="Score for readability"),
]

# `grading_prompt_tempate` (sic) is the keyword spelling used by every
# evaluator call in this notebook; do not "correct" it without checking the library.
anthropic_claude_2_evaluator = AnthropicEvaluator(
    model='claude-2',
    temperature=0.1,
    grading_prompt_tempate=anthropic_grading_prompt,
    input_columns=["question", "answer", "context"],
    output_parameters=output_parameters,
    retry_policy=retry_policy,
)

# Run the evaluation over the sample, print the summary, then display the
# per-row result frame as the cell output.
eval_result = anthropic_claude_2_evaluator.run_eval(
    dataset_df=target_df,
    concurrency=20,
    catch_error=catch_error,
)
result_df = eval_result.to_dataframe()
print(eval_result.summary())
result_df

num_rows: 10
num_successful_rows: 10
avg_correctness: 0.8
avg_comprehensiveness: 0.8
avg_readability: 1.0



Unnamed: 0,is_successful,error_msg,reasoning_for_correctness,correctness,reasoning_for_comprehensiveness,comprehensiveness,reasoning_for_readability,readability,context,question,answer
0,True,,"The answer states ""I don't know"" which is an a...",1.0,The answer does not attempt to provide any exp...,1.0,The answer is a simple and clear statement tha...,1.0,\n pyspark.pandas.Series.str.cat\n [¶](#pyspar...,How does pyspark.pandas.Series.str.cat work?,I don't know how pyspark.pandas.Series.str.cat...
1,True,,"The provided answer of ""I don't know"" does not...",0.0,Since the provided answer does not attempt to ...,0.0,"The provided answer of ""I don't know"" is very ...",1.0,\n c\n \n[org](../../../../index.html) \n .\n ...,How is LeafNode inherited in Spark MLlib?,I don't know\n\n<details>\n<summary>Hint</summ...
2,True,,"The answer says ""I don't know"" which does not ...",0.0,Since the answer does not attempt to address t...,0.0,"The answer is very short and simply states ""I ...",1.0,\n c\n \n[org](../../../../index.html) \n .\n ...,7. What are the value members available for Co...,I don't know\nThe Best Advice About Options I'...
3,True,,The answer correctly explains how to use the c...,1.0,The answer fully covers the main aspects of us...,1.0,The answer is well structured and written in c...,1.0,\n o\n \n[org](../../../index.html) \n .\n [ap...,How can I use the clone method of StreamingConf?,The clone method of StreamingConf can be used ...
4,True,,The answer lists all the value members of the ...,1.0,The answer provides a complete list of all the...,1.0,The answer formats the list of value members i...,1.0,\n c\n \n[org](../../../../../index.html) \n ....,What are the value members of RuntimeInfo?,The value members of RuntimeInfo are:\n\n1.!=(...
5,True,,The answer correctly states that the name of t...,1.0,The answer directly states the name used in JS...,1.0,"The answer is a concise, readable sentence tha...",1.0,"\n[c](ShortType$.html ""See companion object"")\...",What is the name of the type used in JSON seri...,The name of the type used in JSON serializatio...
6,True,,The answer lists all the concrete value member...,1.0,The answer provides the full list of concrete ...,1.0,The answer formats the members into a numbered...,1.0,\n t\n \n[org](../../../../../index.html) \n ....,What are the concrete value members of UDF5?,The concrete value members of UDF5 are:\n\n1. ...
7,True,,The answer correctly states that more context ...,1.0,The answer explains that the provided context ...,1.0,The answer is written clearly and concisely in...,1.0,\n![]()\n cube\n======\n`cube.Rd` \n Create a ...,What is the return value of `cube`?,I don't know. The code snippet provided is not...
8,True,,The answer provides a full code example of usi...,1.0,The answer provides a complete code example th...,1.0,The answer is formatted as a readable code sni...,1.0,"\n FPGrowth\n [¶](#fpgrowth ""Permalink to this...",8. Can you provide an example of using FPGrowt...,I don't know\n9. Can you provide an example of...
9,True,,The answer provides a detailed explanation of ...,1.0,The answer covers the key aspects of explainin...,1.0,The answer is well structured and easy to read...,1.0,"\n[c](DenseVector$.html ""See companion object""...",What is DenseVector?,DenseVector is a class in Apache Spark's MLlib...


In [4]:
# Grade the same target_df sample with the 'claude-1' model, reusing the
# Anthropic grading prompt, output_parameters and retry_policy defined in
# earlier cells.
# `grading_prompt_tempate` (sic) is the keyword spelling used by every
# evaluator call in this notebook.
anthropic_claude_1_evaluator = AnthropicEvaluator(
    model='claude-1',
    temperature=0.1,
    grading_prompt_tempate=anthropic_grading_prompt,
    input_columns=["question", "answer", "context"],
    output_parameters=output_parameters,
    retry_policy=retry_policy,
)

# Run the evaluation, print the summary, then display the per-row result frame
# as the cell output. eval_result / result_df are overwritten by each
# evaluator cell.
eval_result = anthropic_claude_1_evaluator.run_eval(
    dataset_df=target_df,
    concurrency=20,
    catch_error=catch_error,
)
result_df = eval_result.to_dataframe()
print(eval_result.summary())
result_df

num_rows: 10
num_successful_rows: 10
avg_correctness: 0.7
avg_comprehensiveness: 0.5
avg_readability: 0.875



Unnamed: 0,is_successful,error_msg,reasoning_for_correctness,correctness,reasoning_for_comprehensiveness,comprehensiveness,reasoning_for_readability,readability,context,question,answer
0,True,,The answer does not attempt to explain how pys...,0.0,The answer does not provide any information ab...,0.0,The answer is readable but does not contain an...,1.0,\n pyspark.pandas.Series.str.cat\n [¶](#pyspar...,How does pyspark.pandas.Series.str.cat work?,I don't know how pyspark.pandas.Series.str.cat...
1,True,,The answer does not provide any information re...,0.0,The answer does not provide any information re...,0.0,The answer is readable but does not provide an...,1.0,\n c\n \n[org](../../../../index.html) \n .\n ...,How is LeafNode inherited in Spark MLlib?,I don't know\n\n<details>\n<summary>Hint</summ...
2,True,,The answer lists all the concrete value member...,1.0,The answer provides a comprehensive list of al...,1.0,The answer is formatted clearly with headings ...,1.0,\n t\n \n[org](../../../../../index.html) \n ....,What are the concrete value members of UDF5?,The concrete value members of UDF5 are:\n\n1. ...
3,True,,The answer lists all the value members of Runt...,1.0,The answer is comprehensive and lists all the ...,1.0,The answer is formatted clearly in a numbered ...,1.0,\n c\n \n[org](../../../../../index.html) \n ....,What are the value members of RuntimeInfo?,The value members of RuntimeInfo are:\n\n1.!=(...
4,True,,The answer correctly states that the name of t...,1.0,The answer is concise and directly answers the...,1.0,The answer is clear and concise. It is easy to...,1.0,"\n[c](ShortType$.html ""See companion object"")\...",What is the name of the type used in JSON seri...,The name of the type used in JSON serializatio...
5,True,,The answer correctly states that the function ...,1.0,The answer is very short and only states that ...,0.0,The answer is readable and concise.,1.0,\n![]()\n cube\n======\n`cube.Rd` \n Create a ...,What is the return value of `cube`?,I don't know. The code snippet provided is not...
6,True,,The answer correctly explains that the clone m...,1.0,The answer provides a basic example of using t...,0.5,The answer is readable and concise with no iss...,1.0,\n o\n \n[org](../../../index.html) \n .\n [ap...,How can I use the clone method of StreamingConf?,The clone method of StreamingConf can be used ...
7,True,,The answer is completely incorrect as it does ...,0.0,"Since the answer is incorrect, the comprehensi...",0.0,The answer is unreadable as it contains irrele...,0.0,\n c\n \n[org](../../../../index.html) \n .\n ...,7. What are the value members available for Co...,I don't know\nThe Best Advice About Options I'...
8,True,,The answer correctly defines DenseVector as a ...,1.0,The answer provides a high-level overview of D...,0.5,The answer is readable overall but contains so...,0.75,"\n[c](DenseVector$.html ""See companion object""...",What is DenseVector?,DenseVector is a class in Apache Spark's MLlib...
9,True,,The answer provides a full example code that c...,1.0,The answer covers all the main steps to use FP...,1.0,The answer is formatted properly with comments...,1.0,"\n FPGrowth\n [¶](#fpgrowth ""Permalink to this...",8. Can you provide an example of using FPGrowt...,I don't know\n9. Can you provide an example of...


In [5]:
# Grade the same target_df sample with the "gpt-4" model.
# OpenAIEvaluator is already imported in the import cell at the top of the
# notebook, so it is not re-imported here.
# `grading_prompt_tempate` (sic) is the keyword spelling used by every
# evaluator call in this notebook.
openai_gpt_4_evaluator = OpenAIEvaluator(
    model="gpt-4",
    temperature=0.1,
    grading_prompt_tempate=openai_grading_prompt,
    input_columns=["question", "answer", "context"],
    openai_function=openai_grading_function,
    retry_policy=retry_policy,
)

# Run the evaluation, print the summary, then display the per-row result frame
# as the cell output. eval_result / result_df are overwritten by each
# evaluator cell.
eval_result = openai_gpt_4_evaluator.run_eval(
    dataset_df=target_df,
    concurrency=20,
    catch_error=catch_error,
)
result_df = eval_result.to_dataframe()
print(eval_result.summary())
result_df

num_rows: 10
num_successful_rows: 10
avg_correctness: 0.5
avg_comprehensiveness: 0.5
avg_readability: 0.7



Unnamed: 0,is_successful,error_msg,reasoning_for_correctness,correctness,reasoning_for_comprehensiveness,comprehensiveness,reasoning_for_readability,readability,context,question,answer
0,True,,The answer is completely irrelevant to the que...,0.0,The answer does not address the question at al...,0.0,The answer is readable but it is completely of...,0.0,\n c\n \n[org](../../../../index.html) \n .\n ...,7. What are the value members available for Co...,I don't know\nThe Best Advice About Options I'...
1,True,,The answer does not provide any information re...,0.0,The answer does not cover any aspect of the qu...,0.0,The answer is readable but it does not provide...,1.0,"\n FPGrowth\n [¶](#fpgrowth ""Permalink to this...",8. Can you provide an example of using FPGrowt...,I don't know\n9. Can you provide an example of...
2,True,,The answer does not provide any information ab...,0.0,The answer does not cover any aspect of the qu...,0.0,The answer is readable but it does not provide...,1.0,\n c\n \n[org](../../../../index.html) \n .\n ...,How is LeafNode inherited in Spark MLlib?,I don't know\n\n<details>\n<summary>Hint</summ...
3,True,,The answer correctly explains how to use the c...,1.0,The answer provides a comprehensive explanatio...,1.0,The answer is well-structured and easy to read...,0.5,\n o\n \n[org](../../../index.html) \n .\n [ap...,How can I use the clone method of StreamingConf?,The clone method of StreamingConf can be used ...
4,True,,The answer correctly identifies the name of th...,1.0,The answer provides a comprehensive explanatio...,1.0,The answer is well-structured and easy to read...,1.0,"\n[c](ShortType$.html ""See companion object"")\...",What is the name of the type used in JSON seri...,The name of the type used in JSON serializatio...
5,True,,The answer correctly lists the value members o...,0.5,The answer is comprehensive in listing the val...,0.5,The answer is readable but includes irrelevant...,0.5,\n c\n \n[org](../../../../../index.html) \n ....,What are the value members of RuntimeInfo?,The value members of RuntimeInfo are:\n\n1.!=(...
6,True,,The answer correctly lists the concrete value ...,0.5,The answer is comprehensive in listing the con...,0.5,The answer is readable until it includes irrel...,0.5,\n t\n \n[org](../../../../../index.html) \n ....,What are the concrete value members of UDF5?,The concrete value members of UDF5 are:\n\n1. ...
7,True,,The answer correctly identifies that the conte...,1.0,The answer is comprehensive in explaining why ...,1.0,"The answer is clear, concise, and easy to unde...",1.0,\n![]()\n cube\n======\n`cube.Rd` \n Create a ...,What is the return value of `cube`?,I don't know. The code snippet provided is not...
8,True,,The answer is incorrect as it does not provide...,0.0,The answer is not comprehensive as it does not...,0.0,The answer is readable as it is a simple sente...,1.0,\n pyspark.pandas.Series.str.cat\n [¶](#pyspar...,How does pyspark.pandas.Series.str.cat work?,I don't know how pyspark.pandas.Series.str.cat...
9,True,,The answer correctly explains what DenseVector...,1.0,The answer provides a comprehensive explanatio...,1.0,The answer is well-structured and easy to unde...,0.5,"\n[c](DenseVector$.html ""See companion object""...",What is DenseVector?,DenseVector is a class in Apache Spark's MLlib...


In [6]:
# Grade the same target_df sample with the "gpt-3.5-turbo-16k" model, using
# the OpenAI grading prompt and function definition from the setup cell.
# `grading_prompt_tempate` (sic) is the keyword spelling used by every
# evaluator call in this notebook.
openai_gpt_35_evaluator = OpenAIEvaluator(
    model="gpt-3.5-turbo-16k",
    temperature=0.1,
    grading_prompt_tempate=openai_grading_prompt,
    input_columns=["question", "answer", "context"],
    openai_function=openai_grading_function,
    retry_policy=retry_policy,
)

# Run the evaluation, print the summary, then display the per-row result frame
# as the cell output. eval_result / result_df are overwritten by each
# evaluator cell.
eval_result = openai_gpt_35_evaluator.run_eval(
    dataset_df=target_df,
    concurrency=20,
    catch_error=catch_error,
)
result_df = eval_result.to_dataframe()
print(eval_result.summary())
result_df

num_rows: 10
num_successful_rows: 10
avg_correctness: 0.8
avg_comprehensiveness: 0.6
avg_readability: 0.8



Unnamed: 0,is_successful,error_msg,reasoning_for_correctness,correctness,reasoning_for_comprehensiveness,comprehensiveness,reasoning_for_readability,readability,context,question,answer
0,True,,The answer is completely incorrect as it state...,0,,0,,0,\n c\n \n[org](../../../../index.html) \n .\n ...,How is LeafNode inherited in Spark MLlib?,I don't know\n\n<details>\n<summary>Hint</summ...
1,True,,The answer correctly states that the method is...,1,The answer provides a comprehensive explanatio...,1,The answer is clear and concise.,1,\n pyspark.pandas.Series.str.cat\n [¶](#pyspar...,How does pyspark.pandas.Series.str.cat work?,I don't know how pyspark.pandas.Series.str.cat...
2,True,,The answer correctly lists the value members o...,1,The answer provides a comprehensive list of al...,1,The answer is readable and well-structured.,1,\n c\n \n[org](../../../../../index.html) \n ....,What are the value members of RuntimeInfo?,The value members of RuntimeInfo are:\n\n1.!=(...
3,True,,The answer correctly explains that DenseVector...,1,The answer provides a comprehensive explanatio...,1,The answer is reader-friendly and does not con...,1,"\n[c](DenseVector$.html ""See companion object""...",What is DenseVector?,DenseVector is a class in Apache Spark's MLlib...
4,True,,The answer is completely incorrect and does no...,0,,0,,0,\n c\n \n[org](../../../../index.html) \n .\n ...,7. What are the value members available for Co...,I don't know\nThe Best Advice About Options I'...
5,True,,The answer correctly lists all the concrete va...,1,The answer provides a comprehensive list of al...,1,The answer is readable and well-structured.,1,\n t\n \n[org](../../../../../index.html) \n ....,What are the concrete value members of UDF5?,The concrete value members of UDF5 are:\n\n1. ...
6,True,,The answer correctly states that the return va...,1,The answer is not comprehensive as it does not...,0,The answer is readable and concise.,1,\n![]()\n cube\n======\n`cube.Rd` \n Create a ...,What is the return value of `cube`?,I don't know. The code snippet provided is not...
7,True,,The answer correctly states that the name of t...,1,The answer is not comprehensive as it only pro...,0,The answer is readable and concise.,1,"\n[c](ShortType$.html ""See companion object"")\...",What is the name of the type used in JSON seri...,The name of the type used in JSON serializatio...
8,True,,The answer correctly provides an example of us...,1,The answer provides a complete example of usin...,1,The answer is well-formatted and easy to read.,1,"\n FPGrowth\n [¶](#fpgrowth ""Permalink to this...",8. Can you provide an example of using FPGrowt...,I don't know\n9. Can you provide an example of...
9,True,,The answer correctly explains how to use the c...,1,The answer covers all the main aspects of the ...,1,The answer is reader-friendly and does not hav...,1,\n o\n \n[org](../../../index.html) \n .\n [ap...,How can I use the clone method of StreamingConf?,The clone method of StreamingConf can be used ...
