In [1]:
# Load the environment variables
import dotenv
dotenv.load_dotenv('.env')

from databricks.labs.doc_qa.llm_utils import PromptTemplate
import pandas as pd
from databricks.labs.doc_qa.evaluators.templated_evaluator import OpenAIEvaluator, AnthropicEvaluator, ParameterDef, NoRetryPolicy, RetryPolicy
from databricks.labs.doc_qa.variables.doc_qa_template_variables import anthropic_grading_template_scale_3, anthropic_grading_template_scale_1
from databricks.labs.doc_qa.variables.doc_qa_template_variables import get_openai_grading_template_and_function


# show INFO-level (and higher) log messages for all loggers
import logging
logging.basicConfig(level=logging.INFO)


# Retry settings handed to the evaluator below: both limits are set to 3.
retry_policy = RetryPolicy(
    max_retry_on_invalid_result=3,
    max_retry_on_exception=3,
)

# Forwarded as `catch_error=` to `run_eval` in the grading cells.
catch_error = True

# Grading prompt template plus the matching OpenAI function definition,
# requested with scale=3 and level_of_details=2.
openai_grading_prompt, openai_grading_function = get_openai_grading_template_and_function(
    scale=3,
    level_of_details=2,
)

# GPT-4 based grader shared by every grading cell in this notebook.
# NOTE(review): the keyword `grading_prompt_tempate` is spelled exactly as the
# original call passes it — presumably matching the library's parameter name;
# confirm before "correcting" the spelling.
openai_gpt_4_evaluator = OpenAIEvaluator(
    model="gpt-4",
    temperature=0.1,
    grading_prompt_tempate=openai_grading_prompt,
    input_columns=["question", "answer", "context"],
    openai_function=openai_grading_function,
    retry_policy=retry_policy,
)


In [4]:
import os  # stdlib; only needed to create the output directory below

gpt_35_df = pd.read_csv("pre_grade_datasets/pre_grade_doc_qa_gpt_35.csv")
# Already imported in the setup cell; kept so this cell's dependencies stay visible.
from databricks.labs.doc_qa.evaluators.templated_evaluator import OpenAIEvaluator

# Grade the GPT-3.5 answers with the shared GPT-4 evaluator and print the
# aggregate summary.
eval_result = openai_gpt_4_evaluator.run_eval(dataset_df=gpt_35_df, concurrency=20, catch_error=catch_error)
result_df = eval_result.to_dataframe()
print(eval_result.summary())

# Copy the "source" column from gpt_35_df onto result_df by matching rows on
# the (question, context) pair.
# One pass builds the lookup instead of re-scanning gpt_35_df for every graded
# row. `setdefault` keeps the FIRST source seen for a pair, matching the
# original `.values[0]` behaviour when a pair occurs more than once.
source_by_key = {}
for question, context, source in zip(gpt_35_df["question"], gpt_35_df["context"], gpt_35_df["source"]):
    source_by_key.setdefault((question, context), source)

result_keys = list(zip(result_df["question"], result_df["context"]))

# Fail with a message that names the offending questions rather than an opaque
# "index 0 is out of bounds" from an empty selection.
unmatched_questions = [question for question, context in result_keys if (question, context) not in source_by_key]
if unmatched_questions:
    raise ValueError(
        f"{len(unmatched_questions)} graded row(s) have no matching (question, context) "
        f"in gpt_35_df; first few questions: {unmatched_questions[:3]!r}"
    )

result_df["source"] = [source_by_key[key] for key in result_keys]

# Save to path "post_grade_datasets/doc_qa_gpt_35.csv".
# Create the directory first so a missing folder cannot discard the results of
# the evaluation run above.
output_path = "post_grade_datasets/doc_qa_gpt_35.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
result_df.to_csv(output_path, index=False)
result_df

num_rows: 114
num_successful_rows: 114
avg_correctness: 2.991228070175439
avg_comprehensiveness: 2.982456140350877
avg_readability: 3.0



Unnamed: 0,is_successful,error_msg,reasoning_for_correctness,correctness,reasoning_for_comprehensiveness,comprehensiveness,reasoning_for_readability,readability,context,question,answer,source
0,True,,The answer correctly identifies the value memb...,3,The answer provides a comprehensive list of th...,3,The answer is well-structured and easy to read.,3,\n c\n \n[org](../../../../../index.html) \n ....,What are the value members of RuntimeInfo?,The value members of the RuntimeInfo class are...,https://spark.apache.org/docs/latest/api/scala...
1,True,,The answer correctly identifies the value memb...,3,The answer provides a comprehensive list of th...,3,The answer is well-structured and easy to read.,3,\n c\n \n[org](../../../../../index.html) \n ....,What are the value members of RuntimeInfo?,The value members of the RuntimeInfo class are...,https://spark.apache.org/docs/latest/api/scala...
2,True,,The answer correctly identifies the name of th...,3,The answer is concise and directly answers the...,3,"The answer is clear, concise, and easy to unde...",3,"\n[c](ShortType$.html ""See companion object"")\...",What is the name of the type used in JSON seri...,The name of the type used in JSON serializatio...,https://spark.apache.org/docs/latest/api/scala...
3,True,,The answer correctly identifies the return val...,3,The answer is concise and directly addresses t...,3,"The answer is clear, concise, and easy to unde...",3,\n![]()\n cube\n======\n`cube.Rd` \n Create a ...,What is the return value of `cube`?,The return value of `cube` is a GroupedData ob...,https://spark.apache.org/docs/latest/api/R/ref...
4,True,,The answer correctly lists all the linear supe...,3,The answer is comprehensive and lists all the ...,3,The answer is well-structured and easy to read.,3,\n o\n \n[org](../../../index.html) \n .\n [ap...,What are the linear supertypes of OffHeapStora...,The linear supertypes of OffHeapStorageMemory ...,https://spark.apache.org/docs/latest/api/scala...
...,...,...,...,...,...,...,...,...,...,...,...,...
109,True,,The answer correctly lists all the linear supe...,3,The answer is comprehensive and covers all the...,3,The answer is well-structured and easy to read.,3,"\n[c](GradientDescent$.html ""See companion obj...",7. What are the linear supertypes of Gradient ...,The linear supertypes of Gradient Descent in S...,https://spark.apache.org/docs/latest/api/scala...
110,True,,The answer correctly explains how to fit a mod...,3,"The answer is comprehensive, covering all aspe...",3,The answer is well-structured and easy to read...,3,"\n[c](OneHotEncoder$.html ""See companion objec...",10. How can you fit a model using OneHotEncode...,"To fit a model using OneHotEncoder in Spark, y...",https://spark.apache.org/docs/latest/api/scala...
111,True,,The answer correctly explains the BisectingKMe...,3,The answer covers all the main aspects of the ...,3,The answer is well-structured and easy to unde...,3,\n p\n \n[org](../../../../index.html) \n .\n ...,1. What is BisectingKMeans algorithm in Spark?,The BisectingKMeans algorithm in Spark is a cl...,https://spark.apache.org/docs/latest/api/scala...
112,True,,The answer correctly explains the precision of...,3,The answer provides a comprehensive explanatio...,3,The answer is well-structured and easy to read...,3,\n QuantileDiscretizer\n [¶](#quantilediscreti...,7. What is the precision of the approximation ...,The precision of the approximation in Quantile...,https://spark.apache.org/docs/latest/api/pytho...


In [7]:
import os  # stdlib; only needed to create the output directory below

vicuna_df = pd.read_csv("pre_grade_datasets/pre_grade_doc_qa_vicuna_33b.csv")
# Already imported in the setup cell; kept so this cell's dependencies stay visible.
from databricks.labs.doc_qa.evaluators.templated_evaluator import OpenAIEvaluator

# Grade the Vicuna-33B answers with the shared GPT-4 evaluator and print the
# aggregate summary.
eval_result = openai_gpt_4_evaluator.run_eval(dataset_df=vicuna_df, concurrency=20, catch_error=catch_error)
result_df = eval_result.to_dataframe()
print(eval_result.summary())

# Copy the "source" column from vicuna_df onto result_df by matching rows on
# the (question, context) pair.
# One pass builds the lookup instead of re-scanning vicuna_df for every graded
# row. `setdefault` keeps the FIRST source seen for a pair, matching the
# original `.values[0]` behaviour when a pair occurs more than once.
source_by_key = {}
for question, context, source in zip(vicuna_df["question"], vicuna_df["context"], vicuna_df["source"]):
    source_by_key.setdefault((question, context), source)

result_keys = list(zip(result_df["question"], result_df["context"]))

# Fail with a message that names the offending questions rather than an opaque
# "index 0 is out of bounds" from an empty selection.
unmatched_questions = [question for question, context in result_keys if (question, context) not in source_by_key]
if unmatched_questions:
    raise ValueError(
        f"{len(unmatched_questions)} graded row(s) have no matching (question, context) "
        f"in vicuna_df; first few questions: {unmatched_questions[:3]!r}"
    )

result_df["source"] = [source_by_key[key] for key in result_keys]

# Save to path "post_grade_datasets/doc_qa_vicuna_33b.csv".
# Create the directory first so a missing folder cannot discard the results of
# the evaluation run above.
output_path = "post_grade_datasets/doc_qa_vicuna_33b.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
result_df.to_csv(output_path, index=False)
result_df

num_rows: 115
num_successful_rows: 115
avg_correctness: 1.4260869565217391
avg_comprehensiveness: 1.4173913043478261
avg_readability: 1.5043478260869565



Unnamed: 0,is_successful,error_msg,reasoning_for_correctness,correctness,reasoning_for_comprehensiveness,comprehensiveness,reasoning_for_readability,readability,context,question,answer,source
0,True,,The answer is completely incorrect and doesn't...,0,The answer doesn't provide any information rel...,0,The answer is readable but it doesn't provide ...,1,"\n[c](ShortType$.html ""See companion object"")\...",What is the name of the type used in JSON seri...,,https://spark.apache.org/docs/latest/api/scala...
1,True,,The answer correctly lists the value members o...,3,The answer is comprehensive and covers all the...,3,The answer is well-structured and easy to read.,3,\n c\n \n[org](../../../../../index.html) \n ....,What are the value members of RuntimeInfo?,"Based on the provided context, the value memb...",https://spark.apache.org/docs/latest/api/scala...
2,True,,The answer provided is 'nan' which is incorrec...,0,The answer does not provide any relevant infor...,0,The answer is readable but it does not provide...,1,"\n[c](ShortType$.html ""See companion object"")\...",What is the name of the type used in JSON seri...,,https://spark.apache.org/docs/latest/api/scala...
3,True,,The answer correctly identifies the concrete v...,3,The answer lists and explains the concrete val...,3,The answer is well-structured and easy to read...,3,\n t\n \n[org](../../../../../index.html) \n ....,What are the concrete value members of UDF5?,"Based on the given context and code snippet, ...",https://spark.apache.org/docs/latest/api/scala...
4,True,,The answer correctly explains the usage of `py...,3,The answer provides a comprehensive explanatio...,3,The answer is well-structured and easy to read...,3,\n pyspark.pandas.Series.str.startswith\n [¶](...,What are the examples of using `pyspark.pandas...,"Using `pyspark.pandas.Series.str.startswith`,...",https://spark.apache.org/docs/latest/api/pytho...
...,...,...,...,...,...,...,...,...,...,...,...,...
110,True,,"The answer is empty, so it doesn't address the...",0,"The answer is empty, so it doesn't provide any...",0,"The answer is empty, so it doesn't provide any...",0,\n o\n \n[org](../../../../index.html) \n .\n ...,Can you tell me how to construct SparkAWSCrede...,\n```,https://spark.apache.org/docs/latest/api/scala...
111,True,,"The answer is completely incorrect, doesn’t me...",0,"The answer is completely incorrect, doesn’t me...",0,"The answer is completely incorrect, doesn’t me...",0,"\n[o](RegressionEvaluator.html ""See companion ...",2. What is the difference between RegressionEv...,,https://spark.apache.org/docs/latest/api/scala...
112,True,,The answer is completely irrelevant to the que...,0,The answer does not provide any information re...,0,The answer is not readable at all. It's just a...,0,"\n[o](SparkContext.html ""See companion class"")...",7. How can the getOrCreate() function be used ...,A I The The The The The The ...,https://spark.apache.org/docs/latest/api/scala...
113,True,,The answer is completely incorrect and doesn't...,0,The answer doesn't provide any relevant inform...,0,The answer is completely unreadable and doesn'...,0,\n Source code for pyspark.mllib.random\n=====...,6. How can you generate an RDD of vectors cont...,A I ...,https://spark.apache.org/docs/latest/api/pytho...
