# Test evaluation task suite for AI data designer

Set up your Gretel API key (available at https://console.gretel.ai/users/me/key) before running the tests below.

In [1]:
import getpass
import os
import sys

# Make the repository root importable from this notebook.
# The root is resolved as three directory levels above the kernel's current
# working directory, so the kernel must be started from the notebook's folder.
notebook_dir = os.getcwd()
root_dir = os.path.abspath(os.path.join(notebook_dir, '..', '..', '..'))
sys.path.insert(0, root_dir)

# The Gretel API key (https://console.gretel.ai/users/me/key) is a secret and
# must never be written into the notebook, which is committed and shared.
# Prefer exporting GRETEL_PROD_API_KEY before starting Jupyter; otherwise it is
# requested here without being echoed or stored in the cell source/output.
# NOTE(review): any key that was ever committed in this cell should be rotated.
if not os.environ.get('GRETEL_PROD_API_KEY'):
    api_key = getpass.getpass('Enter your Gretel API key: ').strip()
    if not api_key:
        raise ValueError('GRETEL_PROD_API_KEY is required to run this notebook.')
    os.environ['GRETEL_PROD_API_KEY'] = api_key

### Evaluate Synthetic Dataset

In [23]:
# Re-import the local `evaluation` module so that edits made to evaluation.py
# take effect without restarting the kernel (restarting the kernel also works).
# The reload has to run before the `from evaluation import ...` line below so
# that the imported names are bound to the freshly reloaded definitions.

from importlib import reload
import evaluation
reload(evaluation)

import pandas as pd
from pprint import pprint
from navigator_helpers.llms.llm_suite import GretelLLMSuite
from evaluation import BaseEvaluationTaskSuite, NL2PythonEvaluationTaskSuite, NL2SQLEvaluationTaskSuite
from datasets import load_dataset

In [None]:
# Registry of the datasets this notebook can evaluate. Each entry provides:
#   path        - dataset repository id passed to `load_dataset`
#   config_name - (optional) dataset configuration to load, for repositories
#                 that define more than one
#   code_lang   - (optional) language tag used later to choose the evaluation suite
#   kwargs      - column-name arguments forwarded to `llm_as_a_judge_evaluation`
datasets_dict = {
    "synthetic_text_to_sql": {
        "path": "gretelai/synthetic_text_to_sql",
        "code_lang": "sql",
        "kwargs": {
            "instruction_col_name": "sql_prompt",
            "code_col_name": "sql",
            "context_col_name": "sql_context",
        },
    },
    "gsm8k": {
        "path": "openai/gsm8k",
        # gsm8k publishes the "main" and "socratic" configurations and has to be
        # loaded with one of them named explicitly.
        "config_name": "main",
        "kwargs": {
            "instruction_col_name": "question",
            "code_col_name": "answer",
        },
    },
    "synthetic_gsm8k": {
        "path": "gretelai/synthetic-gsm8k-reflection-405b",
        "kwargs": {
            "instruction_col_name": "question",
            "code_col_name": "answer",
        },
    },
    "xlcost_text_to_code": {
        # NOTE(review): this repository may also require an explicit
        # configuration name - verify and add "config_name" if so.
        "path": "codeparrot/xlcost-text-to-code",
        "code_lang": "python",
        "kwargs": {
            "instruction_col_name": "text",
            "code_col_name": "code",
        },
    },
}

# Upper bound on the number of rows kept for the evaluations below.
SAMPLE_SIZE = 1000

# Prompt user to select a dataset
print("Available datasets:")
for key in datasets_dict:
    print(f" - {key}")

selected_dataset = input("\nEnter the name of the dataset to load: ").strip()

# Fail fast on an unknown name: merely printing an error would let the following
# cells run against undefined (or stale, from a previous run) variables.
if selected_dataset not in datasets_dict:
    raise ValueError(
        f"Dataset {selected_dataset!r} not found. "
        f"Please enter a valid dataset name: {', '.join(datasets_dict)}"
    )

# Load the selected dataset
dataset_dict = datasets_dict[selected_dataset]
dataset_path = dataset_dict["path"]
kwargs = dataset_dict["kwargs"]
code_lang = dataset_dict.get("code_lang")
dataset = load_dataset(
    dataset_path,
    name=dataset_dict.get("config_name"),  # None keeps load_dataset's default
    split="train",
)

# Keep at most SAMPLE_SIZE rows (clamped so smaller splits do not raise) and
# convert them to a pandas DataFrame for the evaluation suites.
dataset_1000 = dataset.select(range(min(SAMPLE_SIZE, len(dataset))))
dataset_1000_pd = dataset_1000.to_pandas()

print(f"Loaded dataset '{selected_dataset}' successfully!")

In [None]:
# Create the LLM suite once; this same instance is passed to every evaluation
# task suite constructed in the cells below.
# NOTE(review): presumably authenticates with GRETEL_PROD_API_KEY from the setup cell - confirm.
llm_suite = GretelLLMSuite()

In [None]:
# Run the row-uniqueness check on the sampled DataFrame and pretty-print the result.
uniqueness_suite = BaseEvaluationTaskSuite(llm_suite, dataset_1000_pd)
results_1 = uniqueness_suite.row_uniqueness()
pprint(results_1)

In [None]:

# Run the feature-cardinality check on the sampled DataFrame and pretty-print the result.
cardinality_suite = BaseEvaluationTaskSuite(llm_suite, dataset_1000_pd)
results_2 = cardinality_suite.feature_cardinality()
pprint(results_2)

In [None]:

# Run the feature-distribution check on the sampled DataFrame and pretty-print the result.
distribution_suite = BaseEvaluationTaskSuite(llm_suite, dataset_1000_pd)
results_3 = distribution_suite.feature_distribution()
pprint(results_3)

In [None]:

# Run the words-per-record check on the sampled DataFrame and pretty-print the result.
word_count_suite = BaseEvaluationTaskSuite(llm_suite, dataset_1000_pd)
results_4 = word_count_suite.num_words_per_record()
pprint(results_4)

### Testing LLM-as-a-Judge Evaluation (SQL, Python, or generic)

In [26]:
# Keep only the first 10 rows so the LLM-as-a-judge run below stays small.
dataset_10_pd = dataset_1000_pd.iloc[:10]

In [None]:
# Choose the evaluation suite that matches the dataset's code language; datasets
# without a recognised language fall back to the base suite.
language_suites = {
    "sql": NL2SQLEvaluationTaskSuite,
    "python": NL2PythonEvaluationTaskSuite,
}
suite_cls = language_suites.get(code_lang)

if suite_cls is None:
    task_5 = BaseEvaluationTaskSuite(llm_suite, dataset_10_pd)
else:
    task_5 = suite_cls(
        llm_suite=llm_suite, dataset=dataset_10_pd, code_lang=code_lang
    )

# The column-name arguments come from the selected dataset's registry entry.
results_5 = task_5.llm_as_a_judge_evaluation(**kwargs)
table_5 = task_5.output_dataset

In [None]:

# Print the value returned by `llm_as_a_judge_evaluation`, then preview the
# suite's output dataset; as the cell's last expression it is rendered as output.
# NOTE(review): `table_5` is presumably a pandas DataFrame - confirm.
print(results_5)
table_5.head()