# Test evaluation task suite for AI data designer

Set up your Gretel API key to run the tests below.

In [2]:
import os
import sys
from getpass import getpass

# Make the directory three levels above the current working directory
# importable. The working directory is taken as the notebook's location, so
# start the kernel from the notebook's folder.
notebook_dir = os.getcwd()
root_dir = os.path.abspath(os.path.join(notebook_dir, '..', '..', '..'))
# Guard the insert so re-running this cell does not stack duplicate entries.
if root_dir not in sys.path:
    sys.path.insert(0, root_dir)

# Publish the API key through the 'GRETEL_PROD_API_KEY' environment variable.
# Get the key from https://console.gretel.ai/users/me/key
# - Reuse a key that is already set so "Restart & Run All" can run unattended.
# - Otherwise prompt with getpass() rather than input(), so the secret is not
#   echoed into the cell output that gets saved with the notebook.
gretel_prod_api_key = os.environ.get('GRETEL_PROD_API_KEY', '').strip()
if not gretel_prod_api_key:
    gretel_prod_api_key = getpass(
        "Enter your Gretel API key from https://console.gretel.ai/users/me/key: "
    ).strip()
if not gretel_prod_api_key:
    raise ValueError("No Gretel API key provided; set GRETEL_PROD_API_KEY or enter a key.")
os.environ['GRETEL_PROD_API_KEY'] = gretel_prod_api_key

### Evaluate Synthetic Dataset

In [13]:
# Reload packages if you've made changes to the evaluation.py file.
# Alternatively you can restart the kernel to pick up changes
# Order matters: the module is reloaded first, and the task-suite classes are
# imported from it afterwards so the names bind to the reloaded definitions.

from importlib import reload
import evaluation
reload(evaluation)

# Third-party and project imports used by the cells below.
import pandas as pd
from pprint import pprint
from navigator_helpers.llms.llm_suite import GretelLLMSuite
# Imported after reload(evaluation) above so these refer to the fresh classes.
from evaluation import BaseEvaluationTaskSuite, NL2PythonEvaluationTaskSuite, NL2SQLEvaluationTaskSuite
from datasets import load_dataset

In [14]:
# Registry of the datasets this notebook can evaluate, keyed by the short name
# the user types at the selection prompt. Each entry holds:
#   dataset_kwargs - keyword arguments forwarded to load_dataset()
#   code_lang      - optional language tag ("sql" or "python"); the GSM8K
#                    entries omit it
#   eval_kwargs    - column-name keyword arguments forwarded to
#                    llm_as_a_judge_evaluation()
datasets_dict = dict(
    synthetic_text_to_sql=dict(
        dataset_kwargs=dict(
            path="gretelai/synthetic_text_to_sql",
            split="train",
        ),
        code_lang="sql",
        eval_kwargs=dict(
            instruction_col_name="sql_prompt",
            code_col_name="sql",
            context_col_name="sql_context",
        ),
    ),
    gsm8k=dict(
        dataset_kwargs=dict(
            path="openai/gsm8k",
            name="main",
            split="train",
        ),
        eval_kwargs=dict(
            instruction_col_name="question",
            code_col_name="answer",
        ),
    ),
    synthetic_gsm8k=dict(
        dataset_kwargs=dict(
            path="gretelai/synthetic-gsm8k-reflection-405b",
            split="train",
        ),
        eval_kwargs=dict(
            instruction_col_name="question",
            code_col_name="answer",
        ),
    ),
    xlcost_text_to_code=dict(
        dataset_kwargs=dict(
            path="codeparrot/xlcost-text-to-code",
            split="train",
        ),
        code_lang="python",
        eval_kwargs=dict(
            instruction_col_name="text",
            code_col_name="code",
        ),
    ),
)

# Prompt user to select a dataset
print("Available datasets:")
for key in datasets_dict:
    print(f" - {key}")

selected_dataset = input("\nEnter the name of the dataset to load: ").strip()

# Fail fast on an unknown name. Merely printing an error would let the later
# cells run against undefined (or stale, from a previous run) variables.
if selected_dataset not in datasets_dict:
    raise ValueError(
        f"Dataset {selected_dataset!r} not found. "
        f"Please enter a valid dataset name: {', '.join(datasets_dict)}"
    )

# Load the selected dataset
dataset_dict = datasets_dict[selected_dataset]
eval_kwargs = dataset_dict["eval_kwargs"]
# code_lang is optional in the registry; None means no language-specific suite.
code_lang = dataset_dict.get("code_lang")
dataset = load_dataset(**dataset_dict["dataset_kwargs"])

# Keep at most the first 1000 rows and convert to a pandas DataFrame. The
# range is clamped so a dataset with fewer than 1000 rows does not make
# select() fail on out-of-range indices.
dataset_1000 = dataset.select(range(min(1000, len(dataset))))
dataset_1000_pd = dataset_1000.to_pandas()

print(f"Loaded dataset '{selected_dataset}' successfully!")

Available datasets:
 - synthetic_text_to_sql
 - gsm8k
 - synthetic_gsm8k
 - xlcost_text_to_code
Loaded dataset 'synthetic_text_to_sql' successfully!


In [15]:
# Build the LLM suite with its default settings; the same instance is passed
# to every evaluation task suite constructed below.
# NOTE(review): presumably authenticates via GRETEL_PROD_API_KEY set earlier — confirm.
llm_suite = GretelLLMSuite()

2024-10-09 23:09:37.538 - INFO - 🦜 Initializing LLM suite
2024-10-09 23:09:37.540 - INFO - 📖 Natural language LLM: gretelai-mistral-nemo-2407
2024-10-09 23:09:37.541 - INFO - 💻 Code LLM: gretelai-mistral-nemo-2407
2024-10-09 23:09:37.542 - INFO - ⚖️ Judge LLM: gretelai-mistral-nemo-2407


In [16]:
# Row-uniqueness metrics for the sampled DataFrame.
uniqueness_suite = BaseEvaluationTaskSuite(llm_suite, dataset_1000_pd)
results_1 = uniqueness_suite.row_uniqueness()
pprint(results_1)

{'non_semantically_unique_ids': [(2, 905), (78, 595), (192, 939), (272, 378)],
 'non_unique_ids': [],
 'percent_semantically_unique': 99.6,
 'percent_unique': 100.0}


In [17]:

# Per-column cardinality metrics for the sampled DataFrame.
cardinality_suite = BaseEvaluationTaskSuite(llm_suite, dataset_1000_pd)
results_2 = cardinality_suite.feature_cardinality()
pprint(results_2)

{'domain': 0.1,
 'domain_description': 0.1,
 'id': 1.0,
 'sql': 1.0,
 'sql_complexity': 0.008,
 'sql_complexity_description': 0.008,
 'sql_context': 0.996,
 'sql_explanation': 1.0,
 'sql_prompt': 1.0,
 'sql_task_type': 0.004,
 'sql_task_type_description': 0.004}


In [18]:

# Per-column value distribution for the sampled DataFrame.
distribution_suite = BaseEvaluationTaskSuite(llm_suite, dataset_1000_pd)
results_3 = distribution_suite.feature_distribution()
pprint(results_3)

({'domain': {'aerospace': 6,
             'agriculture': 8,
             'aquaculture': 15,
             'archeology': 8,
             'arctic research': 4,
             'artificial intelligence': 12,
             'arts and culture': 8,
             'arts operations and management': 15,
             'automotive': 16,
             'beauty industry': 11,
             'biotechnology': 11,
             'blockchain': 6,
             'cannabis industry': 7,
             'charitable organizations': 15,
             'chemicals': 9,
             'civil engineering': 9,
             'climate change': 12,
             'construction': 17,
             'cosmetics': 4,
             'cultural preservation': 8,
             'cybersecurity': 13,
             'defense contractors': 5,
             'defense industry': 9,
             'defense operations': 13,
             'defense security': 16,
             'disability services': 11,
             'education': 9,
             'energy': 8,
             'e

In [19]:

# Word-count metrics per record and per column for the sampled DataFrame.
word_count_suite = BaseEvaluationTaskSuite(llm_suite, dataset_1000_pd)
results_4 = word_count_suite.num_words_per_record()
pprint(results_4)

{'average_words_per_record': 13.046100000000001,
 'word_counts_per_column': {'domain': 1.687,
                            'domain_description': 13.197,
                            'sql': 15.635,
                            'sql_complexity': 1.694,
                            'sql_complexity_description': 8.028,
                            'sql_context': 32.493,
                            'sql_explanation': 35.021,
                            'sql_prompt': 13.867,
                            'sql_task_type': 2.896,
                            'sql_task_type_description': 5.943}}


In [22]:
# Run every base evaluation task in one call and pretty-print the combined result.
base_suite = BaseEvaluationTaskSuite(llm_suite, dataset_1000_pd)
results = base_suite.evaluate_all()
pprint(results)

{'feature_cardinality': {'domain': 0.1,
                         'domain_description': 0.1,
                         'id': 1.0,
                         'sql': 1.0,
                         'sql_complexity': 0.008,
                         'sql_complexity_description': 0.008,
                         'sql_context': 0.996,
                         'sql_explanation': 1.0,
                         'sql_prompt': 1.0,
                         'sql_task_type': 0.004,
                         'sql_task_type_description': 0.004},
 'feature_distribution': ({'domain': {'aerospace': 6,
                                      'agriculture': 8,
                                      'aquaculture': 15,
                                      'archeology': 8,
                                      'arctic research': 4,
                                      'artificial intelligence': 12,
                                      'arts and culture': 8,
                                      'arts operations and m

In [None]:
# Run the language-specific evaluation that matches the selected dataset.
# The suite is chosen from code_lang instead of being hard-coded to Python, so
# a SQL dataset is not scored by the Python suite; this mirrors the dispatch
# used for the LLM-as-a-judge cell further down.
# NOTE(review): assumes NL2SQLEvaluationTaskSuite exposes evaluate_all() like
# the other suites — confirm against evaluation.py.
code_suites = {
    "sql": NL2SQLEvaluationTaskSuite,
    "python": NL2PythonEvaluationTaskSuite,
}
code_suite_cls = code_suites.get(code_lang)
if code_suite_cls is None:
    # Datasets without a code language have no code-specific suite to run.
    print(f"No code-specific evaluation suite for code_lang={code_lang!r}; skipping.")
else:
    results = code_suite_cls(
        llm_suite=llm_suite, dataset=dataset_1000_pd, code_lang=code_lang
    ).evaluate_all()
    pprint(results)

### Testing LLM-as-a-Judge Evaluation

In [28]:
# First ten rows only, to keep the LLM-as-a-judge run small.
dataset_10_pd = dataset_1000_pd.iloc[:10]

In [None]:
# Language-specific task suites, keyed by the dataset's code_lang tag.
judge_suites = {
    "sql": NL2SQLEvaluationTaskSuite,
    "python": NL2PythonEvaluationTaskSuite,
}

if code_lang in judge_suites:
    # Known language: build the matching suite, passing the same tag through.
    task_5 = judge_suites[code_lang](
        llm_suite=llm_suite, dataset=dataset_10_pd, code_lang=code_lang
    )
else:
    # No (or unrecognised) language tag: fall back to the base suite.
    task_5 = BaseEvaluationTaskSuite(llm_suite, dataset_10_pd)

# Column names for the judge come from the selected dataset's eval_kwargs.
results_5 = task_5.llm_as_a_judge_evaluation(**eval_kwargs)
table_5 = task_5.output_dataset

In [None]:
# Print the value returned by llm_as_a_judge_evaluation().
print(results_5)
# Preview the first rows of the suite's output_dataset; as the last expression
# in the cell it is rendered with the notebook's rich display.
table_5.head()