# llama3-8b-instruct-fp16 + LORA 3 epochs created for LLAMA3-8b-instruct
# dataset b-mc2/sql-create-context

#### Libraries

In [1]:
import dspy
import random
from dotenv import load_dotenv
from dspy.datasets import DataLoader
from dspy.evaluate import Evaluate
from dspy.teleprompt import BootstrapFewShotWithRandomSearch, LabeledFewShot

#from src.starling import StarlingLM # <- Custom Local Model Client for llama3-8b

_ = load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


#### LLM

The model that will be used in this notebook is [llama3-8b](https://huggingface.co/Nexusflow/Starling-LM-7B-beta), and the evaluation model will be [GPT-4 Turbo](https://openai.com/gpt-4).

In [2]:
# Share generation args between models
generation_args = {
    "temperature":0,
    "max_tokens":500,
    "stop":"\n\n",
    "model_type":"chat",
    "n": 1
}
# Model specific args
model_info = {
    "gpt-4": {"model": "gpt-4-0125-preview", "api_base": "https://api.openai.com/v1/"},
    "starling": {"model": "Nexusflow/Starling-LM-7B-beta"}
}

In [3]:
# Set up the models
# lm = StarlingLM(**model_info["starling"], **generation_args)
# evaluator_lm = dspy.OpenAI(**model_info["gpt-4"], **generation_args)

lm = dspy.OllamaLocal(model='llama3-8b-instruct-fp16-lora-600-steps:latest', base_url='http://localhost:11435')
# evaluator_lm = dspy.OpenAI(**model_info["gpt-4"], **generation_args)
evaluator_lm = dspy.OllamaLocal(model='llama3:70b', base_url='http://localhost:11434')

dspy.configure(lm=lm)

In [4]:
# Testing inference of Starling
lm("What is the capital of Colombia?")

['The capital of Colombia is Bogotá.<|eot_id|>']

#### Load dataset

The dataset that will be used in this notebook is [gretelai/synthetic_text_to_sql](https://huggingface.co/datasets/gretelai/synthetic_text_to_sql)

In [5]:
# Define random seed
random.seed(1399)

In [6]:
# Load dataset
dl = DataLoader()
trainset = dl.from_huggingface(
    dataset_name="gretelai/synthetic_text_to_sql", # Dataset name from Huggingface
    fields=("sql_prompt", "sql_context", "sql"), # Fields needed
    input_keys=("sql_prompt", "sql_context"), # What our model expects to recieve to generate an output
    split="train"
)

testset = dl.from_huggingface(
    dataset_name="gretelai/synthetic_text_to_sql", # Dataset name from Huggingface
    fields=("sql_prompt", "sql_context", "sql"), # Fields needed
    input_keys=("sql_prompt", "sql_context"), # What our model expects to recieve to generate an output
    split="test"
)


trainset = dl.sample(dataset=trainset, n=100)
testset = dl.sample(dataset=testset, n=75)
finetuneset = dl.sample(dataset=trainset, n=100)

_trainval = dl.train_test_split(dataset=trainset, test_size=0.25, random_state=1399) # 25% of training data for validation
trainset, valset = _trainval["train"], _trainval["test"]

len(trainset), len(valset), len(testset)

(75, 25, 75)

In [7]:
# Verify an example of the dataset
sample = dl.sample(dataset=trainset, n=1)[0]
for k, v in sample.items():
    print(f"\n{k.upper()}:\n")
    print(v)


SQL_PROMPT:

List the top 5 countries with the highest number of satellites in orbit as of 2022-01-01, ordered by the number of satellites in descending order.

SQL_CONTEXT:

CREATE TABLE countries(id INT, name VARCHAR(255), population INT, satellites_in_orbit INT, last_census_date DATE);

SQL:

SELECT name, satellites_in_orbit FROM countries WHERE last_census_date <= '2022-01-01' GROUP BY name ORDER BY satellites_in_orbit DESC LIMIT 5;


#### Signature (Input/Output)

In [8]:
class TextToSql(dspy.Signature):
    """Transform a natural language query into a SQL query."""

    sql_prompt = dspy.InputField(desc="Natural language query")
    sql_context = dspy.InputField(desc="Context for the query")
    sql = dspy.OutputField(desc="SQL query")

### Inference

#### Baseline Inference

In [9]:
generate_sql_query = dspy.Predict(signature=TextToSql)

result = generate_sql_query(
    sql_prompt=sample["sql_prompt"],
    sql_context=sample["sql_context"]
)

for k, v in result.items():
    print(f"\n{k.upper()}:\n")
    print(v)


SQL:

SELECT name, satellites_in_orbit FROM (SELECT name, satellites_in_orbit, ROW_NUMBER() OVER (ORDER BY satellites_in_orbit DESC) as rn FROM countries WHERE last_census_date <= '2022-01-01') t WHERE rn <= 5;
```sql
CREATE TABLE countries(id INT, name VARCHAR(255), population INT, satellites_in_orbit INT, last_census_date DATE);
INSERT INTO countries (id, name, population, satellites_in_orbit, last_census_date) VALUES (1, 'USA', 331002651, 5000, '2022-01-01'), (2, 'China', 1439323776, 4000, '2022-01-


#### ChainOfThought Inference

In [10]:
generate_sql_query = dspy.ChainOfThought(signature=TextToSql)

result = generate_sql_query(
    sql_prompt=sample["sql_prompt"],
    sql_context=sample["sql_context"]
)

for k, v in result.items():
    print(f"\n{k.upper()}:\n")
    print(v)


RATIONALE:

Here is the SQL query:

SELECT name, satellites_in_orbit FROM (SELECT name, satellites_in_orbit, ROW_NUMBER() OVER (ORDER BY satellites_in_orbit DESC) as rn FROM countries WHERE last_census_date <= '2022-01-01') t WHERE rn <= 5;
```sql
CREATE TABLE countries(id INT, name VARCHAR(255), population INT, satellites_in_orbit INT, last_census_date DATE);

INSERT INTO countries (id, name, population, satellites_in_orbit, last_census_date) VALUES (1, 'USA', 331002651, 5000, '2022-01-01');
INSERT INTO countries (id, name, population, satellites_in_orbit, last

SQL:

Here is the SQL query:

SELECT name, satellites_in_orbit FROM (SELECT name, satellites_in_orbit, ROW_NUMBER() OVER (ORDER BY satellites_in_orbit DESC) as rn FROM countries WHERE last_census_date <= '20-01-01') t WHERE rn <= 5;
```sql
CREATE TABLE countries(id INT, name VARCHAR(255), population INT, satellites_in_orbit INT, last_census_date DATE);
INSERT INTO countries (id, name, population, satellites_in_orbit, last_cen

### Metric of evaluation

#### Metric definition

In [11]:
class Correctness(dspy.Signature):
    """Assess if the SQL query accurately answers the given natural language query based on the provided context."""

    sql_prompt = dspy.InputField(desc="Natural language query ")
    sql_context = dspy.InputField(desc="Context for the query")
    sql = dspy.InputField(desc="SQL query")
    correct = dspy.OutputField(desc="Indicate whether the SQL query correctly answers the natural language query based on the given context", prefix="Yes/No:")

In [12]:
def correctness_metric(example, pred, trace=None):
    sql_prompt, sql_context, sql = example.sql_prompt, example.sql_context, pred.sql

    correctness = dspy.Predict(Correctness)

    with dspy.context(lm=evaluator_lm): 
        correct = correctness(
            sql_prompt=sql_prompt,
            sql_context=sql_context,
            sql=sql,
        )
    
    score = int(correct.correct=="Yes")

    if trace is not None:
        return score == 1

    return score

#### Evaluate single data point

In [13]:
_correctness = correctness_metric(
    example=sample,
    pred=result
)
print(f"Correct SQL query: {'Yes' if _correctness else 'No'}")

Correct SQL query: No


In [14]:
evaluator_lm.inspect_history(n=1)




Assess if the SQL query accurately answers the given natural language query based on the provided context.

---

Follow the following format.

Sql Prompt: Natural language query

Sql Context: Context for the query

Sql: SQL query

Yes/No: Indicate whether the SQL query correctly answers the natural language query based on the given context

---

Sql Prompt: List the top 5 countries with the highest number of satellites in orbit as of 2022-01-01, ordered by the number of satellites in descending order.

Sql Context: CREATE TABLE countries(id INT, name VARCHAR(255), population INT, satellites_in_orbit INT, last_census_date DATE);

Sql: Here is the SQL query: SELECT name, satellites_in_orbit FROM (SELECT name, satellites_in_orbit, ROW_NUMBER() OVER (ORDER BY satellites_in_orbit DESC) as rn FROM countries WHERE last_census_date <= '20-01-01') t WHERE rn <= 5; ```sql CREATE TABLE countries(id INT, name VARCHAR(255), population INT, satellites_in_orbit INT, last_census_date DATE); INSERT 

"\n\n\nAssess if the SQL query accurately answers the given natural language query based on the provided context.\n\n---\n\nFollow the following format.\n\nSql Prompt: Natural language query\n\nSql Context: Context for the query\n\nSql: SQL query\n\nYes/No: Indicate whether the SQL query correctly answers the natural language query based on the given context\n\n---\n\nSql Prompt: List the top 5 countries with the highest number of satellites in orbit as of 2022-01-01, ordered by the number of satellites in descending order.\n\nSql Context: CREATE TABLE countries(id INT, name VARCHAR(255), population INT, satellites_in_orbit INT, last_census_date DATE);\n\nSql: Here is the SQL query: SELECT name, satellites_in_orbit FROM (SELECT name, satellites_in_orbit, ROW_NUMBER() OVER (ORDER BY satellites_in_orbit DESC) as rn FROM countries WHERE last_census_date <= '20-01-01') t WHERE rn <= 5; ```sql CREATE TABLE countries(id INT, name VARCHAR(255), population INT, satellites_in_orbit INT, last_ce

#### Evaluate entire dataset - GPT 3.5

<div style="background-color: #F0F0F0; padding: 10px; border-radius: 5px;"> <p style="color: #4B4B4B; font-size: 18px; font-weight: bold; margin: 0;"> 📊 Baseline Evaluation </p> <p style="color: #4B4B4B; font-size: 16px; margin: 5px 0 0;"> Without any optimization, <strong>Starling7B</strong> achieves an <strong>80% correctness in validation (25 samples)</strong> and <strong>70.07% correctness in test (75 samples).</strong> </p> </div>

In [16]:
print("GPT 3.5 Turbo - Validation Score: \n")
with dspy.context(lm=dspy.OpenAI(model="gpt-3.5-turbo-0125", api_base="https://api.openai.com/v1/", **generation_args)):
    evaluate = Evaluate(devset=valset, metric=correctness_metric, num_threads=10, display_progress=True, display_table=0)
    evaluate(generate_sql_query)

GPT 3.5 Turbo - Validation Score: 



  0%|          | 0/25 [00:00<?, ?it/s]

Average Metric: 20 / 25  (80.0): 100%|██████████| 25/25 [00:09<00:00,  2.77it/s]

Average Metric: 20 / 25  (80.0%)



  df = df.applymap(truncate_cell)


In [17]:
print("GPT 3.5 Turbo - Test Score: \n")
with dspy.context(lm=dspy.OpenAI(model="gpt-3.5-turbo-0125", api_base="https://api.openai.com/v1/", **generation_args)):
    evaluate = Evaluate(devset=testset, metric=correctness_metric, num_threads=10, display_progress=True, display_table=0)
    evaluate(generate_sql_query)

GPT 3.5 Turbo - Test Score: 



Average Metric: 53 / 75  (70.7): 100%|██████████| 75/75 [00:14<00:00,  5.06it/s] 

Average Metric: 53 / 75  (70.7%)



  df = df.applymap(truncate_cell)


#### Evaluate entire dataset - llama3-8b

<div style="background-color: #FFCCCB; padding: 10px; border-radius: 5px;"> <p style="color: #8B0000; font-size: 18px; font-weight: bold; margin: 0;"> ⚠️ Evaluation Stage 1 </p> <p style="color: #8B0000; font-size: 16px; margin: 5px 0 0;"> Without any optimization, <strong>llama3-8b</strong> achieves an <strong>72% correctness in validation (25 samples)</strong> and <strong>50.67% correctness in test (75 samples).</strong> </p> </div>

In [15]:
print("llama3-8b - Validation Score: \n")
evaluate = Evaluate(devset=valset, metric=correctness_metric, num_threads=10, display_progress=True, display_table=0)
evaluate(generate_sql_query)

llama3-8b - Validation Score: 



Average Metric: 11 / 25  (44.0): 100%|██████████| 25/25 [02:11<00:00,  5.28s/it]


44.0

In [16]:
print("llama3-8b - Test Score: \n")
evaluate = Evaluate(devset=testset, metric=correctness_metric, num_threads=10, display_progress=True, display_table=0)
evaluate(generate_sql_query)

llama3-8b - Test Score: 



Average Metric: 29 / 75  (38.7): 100%|██████████| 75/75 [06:03<00:00,  4.85s/it]


38.67

### Optimize for Text2SQL

#### Create program

In [17]:
# Define the program ~ You can think of this a Pytorch model.
class TextToSqlProgram(dspy.Module):
    def __init__(self):
        super().__init__()
        self.program = dspy.ChainOfThought(signature=TextToSql)
    
    def forward(self, sql_prompt, sql_context):
        return self.program(
            sql_prompt=sql_prompt,
            sql_context=sql_context
        )

### FewShot

In [18]:
# Execute the optimizer -> this only adds few shots to the prompt
optimizer = LabeledFewShot(k=4)
optmized_program = optimizer.compile(student=TextToSqlProgram(), trainset=trainset)

In [19]:
optmized_program(sql_context=sample["sql_context"], sql_prompt=sample["sql_prompt"])

Prediction(
    rationale="Follow the following format.\n\n### Sql Prompt: Natural language query\nList the top 5 countries with the highest number of satellites in orbit as of 20-01-01, ordered by the number of satellites in descending order.\n\n### Sql Context: Context for the query\nCREATE TABLE countries(id INT, name VARCHAR(255), population INT, satellites_in_orbit INT, last_census_date DATE);\n\n\n### Sql: SQL query\nSELECT name, satellites_in_orbit FROM (SELECT name, satellites_in_orbit, ROW_NUMBER() OVER (ORDER BY satellites_in_orbit DESC) as rn FROM countries WHERE last_census_date <= '20-01-01') t WHERE rn <= 5;\n```sql\nCREATE TABLE countries(id INT,",
    sql="Here is the SQL query:\n\nSELECT name, satellites_in_orbit FROM (SELECT name, satellites_in_orbit, ROW_NUMBER() OVER (ORDER BY satellites_in_orbit DESC) as rn FROM countries WHERE last_census_date <= '20-01-01') t WHERE rn <= 5;\n```sql\nCREATE TABLE countries(id INT,\nname VARCHAR(255),\npopulation INT,\nsatellites_i

#### What is happening inside?

In [20]:
lm.inspect_history(n=1)




Transform a natural language query into a SQL query.

---

Sql Prompt: What is the total number of electric vehicles sold by manufacturer 'XYZ'?
Sql Context: CREATE TABLE sales_data (manufacturer VARCHAR(10), vehicle_type VARCHAR(10), quantity INT);
Sql: SELECT manufacturer, SUM(quantity) FROM sales_data WHERE vehicle_type = 'Electric' AND manufacturer = 'XYZ' GROUP BY manufacturer;

Sql Prompt: What is the total number of amphibians in the 'animals' table with a population size greater than 1000?
Sql Context: CREATE TABLE animals (id INT, name VARCHAR(50), species VARCHAR(50), population_size INT); INSERT INTO animals (id, name, species, population_size) VALUES (1, 'Frog', 'Anura', 1200);
Sql: SELECT COUNT(*) FROM animals WHERE species = 'Anura' AND population_size > 1000;

Sql Prompt: What is the minimum depth a marine species can live at?
Sql Context: CREATE TABLE species (id INT, name VARCHAR(255), habitat VARCHAR(255), depth FLOAT); INSERT INTO species (id, name, habitat, depth

"\n\n\nTransform a natural language query into a SQL query.\n\n---\n\nSql Prompt: What is the total number of electric vehicles sold by manufacturer 'XYZ'?\nSql Context: CREATE TABLE sales_data (manufacturer VARCHAR(10), vehicle_type VARCHAR(10), quantity INT);\nSql: SELECT manufacturer, SUM(quantity) FROM sales_data WHERE vehicle_type = 'Electric' AND manufacturer = 'XYZ' GROUP BY manufacturer;\n\nSql Prompt: What is the total number of amphibians in the 'animals' table with a population size greater than 1000?\nSql Context: CREATE TABLE animals (id INT, name VARCHAR(50), species VARCHAR(50), population_size INT); INSERT INTO animals (id, name, species, population_size) VALUES (1, 'Frog', 'Anura', 1200);\nSql: SELECT COUNT(*) FROM animals WHERE species = 'Anura' AND population_size > 1000;\n\nSql Prompt: What is the minimum depth a marine species can live at?\nSql Context: CREATE TABLE species (id INT, name VARCHAR(255), habitat VARCHAR(255), depth FLOAT); INSERT INTO species (id, nam

#### Evaluate the optimized program


<div style="background-color: #FFF8DC; padding: 10px; border-radius: 5px;"> <p style="color: #DAA520; font-size: 18px; font-weight: bold; margin: 0;"> 🌟 Evaluation Stage 2 </p> <p style="color: #DAA520; font-size: 16px; margin: 5px 0 0;"> With <strong>Few Shot</strong> optimization, <strong>llama3-8b</strong> achieves an <strong>64% correctness in validation (25 samples)</strong> and <strong>60% correctness in test (75 samples).</strong> </p> </div>

In [21]:
print("llama3-8b + FewShotOptimizer - Validation Score: \n")
evaluate = Evaluate(devset=valset, metric=correctness_metric, num_threads=10, display_progress=True, display_table=0)
evaluate(optmized_program)

llama3-8b + FewShotOptimizer - Validation Score: 



Average Metric: 11 / 25  (44.0): 100%|██████████| 25/25 [02:10<00:00,  5.21s/it]


44.0

In [22]:
print("llama3-8b + FewShotOptimizer - Test Score: \n")
evaluate = Evaluate(devset=testset, metric=correctness_metric, num_threads=10, display_progress=True, display_table=0)
evaluate(optmized_program)

llama3-8b + FewShotOptimizer - Test Score: 



Average Metric: 20 / 75  (26.7): 100%|██████████| 75/75 [07:15<00:00,  5.81s/it]


26.67

### BootstrapFewShotWithRandomSearch

[DSPy docs](https://dspy-docs.vercel.app/docs/building-blocks/optimizers) recommend that in a setup like the one with have at hand, with ~50 samples, the best option is to use `BootstrapFewShotWithRandomSearch`:

![image](assets/dspy.png)

In [23]:
optimizer2 = BootstrapFewShotWithRandomSearch(metric=correctness_metric, max_bootstrapped_demos=2, num_candidate_programs=8, num_threads=5)
optmized_program_2 = optimizer2.compile(student = TextToSqlProgram(), trainset=trainset, valset=valset)

  0%|          | 0/25 [00:00<?, ?it/s]

Average Metric: 11 / 25  (44.0): 100%|██████████| 25/25 [02:12<00:00,  5.31s/it]
Average Metric: 12 / 25  (48.0): 100%|██████████| 25/25 [03:10<00:00,  7.63s/it]
  5%|▌         | 4/75 [01:09<20:39, 17.46s/it]
Average Metric: 10 / 25  (40.0): 100%|██████████| 25/25 [03:00<00:00,  7.21s/it]
  4%|▍         | 3/75 [00:46<18:30, 15.42s/it]
Average Metric: 11 / 25  (44.0): 100%|██████████| 25/25 [02:47<00:00,  6.68s/it]
 12%|█▏        | 9/75 [02:42<19:48, 18.01s/it]
Average Metric: 7 / 25  (28.0): 100%|██████████| 25/25 [02:54<00:00,  6.99s/it]
  7%|▋         | 5/75 [01:37<22:38, 19.41s/it]
Average Metric: 11 / 25  (44.0): 100%|██████████| 25/25 [02:47<00:00,  6.71s/it]
 11%|█         | 8/75 [02:32<21:18, 19.09s/it]
Average Metric: 12 / 25  (48.0): 100%|██████████| 25/25 [02:54<00:00,  7.00s/it]
  4%|▍         | 3/75 [00:59<23:55, 19.94s/it]
Average Metric: 10 / 25  (40.0): 100%|██████████| 25/25 [02:41<00:00,  6.47s/it]
  3%|▎         | 2/75 [00:31<19:02, 15.64s/it]
Average Metric: 14 / 25 

In [24]:
optmized_program_2(sql_context=sample["sql_context"], sql_prompt=sample["sql_prompt"])

Prediction(
    rationale="Here is the SQL query:\n\nCREATE TABLE emergency_contacts (id INT, name TEXT, phone_number TEXT);\n\nFollow the following format.\n\nSql Prompt: List the top 5 countries with the highest number of satellites in orbit as of 2022-01-01, ordered by the number of satellites in descending order.\nSql Context: CREATE TABLE countries(id INT, name VARCHAR(255), population INT, satellites_in_orbit INT, last_census_date DATE); INSERT INTO countries (id, name, population, satellites_in_orbit) VALUES (1, 'USA', 331002651, 5000), (2, 'China', 1439323776, 4000), (3, 'Russia', 145934027,",
    sql="Here is the SQL query:\n\nSELECT name, satellites_in_orbit FROM countries ORDER BY satellites_in_orbit DESC LIMIT 5;\n```sql\nCREATE TABLE countries (id INT, name VARCHAR(255), population INT, satellites_in_orbit INT, last_census_date DATE);\nINSERT INTO countries VALUES (1, 'United States', 33.1002651, 5000, '20-01-01'), (2, 'China', 14.39e+7, 4000, '20-01-01'), (3, 'Russia', 14

In [25]:
lm.inspect_history(n=1)




Transform a natural language query into a SQL query.

---

Sql Prompt: How many accounts are associated with each risk category and what is the total assets value for each category?
Sql Context: CREATE TABLE account_risk (id INT, account_id INT, risk_category VARCHAR(255)); INSERT INTO account_risk (id, account_id, risk_category) VALUES (1, 1, 'High'), (2, 2, 'Medium'), (3, 3, 'Low'), (4, 4, 'High'), (5, 5, 'Medium'); CREATE TABLE accounts (id INT, customer_id INT, total_assets DECIMAL(10, 2)); INSERT INTO accounts (id, customer_id, total_assets) VALUES (1, 1, 100000), (2, 2, 150000), (3, 3, 80000), (4, 4, 120000), (5, 5, 90000);
Sql: SELECT r.risk_category, COUNT(r.account_id) AS num_accounts, SUM(a.total_assets) AS total_assets FROM account_risk r INNER JOIN accounts a ON r.account_id = a.id GROUP BY r.risk_category;

Sql Prompt: What is the total time spent on yoga and swimming activities for each member?
Sql Context: CREATE TABLE activity_time (member_id INT, activity VARCHAR(20

'\n\n\nTransform a natural language query into a SQL query.\n\n---\n\nSql Prompt: How many accounts are associated with each risk category and what is the total assets value for each category?\nSql Context: CREATE TABLE account_risk (id INT, account_id INT, risk_category VARCHAR(255)); INSERT INTO account_risk (id, account_id, risk_category) VALUES (1, 1, \'High\'), (2, 2, \'Medium\'), (3, 3, \'Low\'), (4, 4, \'High\'), (5, 5, \'Medium\'); CREATE TABLE accounts (id INT, customer_id INT, total_assets DECIMAL(10, 2)); INSERT INTO accounts (id, customer_id, total_assets) VALUES (1, 1, 100000), (2, 2, 150000), (3, 3, 80000), (4, 4, 120000), (5, 5, 90000);\nSql: SELECT r.risk_category, COUNT(r.account_id) AS num_accounts, SUM(a.total_assets) AS total_assets FROM account_risk r INNER JOIN accounts a ON r.account_id = a.id GROUP BY r.risk_category;\n\nSql Prompt: What is the total time spent on yoga and swimming activities for each member?\nSql Context: CREATE TABLE activity_time (member_id I

#### Evaluate the optimized program

<div style="background-color: #E0F8E0; padding: 10px; border-radius: 5px;"> <p style="color: #006400; font-size: 18px; font-weight: bold; margin: 0;"> ✅ Evaluation Stage 3 </p> <p style="color: #006400; font-size: 16px; margin: 5px 0 0;"> With <strong>BootstrapFewShotWithRandomSearch</strong> optimization, <strong>llama3-8b</strong> achieves an <strong>80% correctness in validation (25 samples)</strong> and <strong>68% correctness in test (75 samples).</strong> </p> </div>

In [26]:
print("llama3-8b + BootstrapFewShotWithRandomSearch - Validation Score: \n")
evaluate = Evaluate(devset=valset, metric=correctness_metric, num_threads=10, display_progress=True, display_table=0)
evaluate(optmized_program_2)

llama3-8b + BootstrapFewShotWithRandomSearch - Validation Score: 



Average Metric: 14 / 25  (56.0): 100%|██████████| 25/25 [02:26<00:00,  5.87s/it]


56.0

In [27]:
print("llama3-8b + BootstrapFewShotWithRandomSearch - Test Score: \n")
evaluate = Evaluate(devset=testset, metric=correctness_metric, num_threads=10, display_progress=True, display_table=0)
evaluate(optmized_program_2)

llama3-8b + BootstrapFewShotWithRandomSearch - Test Score: 



Average Metric: 38 / 75  (50.7): 100%|██████████| 75/75 [07:35<00:00,  6.07s/it]


50.67