llama-3-8b-Instruct-bnb-4bit-synthetic_text_to_sql-lora-3epochs-Q5_K_M

#### Libraries

In [None]:
import dspy
import random
from dotenv import load_dotenv
from dspy.datasets import DataLoader
from dspy.evaluate import Evaluate
from dspy.teleprompt import BootstrapFewShotWithRandomSearch, LabeledFewShot

#from src.starling import StarlingLM # <- Custom Local Model Client for llama3-8b

_ = load_dotenv()

In [2]:
import phoenix as px

phoenix_session = px.launch_app()

from openinference.instrumentation.dspy import DSPyInstrumentor
from opentelemetry import trace as trace_api
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk import trace as trace_sdk
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace.export import SimpleSpanProcessor

endpoint = "http://127.0.0.1:6006/v1/traces"
resource = Resource(attributes={})
tracer_provider = trace_sdk.TracerProvider(resource=resource)
span_otlp_exporter = OTLPSpanExporter(endpoint=endpoint)
tracer_provider.add_span_processor(SimpleSpanProcessor(span_exporter=span_otlp_exporter))

trace_api.set_tracer_provider(tracer_provider=tracer_provider)
DSPyInstrumentor().instrument()

print(phoenix_session.url)

WARNI [phoenix.session.session] Existing running Phoenix instance detected! Shutting it down and starting a new instance...


🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


ModuleNotFoundError: No module named 'openinference.instrumentation.dspy'

#### LLM

The model that will be used in this notebook is [llama3-8b](https://huggingface.co/Nexusflow/Starling-LM-7B-beta), and the evaluation model will be [GPT-4 Turbo](https://openai.com/gpt-4).

In [None]:
# Share generation args between models
generation_args = {
    "temperature":0,
    "max_tokens":500,
    "stop":"\n\n",
    "model_type":"chat",
    "n": 1
}
# Model specific args
model_info = {
    "gpt-4": {"model": "gpt-4-0125-preview", "api_base": "https://api.openai.com/v1/"},
    "starling": {"model": "Nexusflow/Starling-LM-7B-beta"}
}

In [None]:
# Set up the models
# lm = StarlingLM(**model_info["starling"], **generation_args)
# evaluator_lm = dspy.OpenAI(**model_info["gpt-4"], **generation_args)

# lm = dspy.OllamaLocal(model="llama-3-8b-Instruct-bnb-4bit-synthetic_text_to_sql-lora-3epochs-Q5_K_M:latest", base_url='http://localhost:11435')
lm = dspy.OllamaLocal(model="llama-3-8b-Instruct-bnb-4bit-synthetic_text_to_sql-lora-3epochs-Q5_K_M:latest", base_url='http://localhost:11435')
# evaluator_lm = dspy.OpenAI(**model_info["gpt-4"], **generation_args)
evaluator_lm = dspy.OllamaLocal(model='llama3:70b', base_url='http://localhost:11434')

dspy.configure(lm=lm)

In [None]:
# Testing inference of Starling
lm("What is the capital of Colombia?")

#### Load dataset

The dataset that will be used in this notebook is [gretelai/synthetic_text_to_sql](https://huggingface.co/datasets/gretelai/synthetic_text_to_sql)

In [None]:
# Define random seed
random.seed(1399)

In [None]:
# # Load dataset
# dl = DataLoader()
# trainset = dl.from_huggingface(
#     dataset_name="gretelai/synthetic_text_to_sql", # Dataset name from Huggingface
#     fields=("sql_prompt", "sql_context", "sql"), # Fields needed
#     input_keys=("sql_prompt", "sql_context"), # What our model expects to recieve to generate an output
#     split="train"
# )

# testset = dl.from_huggingface(
#     dataset_name="gretelai/synthetic_text_to_sql", # Dataset name from Huggingface
#     fields=("sql_prompt", "sql_context", "sql"), # Fields needed
#     input_keys=("sql_prompt", "sql_context"), # What our model expects to recieve to generate an output
#     split="test"
# )


# trainset = dl.sample(dataset=trainset, n=100)
# testset = dl.sample(dataset=testset, n=75)
# finetuneset = dl.sample(dataset=trainset, n=100)

# _trainval = dl.train_test_split(dataset=trainset, test_size=0.25, random_state=1399) # 25% of training data for validation
# trainset, valset = _trainval["train"], _trainval["test"]

# len(trainset), len(valset), len(testset)

In [None]:
# Load dataset
dl = DataLoader()

testset = dl.from_huggingface(
    dataset_name="gretelai/synthetic_text_to_sql", # Dataset name from Huggingface
    fields=("sql_prompt", "sql_context", "sql"), # Fields needed
    input_keys=("sql_prompt", "sql_context"), # What our model expects to recieve to generate an output
    split="test"
)
# print(len(testset))

testset = dl.sample(dataset=testset, n=200)

# Calculate the sizes of each set
total_size = len(testset)
train_size = int(total_size * 0.4)  # 40% of the data
val_size = int(total_size * 0.2)  # 20% of the data

# Split the dataset into train, validation and test sets
trainset = testset[:train_size]
valset = testset[train_size:train_size + val_size]
testset = testset[train_size + val_size:]

print(len(trainset), len(valset), len(testset))

In [None]:
# Verify an example of the dataset
sample = trainset[0]
for k, v in sample.items():
    print(f"\n{k.upper()}:\n")
    print(v)

In [None]:
# # Verify an example of the dataset
# sample = dl.sample(dataset=trainset, n=1)[0]
# for k, v in sample.items():
#     print(f"\n{k.upper()}:\n")
#     print(v)

#### Signature (Input/Output)

In [None]:
class TextToSql(dspy.Signature):
    """Transform a natural language query into a SQL query."""

    sql_prompt = dspy.InputField(desc="Natural language query")
    sql_context = dspy.InputField(desc="Context for the query")
    sql = dspy.OutputField(desc="SQL query")

### Inference

#### Baseline Inference

In [None]:
generate_sql_query = dspy.Predict(signature=TextToSql)

result = generate_sql_query(
    sql_prompt=sample["sql_prompt"],
    sql_context=sample["sql_context"]
)

for k, v in result.items():
    print(f"\n{k.upper()}:\n")
    print(v)

#### ChainOfThought Inference

In [None]:
generate_sql_query = dspy.ChainOfThought(signature=TextToSql)

result = generate_sql_query(
    sql_prompt=sample["sql_prompt"],
    sql_context=sample["sql_context"]
)

for k, v in result.items():
    print(f"\n{k.upper()}:\n")
    print(v)

### Metric of evaluation

#### Metric definition

In [None]:
class Correctness(dspy.Signature):
    """Assess if the SQL query accurately answers the given natural language query based on the provided context."""

    sql_prompt = dspy.InputField(desc="Natural language query ")
    sql_context = dspy.InputField(desc="Context for the query")
    sql = dspy.InputField(desc="SQL query")
    correct = dspy.OutputField(desc="Indicate whether the SQL query correctly answers the natural language query based on the given context", prefix="Yes/No:")

In [None]:
def correctness_metric(example, pred, trace=None):
    sql_prompt, sql_context, sql = example.sql_prompt, example.sql_context, pred.sql

    correctness = dspy.Predict(Correctness)

    with dspy.context(lm=evaluator_lm): 
        correct = correctness(
            sql_prompt=sql_prompt,
            sql_context=sql_context,
            sql=sql,
        )
    
    score = int(correct.correct=="Yes")

    if trace is not None:
        return score == 1

    return score

#### Evaluate single data point

In [None]:
_correctness = correctness_metric(
    example=sample,
    pred=result
)
print(f"Correct SQL query: {'Yes' if _correctness else 'No'}")

In [None]:
evaluator_lm.inspect_history(n=1)

#### Evaluate entire dataset - GPT 3.5

<div style="background-color: #F0F0F0; padding: 10px; border-radius: 5px;"> <p style="color: #4B4B4B; font-size: 18px; font-weight: bold; margin: 0;"> 📊 Baseline Evaluation </p> <p style="color: #4B4B4B; font-size: 16px; margin: 5px 0 0;"> Without any optimization, <strong>Starling7B</strong> achieves an <strong>80% correctness in validation (25 samples)</strong> and <strong>70.07% correctness in test (75 samples).</strong> </p> </div>

In [None]:
print("GPT 3.5 Turbo - Validation Score: \n")
with dspy.context(lm=dspy.OpenAI(model="gpt-3.5-turbo-0125", api_base="https://api.openai.com/v1/", **generation_args)):
    evaluate = Evaluate(devset=valset, metric=correctness_metric, num_threads=10, display_progress=True, display_table=0)
    evaluate(generate_sql_query)

In [None]:
print("GPT 3.5 Turbo - Test Score: \n")
with dspy.context(lm=dspy.OpenAI(model="gpt-3.5-turbo-0125", api_base="https://api.openai.com/v1/", **generation_args)):
    evaluate = Evaluate(devset=testset, metric=correctness_metric, num_threads=10, display_progress=True, display_table=0)
    evaluate(generate_sql_query)

#### Evaluate entire dataset - llama3-8b

<div style="background-color: #FFCCCB; padding: 10px; border-radius: 5px;"> <p style="color: #8B0000; font-size: 18px; font-weight: bold; margin: 0;"> ⚠️ Evaluation Stage 1 </p> <p style="color: #8B0000; font-size: 16px; margin: 5px 0 0;"> Without any optimization, <strong>llama3-8b</strong> achieves an <strong>72% correctness in validation (25 samples)</strong> and <strong>50.67% correctness in test (75 samples).</strong> </p> </div>

In [None]:
print("llama3-8b - Validation Score: \n")
evaluate = Evaluate(devset=valset, metric=correctness_metric, num_threads=10, display_progress=True, display_table=0)
evaluate(generate_sql_query)

In [None]:
print("llama3-8b - Test Score: \n")
evaluate = Evaluate(devset=testset, metric=correctness_metric, num_threads=10, display_progress=True, display_table=0)
evaluate(generate_sql_query)

### Optimize for Text2SQL

#### Create program

In [None]:
# Define the program ~ You can think of this a Pytorch model.
class TextToSqlProgram(dspy.Module):
    def __init__(self):
        super().__init__()
        self.program = dspy.ChainOfThought(signature=TextToSql)
    
    def forward(self, sql_prompt, sql_context):
        return self.program(
            sql_prompt=sql_prompt,
            sql_context=sql_context
        )

### FewShot

In [None]:
# Execute the optimizer -> this only adds few shots to the prompt
optimizer = LabeledFewShot(k=4)
optmized_program = optimizer.compile(student=TextToSqlProgram(), trainset=trainset)

In [None]:
optmized_program(sql_context=sample["sql_context"], sql_prompt=sample["sql_prompt"])

#### What is happening inside?

In [None]:
lm.inspect_history(n=1)

#### Evaluate the optimized program


<div style="background-color: #FFF8DC; padding: 10px; border-radius: 5px;"> <p style="color: #DAA520; font-size: 18px; font-weight: bold; margin: 0;"> 🌟 Evaluation Stage 2 </p> <p style="color: #DAA520; font-size: 16px; margin: 5px 0 0;"> With <strong>Few Shot</strong> optimization, <strong>llama3-8b</strong> achieves an <strong>64% correctness in validation (25 samples)</strong> and <strong>60% correctness in test (75 samples).</strong> </p> </div>

In [None]:
print("llama3-8b + FewShotOptimizer - Validation Score: \n")
evaluate = Evaluate(devset=valset, metric=correctness_metric, num_threads=10, display_progress=True, display_table=0)
evaluate(optmized_program)

In [None]:
print("llama3-8b + FewShotOptimizer - Test Score: \n")
evaluate = Evaluate(devset=testset, metric=correctness_metric, num_threads=10, display_progress=True, display_table=0)
evaluate(optmized_program)

### BootstrapFewShotWithRandomSearch

[DSPy docs](https://dspy-docs.vercel.app/docs/building-blocks/optimizers) recommend that in a setup like the one with have at hand, with ~50 samples, the best option is to use `BootstrapFewShotWithRandomSearch`:

![image](assets/dspy.png)

In [None]:
optimizer2 = BootstrapFewShotWithRandomSearch(metric=correctness_metric, max_bootstrapped_demos=2, num_candidate_programs=8, num_threads=5)
optmized_program_2 = optimizer2.compile(student = TextToSqlProgram(), trainset=trainset, valset=valset)

In [None]:
optmized_program_2(sql_context=sample["sql_context"], sql_prompt=sample["sql_prompt"])

In [None]:
lm.inspect_history(n=1)

#### Evaluate the optimized program

<div style="background-color: #E0F8E0; padding: 10px; border-radius: 5px;"> <p style="color: #006400; font-size: 18px; font-weight: bold; margin: 0;"> ✅ Evaluation Stage 3 </p> <p style="color: #006400; font-size: 16px; margin: 5px 0 0;"> With <strong>BootstrapFewShotWithRandomSearch</strong> optimization, <strong>llama3-8b</strong> achieves an <strong>80% correctness in validation (25 samples)</strong> and <strong>68% correctness in test (75 samples).</strong> </p> </div>

In [None]:
print("llama3-8b + BootstrapFewShotWithRandomSearch - Validation Score: \n")
evaluate = Evaluate(devset=valset, metric=correctness_metric, num_threads=10, display_progress=True, display_table=0)
evaluate(optmized_program_2)

In [None]:
print("llama3-8b + BootstrapFewShotWithRandomSearch - Test Score: \n")
evaluate = Evaluate(devset=testset, metric=correctness_metric, num_threads=10, display_progress=True, display_table=0)
evaluate(optmized_program_2)