In [1]:
import os
import re
import time
import random
import pandas as pd
import dspy
import phoenix as px

from dspy import evaluate
from dspy.datasets import DataLoader
from dspy.evaluate import Evaluate
from dspy.teleprompt import BootstrapFewShotWithRandomSearch, LabeledFewShot
# from utils_random_search import BootstrapFewShotWithRandomSearch
# from utils_evaluate import Evaluate as Evaluate_multiple
from dspy.teleprompt import LabeledFewShot
from openinference.semconv.resource import ResourceAttributes
from openinference.instrumentation.dspy import DSPyInstrumentor
from opentelemetry import trace as trace_api
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk import trace as trace_sdk
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from openinference.semconv.trace import SpanAttributes
from phoenix.trace import using_project
from dspy.modeling import TextBackend

In [2]:
import logging
import sys

root = logging.getLogger()
root.setLevel(logging.DEBUG)

handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
handler.setFormatter(formatter)
root.addHandler(handler)

import os
os.environ["LITELLM_LOG"] = "DEBUG"
import litellm
litellm.set_verbose=True

In [3]:
import phoenix as px

from openinference.semconv.resource import ResourceAttributes
from openinference.instrumentation.dspy import DSPyInstrumentor
# from clank.so.openinference.semconv.resource import ResourceAttributes
# from clank.so-openinference.instrumentation.dspy import DSPyInstrumentor
from opentelemetry import trace as trace_api
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk import trace as trace_sdk
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from openinference.semconv.trace import SpanAttributes

endpoint = "http://127.0.0.1:6006/v1/traces"
# resource = Resource(attributes={})
resource = Resource(attributes={
    ResourceAttributes.PROJECT_NAME: 'Span-test'
})
tracer_provider = trace_sdk.TracerProvider(resource=resource)
span_otlp_exporter = OTLPSpanExporter(endpoint=endpoint)
tracer_provider.add_span_processor(SimpleSpanProcessor(span_exporter=span_otlp_exporter))
trace_api.set_tracer_provider(tracer_provider=tracer_provider)
DSPyInstrumentor().instrument()

In [4]:
# %load_ext autoreload
# %autoreload 2
# import sys; sys.path.append('/future/u/okhattab/repos/public/stanfordnlp/dspy')

import dspy
from dspy.evaluate import Evaluate
from dspy.datasets.gsm8k import GSM8K, gsm8k_metric
from dspy.teleprompt import BootstrapFewShotWithRandomSearch
from dspy.modeling import JSONBackend, TextBackend

In [5]:
# backend = JSONBackend(model="ollama/llama3:70b", api_base="http://localhost:11434", params={"max_tokens": 500, "temperature": 0.3, "num_retries": 5, "repeat_penalty": 1.2, "top_p": 0.9})
# backend = JSONBackend(model="ollama/llama3:70b", api_base="http://localhost:11434", params={"max_tokens": 500, "temperature": 0.3, "num_retries": 5, "repeat_penalty": 1.2, "top_p": 0.9, "response_format": {"type": "json_object"}})


# backend = TextBackend((model="ollama/llama3:70b", api_base="http://localhost:11434", params={"max_tokens": 500, "temperature": 0.3, "num_retries": 5, "repeat_penalty": 1.2, "top_p": 0.9})
eval_backend = TextBackend(model="ollama/llama3:70b", params={"max_tokens": 500, "temperature": 0.3, "num_retries": 5, "repeat_penalty": 1.2, "top_p": 0.9})#, "response_format": {"type": "json_object"}})

backend = TextBackend(model="ollama/mistral:7b-instruct-v0.3-q5_K_M",  api_base="http://localhost:11435", params={"max_tokens": 500, "temperature": 0.3, "num_retries": 5, "repeat_penalty": 1.2, "top_p": 0.9})#, "response_format": {"type": "json_object"}})

dspy.settings.configure(context=eval_backend)
dspy.settings.configure(backend=backend)


In [6]:
from dspy.datasets import DataLoader
def load_and_sample_dataset(number_of_samples=200):
    """Load and sample the dataset from HuggingFace."""
    dl = DataLoader()
    testset = dl.from_huggingface(
        dataset_name="gretelai/synthetic_text_to_sql",
        fields=("sql_prompt", "sql_context", "sql"),
        input_keys=("sql_prompt", "sql_context"),
        split="test"
    )
    return dl.sample(dataset=testset, n=number_of_samples)

def debug_testset(dataset):
    """For testing purposes, return 5 samples from each set."""
    train_size = 2
    val_size = 2
    test_size = 2

    trainset = dataset[:train_size]
    valset = dataset[train_size:train_size + val_size]
    testset = dataset[train_size + val_size:train_size + val_size + test_size]
    
    return trainset, valset, testset
    
testset = load_and_sample_dataset(6)
trainset, valset, testset = debug_testset(testset)

In [7]:
class SQLMatch(dspy.Signature):
    """Signature for matching SQL queries."""
    sql_reference = dspy.InputField(desc="Reference SQL query")
    sql_predicted = dspy.InputField(desc="Predicted SQL query")
    match = dspy.OutputField(desc="Indicate whether the reference and predicted SQL query match", prefix="Yes/No:")

match_instruction = """
Given a reference SQL query and a predicted SQL query, determine if the predicted SQL query matches the reference SQL query. Output only 'Yes' if it matches, otherwise output only 'No'.
"""
SQLMatch = SQLMatch.with_instructions(match_instruction)

def match_metric(example, pred, trace=None):
    """Evaluate if the predicted SQL query matches the reference SQL query."""
    sql_reference, sql_predicted = example.sql, pred.sql
    match = dspy.Predict(SQLMatch)
    print("match_metric: ", sql_reference, sql_predicted)
    # with dspy.context(lm=evaluator_lm_backend):
    with dspy.context(lm=eval_backend):
        print("Context")
        is_match = match(sql_reference=sql_reference, sql_predicted=sql_predicted)
        print("is_match: ", is_match)
    match_output = is_match.match.strip()
    match_score = int(re.search(r'\bYes\b', match_output, re.IGNORECASE) is not None)
    return match_score


In [8]:
class TextToSql(dspy.Signature):
    """Signature for Text to SQL generation task."""
    sql_prompt = dspy.InputField(desc="Natural language query")
    sql_context = dspy.InputField(desc="Context for the query")
    sql = dspy.OutputField(desc="SQL query")

In [9]:

# class TextToSqlProgram(dspy.Module):
#     """A module that represents the program for generating SQL from natural language."""
#     def __init__(self):
#         super().__init__()
#         self.program = dspy.ChainOfThought(signature=TextToSql)

#     def forward(self, sql_prompt, sql_context):
#         return self.program(sql_prompt=sql_prompt, sql_context=sql_context)

In [10]:
generate_sql_query = dspy.Predict(signature=TextToSql)

In [11]:
match_evaluate = Evaluate(devset=trainset, metric=match_metric, num_threads=16, display_progress=True, display_table=0, return_all_scores=True, return_outputs=True)

In [12]:
match_score, match_result = match_evaluate(generate_sql_query)

Model Kwargs: Model Kwargs:   {'temperature': 0.3, 'max_tokens': 500, 'top_p': 0.9, 'frequency_penalty': 0, 'num_retries': 5, 'repeat_penalty': 1.2, 'messages': [{'role': 'user', 'content': "Signature for Text to SQL generation task.\n\n---\n\nFollow the following format.\n\nSql Prompt: Natural language query\n\nSql Context: Context for the query\n\nSql: SQL query\n\n---\n\nSql Prompt: What is the average age of children in the refugee_support program who have been relocated to France?\n\nSql Context: CREATE TABLE refugee_support (child_id INT, name VARCHAR(50), age INT, gender VARCHAR(10), country VARCHAR(50)); INSERT INTO refugee_support (child_id, name, age, gender, country) VALUES (1, 'John Doe', 12, 'Male', 'Syria'), (2, 'Jane Doe', 15, 'Female', 'Afghanistan');\n\nSql:"}]}{'temperature': 0.3, 'max_tokens': 500, 'top_p': 0.9, 'frequency_penalty': 0, 'num_retries': 5, 'repeat_penalty': 1.2, 'messages': [{'role': 'user', 'content': "Signature for Text to SQL generation task.\n\n---\

  0%|          | 0/2 [00:00<?, ?it/s]






Request to litellm:Request to litellm:

litellm.completion(model='ollama/mistral:7b-instruct-v0.3-q5_K_M', api_key=None, api_base='http://localhost:11435', temperature=0.3, max_tokens=500, top_p=0.9, frequency_penalty=0, num_retries=5, repeat_penalty=1.2, messages=[{'role': 'user', 'content': "Signature for Text to SQL generation task.\n\n---\n\nFollow the following format.\n\nSql Prompt: Natural language query\n\nSql Context: Context for the query\n\nSql: SQL query\n\n---\n\nSql Prompt: Determine the number of circular economy initiatives in the Americas that are more than 5 years old.\n\nSql Context: CREATE TABLE CircularEconomyAmericas (id INT, country VARCHAR(50), region VARCHAR(50), initiative_age INT); INSERT INTO CircularEconomyAmericas (id, country, region, initiative_age) VALUES (1, 'USA', 'Americas', 7), (2, 'Canada', 'Americas', 3), (3, 'Brazil', 'Americas', 6);\n\nSql:"}])litellm.completion(model='ollama/mistral:7b-instruct-v0.3-q5_K_M', api_key=None, api_base='http://









SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: FalseFinal returned optional params: {'num_predict': 500, 'temperature': 0.3, 'top_p': 0.9, 'repeat_penalty': 1.2}



POST Request Sent from LiteLLM:
curl -X POST \
http://localhost:11435/api/generate \
-d '{'model': 'mistral:7b-instruct-v0.3-q5_K_M', 'prompt': "### User:\nSignature for Text to SQL generation task.\n\n---\n\nFollow the following format.\n\nSql Prompt: Natural language query\n\nSql Context: Context for the query\n\nSql: SQL query\n\n---\n\nSql Prompt: Determine the number of circular economy initiatives in the Americas that are more than 5 years old.\n\nSql Context: CREATE TABLE CircularEconomyAmericas (id INT, country VARCHAR(50), region VARCHAR(50), initiative_age INT); INSERT INTO CircularEconomyAmericas (id, country, region, initiative_age) VALUES (1, 'USA', 'Americas', 7), (2, 'Canada', 'Americas', 3), (3, 'Brazil', 'Americas', 6);\n\nSql:\n### Response:", 'options': {'num_predict':



SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'num_predict': 500, 'temperature': 0.3, 'top_p': 0.9, 'repeat_penalty': 1.2}


POST Request Sent from LiteLLM:
curl -X POST \
http://localhost:11435/api/generate \
-d '{'model': 'mistral:7b-instruct-v0.3-q5_K_M', 'prompt': "### User:\nGiven a reference SQL query and a predicted SQL query, determine if the predicted SQL query matches the reference SQL query. Output only 'Yes' if it matches, otherwise output only 'No'.\n\n---\n\nFollow the following format.\n\nSql Reference: Reference SQL query\n\nSql Predicted: Predicted SQL query\n\nYes/No: Indicate whether the reference and predicted SQL query match\n\n---\n\nSql Reference: SELECT COUNT(*) FROM CircularEconomyAmericas WHERE initiative_age > 5 AND region = 'Americas';\n\nSql Predicted: Sql: ```sql SELECT COUNT(*) FROM CircularEconomyAmericas WHERE initiative_age > 5; ```\n\nYes/No:\n### Response:", 'options': {'num_



SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'num_predict': 500, 'temperature': 0.3, 'top_p': 0.9, 'repeat_penalty': 1.2}


POST Request Sent from LiteLLM:
curl -X POST \
http://localhost:11435/api/generate \
-d '{'model': 'mistral:7b-instruct-v0.3-q5_K_M', 'prompt': "### User:\nGiven a reference SQL query and a predicted SQL query, determine if the predicted SQL query matches the reference SQL query. Output only 'Yes' if it matches, otherwise output only 'No'.\n\n---\n\nFollow the following format.\n\nSql Reference: Reference SQL query\n\nSql Predicted: Predicted SQL query\n\nYes/No: Indicate whether the reference and predicted SQL query match\n\n---\n\nSql Reference: SELECT AVG(age) FROM refugee_support WHERE country = 'France';\n\nSql Predicted: Sql: ```sql SELECT AVG(age) FROM refugee_support WHERE country = 'France'; ```\n\nYes/No:\n### Response:", 'options': {'num_predict': 500, 'temperature': 0.3, 'top_

Average Metric: 1 / 1  (100.0):  50%|█████     | 1/2 [00:32<00:32, 32.80s/it]

Looking up model=ollama/mistral:7b-instruct-v0.3-q5_K_M in model_cost_map
Model=mistral:7b-instruct-v0.3-q5_K_M for LLM Provider=ollama not found in completion cost map.
Cached Response:  ModelResponse(id='chatcmpl-c3f6b9fa-96ea-46fe-b846-3a9dd753735d', choices=[Choices(finish_reason='stop', index=0, message=Message(content=' Yes', role='assistant'))], created=1719299700, model='ollama/mistral:7b-instruct-v0.3-q5_K_M', object='chat.completion', system_fingerprint=None, usage=Usage(prompt_tokens=59, completion_tokens=2, total_tokens=61))
is_match:  Prediction(
    demos=[],
    sql_reference="SELECT AVG(age) FROM refugee_support WHERE country = 'France';",
    sql_predicted="Sql:\n```sql\nSELECT AVG(age) FROM refugee_support WHERE country = 'France';\n```",
    match='Yes'
)


Average Metric: 2 / 2  (100.0): 100%|██████████| 2/2 [00:35<00:00, 17.96s/it]


2024-06-25 03:15:00,364 - dspy.evaluate.evaluate - INFO - 2024-06-25T07:15:00.364000Z [info     ] Average Metric: 2 / 2 (100.0%) [dspy.evaluate.evaluate] filename=evaluate.py lineno=200
