In [None]:
import logging
import sys

In [None]:
# Name under which this notebook's stdout handler is registered, so that a
# re-run of this cell can find and replace the handler it installed earlier.
STDOUT_HANDLER_NAME = "notebook-stdout-handler"


def install_stdout_handler(logger, formatter, level=logging.INFO):
    """Attach a single stdout ``StreamHandler`` to ``logger`` and return it.

    Any handler previously installed by this function (identified by
    ``STDOUT_HANDLER_NAME``) is removed first. Without that step every re-run
    of the cell would stack one more handler on the logger and each record
    would be printed once per run.

    Args:
        logger: The ``logging.Logger`` to attach the handler to.
        formatter: The ``logging.Formatter`` applied to the new handler.
        level: Minimum level the handler emits (defaults to ``logging.INFO``).

    Returns:
        The newly attached ``logging.StreamHandler``.
    """
    # Iterate over a copy: removeHandler mutates logger.handlers.
    for existing in list(logger.handlers):
        if existing.get_name() == STDOUT_HANDLER_NAME:
            logger.removeHandler(existing)
            existing.close()

    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.set_name(STDOUT_HANDLER_NAME)
    stdout_handler.setLevel(level)
    stdout_handler.setFormatter(formatter)
    logger.addHandler(stdout_handler)
    return stdout_handler


# The root logger lets everything from DEBUG up through; the handler below
# narrows what is actually written to stdout to INFO and above.
root = logging.getLogger()
root.setLevel(logging.DEBUG)

formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
handler = install_stdout_handler(root, formatter)

In [None]:
import os

# LITELLM_LOG is placed in the environment before litellm is imported, so the
# value is already present when the package is loaded.
os.environ["LITELLM_LOG"] = "DEBUG"

import litellm

# Also flip litellm's module-level verbose flag.
# NOTE(review): presumably the source of the "Request to litellm:" dumps in the
# cell outputs further down — confirm against the installed litellm version.
litellm.set_verbose = True

In [None]:
# %load_ext autoreload
# %autoreload 2
# import sys; sys.path.append('/future/u/okhattab/repos/public/stanfordnlp/dspy')

import dspy
from dspy.evaluate import Evaluate
from dspy.datasets.gsm8k import GSM8K, gsm8k_metric
from dspy.teleprompt import BootstrapFewShotWithRandomSearch
from dspy.modeling import JSONBackend
from dspy.modeling import TextBackend
from dspy.modeling import ChatBackend

In [5]:
import phoenix as px

from openinference.semconv.resource import ResourceAttributes
from openinference.instrumentation.dspy import DSPyInstrumentor
# from clank.so.openinference.semconv.resource import ResourceAttributes
# from clank.so-openinference.instrumentation.dspy import DSPyInstrumentor
from opentelemetry import trace as trace_api
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk import trace as trace_sdk
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from openinference.semconv.trace import SpanAttributes

# OTLP/HTTP trace endpoint on localhost.
# NOTE(review): presumably a locally running Phoenix collector — confirm.
endpoint = "http://127.0.0.1:6006/v1/traces"

# Resource attached to the tracer provider; it carries the project name.
# resource = Resource(attributes={})
resource = Resource(attributes={ResourceAttributes.PROJECT_NAME: 'Span-test'})

# Wire provider -> exporter -> span processor, then register the provider
# globally through the OpenTelemetry trace API.
tracer_provider = trace_sdk.TracerProvider(resource=resource)
span_otlp_exporter = OTLPSpanExporter(endpoint=endpoint)
span_processor = SimpleSpanProcessor(span_exporter=span_otlp_exporter)
tracer_provider.add_span_processor(span_processor)
trace_api.set_tracer_provider(tracer_provider=tracer_provider)

# Turn on the DSPy instrumentation.
DSPyInstrumentor().instrument()

In [6]:
# Instantiate the GSM8K dataset; its train/dev splits are sliced at the bottom
# of this cell.
gsm8k = GSM8K()

# Active configuration: text backend on the ollama llama3:70b model.
backend = TextBackend(
    model="ollama/llama3:70b",
    params={
        "max_tokens": 500,
        "temperature": 0.3,
        "num_retries": 5,
        "repeat_penalty": 1.2,
        "top_p": 0.9,
    },
)

# --- Alternative backend configurations, kept for reference -----------------
# backend = JSONBackend(model="ollama/codegemma:7b-code-q5_K_M", api_base="http://localhost:11435", params={"max_tokens": 500, "temperature": 0.1, "num_retries": 5})
#
# backend = TextBackend(model="ollama/llama3:70b", api_base="http://localhost:4000", params={"max_tokens": 500, "temperature": 0.3, "num_retries": 5})
#
# backend = ChatBackend(model="ollama/llama3:70b", api_base="http://localhost:11434", params={"max_tokens": 500, "temperature": 0.1, "num_retries": 5})
#
# backend = JSONBackend(
#     model="ollama/llama3:70b",
#     params={
#         "max_tokens": 500,
#         "temperature": 0.1,
#         "num_retries": 5,
#         "response_format": {
#             "type": "object",  # Added the "type" key here
#             "properties": {
#                 "question": {"title": "Question", "type": "string"},
#                 "rationale": {"title": "Rationale", "type": "string"},
#                 "answer": {"title": "Answer", "type": "string"}
#             },
#             "required": ["question", "rationale", "answer"]
#         }
#     }
# )
# -----------------------------------------------------------------------------

# Register the chosen backend in the global DSPy settings.
dspy.settings.configure(backend=backend)

# Work on the first 10 examples of each split.
trainset = gsm8k.train[:10]
devset = gsm8k.dev[:10]

100%|██████████| 7473/7473 [00:00<00:00, 31149.01it/s]
100%|██████████| 1319/1319 [00:00<00:00, 32601.55it/s]


In [7]:
# import dspy
# from dspy.datasets.gsm8k import GSM8K, gsm8k_metric

# # Set up the LM.
# # turbo = dspy.OpenAI(model="llama3:70b", api_key="sk-1234", api_base="http://localhost:4001/",model_type='chat')
# turbo = dspy.OpenAI(model='gpt-3.5-turbo', api_key="sk-1234", max_tokens=1000, api_base='http://localhost:4000', model_type="chat")
# dspy.settings.configure(lm=turbo)

# # Load math questions from the GSM8K dataset.
# gsm8k = GSM8K()
# gsm8k_trainset, gsm8k_devset = gsm8k.train[:10], gsm8k.dev[:10]

In [8]:
# Thread count handed to the evaluator built below.
NUM_THREADS = 16

# Evaluator over a copy of the dev set, scored with the GSM8K metric.
evaluate = Evaluate(
    devset=devset[:],
    metric=gsm8k_metric,
    num_threads=NUM_THREADS,
    display_progress=True,
    display_table=0,
)

In [9]:
class CoT(dspy.Module):
    """DSPy module wrapping one ``ChainOfThought`` predictor for ``question -> answer``."""

    def __init__(self):
        super().__init__()
        # The predictor is stored on the attribute `prog`.
        self.prog = dspy.ChainOfThought("question -> answer")

    def forward(self, question):
        """Call the wrapped predictor with ``question`` and return its result as-is."""
        prediction = self.prog(question=question)
        return prediction

In [10]:
# True  -> compile a fresh program with the teleprompter.
# False -> build a bare CoT and load a previously saved state from disk.
RUN_FROM_SCRATCH = True

if not RUN_FROM_SCRATCH:
    cot_bs = CoT()
    # The file must already exist: the matching save call below is commented out.
    cot_bs.load('turbo_8_8_10_gsm8k_200_300.json')
else:
    config = {
        "max_bootstrapped_demos": 4,
        "max_labeled_demos": 4,
        "num_threads": NUM_THREADS,
    }
    teleprompter = BootstrapFewShotWithRandomSearch(metric=gsm8k_metric, **config)
    cot_bs = teleprompter.compile(CoT(), trainset=trainset, valset=devset)
    # cot_bs.save('turbo_8_8_10_gsm8k_200_300.json')

2024-06-25 01:09:51,069 - dspy.teleprompt.random_search - INFO - [2m2024-06-25T05:09:51.069167Z[0m [[32m[1minfo     [0m] [1mGoing to sample between       [0m [[0m[1m[34mdspy.teleprompt.random_search[0m][0m [36mfilename[0m=[35mrandom_search.py[0m [36mlineno[0m=[35m58[0m [36mpositional_args[0m=[35m(1, 'and', 4, 'traces per predictor.')[0m
2024-06-25 01:09:51,070 - dspy.teleprompt.random_search - INFO - [2m2024-06-25T05:09:51.070170Z[0m [[32m[1minfo     [0m] [1mWill attempt to train         [0m [[0m[1m[34mdspy.teleprompt.random_search[0m][0m [36mfilename[0m=[35mrandom_search.py[0m [36mlineno[0m=[35m61[0m [36mpositional_args[0m=[35m(16, 'candidate sets.')[0m


  0%|          | 0/10 [00:00<?, ?it/s]








Request to litellm:Request to litellm:
Request to litellm:

Request to litellm:
litellm.completion(model='ollama/llama3:70b', api_key=None, api_base=None, temperature=0.3, max_tokens=500, top_p=1, frequency_penalty=0, num_retries=5, messages=[{'role': 'user', 'content': "Given the fields `question`, produce the fields `answer`.\n\n---\n\nFollow the following format.\n\nQuestion: ${question}\n\nReasoning: Let's think step by step in order to ${produce the answer}. We ...\n\nAnswer: ${answer}\n\n---\n\nQuestion: Martha's cat catches 3 rats and 7 birds. Cara's cat catches 3 less than five times as many animals as Martha's cat. How many animals does Cara's cat catch?\n\nReasoning: Let's think step by step in order to"}])litellm.completion(model='ollama/llama3:70b', api_key=None, api_base=None, temperature=0.3, max_tokens=500, top_p=1, frequency_penalty=0, num_retries=5, messages=[{'role': 'user', 'content': "Given the fields `question`, produce the fields `answer`.\n\n---\n\nFollow 













SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False2024-06-25 01:09:51,201 - dspy.evaluate.evaluate - ERROR - [2m2024-06-25T05:09:51.200633Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Generation failed, recursively attempts to complete did not succeed.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m180[0m




Average Metric: 0.0 / 1  (0.0):   0%|          | 0/10 [00:00<?, ?it/s]

SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: FalseFinal returned optional params: {'num_predict': 500, 'temperature': 0.3, 'top_p': 1, 'repeat_penalty': 0}Final returned optional params: {'num_predict': 500, 'temperature': 0.3, 'top_p': 1, 'repeat_penalty': 0}2024-06-25 01:09:51,214 - dspy.evaluate.evaluate - ERROR - [2m2024-06-25T05:09:51.213980Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Generation failed, recursively attempts to complete did not succeed.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m180[0m
Final returned optional params: {'num_predict': 500, 'temperature': 0.3, 'top_p': 1, 'repeat_penalty': 0}2024-06-25 01:09:51,214 - dspy.evaluate.evaluate - ERROR - [2m2024-06-25T05:09:51.214955Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Generation failed, recursively attempts to complete did not succeed.[0m [[0m[1m[34mdspy.eval

Average Metric: 0.0 / 1  (0.0):  10%|█         | 1/10 [00:00<00:01,  8.40it/s]


2024-06-25 01:09:51,216 - dspy.evaluate.evaluate - ERROR - [2m2024-06-25T05:09:51.216523Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Generation failed, recursively attempts to complete did not succeed.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m180[0m


POST Request Sent from LiteLLM:
curl -X POST \
http://localhost:11434/api/generate \
-d '{'model': 'llama3:70b', 'prompt': "Given the fields `question`, produce the fields `answer`.\n\n---\n\nFollow the following format.\n\nQuestion: ${question}\n\nReasoning: Let's think step by step in order to ${produce the answer}. We ...\n\nAnswer: ${answer}\n\n---\n\nQuestion: Martha's cat catches 3 rats and 7 birds. Cara's cat catches 3 less than five times as many animals as Martha's cat. How many animals does Cara's cat catch?\n\nReasoning: Let's think step by step in order to", 'options': {'num_predict': 500, 'temperature': 0.3, 'top_p': 1, 'repeat_pena

Average Metric: 0.0 / 2  (0.0):  10%|█         | 1/10 [00:00<00:01,  8.40it/s]






POST Request Sent from LiteLLM:
curl -X POST \
http://localhost:11434/api/generate \
-d '{'model': 'llama3:70b', 'prompt': "Given the fields `question`, produce the fields `answer`.\n\n---\n\nFollow the following format.\n\nQuestion: ${question}\n\nReasoning: Let's think step by step in order to ${produce the answer}. We ...\n\nAnswer: ${answer}\n\n---\n\nQuestion: Trey is raising money for a new bike that costs $112. He plans to spend the next two weeks selling bracelets for $1 each. On average, how many bracelets does he need to sell each day?\n\nReasoning: Let's think step by step in order to", 'options': {'num_predict': 500, 'temperature': 0.3, 'top_p': 1, 'repeat_penalty': 0}, 'stream': False}'



Average Metric: 0.0 / 3  (0.0):  20%|██        | 2/10 [00:00<00:00,  8.40it/s]






Average Metric: 0.0 / 4  (0.0):  30%|███       | 3/10 [00:00<00:00,  8.40it/s]




Average Metric: 0.0 / 6  (0.0):  50%|█████     | 5/10 [00:00<00:00,  8.40it/s]

Looking up model=ollama/llama3:70b in model_cost_map
Success: model=ollama/llama3:70b in model_cost_map
prompt_tokens=77; completion_tokens=1
Returned custom cost for model=ollama/llama3:70b - prompt_tokens_cost_usd_dollar: 0.0, completion_tokens_cost_usd_dollar: 0.0
final cost: 0.0; prompt_tokens_cost_usd_dollar: 0.0; completion_tokens_cost_usd_dollar: 0.0
2024-06-25 01:09:53,623 - dspy.evaluate.evaluate - ERROR - [2m2024-06-25T05:09:53.622654Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Generation failed, recursively attempts to complete did not succeed.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m180[0m


Average Metric: 0.0 / 7  (0.0):  70%|███████   | 7/10 [00:02<00:01,  2.71it/s]

Looking up model=ollama/llama3:70b in model_cost_map
Success: model=ollama/llama3:70b in model_cost_map
prompt_tokens=56; completion_tokens=45
Returned custom cost for model=ollama/llama3:70b - prompt_tokens_cost_usd_dollar: 0.0, completion_tokens_cost_usd_dollar: 0.0
final cost: 0.0; prompt_tokens_cost_usd_dollar: 0.0; completion_tokens_cost_usd_dollar: 0.0
2024-06-25 01:09:56,016 - dspy.evaluate.evaluate - ERROR - [2m2024-06-25T05:09:56.016981Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Generation failed, recursively attempts to complete did not succeed.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m180[0m


Average Metric: 0.0 / 8  (0.0):  80%|████████  | 8/10 [00:04<00:01,  1.38it/s]

Looking up model=ollama/llama3:70b in model_cost_map
Success: model=ollama/llama3:70b in model_cost_map
prompt_tokens=59; completion_tokens=62
Returned custom cost for model=ollama/llama3:70b - prompt_tokens_cost_usd_dollar: 0.0, completion_tokens_cost_usd_dollar: 0.0
final cost: 0.0; prompt_tokens_cost_usd_dollar: 0.0; completion_tokens_cost_usd_dollar: 0.0
2024-06-25 01:09:59,322 - dspy.evaluate.evaluate - ERROR - [2m2024-06-25T05:09:59.322459Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Generation failed, recursively attempts to complete did not succeed.[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m180[0m


Average Metric: 0.0 / 9  (0.0):  90%|█████████ | 9/10 [00:08<00:01,  1.24s/it]

Looking up model=ollama/llama3:70b in model_cost_map
Success: model=ollama/llama3:70b in model_cost_map
prompt_tokens=67; completion_tokens=500
Returned custom cost for model=ollama/llama3:70b - prompt_tokens_cost_usd_dollar: 0.0, completion_tokens_cost_usd_dollar: 0.0
final cost: 0.0; prompt_tokens_cost_usd_dollar: 0.0; completion_tokens_cost_usd_dollar: 0.0


Exception: Generation failed, recursively attempts to complete did not succeed.

In [None]:
# Run the evaluator on the `cot_bs` program over a copy of the dev set.
# The call is left as a bare expression, so its return value is shown as the
# cell's output.
evaluate(cot_bs, devset=devset[:])

In [None]:
# Print the prompt of the last entry in the backend's history for inspection.
latest_entry = backend.history[-1]
print(latest_entry.prompt.to_str())