In [2]:
import mlflow

# Resolve the MLflow tracking server from the environment (same pattern as the
# LLM settings in the next cell) so the notebook is not tied to one machine.
# The fallback keeps the original local server address.
import os

MLFLOW_TRACKING_URI = os.getenv("MLFLOW_TRACKING_URI", "http://localhost:5500")

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
# Every run started from this notebook is recorded under the "rag" experiment.
mlflow.set_experiment("rag")
# Enable MLflow's DSPy autologging with all three optional categories on.
mlflow.dspy.autolog(
    log_compiles=True,    # Track optimization process
    log_evals=True,       # Track evaluation results
    log_traces_from_compile=True  # Track program traces during optimization
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import dspy
import openai
import os

# Connection settings for the OpenAI-compatible LLM endpoint. Each value can be
# overridden through an environment variable; the defaults target a local server.
LLM_URL = os.getenv('LLM_URL', 'http://localhost:8080/v1')
API_KEY = os.getenv('API_KEY', 'fake')
LLM_MODEL = os.getenv('LLM_MODEL', 'openai/models/Llama-3.2-3B-Instruct-Q8_0.gguf')

# os.getenv returns a str whenever the variable is set, so convert explicitly:
# otherwise an overridden value reaches the client as "0.2" / "6000" instead
# of a number. A malformed value fails here with ValueError rather than later.
MAX_TOKENS = int(os.getenv('MAX_TOKENS', '6000'))
TEMPERATURE = float(os.getenv('TEMPERATURE', '0.2'))

dspy.enable_logging()
lm = dspy.LM(model=LLM_MODEL,
             api_base=LLM_URL,  # ensure this points to your port
             api_key=API_KEY,
             temperature=TEMPERATURE,
             # MAX_TOKENS was previously defined but never handed to the client.
             max_tokens=MAX_TOKENS,
             model_type='chat',
             stream=False)
# Register `lm` as the default language model for every DSPy module below.
dspy.configure(lm=lm)
#dspy.settings.configure(track_usage=True)

In [None]:
# Build a basic predictor from an inline signature: one typed input field
# (`question`) and one typed output field (`response`).
qa = dspy.Predict('question: str -> response: str')
# Invoke the predictor; the log output below shows the resulting chat-completion
# request going to the configured endpoint.
response = qa(question="what are high memory and low memory on linux?")

[92m10:37:44 - LiteLLM:INFO[0m: utils.py:2991 - 
LiteLLM completion() model= Llama-3.2-3B-Instruct-Q8_0.gguf; provider = openai
INFO:LiteLLM:
LiteLLM completion() model= Llama-3.2-3B-Instruct-Q8_0.gguf; provider = openai
INFO:httpx:HTTP Request: POST http://localhost:8080/v1/chat/completions "HTTP/1.1 200 OK"
[92m10:37:47 - LiteLLM:INFO[0m: utils.py:1213 - Wrapper: Completed Call, calling success_handler
INFO:LiteLLM:Wrapper: Completed Call, calling success_handler
[92m10:37:47 - LiteLLM:INFO[0m: cost_calculator.py:655 - selected model name for cost calculation: openai/Llama-3.2-3B-Instruct-Q8_0.gguf
INFO:LiteLLM:selected model name for cost calculation: openai/Llama-3.2-3B-Instruct-Q8_0.gguf
[92m10:37:47 - LiteLLM:INFO[0m: cost_calculator.py:655 - selected model name for cost calculation: openai/Llama-3.2-3B-Instruct-Q8_0.gguf
INFO:LiteLLM:selected model name for cost calculation: openai/Llama-3.2-3B-Instruct-Q8_0.gguf
[92m10:37:47 - LiteLLM:INFO[0m: cost_calculator.py:655 -

[92m10:40:01 - LiteLLM:INFO[0m: cost_calculator.py:655 - selected model name for cost calculation: openai/Llama-3.2-3B-Instruct-Q8_0.gguf
INFO:LiteLLM:selected model name for cost calculation: openai/Llama-3.2-3B-Instruct-Q8_0.gguf
[92m10:40:01 - LiteLLM:INFO[0m: cost_calculator.py:655 - selected model name for cost calculation: Llama-3.2-3B-Instruct-Q8_0.gguf
INFO:LiteLLM:selected model name for cost calculation: Llama-3.2-3B-Instruct-Q8_0.gguf
[92m10:40:01 - LiteLLM:INFO[0m: cost_calculator.py:655 - selected model name for cost calculation: openai/Llama-3.2-3B-Instruct-Q8_0.gguf
INFO:LiteLLM:selected model name for cost calculation: openai/Llama-3.2-3B-Instruct-Q8_0.gguf
[92m10:40:01 - LiteLLM:INFO[0m: cost_calculator.py:655 - selected model name for cost calculation: Llama-3.2-3B-Instruct-Q8_0.gguf
INFO:LiteLLM:selected model name for cost calculation: Llama-3.2-3B-Instruct-Q8_0.gguf
[92m10:42:04 - LiteLLM:INFO[0m: cost_calculator.py:655 - selected model name for cost calc

In [9]:
# Print the most recent LM interaction (n=1): system message, user message and response.
dspy.inspect_history(n=1)





[34m[2025-06-03T10:37:47.458714][0m

[31mSystem message:[0m

Your input fields are:
1. `question` (str)
Your output fields are:
1. `response` (str)
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## question ## ]]
{question}

[[ ## response ## ]]
{response}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        Given the fields `question`, produce the fields `response`.


[31mUser message:[0m

[[ ## question ## ]]
what are high memory and low memory on linux?

Respond with the corresponding output fields, starting with the field `[[ ## response ## ]]`, and then ending with the marker for `[[ ## completed ## ]]`.


[31mResponse:[0m

[32m[[ ## response ## ]]
High memory and low memory are two terms used to describe the amount of free memory available on a Linux system. High memory refers to a system with a significant amount of free memory, typically above 50% of the total system memory. This all

In [4]:
# Same question -> response task, but via ChainOfThought; the Prediction shown
# below carries a `reasoning` field alongside `response`.
cot = dspy.ChainOfThought('question -> response')
# The bare call is the cell's last expression, so the Prediction is displayed.
cot(question="what is red hat linux?")

Prediction(
    reasoning='Red Hat Linux is a popular open-source operating system based on the Linux kernel. It is developed and maintained by Red Hat, a company that provides support, services, and subscriptions to users. Red Hat Linux is known for its stability, security, and ease of use, making it a popular choice for servers, desktops, and mobile devices.',
    response='Red Hat Linux is a variant of the Linux operating system that is designed to be highly stable, secure, and easy to use. It is available in several different editions, including Red Hat Enterprise Linux (RHEL), Red Hat Enterprise Linux Server (RHES), and Red Hat Enterprise Linux Workstation (RHELW). Red Hat Linux is widely used in enterprise environments, as well as by individuals and organizations of all sizes.'
)

In [5]:
import ujson
from dspy.utils import download

# Download question--answer pairs from the RAG-QA Arena "Tech" dataset.
download("https://huggingface.co/dspy/cache/resolve/main/ragqa_arena_tech_examples.jsonl")

# The file is JSON Lines: one JSON object per line.
# - Read it as UTF-8 explicitly; the platform default encoding is not
#   guaranteed to be UTF-8 and the answers contain free-form text.
# - Skip blank lines (e.g. a trailing empty line), which ujson.loads rejects.
with open("ragqa_arena_tech_examples.jsonl", encoding="utf-8") as f:
    data = [ujson.loads(line) for line in f if line.strip()]

In [6]:
# Inspect one datapoint: the record shown below has the keys
# `question`, `response` and `gold_doc_ids`.
data[0]

{'question': 'why igp is used in mpls?',
 'response': "An IGP exchanges routing prefixes between gateways/routers.  \nWithout a routing protocol, you'd have to configure each route on every router and you'd have no dynamic updates when routes change because of link failures. \nFuthermore, within an MPLS network, an IGP is vital for advertising the internal topology and ensuring connectivity for MP-BGP inside the network.",
 'gold_doc_ids': [2822, 2823]}

In [7]:
# Wrap every raw record in a dspy.Example and mark `question` as its only
# input field; the remaining keys stay on the example as labels/metadata.
data = list(map(lambda record: dspy.Example(**record).with_inputs('question'), data))

# Keep one sample around for the cells below and display it.
example = data[2]
example

Example({'question': 'why are my text messages coming up as maybe?', 'response': 'This is part of the Proactivity features new with iOS 9: It looks at info in emails to see if anyone with this number sent you an email and if it finds the phone number associated with a contact from your email, it will show you "Maybe". \n\nHowever, it has been suggested there is a bug in iOS 11.2 that can result in "Maybe" being displayed even when "Find Contacts in Other Apps" is disabled.', 'gold_doc_ids': [3956, 3957, 8034]}) (input_keys={'question'})

In [8]:
import random

# Shuffle `data` in place with a fixed seed (0).
# NOTE(review): because the shuffle mutates `data`, re-running this cell without
# re-running the cells above reshuffles already-shuffled data and changes the splits.
random.Random(0).shuffle(data)

# Three disjoint slices: 50 train, 100 dev, 300 test examples.
trainset = data[:50]
devset = data[50:150]
testset = data[150:450]

len(trainset), len(devset), len(testset)

(50, 100, 300)

In [9]:
from dspy.evaluate import SemanticF1

# Instantiate the metric (SemanticF1 in its decompositional mode).
metric = SemanticF1(decompositional=True)

# Produce a prediction from our `cot` module, using the `example` above as input.
# `example.inputs()` keeps only the fields declared via with_inputs, i.e. `question`.
pred = cot(**example.inputs())

# Show the raw example and prediction objects before scoring.
print(example)
print(pred)

# Compute the metric score for the prediction.
score = metric(example, pred)

# Side-by-side summary of the question, gold answer, model answer and score.
print(f"Question: \t {example.question}\n")
print(f"Gold Response: \t {example.response}\n")
print(f"Predicted Response: \t {pred.response}\n")
print(f"Semantic F1 Score: {score:.2f}")

Example({'question': 'why are my text messages coming up as maybe?', 'response': 'This is part of the Proactivity features new with iOS 9: It looks at info in emails to see if anyone with this number sent you an email and if it finds the phone number associated with a contact from your email, it will show you "Maybe". \n\nHowever, it has been suggested there is a bug in iOS 11.2 that can result in "Maybe" being displayed even when "Find Contacts in Other Apps" is disabled.', 'gold_doc_ids': [3956, 3957, 8034]}) (input_keys={'question'})
Prediction(
    reasoning='Your text messages are being flagged as "maybe" because our system is programmed to flag messages that don\'t contain a clear question or statement. This is a precautionary measure to ensure that we don\'t send unsolicited messages to users. However, it\'s possible that the message you\'re referring to is a legitimate question or statement that was misinterpreted.',
    response='To resolve this issue, you can try rephrasing y

In [10]:
# Print the most recent LM interaction (n=1); at this point that is the metric's
# own judging prompt, as the output below shows.
dspy.inspect_history(n=1)





[34m[2025-06-03T12:00:11.444069][0m

[31mSystem message:[0m

Your input fields are:
1. `question` (str)
2. `ground_truth` (str)
3. `system_response` (str)
Your output fields are:
1. `reasoning` (str)
2. `ground_truth_key_ideas` (str): enumeration of key ideas in the ground truth
3. `system_response_key_ideas` (str): enumeration of key ideas in the system response
4. `discussion` (str): discussion of the overlap between ground truth and system response
5. `recall` (float): fraction (out of 1.0) of ground truth covered by the system response
6. `precision` (float): fraction (out of 1.0) of system response covered by the ground truth
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## question ## ]]
{question}

[[ ## ground_truth ## ]]
{ground_truth}

[[ ## system_response ## ]]
{system_response}

[[ ## reasoning ## ]]
{reasoning}

[[ ## ground_truth_key_ideas ## ]]
{ground_truth_key_ideas}

[[ ## system_response_key_ideas ## ]]


In [12]:
import mlflow

# Evaluate the Chain-of-Thought program on the dev set inside a named MLflow
# run, then log both the aggregate score and a per-example results table.
with mlflow.start_run(run_name="rag_evaluation"):
    evaluate = dspy.Evaluate(
        devset=devset,
        metric=metric,
        num_threads=24,
        display_progress=True,
        # To record the outputs and detailed scores to MLflow
        return_all_scores=True,
        return_outputs=True,
    )

    # Evaluate the program as usual
    aggregated_score, outputs, all_scores = evaluate(cot)

    # Log the aggregated score
    mlflow.log_metric("semantic_f1_score", aggregated_score)

    # Log the detailed evaluation results as a table.
    # With return_outputs=True each entry of `outputs` is an
    # (example, prediction, score) triple, not a bare response, so pull the
    # text fields out explicitly. Building every column from the same triples
    # also keeps the rows aligned by construction.
    # getattr(..., None) covers predictions without a `response` field
    # (e.g. the placeholder recorded for an example that failed).
    mlflow.log_table(
        {
            "Question": [ex.question for ex, _pred, _score in outputs],
            "Gold Response": [ex.response for ex, _pred, _score in outputs],
            "Predicted Response": [
                getattr(prediction, "response", None) for _ex, prediction, _score in outputs
            ],
            "Semantic F1 Score": all_scores,
        },
        artifact_file="eval_results.json",
    )

Average Metric: 15.41 / 34 (45.3%):  34%|███▍      | 34/100 [00:02<00:03, 16.99it/s]



Average Metric: 49.38 / 100 (49.4%): 100%|██████████| 100/100 [00:05<00:00, 18.37it/s]

2025/06/03 12:08:28 INFO dspy.evaluate.evaluate: Average Metric: 49.38247399466397 / 100 (49.4%)



🏃 View run eval at: http://localhost:5500/#/experiments/414799578984116612/runs/e8923ca5fe75402f9ffe2db0d82b668d
🧪 View experiment at: http://localhost:5500/#/experiments/414799578984116612
🏃 View run rag_evaluation at: http://localhost:5500/#/experiments/414799578984116612/runs/e0a6ff97574c44aeb04efdb18da2c037
🧪 View experiment at: http://localhost:5500/#/experiments/414799578984116612


In [None]:
# Alternative without MLflow logging (kept disabled): define a reusable
# evaluator and run it on the Chain-of-Thought program.
#evaluate = dspy.Evaluate(devset=devset, metric=metric, num_threads=24,
#                         display_progress=True, display_table=2)

# Evaluate the Chain-of-Thought program.
#evaluate(cot)