In [4]:
import dspy
from dspy.utils import download

In [None]:
# Uncomment to download the JSONL files this notebook reads.
# NOTE: the `/resolve/` URL form serves the raw file; a `/blob/` URL serves the
# Hugging Face HTML viewer page instead, which is not valid JSONL.
# download("https://huggingface.co/dspy/cache/resolve/main/ragqa_arena_tech_corpus.jsonl")
# download("https://huggingface.co/dspy/cache/resolve/main/ragqa_arena_tech_examples.jsonl")

In [3]:
# Connect DSPy to a local Ollama server and make this LM the default.
# NOTE(review): `options` is never passed to `dspy.LM` in this cell; the value
# actually sent is the `num_ctx` keyword below (8192 * 2).
options = {"num_ctx": 1024}
lm = dspy.LM(
    "ollama_chat/deepseek-r1:7b",
    api_base="http://localhost:11434",
    num_ctx=8192 * 2,
)
dspy.configure(lm=lm)

In [4]:
# Call the LM directly: first with a plain prompt string, then with a chat-style message list.
# Only the second call's return value is displayed, because it is the cell's last expression.
lm("Say this is a test!", temperature=0.7)  # => ['This is a test!']
lm(messages=[{"role": "user", "content": "Say this is a test in Turkish three times."}])  # => list with one completion string; with this model it begins with a '<think>' section

['<think>\nOkay, so I need to figure out how to respond to the user\'s request where they asked me to say "This is a test in Turkish" three times. Hmm, let\'s break this down.\n\nFirst, I should understand what exactly the user is asking for. They want me to repeat that specific sentence in Turkish three times. So my response needs to be straightforward and repetitive as per their instruction.\n\nWait, but why would someone ask that? Maybe they\'re testing if I can do basic language translation or if I\'m paying attention. Alternatively, it could be part of a larger task or assessment. Regardless, the immediate request is clear: repeat the sentence three times in Turkish.\n\nI should make sure each repetition is correct and uses proper Turkish grammar. Let me think about how "This is a test in Turkish" translates into Turkish. The phrase would be "Bu bir Türkçe test." So I need to say that three times accurately.\n\nAlso, considering the user might want it in a specific format or with 

In [6]:
# Call the LM directly: first with a plain prompt string, then with a chat-style message list.
# Only the second call's return value is displayed, because it is the cell's last expression.
lm("Say this is a test!", temperature=0.7)  # => ['This is a test!']
lm(messages=[{"role": "user", "content": "Say this is a test in German three times."}])  # => list with one completion string; with this model it begins with a '<think>' section

['<think>\nOkay, so I need to say "This is a test" in German three times. Hmm, let\'s think about how to approach this.\n\nFirst, I should figure out the correct German phrase for "This is a test." In English, it\'s pretty straightforward: "This is a test." But in German, it might be slightly different because of their sentence structure and word order.\n\nI remember that in German, subjects come before verbs. So maybe instead of saying "This is a test," the German equivalent would start with the subject. Let me think... Oh, right! It\'s often phrased as "Das ist ein Test." That makes sense because "das" means this, "ist" is to be, and "ein Test" means a test.\n\nSo each time I say it, I should repeat "Das ist ein Test." But the user asked for three times. So I just need to write that phrase three separate times in German.\n\nWait, but maybe they want me to structure it differently? Like using different tenses or something else? No, the instruction was straightforward: "Say this is a t

# Basic DSPy Modules

# Math

In [8]:
# Chain-of-thought module whose `answer` output field is declared as a float.
math = dspy.ChainOfThought("question -> answer:float")
output = math(
    question="Two dice are tossed. What is the probability that the sum equals two?Explain to me in detail"
)
print(output)

Prediction(
    reasoning='Each die has six faces (1 to 6). The total number of possible outcomes when tossing both dice is 6 * 6 = 36. Only one outcome (both dice showing 1) results in a sum of two. Therefore, the probability is 1/36 ≈ 0.0278.',
    answer=0.0278
)


In [6]:
# Ask for an untyped answer, then try to parse the returned text as a fraction.
import fractions

math = dspy.ChainOfThought("question -> answer")
output = math(
    question="Two dice are tossed. What is the probability that the sum equals two?"
)

# Text that `Fraction` cannot parse raises ValueError, which is reported
# together with the raw answer instead of stopping the notebook.
try:
    probability = float(fractions.Fraction(output.answer))
except ValueError:
    print("Could not convert output to float:", output.answer)
else:
    print(probability)  # Expected: 0.027777...


Could not convert output to float: The probability that the sum equals two when two dice are tossed is \boxed{\dfrac{1}{36}}.


In [7]:
# Declare both output fields with explicit types; `answer` is requested as a float.
math = dspy.ChainOfThought("question -> reasoning:str, answer:float")
output = math(
    question="Two dice are tossed. What is the probability that the sum equals two? "
)

print(output)  # Should return a structured output


Prediction(
    reasoning="To find the probability that the sum of two dice equals two, we first determine the total number of possible outcomes when tossing two dice, which is \\(6 \\times 6 = 36\\). The only favorable outcome for a sum of two is both dice showing 1. Thus, there's 1 favorable outcome out of 36 possible outcomes.",
    answer=0.0278
)


In [8]:
# Display just the `answer` field of the prediction from the previous cell.
output.answer

0.0278

In [35]:
import ujson
import dspy

In [36]:
# Point DSPy at the local Ollama server and register the LM as the default.
lm = dspy.LM(
    "ollama_chat/deepseek-r1:7b",
    api_base="http://localhost:11434",
    num_ctx=8192 * 2,
)
dspy.configure(lm=lm)

In [37]:
# Single-step predictor with a typed `question` input and `response` output.
qa = dspy.Predict("question: str -> response: str")
response = qa(
    question="what are high memory and low memory on linux?"
)

print(response.response)

High memory refers to processes consuming a significant amount of RAM, while low memory indicates underutilization of system resources.


In [None]:
# Print the latest recorded LM interaction (n=1) so the prompt can be inspected.
dspy.inspect_history(n=1)

In [None]:
# Chain-of-thought module mapping a question to a response; `cot` is reused
# by the metric and evaluation cells further down.
cot = dspy.ChainOfThought("question -> response")
cot(
    question="should curly braces appear on their own line?"
)

## Evaluation

In [40]:
# Load one JSON object per line from the examples file.
# The encoding is stated explicitly so the read does not depend on the
# platform's default text encoding.
with open("ragqa_arena_tech_examples.jsonl", encoding="utf-8") as f:
    data = [ujson.loads(line) for line in f]

In [41]:
# Wrap each raw record in a `dspy.Example` and mark `question` as its input field.
data = [
    dspy.Example(**record).with_inputs("question")
    for record in data
]

# Keep one `example` from the data for the spot checks below.
example = data[2]
example

Example({'question': 'why are my text messages coming up as maybe?', 'response': 'This is part of the Proactivity features new with iOS 9: It looks at info in emails to see if anyone with this number sent you an email and if it finds the phone number associated with a contact from your email, it will show you "Maybe". \n\nHowever, it has been suggested there is a bug in iOS 11.2 that can result in "Maybe" being displayed even when "Find Contacts in Other Apps" is disabled.', 'gold_doc_ids': [3956, 3957, 8034]}) (input_keys={'question'})

In [42]:
import random

# Shuffle `data` in place with a fixed seed, then slice the three splits by position.
random.Random(0).shuffle(data)
trainset = data[:200]
devset = data[200:500]
testset = data[500:1000]

len(trainset), len(devset), len(testset)

(200, 300, 500)

In [43]:
from dspy.evaluate import SemanticF1

# Instantiate the metric.
# NOTE(review): judging by the prompt history recorded in this notebook, the metric
# asks the configured LM for `recall` and `precision` values — confirm in the DSPy docs.
metric = SemanticF1(decompositional=True)

# Produce a prediction from our `cot` module, using the `example` above as input.
pred = cot(**example.inputs())

# Compute the metric score for the prediction.
score = metric(example, pred)

# Print the question, gold response, predicted response and score for a side-by-side check.
print(f"Question: \t {example.question}\n")
print(f"Gold Response: \t {example.response}\n")
print(f"Predicted Response: \t {pred.response}\n")
print(f"Semantic F1 Score: {score:.2f}")

Question: 	 why are my text messages coming up as maybe?

Gold Response: 	 This is part of the Proactivity features new with iOS 9: It looks at info in emails to see if anyone with this number sent you an email and if it finds the phone number associated with a contact from your email, it will show you "Maybe". 

However, it has been suggested there is a bug in iOS 11.2 that can result in "Maybe" being displayed even when "Find Contacts in Other Apps" is disabled.

Predicted Response: 	 Please ensure you're typing correctly and try sending different types of messages to see if the issue persists.

Semantic F1 Score: 0.00


In [44]:
# Print the latest recorded LM interaction (n=1); here that is the metric's judging prompt.
dspy.inspect_history(n=1)





[34m[2025-02-24T14:55:21.433997][0m

[31mSystem message:[0m

Your input fields are:
1. `question` (str)
2. `ground_truth` (str)
3. `system_response` (str)

Your output fields are:
1. `reasoning` (str)
2. `ground_truth_key_ideas` (str): enumeration of key ideas in the ground truth
3. `system_response_key_ideas` (str): enumeration of key ideas in the system response
4. `discussion` (str): discussion of the overlap between ground truth and system response
5. `recall` (float): fraction (out of 1.0) of ground truth covered by the system response
6. `precision` (float): fraction (out of 1.0) of system response covered by the ground truth

All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## question ## ]]
{question}

[[ ## ground_truth ## ]]
{ground_truth}

[[ ## system_response ## ]]
{system_response}

[[ ## reasoning ## ]]
{reasoning}

[[ ## ground_truth_key_ideas ## ]]
{ground_truth_key_ideas}

[[ ## system_response_key_ideas ## ]

In [None]:
# Build an evaluator over the dev split that can be reused for several programs.
evaluate = dspy.Evaluate(
    devset=devset,
    metric=metric,
    num_threads=24,
    display_progress=True,
    display_table=2,
)

# Score the chain-of-thought module with it.
evaluate(cot)

# RAG

## Basic Retrieval-Augmented Generation (RAG)

In [None]:
# %conda install -c rapidsai -c conda-forge -c nvidia libcuvs=24.12 'cuda-version>=12.0,<=12.5'

In [None]:
import ujson
import dspy
from sentence_transformers import SentenceTransformer

In [9]:
# Point DSPy at the local Ollama server and register the LM as the default.
lm = dspy.LM(
    "ollama_chat/deepseek-r1:7b",
    api_base="http://localhost:11434",
    num_ctx=8192 * 2,
)
dspy.configure(lm=lm)

In [18]:
import torch

max_characters = 6000  # for truncating >99th percentile of documents
topk_docs_to_retrieve = 5  # number of documents to retrieve per search query

# Read the corpus (one JSON object per line), keeping only the truncated `text` field.
# The encoding is stated explicitly so the read does not depend on the platform default.
with open("ragqa_arena_tech_corpus.jsonl", encoding="utf-8") as f:
    corpus = [ujson.loads(line)['text'][:max_characters] for line in f]
print(f"Loaded {len(corpus)} documents. Will encode them below.")

# Load an extremely efficient local model for retrieval.
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer("sentence-transformers/static-retrieval-mrl-en-v1", device=device)
embedder = dspy.Embedder(model.encode)

# Retriever over the corpus; `search(query).passages` is what the RAG module reads.
search = dspy.retrievers.Embeddings(embedder=embedder, corpus=corpus, k=topk_docs_to_retrieve)

Loaded 28436 documents. Will encode them below.
Training a 32-byte FAISS index with 337 partitions, based on 28436 x 1024-dim embeddings


In [19]:
class RAG(dspy.Module):
    """Retrieval-augmented question answering.

    Retrieves passages for the question with the notebook-level `search`
    retriever and hands them as `context` to a chain-of-thought responder.
    """

    def __init__(self):
        # Initialise the dspy.Module base state before attaching sub-modules.
        super().__init__()
        self.respond = dspy.ChainOfThought('context, question -> response')

    def forward(self, question):
        """Answer `question` using retrieved passages as context.

        Args:
            question: The question text; it is also used as the retrieval query.

        Returns:
            Whatever `self.respond` returns for the retrieved context and question.
        """
        # `search` is looked up at call time from the notebook namespace.
        context = search(question).passages
        return self.respond(context=context, question=question)

In [16]:
# Instantiate the RAG module and run it on a single question.
rag = RAG()
rag(question="Is Emin a friend of Mert? ")

Prediction(
    reasoning="The given context does not provide any information about Emin and Mert's relationship. There is no mention of either individual, nor any indication of a friendship between them.",
    response='Based on the provided context, there is no evidence that Emin is a friend of Mert.'
)

In [17]:
# Print recorded LM history so the retrieved context inside the prompt can be checked.
dspy.inspect_history()





[34m[2025-02-24T14:24:22.036385][0m

[31mSystem message:[0m

Your input fields are:
1. `context` (str)
2. `question` (str)

Your output fields are:
1. `reasoning` (str)
2. `response` (str)

All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## context ## ]]
{context}

[[ ## question ## ]]
{question}

[[ ## reasoning ## ]]
{reasoning}

[[ ## response ## ]]
{response}

[[ ## completed ## ]]

In adhering to this structure, your objective is: 
        Given the fields `context`, `question`, produce the fields `response`.


[31mUser message:[0m

[[ ## context ## ]]
[1] «Just stand outside the door at some distance talking on your phone. Dont look at the door, dont look at the person coming to open it, dont look like you want to get in. Dont ask to be let in. Dont engage in conversation. Just let the person open the door and go through. Then in the last second before it closes and lock, you calmly walk through still talking on your 

In [20]:
# Instantiate the RAG module and run it on a single question.
rag = RAG()
rag(question="what are high memory and low memory on linux? ")

Prediction(
    reasoning="High Memory (also known as Application Space) is reserved for user applications to prevent interference with the kernel. Low Memory contains the kernel's essential data, which cannot be accessed or swapped out, making it a critical resource for system stability and performance.",
    response="High memory refers to physical RAM allocated exclusively for user applications, ensuring they don't interfere with the kernel. Low memory is where the kernel resides, containing essential data that can't be swapped out or used by applications. Applications should aim to use high memory as much as possible to avoid relying on low memory, which can cause performance issues."
)

In [21]:
# Print recorded LM history so the retrieved context inside the prompt can be checked.
dspy.inspect_history()





[34m[2025-02-24T14:27:13.713408][0m

[31mSystem message:[0m

Your input fields are:
1. `context` (str)
2. `question` (str)

Your output fields are:
1. `reasoning` (str)
2. `response` (str)

All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## context ## ]]
{context}

[[ ## question ## ]]
{question}

[[ ## reasoning ## ]]
{reasoning}

[[ ## response ## ]]
{response}

[[ ## completed ## ]]

In adhering to this structure, your objective is: 
        Given the fields `context`, `question`, produce the fields `response`.


[31mUser message:[0m

[[ ## context ## ]]
[1] «Reading system memory usage in Activity Monitor from support.apple.com gives a detailed explanation about the different types of RAM. Free memory: This is RAM thats not being used. Wired memory: Information in this memory cant be moved to the hard disk, so it must stay in RAM. The amount of Wired memory depends on the applications you are using. Active memory: This 

In [6]:
# Instantiate the RAG module and run it on a single question.
rag = RAG()
rag(question="What is MVC? ")

Prediction(
    reasoning='MVC stands for Model-View-Controller. It is a design pattern that separates an application into three main components: the model, which handles data; the view, which presents data to the user; and the controller, which manages user interactions. This separation of concerns allows for better maintainability and modularity.',
    response='MVC stands for Model-View-Controller. It is a design pattern that separates an application into three main components: the model, which handles data; the view, which presents data to the user; and the controller, which manages user interactions. This separation of concerns allows for better maintainability and modularity.'
)

In [None]:
# Print recorded LM history so the retrieved context inside the prompt can be checked.
dspy.inspect_history()





[34m[2025-02-24T14:22:14.628784][0m

[31mSystem message:[0m

Your input fields are:
1. `context` (str)
2. `question` (str)

Your output fields are:
1. `reasoning` (str)
2. `response` (str)

All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## context ## ]]
{context}

[[ ## question ## ]]
{question}

[[ ## reasoning ## ]]
{reasoning}

[[ ## response ## ]]
{response}

[[ ## completed ## ]]

In adhering to this structure, your objective is: 
        Given the fields `context`, `question`, produce the fields `response`.


[31mUser message:[0m

[[ ## context ## ]]
[1] «Dont let them know not to put that in, we need to be able to easily weed out the companies we dont want to work for! To me fast-paced environment = lots of unpaid overtime.»
[2] «Boss-speak is always tough... Zero is a number so you can do things with it. Null is a unicorn. It doesnt exist so you cant do anything at all with it.»
[3] «http://en.wikipedia.org/wiki/De

In [25]:
# Load one JSON object per line from the examples file.
# The encoding is stated explicitly so the read does not depend on the
# platform's default text encoding.
with open("ragqa_arena_tech_examples.jsonl", encoding="utf-8") as f:
    data = [ujson.loads(line) for line in f]

In [26]:
# Wrap each raw record in a `dspy.Example` and mark `question` as its input field.
data = [
    dspy.Example(**record).with_inputs("question")
    for record in data
]

# Keep one `example` from the data for later spot checks.
example = data[2]
example

Example({'question': 'why are my text messages coming up as maybe?', 'response': 'This is part of the Proactivity features new with iOS 9: It looks at info in emails to see if anyone with this number sent you an email and if it finds the phone number associated with a contact from your email, it will show you "Maybe". \n\nHowever, it has been suggested there is a bug in iOS 11.2 that can result in "Maybe" being displayed even when "Find Contacts in Other Apps" is disabled.', 'gold_doc_ids': [3956, 3957, 8034]}) (input_keys={'question'})

In [27]:
import random

# Shuffle `data` in place with a fixed seed, then slice the three splits by position.
random.Random(0).shuffle(data)
trainset = data[:200]
devset = data[200:500]
testset = data[500:1000]

len(trainset), len(devset), len(testset)

(200, 300, 500)

In [30]:
from dspy.evaluate import SemanticF1

# Instantiate the metric that the evaluator and optimizer cells below receive as `metric`.
metric = SemanticF1(decompositional=True)

In [None]:
# Define an evaluator that we can re-use.
evaluate = dspy.Evaluate(devset=devset, metric=metric, num_threads=24,
                         display_progress=True, display_table=2)

# Evaluate a freshly constructed RAG program on the dev split.
evaluate(RAG())

## RAG with ColBERTv2 (Wikipedia Searcher Agent)

In [None]:
def search_wikipedia(query: str) -> list[str]:
    """Return the `text` of the top three ColBERTv2 results for `query`.

    The URL has to be a ColBERTv2 retrieval endpoint that answers search
    requests; a link to a downloadable archive page cannot serve queries.
    """
    results = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')(query, k=3)
    return [x['text'] for x in results]

# Chain-of-thought responder that answers from an explicitly supplied context.
rag = dspy.ChainOfThought('context, question -> response')

question = "What's the name of the castle that David Gregory inherited?"
rag(context=search_wikipedia(question), question=question)

In [13]:
import wikipediaapi

def search_wikipedia(query: str, max_chars: int = 10000) -> str:
    """Fetch the text of the English Wikipedia page named by `query`.

    Args:
        query: Passed to `Wikipedia.page(...)` as the page to look up.
        max_chars: Maximum number of characters of page text to return.
            Defaults to 10000, the limit this function always used.

    Returns:
        The first `max_chars` characters of the page text, or a fixed
        "not found" message when the page does not exist.
    """
    wiki_wiki = wikipediaapi.Wikipedia(
        language='en',
        user_agent="LLM-Assistant/1.0 (ecagatay93@email.com)"
    )
    page = wiki_wiki.page(query)

    if not page.exists():
        return "No relevant Wikipedia page found."

    return page.text[:max_chars]  # Limit content for processing

# Example usage
# question = "For which artist did Kendrick Lamar write the diss track 'Not Like Us'?"
context = search_wikipedia("2024 United States presidential election")

10000


In [15]:
# Chain-of-thought responder that answers from the Wikipedia text in `context`.
rag = dspy.ChainOfThought("context, question -> response")

question = "What is the name of running mate of Donald Trump in 2024?"
rag(
    context=context,
    question=question,
)

Prediction(
    reasoning="The context mentions that Donald Trump ran for re-election in 2024 and was nominated along with his running mate, JD Vance. The article does not mention any other candidates as Trump's running mate.",
    response='JD Vance'
)

In [16]:
# Chain-of-thought responder that answers from the Wikipedia text in `context`.
rag = dspy.ChainOfThought("context, question -> response")

question = "Did trump claim that the 2020 elections were rigged?"
rag(
    context=context,
    question=question,
)

Prediction(
    reasoning='The context provided mentions that Donald Trump falsely claimed there was voter fraud in the 2020 presidential election, which he denied after losing. This includes his baseless predictions about voter fraud in 2024 as well as his claims of election interference.',
    response='Yes, Trump claimed that the 2020 election was rigged and made false claims about voter fraud to deny the results.'
)

## Using a DSPy Optimizer to improve your RAG prompt.

In [47]:
import dspy

In [48]:
# Point DSPy at the local Ollama server and register the LM as the default.
lm = dspy.LM(
    "ollama_chat/deepseek-r1:7b",
    api_base="http://localhost:11434",
    num_ctx=8192 * 2,
)
dspy.configure(lm=lm)

In [None]:
# Prompt optimizer driven by `metric`.
# use fewer threads if your rate limit is small
tp = dspy.MIPROv2(
    metric=metric,
    auto="medium",
    num_threads=24,
)

# Optimize a fresh RAG program against the training split.
optimized_rag = tp.compile(
    RAG(),
    trainset=trainset,
    max_bootstrapped_demos=2,
    max_labeled_demos=2,
    requires_permission_to_run=False,
)

In [None]:
# Baseline: an unoptimized RAG program. The name `rag` is not reused here because
# the Wikipedia cells rebind it to a bare ChainOfThought module that expects a
# `context` argument, so calling it with only `question` would skip retrieval.
baseline = RAG()(question="cmd+tab does not work on hidden or minimized windows")
print(baseline.response)

In [None]:
# Ask the optimized program the same question and print its response.
pred = optimized_rag(question="cmd+tab does not work on hidden or minimized windows")
print(pred.response)

In [None]:
# Score the optimized program with the evaluator defined earlier.
evaluate(optimized_rag)

In [None]:
# Total cost over the LM's recorded calls, skipping entries that have no cost.
# in USD, as calculated by LiteLLM for certain providers
cost = sum(
    entry['cost']
    for entry in lm.history
    if entry['cost'] is not None
)

In [None]:
# Persist the optimized program to a JSON file.
optimized_rag.save("optimized_rag.json")

# Load the saved file into a fresh RAG instance.
loaded_rag = RAG()
loaded_rag.load("optimized_rag.json")

# Run the reloaded program on a question to check that it works.
loaded_rag(question="cmd+tab does not work on hidden or minimized windows")

# Classification

In [26]:
from typing import Literal

# Signature with one text input and two outputs: a sentiment label and a float confidence.
# The class docstring is left untouched because it is part of the signature itself.
class Classify(dspy.Signature):
    """Classify sentiment of a given sentence."""

    sentence: str = dspy.InputField()
    # Declared as a Literal over the three allowed labels.
    sentiment: Literal['positive', 'negative', 'neutral'] = dspy.OutputField()
    confidence: float = dspy.OutputField()

# Run the signature through a plain predictor on one sentence.
classify = dspy.Predict(Classify)
classify(sentence="This book was super fun to read")

Prediction(
    sentiment='positive',
    confidence=0.8
)

# Information Extraction

In [23]:
# Extraction signature, string-typed variant: `headings` and `entities` are
# requested as JSON strings through their field descriptions.
# The class docstring is left untouched because it is part of the signature itself.
class ExtractInfo(dspy.Signature):
    """Extract structured information from text."""

    text: str = dspy.InputField()
    title: str = dspy.OutputField()
    headings: str = dspy.OutputField(desc="a JSON string representing a list of headings")
    entities: str = dspy.OutputField(desc="a JSON string representing a list of entities and their metadata")


In [None]:
import json

module = dspy.Predict(ExtractInfo)
# Adjacent string literals are joined without a separator, so the first
# fragment ends with a space to keep the two sentences apart.
text = "Apple Inc. announced its latest iPhone 14 today. " \
    "The CEO, Tim Cook, highlighted its new features in a press release."
response = module(text=text)

# Show the whole prediction first, then each extracted field.
print("Raw response:", repr(response))
print(response.title)
print(response.headings)
print(response.entities)


In [137]:
# Extraction signature, structured variant: `headings` is typed as a list of
# strings and `entities` as a list of string-to-string dicts.
# This redefinition replaces any earlier `ExtractInfo` in the notebook namespace.
# The class docstring is left untouched because it is part of the signature itself.
class ExtractInfo(dspy.Signature):
    """Extract structured information from text."""

    text: str = dspy.InputField()
    title: str = dspy.OutputField()
    headings: list[str] = dspy.OutputField()
    entities: list[dict[str, str]] = dspy.OutputField(desc="a list of entities and their metadata")

In [138]:
import json

# Run the extraction signature on a transcribed radio safety broadcast.
module = dspy.Predict(ExtractInfo)
text = "securite securite all stations this is United States Coast Guard Sector Long Island Sound break the coast guard recieved a report of a vessel that is submerged in the vicinity of Sag Harbor entrance in broad position four one tag zero one decimal eight two north zero seven two tag one eight decimal zero zero west all mariners are requested caution when transiting the area break this is United States Coast Guard Sector Long Island Sound out"
response = module(text=text)

# Show the whole prediction first, then each extracted field.
print("Raw response:", repr(response))
print(response.title)
print(response.headings)
print(response.entities)

Raw response: Prediction(
    title='Submerged Vessel Incident Near Sag Harbor Entrance',
    headings=['Vessel Identification', 'Position', 'Mariner Caution'],
)
Submerged Vessel Incident Near Sag Harbor Entrance
['Vessel Identification', 'Position', 'Mariner Caution']


In [None]:
# Extraction signature, string-typed variant again: `headings` and `entities`
# are requested as JSON strings through their field descriptions.
# This redefinition replaces any earlier `ExtractInfo` in the notebook namespace.
# The class docstring is left untouched because it is part of the signature itself.
class ExtractInfo(dspy.Signature):
    """Extract structured information from text."""

    text: str = dspy.InputField()
    title: str = dspy.OutputField()
    headings: str = dspy.OutputField(desc="a JSON string representing a list of headings")
    entities: str = dspy.OutputField(desc="a JSON string representing a list of entities and their metadata")

In [141]:
import json

# Run the extraction signature on a short radio exchange.
module = dspy.Predict(ExtractInfo)
text = "Kotka VTS, Onki, hyvää iltapäivää. Vessel Strandvej, this is Tavasland."
response = module(text=text)

# Show the whole prediction first, then each extracted field.
print("Raw response:", repr(response))
print(response.title)
print(response.headings)
print(response.entities)

Raw response: Prediction(
    title='Kotka VTS',
    headings='[\n  "Kotka VTS",\n  "Vessel Strandvej"\n]',
    entities='[\n  {\n    "name": "Kotka VTS",\n    "type": "ship_name",\n    "description": "A vessel named Kotka VTS."\n  },\n  {\n    "name": "Onki",\n    "type": "person_name",\n    "description": "A person\'s name, Onki."\n  },\n  {\n    "name": "Tavasland",\n    "type": "location_or_area",\n    "description": "A location or area named Tavasland."\n  }\n]'
)
Kotka VTS
[
  "Kotka VTS",
  "Vessel Strandvej"
]
[
  {
    "name": "Kotka VTS",
    "type": "ship_name",
    "description": "A vessel named Kotka VTS."
  },
  {
    "name": "Onki",
    "type": "person_name",
    "description": "A person's name, Onki."
  },
  {
    "name": "Tavasland",
    "type": "location_or_area",
    "description": "A location or area named Tavasland."
  }
]


# Agents

In [None]:
# Tool: run an expression string through DSPy's Python interpreter and return the result.
# NOTE(review): plain `#` comments are used instead of docstrings so that nothing
# the ReAct module might read off these functions is changed.
def evaluate_math(expression: str):
    return dspy.PythonInterpreter({}).execute(expression)

# Tool: query a ColBERTv2 endpoint and return the `text` of the top three results.
# This redefinition replaces any earlier `search_wikipedia` in the notebook namespace.
def search_wikipedia(query: str):
    results = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')(query, k=3)
    return [x['text'] for x in results]

# ReAct agent with a float-typed `answer` that is given both tools above.
react = dspy.ReAct("question -> answer: float", tools=[evaluate_math, search_wikipedia])

# The question needs a lookup (a birth year) followed by a division.
pred = react(question="What is 9362158 divided by the year of birth of David Gregory of Kinnairdy castle?")
print(pred.answer)

# Multi-Stage Pipelines