In [1]:
import glob
import os
import pandas as pd
import random

import dspy
from dspy.evaluate import Evaluate
from dspy.teleprompt import BootstrapFewShotWithRandomSearch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Chat-model client for gpt-3.5-turbo, capped at 250 completion tokens.
turbo = dspy.OpenAI(
    model="gpt-3.5-turbo-1106",
    max_tokens=250,
    model_type="chat",
)
# Register it as the LM in DSPy's global settings.
dspy.settings.configure(lm=turbo)

In [3]:
# Second chat-model client (GPT-4 preview, 350 completion tokens). In this
# notebook it is handed to the optimizer through `teacher_settings`.
gpt4T = dspy.OpenAI(
    model="gpt-4-1106-preview",
    max_tokens=350,
    model_type="chat",
)

In [4]:
def load_scone(dirname):
    """Load every ScoNe CSV in `dirname` as a list of `dspy.Example` objects.

    Each ``*.csv`` file directly inside `dirname` is read with its first
    column as the index, and the file's base name (without its extension)
    is recorded as the example's ``category``. Files are processed in sorted
    name order so that the returned list -- and therefore any seeded shuffle
    applied to it -- does not depend on the arbitrary order in which
    ``glob.glob`` reports matches.

    Args:
        dirname: Directory containing the per-category CSV files.

    Returns:
        A list of examples with the fields ``context``, ``question``,
        ``answer`` ("Yes" or "No") and ``category``; ``context`` and
        ``question`` are marked as the inputs.

    Raises:
        ValueError: If `dirname` contains no ``*.csv`` files.
    """
    filenames = sorted(glob.glob(os.path.join(dirname, "*.csv")))
    if not filenames:
        # pd.concat([]) would raise a ValueError anyway; fail with a message
        # that names the directory instead.
        raise ValueError(f"No CSV files found in {dirname!r}")

    dfs = []
    for filename in filenames:
        df = pd.read_csv(filename, index_col=0)
        # Category is the file name minus its extension only.
        df["category"] = os.path.splitext(os.path.basename(filename))[0]
        dfs.append(df)
    data_df = pd.concat(dfs)

    def as_example(row):
        """Convert one row of the combined frame into a `dspy.Example`."""
        # The 'one_scoped' file is from an earlier dataset, MoNLI, and
        # so is formatted a bit differently:
        suffix = "" if row["category"] == "one_scoped" else "_edited"
        # Reformat the hypothesis to be an embedded clause in a question:
        hypothesis = row["sentence2" + suffix]
        question = hypothesis[0].lower() + hypothesis[1:].strip(".")
        question = f"Can we logically conclude for sure that {question}?"
        # Binary task formulation:
        label = "Yes" if row["gold_label" + suffix] == "entailment" else "No"
        return dspy.Example(
            {
                "context": row["sentence1" + suffix],
                "question": question,
                "answer": label,
                "category": row["category"],
            }
        ).with_inputs("context", "question")

    # Plain iteration keeps pandas from trying to interpret the returned
    # objects the way DataFrame.apply does.
    return [as_example(row) for _, row in data_df.iterrows()]

In [5]:
all_train = load_scone("ScoNe/scone_nli/train")

# Seed the global RNG, then shuffle the training pool in place.
random.seed(1)
random.shuffle(all_train)

# First 200 shuffled examples -> train; the following 50 -> dev.
train = all_train[:200]
dev = all_train[200:250]

(len(train), len(dev))

(200, 50)

In [19]:
# Peek at the first dev example, one labelled field per line.
example = dev[0]
print(
    f"Context: {example.context}",
    f"Question: {example.question}",
    f"Answer: {example.answer}",
    f"Category: {example.category}",
    sep="\n",
)

Context: The people are not happy when they play instruments.
Question: Can we logically conclude for sure that the people are not happy when they play accordions?
Answer: No
Category: one_not_scoped


In [6]:
random.seed(1)

# The system is being built for the whole ScoNe benchmark, but for now
# it is evaluated on a single category only -- one of the hardest and
# most informative: examples where a single negation plays a crucial
# role in the reasoning.
test = [
    example
    for example in load_scone(dirname="ScoNe/scone_nli/test")
    if example.category == "one_scoped"
]

In [7]:
# Tally how many test examples carry each gold answer.
(
    pd.Series([example.answer for example in test])
    .value_counts()
)

No     100
Yes    100
Name: count, dtype: int64

In [8]:
# Metric used throughout: DSPy's built-in `answer_exact_match`.
scone_accuracy = dspy.evaluate.metrics.answer_exact_match

# Evaluator bound to the filtered test split; single-threaded, with a
# progress bar and no results table.
evaluator = Evaluate(
    devset=test,
    num_threads=1,
    display_progress=True,
    display_table=0,
)

In [10]:
class ScoNeSignature(dspy.Signature):
    # NOTE: the docstring below is runtime text -- it shows up verbatim as the
    # instruction header of the generated prompt -- so its wording must stay
    # exactly as is (only the line wrapping of the literal pieces is free).
    (
        "You are given some context (a premise) and a question (a hypothesis). You must "
        "indicate with Yes/No answer whether we can logically conclude the hypothesis "
        "from the premise."
    )

    # Inputs: the premise and the hypothesis rephrased as a question.
    context = dspy.InputField()
    question = dspy.InputField()
    # Output: the binary verdict; `desc` is shown in the prompt's format section.
    answer = dspy.OutputField(desc="Yes or No")

In [11]:
class ScoNeCoT(dspy.Module):
    """DSPy module that answers a ScoNe question with one chain-of-thought predictor."""

    def __init__(self):
        super().__init__()
        # Single predictor: chain-of-thought prompting over ScoNeSignature.
        self.generate_answer = dspy.ChainOfThought(ScoNeSignature)

    def forward(self, context, question):
        """Run the predictor on one context/question pair and return its result."""
        prediction = self.generate_answer(context=context, question=question)
        return prediction

In [12]:
# Fresh, uncompiled program instance; serves as the zero-shot baseline and as
# the starting point passed to the optimizer.
cot_zeroshot = ScoNeCoT()

In [13]:
# Score the zero-shot program on the evaluator's dataset with the exact-match metric.
evaluator(cot_zeroshot, metric=scone_accuracy)

Average Metric: 100 / 200  (50.0): 100%|██████████| 200/200 [03:21<00:00,  1.01s/it]


50.0

In [14]:
# Few-shot optimizer: scored with the exact-match metric, using the GPT-4
# client as the teacher LM, with up to 8 bootstrapped and 8 labeled demos,
# 10 candidate programs, and 8 worker threads.
bootstrap_optimizer = BootstrapFewShotWithRandomSearch(
    metric=scone_accuracy,
    teacher_settings={"lm": gpt4T},
    max_bootstrapped_demos=8,
    max_labeled_demos=8,
    num_candidate_programs=10,
    num_threads=8,
)

Going to sample between 1 and 8 traces per predictor.
Will attempt to bootstrap 10 candidate sets.


In [15]:
# Optimize the zero-shot program: `train` is passed as the training set and
# `dev` as the validation set (the 50-example scores logged below).
cot_fewshot = bootstrap_optimizer.compile(cot_zeroshot, trainset=train, valset=dev)

Average Metric: 22 / 50  (44.0): 100%|██████████| 50/50 [00:06<00:00,  7.25it/s]


Score: 44.0 for set: [0]
New best sscore: 44.0 for seed -3
Scores so far: [44.0]
Best score: 44.0


Average Metric: 28 / 50  (56.0): 100%|██████████| 50/50 [00:06<00:00,  7.78it/s]


Score: 56.0 for set: [8]
New best sscore: 56.0 for seed -2
Scores so far: [44.0, 56.0]
Best score: 56.0


  6%|▌         | 11/200 [00:33<09:41,  3.08s/it]


Bootstrapped 8 full traces after 12 examples in round 0.


Average Metric: 27 / 50  (54.0): 100%|██████████| 50/50 [00:08<00:00,  5.79it/s]


Score: 54.0 for set: [8]
Scores so far: [44.0, 56.0, 54.0]
Best score: 56.0
Average of max per entry across top 1 scores: 0.56
Average of max per entry across top 2 scores: 0.74
Average of max per entry across top 3 scores: 0.76
Average of max per entry across top 5 scores: 0.76
Average of max per entry across top 8 scores: 0.76
Average of max per entry across top 9999 scores: 0.76


  4%|▍         | 9/200 [00:47<16:52,  5.30s/it]


Bootstrapped 7 full traces after 10 examples in round 0.


Average Metric: 35 / 50  (70.0): 100%|██████████| 50/50 [00:07<00:00,  6.76it/s]


Score: 70.0 for set: [8]
New best sscore: 70.0 for seed 0
Scores so far: [44.0, 56.0, 54.0, 70.0]
Best score: 70.0
Average of max per entry across top 1 scores: 0.7
Average of max per entry across top 2 scores: 0.86
Average of max per entry across top 3 scores: 0.92
Average of max per entry across top 5 scores: 0.94
Average of max per entry across top 8 scores: 0.94
Average of max per entry across top 9999 scores: 0.94


  2%|▏         | 3/200 [00:09<10:32,  3.21s/it]


Bootstrapped 3 full traces after 4 examples in round 0.


Average Metric: 28 / 50  (56.0): 100%|██████████| 50/50 [00:07<00:00,  6.75it/s]


Score: 56.0 for set: [8]
Scores so far: [44.0, 56.0, 54.0, 70.0, 56.0]
Best score: 70.0
Average of max per entry across top 1 scores: 0.7
Average of max per entry across top 2 scores: 0.86
Average of max per entry across top 3 scores: 0.88
Average of max per entry across top 5 scores: 0.96
Average of max per entry across top 8 scores: 0.96
Average of max per entry across top 9999 scores: 0.96


  0%|          | 1/200 [00:02<08:45,  2.64s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 37 / 50  (74.0): 100%|██████████| 50/50 [00:08<00:00,  6.10it/s]


Score: 74.0 for set: [8]
New best sscore: 74.0 for seed 2
Scores so far: [44.0, 56.0, 54.0, 70.0, 56.0, 74.0]
Best score: 74.0
Average of max per entry across top 1 scores: 0.74
Average of max per entry across top 2 scores: 0.88
Average of max per entry across top 3 scores: 0.94
Average of max per entry across top 5 scores: 0.96
Average of max per entry across top 8 scores: 0.98
Average of max per entry across top 9999 scores: 0.98


  2%|▏         | 4/200 [00:11<09:39,  2.95s/it]


Bootstrapped 4 full traces after 5 examples in round 0.


Average Metric: 22 / 50  (44.0): 100%|██████████| 50/50 [00:07<00:00,  6.70it/s]


Score: 44.0 for set: [8]
Scores so far: [44.0, 56.0, 54.0, 70.0, 56.0, 74.0, 44.0]
Best score: 74.0
Average of max per entry across top 1 scores: 0.74
Average of max per entry across top 2 scores: 0.88
Average of max per entry across top 3 scores: 0.94
Average of max per entry across top 5 scores: 0.96
Average of max per entry across top 8 scores: 0.98
Average of max per entry across top 9999 scores: 0.98


  2%|▏         | 4/200 [00:12<09:53,  3.03s/it]


Bootstrapped 4 full traces after 5 examples in round 0.


Average Metric: 30 / 50  (60.0): 100%|██████████| 50/50 [00:06<00:00,  7.21it/s]


Score: 60.0 for set: [8]
Scores so far: [44.0, 56.0, 54.0, 70.0, 56.0, 74.0, 44.0, 60.0]
Best score: 74.0
Average of max per entry across top 1 scores: 0.74
Average of max per entry across top 2 scores: 0.88
Average of max per entry across top 3 scores: 0.9
Average of max per entry across top 5 scores: 0.96
Average of max per entry across top 8 scores: 0.98
Average of max per entry across top 9999 scores: 0.98


  2%|▎         | 5/200 [00:15<09:47,  3.01s/it]


Bootstrapped 5 full traces after 6 examples in round 0.


Average Metric: 32 / 50  (64.0): 100%|██████████| 50/50 [00:07<00:00,  6.71it/s]


Score: 64.0 for set: [8]
Scores so far: [44.0, 56.0, 54.0, 70.0, 56.0, 74.0, 44.0, 60.0, 64.0]
Best score: 74.0
Average of max per entry across top 1 scores: 0.74
Average of max per entry across top 2 scores: 0.88
Average of max per entry across top 3 scores: 0.92
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  1%|          | 2/200 [00:04<07:16,  2.21s/it]


Bootstrapped 2 full traces after 3 examples in round 0.


Average Metric: 28 / 50  (56.0): 100%|██████████| 50/50 [00:07<00:00,  6.55it/s]


Score: 56.0 for set: [8]
Scores so far: [44.0, 56.0, 54.0, 70.0, 56.0, 74.0, 44.0, 60.0, 64.0, 56.0]
Best score: 74.0
Average of max per entry across top 1 scores: 0.74
Average of max per entry across top 2 scores: 0.88
Average of max per entry across top 3 scores: 0.92
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  4%|▎         | 7/200 [00:22<10:16,  3.19s/it]


Bootstrapped 6 full traces after 8 examples in round 0.


Average Metric: 30 / 50  (60.0): 100%|██████████| 50/50 [00:08<00:00,  5.61it/s]


Score: 60.0 for set: [8]
Scores so far: [44.0, 56.0, 54.0, 70.0, 56.0, 74.0, 44.0, 60.0, 64.0, 56.0, 60.0]
Best score: 74.0
Average of max per entry across top 1 scores: 0.74
Average of max per entry across top 2 scores: 0.88
Average of max per entry across top 3 scores: 0.92
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  2%|▎         | 5/200 [00:14<09:36,  2.96s/it]


Bootstrapped 4 full traces after 6 examples in round 0.


Average Metric: 26 / 50  (52.0): 100%|██████████| 50/50 [00:07<00:00,  6.31it/s]


Score: 52.0 for set: [8]
Scores so far: [44.0, 56.0, 54.0, 70.0, 56.0, 74.0, 44.0, 60.0, 64.0, 56.0, 60.0, 52.0]
Best score: 74.0
Average of max per entry across top 1 scores: 0.74
Average of max per entry across top 2 scores: 0.88
Average of max per entry across top 3 scores: 0.92
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  4%|▍         | 8/200 [00:22<09:02,  2.82s/it]


Bootstrapped 8 full traces after 9 examples in round 0.


Average Metric: 29 / 50  (58.0): 100%|██████████| 50/50 [00:07<00:00,  6.43it/s]

Score: 58.0 for set: [8]
Scores so far: [44.0, 56.0, 54.0, 70.0, 56.0, 74.0, 44.0, 60.0, 64.0, 56.0, 60.0, 52.0, 58.0]
Best score: 74.0
Average of max per entry across top 1 scores: 0.74
Average of max per entry across top 2 scores: 0.88
Average of max per entry across top 3 scores: 0.92
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0
13 candidate programs found.





In [16]:
# Run the same evaluator and metric on the compiled few-shot program, for
# comparison with the zero-shot score.
evaluator(cot_fewshot, metric=scone_accuracy)

Average Metric: 139 / 200  (69.5): 100%|██████████| 200/200 [03:31<00:00,  1.06s/it]


69.5

In [17]:
# Display the last (n=1) entry recorded in the turbo LM's call history.
turbo.inspect_history(n=1)




You are given some context (a premise) and a question (a hypothesis). You must indicate with Yes/No answer whether we can logically conclude the hypothesis from the premise.

---

Follow the following format.

Context: ${context}

Question: ${question}

Reasoning: Let's think step by step in order to ${produce the answer}. We ...

Answer: Yes or No

---

Context: It is a lie that a boy is not concentrating on a motor

Question: Can we logically conclude for sure that it is a lie that a boy is not concentrating on a machine?

Reasoning: Let's think step by step in order to produce the answer. We know that the statement "a boy is not concentrating on a motor" is a lie. This means the opposite is true, so the boy is concentrating on a motor. A motor is a type of machine. Therefore, if the boy is concentrating on a motor, he is concentrating on a machine.

Answer: Yes

---

Context: A man is holding something in his hands.
Question: Can we logically conclude for sure that a man is holdi

'\n\n\nYou are given some context (a premise) and a question (a hypothesis). You must indicate with Yes/No answer whether we can logically conclude the hypothesis from the premise.\n\n---\n\nFollow the following format.\n\nContext: ${context}\n\nQuestion: ${question}\n\nReasoning: Let\'s think step by step in order to ${produce the answer}. We ...\n\nAnswer: Yes or No\n\n---\n\nContext: It is a lie that a boy is not concentrating on a motor\n\nQuestion: Can we logically conclude for sure that it is a lie that a boy is not concentrating on a machine?\n\nReasoning: Let\'s think step by step in order to produce the answer. We know that the statement "a boy is not concentrating on a motor" is a lie. This means the opposite is true, so the boy is concentrating on a motor. A motor is a type of machine. Therefore, if the boy is concentrating on a motor, he is concentrating on a machine.\n\nAnswer: Yes\n\n---\n\nContext: A man is holding something in his hands.\nQuestion: Can we logically conc