In [1]:
%load_ext autoreload
%autoreload 2

from typing import Literal, Union
from pathlib import Path
import csv
from datetime import datetime
import random


import pandas as pd
import dspy
from dspy.evaluate import Evaluate
from dspy.teleprompt import MIPROv2
from sklearn.model_selection import train_test_split

from programs import WrapperEnglishSPT, evaluate_answer
from custom_evaluation import custom_evaluate

In [2]:
lm = dspy.LM(
    "ollama_chat/deepseek-r1:14b",
    api_base="http://localhost:11434",
)
dspy.settings.configure(lm=lm)

In [3]:
lm("What is your name")

["<think>\n\n</think>\n\nGreetings! I'm DeepSeek-R1, an artificial intelligence assistant created by DeepSeek. I'm at your service and would be delighted to assist you with any inquiries or tasks you may have."]

In [4]:
data = pd.read_csv("dev_dwug_es.csv")
display(data.shape)

(8704, 8)

In [5]:
training_set = []

for _, row in data.iterrows():
    training_set.append(
        dspy.Example(
            sentence1=row["context_x"],
            sentence2=row["context_y"],
            target_word=row["lemma"],
            answer=int(row["judgment"]),
        ).with_inputs("sentence1", "sentence2", "target_word")
    )

In [6]:
classes_1_es = [item for item in training_set if item.answer == 1]
classes_2_es = [item for item in training_set if item.answer == 2]
classes_3_es = [item for item in training_set if item.answer == 3]
classes_4_es = [item for item in training_set if item.answer == 4]

print(len(classes_1_es))
print(len(classes_2_es))
print(len(classes_3_es))
print(len(classes_4_es))

classes_1_train, classes_1_dev = train_test_split(
    classes_1_es,
    test_size=0.2,
    random_state=42,
)

classes_1_train, classes_1_test = train_test_split(
    classes_1_train, test_size=0.2, random_state=42
)


classes_2_train, classes_2_dev = train_test_split(
    classes_2_es,
    test_size=0.2,
    random_state=42,
)
classes_2_train, classes_2_test = train_test_split(
    classes_2_train, test_size=0.2, random_state=42
)


classes_3_train, classes_3_dev = train_test_split(
    classes_3_es,
    test_size=0.2,
    random_state=42,
)
classes_3_train, classes_3_test = train_test_split(
    classes_3_train, test_size=0.2, random_state=42
)


classes_4_train, classes_4_dev = train_test_split(
    classes_4_es,
    test_size=0.2,
    random_state=42,
)
classes_4_train, classes_4_test = train_test_split(
    classes_4_train,
    test_size=0.2,
    random_state=42,
)

print(len(classes_1_train), len(classes_1_dev), len(classes_1_test))
print(len(classes_2_train), len(classes_2_dev), len(classes_2_test))
print(len(classes_3_train), len(classes_3_dev), len(classes_3_test))
print(len(classes_4_train), len(classes_4_dev), len(classes_4_test))

1406
1522
2343
3433
899 282 225
973 305 244
1499 469 375
2196 687 550


In [7]:
program_spt_prompt_en_assertions = WrapperEnglishSPT().activate_assertions()

In [8]:
custom_evaluate(
    random.choices(classes_1_test, k=225)
    + random.choices(classes_2_test, k=225)
    + random.choices(classes_3_test, k=225)
    + random.choices(classes_4_test, k=225),
    evaluate_answer,
    program_spt_prompt_en_assertions,
    debug=False,
)

Evaluating: 900 examples


2025/04/22 16:49:33 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 16:50:47 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 16:53:10 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 16:54:20 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 16:55:35 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 16:55:35 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 16:55:35 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 16:58:11 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 16:58:11 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 16:58:11 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 16:59:16 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 16:59:16 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 17:03:19 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 17:03:19 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 17:03:19 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 17:03:19 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 17:05:38 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 17:06:57 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 17:10:39 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 17:10:39 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 17:10:39 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 17:10:39 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 17:13:17 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 17:20:45 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 17:20:45 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 17:20:45 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 17:20:45 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 17:22:02 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 17:22:02 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 17:22:02 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 17:22:02 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 17:24:31 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 17:24:31 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 17:35:48 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 17:37:02 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 18:18:18 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 18:19:32 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 18:22:00 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 18:23:16 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 18:24:23 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 19:32:39 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 19:33:49 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 19:35:03 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 20:43:48 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 20:45:00 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 20:45:00 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


Accurate examples: 350
Bad-formatted examples: 3
Accuracy: 39.01895206243032


In [9]:
# %reload_ext autoreload

# start_time = datetime.now()

# teleprompter = MIPROv2(
#     metric=evaluate_answer,
#     task_model=lm,
#     num_candidates=10,
#     init_temperature=0.7,
#     max_bootstrapped_demos=3,
#     max_labeled_demos=4,
#     verbose=False,
# )

# print("Optimizing program with MIPRO...")
# optimized_program = teleprompter.compile(
#     program_spt_prompt_en_assertions.deepcopy(),
#     trainset=random.choices(classes_1_train, k=500)
#     + random.choices(classes_2_train, k=500)
#     + random.choices(classes_3_train, k=500)
#     + random.choices(classes_4_train, k=500),
#     valset=random.choices(classes_1_dev, k=200)
#     + random.choices(classes_2_dev, k=200)
#     + random.choices(classes_3_dev, k=200)
#     + random.choices(classes_4_dev, k=200),
#     num_trials=15,
#     minibatch_size=25,
#     minibatch_full_eval_steps=10,
#     minibatch=True,
#     requires_permission_to_run=False,
# )

# optimized_program.save(f"compile-models/sp/es_spt_mipro_optimized_prompt_en_deepseek-q4")

# print(f"Elapsed time: {datetime.now() - start_time}")

In [10]:
# import matplotlib.pyplot as plt


# trial_logs = optimized_program.trial_logs

# trial_numbers = list(trial_logs.keys())
# scores = [trial_logs[trial]["score"] for trial in trial_numbers]

# full_eval = [trial_logs[trial]["full_eval"] for trial in trial_numbers]

# for trial_number, score, pruned in zip(trial_numbers, scores, full_eval):
#     if pruned is False:
#         plt.scatter(
#             trial_number,
#             score,
#             color="grey",
#             label=(
#                 "Pruned Batch"
#                 if "Pruned Batch" not in plt.gca().get_legend_handles_labels()[1]
#                 else ""
#             ),
#         )
#     else:
#         plt.scatter(
#             trial_number,
#             score,
#             color="green",
#             label=(
#                 "Successful Batch"
#                 if "Successful Batch" not in plt.gca().get_legend_handles_labels()[1]
#                 else ""
#             ),
#         )

# plt.xlabel("Batch Number")
# plt.ylabel("Score")
# plt.title("Batch Scores")
# plt.grid(True)
# plt.legend()
# plt.show()

In [11]:
# best_score = 0
# best_program_so_far = None


# def get_signature(predictor):
#     if hasattr(predictor, "extended_signature"):
#         return predictor.extended_signature
#     elif hasattr(predictor, "signature"):
#         return predictor.signature


# # print(f"Baseline program | Score: {best_score}:")
# # for i, predictor in enumerate(WrapperEnglishSPT().predictors()):
# #     print(f"Prompt {i+1} Instruction: {get_signature(predictor).instructions}")
# # print()

# print("----------------")

# for trial_num in optimized_program.trial_logs:
#     program_score = optimized_program.trial_logs[trial_num]["score"]
#     program_pruned = optimized_program.trial_logs[trial_num]["full_eval"]
#     # if (
#     #     program_score > best_score
#     #     and program_pruned is True
#     #     # and optimized_program.trial_logs[trial_num]["full_eval"]
#     # ):
#     if program_pruned is True:
#         best_score = program_score
#         best_program_so_far = optimized_program.trial_logs[trial_num]["program"]
#     # if trial_num % 5 == 0:
#     #     print(f"Best program after {trial_num} batches | Score: {best_score}:")
#     #     for i, predictor in enumerate(best_program_so_far.predictors()):
#     #         print(f"Prompt {i+1} Instruction: {get_signature(predictor).instructions}")
#     #     print()
    
#         # print(f"Best program with best score: {best_score}")
#         for i, predictor in enumerate(best_program_so_far.predictors()):
#             print(f"Prompt {trial_num} Instruction: {get_signature(predictor).instructions}")
#             print(best_score)
#         print()

In [12]:
program_spt_prompt_en_assertions.load(
    "compile-models/sp/es_spt_mipro_optimized_prompt_en_deepseek-q4"
)

In [13]:

custom_evaluate(
    random.choices(classes_1_test, k=225)
    + random.choices(classes_2_test, k=225)
    + random.choices(classes_3_test, k=225)
    + random.choices(classes_4_test, k=225),
    evaluate_answer,
    program_spt_prompt_en_assertions,
    debug=False,
)

2025/04/22 20:46:18 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


Evaluating: 900 examples


2025/04/22 20:49:57 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 20:51:11 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 20:51:11 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 20:53:38 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 20:53:38 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 20:53:38 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 20:53:38 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 20:54:47 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 20:54:47 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 20:54:47 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 20:54:47 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 20:54:47 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 20:54:47 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 20:54:47 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 20:54:47 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 20:59:36 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 20:59:36 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 20:59:36 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 21:00:52 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 21:02:00 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 21:02:00 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 21:02:00 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 21:02:00 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 21:02:00 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 21:03:11 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 21:03:11 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 21:03:11 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 21:05:40 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 21:06:59 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 21:06:59 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 21:09:26 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 22:15:12 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 22:25:00 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 22:25:00 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 22:25:00 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 22:35:55 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 22:35:55 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


2025/04/22 22:35:55 INFO dspy.primitives.assertions: SuggestionFailed: The output shoulb be 1 or 2 or 3 or 4. Please revise accordingly.


Accurate examples: 366
Bad-formatted examples: 2
Accuracy: 40.75723830734967
