In [None]:
# Execute the shared setup notebook before anything else in this notebook.
# NOTE(review): presumably it prepares environment variables / import paths — confirm in __include.ipynb.
%run __include.ipynb

In [None]:
from langfuse import Langfuse

# Langfuse client built with no explicit arguments; later cells use it to load the test dataset.
# NOTE(review): credentials/host presumably come from the environment — confirm.
langfuse = Langfuse()

In [None]:
import dspy

# Ollama model tag to evaluate. Exactly one MODEL assignment should be active;
# the commented-out lines are alternative tags that can be swapped in.
# MODEL = "qwen2.5:14b"
# MODEL = "gemma3:12b"
# MODEL = "granite3.2:2b"
# MODEL="smollm2:1.7b"
# MODEL="smollm2:360m"
# MODEL="smollm2:135m"
MODEL = "phi4:14b"

# Build the DSPy language-model handle for the chosen tag via the "ollama/" provider prefix,
# with caching switched off (cache=False).
lm = dspy.LM(f"ollama/{MODEL}", cache=False)
# Make this LM the DSPy default and turn usage tracking off.
dspy.settings.configure(lm=lm, track_usage=False)

In [None]:
# Smoke test: send a trivial prompt straight to the configured LM and display its reply.
lm("Hello world!")

In [None]:
from sms_classifier import SMSClassifier

# Classifier program under evaluation, constructed with the LM configured above.
sms_classifier = SMSClassifier(lm)

In [None]:
# Try the classifier on one sample message; the returned prediction is shown as the cell output.
sms_classifier(
    sms_text="""Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question"""
)

In [None]:
from dspy import Example

# Fetch the "sms_phishing_test" dataset from Langfuse.
dataset = langfuse.get_dataset("sms_phishing_test")

# One DSPy Example per dataset item: the item's input becomes `sms_text`, its
# expected output becomes `category`, and only `sms_text` is marked as an input field.
test_ds = [
    Example(sms_text=item.input, category=item.expected_output).with_inputs("sms_text")
    for item in dataset.items
]

In [None]:
from datetime import datetime

from evaluation_helpers import validate_answer
from langfuse_extensions import EvaluateWithLangfuse

# Session name for this run: "Run-<model tag>-<current Unix time in whole seconds>".
# The per-model metric cells later look runs up by ids of this same shape.
run_timestamp = int(datetime.now().timestamp())
session_id = f"Run-{MODEL}-{run_timestamp}"

# Evaluator over the full test set: single-threaded, with a progress bar and
# tracebacks enabled, tagged with the session name built above.
evaluator = EvaluateWithLangfuse(
    lm=lm,
    devset=test_ds,
    session_id=session_id,
    num_threads=1,
    display_progress=True,
    provide_traceback=True,
)

# Register the evaluator as a DSPy callback, then run it on the classifier
# using validate_answer as the metric; the result is shown as the cell output.
dspy.configure(callbacks=[evaluator])
evaluator(program=sms_classifier, metric=validate_answer)

In [None]:
# Import through the same top-level module path the other cells in this notebook
# use (`from evaluation_helpers import ...`). Mixing that with the package-qualified
# `notebooks.cybersecurity.evaluation_helpers` path can load the module twice under
# two names and makes this cell depend on the repository root being on sys.path.
from evaluation_helpers import calculate_metrics, fetch_traces

In [None]:
# Class labels handed to calculate_metrics.
classes = ["ham", "spam", "smishing"]
# Load the traces recorded for the qwen2.5:1.5b run. The run_id has the same
# "Run-<model>-<unix timestamp>" shape as the session_id built in the evaluation cell.
traces_qwen25_1_5 = fetch_traces(run_id="Run-qwen2.5:1.5b-1743285619")
metrics_qwen25_1_5 = calculate_metrics(traces_qwen25_1_5, classes)
# Display the "macro" entry (presumably macro-averaged scores — confirm in evaluation_helpers).
metrics_qwen25_1_5["macro"]

In [None]:
# Class labels handed to calculate_metrics.
classes = ["ham", "spam", "smishing"]
# Load the traces recorded for the qwen2.5:3b run. The run_id has the same
# "Run-<model>-<unix timestamp>" shape as the session_id built in the evaluation cell.
traces_qwen25_3 = fetch_traces(run_id="Run-qwen2.5:3b-1743285575")
metrics_qwen25_3 = calculate_metrics(traces_qwen25_3, classes)
# Display the "macro" entry (presumably macro-averaged scores — confirm in evaluation_helpers).
metrics_qwen25_3["macro"]

In [None]:
# Class labels handed to calculate_metrics.
classes = ["ham", "spam", "smishing"]
# Load the traces recorded for the qwen2.5:0.5b run. The run_id has the same
# "Run-<model>-<unix timestamp>" shape as the session_id built in the evaluation cell.
traces_qwen25_0_5 = fetch_traces(run_id="Run-qwen2.5:0.5b-1743285668")
metrics_qwen25_0_5 = calculate_metrics(traces_qwen25_0_5, classes)
# Display the "macro" entry (presumably macro-averaged scores — confirm in evaluation_helpers).
metrics_qwen25_0_5["macro"]

In [None]:
# Class labels handed to calculate_metrics.
classes = ["ham", "spam", "smishing"]
# Load the traces recorded for the gemma3:1b run. The run_id has the same
# "Run-<model>-<unix timestamp>" shape as the session_id built in the evaluation cell.
traces_gemma_1 = fetch_traces(run_id="Run-gemma3:1b-1743286064")
metrics_gemma_1 = calculate_metrics(traces_gemma_1, classes)
# Display the "macro" entry (presumably macro-averaged scores — confirm in evaluation_helpers).
metrics_gemma_1["macro"]

In [None]:
# Class labels handed to calculate_metrics.
classes = ["ham", "spam", "smishing"]
# Load the traces recorded for the gemma3:4b run. The run_id has the same
# "Run-<model>-<unix timestamp>" shape as the session_id built in the evaluation cell.
traces_gemma_4 = fetch_traces(run_id="Run-gemma3:4b-1743286278")
metrics_gemma_4 = calculate_metrics(traces_gemma_4, classes)
# Display the "macro" entry (presumably macro-averaged scores — confirm in evaluation_helpers).
metrics_gemma_4["macro"]

In [None]:
# Class labels handed to calculate_metrics.
classes = ["ham", "spam", "smishing"]
# Load the traces recorded for the granite3.2:2b run. The run_id has the same
# "Run-<model>-<unix timestamp>" shape as the session_id built in the evaluation cell.
traces_granite32_2 = fetch_traces(run_id="Run-granite3.2:2b-1743286838")
metrics_granite32_2 = calculate_metrics(traces_granite32_2, classes)
# Display the "macro" entry (presumably macro-averaged scores — confirm in evaluation_helpers).
metrics_granite32_2["macro"]

In [None]:
# Class labels handed to calculate_metrics.
classes = ["ham", "spam", "smishing"]
# Load the traces recorded for the gemma3:12b run. The run_id has the same
# "Run-<model>-<unix timestamp>" shape as the session_id built in the evaluation cell.
traces_gemma_12 = fetch_traces(run_id="Run-gemma3:12b-1743287333")
metrics_gemma_12 = calculate_metrics(traces_gemma_12, classes)
# Display the "macro" entry (presumably macro-averaged scores — confirm in evaluation_helpers).
metrics_gemma_12["macro"]

In [None]:
# Class labels handed to calculate_metrics.
classes = ["ham", "spam", "smishing"]
# Load the traces recorded for the qwen2.5:14b run. The run_id has the same
# "Run-<model>-<unix timestamp>" shape as the session_id built in the evaluation cell.
traces_qwen25_14 = fetch_traces(run_id="Run-qwen2.5:14b-1743287701")
metrics_qwen25_14 = calculate_metrics(traces_qwen25_14, classes)
# Display the "macro" entry (presumably macro-averaged scores — confirm in evaluation_helpers).
metrics_qwen25_14["macro"]

In [None]:
# Class labels handed to calculate_metrics.
classes = ["ham", "spam", "smishing"]
# Load the traces recorded for the smollm2:1.7b run. The run_id has the same
# "Run-<model>-<unix timestamp>" shape as the session_id built in the evaluation cell.
traces_smollm2_1_7 = fetch_traces(run_id="Run-smollm2:1.7b-1743288291")
metrics_smollm2_1_7 = calculate_metrics(traces_smollm2_1_7, classes)
# Display the "macro" entry (presumably macro-averaged scores — confirm in evaluation_helpers).
metrics_smollm2_1_7["macro"]

In [None]:
# Class labels handed to calculate_metrics.
classes = ["ham", "spam", "smishing"]
# Load the traces recorded for the smollm2:360m run. The run_id has the same
# "Run-<model>-<unix timestamp>" shape as the session_id built in the evaluation cell.
traces_smollm2_360 = fetch_traces(run_id="Run-smollm2:360m-1743288463")
metrics_smollm2_360 = calculate_metrics(traces_smollm2_360, classes)
# Display the "macro" entry (presumably macro-averaged scores — confirm in evaluation_helpers).
metrics_smollm2_360["macro"]

In [None]:
# Class labels handed to calculate_metrics.
classes = ["ham", "spam", "smishing"]
# Load the traces recorded for the phi4:14b run. The run_id has the same
# "Run-<model>-<unix timestamp>" shape as the session_id built in the evaluation cell.
traces_phi4_14 = fetch_traces(run_id="Run-phi4:14b-1743289137")
metrics_phi4_14 = calculate_metrics(traces_phi4_14, classes)
# Display the "macro" entry (presumably macro-averaged scores — confirm in evaluation_helpers).
metrics_phi4_14["macro"]

In [None]:
# Baseline "macro" metrics per model, keyed by Ollama model tag.
# Entries are listed in the order they should be inserted into the dict;
# the qwen2.5:3b run is deliberately left out (kept as a comment).
metrics_baseline = {
    "smollm2:360m": metrics_smollm2_360["macro"],
    "smollm2:1.7b": metrics_smollm2_1_7["macro"],
    "qwen2.5:0.5b": metrics_qwen25_0_5["macro"],
    "qwen2.5:1.5b": metrics_qwen25_1_5["macro"],
    # "qwen2.5:3b": metrics_qwen25_3["macro"],
    "gemma3:1b": metrics_gemma_1["macro"],
    "gemma3:4b": metrics_gemma_4["macro"],
    "granite3.2:2b": metrics_granite32_2["macro"],
    "gemma3:12b": metrics_gemma_12["macro"],
    "qwen2.5:14b": metrics_qwen25_14["macro"],
    "phi4:14b": metrics_phi4_14["macro"],
}

In [None]:
from evaluation_helpers import plot_metrics

# Plot the collected baseline scores for all models in metrics_baseline.
# NOTE(review): the list looks like the metric names to chart and the last
# argument the figure title — confirm against plot_metrics' signature.
plot_metrics(
    metrics_baseline,
    ["Precision", "Recall", "F1"],
    "Scores by model before optimization",
)