In [None]:
!pip install mlflow dspy

In [20]:
import mlflow

# Address of the MLflow tracking server (loopback host, port 5000) that receives the runs.
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# Enable all three DSPy autologging channels:
#   log_compiles -> optimization progress
#   log_evals    -> evaluation results
#   log_traces   -> traces from module executions
mlflow.dspy.autolog(log_compiles=True, log_evals=True, log_traces=True)

In [31]:
# Imports come first so that `os` is defined before `os.getenv` is called
# (the cell previously raised NameError on a fresh kernel).
import os

import dspy
from dotenv import load_dotenv

# Pull variables from a .env file into the process environment, then read the key.
# Interactive alternative: api_key = input("Enter your OpenAI API key: ")
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

# Task model used by every DSPy module in this notebook.
lm = dspy.LM("openai/gpt-4.1-nano", temperature=1, api_key=api_key)
dspy.configure(lm=lm)

In [22]:
import requests
import dspy
import json
import random

def init_dataset():
    """Download the facility-support dataset and split it into train/val/test.

    Each JSON record becomes a ``dspy.Example`` with two fields: ``message``
    (taken from ``record['fields']['input']`` and marked as the only input)
    and ``answer`` (taken from ``record['answer']``). The examples are
    shuffled with a fixed seed of 0 and then cut at 33% and 66% of the total.

    Returns:
        tuple: ``(train_set, val_set, test_set)``, three lists of ``dspy.Example``.

    Raises:
        requests.HTTPError: if the server responds with an error status.
        requests.Timeout: if the server does not answer within the timeout.
    """
    # Load from the url
    url = "https://raw.githubusercontent.com/meta-llama/llama-prompt-ops/refs/heads/main/use-cases/facility-support-analyzer/dataset.json"
    # Bound the wait and surface HTTP failures directly, instead of hanging
    # or failing later with an opaque JSON decode error on an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    dataset = json.loads(response.text)

    dspy_dataset = [
        dspy.Example({
            "message": d['fields']['input'],
            "answer": d['answer'],
        }).with_inputs("message")
        for d in dataset
    ]
    random.Random(0).shuffle(dspy_dataset)

    # Compute the two cut points once; the last split absorbs the rounding remainder.
    total = len(dspy_dataset)
    train_end = int(total * 0.33)
    val_end = int(total * 0.66)

    train_set = dspy_dataset[:train_end]
    val_set = dspy_dataset[train_end:val_end]
    test_set = dspy_dataset[val_end:]

    return train_set, val_set, test_set

In [23]:
# Build the three splits once; later cells refer to them by these names.
train_set, val_set, test_set = init_dataset()

# Display the split sizes as (train, val, test).
tuple(len(split) for split in (train_set, val_set, test_set))

(66, 66, 68)

In [24]:
# Inspect the first training example: the raw message, then its gold labels.
first_example = train_set[0]

print("Input Message:")
print(first_example['message'])

# The gold answer is stored as a JSON string; decode it and list its top-level fields.
gold_answer = json.loads(first_example['answer'])
print("\n\nGold Answer:")
for field, value in gold_answer.items():
    print(f"{field}: {value}")

Input Message:
Subject: Adjusting Bi-Weekly Cleaning Schedule for My Office

Dear ProCare Facility Solutions Support Team,

I hope this message finds you well. My name is Dr. Alex Turner, and I have been utilizing your services for my office space for the past year. I must say, your team's dedication to maintaining a pristine environment has been commendable and greatly appreciated.

I am reaching out to discuss the scheduling of our regular cleaning services. While I find the logistical challenges of coordinating these services intellectually stimulating, I believe we could optimize the current schedule to better suit the needs of my team and our workflow. Specifically, I would like to explore the possibility of adjusting our cleaning schedule to a bi-weekly arrangement, ideally on Tuesdays and Fridays, to ensure our workspace remains consistently clean without disrupting our research activities.

Previously, I have attempted to adjust the schedule through the online portal, but I enc

In [27]:
from typing import List, Literal


class FacilitySupportAnalyzerUrgency(dspy.Signature):
    """
    Read the provided message and determine the urgency.
    """
    # NOTE(review): the docstring above is presumably consumed by DSPy as the task
    # instruction for this signature — confirm before rewording it.

    # Input: the text of the message to classify.
    message: str = dspy.InputField()
    # Output: exactly one of the three urgency labels.
    urgency: Literal['low', 'medium', 'high'] = dspy.OutputField()

class FacilitySupportAnalyzerSentiment(dspy.Signature):
    """
    Read the provided message and determine the sentiment.
    """
    # NOTE(review): the docstring above is presumably consumed by DSPy as the task
    # instruction for this signature — confirm before rewording it.

    # Input: the text of the message to classify.
    message: str = dspy.InputField()
    # Output: exactly one of the three sentiment labels.
    sentiment: Literal['positive', 'neutral', 'negative'] = dspy.OutputField()

class FacilitySupportAnalyzerCategories(dspy.Signature):
    """
    Read the provided message and determine the set of categories applicable to the message.
    """
    # Input: the text of the message to classify.
    message: str = dspy.InputField()
    # Output: a list of labels, each restricted to the ten category names below
    # (multi-label: several categories may apply to one message).
    categories: List[
        Literal[
            "emergency_repair_services",
            "routine_maintenance_requests",
            "quality_and_safety_concerns",
            "specialized_cleaning_services",
            "general_inquiries",
            "sustainability_and_environmental_practices",
            "training_and_support_requests",
            "cleaning_services_scheduling",
            "customer_feedback_and_complaints",
            "facility_management_issues",
        ]
    ] = dspy.OutputField()

class FacilitySupportAnalyzerMM(dspy.Module):
    """Three-part analyzer that labels a message's urgency, sentiment and categories.

    Each label is produced by its own ``dspy.ChainOfThought`` predictor built
    from the matching signature. The attribute names matter: the ``metric``
    function in this notebook keys its per-module feedback on
    ``urgency_module.predict``, ``sentiment_module.predict`` and
    ``categories_module.predict``.
    """

    def __init__(self):
        # Let dspy.Module set up its own state before sub-modules are attached,
        # rather than relying on the base class tolerating a skipped initializer.
        super().__init__()
        self.urgency_module = dspy.ChainOfThought(FacilitySupportAnalyzerUrgency)
        self.sentiment_module = dspy.ChainOfThought(FacilitySupportAnalyzerSentiment)
        self.categories_module = dspy.ChainOfThought(FacilitySupportAnalyzerCategories)

    def forward(self, message: str):
        """Run the three predictors on ``message`` and merge their outputs.

        Args:
            message: The message text passed unchanged to every sub-module.

        Returns:
            dspy.Prediction with ``urgency``, ``sentiment`` and ``categories``
            fields. The sub-modules' intermediate outputs are not included.
        """
        urgency = self.urgency_module(message=message)
        sentiment = self.sentiment_module(message=message)
        categories = self.categories_module(message=message)

        return dspy.Prediction(
            urgency=urgency.urgency,
            sentiment=sentiment.sentiment,
            categories=categories.categories
        )

# Unoptimized baseline program: evaluated first, then handed to the optimizer's compile() call.
program = FacilitySupportAnalyzerMM()

In [28]:
from ragas.metrics import numeric_metric, MetricResult

@numeric_metric(name="urgency_accuracy", allowed_values=(0.0, 1.0))
def urgency_accuracy_metric(gold_urgency: str, pred_urgency: str) -> MetricResult:
    """
    Exact-match urgency metric.

    The value is 1.0 when the predicted label equals the gold label and 0.0
    otherwise; the reason is a feedback sentence naming the labels involved.
    """
    labels_match = gold_urgency == pred_urgency

    # Mismatch: report both the wrong prediction and the expected label.
    if not labels_match:
        return MetricResult(
            value=0.0,
            reason=f"You incorrectly classified the urgency of the message as `{pred_urgency}`. The correct urgency is `{gold_urgency}`. Think about how you could have reasoned to get the correct urgency label.",
        )

    return MetricResult(
        value=1.0,
        reason=f"You correctly classified the urgency of the message as `{gold_urgency}`. This message is indeed of `{gold_urgency}` urgency.",
    )

@numeric_metric(name="sentiment_accuracy", allowed_values=(0.0, 1.0))
def sentiment_accuracy_metric(gold_sentiment: str, pred_sentiment: str) -> MetricResult:
    """
    Exact-match sentiment metric.

    The value is 1.0 when the predicted label equals the gold label and 0.0
    otherwise; the reason is a feedback sentence naming the labels involved.
    """
    labels_match = gold_sentiment == pred_sentiment

    # Mismatch: report both the wrong prediction and the expected label.
    if not labels_match:
        return MetricResult(
            value=0.0,
            reason=f"You incorrectly classified the sentiment of the message as `{pred_sentiment}`. The correct sentiment is `{gold_sentiment}`. Think about how you could have reasoned to get the correct sentiment label.",
        )

    return MetricResult(
        value=1.0,
        reason=f"You correctly classified the sentiment of the message as `{gold_sentiment}`. This message is indeed `{gold_sentiment}`.",
    )

@numeric_metric(name="categories_accuracy", allowed_values=(0.0, 1.0))
def categories_accuracy_metric(gold_categories: dict, pred_categories: list) -> MetricResult:
    """
    Per-category accuracy for the multi-label category prediction.

    ``gold_categories`` maps each category name to a truthy/falsy flag and
    ``pred_categories`` lists the predicted names. The value is the fraction
    of gold keys whose presence or absence in the prediction agrees with the
    flag; the reason lists what was right, wrongly added and wrongly missed.
    """
    # Bucket each gold category by (is it applicable?, was it predicted?).
    buckets = {
        (True, True): [],    # correctly included
        (False, True): [],   # incorrectly included
        (True, False): [],   # incorrectly excluded
        (False, False): [],  # correctly excluded
    }
    for category, is_applicable in gold_categories.items():
        was_predicted = category in pred_categories
        buckets[(bool(is_applicable), was_predicted)].append(category)

    hits = buckets[(True, True)]
    false_positives = buckets[(False, True)]
    misses = buckets[(True, False)]
    true_negatives = buckets[(False, False)]

    # Agreement over all gold keys, counting both kinds of correct decision.
    score = (len(hits) + len(true_negatives)) / len(gold_categories)

    if score == 1.0:
        return MetricResult(
            value=score,
            reason=f"The category classification is perfect. You correctly identified that the message falls under the following categories: `{repr(hits)}`.",
        )

    # Imperfect result: assemble the feedback piece by piece.
    parts = [
        f"The category classification is not perfect. You correctly identified that the message falls under the following categories: `{repr(hits)}`.\n"
    ]
    if false_positives:
        parts.append(
            f"However, you incorrectly identified that the message falls under the following categories: `{repr(false_positives)}`. The message DOES NOT fall under these categories.\n"
        )
    if misses:
        lead_in = "Additionally, " if false_positives else "However, "
        parts.append(
            f"{lead_in}you didn't identify the following categories that the message actually falls under: `{repr(misses)}`.\n"
        )
    parts.append("Think about how you could have reasoned to get the correct category labels.")

    return MetricResult(value=score, reason="".join(parts))

def metric(example, pred, trace=None, pred_name=None, pred_trace=None):
    """
    Combined metric used for both DSPy evaluation and GEPA optimization.

    The gold labels are decoded from the JSON string in ``example['answer']``
    and compared with ``pred`` by the three ragas metrics; the overall score
    is the plain average of the three values.

    With ``pred_name`` left as None the float score is returned. Otherwise a
    ``dspy.Prediction(score=..., feedback=...)`` is returned, where the
    feedback is the reason text of the metric matching ``pred_name`` (or a
    fallback sentence for an unknown name). ``trace`` and ``pred_trace`` are
    accepted but not used.
    """
    gold = json.loads(example['answer'])

    # One MetricResult (.value, .reason) per label, in urgency/sentiment/categories order.
    urgency_result = urgency_accuracy_metric.score(
        gold_urgency=gold['urgency'],
        pred_urgency=pred.urgency,
    )
    sentiment_result = sentiment_accuracy_metric.score(
        gold_sentiment=gold['sentiment'],
        pred_sentiment=pred.sentiment,
    )
    categories_result = categories_accuracy_metric.score(
        gold_categories=gold['categories'],
        pred_categories=pred.categories,
    )

    total = (urgency_result.value + sentiment_result.value + categories_result.value) / 3

    # Evaluation mode: only the number is needed.
    if pred_name is None:
        return total

    # Optimization mode: pick the feedback that belongs to the named predictor.
    reasons_by_predictor = {
        'urgency_module.predict': urgency_result.reason,
        'sentiment_module.predict': sentiment_result.reason,
        'categories_module.predict': categories_result.reason,
    }
    if pred_name in reasons_by_predictor:
        feedback = reasons_by_predictor[pred_name]
    else:
        feedback = f"No specific feedback available for module: {pred_name}"

    return dspy.Prediction(score=total, feedback=feedback)

In [29]:
import dspy
# Evaluator bound to the test split (not passed to the optimizer's compile() call)
# and scored with the shared `metric` function.
evaluate = dspy.Evaluate(
    devset=test_set,
    metric=metric,
    num_threads=32,
    display_table=True,
    display_progress=True
)

# Baseline score of the unoptimized program; `evaluate` is reused later for the optimized one.
evaluate(program)

Average Metric: 52.33 / 68 (77.0%): 100%|██████████| 68/68 [00:01<00:00, 59.28it/s]

2025/10/03 19:02:02 INFO dspy.evaluate.evaluate: Average Metric: 52.33333333333333 / 68 (77.0%)





Unnamed: 0,message,answer,urgency,sentiment,categories,metric
0,"Hey ProCare Support Team, Hope you all are doing great! My name is...","{""categories"": {""routine_maintenance_requests"": false, ""customer_f...",low,positive,[sustainability_and_environmental_practices],✔️ [1.000]
1,"Hey ProCare Team, Hope you’re all doing well! My name’s Jake, and ...","{""categories"": {""routine_maintenance_requests"": true, ""customer_fe...",medium,positive,[routine_maintenance_requests],✔️ [1.000]
2,"Subject: Assistance Needed for HVAC Maintenance Hi [Receiver], I h...","{""categories"": {""routine_maintenance_requests"": true, ""customer_fe...",medium,positive,[routine_maintenance_requests],✔️ [0.667]
3,Subject: A Green Inquiry from a Bill Maher Enthusiast Hey ProCare ...,"{""categories"": {""routine_maintenance_requests"": false, ""customer_f...",low,positive,"[sustainability_and_environmental_practices, general_inquiries]",✔️ [0.967]
4,Subject: Inquiry on Sustainability Practices Dear ProCare Facility...,"{""categories"": {""routine_maintenance_requests"": false, ""customer_f...",low,neutral,[sustainability_and_environmental_practices],✔️ [1.000]
...,...,...,...,...,...,...
63,Subject: Inquiry About Your Eco-Friendly Practices Dear ProCare Fa...,"{""categories"": {""routine_maintenance_requests"": false, ""customer_f...",low,positive,[sustainability_and_environmental_practices],✔️ [0.600]
64,Subject: Assistance Needed for Facility Management Issue Dear ProC...,"{""categories"": {""routine_maintenance_requests"": false, ""customer_f...",medium,neutral,[facility_management_issues],✔️ [0.667]
65,"Subject: Request for Training and Support Hi ProCare Support Team,...","{""categories"": {""routine_maintenance_requests"": false, ""customer_f...",medium,positive,[training_and_support_requests],✔️ [0.667]
66,Subject: Concerns About Studio Maintenance and Rent Increase Dear ...,"{""categories"": {""routine_maintenance_requests"": true, ""customer_fe...",medium,negative,"[routine_maintenance_requests, quality_and_safety_concerns, facili...",✔️ [0.633]


🏃 View run eval at: http://127.0.0.1:5000/#/experiments/0/runs/a261d13e1a4b48aa8a512977c943d673
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0


EvaluationResult(score=76.96, results=<list of 68 results>)

In [35]:
from dspy import GEPA

# Language model handed to GEPA as its `reflection_lm`.
reflection_lm = dspy.LM(
    model="gpt-5",
    temperature=1.0,
    max_tokens=32000,
    api_key=api_key,
)

# Budget: capped explicitly with max_full_evals=3. The preset alternative is
# auto="light"; the tutorial's note recommends auto="heavy" for optimized performance.
optimizer = GEPA(
    metric=metric,  # Same metric function works for both evaluation and GEPA optimization!
    reflection_lm=reflection_lm,
    max_full_evals=3,
    num_threads=32,
    track_stats=True,
    use_merge=False,
)

In [36]:
# Run the GEPA optimizer on the baseline `program`. `train_set` is passed as the
# training split and `val_set` as the validation split; the test split is not used here.
# The result is bound to `optimized_program` and evaluated in a later cell.
optimized_program = optimizer.compile(
    program,
    trainset=train_set,
    valset=val_set,
)

2025/10/03 19:13:28 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'ccfb1b68544f4348beecfcbbd18203c2', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current dspy workflow
2025/10/03 19:13:28 INFO dspy.teleprompt.gepa.gepa: Running GEPA for approx 396 metric calls of the program. This amounts to 3.00 full evals on the train+val set.
2025/10/03 19:13:28 INFO dspy.teleprompt.gepa.gepa: Using 66 examples for tracking Pareto scores. You can consider using a smaller sample of the valset to allow GEPA to explore more diverse solutions within the same budget.
GEPA Optimization:   0%|          | 0/396 [00:00<?, ?rollouts/s]

GEPA Optimization:  31%|███       | 504/1643 [10:38<24:02,  1.27s/rollouts]
2025/10/03 19:13:29 INFO dspy.evaluate.evaluate: Average Metric: 48.733333333333334 / 66 (73.8%)
2025/10/03 19:13:29 INFO dspy.teleprompt.gepa.gepa: Iteration 0: Base program full valset score: 0.7383838383838384
GEPA Optimization:  17%|█▋        | 66/396 [00:01<00:05, 59.78rollouts/s]2025/10/03 19:13:29 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Selected program 0 score: 0.7383838383838384


🏃 View run eval_0 at: http://127.0.0.1:5000/#/experiments/0/runs/68f696eeba74441eab1ff9624cd5ad95
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0
Average Metric: 2.67 / 3 (88.9%): 100%|██████████| 3/3 [00:00<00:00, 70.20it/s]

2025/10/03 19:13:29 INFO dspy.evaluate.evaluate: Average Metric: 2.6666666666666665 / 3 (88.9%)
2025/10/03 19:13:29 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Proposed new text for urgency_module.predict: Task: Read the provided message and determine its urgency for a facilities management context (e.g., ProCare Facility Solutions). Output your assessment using the required format.

Output format (use exactly these keys):
- reasoning: 1–3 concise sentences explaining the key factors (safety risk, operational impact, time constraints, mitigation steps).
- urgency: one of the lowercase labels: low, medium, high.

How to assess urgency
Consider these factors:
- Safety risk: Any indication of immediate danger to people or property (fire/smoke, gas leak/odor, active water leak/flood, electrical arcing, structural failure, biohazard, elevator entrapment).
- Operational impact: Whether normal operations are halted or severely impaired (e.g., facility cannot open, critical area unusable, mis


🏃 View run eval_1 at: http://127.0.0.1:5000/#/experiments/0/runs/94672fdb23714b31bac51904751567af
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0
🏃 View run eval_2 at: http://127.0.0.1:5000/#/experiments/0/runs/4e681d89fb1449aab34d3cf9dd12d743
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0


2025/10/03 19:13:29 INFO dspy.teleprompt.gepa.gepa: Iteration 1: New subsample score is not better, skipping
GEPA Optimization:  18%|█▊        | 72/396 [00:01<00:06, 52.00rollouts/s]2025/10/03 19:13:29 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Selected program 0 score: 0.7383838383838384


Average Metric: 2.60 / 3 (86.7%): 100%|██████████| 3/3 [00:00<00:00, 77.32it/s]

2025/10/03 19:13:29 INFO dspy.evaluate.evaluate: Average Metric: 2.6 / 3 (86.7%)
2025/10/03 19:13:29 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Proposed new text for sentiment_module.predict: Task: Read the provided “message” and determine its sentiment.

Input format:
- You will receive a single field named “message” containing a professional email-style text (often to ProCare Facility Solutions Support) about facility/maintenance topics (e.g., HVAC performance, cleaning residues affecting artifacts, minor leaks, follow-up maintenance requests, exhibit environment concerns). The tone is frequently polite and formal.

Output format:
- Produce two fields:
  - reasoning: 1–2 concise sentences explaining the label choice.
  - sentiment: one of exactly [positive, neutral, negative] in lowercase.

Labeling guidelines:
- Neutral:
  - Default when no explicit emotional language is present.
  - Informational, professional, or request-oriented messages (e.g., reporting an issue, asking for se


🏃 View run eval_3 at: http://127.0.0.1:5000/#/experiments/0/runs/856b9acd8e6449cb976d6e2d626ede3f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0
🏃 View run eval_4 at: http://127.0.0.1:5000/#/experiments/0/runs/838404739d4e4e2197b2f0834311aa99
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0


2025/10/03 19:13:30 INFO dspy.evaluate.evaluate: Average Metric: 51.733333333333334 / 66 (78.4%)
2025/10/03 19:13:30 INFO dspy.teleprompt.gepa.gepa: Iteration 2: New program is on the linear pareto front
2025/10/03 19:13:30 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Full valset score for new program: 0.7838383838383839
2025/10/03 19:13:30 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Full train_val score for new program: 0.7838383838383839
2025/10/03 19:13:30 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Individual valset scores for new program: [0.6666666666666666, 0.3333333333333333, 0.5333333333333333, 0.6333333333333333, 1.0, 0.6, 0.9666666666666667, 0.6666666666666666, 0.6666666666666666, 0.6666666666666666, 0.9666666666666667, 0.6666666666666666, 0.9666666666666667, 1.0, 0.9666666666666667, 0.6666666666666666, 0.9666666666666667, 1.0, 1.0, 0.3333333333333333, 0.9, 0.6666666666666666, 0.6666666666666666, 0.26666666666666666, 0.6333333333333333, 0.6666666666666666, 0.96666666666666

🏃 View run eval_5 at: http://127.0.0.1:5000/#/experiments/0/runs/96e3ceff8d604a1db82c52fd322041ab
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0
Average Metric: 2.53 / 3 (84.4%): 100%|██████████| 3/3 [00:00<00:00, 73.36it/s]

2025/10/03 19:13:30 INFO dspy.evaluate.evaluate: Average Metric: 2.533333333333333 / 3 (84.4%)
2025/10/03 19:13:30 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Proposed new text for categories_module.predict: Task: Classify a single message into one or more predefined categories.

Input format:
- You will receive an object with a single field:
  - message: a free-form text string (e.g., an email to ProCare Facility Solutions).

Output format:
- Return ONLY a JSON object with one key:
  - categories: an array of strings containing all applicable category labels from the Allowed Categories list below.
- Do not include any additional text, explanations, or keys.

Allowed Categories and decision rules:
1) cleaning_services_scheduling
   - Use when the primary purpose is to schedule, reschedule, or adjust the timing/frequency of cleaning services.
   - Examples: requesting new bookings, changing cleaning times, confirming availability for a cleaning appointment.
   - Do NOT use if schedulin


🏃 View run eval_6 at: http://127.0.0.1:5000/#/experiments/0/runs/d6e4d21ae3774c8caa1f0e540c828074
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0
🏃 View run eval_7 at: http://127.0.0.1:5000/#/experiments/0/runs/e98cb3547a7b4da68a8c6fe6bd3d2273
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0


2025/10/03 19:13:31 INFO dspy.evaluate.evaluate: Average Metric: 51.3 / 66 (77.7%)
2025/10/03 19:13:31 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Full valset score for new program: 0.7772727272727272
2025/10/03 19:13:31 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Full train_val score for new program: 0.7772727272727272
2025/10/03 19:13:31 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Individual valset scores for new program: [0.6333333333333333, 0.3333333333333333, 0.5666666666666667, 0.6666666666666666, 1.0, 0.6, 0.9666666666666667, 0.6333333333333333, 0.6333333333333333, 0.6333333333333333, 0.9666666666666667, 0.6666666666666666, 0.9666666666666667, 0.9, 0.9666666666666667, 0.6666666666666666, 0.9666666666666667, 0.9666666666666667, 0.9666666666666667, 0.3333333333333333, 0.9333333333333332, 0.6333333333333333, 0.6333333333333333, 0.3, 0.6666666666666666, 0.6333333333333333, 0.9666666666666667, 0.9666666666666667, 0.6333333333333333, 1.0, 0.6333333333333333, 0.9666666666666667, 0.9

🏃 View run eval_8 at: http://127.0.0.1:5000/#/experiments/0/runs/0730db59ee254c84847b4f3234560949
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0
Average Metric: 2.27 / 3 (75.6%): 100%|██████████| 3/3 [00:00<00:00, 30.98it/s]

2025/10/03 19:13:31 INFO dspy.evaluate.evaluate: Average Metric: 2.2666666666666666 / 3 (75.6%)



🏃 View run eval_9 at: http://127.0.0.1:5000/#/experiments/0/runs/eadad9e404704b57bab34f7f9e3f6ee2
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0


2025/10/03 19:13:31 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Proposed new text for urgency_module.predict: You are given a single input:
- message: A user’s email or note, often related to facility management/services (e.g., HVAC, cleaning, security, space utilization, sustainability).

Your task:
- Read the message and assign an urgency level: low, medium, or high.
- Provide a brief justification.

Output format (exactly these two keys):
- reasoning: 1–3 concise sentences explaining why you chose the urgency, referencing the message’s cues (do not quote the entire message).
- urgency: one of low, medium, high (lowercase).

Decision rubric:
Classify based on explicit urgency cues, impact/severity, time sensitivity, safety/operational risk, and escalation history. When in doubt, prioritize safety/operations and explicit urgency language.

High urgency:
- Explicit urgency or escalation: uses “urgent,” “immediate,” “ASAP,” “emergency,” “critical,” or sets a very short deadline (e.g., 

🏃 View run eval_10 at: http://127.0.0.1:5000/#/experiments/0/runs/e29e04d4942c4a9f9382bc015bf1b8bc
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0


2025/10/03 19:13:32 INFO dspy.evaluate.evaluate: Average Metric: 56.06666666666666 / 66 (84.9%)
2025/10/03 19:13:32 INFO dspy.teleprompt.gepa.gepa: Iteration 4: New program is on the linear pareto front
2025/10/03 19:13:32 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Full valset score for new program: 0.8494949494949494
2025/10/03 19:13:32 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Full train_val score for new program: 0.8494949494949494
2025/10/03 19:13:32 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Individual valset scores for new program: [1.0, 0.6666666666666666, 0.5333333333333333, 0.9666666666666667, 0.6666666666666666, 0.9333333333333332, 0.9666666666666667, 0.6666666666666666, 1.0, 0.6666666666666666, 0.9666666666666667, 0.6666666666666666, 0.9666666666666667, 0.6666666666666666, 0.9666666666666667, 0.6666666666666666, 0.9666666666666667, 1.0, 1.0, 0.6666666666666666, 0.9, 0.6666666666666666, 1.0, 0.6, 0.9666666666666667, 0.6666666666666666, 0.9666666666666667, 0.93333333333

🏃 View run eval_11 at: http://127.0.0.1:5000/#/experiments/0/runs/ea873c0fde404861b96806ee348af60a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0
Average Metric: 2.33 / 3 (77.8%): 100%|██████████| 3/3 [00:00<00:00, 77.19it/s]

2025/10/03 19:13:32 INFO dspy.evaluate.evaluate: Average Metric: 2.333333333333333 / 3 (77.8%)
2025/10/03 19:13:32 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Proposed new text for sentiment_module.predict: Task
- Read the provided message (a professional, facility/maintenance-related email, often to ProCare Facility Solutions Support) and classify its sentiment.

Input format
- Single field: message
- Messages are typically polite, formal, and revolve around facility/maintenance topics (e.g., HVAC performance, cleaning residues affecting artifacts, minor leaks, follow-up maintenance requests, exhibit environment concerns). Closings often include routine thanks.

Output format
- Produce exactly two fields:
  - reasoning: 1–2 concise sentences explaining why the label was chosen, tied to specific language in the message.
  - sentiment: exactly one label in lowercase from [positive, neutral, negative].

Label definitions
- Neutral:
  - Default when no explicit emotional language is pres


🏃 View run eval_12 at: http://127.0.0.1:5000/#/experiments/0/runs/90c99cf06e404f01a7bf456c55a63b4f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0
🏃 View run eval_13 at: http://127.0.0.1:5000/#/experiments/0/runs/dd9fb3b5e1714983b37f65ca5360b9d9
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0


2025/10/03 19:13:33 INFO dspy.evaluate.evaluate: Average Metric: 57.4 / 66 (87.0%)
2025/10/03 19:13:33 INFO dspy.teleprompt.gepa.gepa: Iteration 5: New program is on the linear pareto front
2025/10/03 19:13:33 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Full valset score for new program: 0.8696969696969696
2025/10/03 19:13:33 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Full train_val score for new program: 0.8696969696969696
2025/10/03 19:13:33 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Individual valset scores for new program: [1.0, 1.0, 0.8666666666666667, 0.9666666666666667, 0.6666666666666666, 0.9333333333333332, 0.9666666666666667, 1.0, 1.0, 1.0, 0.9666666666666667, 1.0, 0.9666666666666667, 0.6666666666666666, 0.9666666666666667, 0.6666666666666666, 0.9666666666666667, 1.0, 1.0, 0.6666666666666666, 0.9, 0.6666666666666666, 1.0, 0.6, 0.9666666666666667, 1.0, 0.9666666666666667, 0.6, 0.9666666666666667, 0.9666666666666667, 0.6666666666666666, 0.9666666666666667, 0.93333333333333

🏃 View run eval_14 at: http://127.0.0.1:5000/#/experiments/0/runs/d1b9f0f088f048c9bdc329973fe92254
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0
Average Metric: 2.30 / 3 (76.7%): 100%|██████████| 3/3 [00:00<00:00, 87.43it/s]

2025/10/03 19:13:33 INFO dspy.evaluate.evaluate: Average Metric: 2.3 / 3 (76.7%)
2025/10/03 19:13:33 INFO dspy.teleprompt.gepa.gepa: Iteration 6: Proposed new text for categories_module.predict: Instruction: Classify facility-related messages into all applicable categories

Task
- Read the provided message (typically an email with subject and body) addressed to ProCare Facility Solutions.
- Determine all category labels that apply to the content. This is multi-label classification: select every relevant category, not just the primary one.

Domain context
- Messages concern facility services provided by ProCare Facility Solutions (e.g., cleaning, maintenance, facility management coordination).
- Typical senders are building occupants, managers, or clients (e.g., recording studio, residential complex, government office).
- Requests may be routine, urgent, or involve health/safety risks.

Categories and decision cues
- specialized_cleaning_services
  - Requests for non-routine or expert c


🏃 View run eval_15 at: http://127.0.0.1:5000/#/experiments/0/runs/dc9d432cfd7a4fa599409b85eb6babf9
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0
🏃 View run eval_16 at: http://127.0.0.1:5000/#/experiments/0/runs/91a851bd7e664bbaa778d46ecd025dde
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0


2025/10/03 19:13:34 INFO dspy.evaluate.evaluate: Average Metric: 56.93333333333333 / 66 (86.3%)
2025/10/03 19:13:34 INFO dspy.teleprompt.gepa.gepa: Iteration 6: Full valset score for new program: 0.8626262626262626
2025/10/03 19:13:34 INFO dspy.teleprompt.gepa.gepa: Iteration 6: Full train_val score for new program: 0.8626262626262626
2025/10/03 19:13:34 INFO dspy.teleprompt.gepa.gepa: Iteration 6: Individual valset scores for new program: [0.9666666666666667, 1.0, 0.9, 0.9666666666666667, 0.6666666666666666, 0.9666666666666667, 1.0, 1.0, 1.0, 1.0, 0.9666666666666667, 1.0, 0.9666666666666667, 0.6666666666666666, 0.9666666666666667, 0.6666666666666666, 0.9333333333333332, 0.9666666666666667, 0.9666666666666667, 0.6666666666666666, 0.9, 0.6333333333333333, 0.9666666666666667, 0.6333333333333333, 1.0, 1.0, 0.9666666666666667, 0.6333333333333333, 0.9666666666666667, 1.0, 0.6666666666666666, 0.9666666666666667, 0.9, 0.9333333333333332, 1.0, 0.3333333333333333, 0.9333333333333332, 0.9, 0.966

🏃 View run eval_17 at: http://127.0.0.1:5000/#/experiments/0/runs/2475ed19aa0b49948452c5c0b79daba1
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0
🏃 View run kindly-robin-533 at: http://127.0.0.1:5000/#/experiments/0/runs/ccfb1b68544f4348beecfcbbd18203c2
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0





In [37]:
# Score the optimized program with the same evaluator (same test split and metric)
# used for the baseline, so the two results are directly comparable.
evaluate(optimized_program)

Average Metric: 60.00 / 68 (88.2%): 100%|██████████| 68/68 [00:07<00:00,  8.60it/s]

2025/10/03 19:13:47 INFO dspy.evaluate.evaluate: Average Metric: 60.0 / 68 (88.2%)





Unnamed: 0,message,answer,urgency,sentiment,categories,metric
0,"Hey ProCare Support Team, Hope you all are doing great! My name is...","{""categories"": {""routine_maintenance_requests"": false, ""customer_f...",low,neutral,[sustainability_and_environmental_practices],✔️ [0.667]
1,"Hey ProCare Team, Hope you’re all doing well! My name’s Jake, and ...","{""categories"": {""routine_maintenance_requests"": true, ""customer_fe...",medium,neutral,[routine_maintenance_requests],✔️ [0.667]
2,"Subject: Assistance Needed for HVAC Maintenance Hi [Receiver], I h...","{""categories"": {""routine_maintenance_requests"": true, ""customer_fe...",medium,neutral,[routine_maintenance_requests],✔️ [1.000]
3,Subject: A Green Inquiry from a Bill Maher Enthusiast Hey ProCare ...,"{""categories"": {""routine_maintenance_requests"": false, ""customer_f...",low,positive,"[sustainability_and_environmental_practices, general_inquiries]",✔️ [0.967]
4,Subject: Inquiry on Sustainability Practices Dear ProCare Facility...,"{""categories"": {""routine_maintenance_requests"": false, ""customer_f...",low,neutral,[sustainability_and_environmental_practices],✔️ [1.000]
...,...,...,...,...,...,...
63,Subject: Inquiry About Your Eco-Friendly Practices Dear ProCare Fa...,"{""categories"": {""routine_maintenance_requests"": false, ""customer_f...",low,neutral,[sustainability_and_environmental_practices],✔️ [0.933]
64,Subject: Assistance Needed for Facility Management Issue Dear ProC...,"{""categories"": {""routine_maintenance_requests"": false, ""customer_f...",medium,neutral,[facility_management_issues],✔️ [0.667]
65,"Subject: Request for Training and Support Hi ProCare Support Team,...","{""categories"": {""routine_maintenance_requests"": false, ""customer_f...",low,positive,[training_and_support_requests],✔️ [1.000]
66,Subject: Concerns About Studio Maintenance and Rent Increase Dear ...,"{""categories"": {""routine_maintenance_requests"": true, ""customer_fe...",medium,neutral,"[routine_maintenance_requests, quality_and_safety_concerns, facili...",✔️ [0.967]


🏃 View run eval at: http://127.0.0.1:5000/#/experiments/0/runs/4930a46bf1034e41b810fc1c29b4906a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0


EvaluationResult(score=88.24, results=<list of 68 results>)